In this paper, we demonstrate the power of a widely used stochastic estimator based on moving average (SEMA) on a range of stochastic non-convex optimization problems, which only requires {\bf a general unbiased stochastic oracle}. We analyze various stochastic methods (existing or newly proposed) based on the {\bf variance recursion property} of SEMA for three families of non-convex optimization, namely standard stochastic non-convex minimization, stochastic non-convex strongly-concave min-max optimization, and stochastic bilevel optimization. Our contributions include: (i) for standard stochastic non-convex minimization, we present a simple and intuitive proof of convergence for a family of Adam-style methods (including Adam) with an increasing or large "momentum" parameter for the first-order moment, which gives an alternative yet more natural way to guarantee that Adam converges; (ii) for stochastic non-convex strongly-concave min-max optimization, we present a single-loop stochastic gradient descent ascent method based on the moving average estimators and establish its oracle complexity of $O(1/\epsilon^4)$ without using a large mini-batch size, addressing a gap in the literature; (iii) for stochastic bilevel optimization, we present a single-loop stochastic method based on the moving average estimators and establish its oracle complexity of $\widetilde O(1/\epsilon^4)$ without computing the inverse or SVD of the Hessian matrix, improving state-of-the-art results. For all these problems, we also establish a variance diminishing result for the stochastic gradient estimators used.
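To make the central object concrete, the following is a minimal sketch of gradient descent driven by a moving-average (SEMA) gradient estimator. The update rule $z_t = (1-\beta)\, z_{t-1} + \beta\, g_t$, with $g_t$ drawn from an unbiased stochastic oracle, is the standard exponential moving average; the function and parameter names here are illustrative, not from the paper.

```python
import numpy as np

def sema_sgd(stochastic_grad, x0, lr=0.01, beta=0.1, steps=1000):
    """SGD with a moving-average (SEMA) gradient estimator.

    `stochastic_grad(x)` is assumed to return an unbiased estimate of the
    true gradient at x; `beta` plays the role of the "momentum" parameter
    for the first-order moment.
    """
    x = np.asarray(x0, dtype=float)
    z = np.zeros_like(x)  # moving-average estimate of the gradient
    for _ in range(steps):
        g = stochastic_grad(x)            # unbiased stochastic gradient
        z = (1.0 - beta) * z + beta * g   # SEMA update (variance recursion)
        x = x - lr * z                    # step along the averaged estimate
    return x

# Toy example: minimize f(x) = 0.5 * ||x||^2 with noisy gradients.
rng = np.random.default_rng(0)
noisy_grad = lambda x: x + 0.1 * rng.standard_normal(x.shape)
x_final = sema_sgd(noisy_grad, x0=np.ones(5))
```

As the averaging parameter $\beta$ shrinks (or the horizon grows), the variance of $z_t$ diminishes relative to that of a single stochastic gradient, which is the property the paper's analyses exploit.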