Why Gradient Descent Zigzags and How Momentum Fixes It

# Render the momentum explainer figure: three trajectory panels on the top
# row, the loss curves on the bottom left and a text panel on the bottom right.
# NOTE(review): this section relies on names it does not define -- `np`, `plt`,
# `GridSpec`, `loss`, `STEPS`, `path_gd`, `path_mom_good`, `path_mom_large` --
# presumably the numpy / matplotlib imports and the optimiser runs from an
# earlier part of the file; confirm before running it standalone.

PLOT_STEPS = 55  # each trajectory panel draws path[0] .. path[PLOT_STEPS]

# Loss surface sampled on a regular grid, used as the contour backdrop.
x_ = np.linspace(-5, 5, 500)
y_ = np.linspace(-2.2, 2.2, 500)
X, Y = np.meshgrid(x_, y_)
Z = loss(X, Y)

fig = plt.figure(figsize=(16, 10), facecolor="#FAFAF8")
gs = GridSpec(2, 3, figure=fig, hspace=0.45, wspace=0.38,
              left=0.07, right=0.97, top=0.88, bottom=0.08)

COLORS = {
    "gd": "#E05C4B",
    "mom_good": "#3A7CA5",
    "mom_large": "#F4A536",
    "contour": "#D4C9B8",
    "minima": "#2A9D5C",
    # Key must be "start": that is the name the start-marker scatter looks up.
    "start": "#444444",
}

PANEL_TITLES = [
    "Vanilla Gradient Descent\nOscillates, slow (185 steps to converge)",
    "Momentum β = 0.90\nSmooth, fast (159 steps to converge)",
    "Momentum β = 0.99 (too large)\nOvershoots — never converges",
]

paths_plot = [
    path_gd[:PLOT_STEPS + 1],
    path_mom_good[:PLOT_STEPS + 1],
    path_mom_large[:PLOT_STEPS + 1],
]
colours = [COLORS["gd"], COLORS["mom_good"], COLORS["mom_large"]]

# Contour levels are identical for every panel, so compute them once.
# Geometric spacing puts more contour lines at small loss values.
levels = np.geomspace(0.005, 3.5, 28)

# top row: trajectory panels
for col, (path, color, title) in enumerate(zip(paths_plot, colours, PANEL_TITLES)):
    ax = fig.add_subplot(gs[0, col])
    ax.set_facecolor("#F5F3EE")

    ax.contour(X, Y, Z, levels=levels, colors=COLORS["contour"],
               linewidths=0.7, alpha=0.9)

    # Optimiser trajectory: connecting line plus one dot per step.
    ax.plot(path[:, 0], path[:, 1], color=color, lw=1.8, alpha=0.85, zorder=3)
    ax.scatter(path[:, 0], path[:, 1], color=color, s=18, zorder=4, alpha=0.6)

    # First and last plotted points, and a "+" at the origin (0, 0).
    ax.scatter(*path[0], marker="o", s=90, color=COLORS["start"], zorder=5, label="begin")
    ax.scatter(*path[-1], marker="*", s=120, color=COLORS["minima"], zorder=5, label="finish")
    ax.scatter(0, 0, marker="+", s=200, color=COLORS["minima"], linewidths=2.5, zorder=6)

    ax.set_xlim(-5, 5)
    ax.set_ylim(-2.2, 2.2)
    ax.set_title(title, fontsize=9.5, fontweight="bold", color="#222", pad=7, loc="left")
    ax.set_xlabel("θ₁ (sluggish route)", fontsize=8, color="#666")
    ax.set_ylabel("θ₂ (quick route)", fontsize=8, color="#666")
    ax.tick_params(labelsize=7, colors="#888")
    for spine in ax.spines.values():
        spine.set_edgecolor("#CCCCCC")

# bottom-left: loss curves (full 300 steps)
ax_loss = fig.add_subplot(gs[1, :2])
ax_loss.set_facecolor("#F5F3EE")

full_paths = [path_gd, path_mom_good, path_mom_large]
full_labels = ["Vanilla GD (185 steps)", "Momentum β=0.90 (159 steps)", "Momentum β=0.99 (diverges)"]

for path, color, label in zip(full_paths, colours, full_labels):
    # `loss` is already applied to whole arrays for the contour grid above,
    # so evaluate the full path in one vectorised call.
    losses = loss(path[:, 0], path[:, 1])
    steps_range = np.arange(len(path))
    ax_loss.plot(steps_range, losses, color=color, lw=2, label=label, alpha=0.9)

# Dashed reference line at loss = 0.001; its label sits at x = 305.
ax_loss.axhline(0.001, color="#999", lw=1, ls="--", alpha=0.6)
ax_loss.text(305, 0.001, "convergence\nthreshold", fontsize=7, color="#888", va="center")

ax_loss.set_yscale("log")
ax_loss.set_xlim(0, STEPS)
ax_loss.set_title("Loss vs. Optimisation Step (log scale, 300 steps)",
                  fontsize=10.5, fontweight="bold", color="#222", loc="left")
ax_loss.set_xlabel("Step", fontsize=9, color="#666")
ax_loss.set_ylabel("Loss f(θ)", fontsize=9, color="#666")
ax_loss.legend(fontsize=8.5, framealpha=0.6)
ax_loss.tick_params(labelsize=8, colors="#888")
for spine in ax_loss.spines.values():
    spine.set_edgecolor("#CCCCCC")

# bottom-right: annotation panel
ax_ann = fig.add_subplot(gs[1, 2])
ax_ann.set_facecolor("#F5F3EE")
ax_ann.axis("off")

annotation = (
    "Update rules\n\n"
    "Vanilla GD\n"
    " θ ← θ − α·∇L(θ)\n\n"
    "Momentum GD\n"
    " v ← β·v + (1−β)·∇L(θ)\n"
    " θ ← θ − α·v\n\n"
    "Key intuition\n"
    " v accumulates previous gradients.\n"
    " Vertical oscillations cancel out.\n"
    " Horizontal steps compound.\n\n"
    "Hyperparameter β\n"
    " β → 0 : behaves like GD\n"
    " β = 0.9: typical sweet spot\n"
    " β → 1 : overshoots / diverges"
)
# Axes-fraction coordinates: anchor the text block at the panel's top-left.
ax_ann.text(0.05, 0.97, annotation, transform=ax_ann.transAxes,
            fontsize=8.8, va="top", ha="left",
            fontfamily="monospace", color="#333", linespacing=1.7)

fig.suptitle("Momentum in Gradient Descent",
             fontsize=16, fontweight="bold", color="#111", y=0.95)

# Pass the figure's facecolor explicitly so the saved PNG keeps the
# off-white background.
plt.savefig("momentum_explainer.png", dpi=150, bbox_inches="tight",
            facecolor=fig.get_facecolor())
plt.show()

What's Hot

NYT Strands hints and answers for Tuesday, May 12 (game #800)

OpenAI Introduces Dawn: A Cybersecurity Initiative That Places Codex Safety on the Middle of Vulnerability Detection and Patch Validation

FAQ on hantavirus and outbreak on cruise ship Hondius

OpenAI Introduces Dawn: A Cybersecurity Initiative That Places Codex Safety on the Middle of Vulnerability Detection and Patch Validation

Students Boo Commencement Speaker After She Calls AI the ‘Next Industrial Revolution’

10 GitHub Repositories to Master FastAPI

Building web search-enabled agents with Strands and Exa

Understanding LLM Distillation Methods – MarkTechPost

Your AI Use Is Breaking My Mind

NYT Strands hints and answers for Tuesday, May 12 (game #800)

OpenAI Introduces Dawn: A Cybersecurity Initiative That Places Codex Safety on the Middle of Vulnerability Detection and Patch Validation

FAQ on hantavirus and outbreak on cruise ship Hondius

NYT Strands hints and answers for Tuesday, May 12 (game #800)

OpenAI Introduces Dawn: A Cybersecurity Initiative That Places Codex Safety on the Middle of Vulnerability Detection and Patch Validation

FAQ on hantavirus and outbreak on cruise ship Hondius

Useful links

categories

What's Hot

Related Posts

Useful links

categories