22 Learning Rate Mapping for Unified Optimizer Interface
SpotOptim provides a learning rate mapping system through the map_lr() function, which gives a unified interface for learning rates across different PyTorch optimizers. This addresses the problem that different optimizers operate on very different learning rate scales.
22.1 Overview
Different PyTorch optimizers use different default learning rates and optimal ranges:
Adam: default 0.001, typical range 0.0001-0.01
SGD: default 0.01, typical range 0.001-0.1
RMSprop: default 0.01, typical range 0.001-0.1
This makes it difficult to compare optimizer performance fairly or optimize learning rates across different optimizers. The map_lr() function provides a unified scale where lr_unified=1.0 corresponds to each optimizer’s PyTorch default.
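For example, a unified learning rate of 1.0 expands to each optimizer's own default. The following minimal sketch illustrates this with map_lr(); the expected values come from the defaults table later in this chapter:

```python
from spotoptim.utils.mapping import map_lr

# lr_unified = 1.0 corresponds to each optimizer's PyTorch default learning rate
print(map_lr(1.0, "Adam"))      # expected: 0.001
print(map_lr(1.0, "SGD"))       # expected: 0.01
print(map_lr(1.0, "Adadelta"))  # expected: 1.0
```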
Module: spotoptim.utils.mapping
Key Features:
Unified learning rate scale across all optimizers
Fair comparison when evaluating different optimizers
The map_lr() function maps a unified learning rate to an optimizer-specific learning rate.
Parameters:
lr_unified (float): Unified learning rate multiplier. A value of 1.0 corresponds to the optimizer’s default learning rate. Typical range: [0.001, 100.0].
optimizer_name (str): Name of the PyTorch optimizer. Must be one of: “Adadelta”, “Adagrad”, “Adam”, “AdamW”, “SparseAdam”, “Adamax”, “ASGD”, “LBFGS”, “NAdam”, “RAdam”, “RMSprop”, “Rprop”, “SGD”.
use_default_scale (bool, optional): Whether to scale by the optimizer’s default learning rate. If True (default), lr_unified is multiplied by the default lr. If False, returns lr_unified directly.
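Based on the parameter description above, the sketch below shows the two modes; it assumes the documented behavior that lr_unified is multiplied by the optimizer's default when use_default_scale=True:

```python
from spotoptim.utils.mapping import map_lr

# Default behavior: scale by the optimizer's default learning rate
lr_scaled = map_lr(0.5, "SGD")                         # expected: 0.5 * 0.01 = 0.005

# Bypass scaling: the unified value is returned unchanged
lr_raw = map_lr(0.5, "SGD", use_default_scale=False)   # expected: 0.5

print(lr_scaled, lr_raw)
```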
All major PyTorch optimizers are supported with their default learning rates:
| Optimizer  | Default LR | Typical Range | Notes                            |
|------------|------------|---------------|----------------------------------|
| Adam       | 0.001      | 0.0001-0.01   | Most popular, good default       |
| AdamW      | 0.001      | 0.0001-0.01   | Adam with weight decay           |
| Adamax     | 0.002      | 0.0001-0.01   | Adam variant with infinity norm  |
| NAdam      | 0.002      | 0.0001-0.01   | Adam with Nesterov momentum      |
| RAdam      | 0.001      | 0.0001-0.01   | Rectified Adam                   |
| SparseAdam | 0.001      | 0.0001-0.01   | For sparse gradients             |
| SGD        | 0.01       | 0.001-0.1     | Classic, needs momentum          |
| RMSprop    | 0.01       | 0.001-0.1     | Good for RNNs                    |
| Adagrad    | 0.01       | 0.001-0.1     | Adaptive learning rate           |
| Adadelta   | 1.0        | 0.1-10.0      | Extension of Adagrad             |
| ASGD       | 0.01       | 0.001-0.1     | Averaged SGD                     |
| LBFGS      | 1.0        | 0.1-10.0      | Second-order optimizer           |
| Rprop      | 0.01       | 0.001-0.1     | Resilient backpropagation        |
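As a quick sanity check, applying map_lr() with lr_unified=1.0 to every supported optimizer should reproduce the Default LR column above:

```python
from spotoptim.utils.mapping import map_lr

# All optimizer names accepted by map_lr (see the parameter description above)
supported = ["Adadelta", "Adagrad", "Adam", "AdamW", "SparseAdam", "Adamax",
             "ASGD", "LBFGS", "NAdam", "RAdam", "RMSprop", "Rprop", "SGD"]

# With lr_unified = 1.0, the mapped value is each optimizer's default learning rate
for name in supported:
    print(f"{name:<12s}{map_lr(1.0, name):.4f}")
```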
22.5 Use Cases
22.5.1 Comparing Different Optimizers
import torch
import torch.nn as nn
from spotoptim.nn.linear_regressor import LinearRegressor
from spotoptim.data import get_diabetes_dataloaders

# Load data
train_loader, test_loader, _ = get_diabetes_dataloaders(batch_size=32, random_state=42)

# Test different optimizers with unified learning rate
unified_lr = 1.0
optimizers_to_test = ["Adam", "SGD", "RMSprop", "AdamW"]
results = {}

for opt_name in optimizers_to_test:
    # Reset for fair comparison
    torch.manual_seed(42)
    model = LinearRegressor(input_dim=10, output_dim=1, l1=32,
                            num_hidden_layers=2, lr=unified_lr)

    # Create optimizer with mapped learning rate
    if opt_name == "SGD":
        optimizer = model.get_optimizer(opt_name, momentum=0.9)
    else:
        optimizer = model.get_optimizer(opt_name)

    criterion = nn.MSELoss()

    # Train
    model.train()
    for epoch in range(50):
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            predictions = model(batch_X)
            loss = criterion(predictions, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluate
    model.eval()
    test_loss = 0.0
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            predictions = model(batch_X)
            test_loss += criterion(predictions, batch_y).item()

    avg_test_loss = test_loss / len(test_loader)
    results[opt_name] = avg_test_loss
    print(f"{opt_name:10s}: Test MSE = {avg_test_loss:.4f} "
          f"(actual lr = {optimizer.param_groups[0]['lr']:.6f})")

# Find best optimizer
best_opt = min(results, key=results.get)
print(f"\nBest optimizer: {best_opt} with MSE = {results[best_opt]:.4f}")
Adam      : Test MSE = 3859.4478 (actual lr = 0.001000)
SGD       : Test MSE = 5271.6840 (actual lr = 0.010000)
RMSprop   : Test MSE = 2769.5164 (actual lr = 0.010000)
AdamW     : Test MSE = 3866.1197 (actual lr = 0.001000)
Best optimizer: RMSprop with MSE = 2769.5164
22.5.2 Hyperparameter Optimization
from spotoptim import SpotOptim
from spotoptim.nn.linear_regressor import LinearRegressor
from spotoptim.data import get_diabetes_dataloaders
import torch
import torch.nn as nn
import numpy as np

def train_model(X):
    """Objective function for hyperparameter optimization."""
    results = []
    # Load data once
    train_loader, test_loader, _ = get_diabetes_dataloaders(batch_size=32, random_state=42)

    for params in X:
        lr_unified = 10 ** params[0]   # Log scale: [-4, 0]
        optimizer_name = params[1]     # Factor: "Adam", "SGD", "RMSprop"

        # Create model with unified lr - automatically scaled per optimizer
        torch.manual_seed(42)
        model = LinearRegressor(input_dim=10, output_dim=1, l1=32,
                                num_hidden_layers=2, lr=lr_unified)
        optimizer = model.get_optimizer(optimizer_name)
        criterion = nn.MSELoss()

        # Train
        model.train()
        for epoch in range(30):
            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                predictions = model(batch_X)
                loss = criterion(predictions, batch_y)
                loss.backward()
                optimizer.step()

        # Evaluate
        model.eval()
        test_loss = 0.0
        with torch.no_grad():
            for batch_X, batch_y in test_loader:
                predictions = model(batch_X)
                test_loss += criterion(predictions, batch_y).item()

        avg_test_loss = test_loss / len(test_loader)
        results.append(avg_test_loss)

    return np.array(results)

# Optimize unified lr across different optimizers
spot_optimizer = SpotOptim(
    fun=train_model,
    bounds=[(-4, 0), ("Adam", "SGD", "RMSprop")],
    var_type=["float", "factor"],
    max_iter=10,  # Small for demo
    n_initial=5,
    seed=42
)
result = spot_optimizer.optimize()

print(f"\nBest unified lr: {10**result.x[0]:.6f}")
print(f"Best optimizer: {result.x[1]}")
print(f"Best test MSE: {result.fun:.4f}")

# Show actual learning rate used
from spotoptim.utils.mapping import map_lr
actual_lr = map_lr(10**result.x[0], result.x[1])
print(f"Actual {result.x[1]} learning rate: {actual_lr:.6f}")
Best unified lr: 0.003144
Best optimizer: SGD
Best test MSE: 4135.5200
Actual SGD learning rate: 0.000031
22.5.3 Hyperparameter Optimization with SpotOptim
Note: N_INITIAL and MAX_ITER are kept small for demonstration; increase them for real use.
from spotoptim import SpotOptim
from spotoptim.nn.linear_regressor import LinearRegressor
from spotoptim.data import get_diabetes_dataloaders
import torch.nn as nn
import torch
import numpy as np

MAX_ITER = 10
N_INITIAL = 5

def train_and_evaluate(X):
    """Objective function for hyperparameter optimization."""
    results = []
    # Load data once
    train_loader, test_loader, _ = get_diabetes_dataloaders(
        batch_size=32, random_state=42
    )

    for params in X:
        # Extract hyperparameters
        lr_unified = 10 ** params[0]   # Log scale
        optimizer_name = params[1]     # Factor variable
        l1 = int(params[2])            # Integer
        num_layers = int(params[3])    # Integer

        # Create model with unified learning rate
        model = LinearRegressor(
            input_dim=10,
            output_dim=1,
            l1=l1,
            num_hidden_layers=num_layers,
            lr=lr_unified  # Automatically mapped per optimizer
        )

        # Get optimizer (lr already mapped internally)
        if optimizer_name == "SGD":
            optimizer = model.get_optimizer(optimizer_name, momentum=0.9)
        else:
            optimizer = model.get_optimizer(optimizer_name)

        criterion = nn.MSELoss()

        # Train
        model.train()
        for epoch in range(30):
            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                predictions = model(batch_X)
                loss = criterion(predictions, batch_y)
                loss.backward()
                optimizer.step()

        # Evaluate
        model.eval()
        test_loss = 0.0
        with torch.no_grad():
            for batch_X, batch_y in test_loader:
                predictions = model(batch_X)
                test_loss += criterion(predictions, batch_y).item()

        avg_test_loss = test_loss / len(test_loader)
        results.append(avg_test_loss)

    return np.array(results)

# Optimize learning rate, optimizer choice, and architecture
optimizer = SpotOptim(
    fun=train_and_evaluate,
    bounds=[
        (-4, 0),                              # log10(lr_unified): [0.0001, 1.0]
        ("Adam", "SGD", "RMSprop", "AdamW"),  # Optimizer choice
        (16, 128),                            # Layer size
        (1, 3)                                # Number of hidden layers
    ],
    var_type=["float", "factor", "int", "int"],
    max_iter=MAX_ITER,
    n_initial=N_INITIAL,
    seed=42
)
result = optimizer.optimize()

# Display results
print("\nOptimization Results:")
print(f"Best unified lr: {10**result.x[0]:.6f}")
print(f"Best optimizer: {result.x[1]}")
print(f"Best layer size: {int(result.x[2])}")
print(f"Best num layers: {int(result.x[3])}")
print(f"Best test MSE: {result.fun:.4f}")

# Show actual learning rate used
from spotoptim.utils.mapping import map_lr
actual_lr = map_lr(10**result.x[0], result.x[1])
print(f"Actual {result.x[1]} learning rate: {actual_lr:.6f}")
Optimization Results:
Best unified lr: 0.305427
Best optimizer: RMSprop
Best layer size: 118
Best num layers: 2
Best test MSE: 2874.4881
Actual RMSprop learning rate: 0.003054
22.5.4 Log-Scale Hyperparameter Search
from spotoptim.utils.mapping import map_lr
import numpy as np

# Common pattern: sample unified lr from log scale
log_lr_range = np.linspace(-4, 0, 10)  # [-4, -3.56, ..., 0]
optimizers = ["Adam", "SGD", "RMSprop"]

print("Log-scale learning rate search:")
print()
print(f"{'log_lr':<10}{'unified_lr':<12}{'Adam':<12}{'SGD':<12}{'RMSprop':<12}")
print("-" * 60)

for log_lr in log_lr_range:
    lr_unified = 10 ** log_lr
    lr_adam = map_lr(lr_unified, "Adam")
    lr_sgd = map_lr(lr_unified, "SGD")
    lr_rmsprop = map_lr(lr_unified, "RMSprop")
    print(f"{log_lr:<10.2f}{lr_unified:<12.6f}{lr_adam:<12.8f} "
          f"{lr_sgd:<12.8f}{lr_rmsprop:<12.8f}")