← Back to Optimizers

# SGD

Type: Optimizer
Used in: 57 PRs
Avg BPB: 1.0885

## Submissions
- PR #272 by simon-marcus — 1.2427
- PR #301 by lookin-zz — 1.1807
- PR #338 by alertcat — 1.1254
- PR #397 by translatingthename — 1.1364
- PR #417 by EthanYangTW — 1.1227
- PR #461 by Christopher-Lee-McClendon — 1.1446
- PR #473 by abaybektursun — 1.1214
- PR #484 by Robby955 — 1.1185
- PR #508 by newjordan — 1.1215
- PR #526 by Christopher-Lee-McClendon — 1.1425
- PR #533 by newjordan — 1.1207
- PR #549 by abaybektursun — 1.1194 (RECORD)
- PR #589 by RoyiRa — 1.1178
- PR #610 by ChaosCodes — 1.1190
- PR #612 by Christopher-Lee-McClendon — 1.1079
- PR #628 by Christopher-Lee-McClendon — 1.0983
- PR #696 by gravelBridge — 1.2622
- PR #752 by Naazimsnh02 — 1.1182
- PR #885 by lolrazh — 0.9958
- PR #959 by himanalot — 0.0000
- PR #967 by dexhunter — 1.0450
- PR #995 by dexhunter — 1.0362
- PR #1014 by haimianbaobao007 — 1.6200
- PR #1084 by AnubhavBharadwaaj — 1.1185
- PR #1087 by Dhenenjay — 1.1407
- PR #1092 by teddyoweh — 1.1219
- PR #1150 by sahiee-dev — 1.1151
- PR #1185 by skoustav35 — 0.9641
- PR #1186 by andrewbaggio1 — 0.9850
- PR #1202 by VirajDeshwal — 1.1412
- PR #1209 by andrewbaggio1 — 1.1064
- PR #1222 by abaybektursun — 1.4707
- PR #1231 by nestamidavaine — 1.1163
- PR #1240 by andrewbaggio1 — 1.1064
- PR #1320 by jpfeiffe — 1.1196
- PR #1341 by himanshudongre — 1.1000
- PR #1370 by Christopher-Lee-McClendon — 1.0030
- PR #1457 by DilpreetBansi — 1.1454
- PR #1460 by resouer — 1.0827
- PR #1492 by bigbag — 1.0810
- PR #1493 by bigbag — 1.0810 (RECORD)
- PR #1515 by dexhunter — 1.0872
- PR #1532 by nogakeren — 1.0803
- PR #1561 by EthanYangTW — 1.0783
- PR #1572 by anthony-maio — 1.0797
- PR #1601 by SPThole — 1.1190
- PR #1605 by renqianluo — 0.2988
- PR #1610 by romeerp — 1.0729
- PR #1616 by Vickyrrrrrr — 1.4100
- PR #1638 by kunwar-vikrant — 1.0832
- PR #1695 by X-Abhishek-X — 1.0759
- PR #1696 by kings-crown — 1.1224
- PR #1711 by aamodbhatt — 1.0098
- PR #1715 by G3sparky — 1.0809
- PR #1727 by yahya010 — 1.0722
- PR #1731 by Victory963 — 1.0785
- PR #1747 by swapp1990 — 1.0820

## Hyperparameters Across PRs
| pr_number | weight_decay | momentum | other_params |
|---|---|---|---|
| 272 | — | — | {"matrix_lr":0.08,"scalar_lr":0.04,"embed_lr":0.05} |
| 301 | — | 0.9 | {"learning_rate":0.002} |
| 338 | — | 0.9 | {"used_for":"TTT fine-tuning"} |
| 397 | — | 0 | {"learning_rate":0.001,"rank_local":true} |
| 417 | — | — | {"phase":"TTT phase 2","learning_rate":0.005} |
| 461 | — | 0.9 | {"learning_rate":0.002,"epochs_per_chunk":3,"freeze_blocks":2} |
| 473 | — | 0.9 | {"used_for":"TTT adaptation","learning_rate":0.002,"epochs":3,"gradient_clip":1,"batch_size":32} |
| 484 | — | — | — |
| 508 | — | 0.9 | {"epochs_per_chunk":3,"grad_clip":1} |
| 526 | — | 0.9 | {"learning_rate":0.002} |
| 533 | — | — | {"lr":0.002} |
| 549 | — | 0.9 | {"learning_rate":0.002} |
| 589 | — | 0.9 | {"learning_rate":0.002,"cosine_decay":true,"grad_clip":1} |
| 610 | — | 0.9 | {"learning_rate":0.002,"lr_schedule":"cosine","epochs_per_chunk":3,"chunk_size_tokens":32768,"freeze_blocks":2,"score_first":true} |
| 612 | — | 0.9 | {"learning_rate":0.002,"epochs_per_chunk":10,"gradient_clip":1,"freeze_first_blocks":2} |
| 628 | 0.04 | 0.9 | {"learning_rate":0.002,"lr_schedule":"cosine decay with 5% warmup"} |
| 696 | — | 0.9 | {"learning_rate":0.002} |
| 752 | — | 0.9 | {"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"all_blocks_unfrozen":true} |
| 885 | — | 0.9 | {"grad_clip":1} |
| 959 | — | — | {"learning_rate":0.05} |
| 967 | — | 0.9 | {"learning_rate":0.002} |
| 995 | — | 0.95 | {"learning_rate":0.002} |
| 1014 | — | 0.95 | {"per_layer_lr":true} |
| 1084 | — | 0.9 | {"ttt_gradient_clip":1,"ttt_batch_seqs":32} |
| 1087 | — | 0.95 | {"learning_rate":0.005} |
| 1092 | — | 0.9 | {"ttt_learning_rate":0.03,"ttt_epochs":3,"grad_clip":1} |
| 1150 | — | — | {"test_time_training":true,"epochs":3} |
| 1185 | — | 0.9 | {"learning_rate":0.002} |
| 1186 | — | 0.9 | — |
| 1202 | — | 0.9 | {"lr":0.002,"cosine_decay":true,"grad_clip":1} |
| 1209 | — | 0.9 | {"used_for":"TTT"} |
| 1222 | — | — | {"inner_loop":true,"used_for":"test-time adaptation and FOMAML inner loop"} |
| 1231 | — | 0.9 | {"used_for_ttt":true} |
| 1240 | — | 0.9 | — |
| 1320 | — | 0.9 | {"learning_rate":0.002} |
| 1341 | — | 0.9 | {"epochs":3,"freeze_blocks":2,"grad_clip":1} |
| 1370 | — | 0.9 | {"learning_rate":0.002} |
| 1457 | 0.01 | 0.9 | {"grad_clip":1} |
| 1460 | — | 0.9 | {"lr":0.005} |
| 1492 | — | 0.9 | {"learning_rate":0.005,"epochs":3,"gradient_clipping":1} |
| 1493 | 0.095 | 0.9 | {"learning_rate":0.005} |
| 1515 | — | 0.9 | {"learning_rate":0.005} |
| 1532 | 0.095 | 0.9 | {"lr":0.005} |
| 1561 | — | 0.9 | {"lr":0.01} |
| 1572 | — | — | {"learning_rate":0.005} |
| 1601 | — | — | {"inner_loop":"SAM","meta_training":"FOMAML"} |
| 1605 | — | — | {"learning_rate":0.005} |
| 1610 | — | — | {"distributed":true} |
| 1616 | 0.095 | 0.9 | {"mlr":0.022} |
| 1638 | — | — | {"used_for_ttt":true,"reverted_from":"AdamW"} |
| 1695 | — | 0.9 | {"phased":true,"base_model_weight_updates":true} |
| 1696 | — | 0.9 | {"used_for":"TTT"} |
| 1711 | — | 0.9 | {"learning_rate":0.005} |
| 1715 | 0.095 | 0.9 | {"learning_rate":0.005,"epochs_per_chunk":3} |
| 1727 | — | — | {"matrix_lr":0.026} |
| 1731 | — | 0.9 | {"learning_rate":0.005,"epochs_per_chunk":3} |
| 1747 | — | 0.9 | {"learning_rate":0.005,"scope":"all-weights","chunk_size":2048} |