← Back to Optimizer

SGD

Optimizer
Used in
57 PRs
0
Avg BPB
1.0885

Submissions

PR #272 by simon-marcus
1.2427
PR #301 by lookin-zz
1.1807
PR #338 by alertcat
1.1254
PR #397 by translatingthename
1.1364
PR #417 by EthanYangTW
1.1227
PR #461 by Christopher-Lee-McClendon
1.1446
PR #473 by abaybektursun
1.1214
PR #484 by Robby955
1.1185
PR #508 by newjordan
1.1215
PR #526 by Christopher-Lee-McClendon
1.1425
PR #533 by newjordan
1.1207
PR #549 by abaybektursun [RECORD]
1.1194
PR #589 by RoyiRa
1.1178
PR #610 by ChaosCodes
1.1190
PR #612 by Christopher-Lee-McClendon
1.1079
PR #628 by Christopher-Lee-McClendon
1.0983
PR #696 by gravelBridge
1.2622
PR #752 by Naazimsnh02
1.1182
PR #885 by lolrazh
0.9958
PR #959 by himanalot
0.0000
PR #967 by dexhunter
1.0450
PR #995 by dexhunter
1.0362
PR #1014 by haimianbaobao007
1.6200
PR #1084 by AnubhavBharadwaaj
1.1185
PR #1087 by Dhenenjay
1.1407
PR #1092 by teddyoweh
1.1219
PR #1150 by sahiee-dev
1.1151
PR #1185 by skoustav35
0.9641
PR #1186 by andrewbaggio1
0.9850
PR #1202 by VirajDeshwal
1.1412
PR #1209 by andrewbaggio1
1.1064
PR #1222 by abaybektursun
1.4707
PR #1231 by nestamidavaine
1.1163
PR #1240 by andrewbaggio1
1.1064
PR #1320 by jpfeiffe
1.1196
PR #1341 by himanshudongre
1.1000
PR #1370 by Christopher-Lee-McClendon
1.0030
PR #1457 by DilpreetBansi
1.1454
PR #1460 by resouer
1.0827
PR #1492 by bigbag
1.0810
PR #1493 by bigbag [RECORD]
1.0810
PR #1515 by dexhunter
1.0872
PR #1532 by nogakeren
1.0803
PR #1561 by EthanYangTW
1.0783
PR #1572 by anthony-maio
1.0797
PR #1601 by SPThole
1.1190
PR #1605 by renqianluo
0.2988
PR #1610 by romeerp
1.0729
PR #1616 by Vickyrrrrrr
1.4100
PR #1638 by kunwar-vikrant
1.0832
PR #1695 by X-Abhishek-X
1.0759
PR #1696 by kings-crown
1.1224
PR #1711 by aamodbhatt
1.0098
PR #1715 by G3sparky
1.0809
PR #1727 by yahya010
1.0722
PR #1731 by Victory963
1.0785
PR #1747 by swapp1990
1.0820

Hyperparameters Across PRs

pr_number | weight_decay | momentum | other_params
272 | | | {"matrix_lr":0.08,"scalar_lr":0.04,"embed_lr":0.05}
301 | | 0.9 | {"learning_rate":0.002}
338 | | 0.9 | {"used_for":"TTT fine-tuning"}
397 | | 0 | {"learning_rate":0.001,"rank_local":true}
417 | | | {"phase":"TTT phase 2","learning_rate":0.005}
461 | | 0.9 | {"learning_rate":0.002,"epochs_per_chunk":3,"freeze_blocks":2}
473 | | 0.9 | {"used_for":"TTT adaptation","learning_rate":0.002,"epochs":3,"gradient_clip":1,"batch_size":32}
484 | | |
508 | | 0.9 | {"epochs_per_chunk":3,"grad_clip":1}
526 | | 0.9 | {"learning_rate":0.002}
533 | | | {"lr":0.002}
549 | | 0.9 | {"learning_rate":0.002}
589 | | 0.9 | {"learning_rate":0.002,"cosine_decay":true,"grad_clip":1}
610 | | 0.9 | {"learning_rate":0.002,"lr_schedule":"cosine","epochs_per_chunk":3,"chunk_size_tokens":32768,"freeze_blocks":2,"score_first":true}
612 | | 0.9 | {"learning_rate":0.002,"epochs_per_chunk":10,"gradient_clip":1,"freeze_first_blocks":2}
628 | 0.04 | 0.9 | {"learning_rate":0.002,"lr_schedule":"cosine decay with 5% warmup"}
696 | | 0.9 | {"learning_rate":0.002}
752 | | 0.9 | {"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"all_blocks_unfrozen":true}
885 | | 0.9 | {"grad_clip":1}
959 | | | {"learning_rate":0.05}
967 | | 0.9 | {"learning_rate":0.002}
995 | | 0.95 | {"learning_rate":0.002}
1014 | | 0.95 | {"per_layer_lr":true}
1084 | | 0.9 | {"ttt_gradient_clip":1,"ttt_batch_seqs":32}
1087 | | 0.95 | {"learning_rate":0.005}
1092 | | 0.9 | {"ttt_learning_rate":0.03,"ttt_epochs":3,"grad_clip":1}
1150 | | | {"test_time_training":true,"epochs":3}
1185 | | 0.9 | {"learning_rate":0.002}
1186 | | 0.9 |
1202 | | 0.9 | {"lr":0.002,"cosine_decay":true,"grad_clip":1}
1209 | | 0.9 | {"used_for":"TTT"}
1222 | | | {"inner_loop":true,"used_for":"test-time adaptation and FOMAML inner loop"}
1231 | | 0.9 | {"used_for_ttt":true}
1240 | | 0.9 |
1320 | | 0.9 | {"learning_rate":0.002}
1341 | | 0.9 | {"epochs":3,"freeze_blocks":2,"grad_clip":1}
1370 | | 0.9 | {"learning_rate":0.002}
1457 | 0.01 | 0.9 | {"grad_clip":1}
1460 | | 0.9 | {"lr":0.005}
1492 | | 0.9 | {"learning_rate":0.005,"epochs":3,"gradient_clipping":1}
1493 | 0.095 | 0.9 | {"learning_rate":0.005}
1515 | | 0.9 | {"learning_rate":0.005}
1532 | 0.095 | 0.9 | {"lr":0.005}
1561 | | 0.9 | {"lr":0.01}
1572 | | | {"learning_rate":0.005}
1601 | | | {"inner_loop":"SAM","meta_training":"FOMAML"}
1605 | | | {"learning_rate":0.005}
1610 | | | {"distributed":true}
1616 | 0.095 | 0.9 | {"mlr":0.022}
1638 | | | {"used_for_ttt":true,"reverted_from":"AdamW"}
1695 | | 0.9 | {"phased":true,"base_model_weight_updates":true}
1696 | | 0.9 | {"used_for":"TTT"}
1711 | | 0.9 | {"learning_rate":0.005}
1715 | 0.095 | 0.9 | {"learning_rate":0.005,"epochs_per_chunk":3}
1727 | | | {"matrix_lr":0.026}
1731 | | 0.9 | {"learning_rate":0.005,"epochs_per_chunk":3}
1747 | | 0.9 | {"learning_rate":0.005,"scope":"all-weights","chunk_size":2048}