← Back to Optimizers

# SGD

Type: Optimizer
Used in: 57 PRs
Avg BPB: 1.0885

## Submissions
- PR #272 by simon-marcus — 1.2427
- PR #301 by lookin-zz — 1.1807
- PR #338 by alertcat — 1.1254
- PR #397 by translatingthename — 1.1364
- PR #417 by EthanYangTW — 1.1227
- PR #461 by Christopher-Lee-McClendon — 1.1446
- PR #473 by abaybektursun — 1.1214
- PR #484 by Robby955 — 1.1185
- PR #508 by newjordan — 1.1215
- PR #526 by Christopher-Lee-McClendon — 1.1425
- PR #533 by newjordan — 1.1207
- PR #549 by abaybektursun — 1.1194 (RECORD)
- PR #589 by RoyiRa — 1.1178
- PR #610 by ChaosCodes — 1.1190
- PR #612 by Christopher-Lee-McClendon — 1.1079
- PR #628 by Christopher-Lee-McClendon — 1.0983
- PR #696 by gravelBridge — 1.2622
- PR #752 by Naazimsnh02 — 1.1182
- PR #885 by lolrazh — 0.9958
- PR #959 by himanalot — 0.0000
- PR #967 by dexhunter — 1.0450
- PR #995 by dexhunter — 1.0362
- PR #1014 by haimianbaobao007 — 1.6200
- PR #1084 by AnubhavBharadwaaj — 1.1185
- PR #1087 by Dhenenjay — 1.1407
- PR #1092 by teddyoweh — 1.1219
- PR #1150 by sahiee-dev — 1.1151
- PR #1185 by skoustav35 — 0.9641
- PR #1186 by andrewbaggio1 — 0.9850
- PR #1202 by VirajDeshwal — 1.1412
- PR #1209 by andrewbaggio1 — 1.1064
- PR #1222 by abaybektursun — 1.4707
- PR #1231 by nestamidavaine — 1.1163
- PR #1240 by andrewbaggio1 — 1.1064
- PR #1320 by jpfeiffe — 1.1196
- PR #1341 by himanshudongre — 1.1000
- PR #1370 by Christopher-Lee-McClendon — 1.0030
- PR #1457 by DilpreetBansi — 1.1454
- PR #1460 by resouer — 1.0827
- PR #1492 by bigbag — 1.0810
- PR #1493 by bigbag — 1.0810 (RECORD)
- PR #1515 by dexhunter — 1.0872
- PR #1532 by nogakeren — 1.0803
- PR #1561 by EthanYangTW — 1.0783
- PR #1572 by anthony-maio — 1.0797
- PR #1601 by SPThole — 1.1190
- PR #1605 by renqianluo — 0.2988
- PR #1610 by romeerp — 1.0729
- PR #1616 by Vickyrrrrrr — 1.4100
- PR #1638 by kunwar-vikrant — 1.0832
- PR #1695 by X-Abhishek-X — 1.0759
- PR #1696 by kings-crown — 1.1224
- PR #1711 by aamodbhatt — 1.0098
- PR #1715 by G3sparky — 1.0809
- PR #1727 by yahya010 — 1.0722
- PR #1731 by Victory963 — 1.0785
- PR #1747 by swapp1990 — 1.0820

## Hyperparameters Across PRs
| pr_number | weight_decay | momentum | other_params |
|---|---|---|---|
| 272 | — | — | {"matrix_lr":0.08,"scalar_lr":0.04,"embed_lr":0.05} |
| 301 | — | 0.9 | {"learning_rate":0.002} |
| 338 | — | 0.9 | {"used_for":"TTT fine-tuning"} |
| 397 | — | 0 | {"learning_rate":0.001,"rank_local":true} |
| 417 | — | — | {"phase":"TTT phase 2","learning_rate":0.005} |
| 461 | — | 0.9 | {"learning_rate":0.002,"epochs_per_chunk":3,"freeze_blocks":2} |
| 473 | — | 0.9 | {"used_for":"TTT adaptation","learning_rate":0.002,"epochs":3,"gradient_clip":1,"batch_size":32} |
| 484 | — | — | — |
| 508 | — | 0.9 | {"epochs_per_chunk":3,"grad_clip":1} |
| 526 | — | 0.9 | {"learning_rate":0.002} |
| 533 | — | — | {"lr":0.002} |
| 549 | — | 0.9 | {"learning_rate":0.002} |
| 589 | — | 0.9 | {"learning_rate":0.002,"cosine_decay":true,"grad_clip":1} |
| 610 | — | 0.9 | {"learning_rate":0.002,"lr_schedule":"cosine","epochs_per_chunk":3,"chunk_size_tokens":32768,"freeze_blocks":2,"score_first":true} |
| 612 | — | 0.9 | {"learning_rate":0.002,"epochs_per_chunk":10,"gradient_clip":1,"freeze_first_blocks":2} |
| 628 | 0.04 | 0.9 | {"learning_rate":0.002,"lr_schedule":"cosine decay with 5% warmup"} |
| 696 | — | 0.9 | {"learning_rate":0.002} |
| 752 | — | 0.9 | {"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"all_blocks_unfrozen":true} |
| 885 | — | 0.9 | {"grad_clip":1} |
| 959 | — | — | {"learning_rate":0.05} |
| 967 | — | 0.9 | {"learning_rate":0.002} |
| 995 | — | 0.95 | {"learning_rate":0.002} |
| 1014 | — | 0.95 | {"per_layer_lr":true} |
| 1084 | — | 0.9 | {"ttt_gradient_clip":1,"ttt_batch_seqs":32} |
| 1087 | — | 0.95 | {"learning_rate":0.005} |
| 1092 | — | 0.9 | {"ttt_learning_rate":0.03,"ttt_epochs":3,"grad_clip":1} |
| 1150 | — | — | {"test_time_training":true,"epochs":3} |
| 1185 | — | 0.9 | {"learning_rate":0.002} |
| 1186 | — | 0.9 | — |
| 1202 | — | 0.9 | {"lr":0.002,"cosine_decay":true,"grad_clip":1} |
| 1209 | — | 0.9 | {"used_for":"TTT"} |
| 1222 | — | — | {"inner_loop":true,"used_for":"test-time adaptation and FOMAML inner loop"} |
| 1231 | — | 0.9 | {"used_for_ttt":true} |
| 1240 | — | 0.9 | — |
| 1320 | — | 0.9 | {"learning_rate":0.002} |
| 1341 | — | 0.9 | {"epochs":3,"freeze_blocks":2,"grad_clip":1} |
| 1370 | — | 0.9 | {"learning_rate":0.002} |
| 1457 | 0.01 | 0.9 | {"grad_clip":1} |
| 1460 | — | 0.9 | {"lr":0.005} |
| 1492 | — | 0.9 | {"learning_rate":0.005,"epochs":3,"gradient_clipping":1} |
| 1493 | 0.095 | 0.9 | {"learning_rate":0.005} |
| 1515 | — | 0.9 | {"learning_rate":0.005} |
| 1532 | 0.095 | 0.9 | {"lr":0.005} |
| 1561 | — | 0.9 | {"lr":0.01} |
| 1572 | — | — | {"learning_rate":0.005} |
| 1601 | — | — | {"inner_loop":"SAM","meta_training":"FOMAML"} |
| 1605 | — | — | {"learning_rate":0.005} |
| 1610 | — | — | {"distributed":true} |
| 1616 | 0.095 | 0.9 | {"mlr":0.022} |
| 1638 | — | — | {"used_for_ttt":true,"reverted_from":"AdamW"} |
| 1695 | — | 0.9 | {"phased":true,"base_model_weight_updates":true} |
| 1696 | — | 0.9 | {"used_for":"TTT"} |
| 1711 | — | 0.9 | {"learning_rate":0.005} |
| 1715 | 0.095 | 0.9 | {"learning_rate":0.005,"epochs_per_chunk":3} |
| 1727 | — | — | {"matrix_lr":0.026} |
| 1731 | — | 0.9 | {"learning_rate":0.005,"epochs_per_chunk":3} |
| 1747 | — | 0.9 | {"learning_rate":0.005,"scope":"all-weights","chunk_size":2048} |