← Back to Optimizer
Parallel Muon
Optimizer
Used in
109 PRs
Best BPB
0.0830
Avg BPB
1.0253
Submissions
- PR #399 by abaybektursun — 1.1247
- PR #473 by abaybektursun — 1.1214
- PR #549 by abaybektursun (RECORD) — 1.1194
- PR #593 by abaybektursun — 1.1163
- PR #609 by saml212 — 1.1154
- PR #625 by Joeavaib — 1.1194
- PR #626 by kshitizz36 — 1.1180
- PR #634 by raahilshah — 1.1171
- PR #659 by deanbrr — 1.0920
- PR #667 by suchitj2702 — 1.1352
- PR #670 by abaybektursun — 1.1171
- PR #680 by bro4all — 1.1483
- PR #681 by Alfaxad — 1.4775
- PR #703 by Gusanidas — 1.1176
- PR #714 by Upsalla — 1.1187
- PR #720 by agalimova — 1.1078
- PR #726 by DeepReinforce — 1.1147
- PR #754 by aryanbhosale — 1.1253
- PR #768 by mradassaad — 1.1201
- PR #774 by travispchen — 0.9370
- PR #778 by raahilshah — 0.9605
- PR #785 by SirSaltySalmon — 1.5364
- PR #786 by shinegami-2002 — 0.8128
- PR #794 by jeremyschied — 1.3346
- PR #797 by armantsaturian — 0.8960
- PR #808 by Naazimsnh02 — 0.6364
- PR #816 by jimliu741523 — 1.1194
- PR #831 by sseanliu — 1.1284
- PR #836 by autocode-rayes — 1.1219
- PR #838 by aryanbhosale — 1.1215
- PR #840 by quietsmile — 0.2873
- PR #841 by someone114514 — 1.1157
- PR #843 by quietsmile — 0.2834
- PR #845 by rubenbalbastre — 1.1407
- PR #850 by callithyia — 0.3212
- PR #852 by Prush69 — 1.1189
- PR #864 by aryanbhosale — 0.2841
- PR #865 by aryanbhosale — 0.2841
- PR #952 by FlashyFlash3011 — 1.1144
- PR #953 by dexhunter — 1.0722
- PR #960 by ADIITJ — 1.1882
- PR #961 by callithyia — 0.0881
- PR #964 by vivekvar-dl — 1.3900
- PR #965 by Adam-Jacuch — 1.1184
- PR #977 by michaelwinczuk — 1.1185
- PR #986 by sofiabod — 0.0830
- PR #988 by ymrohit — 1.0857
- PR #1005 by OnlyJundong — 1.0853
- PR #1008 by monkeyKingProgrammer — 1.1538
- PR #1019 by abaybektursun (RECORD) — 1.1147
- PR #1031 by michaelwinczuk — 1.1185
- PR #1039 by yufengli-oai — 1.1184
- PR #1043 by okezue — 1.1261
- PR #1050 by Taleef7 — 1.1194
- PR #1069 by manfromnowhere143 — 1.1190
- PR #1070 by manfromnowhere143 — 1.1190
- PR #1072 by vimeto — 1.1170
- PR #1081 by michaelwinczuk — 1.1220
- PR #1094 by michaelwinczuk — 0.4027
- PR #1098 by adityakm24 — 1.1187
- PR #1099 by Bortlesboat — 1.1133
- PR #1105 by abaybektursun — 1.2208
- PR #1117 by adityakm24 — 1.1187
- PR #1118 by adityakm24 — 1.1187
- PR #1120 by newjordan — 1.1099
- PR #1122 by icryo — 1.1146
- PR #1130 by Gusanidas — 1.1140
- PR #1135 by barneywohl — 1.1116
- PR #1150 by sahiee-dev — 1.1151
- PR #1171 by EthanYangTW — 1.1145
- PR #1172 by dexhunter — 1.1015
- PR #1176 by bigbag — 1.0962
- PR #1179 by dexhunter — 1.1105
- PR #1182 by adityakm24 — 1.1227
- PR #1184 by icryo — 0.9485
- PR #1216 by SoHarshh — 1.1574
- PR #1217 by bigbag — 1.1027
- PR #1242 by Campbellb — 1.0903
- PR #1244 by monkeyKingProgrammer — 1.1443
- PR #1252 by ahmetdenizyilmaz — 1.0713
- PR #1269 by Jtss-ux — 1.1194
- PR #1274 by MatoTeziTanka — 1.0876
- PR #1278 by GitGeeks — 1.1147
- PR #1282 by newjordan — 1.1035
- PR #1284 by tyrel-beede — 1.1207
- PR #1298 by Omrigotlieb — 1.1043
- PR #1309 by cadenmcmann — 1.1143
- PR #1310 by cadenmcmann — 1.1177
- PR #1319 by canivel — 0.6951
- PR #1350 by resouer — 1.0046
- PR #1359 by LucasErcolano — 0.4188
- PR #1379 by LucasErcolano — 0.4162
- PR #1383 by nirmathur — 1.3151
- PR #1407 by OnlyJundong — 1.0960
- PR #1418 by Park-Tae-Hwan — 1.4192
- PR #1424 by OnlyJundong — 1.0858
- PR #1442 by akaiHuang — 1.1854
- PR #1452 by bsisduck — 0.3509
- PR #1454 by bsisduck — 0.3509
- PR #1467 by PhamPhuHoa-23 — 1.1056
- PR #1473 by AVINASH0052 — 1.1156
- PR #1520 by taka6745 — 1.0824
- PR #1523 by EthanYangTW — 1.0778
- PR #1561 by EthanYangTW — 1.0783
- PR #1568 by yuitokyouni — 1.1639
- PR #1621 by mrbese — 1.1531
- PR #1634 by arsenis-cmd — 1.1335
- PR #1666 by mrbese — 1.1531
- PR #1696 by kings-crown — 1.1224

Hyperparameters Across PRs
| pr_number | weight_decay | momentum | other_params |
|---|---|---|---|
| 399 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500,"warmdown_iters":3000,"matrix_lr":0.025,"scalar_lr":0.025,"tied_embed_lr":0.035} |
| 473 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500,"warmdown_iters":3500,"matrix_lr":0.025,"scalar_lr":0.025,"tied_embed_lr":0.035} |
| 549 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500,"matrix_lr":0.025,"scalar_lr":0.025,"tied_embed_lr":0.035} |
| 593 | 0.04 | 0.99 | {"parameter_banking":true,"async_reduce_scatter_all_gather":true} |
| 609 | — | — | — |
| 625 | 0.04 | — | {"post_backward_reduce_scatter":true,"local_NS5":true,"all_gather":true} |
| 626 | — | — | — |
| 634 | 0.04 | 0.99 | {"lr_matrices":0.025,"lr_embeddings":0.035,"Newton-Schulz_steps":5,"gradient_clip":0.3,"batch_tokens":786432,"seq_len":2048} |
| 659 | — | — | — |
| 667 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500} |
| 670 | — | — | — |
| 680 | 0.04 | 0.99 | {"matrix_lr":0.025,"scalar_lr":0.025,"tied_embed_lr":0.035,"momentum_warmup_start":0.92,"momentum_warmup_steps":1500} |
| 681 | — | — | — |
| 703 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500,"warmdown_iters":3500} |
| 714 | 0.04 | 0.99 | {"lr":0.025} |
| 720 | — | — | {"parameter_banking":true} |
| 726 | — | — | — |
| 754 | 0.04 | 0.92 | {"momentum_schedule":"0.92→0.99 over 1500 steps","newton_schulz_steps":5,"parameter_banking":true,"async_reduce_scatter_all_gather":true} |
| 768 | — | — | — |
| 774 | 0.04 | 0.99 | {"adam_weight_decay":0.04,"matrix_lr":0.025,"scalar_lr":0.025,"tied_embed_lr":0.035,"momentum_warmup_start":0.92,"momentum_warmup_steps":1500} |
| 778 | — | — | — |
| 785 | 0.04 | 0.99 | {"momentum_warmup_start":0.92,"momentum_warmup_steps":1500} |
| 786 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500,"adam_weight_decay":0.04} |
| 794 | — | — | {"matrix_lr":0.05,"muon_backend_steps":6,"muon_momentum_warmup_steps":300,"grad_clip_norm":1} |
| 797 | — | — | — |
| 808 | 0.04 | 0.99 | {"momentum_warmup_start":0.92,"momentum_warmup_steps":1500} |
| 816 | 0.04 | 0.99 | {"warmup_momentum":0.92,"warmup_steps":1500} |
| 831 | 0.04 | 0.99 | {"batched_banks":true} |
| 836 | 0.04 | — | — |
| 838 | 0.04 | 0.92 | {"momentum_schedule_end":0.99,"momentum_schedule_steps":1500,"newton_schulz_steps":5,"parameter_banking":true,"async_reduce_scatter_all_gather":true} |
| 840 | — | — | — |
| 841 | — | 0.99 | {"matrix_lr":0.025} |
| 843 | — | — | — |
| 845 | — | — | — |
| 850 | — | — | {"newton_schulz":5,"per_group_banking":true,"encoder_lr":0.025,"decoder_lr":0.05} |
| 852 | — | — | {"asynchronous_reduce_scatter":true,"asynchronous_all_gather":true,"orthogonalization":"Newton-Schulz 5","communication_computation_overlap":true} |
| 864 | — | — | {"parameter_banking":true,"batched_ns5":true} |
| 865 | — | — | {"parameter_banking":true,"batched_ns5":true} |
| 952 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500} |
| 953 | — | — | — |
| 960 | — | — | — |
| 961 | — | — | {"parameter_banking":true} |
| 964 | — | — | — |
| 965 | 0.04 | 0.99 | {"momentum_warmup_start":0.92,"momentum_warmup_steps":1500} |
| 977 | — | — | {"matrix_lr":0.027} |
| 986 | — | — | — |
| 988 | — | — | — |
| 1005 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":8350} |
| 1008 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500,"matrix_lr":0.025,"scalar_lr":0.025,"tied_embed_lr":0.035} |
| 1019 | — | — | — |
| 1031 | — | — | {"MATRIX_LR":0.027} |
| 1039 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500,"warmdown_iters":3500,"matrix_lr":0.025,"scalar_lr":0.025,"tied_embed_lr":0.035} |
| 1043 | — | — | — |
| 1050 | — | — | — |
| 1069 | — | — | — |
| 1070 | — | — | {"phases":3,"overlapped_comms":true} |
| 1072 | — | — | {"parameter_banking":true,"overlapped_reduce_scatter_all_gather":true,"ddp":false} |
| 1081 | — | — | — |
| 1094 | — | 0.99 | {"warmup_start_momentum":0.92} |
| 1098 | — | — | {"adamw":true} |
| 1099 | — | — | — |
| 1105 | — | — | {"turbo_variant":true,"aol_preconditioned":true,"iterations":4,"polar_express":true,"ns_variant":"NS4"} |
| 1117 | — | — | {"adamw":true} |
| 1118 | — | — | {"adamw":true} |
| 1120 | — | — | — |
| 1122 | — | — | {"ns_steps":4} |
| 1130 | — | — | {"split_early_late_lr":true,"matrix_lr_early":0.036,"matrix_lr_late":0.044,"scalar_lr_early":0.028,"scalar_lr_late":0.018} |
| 1135 | — | — | {"with_adamw_embeddings":true} |
| 1150 | — | — | {"adamw":true} |
| 1171 | 0.04 | — | {"ns":5,"lr":0.025} |
| 1172 | — | — | {"split_lr":true,"early_layers_lr":0.025,"late_layers_lr":0.03} |
| 1176 | — | — | {"embeddings_optimizer":"AdamW"} |
| 1179 | — | — | — |
| 1182 | — | — | {"newton_schulz_steps":5} |
| 1184 | — | — | — |
| 1216 | — | — | {"async_reduce_scatter":true,"no_ddp":true} |
| 1217 | — | — | {"MuonEq-R":true,"row_normalization_before_newton_schulz":true} |
| 1242 | 0.04 | 0.99 | {"matrix_lr":0.025,"scalar_lr":0.025,"tied_embed_lr":0.035} |
| 1244 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500} |
| 1252 | — | — | {"adam_split":true} |
| 1269 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500} |
| 1274 | 0.04 | 0.99 | {"momentum_warmup_start":0.92,"momentum_warmup_steps":1500} |
| 1278 | — | — | — |
| 1282 | — | — | — |
| 1284 | — | — | {"parameter_banking":true} |
| 1298 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500,"muon_backend_steps":4} |
| 1309 | — | — | — |
| 1310 | — | — | {"parameter_banking":true} |
| 1319 | 0.04 | 0.92 | {"lr":0.025,"momentum_schedule_end":0.99,"momentum_schedule_steps":1500} |
| 1350 | — | — | {"training":"used for base training"} |
| 1359 | — | — | — |
| 1379 | — | — | — |
| 1383 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500} |
| 1407 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":3340,"matrix_lr":0.025,"scalar_lr":0.025,"tied_embed_lr":0.035} |
| 1418 | — | — | {"row_normalization":true} |
| 1424 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":3340} |
| 1442 | — | — | {"adamw_scalars":true} |
| 1452 | — | — | {"newton_schulz":true} |
| 1454 | — | — | {"newton_schulz":true} |
| 1467 | — | — | — |
| 1473 | — | — | {"multi_gpu":true} |
| 1520 | — | — | {"batched_newton_schulz":true} |
| 1523 | 0.095 | 0.97 | {"lr":0.022} |
| 1561 | 0.095 | 0.97 | {"lr":0.022} |
| 1568 | — | — | {"batched_newton_schulz":true,"reduce_scatter_overlap":true} |
| 1621 | 0.095 | 0.99 | {"warmup_momentum_start":0.92,"warmup_steps":1500} |
| 1634 | — | — | {"adam":true} |
| 1666 | 0.095 | 0.99 | {"warmup_from":0.92,"warmup_steps":1500} |
| 1696 | — | — | {"muon_momentum_warmup_steps":1500} |