← Back to Optimizer

Parallel Muon

Optimizer
Used in
109 PRs
Best BPB
0.0830
Avg BPB
1.0253

Submissions

PR #399 by abaybektursun
1.1247
PR #473 by abaybektursun
1.1214
PR #549 by abaybektursun (RECORD)
1.1194
PR #593 by abaybektursun
1.1163
PR #609 by saml212
1.1154
PR #625 by Joeavaib
1.1194
PR #626 by kshitizz36
1.1180
PR #634 by raahilshah
1.1171
PR #659 by deanbrr
1.0920
PR #667 by suchitj2702
1.1352
PR #670 by abaybektursun
1.1171
PR #680 by bro4all
1.1483
PR #681 by Alfaxad
1.4775
PR #703 by Gusanidas
1.1176
PR #714 by Upsalla
1.1187
PR #720 by agalimova
1.1078
PR #726 by DeepReinforce
1.1147
PR #754 by aryanbhosale
1.1253
PR #768 by mradassaad
1.1201
PR #774 by travispchen
0.9370
PR #778 by raahilshah
0.9605
PR #785 by SirSaltySalmon
1.5364
PR #786 by shinegami-2002
0.8128
PR #794 by jeremyschied
1.3346
PR #797 by armantsaturian
0.8960
PR #808 by Naazimsnh02
0.6364
PR #816 by jimliu741523
1.1194
PR #831 by sseanliu
1.1284
PR #836 by autocode-rayes
1.1219
PR #838 by aryanbhosale
1.1215
PR #840 by quietsmile
0.2873
PR #841 by someone114514
1.1157
PR #843 by quietsmile
0.2834
PR #845 by rubenbalbastre
1.1407
PR #850 by callithyia
0.3212
PR #852 by Prush69
1.1189
PR #864 by aryanbhosale
0.2841
PR #865 by aryanbhosale
0.2841
PR #952 by FlashyFlash3011
1.1144
PR #953 by dexhunter
1.0722
PR #960 by ADIITJ
1.1882
PR #961 by callithyia
0.0881
PR #964 by vivekvar-dl
1.3900
PR #965 by Adam-Jacuch
1.1184
PR #977 by michaelwinczuk
1.1185
PR #986 by sofiabod
0.0830
PR #988 by ymrohit
1.0857
PR #1005 by OnlyJundong
1.0853
PR #1008 by monkeyKingProgrammer
1.1538
PR #1019 by abaybektursun (RECORD)
1.1147
PR #1031 by michaelwinczuk
1.1185
PR #1039 by yufengli-oai
1.1184
PR #1043 by okezue
1.1261
PR #1050 by Taleef7
1.1194
PR #1069 by manfromnowhere143
1.1190
PR #1070 by manfromnowhere143
1.1190
PR #1072 by vimeto
1.1170
PR #1081 by michaelwinczuk
1.1220
PR #1094 by michaelwinczuk
0.4027
PR #1098 by adityakm24
1.1187
PR #1099 by Bortlesboat
1.1133
PR #1105 by abaybektursun
1.2208
PR #1117 by adityakm24
1.1187
PR #1118 by adityakm24
1.1187
PR #1120 by newjordan
1.1099
PR #1122 by icryo
1.1146
PR #1130 by Gusanidas
1.1140
PR #1135 by barneywohl
1.1116
PR #1150 by sahiee-dev
1.1151
PR #1171 by EthanYangTW
1.1145
PR #1172 by dexhunter
1.1015
PR #1176 by bigbag
1.0962
PR #1179 by dexhunter
1.1105
PR #1182 by adityakm24
1.1227
PR #1184 by icryo
0.9485
PR #1216 by SoHarshh
1.1574
PR #1217 by bigbag
1.1027
PR #1242 by Campbellb
1.0903
PR #1244 by monkeyKingProgrammer
1.1443
PR #1252 by ahmetdenizyilmaz
1.0713
PR #1269 by Jtss-ux
1.1194
PR #1274 by MatoTeziTanka
1.0876
PR #1278 by GitGeeks
1.1147
PR #1282 by newjordan
1.1035
PR #1284 by tyrel-beede
1.1207
PR #1298 by Omrigotlieb
1.1043
PR #1309 by cadenmcmann
1.1143
PR #1310 by cadenmcmann
1.1177
PR #1319 by canivel
0.6951
PR #1350 by resouer
1.0046
PR #1359 by LucasErcolano
0.4188
PR #1379 by LucasErcolano
0.4162
PR #1383 by nirmathur
1.3151
PR #1407 by OnlyJundong
1.0960
PR #1418 by Park-Tae-Hwan
1.4192
PR #1424 by OnlyJundong
1.0858
PR #1442 by akaiHuang
1.1854
PR #1452 by bsisduck
0.3509
PR #1454 by bsisduck
0.3509
PR #1467 by PhamPhuHoa-23
1.1056
PR #1473 by AVINASH0052
1.1156
PR #1520 by taka6745
1.0824
PR #1523 by EthanYangTW
1.0778
PR #1561 by EthanYangTW
1.0783
PR #1568 by yuitokyouni
1.1639
PR #1621 by mrbese
1.1531
PR #1634 by arsenis-cmd
1.1335
PR #1666 by mrbese
1.1531
PR #1696 by kings-crown
1.1224

Hyperparameters Across PRs

| pr_number | weight_decay | momentum | other_params |
| --- | --- | --- | --- |
| 399 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500,"warmdown_iters":3000,"matrix_lr":0.025,"scalar_lr":0.025,"tied_embed_lr":0.035} |
| 473 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500,"warmdown_iters":3500,"matrix_lr":0.025,"scalar_lr":0.025,"tied_embed_lr":0.035} |
| 549 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500,"matrix_lr":0.025,"scalar_lr":0.025,"tied_embed_lr":0.035} |
| 593 | 0.04 | 0.99 | {"parameter_banking":true,"async_reduce_scatter_all_gather":true} |
| 609 |  |  |  |
| 625 | 0.04 |  | {"post_backward_reduce_scatter":true,"local_NS5":true,"all_gather":true} |
| 626 |  |  |  |
| 634 | 0.04 | 0.99 | {"lr_matrices":0.025,"lr_embeddings":0.035,"Newton-Schulz_steps":5,"gradient_clip":0.3,"batch_tokens":786432,"seq_len":2048} |
| 659 |  |  |  |
| 667 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500} |
| 670 |  |  |  |
| 680 | 0.04 | 0.99 | {"matrix_lr":0.025,"scalar_lr":0.025,"tied_embed_lr":0.035,"momentum_warmup_start":0.92,"momentum_warmup_steps":1500} |
| 681 |  |  |  |
| 703 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500,"warmdown_iters":3500} |
| 714 | 0.04 | 0.99 | {"lr":0.025} |
| 720 |  |  | {"parameter_banking":true} |
| 726 |  |  |  |
| 754 | 0.04 | 0.92 | {"momentum_schedule":"0.92→0.99 over 1500 steps","newton_schulz_steps":5,"parameter_banking":true,"async_reduce_scatter_all_gather":true} |
| 768 |  |  |  |
| 774 | 0.04 | 0.99 | {"adam_weight_decay":0.04,"matrix_lr":0.025,"scalar_lr":0.025,"tied_embed_lr":0.035,"momentum_warmup_start":0.92,"momentum_warmup_steps":1500} |
| 778 |  |  |  |
| 785 | 0.04 | 0.99 | {"momentum_warmup_start":0.92,"momentum_warmup_steps":1500} |
| 786 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500,"adam_weight_decay":0.04} |
| 794 |  |  | {"matrix_lr":0.05,"muon_backend_steps":6,"muon_momentum_warmup_steps":300,"grad_clip_norm":1} |
| 797 |  |  |  |
| 808 | 0.04 | 0.99 | {"momentum_warmup_start":0.92,"momentum_warmup_steps":1500} |
| 816 | 0.04 | 0.99 | {"warmup_momentum":0.92,"warmup_steps":1500} |
| 831 | 0.04 | 0.99 | {"batched_banks":true} |
| 836 | 0.04 |  |  |
| 838 | 0.04 | 0.92 | {"momentum_schedule_end":0.99,"momentum_schedule_steps":1500,"newton_schulz_steps":5,"parameter_banking":true,"async_reduce_scatter_all_gather":true} |
| 840 |  |  |  |
| 841 |  | 0.99 | {"matrix_lr":0.025} |
| 843 |  |  |  |
| 845 |  |  |  |
| 850 |  |  | {"newton_schulz":5,"per_group_banking":true,"encoder_lr":0.025,"decoder_lr":0.05} |
| 852 |  |  | {"asynchronous_reduce_scatter":true,"asynchronous_all_gather":true,"orthogonalization":"Newton-Schulz 5","communication_computation_overlap":true} |
| 864 |  |  | {"parameter_banking":true,"batched_ns5":true} |
| 865 |  |  | {"parameter_banking":true,"batched_ns5":true} |
| 952 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500} |
| 953 |  |  |  |
| 960 |  |  |  |
| 961 |  |  | {"parameter_banking":true} |
| 964 |  |  |  |
| 965 | 0.04 | 0.99 | {"momentum_warmup_start":0.92,"momentum_warmup_steps":1500} |
| 977 |  |  | {"matrix_lr":0.027} |
| 986 |  |  |  |
| 988 |  |  |  |
| 1005 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":8350} |
| 1008 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500,"matrix_lr":0.025,"scalar_lr":0.025,"tied_embed_lr":0.035} |
| 1019 |  |  |  |
| 1031 |  |  | {"MATRIX_LR":0.027} |
| 1039 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500,"warmdown_iters":3500,"matrix_lr":0.025,"scalar_lr":0.025,"tied_embed_lr":0.035} |
| 1043 |  |  |  |
| 1050 |  |  |  |
| 1069 |  |  |  |
| 1070 |  |  | {"phases":3,"overlapped_comms":true} |
| 1072 |  |  | {"parameter_banking":true,"overlapped_reduce_scatter_all_gather":true,"ddp":false} |
| 1081 |  |  |  |
| 1094 |  | 0.99 | {"warmup_start_momentum":0.92} |
| 1098 |  |  | {"adamw":true} |
| 1099 |  |  |  |
| 1105 |  |  | {"turbo_variant":true,"aol_preconditioned":true,"iterations":4,"polar_express":true,"ns_variant":"NS4"} |
| 1117 |  |  | {"adamw":true} |
| 1118 |  |  | {"adamw":true} |
| 1120 |  |  |  |
| 1122 |  |  | {"ns_steps":4} |
| 1130 |  |  | {"split_early_late_lr":true,"matrix_lr_early":0.036,"matrix_lr_late":0.044,"scalar_lr_early":0.028,"scalar_lr_late":0.018} |
| 1135 |  |  | {"with_adamw_embeddings":true} |
| 1150 |  |  | {"adamw":true} |
| 1171 | 0.04 |  | {"ns":5,"lr":0.025} |
| 1172 |  |  | {"split_lr":true,"early_layers_lr":0.025,"late_layers_lr":0.03} |
| 1176 |  |  | {"embeddings_optimizer":"AdamW"} |
| 1179 |  |  |  |
| 1182 |  |  | {"newton_schulz_steps":5} |
| 1184 |  |  |  |
| 1216 |  |  | {"async_reduce_scatter":true,"no_ddp":true} |
| 1217 |  |  | {"MuonEq-R":true,"row_normalization_before_newton_schulz":true} |
| 1242 | 0.04 | 0.99 | {"matrix_lr":0.025,"scalar_lr":0.025,"tied_embed_lr":0.035} |
| 1244 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500} |
| 1252 |  |  | {"adam_split":true} |
| 1269 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500} |
| 1274 | 0.04 | 0.99 | {"momentum_warmup_start":0.92,"momentum_warmup_steps":1500} |
| 1278 |  |  |  |
| 1282 |  |  |  |
| 1284 |  |  | {"parameter_banking":true} |
| 1298 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500,"muon_backend_steps":4} |
| 1309 |  |  |  |
| 1310 |  |  | {"parameter_banking":true} |
| 1319 | 0.04 | 0.92 | {"lr":0.025,"momentum_schedule_end":0.99,"momentum_schedule_steps":1500} |
| 1350 |  |  | {"training":"used for base training"} |
| 1359 |  |  |  |
| 1379 |  |  |  |
| 1383 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":1500} |
| 1407 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":3340,"matrix_lr":0.025,"scalar_lr":0.025,"tied_embed_lr":0.035} |
| 1418 |  |  | {"row_normalization":true} |
| 1424 | 0.04 | 0.99 | {"muon_momentum_warmup_start":0.92,"muon_momentum_warmup_steps":3340} |
| 1442 |  |  | {"adamw_scalars":true} |
| 1452 |  |  | {"newton_schulz":true} |
| 1454 |  |  | {"newton_schulz":true} |
| 1467 |  |  |  |
| 1473 |  |  | {"multi_gpu":true} |
| 1520 |  |  | {"batched_newton_schulz":true} |
| 1523 | 0.095 | 0.97 | {"lr":0.022} |
| 1561 | 0.095 | 0.97 | {"lr":0.022} |
| 1568 |  |  | {"batched_newton_schulz":true,"reduce_scatter_overlap":true} |
| 1621 | 0.095 | 0.99 | {"warmup_momentum_start":0.92,"warmup_steps":1500} |
| 1634 |  |  | {"adam":true} |
| 1666 | 0.095 | 0.99 | {"warmup_from":0.92,"warmup_steps":1500} |
| 1696 |  |  | {"muon_momentum_warmup_steps":1500} |