← Back to Test-Time Training

score-first TTT

Test-Time Training
Used in
265 PRs
Best BPB
0.0274
Avg BPB
1.0166

Submissions

PR #267by andrewgcodes
1.1374
PR #461by Christopher-Lee-McClendon
1.1446
PR #473by abaybektursun
1.1214
PR #503by EthanYangTW
1.1195
PR #508by newjordan
1.1215
PR #526by Christopher-Lee-McClendon
1.1425
PR #528by EthanYangTW
1.1195
PR #529by EthanYangTW
1.1195
PR #537by Christopher-Lee-McClendon
1.1387
PR #545by EthanYangTW
1.1179
PR #549by abaybektursunRECORD
1.1194
PR #576by cmcdnd
1.1164
PR #585by EthanYangTW
1.1179
PR #589by RoyiRa
1.1178
PR #598by Christopher-Lee-McClendon
1.1334
PR #606by EthanYangTW
1.1162
PR #610by ChaosCodes
1.1190
PR #612by Christopher-Lee-McClendon
1.1079
PR #615by danialht
1.1169
PR #625by Joeavaib
1.1194
PR #628by Christopher-Lee-McClendon
1.0983
PR #644by Christopher-Lee-McClendon
1.0944
PR #653by demirelo
1.1552
PR #656by newjordan
1.1190
PR #668by Christopher-Lee-McClendon
1.0920
PR #670by abaybektursun
1.1171
PR #680by bro4all
1.1483
PR #688by RoyiRa
1.0745
PR #691by xexyz
1.0988
PR #697by Danishlynx
1.1194
PR #700by RoyiRa
1.0541
PR #706by newjordan
1.0461
PR #714by Upsalla
1.1187
PR #720by agalimova
1.1078
PR #726by DeepReinforce
1.1147
PR #731by pentxayc
1.0400
PR #733by stukenov
1.0278
PR #734by Robby955
1.1198
PR #745by stukenov
1.0222
PR #752by Naazimsnh02
1.1182
PR #753by newjordan
0.9625
PR #754by aryanbhosale
1.1253
PR #756by abaybektursun
1.1142
PR #758by hypery11
1.0465
PR #761by Asukabot0
0.9581
PR #764by ndokutovich
0.9633
PR #768by mradassaad
1.1201
PR #772by abaybektursun
1.3055
PR #778by raahilshah
0.9605
PR #779by deanbrr
0.6683
PR #792by xexyz
1.0340
PR #795by hypery11
0.8881
PR #798by travispchen
0.5466
PR #803by pentxayc
0.4416
PR #809by AayushBaniya2006
0.2952
PR #811by quietsmile
0.4377
PR #813by hypery11
0.6671
PR #814by newjordan
0.4820
PR #816by jimliu741523
1.1194
PR #825by hypery11
0.5440
PR #826by himanshudongre
0.2951
PR #830by zlxi02
1.4096
PR #831by sseanliu
1.1284
PR #834by AnirudhRahul
0.1663
PR #836by autocode-rayes
1.1219
PR #838by aryanbhosale
1.1215
PR #846by himanshudongre
0.1434
PR #849by dttdrv
1.1105
PR #851by RoyiRa
0.2071
PR #861by JoeProAI
1.1326
PR #869by THUQiXuan
0.1290
PR #870by simon-marcus
0.0935
PR #872by gowtham0992
1.0467
PR #873by gowtham0992
1.0467
PR #880by RoyiRa
0.1003
PR #883by THUQiXuan
0.0308
PR #885by lolrazh
0.9958
PR #886by abaybektursun
0.3779
PR #887by anthony-maio
0.9642
PR #891by robbiebusinessacc
1.1428
PR #892by robbiebusinessacc
1.1428
PR #893by aryanbhosale
0.1310
PR #913by RoyiRa
0.0887
PR #914by mkenney2
1.1873
PR #916by Bortlesboat
0.3461
PR #924by THUQiXuan
0.0280
PR #925by THUQiXuan
0.0281
PR #927by Tonyy1977
1.1696
PR #931by AnirudhRahul
0.0498
PR #940by antaloaalonso
0.9581
PR #945by TimPietrusky
0.0274
PR #947by aamodbhatt
1.1576
PR #952by FlashyFlash3011
1.1144
PR #953by dexhunter
1.0722
PR #958by shouryamaanjain
1.1382
PR #967by dexhunter
1.0450
PR #972by Idan3011
0.3922
PR #988by ymrohit
1.0857
PR #991by ibarrajo
1.1145
PR #995by dexhunter
1.0362
PR #999by aamodbhatt
1.1179
PR #1001by ibarrajo
1.1188
PR #1004by ibarrajo
1.1182
PR #1008by monkeyKingProgrammer
1.1538
PR #1014by haimianbaobao007
1.6200
PR #1016by ADIITJ
1.1269
PR #1027by Syed-M-Zeeshan
1.3036
PR #1037by TimPietruskyRunPod
1.1179
PR #1039by yufengli-oai
1.1184
PR #1040by JoeProAI
1.1336
PR #1041by JoeProAI
1.1356
PR #1057by Programmerryoki
1.2201
PR #1066by adityakm24
1.1259
PR #1069by manfromnowhere143
1.1190
PR #1070by manfromnowhere143
1.1190
PR #1081by michaelwinczuk
1.1220
PR #1084by AnubhavBharadwaaj
1.1185
PR #1087by Dhenenjay
1.1407
PR #1092by teddyoweh
1.1219
PR #1098by adityakm24
1.1187
PR #1117by adityakm24
1.1187
PR #1118by adityakm24
1.1187
PR #1123by sisegod
1.1986
PR #1124by NewyorkDev
1.1194
PR #1128by AnubhavBharadwaaj
1.1154
PR #1129by EthanYangTW
1.1174
PR #1143by simon-marcus
1.0806
PR #1148by aamodbhatt
1.1179
PR #1150by sahiee-dev
1.1151
PR #1156by haikosys
1.1161
PR #1159by JDAppleseed
0.3693
PR #1170by Christopher-Lee-McClendon
1.1199
PR #1171by EthanYangTW
1.1145
PR #1172by dexhunter
1.1015
PR #1176by bigbag
1.0962
PR #1182by adityakm24
1.1227
PR #1185by skoustav35
0.9641
PR #1202by VirajDeshwal
1.1412
PR #1209by andrewbaggio1
1.1064
PR #1217by bigbag
1.1027
PR #1222by abaybektursun
1.4707
PR #1227by himanshudongre
1.4841
PR #1229by resouer
0.9300
PR #1230by nestamidavaine
1.1163
PR #1231by nestamidavaine
1.1163
PR #1232by Christopher-Lee-McClendon
1.0929
PR #1233by ibarrajo
1.1460
PR #1234by ibarrajo
1.1461
PR #1236by ibarrajo
1.1179
PR #1237by ibarrajo
1.1198
PR #1238by ibarrajo
1.1521
PR #1239by tmancino
1.5918
PR #1240by andrewbaggio1
1.1064
PR #1242by Campbellb
1.0903
PR #1244by monkeyKingProgrammer
1.1443
PR #1245by mkenney2
1.1470
PR #1248by ibarrajo
1.1264
PR #1252by ahmetdenizyilmaz
1.0713
PR #1263by xexyz
0.9354
PR #1269by Jtss-ux
1.1194
PR #1272by andrewbaggio1
1.1100
PR #1274by MatoTeziTanka
1.0876
PR #1276by BiggerDABOSS
1.1100
PR #1280by aamodbhatt
1.1156
PR #1284by tyrel-beede
1.1207
PR #1284by tyrel-beede
1.1207
PR #1289by MatoTeziTanka
1.0819
PR #1303by anthony-maio
0.9462
PR #1313by anthony-maio
0.8637
PR #1319by canivel
0.6951
PR #1320by jpfeiffe
1.1196
PR #1322by newjordan
1.0854
PR #1325by monisha-max
1.3868
PR #1326by aryanbhosale
1.0896
PR #1333by aryanbhosale
1.0766
PR #1338by bigbag
1.0955
PR #1339by bigbag
1.0955
PR #1370by Christopher-Lee-McClendon
1.0030
PR #1376by stukenov
0.7094
PR #1379by LucasErcolano
0.4162
PR #1399by AnubhavBharadwaaj
1.0898
PR #1406by aamodbhatt
1.0887
PR #1410by izlley
1.1158
PR #1413by dexhunterRECORD
1.0828
PR #1425by dentity007
1.4479
PR #1431by Idan3011
1.1266
PR #1437by dexhunter
1.0780
PR #1440by Mertyandimata
1.1026
PR #1456by sisegod
1.1465
PR #1460by resouer
1.0827
PR #1465by sisegod
1.1381
PR #1476by aryan-cs
1.0842
PR #1477by aryanbhosaleRECORD
1.0822
PR #1492by bigbag
1.0810
PR #1493by bigbagRECORD
1.0810
PR #1501by SPThole
1.1159
PR #1502by SPThole
1.1147
PR #1514by dexhunter
1.0798
PR #1515by dexhunter
1.0872
PR #1516by dexhunter
1.0805
PR #1518by abaybektursun
1.0788
PR #1520by taka6745
1.0824
PR #1521by aryanbhosale
1.0802
PR #1523by EthanYangTW
1.0778
PR #1530by samacqua
1.0764
PR #1532by nogakeren
1.0803
PR #1533by aryanbhosale
1.0790
PR #1535by newjordan
1.0742
PR #1537by pireylow
1.3971
PR #1541by bigbag
1.0778
PR #1546by SPThole
1.0850
PR #1555by andrewbaggio1
1.0764
PR #1557by ndokutovich
1.0773
PR #1561by EthanYangTW
1.0783
PR #1565by Idan3011
1.1036
PR #1572by anthony-maio
1.0797
PR #1579by Tonyy1977
1.1372
PR #1583by codemath3000
1.0801
PR #1584by codemath3000
1.0752
PR #1585by codemath3000
1.0639
PR #1598by amrayach
1.0813
PR #1605by renqianluo
0.2988
PR #1616by Vickyrrrrrr
1.4100
PR #1626by dexhunter
1.0719
PR #1628by yu314-coder
1.1921
PR #1629by channyzf6
1.0829
PR #1639by kunwar-vikrant
1.0832
PR #1643by mradassaad
1.1473
PR #1644by mradassaad
1.1473
PR #1645by scottcui-georgian
1.1131
PR #1646by sergeevii123
1.0909
PR #1647by powerpratik
1.0616
PR #1655by himanalot
1.1135
PR #1667by MarioPaerle
1.0714
PR #1670by dexhunter
1.0597
PR #1672by andrewbaggio1
1.0119
PR #1679by ChideraIbe123
0.7625
PR #1688by Buld1n
1.0809
PR #1693by dexhunter
1.0573
PR #1696by kings-crown
1.1224
PR #1698by arsenis-cmd
1.0099
PR #1700by jorge-asenjo
1.0722
PR #1706by aamodbhatt
1.0815
PR #1711by aamodbhatt
1.0098
PR #1715by G3sparky
1.0809
PR #1716by himanshudongre
1.0788
PR #1718by himanshudongre
1.0788
PR #1722by deborahnelson8788726
0.6580
PR #1727by yahya010
1.0722
PR #1729by romeerp
1.0678
PR #1730by N10ELabs
1.0845
PR #1731by Victory963
1.0785
PR #1732by Victory963
1.0785
PR #1733by G3sparky
1.3262
PR #1734by yahya010
1.0108
PR #1736by dexhunter
1.0655
PR #1737by sakthivarshans
1.0723
PR #1744by MuhammedErinArchitecture
1.0889
PR #1747by swapp1990
1.0820
PR #1749by gracebml
1.0996
PR #1750by teslaeco
1.0809
PR #1755by OE-GOD
1.0746
PR #1756by romeerp
1.0651
PR #1759by yijieyuan
1.0799
PR #1760by BrandtChristian
1.1863

Hyperparameters Across PRs

pr_numberparameters
267{"epochs_per_chunk":12,"chunks":64,"learning_rate":0.004,"momentum":0.9}
461{"learning_rate":0.002,"epochs_per_chunk":3,"chunk_size":32768,"stride":64,"freeze_blocks":2,"momentum":0.9}
473{"chunk_size":32768,"epochs":3,"learning_rate":0.002,"optimizer":"SGD + momentum","freeze_blocks":0,"gradient_clip":1,"batch_size":32}
503{"epochs_per_chunk":3,"chunk_size":131072,"stride":32,"learning_rate":0.0001,"weight_decay":0}
508{"epochs":8,"learning_rate":0.002,"momentum":0.9}
526{"optimizer":"SGD","momentum":0.9,"learning_rate":0.002,"epochs_per_chunk":30,"chunk_size":32768,"frozen_blocks":2,"trainable_params":19911748}
528{"epochs_per_chunk":3,"learning_rate":0.0001,"weight_decay":0}
529{"epochs_per_chunk":3,"chunk_size":131072,"stride":32,"learning_rate":0.0001,"weight_decay":0}
537{"optimizer":"SGD","momentum":0.9,"base_learning_rate":0.002,"per_layer_lr":{"mlp.proj":3,"mlp.fc":0.5},"intra_chunk_cosine_decay":true,"epochs_per_chunk":30,"chunk_size_tokens":32768,"stride":64,"frozen_blocks":2,"trainable_params":19911748,"total_params":24634452}
545{"learning_rate":0.0001,"chunk_tokens":131072,"freeze_blocks":2,"optimizer":"AdamW"}
549{"chunk_size":32768,"epochs":3,"learning_rate":0.002,"momentum":0.9,"freeze_blocks":0,"gradient_clip":1,"legal":true}
576{"learning_rate":0.0001,"chunk_size":131000,"epochs":3,"temperature":1,"layers":"last 2 blocks"}
585{"learning_rate":0.0001,"weight_decay":0,"epochs_per_chunk":"2-3","chunk_tokens":131072}
589{"chunk_size":32768,"optimizer":"SGD","learning_rate":0.002,"momentum":0.9,"epochs":3,"grad_clip":1,"frozen_blocks":null}
598{"optimizer":"SGD","momentum":0.9,"learning_rate":0.002,"epochs_per_chunk":10,"chunk_size_tokens":32768,"stride":64,"frozen_blocks":2,"trainable_params":22301260,"total_params":27030108}
606{"chunk_tokens":131072,"epochs_per_chunk":3,"optimizer":"AdamW","learning_rate":0.0001,"weight_decay":0,"unfrozen_params":"last 2 blocks + norms + lm_head (~5.8M / 33.6M)","cosine_lr_decay":true,"every_token_scored_before_update":true}
610{"learning_rate":0.002,"momentum":0.9,"cosine_lr_schedule":true,"max_chunks":900,"chunk_size_tokens":32768,"freeze_blocks":2,"epochs_per_chunk":3}
612{"optimizer":"SGD","learning_rate":0.002,"momentum":0.9,"epochs_per_chunk":10,"chunk_size_tokens":32768,"stride_tokens":64,"frozen_blocks":2,"gradient_clip":1,"total_chunks":1893}
615{"chunk":131072,"last 2 blocks plus control params unfrozen":true,"optimizer":"Legal score-first AdamW"}
625{"learning_rate":0.002,"epochs":3,"momentum":0.9,"freeze_blocks":0,"chunk_tokens":32768,"batch_seqs":32,"grad_clip":1}
628{"optimizer":"SGD","learning_rate":0.002,"momentum":0.9,"epochs_per_chunk":10,"chunk_size":32768,"stride":64,"frozen_blocks":2,"gradient_clip":1,"lr_warmup_percent":5}
644{"optimizer":"SGD","learning_rate":0.002,"momentum":0.9,"epochs_per_chunk":10,"chunk_size":32768,"stride":64,"frozen_blocks":2,"gradient_clip":1}
653{"chunk_size":32768,"optimizer":"SGD","learning_rate":0.002,"momentum":0.9,"epochs_per_chunk":3,"frozen_blocks":0,"gradient_clip":1,"batch_seqs":32}
656{"freeze_blocks":0,"grad_clip":0.8}
668{"optimizer":"SGD","momentum":0.9,"learning_rate":0.002,"epochs":10,"tokens_per_chunk":32768,"freeze_first_blocks":2}
670{"experiments":22}
680
688{"learning_rate":0.0001,"chunk_tokens":131072,"epochs":3,"polyak_decay":0.998,"frozen_blocks":9}
691{"epochs":3,"chunk_tokens":32768,"learning_rate":0.002}
697{"recovery_epochs":20,"recovery_lr":0.001}
700{"epochs":4,"learning_rate":0.0001,"freeze_blocks":2,"chunk_tokens":131072}
706{"disabled":true}
714{"learning_rate":0.002,"epochs_per_chunk":3,"optimizer":"SGD","momentum":0.9,"freeze_blocks":0}
720{"epochs":4,"optimizer":"AdamW"}
726{"chunk_size":32768,"optimizer":"SGD","momentum":0.9,"learning_rate":0.002,"epochs":3,"frozen_blocks":2,"gradient_clip":1,"stride":64}
731{"optimizer":"AdamW","learning_rate":0.0005,"polyak_decay":0.998,"freeze_first_blocks":9,"unfreeze_last_blocks":2,"epochs_per_chunk":3,"byte_weighted_loss":true,"adaptive_cosine_lr":true}
733{"epochs":3,"learning_rate":0.002,"momentum":0.9,"freeze_blocks":0}
734{"learning_rate":0.0001,"epochs":3,"freeze_blocks":9,"chunk_tokens":131072}
745{"epochs":1,"learning_rate":0.002,"momentum":0.9}
752{"learning_rate":0.002,"momentum":0.9,"epochs":3,"chunk_tokens":32768,"all_blocks_unfrozen":true}
753{"enabled":false}
754{"learning_rate":0.002,"momentum":0.9,"epochs":3,"chunk_size":32000}
756{"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"stride":64}
758{"deterministic":true,"enabled":false}
761{"chunk_size_tokens":131000,"learning_rate":0.0001,"epochs":4,"freeze_first_blocks":2,"grad_clip":1}
764{"epochs":3,"freeze_last_blocks":2}
768{"epochs":3,"learning_rate":0.002,"momentum":0.9,"chunk_tokens":32768,"batch_seqs":32,"freeze_blocks":0,"grad_clip":1}
772{"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"stride":64}
778{"backward_looking_cache":true,"ngram_orders":"2-7"}
779{"qttt":1,"eta":0.02,"learning_rate":0.00003,"chunk_tokens":1048576,"epochs":1,"adaptive_lr":0,"polyak":0,"freeze_blocks":1}
792{"cache_update_after_scoring":true}
795{"enabled":false}
798{"learning_rate":0.00003,"epochs":1,"chunk_tokens":1000000,"freeze_blocks":2,"polyak_decay":0.998}
803{"optimizer":"AdamW","learning_rate":0.0005,"epochs_per_chunk":4,"freeze_blocks":9,"polyak_ema":0.998}
809{"rank":8,"learning_rate":0.01,"chunk_size":2048,"epochs":3}
811{"learning_rate":0.0005,"epochs":4,"freeze_blocks":2,"temperature":0.98}
813{"backward_looking":true,"entropy_adaptive_alpha":true}
814{"chunk_based":true,"update_after_scoring":true}
816{"epochs":3,"optimizer":"SGD","all_blocks_unfrozen":true}
825{"backward_looking":true}
826{"rank":8,"learning_rate":0.01,"chunk_size":2048,"epochs_per_chunk":3,"polyak_decay":0.998,"temperature":0.98}
830{"backoff_orders":[1,2,3,4,5,6,7],"entropy_adaptive_alpha":true,"implemented_in_c":true}
831
834{"epochs":1,"freeze_blocks":1,"learning_rate":0.00003}
836{"optimizer":"SGD","learning_rate":0.002,"epochs":3,"chunk_size":"32K"}
838{"learning_rate":0.002,"momentum":0.9,"epochs":3,"all_blocks_unfrozen":true}
846{"optimizer":"AdamW","temperature":0.98,"chunk_size":2048}
849{"epochs":4,"optimizer":"AdamW","learning_rate":0.0005,"freeze_blocks":2,"byte_weighted":true,"polyak_averaging":0.998,"adaptive_cosine_lr":true}
851{"epochs":4,"learning_rate":0.0001,"freeze_blocks":2,"chunk_tokens":131072}
861{"learning_rate":0.0004,"epochs":1,"params":"MLP-only (up_proj, down_proj, gate_proj, scale)"}
869
870{"pass1_store_probs":true,"pass2_rescore_all_tokens":true}
872{"inner_loop":"gradient descent on MLP weights","meta_learning":true}
873{"inner_learning_rate":0.001}
880{"epochs":2,"learning_rate":0.0001,"freeze_blocks":2}
883{"phases":2}
885{"learning_rate":0.002,"momentum":0.9,"epochs_per_chunk":3,"chunk_size":32768}
886
887{"ngram_backoff":true,"orders":"2-7"}
891{"learning_rate":0.002,"chunk_size":256,"freeze_blocks":0}
892{"learning_rate":0.002,"chunk_size":256,"freeze_blocks":0}
893{"passes":2,"cache_orders":"2-12","cold_cache_chunks":50}
913{"online_cache_update":true}
914{"learning_rate":0.002,"momentum":0.9,"epochs":3,"freeze_blocks":2}
916{"pass_1":"store per-token model probabilities without n-gram blending","pass_2":"rescore with frozen cache"}
924{"epochs":1,"learning_rate":0.001}
925{"epochs":1,"learning_rate":0.001}
927{"learning_rate":0.002,"epochs":3,"chunk_tokens":32768}
931{"chunk_tokens":131072,"temperature":0.85,"freeze_blocks":2,"epochs":2,"learning_rate":0.0001}
940
945{"epochs":1,"learning_rate":0.001,"adaptive_temperature":[0.9,1.05],"byte_weighted_loss":true}
947{"enabled":false}
952{"epochs":3,"learning_rate":0.002,"freeze_blocks":0,"momentum":0.9}
953{"epochs":4,"freeze_blocks":1,"learning_rate":0.0005,"chunk_tokens":32768}
958{"epochs":3,"learning_rate":0.0001}
967{"learning_rate":0.002,"optimizer":"SGD","momentum":0.9}
972
988
991{"learning_rate":0.0001,"epochs":3,"blocks_unfrozen":2}
995{"learning_rate":0.002,"momentum":0.95,"epochs":4,"freeze_depth":0}
999{"learning_rate":0.002,"epochs":"2/3/4 adaptive","chunk_tokens":32768}
1001
1004{"chunk_tokens":131072}
1008{"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"momentum":0.9,"freeze_blocks":0,"batch_seqs":32,"grad_clip":1}
1014
1016{"learning_rate":0.002,"momentum":0.9,"epochs":3,"chunk_size":32000}
1027{"optimizer":"AdamW","chunk_size":32768,"pre_quantization":true}
1037{"chunk_size":32000,"stride":64,"all_blocks_unfrozen":true}
1039{"learning_rate":0.0025,"epochs":4,"chunk_tokens":32768,"momentum":0.9,"freeze_blocks":0,"batch_seqs":32,"grad_clip":1}
1040
1041
1057{"optimizer":"SGD","learning_rate":0.002,"momentum":0.9,"epochs_per_chunk":7,"chunk_size":32768,"all_blocks_unfrozen":true}
1066{"enabled":1,"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"freeze_blocks":0}
1069{"steps":3,"learning_rate":0.0001}
1070{"epochs":3,"optimizer":"SGD","learning_rate":0.002,"momentum":0.9}
1081
1084{"learning_rate":0.002,"epochs":3,"chunk_tokens":32768}
1087{"learning_rate":0.005,"epochs":3,"chunk_tokens":16384,"freeze_blocks":0}
1092{"learning_rate":0.03,"epochs":3,"chunk_tokens":32768,"freeze_blocks":0,"momentum":0.9}
1098{"learning_rate":0.0025,"epochs":6,"freeze_blocks":0}
1117{"learning_rate":0.0025,"epochs":6,"freeze_blocks":0}
1118{"learning_rate":0.0025,"epochs":6,"freeze_blocks":0}
1123{"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"freeze_blocks":2}
1124
1128{"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"freeze_blocks":0,"momentum":0.9,"batch_seqs":32,"grad_clip":1}
1129{"learning_rate":0.0001,"epochs":3,"unfrozen_blocks":2}
1143{"legal":true,"backward_looking":true}
1148{"learning_rate":0.002,"epochs":[2,3,4],"chunk_tokens":32768,"entropy_adaptive":true}
1150{"learning_rate":0.002,"chunk_tokens":32768,"batch_seqs":32,"epochs":3}
1156
1159{"enabled":1,"epochs":3,"batch_seqs":32,"chunk_tokens":32768,"learning_rate":0.002,"momentum":0.9}
1170{"learning_rate":0.002,"epochs":10,"chunk_size":32768,"freeze_blocks":2,"momentum":0.9}
1171{"epochs":5,"learning_rate":0.0001,"chunks":262144}
1172{"learning_rate":0.005,"steps":8}
1176{"epochs":3}
1182{"learning_rate":0.002,"epochs":4,"freeze_blocks":0}
1185{"epochs":3,"chunk_size":32000,"stride":64,"optimizer":"SGD"}
1202{"chunk_size":32768,"epochs_per_chunk":3,"learning_rate":0.002,"momentum":0.9,"stride":64}
1209{"epochs":3,"chunk_tokens":65536,"learning_rate":0.002}
1217{"learning_rate":null}
1222{"learning_rate":0.01,"chunk_size":1024,"adapted_components":"prime MLPs only"}
1227{"epochs":10,"optimizer":"SGD","schedule":"cosine decay","rank":8}
1229{"learning_rate":0.008,"steps":16}
1230{"chunk_tokens":32768,"epochs":3,"learning_rate":0.002,"optimizer":"SGD + momentum","momentum":0.9}
1231{"chunk_size":32768,"epochs":3,"learning_rate":0.002,"gradient_clip":1,"eval_passes":3}
1232{"optimizer":"SGD","learning_rate":0.002,"epochs":10,"chunk_size":32768,"frozen_blocks":2,"grad_clip":1,"stride":64}
1233
1234{"epochs":3,"learning_rate":0.0001}
1236{"epochs":3}
1237
1238
1239{"learning_rate":0.0005,"steps":1}
1240{"epochs":3,"learning_rate":0.002,"chunk_tokens":65536}
1242{"learning_rate":0.005,"epochs":3,"chunk_tokens":32768}
1244{"learning_rate":0.002,"momentum":0.9,"epochs":3,"chunk_tokens":32768}
1245{"epochs":25,"freeze_blocks":0,"learning_rate":0.002}
1248
1252{"epochs":2,"freeze_blocks":2,"learning_rate":0.002}
1263{"steps":16,"learning_rate":0.008,"min_learning_rate":0.0008}
1269{"chunk_tokens":32768,"learning_rate":0.002,"epochs":3,"momentum":0.9,"freeze_blocks":0,"grad_clip":1}
1272
1274{"learning_rate":0.005,"epochs":3}
1276{"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"batch_seqs":32,"grad_clip":1}
1280{"learning_rate":0.002,"chunk_tokens":32768,"epochs":"2/3/4 adaptive"}
1284{"stride":64}
1284{"stride":16}
1289
1303{"steps":16,"learning_rate":0.008,"min_learning_rate":0.0008}
1313{"steps":24,"learning_rate":0.012}
1319{"steps":64,"warmstart_alpha":0.85,"learning_rate_start":0.01,"learning_rate_end":0.001}
1320{"epochs_per_chunk":3,"learning_rate":0.002,"momentum":0.9,"all_gpu_per_chunk":true,"full_sequence_loss":true,"skip_final_chunk_training":true}
1322{"steps":32,"learning_rate":0.04}
1325{"epochs":3}
1326{"enabled":true,"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"freeze_blocks":0}
1333{"learning_rate":0.008,"steps":16,"delta_dim":512}
1338{"learning_rate":0.002,"epochs":3}
1339{"learning_rate":0.002,"epochs":3}
1370{"optimizer":"SGD","momentum":0.9,"learning_rate":0.002,"epochs_per_chunk":3,"chunk_size":32768,"freeze_first_blocks":2}
1376{"steps":24,"learning_rate_start":0.024,"learning_rate_min":0.001,"stride":96}
1379
1399{"learning_rate":0.0005,"epochs":1,"freeze_blocks":9,"chunk_size":32768,"scope":"pre-quantization"}
1406{"learning_rate":0.0005,"epochs":10,"freeze_blocks":0,"cosine_decay":true,"pre_quant":true}
1410{"learning_rate":0.0008,"chunk_size":65536,"epochs":4,"momentum":0.9}
1413{"learning_rate":0.005,"epochs":3,"freeze_blocks":0}
1425{"enabled":1}
1431{"learning_rate":0.003,"epochs_per_chunk":20,"chunks":348}
1437{"learning_rate":0.005,"epochs":3}
1440{"epochs":3,"learning_rate":0.01,"reset_per_chunk":0}
1456{"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"freeze_blocks":2}
1460{"learning_rate":0.005,"momentum":0.9,"epochs_per_chunk":3,"chunk_size":32000,"freeze":0}
1465{"muon":true,"stride":64}
1476
1477{"epochs":3,"learning_rate":0.005}
1492{"learning_rate":0.005,"epochs":3,"chunk_size":32000}
1493{"learning_rate":0.005,"epochs":3}
1501{"optimizer":"SGD","momentum":0.9,"learning_rate":0.004,"epochs":4}
1502{"optimizer":"SGD","lr_schedule":"cosine decay","momentum":0.9,"epochs":4}
1514{"learning_rate":0.005,"epochs":3,"freeze_blocks":0,"chunk_tokens":32768}
1515{"epochs_per_chunk":3,"learning_rate":0.005,"momentum":0.9}
1516
1518{"learning_rate":0.005,"freeze_blocks":0,"epochs":3,"chunk_tokens":32768}
1520{"chunk_size":32000,"epochs":3}
1521{"epochs":3}
1523{"learning_rate":0.01}
1530{"batched_loras":true}
1532{"learning_rate":0.005,"epochs":3,"momentum":0.9}
1533{"epochs":3,"learning_rate":0.005}
1535
1537
1541{"learning_rate":0.005,"epochs":3,"momentum":0.9}
1546{"freeze_blocks":2}
1555{"learning_rate":0.005,"momentum":0.9,"epochs_per_chunk":3}
1557{"optimizer":"SGD","epochs":5,"learning_rate":0.005}
1561{"learning_rate":0.01,"epochs":3,"chunk_size":32000}
1565{"chunk_size":32768,"optimizer":"SGD","momentum":0.9,"learning_rate":0.01,"epochs":3,"gradient_clip":1}
1572{"epochs":3,"chunks":1238}
1579{"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"freeze_blocks":1,"momentum":0.9,"batch_seqs":32,"grad_clip":1}
1583{"learning_rate":0.005,"momentum":0.9,"epochs_per_chunk":3,"chunk_length":32000}
1584{"chunk_size":32000,"epochs_per_chunk":3,"learning_rate":0.01,"momentum":0.9}
1585{"chunk_tokens":32000,"epochs_per_chunk":3,"learning_rate":0.005,"momentum":0.9,"gradient_clipping":1}
1598{"seeds":5}
1605{"epochs":1,"learning_rate":0.005,"optimizer":"SGD"}
1616{"learning_rate":0.005,"epochs":3}
1626{"phased":true,"num_phases":3,"prefix_docs":2000}
1628{"learning_rate":0.005,"epochs":3,"momentum":0.9}
1629{"epochs":3}
1639{"learning_rate":0.005,"momentum":0.9,"base_epochs":3,"adaptive_epochs":true,"max_epochs":5,"min_epochs":1,"ema_alpha":0.3}
1643{"chunks":310,"chunk_tokens":32,"seq_len":4096,"learning_rate":0.01,"momentum":0.9,"epochs":1}
1644{"chunks":310,"chunk_tokens":32768,"learning_rate":0.01,"momentum":0.9,"epochs":1}
1645{"learning_rate":0.005,"epochs":3}
1646
1647{"epochs_per_chunk":3}
1655{"learning_rate":0.002,"momentum":0.9,"stride":76,"freeze_layers":2,"epochs":1}
1667{"optimizer":"SGD","learning_rate":0.005,"epochs_per_chunk":3}
1670{"method_variant":"Multi-Phase Global SGD","phases":3,"prefix_documents":2000,"learning_rate":0.001,"momentum":0.9,"gradient_clipping":1}
1672{"learning_rate":0.005,"epochs_per_chunk":3}
1679
1688{"chunk_size":32768,"learning_rate":0.005,"momentum":0.9,"epochs":3}
1693{"phases":3,"prefix_docs":2000,"learning_rate":0.001,"momentum":0.9,"gradient_clip":1}
1696{"learning_rate":0.003,"epochs":3,"freeze_blocks":1,"chunk_tokens":32768}
1698{"learning_rate":0.005,"epochs":3,"chunk_tokens":32768,"freeze_blocks":2,"momentum":0.9}
1700{"phased":true,"num_phases":3}
1706{"epochs":4,"learning_rate":0.0055}
1711{"learning_rate":0.005,"momentum":0.9,"epochs":3,"freeze_blocks":2}
1715{"learning_rate":0.005,"epochs":3}
1716{"epochs":3,"learning_rate":0.005,"momentum":0.9}
1718{"epochs":4}
1722{"learning_rate":0.001,"epochs":1,"chunk_tokens":32768,"freeze_blocks":10}
1727{"phases":4,"enabled":true}
1729{"phases":3}
1730{"learning_rate":0.005,"epochs":3,"chunk_tokens":32768,"batch_seqs":32,"freeze_blocks":0,"eval_stride":64}
1731{"learning_rate":0.005,"momentum":0.9,"epochs":3}
1732{"learning_rate":0.005,"momentum":0.9,"epochs":3}
1733
1734{"learning_rate":0.005,"epochs":3,"chunk_tokens":32768,"freeze_blocks":2,"momentum":0.9}
1736{"phased":true,"num_phases":3,"prefix_docs":2000,"per_doc_lora_reset":true}
1737{"learning_rate":0.005,"momentum":0.9,"epochs":3}
1744{"freeze_blocks":10,"param_mode":"all","loss_gate_mode":"running_mean","loss_gate_margin":0,"final_block_only":true}
1747{"learning_rate":0.005,"momentum":0.9,"chunk_size":2048}
1749{"chunk_size":32768,"optimizer":"AdamW"}
1750{"learning_rate":0.005,"epochs":3}
1755{"optimizer":"SGD"}
1756{"phased":true}
1759{"learning_rate":0.005,"epochs":3,"chunk_size":32000}
1760{"epochs":3,"learning_rate":0.005}