← Back to Test-Time Training
score-first TTT
Test-Time Training
Used in
265 PRs
Best BPB
0.0274
Avg BPB
1.0166
Submissions
PR #267by andrewgcodes
1.1374PR #461by Christopher-Lee-McClendon
1.1446PR #473by abaybektursun
1.1214PR #503by EthanYangTW
1.1195PR #508by newjordan
1.1215PR #526by Christopher-Lee-McClendon
1.1425PR #528by EthanYangTW
1.1195PR #529by EthanYangTW
1.1195PR #537by Christopher-Lee-McClendon
1.1387PR #545by EthanYangTW
1.1179PR #549by abaybektursun RECORD
1.1194PR #576by cmcdnd
1.1164PR #585by EthanYangTW
1.1179PR #589by RoyiRa
1.1178PR #598by Christopher-Lee-McClendon
1.1334PR #606by EthanYangTW
1.1162PR #610by ChaosCodes
1.1190PR #612by Christopher-Lee-McClendon
1.1079PR #615by danialht
1.1169PR #625by Joeavaib
1.1194PR #628by Christopher-Lee-McClendon
1.0983PR #644by Christopher-Lee-McClendon
1.0944PR #653by demirelo
1.1552PR #656by newjordan
1.1190PR #668by Christopher-Lee-McClendon
1.0920PR #670by abaybektursun
1.1171PR #680by bro4all
1.1483PR #688by RoyiRa
1.0745PR #691by xexyz
1.0988PR #697by Danishlynx
1.1194PR #700by RoyiRa
1.0541PR #706by newjordan
1.0461PR #714by Upsalla
1.1187PR #720by agalimova
1.1078PR #726by DeepReinforce
1.1147PR #731by pentxayc
1.0400PR #733by stukenov
1.0278PR #734by Robby955
1.1198PR #745by stukenov
1.0222PR #752by Naazimsnh02
1.1182PR #753by newjordan
0.9625PR #754by aryanbhosale
1.1253PR #756by abaybektursun
1.1142PR #758by hypery11
1.0465PR #761by Asukabot0
0.9581PR #764by ndokutovich
0.9633PR #768by mradassaad
1.1201PR #772by abaybektursun
1.3055PR #778by raahilshah
0.9605PR #779by deanbrr
0.6683PR #792by xexyz
1.0340PR #795by hypery11
0.8881PR #798by travispchen
0.5466PR #803by pentxayc
0.4416PR #809by AayushBaniya2006
0.2952PR #811by quietsmile
0.4377PR #813by hypery11
0.6671PR #814by newjordan
0.4820PR #816by jimliu741523
1.1194PR #825by hypery11
0.5440PR #826by himanshudongre
0.2951PR #830by zlxi02
1.4096PR #831by sseanliu
1.1284PR #834by AnirudhRahul
0.1663PR #836by autocode-rayes
1.1219PR #838by aryanbhosale
1.1215PR #846by himanshudongre
0.1434PR #849by dttdrv
1.1105PR #851by RoyiRa
0.2071PR #861by JoeProAI
1.1326PR #869by THUQiXuan
0.1290PR #870by simon-marcus
0.0935PR #872by gowtham0992
1.0467PR #873by gowtham0992
1.0467PR #880by RoyiRa
0.1003PR #883by THUQiXuan
0.0308PR #885by lolrazh
0.9958PR #886by abaybektursun
0.3779PR #887by anthony-maio
0.9642PR #891by robbiebusinessacc
1.1428PR #892by robbiebusinessacc
1.1428PR #893by aryanbhosale
0.1310PR #913by RoyiRa
0.0887PR #914by mkenney2
1.1873PR #916by Bortlesboat
0.3461PR #924by THUQiXuan
0.0280PR #925by THUQiXuan
0.0281PR #927by Tonyy1977
1.1696PR #931by AnirudhRahul
0.0498PR #940by antaloaalonso
0.9581PR #945by TimPietrusky
0.0274PR #947by aamodbhatt
1.1576PR #952by FlashyFlash3011
1.1144PR #953by dexhunter
1.0722PR #958by shouryamaanjain
1.1382PR #967by dexhunter
1.0450PR #972by Idan3011
0.3922PR #988by ymrohit
1.0857PR #991by ibarrajo
1.1145PR #995by dexhunter
1.0362PR #999by aamodbhatt
1.1179PR #1001by ibarrajo
1.1188PR #1004by ibarrajo
1.1182PR #1008by monkeyKingProgrammer
1.1538PR #1014by haimianbaobao007
1.6200PR #1016by ADIITJ
1.1269PR #1027by Syed-M-Zeeshan
1.3036PR #1037by TimPietruskyRunPod
1.1179PR #1039by yufengli-oai
1.1184PR #1040by JoeProAI
1.1336PR #1041by JoeProAI
1.1356PR #1057by Programmerryoki
1.2201PR #1066by adityakm24
1.1259PR #1069by manfromnowhere143
1.1190PR #1070by manfromnowhere143
1.1190PR #1081by michaelwinczuk
1.1220PR #1084by AnubhavBharadwaaj
1.1185PR #1087by Dhenenjay
1.1407PR #1092by teddyoweh
1.1219PR #1098by adityakm24
1.1187PR #1117by adityakm24
1.1187PR #1118by adityakm24
1.1187PR #1123by sisegod
1.1986PR #1124by NewyorkDev
1.1194PR #1128by AnubhavBharadwaaj
1.1154PR #1129by EthanYangTW
1.1174PR #1143by simon-marcus
1.0806PR #1148by aamodbhatt
1.1179PR #1150by sahiee-dev
1.1151PR #1156by haikosys
1.1161PR #1159by JDAppleseed
0.3693PR #1170by Christopher-Lee-McClendon
1.1199PR #1171by EthanYangTW
1.1145PR #1172by dexhunter
1.1015PR #1176by bigbag
1.0962PR #1182by adityakm24
1.1227PR #1185by skoustav35
0.9641PR #1202by VirajDeshwal
1.1412PR #1209by andrewbaggio1
1.1064PR #1217by bigbag
1.1027PR #1222by abaybektursun
1.4707PR #1227by himanshudongre
1.4841PR #1229by resouer
0.9300PR #1230by nestamidavaine
1.1163PR #1231by nestamidavaine
1.1163PR #1232by Christopher-Lee-McClendon
1.0929PR #1233by ibarrajo
1.1460PR #1234by ibarrajo
1.1461PR #1236by ibarrajo
1.1179PR #1237by ibarrajo
1.1198PR #1238by ibarrajo
1.1521PR #1239by tmancino
1.5918PR #1240by andrewbaggio1
1.1064PR #1242by Campbellb
1.0903PR #1244by monkeyKingProgrammer
1.1443PR #1245by mkenney2
1.1470PR #1248by ibarrajo
1.1264PR #1252by ahmetdenizyilmaz
1.0713PR #1263by xexyz
0.9354PR #1269by Jtss-ux
1.1194PR #1272by andrewbaggio1
1.1100PR #1274by MatoTeziTanka
1.0876PR #1276by BiggerDABOSS
1.1100PR #1280by aamodbhatt
1.1156PR #1284by tyrel-beede
1.1207PR #1284by tyrel-beede
1.1207PR #1289by MatoTeziTanka
1.0819PR #1303by anthony-maio
0.9462PR #1313by anthony-maio
0.8637PR #1319by canivel
0.6951PR #1320by jpfeiffe
1.1196PR #1322by newjordan
1.0854PR #1325by monisha-max
1.3868PR #1326by aryanbhosale
1.0896PR #1333by aryanbhosale
1.0766PR #1338by bigbag
1.0955PR #1339by bigbag
1.0955PR #1370by Christopher-Lee-McClendon
1.0030PR #1376by stukenov
0.7094PR #1379by LucasErcolano
0.4162PR #1399by AnubhavBharadwaaj
1.0898PR #1406by aamodbhatt
1.0887PR #1410by izlley
1.1158PR #1413by dexhunter RECORD
1.0828PR #1425by dentity007
1.4479PR #1431by Idan3011
1.1266PR #1437by dexhunter
1.0780PR #1440by Mertyandimata
1.1026PR #1456by sisegod
1.1465PR #1460by resouer
1.0827PR #1465by sisegod
1.1381PR #1476by aryan-cs
1.0842PR #1477by aryanbhosale RECORD
1.0822PR #1492by bigbag
1.0810PR #1493by bigbag RECORD
1.0810PR #1501by SPThole
1.1159PR #1502by SPThole
1.1147PR #1514by dexhunter
1.0798PR #1515by dexhunter
1.0872PR #1516by dexhunter
1.0805PR #1518by abaybektursun
1.0788PR #1520by taka6745
1.0824PR #1521by aryanbhosale
1.0802PR #1523by EthanYangTW
1.0778PR #1530by samacqua
1.0764PR #1532by nogakeren
1.0803PR #1533by aryanbhosale
1.0790PR #1535by newjordan
1.0742PR #1537by pireylow
1.3971PR #1541by bigbag
1.0778PR #1546by SPThole
1.0850PR #1555by andrewbaggio1
1.0764PR #1557by ndokutovich
1.0773PR #1561by EthanYangTW
1.0783PR #1565by Idan3011
1.1036PR #1572by anthony-maio
1.0797PR #1579by Tonyy1977
1.1372PR #1583by codemath3000
1.0801PR #1584by codemath3000
1.0752PR #1585by codemath3000
1.0639PR #1598by amrayach
1.0813PR #1605by renqianluo
0.2988PR #1616by Vickyrrrrrr
1.4100PR #1626by dexhunter
1.0719PR #1628by yu314-coder
1.1921PR #1629by channyzf6
1.0829PR #1639by kunwar-vikrant
1.0832PR #1643by mradassaad
1.1473PR #1644by mradassaad
1.1473PR #1645by scottcui-georgian
1.1131PR #1646by sergeevii123
1.0909PR #1647by powerpratik
1.0616PR #1655by himanalot
1.1135PR #1667by MarioPaerle
1.0714PR #1670by dexhunter
1.0597PR #1672by andrewbaggio1
1.0119PR #1679by ChideraIbe123
0.7625PR #1688by Buld1n
1.0809PR #1693by dexhunter
1.0573PR #1696by kings-crown
1.1224PR #1698by arsenis-cmd
1.0099PR #1700by jorge-asenjo
1.0722PR #1706by aamodbhatt
1.0815PR #1711by aamodbhatt
1.0098PR #1715by G3sparky
1.0809PR #1716by himanshudongre
1.0788PR #1718by himanshudongre
1.0788PR #1722by deborahnelson8788726
0.6580PR #1727by yahya010
1.0722PR #1729by romeerp
1.0678PR #1730by N10ELabs
1.0845PR #1731by Victory963
1.0785PR #1732by Victory963
1.0785PR #1733by G3sparky
1.3262PR #1734by yahya010
1.0108PR #1736by dexhunter
1.0655PR #1737by sakthivarshans
1.0723PR #1744by MuhammedErinArchitecture
1.0889PR #1747by swapp1990
1.0820PR #1749by gracebml
1.0996PR #1750by teslaeco
1.0809PR #1755by OE-GOD
1.0746PR #1756by romeerp
1.0651PR #1759by yijieyuan
1.0799PR #1760by BrandtChristian
1.1863

Hyperparameters Across PRs
| pr_number | parameters |
|---|---|
| 267 | {"epochs_per_chunk":12,"chunks":64,"learning_rate":0.004,"momentum":0.9} |
| 461 | {"learning_rate":0.002,"epochs_per_chunk":3,"chunk_size":32768,"stride":64,"freeze_blocks":2,"momentum":0.9} |
| 473 | {"chunk_size":32768,"epochs":3,"learning_rate":0.002,"optimizer":"SGD + momentum","freeze_blocks":0,"gradient_clip":1,"batch_size":32} |
| 503 | {"epochs_per_chunk":3,"chunk_size":131072,"stride":32,"learning_rate":0.0001,"weight_decay":0} |
| 508 | {"epochs":8,"learning_rate":0.002,"momentum":0.9} |
| 526 | {"optimizer":"SGD","momentum":0.9,"learning_rate":0.002,"epochs_per_chunk":30,"chunk_size":32768,"frozen_blocks":2,"trainable_params":19911748} |
| 528 | {"epochs_per_chunk":3,"learning_rate":0.0001,"weight_decay":0} |
| 529 | {"epochs_per_chunk":3,"chunk_size":131072,"stride":32,"learning_rate":0.0001,"weight_decay":0} |
| 537 | {"optimizer":"SGD","momentum":0.9,"base_learning_rate":0.002,"per_layer_lr":{"mlp.proj":3,"mlp.fc":0.5},"intra_chunk_cosine_decay":true,"epochs_per_chunk":30,"chunk_size_tokens":32768,"stride":64,"frozen_blocks":2,"trainable_params":19911748,"total_params":24634452} |
| 545 | {"learning_rate":0.0001,"chunk_tokens":131072,"freeze_blocks":2,"optimizer":"AdamW"} |
| 549 | {"chunk_size":32768,"epochs":3,"learning_rate":0.002,"momentum":0.9,"freeze_blocks":0,"gradient_clip":1,"legal":true} |
| 576 | {"learning_rate":0.0001,"chunk_size":131000,"epochs":3,"temperature":1,"layers":"last 2 blocks"} |
| 585 | {"learning_rate":0.0001,"weight_decay":0,"epochs_per_chunk":"2-3","chunk_tokens":131072} |
| 589 | {"chunk_size":32768,"optimizer":"SGD","learning_rate":0.002,"momentum":0.9,"epochs":3,"grad_clip":1,"frozen_blocks":null} |
| 598 | {"optimizer":"SGD","momentum":0.9,"learning_rate":0.002,"epochs_per_chunk":10,"chunk_size_tokens":32768,"stride":64,"frozen_blocks":2,"trainable_params":22301260,"total_params":27030108} |
| 606 | {"chunk_tokens":131072,"epochs_per_chunk":3,"optimizer":"AdamW","learning_rate":0.0001,"weight_decay":0,"unfrozen_params":"last 2 blocks + norms + lm_head (~5.8M / 33.6M)","cosine_lr_decay":true,"every_token_scored_before_update":true} |
| 610 | {"learning_rate":0.002,"momentum":0.9,"cosine_lr_schedule":true,"max_chunks":900,"chunk_size_tokens":32768,"freeze_blocks":2,"epochs_per_chunk":3} |
| 612 | {"optimizer":"SGD","learning_rate":0.002,"momentum":0.9,"epochs_per_chunk":10,"chunk_size_tokens":32768,"stride_tokens":64,"frozen_blocks":2,"gradient_clip":1,"total_chunks":1893} |
| 615 | {"chunk":131072,"last 2 blocks plus control params unfrozen":true,"optimizer":"Legal score-first AdamW"} |
| 625 | {"learning_rate":0.002,"epochs":3,"momentum":0.9,"freeze_blocks":0,"chunk_tokens":32768,"batch_seqs":32,"grad_clip":1} |
| 628 | {"optimizer":"SGD","learning_rate":0.002,"momentum":0.9,"epochs_per_chunk":10,"chunk_size":32768,"stride":64,"frozen_blocks":2,"gradient_clip":1,"lr_warmup_percent":5} |
| 644 | {"optimizer":"SGD","learning_rate":0.002,"momentum":0.9,"epochs_per_chunk":10,"chunk_size":32768,"stride":64,"frozen_blocks":2,"gradient_clip":1} |
| 653 | {"chunk_size":32768,"optimizer":"SGD","learning_rate":0.002,"momentum":0.9,"epochs_per_chunk":3,"frozen_blocks":0,"gradient_clip":1,"batch_seqs":32} |
| 656 | {"freeze_blocks":0,"grad_clip":0.8} |
| 668 | {"optimizer":"SGD","momentum":0.9,"learning_rate":0.002,"epochs":10,"tokens_per_chunk":32768,"freeze_first_blocks":2} |
| 670 | {"experiments":22} |
| 680 | — |
| 688 | {"learning_rate":0.0001,"chunk_tokens":131072,"epochs":3,"polyak_decay":0.998,"frozen_blocks":9} |
| 691 | {"epochs":3,"chunk_tokens":32768,"learning_rate":0.002} |
| 697 | {"recovery_epochs":20,"recovery_lr":0.001} |
| 700 | {"epochs":4,"learning_rate":0.0001,"freeze_blocks":2,"chunk_tokens":131072} |
| 706 | {"disabled":true} |
| 714 | {"learning_rate":0.002,"epochs_per_chunk":3,"optimizer":"SGD","momentum":0.9,"freeze_blocks":0} |
| 720 | {"epochs":4,"optimizer":"AdamW"} |
| 726 | {"chunk_size":32768,"optimizer":"SGD","momentum":0.9,"learning_rate":0.002,"epochs":3,"frozen_blocks":2,"gradient_clip":1,"stride":64} |
| 731 | {"optimizer":"AdamW","learning_rate":0.0005,"polyak_decay":0.998,"freeze_first_blocks":9,"unfreeze_last_blocks":2,"epochs_per_chunk":3,"byte_weighted_loss":true,"adaptive_cosine_lr":true} |
| 733 | {"epochs":3,"learning_rate":0.002,"momentum":0.9,"freeze_blocks":0} |
| 734 | {"learning_rate":0.0001,"epochs":3,"freeze_blocks":9,"chunk_tokens":131072} |
| 745 | {"epochs":1,"learning_rate":0.002,"momentum":0.9} |
| 752 | {"learning_rate":0.002,"momentum":0.9,"epochs":3,"chunk_tokens":32768,"all_blocks_unfrozen":true} |
| 753 | {"enabled":false} |
| 754 | {"learning_rate":0.002,"momentum":0.9,"epochs":3,"chunk_size":32000} |
| 756 | {"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"stride":64} |
| 758 | {"deterministic":true,"enabled":false} |
| 761 | {"chunk_size_tokens":131000,"learning_rate":0.0001,"epochs":4,"freeze_first_blocks":2,"grad_clip":1} |
| 764 | {"epochs":3,"freeze_last_blocks":2} |
| 768 | {"epochs":3,"learning_rate":0.002,"momentum":0.9,"chunk_tokens":32768,"batch_seqs":32,"freeze_blocks":0,"grad_clip":1} |
| 772 | {"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"stride":64} |
| 778 | {"backward_looking_cache":true,"ngram_orders":"2-7"} |
| 779 | {"qttt":1,"eta":0.02,"learning_rate":0.00003,"chunk_tokens":1048576,"epochs":1,"adaptive_lr":0,"polyak":0,"freeze_blocks":1} |
| 792 | {"cache_update_after_scoring":true} |
| 795 | {"enabled":false} |
| 798 | {"learning_rate":0.00003,"epochs":1,"chunk_tokens":1000000,"freeze_blocks":2,"polyak_decay":0.998} |
| 803 | {"optimizer":"AdamW","learning_rate":0.0005,"epochs_per_chunk":4,"freeze_blocks":9,"polyak_ema":0.998} |
| 809 | {"rank":8,"learning_rate":0.01,"chunk_size":2048,"epochs":3} |
| 811 | {"learning_rate":0.0005,"epochs":4,"freeze_blocks":2,"temperature":0.98} |
| 813 | {"backward_looking":true,"entropy_adaptive_alpha":true} |
| 814 | {"chunk_based":true,"update_after_scoring":true} |
| 816 | {"epochs":3,"optimizer":"SGD","all_blocks_unfrozen":true} |
| 825 | {"backward_looking":true} |
| 826 | {"rank":8,"learning_rate":0.01,"chunk_size":2048,"epochs_per_chunk":3,"polyak_decay":0.998,"temperature":0.98} |
| 830 | {"backoff_orders":[1,2,3,4,5,6,7],"entropy_adaptive_alpha":true,"implemented_in_c":true} |
| 831 | — |
| 834 | {"epochs":1,"freeze_blocks":1,"learning_rate":0.00003} |
| 836 | {"optimizer":"SGD","learning_rate":0.002,"epochs":3,"chunk_size":"32K"} |
| 838 | {"learning_rate":0.002,"momentum":0.9,"epochs":3,"all_blocks_unfrozen":true} |
| 846 | {"optimizer":"AdamW","temperature":0.98,"chunk_size":2048} |
| 849 | {"epochs":4,"optimizer":"AdamW","learning_rate":0.0005,"freeze_blocks":2,"byte_weighted":true,"polyak_averaging":0.998,"adaptive_cosine_lr":true} |
| 851 | {"epochs":4,"learning_rate":0.0001,"freeze_blocks":2,"chunk_tokens":131072} |
| 861 | {"learning_rate":0.0004,"epochs":1,"params":"MLP-only (up_proj, down_proj, gate_proj, scale)"} |
| 869 | — |
| 870 | {"pass1_store_probs":true,"pass2_rescore_all_tokens":true} |
| 872 | {"inner_loop":"gradient descent on MLP weights","meta_learning":true} |
| 873 | {"inner_learning_rate":0.001} |
| 880 | {"epochs":2,"learning_rate":0.0001,"freeze_blocks":2} |
| 883 | {"phases":2} |
| 885 | {"learning_rate":0.002,"momentum":0.9,"epochs_per_chunk":3,"chunk_size":32768} |
| 886 | — |
| 887 | {"ngram_backoff":true,"orders":"2-7"} |
| 891 | {"learning_rate":0.002,"chunk_size":256,"freeze_blocks":0} |
| 892 | {"learning_rate":0.002,"chunk_size":256,"freeze_blocks":0} |
| 893 | {"passes":2,"cache_orders":"2-12","cold_cache_chunks":50} |
| 913 | {"online_cache_update":true} |
| 914 | {"learning_rate":0.002,"momentum":0.9,"epochs":3,"freeze_blocks":2} |
| 916 | {"pass_1":"store per-token model probabilities without n-gram blending","pass_2":"rescore with frozen cache"} |
| 924 | {"epochs":1,"learning_rate":0.001} |
| 925 | {"epochs":1,"learning_rate":0.001} |
| 927 | {"learning_rate":0.002,"epochs":3,"chunk_tokens":32768} |
| 931 | {"chunk_tokens":131072,"temperature":0.85,"freeze_blocks":2,"epochs":2,"learning_rate":0.0001} |
| 940 | — |
| 945 | {"epochs":1,"learning_rate":0.001,"adaptive_temperature":[0.9,1.05],"byte_weighted_loss":true} |
| 947 | {"enabled":false} |
| 952 | {"epochs":3,"learning_rate":0.002,"freeze_blocks":0,"momentum":0.9} |
| 953 | {"epochs":4,"freeze_blocks":1,"learning_rate":0.0005,"chunk_tokens":32768} |
| 958 | {"epochs":3,"learning_rate":0.0001} |
| 967 | {"learning_rate":0.002,"optimizer":"SGD","momentum":0.9} |
| 972 | — |
| 988 | — |
| 991 | {"learning_rate":0.0001,"epochs":3,"blocks_unfrozen":2} |
| 995 | {"learning_rate":0.002,"momentum":0.95,"epochs":4,"freeze_depth":0} |
| 999 | {"learning_rate":0.002,"epochs":"2/3/4 adaptive","chunk_tokens":32768} |
| 1001 | — |
| 1004 | {"chunk_tokens":131072} |
| 1008 | {"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"momentum":0.9,"freeze_blocks":0,"batch_seqs":32,"grad_clip":1} |
| 1014 | — |
| 1016 | {"learning_rate":0.002,"momentum":0.9,"epochs":3,"chunk_size":32000} |
| 1027 | {"optimizer":"AdamW","chunk_size":32768,"pre_quantization":true} |
| 1037 | {"chunk_size":32000,"stride":64,"all_blocks_unfrozen":true} |
| 1039 | {"learning_rate":0.0025,"epochs":4,"chunk_tokens":32768,"momentum":0.9,"freeze_blocks":0,"batch_seqs":32,"grad_clip":1} |
| 1040 | — |
| 1041 | — |
| 1057 | {"optimizer":"SGD","learning_rate":0.002,"momentum":0.9,"epochs_per_chunk":7,"chunk_size":32768,"all_blocks_unfrozen":true} |
| 1066 | {"enabled":1,"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"freeze_blocks":0} |
| 1069 | {"steps":3,"learning_rate":0.0001} |
| 1070 | {"epochs":3,"optimizer":"SGD","learning_rate":0.002,"momentum":0.9} |
| 1081 | — |
| 1084 | {"learning_rate":0.002,"epochs":3,"chunk_tokens":32768} |
| 1087 | {"learning_rate":0.005,"epochs":3,"chunk_tokens":16384,"freeze_blocks":0} |
| 1092 | {"learning_rate":0.03,"epochs":3,"chunk_tokens":32768,"freeze_blocks":0,"momentum":0.9} |
| 1098 | {"learning_rate":0.0025,"epochs":6,"freeze_blocks":0} |
| 1117 | {"learning_rate":0.0025,"epochs":6,"freeze_blocks":0} |
| 1118 | {"learning_rate":0.0025,"epochs":6,"freeze_blocks":0} |
| 1123 | {"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"freeze_blocks":2} |
| 1124 | — |
| 1128 | {"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"freeze_blocks":0,"momentum":0.9,"batch_seqs":32,"grad_clip":1} |
| 1129 | {"learning_rate":0.0001,"epochs":3,"unfrozen_blocks":2} |
| 1143 | {"legal":true,"backward_looking":true} |
| 1148 | {"learning_rate":0.002,"epochs":[2,3,4],"chunk_tokens":32768,"entropy_adaptive":true} |
| 1150 | {"learning_rate":0.002,"chunk_tokens":32768,"batch_seqs":32,"epochs":3} |
| 1156 | — |
| 1159 | {"enabled":1,"epochs":3,"batch_seqs":32,"chunk_tokens":32768,"learning_rate":0.002,"momentum":0.9} |
| 1170 | {"learning_rate":0.002,"epochs":10,"chunk_size":32768,"freeze_blocks":2,"momentum":0.9} |
| 1171 | {"epochs":5,"learning_rate":0.0001,"chunks":262144} |
| 1172 | {"learning_rate":0.005,"steps":8} |
| 1176 | {"epochs":3} |
| 1182 | {"learning_rate":0.002,"epochs":4,"freeze_blocks":0} |
| 1185 | {"epochs":3,"chunk_size":32000,"stride":64,"optimizer":"SGD"} |
| 1202 | {"chunk_size":32768,"epochs_per_chunk":3,"learning_rate":0.002,"momentum":0.9,"stride":64} |
| 1209 | {"epochs":3,"chunk_tokens":65536,"learning_rate":0.002} |
| 1217 | {"learning_rate":null} |
| 1222 | {"learning_rate":0.01,"chunk_size":1024,"adapted_components":"prime MLPs only"} |
| 1227 | {"epochs":10,"optimizer":"SGD","schedule":"cosine decay","rank":8} |
| 1229 | {"learning_rate":0.008,"steps":16} |
| 1230 | {"chunk_tokens":32768,"epochs":3,"learning_rate":0.002,"optimizer":"SGD + momentum","momentum":0.9} |
| 1231 | {"chunk_size":32768,"epochs":3,"learning_rate":0.002,"gradient_clip":1,"eval_passes":3} |
| 1232 | {"optimizer":"SGD","learning_rate":0.002,"epochs":10,"chunk_size":32768,"frozen_blocks":2,"grad_clip":1,"stride":64} |
| 1233 | — |
| 1234 | {"epochs":3,"learning_rate":0.0001} |
| 1236 | {"epochs":3} |
| 1237 | — |
| 1238 | — |
| 1239 | {"learning_rate":0.0005,"steps":1} |
| 1240 | {"epochs":3,"learning_rate":0.002,"chunk_tokens":65536} |
| 1242 | {"learning_rate":0.005,"epochs":3,"chunk_tokens":32768} |
| 1244 | {"learning_rate":0.002,"momentum":0.9,"epochs":3,"chunk_tokens":32768} |
| 1245 | {"epochs":25,"freeze_blocks":0,"learning_rate":0.002} |
| 1248 | — |
| 1252 | {"epochs":2,"freeze_blocks":2,"learning_rate":0.002} |
| 1263 | {"steps":16,"learning_rate":0.008,"min_learning_rate":0.0008} |
| 1269 | {"chunk_tokens":32768,"learning_rate":0.002,"epochs":3,"momentum":0.9,"freeze_blocks":0,"grad_clip":1} |
| 1272 | — |
| 1274 | {"learning_rate":0.005,"epochs":3} |
| 1276 | {"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"batch_seqs":32,"grad_clip":1} |
| 1280 | {"learning_rate":0.002,"chunk_tokens":32768,"epochs":"2/3/4 adaptive"} |
| 1284 | {"stride":64} |
| 1284 | {"stride":16} |
| 1289 | — |
| 1303 | {"steps":16,"learning_rate":0.008,"min_learning_rate":0.0008} |
| 1313 | {"steps":24,"learning_rate":0.012} |
| 1319 | {"steps":64,"warmstart_alpha":0.85,"learning_rate_start":0.01,"learning_rate_end":0.001} |
| 1320 | {"epochs_per_chunk":3,"learning_rate":0.002,"momentum":0.9,"all_gpu_per_chunk":true,"full_sequence_loss":true,"skip_final_chunk_training":true} |
| 1322 | {"steps":32,"learning_rate":0.04} |
| 1325 | {"epochs":3} |
| 1326 | {"enabled":true,"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"freeze_blocks":0} |
| 1333 | {"learning_rate":0.008,"steps":16,"delta_dim":512} |
| 1338 | {"learning_rate":0.002,"epochs":3} |
| 1339 | {"learning_rate":0.002,"epochs":3} |
| 1370 | {"optimizer":"SGD","momentum":0.9,"learning_rate":0.002,"epochs_per_chunk":3,"chunk_size":32768,"freeze_first_blocks":2} |
| 1376 | {"steps":24,"learning_rate_start":0.024,"learning_rate_min":0.001,"stride":96} |
| 1379 | — |
| 1399 | {"learning_rate":0.0005,"epochs":1,"freeze_blocks":9,"chunk_size":32768,"scope":"pre-quantization"} |
| 1406 | {"learning_rate":0.0005,"epochs":10,"freeze_blocks":0,"cosine_decay":true,"pre_quant":true} |
| 1410 | {"learning_rate":0.0008,"chunk_size":65536,"epochs":4,"momentum":0.9} |
| 1413 | {"learning_rate":0.005,"epochs":3,"freeze_blocks":0} |
| 1425 | {"enabled":1} |
| 1431 | {"learning_rate":0.003,"epochs_per_chunk":20,"chunks":348} |
| 1437 | {"learning_rate":0.005,"epochs":3} |
| 1440 | {"epochs":3,"learning_rate":0.01,"reset_per_chunk":0} |
| 1456 | {"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"freeze_blocks":2} |
| 1460 | {"learning_rate":0.005,"momentum":0.9,"epochs_per_chunk":3,"chunk_size":32000,"freeze":0} |
| 1465 | {"muon":true,"stride":64} |
| 1476 | — |
| 1477 | {"epochs":3,"learning_rate":0.005} |
| 1492 | {"learning_rate":0.005,"epochs":3,"chunk_size":32000} |
| 1493 | {"learning_rate":0.005,"epochs":3} |
| 1501 | {"optimizer":"SGD","momentum":0.9,"learning_rate":0.004,"epochs":4} |
| 1502 | {"optimizer":"SGD","lr_schedule":"cosine decay","momentum":0.9,"epochs":4} |
| 1514 | {"learning_rate":0.005,"epochs":3,"freeze_blocks":0,"chunk_tokens":32768} |
| 1515 | {"epochs_per_chunk":3,"learning_rate":0.005,"momentum":0.9} |
| 1516 | — |
| 1518 | {"learning_rate":0.005,"freeze_blocks":0,"epochs":3,"chunk_tokens":32768} |
| 1520 | {"chunk_size":32000,"epochs":3} |
| 1521 | {"epochs":3} |
| 1523 | {"learning_rate":0.01} |
| 1530 | {"batched_loras":true} |
| 1532 | {"learning_rate":0.005,"epochs":3,"momentum":0.9} |
| 1533 | {"epochs":3,"learning_rate":0.005} |
| 1535 | — |
| 1537 | — |
| 1541 | {"learning_rate":0.005,"epochs":3,"momentum":0.9} |
| 1546 | {"freeze_blocks":2} |
| 1555 | {"learning_rate":0.005,"momentum":0.9,"epochs_per_chunk":3} |
| 1557 | {"optimizer":"SGD","epochs":5,"learning_rate":0.005} |
| 1561 | {"learning_rate":0.01,"epochs":3,"chunk_size":32000} |
| 1565 | {"chunk_size":32768,"optimizer":"SGD","momentum":0.9,"learning_rate":0.01,"epochs":3,"gradient_clip":1} |
| 1572 | {"epochs":3,"chunks":1238} |
| 1579 | {"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"freeze_blocks":1,"momentum":0.9,"batch_seqs":32,"grad_clip":1} |
| 1583 | {"learning_rate":0.005,"momentum":0.9,"epochs_per_chunk":3,"chunk_length":32000} |
| 1584 | {"chunk_size":32000,"epochs_per_chunk":3,"learning_rate":0.01,"momentum":0.9} |
| 1585 | {"chunk_tokens":32000,"epochs_per_chunk":3,"learning_rate":0.005,"momentum":0.9,"gradient_clipping":1} |
| 1598 | {"seeds":5} |
| 1605 | {"epochs":1,"learning_rate":0.005,"optimizer":"SGD"} |
| 1616 | {"learning_rate":0.005,"epochs":3} |
| 1626 | {"phased":true,"num_phases":3,"prefix_docs":2000} |
| 1628 | {"learning_rate":0.005,"epochs":3,"momentum":0.9} |
| 1629 | {"epochs":3} |
| 1639 | {"learning_rate":0.005,"momentum":0.9,"base_epochs":3,"adaptive_epochs":true,"max_epochs":5,"min_epochs":1,"ema_alpha":0.3} |
| 1643 | {"chunks":310,"chunk_tokens":32,"seq_len":4096,"learning_rate":0.01,"momentum":0.9,"epochs":1} |
| 1644 | {"chunks":310,"chunk_tokens":32768,"learning_rate":0.01,"momentum":0.9,"epochs":1} |
| 1645 | {"learning_rate":0.005,"epochs":3} |
| 1646 | — |
| 1647 | {"epochs_per_chunk":3} |
| 1655 | {"learning_rate":0.002,"momentum":0.9,"stride":76,"freeze_layers":2,"epochs":1} |
| 1667 | {"optimizer":"SGD","learning_rate":0.005,"epochs_per_chunk":3} |
| 1670 | {"method_variant":"Multi-Phase Global SGD","phases":3,"prefix_documents":2000,"learning_rate":0.001,"momentum":0.9,"gradient_clipping":1} |
| 1672 | {"learning_rate":0.005,"epochs_per_chunk":3} |
| 1679 | — |
| 1688 | {"chunk_size":32768,"learning_rate":0.005,"momentum":0.9,"epochs":3} |
| 1693 | {"phases":3,"prefix_docs":2000,"learning_rate":0.001,"momentum":0.9,"gradient_clip":1} |
| 1696 | {"learning_rate":0.003,"epochs":3,"freeze_blocks":1,"chunk_tokens":32768} |
| 1698 | {"learning_rate":0.005,"epochs":3,"chunk_tokens":32768,"freeze_blocks":2,"momentum":0.9} |
| 1700 | {"phased":true,"num_phases":3} |
| 1706 | {"epochs":4,"learning_rate":0.0055} |
| 1711 | {"learning_rate":0.005,"momentum":0.9,"epochs":3,"freeze_blocks":2} |
| 1715 | {"learning_rate":0.005,"epochs":3} |
| 1716 | {"epochs":3,"learning_rate":0.005,"momentum":0.9} |
| 1718 | {"epochs":4} |
| 1722 | {"learning_rate":0.001,"epochs":1,"chunk_tokens":32768,"freeze_blocks":10} |
| 1727 | {"phases":4,"enabled":true} |
| 1729 | {"phases":3} |
| 1730 | {"learning_rate":0.005,"epochs":3,"chunk_tokens":32768,"batch_seqs":32,"freeze_blocks":0,"eval_stride":64} |
| 1731 | {"learning_rate":0.005,"momentum":0.9,"epochs":3} |
| 1732 | {"learning_rate":0.005,"momentum":0.9,"epochs":3} |
| 1733 | — |
| 1734 | {"learning_rate":0.005,"epochs":3,"chunk_tokens":32768,"freeze_blocks":2,"momentum":0.9} |
| 1736 | {"phased":true,"num_phases":3,"prefix_docs":2000,"per_doc_lora_reset":true} |
| 1737 | {"learning_rate":0.005,"momentum":0.9,"epochs":3} |
| 1744 | {"freeze_blocks":10,"param_mode":"all","loss_gate_mode":"running_mean","loss_gate_margin":0,"final_block_only":true} |
| 1747 | {"learning_rate":0.005,"momentum":0.9,"chunk_size":2048} |
| 1749 | {"chunk_size":32768,"optimizer":"AdamW"} |
| 1750 | {"learning_rate":0.005,"epochs":3} |
| 1755 | {"optimizer":"SGD"} |
| 1756 | {"phased":true} |
| 1759 | {"learning_rate":0.005,"epochs":3,"chunk_size":32000} |
| 1760 | {"epochs":3,"learning_rate":0.005} |