← Back to Test-Time Training
full TTT
Test-Time Training
Used in
117 PRs
Best BPB
0.3964
Avg BPB
1.1198
Submissions
PR #30 by JackYoung27: 1.2663
PR #64 by yesbhautik: 1.1250
PR #150 by yahya010: 1.1478
PR #152 by timowhite88: 1.1744
PR #178 by timowhite88: 1.1667
PR #223 by 0xjaishy: 1.1326
PR #254 by timowhite88: 1.1303
PR #263 by Dannybc123: 1.5382
PR #264 by stukenov: 1.1455
PR #281 by charmquark1984: 1.1381
PR #290 by ibarrajo: 1.1354
PR #297 by davidpuertolas: 1.1629
PR #301 by lookin-zz: 1.1807
PR #302 by JackYoung27: 1.1520
PR #303 by sseanliu: 1.1436
PR #317 by chris-buckley: 1.1442
PR #334 by nathon-lee: 1.2207
PR #338 by alertcat: 1.1254
PR #364 by shikhar1729: 1.1497
PR #366 by shivnarainms22: 1.1574
PR #384 by anantdgoel: 1.2882
PR #388 by ElliotSlusky: 1.1231
PR #390 by newjordan: 1.1295
PR #397 by translatingthename: 1.1364
PR #398 by felipe-parodi: 1.1213
PR #401 by newjordan: 1.1243
PR #421 by vytautas-bunevicius: 1.1466
PR #442 by sjp611: 1.1027
PR #455 by kasimte: 1.1299
PR #481 by mrdavtan: 1.0970
PR #486 by ndokutovich: 1.1101
PR #509 by andrewbaggio1: 1.1175
PR #517 by lukacf: 0.9789
PR #518 by sofiabod: 1.0622
PR #562 by bigbag: 1.1354
PR #581 by teddyoweh: 1.0698
PR #587 by newjordan: 1.1208
PR #595 by LoquiAuris: 1.1100
PR #639 by Robby955: 1.1158
PR #661 by andrewbaggio1: 1.1175
PR #667 by suchitj2702: 1.1352
PR #672 by andrewbaggio1: 1.0781
PR #684 by DeepReinforce: 1.0574
PR #685 by andrewbaggio1: 1.0366
PR #686 by msisovic: 1.1182
PR #696 by gravelBridge: 1.2622
PR #702 by lukacf: 1.0244
PR #741 by andrewbaggio1: 0.9850
PR #756 by abaybektursun: 1.1142
PR #771 by sunnypatneedi: 1.0705
PR #785 by SirSaltySalmon: 1.5364
PR #852 by Prush69: 1.1189
PR #857 by aruniyer: 1.1093
PR #928 by autocode-rayes: 1.1211
PR #964 by vivekvar-dl: 1.3900
PR #965 by Adam-Jacuch: 1.1184
PR #967 by dexhunter: 1.0450
PR #977 by michaelwinczuk: 1.1185
PR #1002 by SoHarshh: 1.1650
PR #1005 by OnlyJundong: 1.0853
PR #1006 by NewyorkDev: 1.1085
PR #1009 by SoHarshh: 1.1574
PR #1043 by okezue: 1.1261
PR #1045 by Hilo-Hilo: 1.1509
PR #1050 by Taleef7: 1.1194
PR #1052 by demouo: 1.1978
PR #1077 by malc3om: 1.1130
PR #1107 by mradassaad: 1.5633
PR #1108 by DbBested: 1.1502
PR #1186 by andrewbaggio1: 0.9850
PR #1191 by dentity007: 1.3587
PR #1193 by dentity007: 1.4390
PR #1216 by SoHarshh: 1.1574
PR #1228 by meinlebenswerk: 1.1527
PR #1255 by akaiHuang: 1.5080
PR #1257 by BoxiYu: 1.0855
PR #1270 by VirajDeshwal: 1.1088
PR #1275 by ranausmanai: 1.1492
PR #1306 by resouer: 1.0846
PR #1310 by cadenmcmann: 1.1177
PR #1318 by renqianluo: 1.0095
PR #1328 by renqianluo: 0.6361
PR #1329 by renqianluo: 0.6361
PR #1351 by resouer: 1.0807
PR #1364 by stukenov: 1.1025
PR #1376 by stukenov: 0.7094
PR #1397 by Mertyandimata: 1.1047
PR #1400 by tmancino: 1.1035
PR #1407 by OnlyJundong: 1.0960
PR #1408 by aamodbhatt: 1.0800
PR #1416 by erichroepke: 1.0795
PR #1422 by swapp1990: 1.1172
PR #1423 by aryanbhosale: 1.0791
PR #1424 by OnlyJundong: 1.0858
PR #1430 by renqianluo: 0.3964
PR #1482 by aamodbhatt: 1.0787
PR #1487 by ndokutovich: 1.0600
PR #1489 by joshkmartinez: 1.0736
PR #1517 by RulinShao: 1.0632
PR #1529 by msisovic: 1.0744
PR #1539 by translatingthename: 1.0587
PR #1550 by translatingthename: 1.0587
PR #1601 by SPThole: 1.1190
PR #1620 by shiawyonglim: 1.6644
PR #1624 by joshkmartinez: 1.0585
PR #1625 by ChideraIbe123: 1.1104
PR #1633 by joshkmartinez: 1.0585
PR #1638 by kunwar-vikrant: 1.0832
PR #1676 by aazizyan: 1.0788
PR #1697 by Buld1n: 1.0812
PR #1701 by Buld1n: 1.1016
PR #1702 by Buld1n: 1.1092
PR #1703 by Buld1n: 1.0832
PR #1735 by AjAnubolu: 1.0429
PR #1738 by alertcat: 1.0354
PR #1758 by kilojoules: 1.0277
PR #1760 by BrandtChristian: 1.1863

Hyperparameters Across PRs
| pr_number | parameters |
|---|---|
| 30 | — |
| 64 | {"epochs":25,"learning_rate":0.012,"momentum":0.9,"freeze_blocks":0} |
| 150 | {"learning_rate":0.002,"epochs":3,"freeze_first_blocks":2} |
| 152 | {"learning_rate":0.002,"epochs":2,"momentum":0.9,"batch_size":32} |
| 178 | {"epochs":2,"learning_rate":0.002,"frozen_blocks":4} |
| 223 | {"learning_rate":0.0003,"epochs":1,"momentum":0.95} |
| 254 | {"learning_rate":0.002,"momentum":0.9,"epochs":3,"freezing_first_blocks":2} |
| 263 | {"learning_rate":0.0001,"steps":1,"scope":["attn.proj.weight","mlp.proj.weight"]} |
| 264 | {"optimizer":"SGD","learning_rate":0.002,"momentum":0.9,"epochs":2} |
| 281 | {"learning_rate":0.002,"momentum":0.9,"epochs":3} |
| 290 | {"epochs":3,"learning_rate":0.002,"freeze_blocks":2} |
| 297 | {"learning_rate":0.0003,"momentum":0.95} |
| 301 | {"epochs":3,"learning_rate":0.002,"momentum":0.9,"freeze_blocks":2} |
| 302 | {"scope":"MLP weights in last 3 blocks","learning_rate":null,"decay_prior":true} |
| 303 | {"learning_rate":0.002,"epochs":3,"freeze_blocks":2,"momentum":0.9,"gradient_clipping":1} |
| 317 | {"learning_rate":0.002,"epochs":3,"momentum":0.9,"freeze_blocks":2} |
| 334 | {"epochs":3,"frozen_blocks":2} |
| 338 | {"epochs":3,"learning_rate":0.002,"momentum":0.9,"frozen_blocks":2} |
| 364 | {"learning_rate":0.005,"momentum":0.9,"epochs":15,"freeze_blocks":0,"batch_seqs":16} |
| 366 | {"epochs":3,"learning_rate":0.002,"momentum":0.9,"grad_clip":1,"frozen_blocks":2} |
| 384 | {"learning_rate":0.002,"momentum":0.9,"epochs":2} |
| 388 | {"learning_rate":0.008,"epochs":25,"momentum":0.9,"batch_seqs":32,"freeze_blocks":0} |
| 390 | {"epochs":8,"learning_rate":0.002,"momentum":0.9} |
| 397 | {"learning_rate":0.002,"epochs":3,"freeze_blocks":2,"momentum":0.9} |
| 398 | {"epochs":20,"learning_rate":0.008,"momentum":0.9,"freeze_blocks":0} |
| 401 | {"epochs":8,"learning_rate":0.002,"momentum":0.9} |
| 421 | {"epochs":3,"optimizer":"SGD","time":"83s"} |
| 442 | {"learning_rate":0.0005,"epochs":10,"optimizer":"AdamW"} |
| 455 | {"epochs":3,"optimizer":"SGD","momentum":0.9,"learning_rate":0.002,"batch_size":32,"freezes_first_blocks":2} |
| 481 | {"optimizer":"AdamW","learning_rate":0.0005,"epochs":30,"cosine_decay":true,"per_layer_lr":true,"freeze_blocks":0,"batch_seqs_per_gpu":64} |
| 486 | {"optimizer":"AdamW","epochs":10,"freeze_blocks":0,"time_seconds":154} |
| 509 | {"epochs":30,"lr_schedule":"cosine decay"} |
| 517 | {"epochs":100,"learning_rate":0.001,"lr_min":0.00001,"scheduler":"cosine annealing"} |
| 518 | {"epochs":50,"learning_rate":0.0005,"weight_decay":0,"all_parameters_unfrozen":true,"per_layer_lr":{"mlp.proj":3,"mlp.fc":0.5},"grad_clip":1,"ddp_gradient_sync":true} |
| 562 | {"epochs":22,"optimizer":"AdamW","learning_rate":0.0005,"weight_decay":0,"lr_schedule":"per-step cosine decay to 0","per_layer_lr_groups":{"output_projections":3,"input_projections":0.5},"batch_size_per_gpu":32,"gradient_sync":"all_reduce per step","gradient_clipping":1,"TTT_time_seconds":406,"eval_time_seconds":197} |
| 581 | {"epochs":20,"learning_rate":0.0005,"min_learning_rate":0.00002} |
| 587 | — |
| 595 | {"optimizer":"AdamW","learning_rate":0.0005,"epochs":10,"weight_decay":0,"gradient_clipping":1} |
| 639 | {"optimizers_tested":["AdamW","SGD"],"learning_rates":[0.0005,0.002,0.001],"epochs":[3,5,10],"effect":"neutral-to-harmful on GPTQ weights"} |
| 661 | {"epochs":30,"schedule":"cosine","seed":1337} |
| 667 | {"learning_rate":0.003,"momentum":0.95,"epochs":3,"chunk_tokens":32768} |
| 672 | {"epochs":30,"optimizer":"AdamW","learning_rate":0.0005,"lr_schedule":"cosine decay","per_layer_lr_groups":{"mlp.proj":3,"mlp.fc":0.5}} |
| 684 | {"epochs":20} |
| 685 | {"phases":2,"phase_1":"cosine recovery","phase_2":"multi-pass score-first scoring","passes":3} |
| 686 | {"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"freeze_blocks":2,"untie":false} |
| 696 | {"learning_rate":0.002,"epochs_per_chunk":2,"stride":256,"chunk_tokens":32768,"batch_seqs":32,"all_parameters_adapt":true} |
| 702 | {"epochs":100,"learning_rate":0.001} |
| 741 | {"epochs":20,"learning_rate_schedule":"cosine","per_layer_lr_groups":true} |
| 756 | {"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"stride":64} |
| 771 | {"learning_rate":0.0005,"epochs":30,"cosine":true,"per_layer_lr":true,"freeze_blocks":0,"batch_seqs":64,"max_steps":300} |
| 785 | {"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"freeze_blocks":0,"momentum":0.9,"batch_seqs":32,"grad_clip":1} |
| 852 | {"epochs":3} |
| 857 | {"epochs":20,"learning_rate":0.0005} |
| 928 | {"enabled":1} |
| 964 | {"document_isolated":true,"reset_at_bos":true} |
| 965 | {"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"freeze_blocks":0,"momentum":0.9,"batch_seqs":32,"grad_clip":1} |
| 967 | {"epochs":4,"zero_frozen_blocks":true,"skip_sliding_eval":true} |
| 977 | {"legal":true} |
| 1002 | {"learning_rate":0.002,"epochs":3,"legal_score_first":true} |
| 1005 | {"learning_rate":0.002,"epochs":3,"chunk_tokens":32768} |
| 1006 | {"mode":"pre-quantization","epochs":3} |
| 1009 | {"learning_rate":0.002,"epochs":3} |
| 1043 | {"epochs":3,"learning_rate":0.002,"momentum":0.9,"grad_clip":1} |
| 1045 | {"learning_rate":0.002,"epochs":3} |
| 1050 | {"enabled":true} |
| 1052 | {"chunk":65536} |
| 1077 | {"window_size":32768,"optimizer":"SGD"} |
| 1107 | {"epochs":1} |
| 1108 | {"learning_rate_range":[0.00005,0.002]} |
| 1186 | {"epochs":20,"learning_rate":null} |
| 1191 | — |
| 1193 | — |
| 1216 | {"learning_rate":0.002,"epochs":3,"legal":true,"score_first":true} |
| 1228 | — |
| 1255 | {"optimizer":"AdamW"} |
| 1257 | {"learning_rate":0.0005,"epochs":3} |
| 1270 | {"epochs":3} |
| 1275 | — |
| 1306 | {"epochs":6,"learning_rate":0.0005,"freeze_first_blocks":2,"batch_size":32} |
| 1310 | {"learning_rate":0.002,"epochs":3,"all_blocks_unfrozen":true} |
| 1318 | {"learning_rate":0.001,"epochs":1,"frozen_blocks":"0-9"} |
| 1328 | {"optimizer":"AdamW","epochs":1,"learning_rate":0.001,"freeze_blocks":10} |
| 1329 | {"optimizer":"AdamW","epochs":1,"learning_rate":0.001,"freeze_blocks":10} |
| 1351 | {"timing":"pre-quantization","epochs":10,"freeze":0,"per_block_lr_scaling":{"start":0.3,"end":1,"interpolation":"linear"}} |
| 1364 | {"epochs":6,"freeze_first_blocks":2,"learning_rate_start":0.0005,"learning_rate_end":0.00005} |
| 1376 | {"epochs":6,"freeze_first_blocks":2,"learning_rate":0.0005} |
| 1397 | — |
| 1400 | {"epochs":10,"adaptive_lr":true,"per_block_lr":true} |
| 1407 | {"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"freeze_blocks":0,"momentum":0.9,"batch_seqs":32,"grad_clip":1} |
| 1408 | {"learning_rate":0.0005,"epochs":10,"freeze_blocks":0,"per_block_lr_scale":"0.3x-1.0x"} |
| 1416 | {"optimizer":"AdamW","epochs":6,"timing":"pre-quant"} |
| 1422 | {"optimizer":"SGD","learning_rate":0.005,"momentum":0.9,"chunk_size":2048,"all_weights":true} |
| 1423 | {"pre_quant":true,"epochs":6,"learning_rate":0.0005,"freeze_first_blocks":2} |
| 1424 | {"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"freeze_blocks":0,"momentum":0.9,"batch_seqs":32,"grad_clip":1} |
| 1430 | {"optimizer":"AdamW","epochs":1,"learning_rate":0.001,"freeze_blocks":"0-9","second_pass_fraction":0.1,"floor_lr":0.0001} |
| 1482 | {"epochs":8,"learning_rate":0.00045,"freeze_blocks":1} |
| 1487 | {"epochs":10,"learning_rate":0.00045,"freeze_blocks":1,"schedule":"cosine"} |
| 1489 | {"learning_rate":0.0005,"epochs":6,"freeze_blocks":2,"batch_seqs":32,"grad_clip":1} |
| 1517 | {"epochs":18,"learning_rate":0.0003,"freeze_blocks":1} |
| 1529 | {"enabled":true,"learning_rate":0.01} |
| 1539 | {"epochs":6,"learning_rate":0.0005,"freeze_blocks":2,"compiled":true} |
| 1550 | {"learning_rate":0.0005,"epochs":6,"freeze_blocks":2,"batch_size":32,"sequence_length":2048,"compiled":true} |
| 1601 | {"every":4,"inner_outer_split":"cross-chunk","delta_loss":true,"optimizer":"SAM","rho":0.05} |
| 1620 | {"micro_batching":true} |
| 1624 | {"learning_rate":0.00045,"epochs":10,"freeze_blocks":1} |
| 1625 | {"mode":"E2E","scope":"MLP-only in last fraction of blocks","last_frac":null,"learning_rate":0.015,"epochs":2} |
| 1633 | {"learning_rate":0.00045,"epochs":10,"freeze_blocks":1} |
| 1638 | {"adaptive_epochs":true,"max_epochs":null,"min_epochs":null,"ema":null} |
| 1676 | {"score_before_update":true,"single_pass":true,"learning_rate":0.005,"momentum":0.9,"epochs_per_chunk":3} |
| 1697 | {"learning_rate":0.005,"epochs":3} |
| 1701 | {"learning_rate":0.005,"epochs":3} |
| 1702 | {"easy_chunk_ratio":0.998,"easy_chunk_epochs":1,"outlier_drop_fraction":0.03,"score_weight_power":0.5} |
| 1703 | {"learning_rate":0.005,"epochs":3} |
| 1735 | {"enabled":true,"epochs":21,"parallel_gpus":8,"pre_quant":true} |
| 1738 | {"epochs":21,"parallel_gpus":8,"federated_averaging":true} |
| 1758 | {"learning_rate":0.001,"freeze_blocks":0,"epochs":21,"phase":"pre-quant"} |
| 1760 | {"epochs":7,"learning_rate":0.0005,"pre_quantization":true} |