← Back to Test-Time Training

full TTT

Test-Time Training
Used in
117 PRs
Best BPB
0.3964
Avg BPB
1.1198

Submissions

PR #30by JackYoung27
1.2663
PR #64by yesbhautik
1.1250
PR #150by yahya010
1.1478
PR #152by timowhite88
1.1744
PR #178by timowhite88
1.1667
PR #223by 0xjaishy
1.1326
PR #254by timowhite88
1.1303
PR #263by Dannybc123
1.5382
PR #264by stukenov
1.1455
PR #281by charmquark1984
1.1381
PR #290by ibarrajo
1.1354
PR #297by davidpuertolas
1.1629
PR #301by lookin-zz
1.1807
PR #302by JackYoung27
1.1520
PR #303by sseanliu
1.1436
PR #317by chris-buckley
1.1442
PR #334by nathon-lee
1.2207
PR #338by alertcat
1.1254
PR #364by shikhar1729
1.1497
PR #366by shivnarainms22
1.1574
PR #384by anantdgoel
1.2882
PR #388by ElliotSlusky
1.1231
PR #390by newjordan
1.1295
PR #397by translatingthename
1.1364
PR #398by felipe-parodi
1.1213
PR #401by newjordan
1.1243
PR #421by vytautas-bunevicius
1.1466
PR #442by sjp611
1.1027
PR #455by kasimte
1.1299
PR #481by mrdavtan
1.0970
PR #486by ndokutovich
1.1101
PR #509by andrewbaggio1
1.1175
PR #517by lukacf
0.9789
PR #518by sofiabod
1.0622
PR #562by bigbag
1.1354
PR #581by teddyoweh
1.0698
PR #587by newjordan
1.1208
PR #595by LoquiAuris
1.1100
PR #639by Robby955
1.1158
PR #661by andrewbaggio1
1.1175
PR #667by suchitj2702
1.1352
PR #672by andrewbaggio1
1.0781
PR #684by DeepReinforce
1.0574
PR #685by andrewbaggio1
1.0366
PR #686by msisovic
1.1182
PR #696by gravelBridge
1.2622
PR #702by lukacf
1.0244
PR #741by andrewbaggio1
0.9850
PR #756by abaybektursun
1.1142
PR #771by sunnypatneedi
1.0705
PR #785by SirSaltySalmon
1.5364
PR #852by Prush69
1.1189
PR #857by aruniyer
1.1093
PR #928by autocode-rayes
1.1211
PR #964by vivekvar-dl
1.3900
PR #965by Adam-Jacuch
1.1184
PR #967by dexhunter
1.0450
PR #977by michaelwinczuk
1.1185
PR #1002by SoHarshh
1.1650
PR #1005by OnlyJundong
1.0853
PR #1006by NewyorkDev
1.1085
PR #1009by SoHarshh
1.1574
PR #1043by okezue
1.1261
PR #1045by Hilo-Hilo
1.1509
PR #1050by Taleef7
1.1194
PR #1052by demouo
1.1978
PR #1077by malc3om
1.1130
PR #1107by mradassaad
1.5633
PR #1108by DbBested
1.1502
PR #1186by andrewbaggio1
0.9850
PR #1191by dentity007
1.3587
PR #1193by dentity007
1.4390
PR #1216by SoHarshh
1.1574
PR #1228by meinlebenswerk
1.1527
PR #1255by akaiHuang
1.5080
PR #1257by BoxiYu
1.0855
PR #1270by VirajDeshwal
1.1088
PR #1275by ranausmanai
1.1492
PR #1306by resouer
1.0846
PR #1310by cadenmcmann
1.1177
PR #1318by renqianluo
1.0095
PR #1328by renqianluo
0.6361
PR #1329by renqianluo
0.6361
PR #1351by resouer
1.0807
PR #1364by stukenov
1.1025
PR #1376by stukenov
0.7094
PR #1397by Mertyandimata
1.1047
PR #1400by tmancino
1.1035
PR #1407by OnlyJundong
1.0960
PR #1408by aamodbhatt
1.0800
PR #1416by erichroepke
1.0795
PR #1422by swapp1990
1.1172
PR #1423by aryanbhosale
1.0791
PR #1424by OnlyJundong
1.0858
PR #1430by renqianluo
0.3964
PR #1482by aamodbhatt
1.0787
PR #1487by ndokutovich
1.0600
PR #1489by joshkmartinez
1.0736
PR #1517by RulinShao
1.0632
PR #1529by msisovic
1.0744
PR #1539by translatingthename
1.0587
PR #1550by translatingthename
1.0587
PR #1601by SPThole
1.1190
PR #1620by shiawyonglim
1.6644
PR #1624by joshkmartinez
1.0585
PR #1625by ChideraIbe123
1.1104
PR #1633by joshkmartinez
1.0585
PR #1638by kunwar-vikrant
1.0832
PR #1676by aazizyan
1.0788
PR #1697by Buld1n
1.0812
PR #1701by Buld1n
1.1016
PR #1702by Buld1n
1.1092
PR #1703by Buld1n
1.0832
PR #1735by AjAnubolu
1.0429
PR #1738by alertcat
1.0354
PR #1758by kilojoules
1.0277
PR #1760by BrandtChristian
1.1863

Hyperparameters Across PRs

pr_numberparameters
30
64{"epochs":25,"learning_rate":0.012,"momentum":0.9,"freeze_blocks":0}
150{"learning_rate":0.002,"epochs":3,"freeze_first_blocks":2}
152{"learning_rate":0.002,"epochs":2,"momentum":0.9,"batch_size":32}
178{"epochs":2,"learning_rate":0.002,"frozen_blocks":4}
223{"learning_rate":0.0003,"epochs":1,"momentum":0.95}
254{"learning_rate":0.002,"momentum":0.9,"epochs":3,"freezing_first_blocks":2}
263{"learning_rate":0.0001,"steps":1,"scope":["attn.proj.weight","mlp.proj.weight"]}
264{"optimizer":"SGD","learning_rate":0.002,"momentum":0.9,"epochs":2}
281{"learning_rate":0.002,"momentum":0.9,"epochs":3}
290{"epochs":3,"learning_rate":0.002,"freeze_blocks":2}
297{"learning_rate":0.0003,"momentum":0.95}
301{"epochs":3,"learning_rate":0.002,"momentum":0.9,"freeze_blocks":2}
302{"scope":"MLP weights in last 3 blocks","learning_rate":null,"decay_prior":true}
303{"learning_rate":0.002,"epochs":3,"freeze_blocks":2,"momentum":0.9,"gradient_clipping":1}
317{"learning_rate":0.002,"epochs":3,"momentum":0.9,"freeze_blocks":2}
334{"epochs":3,"frozen_blocks":2}
338{"epochs":3,"learning_rate":0.002,"momentum":0.9,"frozen_blocks":2}
364{"learning_rate":0.005,"momentum":0.9,"epochs":15,"freeze_blocks":0,"batch_seqs":16}
366{"epochs":3,"learning_rate":0.002,"momentum":0.9,"grad_clip":1,"frozen_blocks":2}
384{"learning_rate":0.002,"momentum":0.9,"epochs":2}
388{"learning_rate":0.008,"epochs":25,"momentum":0.9,"batch_seqs":32,"freeze_blocks":0}
390{"epochs":8,"learning_rate":0.002,"momentum":0.9}
397{"learning_rate":0.002,"epochs":3,"freeze_blocks":2,"momentum":0.9}
398{"epochs":20,"learning_rate":0.008,"momentum":0.9,"freeze_blocks":0}
401{"epochs":8,"learning_rate":0.002,"momentum":0.9}
421{"epochs":3,"optimizer":"SGD","time":"83s"}
442{"learning_rate":0.0005,"epochs":10,"optimizer":"AdamW"}
455{"epochs":3,"optimizer":"SGD","momentum":0.9,"learning_rate":0.002,"batch_size":32,"freezes_first_blocks":2}
481{"optimizer":"AdamW","learning_rate":0.0005,"epochs":30,"cosine_decay":true,"per_layer_lr":true,"freeze_blocks":0,"batch_seqs_per_gpu":64}
486{"optimizer":"AdamW","epochs":10,"freeze_blocks":0,"time_seconds":154}
509{"epochs":30,"lr_schedule":"cosine decay"}
517{"epochs":100,"learning_rate":0.001,"lr_min":0.00001,"scheduler":"cosine annealing"}
518{"epochs":50,"learning_rate":0.0005,"weight_decay":0,"all_parameters_unfrozen":true,"per_layer_lr":{"mlp.proj":3,"mlp.fc":0.5},"grad_clip":1,"ddp_gradient_sync":true}
562{"epochs":22,"optimizer":"AdamW","learning_rate":0.0005,"weight_decay":0,"lr_schedule":"per-step cosine decay to 0","per_layer_lr_groups":{"output_projections":3,"input_projections":0.5},"batch_size_per_gpu":32,"gradient_sync":"all_reduce per step","gradient_clipping":1,"TTT_time_seconds":406,"eval_time_seconds":197}
581{"epochs":20,"learning_rate":0.0005,"min_learning_rate":0.00002}
587
595{"optimizer":"AdamW","learning_rate":0.0005,"epochs":10,"weight_decay":0,"gradient_clipping":1}
639{"optimizers_tested":["AdamW","SGD"],"learning_rates":[0.0005,0.002,0.001],"epochs":[3,5,10],"effect":"neutral-to-harmful on GPTQ weights"}
661{"epochs":30,"schedule":"cosine","seed":1337}
667{"learning_rate":0.003,"momentum":0.95,"epochs":3,"chunk_tokens":32768}
672{"epochs":30,"optimizer":"AdamW","learning_rate":0.0005,"lr_schedule":"cosine decay","per_layer_lr_groups":{"mlp.proj":3,"mlp.fc":0.5}}
684{"epochs":20}
685{"phases":2,"phase_1":"cosine recovery","phase_2":"multi-pass score-first scoring","passes":3}
686{"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"freeze_blocks":2,"untie":false}
696{"learning_rate":0.002,"epochs_per_chunk":2,"stride":256,"chunk_tokens":32768,"batch_seqs":32,"all_parameters_adapt":true}
702{"epochs":100,"learning_rate":0.001}
741{"epochs":20,"learning_rate_schedule":"cosine","per_layer_lr_groups":true}
756{"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"stride":64}
771{"learning_rate":0.0005,"epochs":30,"cosine":true,"per_layer_lr":true,"freeze_blocks":0,"batch_seqs":64,"max_steps":300}
785{"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"freeze_blocks":0,"momentum":0.9,"batch_seqs":32,"grad_clip":1}
852{"epochs":3}
857{"epochs":20,"learning_rate":0.0005}
928{"enabled":1}
964{"document_isolated":true,"reset_at_bos":true}
965{"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"freeze_blocks":0,"momentum":0.9,"batch_seqs":32,"grad_clip":1}
967{"epochs":4,"zero_frozen_blocks":true,"skip_sliding_eval":true}
977{"legal":true}
1002{"learning_rate":0.002,"epochs":3,"legal_score_first":true}
1005{"learning_rate":0.002,"epochs":3,"chunk_tokens":32768}
1006{"mode":"pre-quantization","epochs":3}
1009{"learning_rate":0.002,"epochs":3}
1043{"epochs":3,"learning_rate":0.002,"momentum":0.9,"grad_clip":1}
1045{"learning_rate":0.002,"epochs":3}
1050{"enabled":true}
1052{"chunk":65536}
1077{"window_size":32768,"optimizer":"SGD"}
1107{"epochs":1}
1108{"learning_rate_range":[0.00005,0.002]}
1186{"epochs":20,"learning_rate":null}
1191
1193
1216{"learning_rate":0.002,"epochs":3,"legal":true,"score_first":true}
1228
1255{"optimizer":"AdamW"}
1257{"learning_rate":0.0005,"epochs":3}
1270{"epochs":3}
1275
1306{"epochs":6,"learning_rate":0.0005,"freeze_first_blocks":2,"batch_size":32}
1310{"learning_rate":0.002,"epochs":3,"all_blocks_unfrozen":true}
1318{"learning_rate":0.001,"epochs":1,"frozen_blocks":"0-9"}
1328{"optimizer":"AdamW","epochs":1,"learning_rate":0.001,"freeze_blocks":10}
1329{"optimizer":"AdamW","epochs":1,"learning_rate":0.001,"freeze_blocks":10}
1351{"timing":"pre-quantization","epochs":10,"freeze":0,"per_block_lr_scaling":{"start":0.3,"end":1,"interpolation":"linear"}}
1364{"epochs":6,"freeze_first_blocks":2,"learning_rate_start":0.0005,"learning_rate_end":0.00005}
1376{"epochs":6,"freeze_first_blocks":2,"learning_rate":0.0005}
1397
1400{"epochs":10,"adaptive_lr":true,"per_block_lr":true}
1407{"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"freeze_blocks":0,"momentum":0.9,"batch_seqs":32,"grad_clip":1}
1408{"learning_rate":0.0005,"epochs":10,"freeze_blocks":0,"per_block_lr_scale":"0.3x-1.0x"}
1416{"optimizer":"AdamW","epochs":6,"timing":"pre-quant"}
1422{"optimizer":"SGD","learning_rate":0.005,"momentum":0.9,"chunk_size":2048,"all_weights":true}
1423{"pre_quant":true,"epochs":6,"learning_rate":0.0005,"freeze_first_blocks":2}
1424{"learning_rate":0.002,"epochs":3,"chunk_tokens":32768,"freeze_blocks":0,"momentum":0.9,"batch_seqs":32,"grad_clip":1}
1430{"optimizer":"AdamW","epochs":1,"learning_rate":0.001,"freeze_blocks":"0-9","second_pass_fraction":0.1,"floor_lr":0.0001}
1482{"epochs":8,"learning_rate":0.00045,"freeze_blocks":1}
1487{"epochs":10,"learning_rate":0.00045,"freeze_blocks":1,"schedule":"cosine"}
1489{"learning_rate":0.0005,"epochs":6,"freeze_blocks":2,"batch_seqs":32,"grad_clip":1}
1517{"epochs":18,"learning_rate":0.0003,"freeze_blocks":1}
1529{"enabled":true,"learning_rate":0.01}
1539{"epochs":6,"learning_rate":0.0005,"freeze_blocks":2,"compiled":true}
1550{"learning_rate":0.0005,"epochs":6,"freeze_blocks":2,"batch_size":32,"sequence_length":2048,"compiled":true}
1601{"every":4,"inner_outer_split":"cross-chunk","delta_loss":true,"optimizer":"SAM","rho":0.05}
1620{"micro_batching":true}
1624{"learning_rate":0.00045,"epochs":10,"freeze_blocks":1}
1625{"mode":"E2E","scope":"MLP-only in last fraction of blocks","last_frac":null,"learning_rate":0.015,"epochs":2}
1633{"learning_rate":0.00045,"epochs":10,"freeze_blocks":1}
1638{"adaptive_epochs":true,"max_epochs":null,"min_epochs":null,"ema":null}
1676{"score_before_update":true,"single_pass":true,"learning_rate":0.005,"momentum":0.9,"epochs_per_chunk":3}
1697{"learning_rate":0.005,"epochs":3}
1701{"learning_rate":0.005,"epochs":3}
1702{"easy_chunk_ratio":0.998,"easy_chunk_epochs":1,"outlier_drop_fraction":0.03,"score_weight_power":0.5}
1703{"learning_rate":0.005,"epochs":3}
1735{"enabled":true,"epochs":21,"parallel_gpus":8,"pre_quant":true}
1738{"epochs":21,"parallel_gpus":8,"federated_averaging":true}
1758{"learning_rate":0.001,"freeze_blocks":0,"epochs":21,"phase":"pre-quant"}
1760{"epochs":7,"learning_rate":0.0005,"pre_quantization":true}