← Back to Architecture
Gated Attention
Architecture
Used in
142 PRs
Best BPB
0.0281
Avg BPB
1.0394
Submissions
PR #344 by aryanbhosale — 1.1330
PR #413 by anantdgoel — 1.4525
PR #430 by sahiee-dev — 1.1428
PR #474 by joshuaswarren — 1.1690
PR #487 by anantdgoel — 1.1720
PR #516 by Asukabot0 — 1.1428
PR #562 by bigbag — 1.1354
PR #635 by aryanbhosale — 1.1330
PR #638 by Asukabot0 — 1.1164
PR #670 by abaybektursun — 1.1171
PR #715 by Asukabot0 — 1.0337
PR #727 by Asukabot0 — 0.9674
PR #733 by stukenov — 1.0278
PR #745 by stukenov — 1.0222
PR #754 by aryanbhosale — 1.1253
PR #758 by hypery11 — 1.0465
PR #761 by Asukabot0 — 0.9581
PR #763 by hypery11 — 0.9917
PR #788 by hypery11 — 0.9059
PR #795 by hypery11 — 0.8881
PR #813 by hypery11 — 0.6671
PR #828 by bigbag — 0.9076
PR #838 by aryanbhosale — 1.1215
PR #850 by callithyia — 0.3212
PR #864 by aryanbhosale — 0.2841
PR #865 by aryanbhosale — 0.2841
PR #871 by greqone — 0.8004
PR #875 by shalyhinpavel — 1.0226
PR #893 by aryanbhosale — 0.1310
PR #909 by sunnypatneedi — 0.8609
PR #921 by TimPietrusky — 0.0939
PR #925 by THUQiXuan — 0.0281
PR #940 by antaloaalonso — 0.9581
PR #950 by jzgdev — 1.3178
PR #952 by FlashyFlash3011 — 1.1144
PR #963 by sunnypatneedi — 0.8609
PR #1001 by ibarrajo — 1.1188
PR #1036 by ivanontech — 1.1974
PR #1152 by ericdatum — 1.7942
PR #1159 by JDAppleseed — 0.3693
PR #1170 by Christopher-Lee-McClendon — 1.1199
PR #1185 by skoustav35 — 0.9641
PR #1218 by clarkkev [RECORD] — 1.0978
PR #1232 by Christopher-Lee-McClendon — 1.0929
PR #1283 by newjordan — 1.1373
PR #1287 by dentity007 — 1.1048
PR #1307 by amrayach — 1.1101
PR #1311 by htrung1105 — 1.1303
PR #1410 by izlley — 1.1158
PR #1452 by bsisduck — 0.3509
PR #1454 by bsisduck — 0.3509
PR #1490 by wisebreadloaf — 1.6110
PR #1520 by taka6745 — 1.0824
PR #1536 by dexhunter — 1.0775
PR #1537 by pireylow — 1.3971
PR #1553 by Abhishek8108 — 1.2097
PR #1573 by shivangbaveja — 1.1464
PR #1585 by codemath3000 — 1.0639
PR #1627 by mike-ferguson — 1.3246
PR #1633 by joshkmartinez — 1.0585
PR #1667 by MarioPaerle [RECORD] — 1.0714
PR #1670 by dexhunter — 1.0597
PR #1671 by souro26 — 1.3827
PR #1671 by souro26 — 1.3827
PR #1683 by yunoshev — 1.1280
PR #1689 by chris-colinsky — 1.0822
PR #1697 by Buld1n — 1.0812
PR #1728 by mikeapedia — 1.0771
PR #1734 by yahya010 — 1.0108
PR #1736 by dexhunter [RECORD] — 1.0655
PR #1738 by alertcat — 1.0354
PR #1751 by Pravin-dev06 — 1.3565
PR #1756 by romeerp — 1.0651
PR #1766 by tashapais — 1.0655
PR #1769 by dexhunter [RECORD] — 1.0645
PR #1770 by liujshi — 1.0796
PR #1771 by bigbag — 1.0651
PR #1775 by dentity007 — 1.0729
PR #1779 by leon2k2k2k — 1.0642
PR #1784 by renqianluo — 1.0708
PR #1787 by nprime06 [RECORD] — 1.0638
PR #1790 by miaoyuxun — 1.0699
PR #1792 by renqianluo — 1.0701
PR #1794 by Programmerryoki — 1.0849
PR #1797 by dexhunter — 1.0616
PR #1798 by leon2k2k2k — 1.0629
PR #1799 by jamesEmerson112 — 1.2073
PR #1800 by leon2k2k2k — 1.0629
PR #1801 by leon2k2k2k — 1.0629
PR #1801 by leon2k2k2k — 1.0629
PR #1812 by EthanNing — 1.0729
PR #1825 by EthanYangTW — 1.0770
PR #1826 by EthanYangTW — 1.0770
PR #1854 by ndokutovich — 0.9024
PR #1857 by dexhunter — 1.0322
PR #1859 by suchihype — 1.1454
PR #1874 by AjAnubolu — 1.0677
PR #1886 by renqianluo — 1.0696
PR #1895 by VFYAS — 1.0785
PR #1898 by X-Abhishek-X — 1.0661
PR #1906 by AayushBaniya2006 — 1.0614
PR #1909 by GodlyDonuts — 1.0700
PR #1914 by Fija — 1.0612
PR #1915 by AidenGeunGeun — 1.0650
PR #1920 by bigbag — 1.0699
PR #1923 by jorge-asenjo — 1.0658
PR #1924 by dexhunter — 1.0608
PR #1926 by bigbag — 1.0684
PR #1938 by lijuncheng16 — 1.0713
PR #1941 by MarioPaerle — 1.0687
PR #1948 by TimS-ml — 1.0624
PR #1956 by AayushBaniya2006 — 1.0604
PR #1957 by mhlov000111 — 1.2313
PR #1963 by someone114514 — 1.0583
PR #1965 by himanshudongre — 1.0587
PR #1969 by bsisduck — 1.0804
PR #1970 by bsisduck — 1.0674
PR #1977 by sahiee-dev — 1.0730
PR #1978 by EthanYangTW — 1.0778
PR #1984 by unowenmaxwen — 1.0602
PR #1987 by TimS-ml — 1.0618
PR #1992 by jamesEmerson112 — 1.0511
PR #1995 by User123331 — 1.0878
PR #2005 by jamesEmerson112 — 1.0805
PR #2015 by Muhtasham — 1.2880
PR #2017 by Armigerous — 1.0611
PR #2019 by aquariouseworkman — 1.0585
PR #2019 by aquariouseworkman — 1.0585
PR #2026 by RahimMirani — 1.0611
PR #2031 by deborahnelson8788726 — 1.0599
PR #2034 by Maheshram1 — 1.0576
PR #2037 by organic-intelligence-1976 — 1.2670
PR #2047 by ZanePeycke — 1.0591
PR #2051 by dexhunter — 1.0605
PR #2060 by S0urC10ud — 1.0579
PR #2067 by jiashenggu — 1.0592
PR #2068 by jayaram1125 — 1.0617
PR #2071 by jamesEmerson112 — 1.0066
PR #2072 by wfproc — 1.0717
PR #2074 by sanilb19 — 1.0884
PR #2078 by hi-aduek — 1.0580
PR #2090 by SPThole — 1.2310

Hyperparameters Across PRs
| pr_number | parameters |
|---|---|
| 344 | — |
| 413 | {"bias_init":4} |
| 430 | {"layers":10} |
| 474 | — |
| 487 | {"added_params":37000} |
| 516 | — |
| 562 | — |
| 635 | — |
| 638 | — |
| 670 | — |
| 715 | — |
| 727 | — |
| 733 | — |
| 745 | — |
| 754 | — |
| 758 | — |
| 761 | — |
| 763 | — |
| 788 | — |
| 795 | — |
| 813 | — |
| 828 | — |
| 838 | — |
| 850 | {"bias":4} |
| 864 | — |
| 865 | — |
| 871 | — |
| 875 | {"layers":8,"final_attention_layer":1,"n_embd":384} |
| 893 | — |
| 909 | — |
| 921 | {"layers":11,"dim":512,"heads":8,"kv_heads":4} |
| 925 | {"experts":16,"hidden_size":512} |
| 940 | — |
| 950 | — |
| 952 | {"weight_init":0,"bias_init":4} |
| 963 | — |
| 1001 | — |
| 1036 | {"layers":12} |
| 1152 | {"init":0.1} |
| 1159 | {"enabled":0} |
| 1170 | — |
| 1185 | — |
| 1218 | — |
| 1232 | {"qk_gain_init":1.5} |
| 1283 | {"qk_gain_init":4} |
| 1287 | — |
| 1307 | {"layers":[2,4,6,8,10],"window_size":512} |
| 1311 | {"enabled":false} |
| 1410 | {"layers":[0,2,4,6,8,10]} |
| 1452 | {"layers":9} |
| 1454 | {"layers":9} |
| 1490 | {"layers":[1,3],"kv_heads":2} |
| 1520 | — |
| 1536 | — |
| 1537 | {"layer_start":7} |
| 1553 | {"qk_gain":5} |
| 1573 | — |
| 1585 | {"start_layer":8} |
| 1627 | — |
| 1633 | {"qk_gain_init":5.25} |
| 1667 | {"width":12,"layers":11} |
| 1670 | — |
| 1671 | — |
| 1671 | — |
| 1683 | — |
| 1689 | {"qk_gain":5.25} |
| 1697 | {"looped_band_layers":"3..5","recur_attn_gate":1,"recur_attn_gate_scale":0.5} |
| 1728 | — |
| 1734 | {"layers":10,"dimensions":544,"heads":8,"kv_share_stride":2} |
| 1736 | {"init_std":0.005} |
| 1738 | {"qk_gain":5.25} |
| 1751 | — |
| 1756 | — |
| 1766 | {"init_std":0.01} |
| 1769 | {"init_std":0.005,"quant_gate":true} |
| 1770 | {"type":"per-head V-Gate"} |
| 1771 | — |
| 1775 | — |
| 1779 | — |
| 1784 | {"per_head":true,"gate_activation":"sigmoid"} |
| 1787 | {"gate_window":12,"gate_params_per_layer":96} |
| 1790 | {"width":24} |
| 1792 | — |
| 1794 | {"gate":"2σ(attn_gate)"} |
| 1797 | — |
| 1798 | {"init_std":0.005,"quant_gate":true} |
| 1799 | {"type":"headwise","gates_per_head":1} |
| 1800 | — |
| 1801 | {"sparse_gate":true} |
| 1801 | {"frozen_carry":true} |
| 1812 | {"gate_width":12} |
| 1825 | {"width":12} |
| 1826 | {"width":12} |
| 1854 | — |
| 1857 | — |
| 1859 | {"gate_bias_init":2} |
| 1874 | {"width":24} |
| 1886 | — |
| 1895 | {"layers":[0,1,10]} |
| 1898 | {"scale":0.5} |
| 1906 | — |
| 1909 | {"width":36} |
| 1914 | {"num_heads":8,"num_kv_heads":4} |
| 1915 | {"gate_scale":0.5,"gate_window":12} |
| 1920 | {"width":24} |
| 1923 | {"window":12} |
| 1924 | — |
| 1926 | {"width":24} |
| 1938 | — |
| 1941 | {"layers":11,"gate_input_dim":12,"gate_output_dim":1} |
| 1948 | — |
| 1956 | — |
| 1957 | — |
| 1963 | {"sparse_attn_gate_scale":0.5,"sparse_attn_gate_init_std":0,"gated_attn_quant_gate":1} |
| 1965 | {"scale":0.5} |
| 1969 | {"gate_width":32} |
| 1970 | {"gate_width":32} |
| 1977 | {"width":24} |
| 1978 | {"width":12} |
| 1984 | {"scale":0.5} |
| 1987 | {"gate_scale":0.5} |
| 1992 | {"gate_dim":null,"heads":8} |
| 1995 | — |
| 2005 | {"gate_dim":null} |
| 2015 | {"quant_gate":1,"enabled":1} |
| 2017 | — |
| 2019 | {"smear_gate_enabled":true,"sparse_attn_gate_enabled":true,"sparse_attn_gate_scale":0.5} |
| 2019 | {"qk_gain_init":5.25} |
| 2026 | {"gate_window":12} |
| 2031 | {"window":12} |
| 2034 | {"gate_window":12,"scale":0.5} |
| 2037 | {"last_n":11} |
| 2047 | {"scale":0.5} |
| 2051 | — |
| 2060 | — |
| 2067 | — |
| 2068 | — |
| 2071 | {"gate_dim":null,"per_head":true} |
| 2072 | {"scale":0.5} |
| 2074 | {"quant_gate":true,"window":12} |
| 2078 | {"quant_gate":1,"scale":0.5} |
| 2090 | — |