← Back to Architecture
MLP3x
Architecture
Used in
382 PRs
Best BPB
0.0274
Avg BPB
1.0994
Submissions
PR #63by yahya010RECORD
1.1598PR #65by aquariouseworkmanRECORD
1.1556PR #66by arjun-krishna1
1.1632PR #69by TevBenji
1.1708PR #70by jfprincz
1.1659PR #76by unixmadtoonslab
1.1433PR #81by polarizedfortnite-cpu
1.1670PR #86by aruniyerRECORD
1.1502PR #88by seanward
1.1605PR #89by vmfunc
1.1622PR #99by takhir-iota
1.1605PR #102by unnir
1.1618PR #107by m0at
1.1648PR #108by kellyvv
1.4370PR #110by mr-ashish-panday
1.2244PR #114by saml212
1.1574PR #116by abhishekgahlot2
1.1666PR #117by trovatochris
1.1702PR #120by andrewgcodes
0.9588PR #122by mtybadger
1.1603PR #123by saikrishnarallabandi
1.1642PR #128by rsavitt
1.1594PR #135by unnir
1.1539PR #137by abhishekgahlot2
1.1666PR #139by ksang123
1.2029PR #150by yahya010
1.1478PR #156by dexhunter
1.1602PR #160by ChaseWNorton
1.1623PR #162by raahilshahRECORD
1.1458PR #164by jfprincz
1.1524PR #172by GMaN1911
1.1812PR #173by tamoghnokandar
1.1532PR #175by anthony-maio
1.1229PR #178by timowhite88
1.1667PR #180by thwu1RECORD
1.1428PR #181by manfromnowhere143
1.2194PR #187by Idan3011
1.1629PR #190by newjordan
1.1725PR #191by chris-buckley
1.1598PR #192by baudrillardsgh0st
1.1502PR #194by baudrillardsgh0st
1.1480PR #196by sicauzxl
1.3825PR #198by jfprinczRECORD
1.1318PR #201by machdragon
1.1551PR #206by dexhunter
1.1507PR #207by ajkpersonal
1.1568PR #208by ajkpersonal
1.1568PR #212by mrdavtan
1.1329PR #215by JayCheng113
1.1548PR #218by bopmite
1.1248PR #219by alertcat
1.1541PR #222by ansh-deriv
1.1601PR #223by 0xjaishy
1.1326PR #225by dibdabo
1.2089PR #230by MatthewHRockwell
1.1541PR #232by kellyvv
1.4370PR #236by saml212
1.1400PR #238by kellyvv
1.5164PR #243by kvmukilan
1.1704PR #246by kvmukilan
1.1704PR #249by kvmukilan
1.1704PR #251by kshitizz36
1.1596PR #254by timowhite88
1.1303PR #264by stukenov
1.1455PR #265by unnir
1.1307PR #267by andrewgcodes
1.1374PR #273by dentity007
1.1575PR #274by haikosys
1.1403PR #278by nicolasdickenmann
1.0365PR #281by charmquark1984
1.1381PR #287by jfprinczRECORD
1.1271PR #289by integrate-your-mind
1.1518PR #294by sseanliu
1.1645PR #295by gowtham0992
1.1477PR #296by sseanliu
1.1645PR #297by davidpuertolas
1.1629PR #302by JackYoung27
1.1520PR #303by sseanliu
1.1436PR #304by Bortlesboat
1.4245PR #305by Naazimsnh02
1.1672PR #306by xuafeng
1.1448PR #307by dennisimoo
1.1357PR #312by chanwoo-park-official
1.1668PR #316by SkywardSyntax
1.2035PR #317by chris-buckley
1.1442PR #324by crony-io
1.1702PR #326by crony-io
1.2890PR #327by Ananddna
1.1450PR #330by bopmite
1.1609PR #331by Rhodrium
1.1487PR #333by mahsumaktas
1.1565PR #334by nathon-lee
1.2207PR #338by alertcat
1.1254PR #339by sheeki03
1.1364PR #351by sp00mm
1.1659PR #352by sp00mm
1.1659PR #359by tmustier
1.1345PR #361by adityagupta26
1.1400PR #362by mkenney2
1.1497PR #364by shikhar1729
1.1497PR #366by shivnarainms22
1.1574PR #368by MatoTeziTanka
1.2037PR #369by signalrush
1.1328PR #370by SergheiBrinza
1.2421PR #374by unnirRECORD
1.1246PR #375by charmquark1984
1.1257PR #376by anthony-maio
1.1399PR #379by dannywillowliu-uchi
1.1257PR #383by joelnishanth
1.1320PR #385by dentity007
1.1488PR #388by ElliotSlusky
1.1231PR #389by trasnake87
1.1466PR #390by newjordan
1.1295PR #398by felipe-parodi
1.1213PR #400by chanwoo-park-official
1.1296PR #401by newjordan
1.1243PR #403by malc3om
1.1388PR #406by dentity007
1.1287PR #410by EthanYangTW
1.1216PR #414by signalrush
1.1233PR #415by EthanYangTW
1.1216PR #418by yashverms
1.1715PR #422by albertorkive
1.1396PR #424by someone114514
1.1725PR #433by Robby955
1.3441PR #442by sjp611
1.1027PR #443by CREVIOS
1.1431PR #444by AymanMahfuz27
1.4536PR #445by newjordan
1.1236PR #446by sofiabod
1.1933PR #447by CREVIOS
1.1431PR #448by handemanai
1.2006PR #450by zachgoldfine44
1.1466PR #453by Divyesh-Thirukonda
1.1248PR #454by nalediym
1.2055PR #455by kasimte
1.1299PR #456by Christopher-Lee-McClendon
1.1532PR #458by ofirkris
1.1365PR #461by Christopher-Lee-McClendon
1.1446PR #465by LoquiAuris
1.1508PR #466by simonbissonnette
1.1354PR #467by ADIITJ
1.1428PR #469by cmcdnd
1.1418PR #473by abaybektursun
1.1214PR #474by joshuaswarren
1.1690PR #481by mrdavtan
1.0970PR #483by tmustier
1.1346PR #485by harsha-gouru
1.1522PR #486by ndokutovich
1.1101PR #487by anantdgoel
1.1720PR #488by pkim02
1.3267PR #489by sofiabod
1.1327PR #499by newjordan
1.1478PR #508by newjordan
1.1215PR #510by SelfAnush
1.1989PR #512by MatoTeziTanka
0.9512PR #515by keshav55
1.1807PR #516by Asukabot0
1.1428PR #518by sofiabod
1.0622PR #525by hypery11
1.1160PR #528by EthanYangTW
1.1195PR #529by EthanYangTW
1.1195PR #533by newjordan
1.1207PR #546by shajalahamedcse
1.1752PR #547by shajalahamedcse
1.1752PR #548by LoquiAuris
1.0865PR #549by abaybektursunRECORD
1.1194PR #550by haimianbaobao007
1.1890PR #555by ymrohit
1.0916PR #560by Rohan5commit
1.1935PR #563by instax-dutta
1.1428PR #564by sadeghja1070
1.1270PR #568by MatoTeziTanka
0.7853PR #573by Sarimsaljook
1.0523PR #580by micoverde
1.2623PR #583by suchihype
1.1489PR #585by EthanYangTW
1.1179PR #586by EaCognitive
1.1365PR #589by RoyiRa
1.1178PR #593by abaybektursun
1.1163PR #595by LoquiAuris
1.1100PR #596by AriaAnima
0.6430PR #598by Christopher-Lee-McClendon
1.1334PR #605by bigbag
0.7227PR #612by Christopher-Lee-McClendon
1.1079PR #614by bigbag
0.6864PR #618by 0xtigerclaw
1.4702PR #619by zeal175
1.4222PR #622by Upsalla
1.0941PR #633by MatoTeziTanka
1.1526PR #642by minh-stakc
0.8173PR #644by Christopher-Lee-McClendon
1.0944PR #645by FlynnCruse
1.8990PR #659by deanbrr
1.0920PR #666by chrislovescoding
1.1932PR #671by keshav55
1.1807PR #678by SPThole
1.3525PR #681by Alfaxad
1.4775PR #682by gthgomez
1.1233PR #684by DeepReinforce
1.0574PR #685by andrewbaggio1
1.0366PR #688by RoyiRa
1.0745PR #690by EthanYangTW
1.1186PR #692by EthanYangTW
1.1186PR #693by EthanYangTW
1.1186PR #698by hesong0222-dev
1.1642PR #700by RoyiRa
1.0541PR #705by seanward
1.2151PR #709by StolbaJ
1.1478PR #710by Dhruba531
1.1240PR #713by hypery11
1.1180PR #714by Upsalla
1.1187PR #715by Asukabot0
1.0337PR #728by abaybektursun
1.1142PR #730by janwww
1.1570PR #734by Robby955
1.1198PR #736by Git-Aarya
1.2168PR #738by gowtham0992
1.0970PR #741by andrewbaggio1
0.9850PR #744by ShihChunHao
1.2824PR #752by Naazimsnh02
1.1182PR #754by aryanbhosale
1.1253PR #755by dcrow85
1.0321PR #757by fielding
1.1124PR #760by erikqu
1.2185PR #761by Asukabot0
0.9581PR #764by ndokutovich
0.9633PR #768by mradassaad
1.1201PR #770by minh-stakc
0.6672PR #771by sunnypatneedi
1.0705PR #774by travispchen
0.9370PR #777by Robby955
0.9623PR #778by raahilshah
0.9605PR #779by deanbrr
0.6683PR #786by shinegami-2002
0.8128PR #791by ShihChunHao
1.2824PR #794by jeremyschied
1.3346PR #795by hypery11
0.8881PR #796by Robby955
0.6567PR #797by armantsaturian
0.8960PR #798by travispchen
0.5466PR #799by yuvraajbains
1.2005PR #805by zeytx
1.1807PR #806by ibarrajo
0.6678PR #807by connectwithprakash
1.0116PR #808by Naazimsnh02
0.6364PR #809by AayushBaniya2006
0.2952PR #810by Idan3011
0.9393PR #811by quietsmile
0.4377PR #827by Programmerryoki
1.3999PR #828by bigbag
0.9076PR #836by autocode-rayes
1.1219PR #838by aryanbhosale
1.1215PR #840by quietsmile
0.2873PR #841by someone114514
1.1157PR #843by quietsmile
0.2834PR #849by dttdrv
1.1105PR #850by callithyia
0.3212PR #851by RoyiRa
0.2071PR #854by ivanontech
1.4530PR #874by fielding
1.6070PR #880by RoyiRa
0.1003PR #891by robbiebusinessacc
1.1428PR #892by robbiebusinessacc
1.1428PR #896by MVPandey
1.1896PR #900by Robby955
0.1156PR #907by resouer
0.0960PR #915by anthony-maio
0.9642PR #924by THUQiXuan
0.0280PR #925by THUQiXuan
0.0281PR #926by NandhuRajRK
0.8705PR #929by andreanjos
1.1653PR #933by haikosys
0.0804PR #940by antaloaalonso
0.9581PR #941by aptsalt
1.3620PR #945by TimPietrusky
0.0274PR #947by aamodbhatt
1.1576PR #953by dexhunter
1.0722PR #967by dexhunter
1.0450PR #972by Idan3011
0.3922PR #978by AnirudhRahul
1.5134PR #979by 0xadvait
1.1387PR #982by haikosys
0.0638PR #988by ymrohit
1.0857PR #989by alexanderaperry-arch
1.1402PR #993by aerosta
0.9631PR #995by dexhunter
1.0362PR #996by Idan3011
1.1478PR #997by randy06122001-boop
1.4182PR #999by aamodbhatt
1.1179PR #1002by SoHarshh
1.1650PR #1005by OnlyJundong
1.0853PR #1007by dillon-blake
1.2252PR #1009by SoHarshh
1.1574PR #1025by Zagot-byte
1.3579PR #1026by danielxmed
1.0945PR #1048by mrdavtan
1.1724PR #1051by tejas-goyal
1.2826PR #1059by edidisheng
1.1996PR #1062by yaowubarbara
1.4508PR #1068by LappyG
1.1510PR #1071by AbhayAnandUCSD
1.1455PR #1080by ciach
1.1228PR #1085by adityasasidhar
1.2831PR #1086by Omrigotlieb
1.1349PR #1088by serdardoesml
1.2542PR #1095by vimeto
0.0905PR #1105by abaybektursun
1.2208PR #1106by agalimova
1.1465PR #1108by DbBested
1.1502PR #1112by dillon-blake
1.2252PR #1125by jainpranjal97
1.1946PR #1127by dentity007
1.1311PR #1130by Gusanidas
1.1140PR #1139by ivanontech
1.1801PR #1141by ivanontech
1.1801PR #1142by ymrohit
1.1493PR #1144by inFaaa
1.3572PR #1148by aamodbhatt
1.1179PR #1167by Durlabhkumarjha
1.3736PR #1170by Christopher-Lee-McClendon
1.1199PR #1174by Okropniak
1.3069PR #1183by akaiHuang
1.5080PR #1185by skoustav35
0.9641PR #1186by andrewbaggio1
0.9850PR #1190by Durlabhkumarjha
1.3365PR #1202by VirajDeshwal
1.1412PR #1209by andrewbaggio1
1.1064PR #1216by SoHarshh
1.1574PR #1218by clarkkevRECORD
1.0978PR #1226by Wolfie8935
1.1428PR #1227by himanshudongre
1.4841PR #1228by meinlebenswerk
1.1527PR #1240by andrewbaggio1
1.1064PR #1254by Elarwei001
1.1070PR #1256by oidebrett
1.1444PR #1276by BiggerDABOSS
1.1100PR #1280by aamodbhatt
1.1156PR #1298by Omrigotlieb
1.1043PR #1312by adi-suresh01
1.3299PR #1313by anthony-maio
0.8637PR #1321by anthony-maio
0.7406PR #1337by sergimichi
1.2079PR #1359by LucasErcolano
0.4188PR #1361by jorge-asenjo
1.1220PR #1366by yunoshev
1.1371PR #1368by JKSNS
0.8503PR #1378by Rajat123456789
1.1711PR #1379by LucasErcolano
0.4162PR #1411by Blakethefn
1.5568PR #1417by BruhTheMomentum
1.3039PR #1422by swapp1990
1.1172PR #1431by Idan3011
1.1266PR #1436by DevWizard-Vandan
1.5546PR #1444by hypnoastic
1.3081PR #1446by LauraGomezjurado
1.0960PR #1453by iverbovoy
1.1324PR #1463by tsubasagit
1.2774PR #1473by AVINASH0052
1.1156PR #1479by andrewbaggio1
1.1450PR #1488by ndokutovich
0.8265PR #1492by bigbag
1.0810PR #1505by Rohan-Abhilash
1.1791PR #1508by jpfeiffe
1.1135PR #1527by alphastar1111
1.2026PR #1536by dexhunter
1.0775PR #1556by sidhanth97
1.4352PR #1557by ndokutovich
1.0773PR #1559by adityasasidhar
1.2498PR #1561by EthanYangTW
1.0783PR #1568by yuitokyouni
1.1639PR #1577by redefine-qbit
1.4016PR #1581by aiejvn
1.2321PR #1617by adityasasidhar
1.2192PR #1626by dexhunter
1.0719PR #1630by KevinChunye
1.1412PR #1647by powerpratik
1.0616PR #1649by joyceyan
1.1271PR #1666by mrbese
1.1531PR #1715by G3sparky
1.0809PR #1722by deborahnelson8788726
0.6580PR #1751by Pravin-dev06
1.3565

Hyperparameters Across PRs

| pr_number | parameters |
|---|---|
| 63 | {"hidden_size":1344,"multiplier":2.625} |
| 65 | {"multiplier":3,"hidden_dim":1536} |
| 66 | {"hidden":1536,"multiplier":3} |
| 69 | {"layers":9,"hidden_dim":1536,"vocab_size":1024,"dim":512,"gqa_heads":8,"kv_heads":4} |
| 70 | {"mlp_mult":3,"hidden_size":1536} |
| 76 | {"multiplier":3} |
| 81 | {"mlp_mult":3} |
| 86 | {"mlp_mult":3,"hidden":1536} |
| 88 | {"MLP_HIDDEN":1536} |
| 89 | {"hidden_dim":1536} |
| 99 | {"mlp_mult":3,"num_layers":9,"model_dim":512,"num_heads":8,"num_kv_heads":4,"tie_embeddings":1} |
| 102 | {"mlp_mult":3,"hidden_dim":1536} |
| 107 | {"hidden_size":1488} |
| 108 | {"layers":11,"hidden_dim":1536} |
| 110 | {"layers":9,"ffn_schedule":[768,960,1152,1344,1536,1728,1920,2112,2304]} |
| 114 | {"mlp_hidden":1536,"default_mlp_hidden":1024} |
| 116 | {"hidden_size":1536,"mlp_mult":3} |
| 117 | {"multiplier":3} |
| 120 | {"hidden_dim":1536,"layers":9,"model_dim":512,"num_heads":8,"num_kv_heads":4} |
| 122 | {"hidden_dim":1536,"multiplier":3} |
| 123 | {"multiplier":3,"hidden_size":1536} |
| 128 | {"mlp_mult":3,"hidden":1536} |
| 135 | {"hidden_dimension":1536} |
| 137 | {"hidden":1536,"mlp_mult":3} |
| 139 | {"mlp_multiplier":3,"hidden_dim":2304} |
| 150 | {"hidden":1536} |
| 156 | {"dimensions":1536} |
| 160 | {"mlp_mult":3} |
| 162 | {"hidden":1536} |
| 164 | {"hidden_size":1536} |
| 172 | {"mlp_mult":3,"model_dim":512,"layers":9,"heads":8,"kv_heads":4} |
| 173 | {"hidden_size":1536,"base_hidden_size":1024} |
| 175 | {"expansion":3,"hidden_dim":1536,"negative_slope":0.5} |
| 178 | {"hidden":1536} |
| 180 | {"hidden":1536} |
| 181 | {"mlp_mult":3} |
| 187 | {"multiplier":3} |
| 190 | {"mlp_hidden":1344} |
| 191 | {"mlp_mult":3} |
| 192 | {"multiplier":3} |
| 194 | {"multiplier":3} |
| 196 | {"mlp_mult":3} |
| 198 | {"hidden_size":1536} |
| 201 | {"hidden":1536} |
| 206 | {"hidden_size":1536} |
| 207 | — |
| 208 | {"layers":11} |
| 212 | {"hidden":1536} |
| 215 | {"mlp_mult":3} |
| 218 | {"layers":3} |
| 219 | {"hidden":1536} |
| 222 | {"mlp_mult":3,"hidden_size":1536,"num_layers":10,"model_dim":512,"num_heads":8,"num_kv_heads":4} |
| 223 | {"hidden":1536} |
| 225 | {"layers":11,"dimensions":512,"mlp_hidden":1024} |
| 230 | {"hidden_dim":1536,"multiplier":3} |
| 232 | {"layers":11,"hidden_dim":1536} |
| 236 | {"hidden":1536} |
| 238 | {"multiplier":3} |
| 243 | {"hidden":1536} |
| 246 | {"hidden":1536} |
| 249 | {"hidden":1536} |
| 251 | {"mlp_mult":3} |
| 254 | {"layers":11,"hidden_dim":1536,"heads":8,"kv_heads":4} |
| 264 | {"multiplier":3,"hidden_size":1536} |
| 265 | {"hidden_dim":1536} |
| 267 | {"hidden_size":1536} |
| 273 | {"multiplier":3} |
| 274 | {"hidden_size":1536} |
| 278 | — |
| 281 | {"layers":11,"model_dim":512,"heads":8,"kv_heads":4,"mlp_hidden":1536} |
| 287 | {"hidden_size":1536} |
| 289 | {"hidden":1536,"multiplier":3} |
| 294 | — |
| 295 | {"hidden_size":1536} |
| 296 | {"hidden":1536} |
| 297 | {"hidden":1536,"model_dim":512,"layers":9} |
| 302 | {"layers":3} |
| 303 | — |
| 304 | {"hidden_size":1536} |
| 305 | {"hidden":1536} |
| 306 | {"expansion":3,"hidden_dim":1536} |
| 307 | {"multiplier":3} |
| 312 | {"mlp_mult":3} |
| 316 | — |
| 317 | — |
| 324 | {"layers":10,"hidden":1536,"mlp_expansion":3} |
| 326 | {"layers":10,"hidden":1536} |
| 327 | {"expansion":3} |
| 330 | {"hidden_size":1536} |
| 331 | {"layers":10,"hidden":1536} |
| 333 | {"multiplier":2.75,"hidden_size":1408} |
| 334 | — |
| 338 | {"expansion":3} |
| 339 | — |
| 351 | {"multiplier":3} |
| 352 | {"mlp_multiplier":3} |
| 359 | {"layers":11,"width":512} |
| 361 | {"expansion_ratio":3} |
| 362 | {"mlp_multiplier":3,"hidden_dim":1536} |
| 364 | {"layers":10} |
| 366 | {"hidden_dim":1536} |
| 368 | {"layers":10,"dim":512,"mlp_multiplier":3} |
| 369 | {"expansion":3} |
| 370 | {"hidden":1536} |
| 374 | {"multiplier":3} |
| 375 | {"multiplier":3} |
| 376 | {"hidden":1536} |
| 379 | {"expansion":3} |
| 383 | {"expansion":3} |
| 385 | {"mlp_mult":3,"hidden_size":1536} |
| 388 | — |
| 389 | — |
| 390 | {"expansion":3} |
| 398 | {"hidden":1536} |
| 400 | {"multiplier":3} |
| 401 | {"multiplier":3} |
| 403 | — |
| 406 | — |
| 410 | {"layers":3} |
| 414 | {"expansion":3} |
| 415 | — |
| 418 | {"expansion":3} |
| 422 | {"hidden":1536} |
| 424 | — |
| 433 | {"expansion":3} |
| 442 | {"hidden":1536} |
| 443 | {"multiplier":3} |
| 444 | {"mlp_mult":3} |
| 445 | — |
| 446 | {"layers":7,"width":512,"hidden":1536,"attention_heads":8,"kv_heads":4} |
| 447 | {"multiplier":3,"hidden_dim":1536} |
| 448 | {"mlp_mult":3} |
| 450 | {"hidden":1536} |
| 453 | {"hidden_size":1536} |
| 454 | {"hidden_dim":1536} |
| 455 | {"expansion":3} |
| 456 | {"expansion":3} |
| 458 | {"layers":3} |
| 461 | {"expansion":3} |
| 465 | {"hidden":1536} |
| 466 | {"multiplier":3} |
| 467 | {"multiplier":3} |
| 469 | {"hidden":1728} |
| 473 | {"multiplier":3} |
| 474 | — |
| 481 | {"multiplier":3} |
| 483 | {"mlp_mult":3} |
| 485 | {"hidden_size":1536} |
| 486 | — |
| 487 | {"multiplier":3} |
| 488 | {"layers":11,"model_dim":512,"num_heads":8,"num_kv_heads":4,"mlp_mult":3} |
| 489 | — |
| 499 | {"mlp_multiplier":4,"hidden_size":2560} |
| 508 | {"expansion":3} |
| 510 | {"expansion_factor":3,"activation":"relu²"} |
| 512 | {"hidden_size":1536} |
| 515 | {"layers":10,"mlp_hidden":1536,"embedding_dim":512,"GQA_heads":"8/4"} |
| 516 | {"multiplier":3} |
| 518 | — |
| 525 | — |
| 528 | — |
| 529 | — |
| 533 | {"expansion":3} |
| 546 | {"hidden":1536,"baseline_hidden":1024} |
| 547 | {"mlp_hidden_units":1536,"expansion_factor":3} |
| 548 | {"layers":10,"d_model":512,"heads":8,"kv_heads":4} |
| 549 | {"layers":3} |
| 550 | {"expansion_factor":3} |
| 555 | — |
| 560 | {"layers":10} |
| 563 | {"expansion_factor":3,"hidden_dim":1536,"activation":"ReLU²"} |
| 564 | {"expansion_factor":3,"activation":"relu-squared"} |
| 568 | {"hidden_size":1536} |
| 573 | {"hidden_dim":1536} |
| 580 | {"MLP_MULT":3} |
| 583 | {"multiplier":3,"hidden_dim":1536} |
| 585 | {"multiplier":3.5,"hidden_dim":1792} |
| 586 | {"expansion":3} |
| 589 | {"layers":3} |
| 593 | {"layers":3} |
| 595 | {"layers":10,"d_model":512,"heads":8,"kv_heads":4,"mlp_multiplier":3} |
| 596 | {"expansion":3} |
| 598 | {"hidden":1536,"activation":"ReLU²"} |
| 605 | — |
| 612 | {"hidden_dim":1536} |
| 614 | — |
| 618 | {"mlp_mult":3} |
| 619 | {"MLP_MULT":3,"NUM_LAYERS":10} |
| 622 | {"hidden":1536} |
| 633 | {"hidden_dim":1536} |
| 642 | — |
| 644 | {"expansion_factor":3,"hidden_dim":1536,"activation":"ReLU²"} |
| 645 | — |
| 659 | — |
| 666 | {"hidden":2304} |
| 671 | {"hidden_units":1536} |
| 678 | {"mlp_mult":3,"hidden":1536} |
| 681 | {"activation":"LeakyReLU(0.5)^2"} |
| 682 | {"mlp_mult":3} |
| 684 | — |
| 685 | — |
| 688 | {"activation":"LeakyReLU(0.5)^2"} |
| 690 | {"layers":3} |
| 692 | {"layers":3} |
| 693 | {"layers":3,"activation":"LeakyReLU(0.5)^2"} |
| 698 | {"multiplier":3} |
| 700 | {"multiplier":3.5} |
| 705 | {"hidden_multiplier":3,"hidden_dim":1536} |
| 709 | {"hidden_size":1536} |
| 710 | {"expansion":3} |
| 713 | {"layers":10,"dim":512} |
| 714 | — |
| 715 | {"multiplier":3} |
| 728 | {"layers":11} |
| 730 | {"mlp_mult":4} |
| 734 | {"multiplier":3} |
| 736 | {"layers":9,"mlp_multiplier":2} |
| 738 | {"layers":11,"dimensions":512} |
| 741 | — |
| 744 | — |
| 752 | {"activation":"LeakyReLU(0.5)^2"} |
| 754 | {"hidden_dim":1536} |
| 755 | {"mlp_mult":3,"hidden":1152} |
| 757 | — |
| 760 | {"multiplier":3} |
| 761 | {"multiplier":3} |
| 764 | {"layers":11,"dimensions":512,"gqa":"8/4"} |
| 768 | — |
| 770 | — |
| 771 | {"expansion":3} |
| 774 | {"hidden_size_multiplier":3} |
| 777 | {"activation":"LeakyReLU(0.5)^2"} |
| 778 | {"multiplier":3} |
| 779 | — |
| 786 | {"layers":3} |
| 791 | — |
| 794 | {"layers":3} |
| 795 | — |
| 796 | {"mlp_multiplier":3} |
| 797 | — |
| 798 | {"mlp_blocks":3} |
| 799 | {"mlp_mult":3} |
| 805 | {"mlp_multiplier":3} |
| 806 | — |
| 807 | — |
| 808 | — |
| 809 | {"multiplier":3} |
| 810 | {"hidden_size":1536} |
| 811 | — |
| 827 | {"multiplier":2} |
| 828 | {"layers":10,"d_model":512} |
| 836 | {"expansion":3} |
| 838 | {"hidden_dim":1536} |
| 840 | {"multiplier":3} |
| 841 | {"mlp_layers":3} |
| 843 | {"multiplier":3} |
| 849 | {"hidden":1792} |
| 850 | {"expansion":3,"hidden":1536} |
| 851 | {"multiplier":3.5} |
| 854 | {"layers":9,"model_dim":512,"heads":8,"kv_heads":4,"ffn_hidden_dim":1536} |
| 874 | {"mlp_multiplier":3} |
| 880 | {"multiplier":3.5} |
| 891 | — |
| 892 | — |
| 896 | {"multiplier":3} |
| 900 | — |
| 907 | {"mlp_dim":1536} |
| 915 | {"layers":3} |
| 924 | — |
| 925 | {"multiplier":3} |
| 926 | {"expansion":3} |
| 929 | {"layers":9,"hidden_size":1536,"mlp_mult":2} |
| 933 | {"hidden_size":768} |
| 940 | — |
| 941 | {"multiplier":3.5} |
| 945 | {"multiplier":3.5} |
| 947 | {"mlp_mult":3.2} |
| 953 | {"expansion":3.5} |
| 967 | {"expansion":3.5} |
| 972 | — |
| 978 | — |
| 979 | {"expansion":3} |
| 982 | {"hidden_multiplier":3} |
| 988 | {"multiplier":2.5} |
| 989 | {"multiplier":3,"hidden_dim":1536} |
| 993 | {"multiplier":3,"activation":"ReLU²"} |
| 995 | {"multiplier":3.5} |
| 996 | — |
| 997 | {"hidden":1536} |
| 999 | — |
| 1002 | — |
| 1005 | — |
| 1007 | {"expansion":3} |
| 1009 | {"layers":3} |
| 1025 | {"multiplier":3} |
| 1026 | — |
| 1048 | {"hidden":1536} |
| 1051 | {"negative_slope":0.5} |
| 1059 | {"mlp_mult":3} |
| 1062 | — |
| 1068 | {"mlp_mult":3} |
| 1071 | {"hidden_dim":1536} |
| 1080 | {"multiplier":3} |
| 1085 | {"expansion_factor":3} |
| 1086 | {"expansion":3} |
| 1088 | {"layers":3} |
| 1095 | {"multiplier":3} |
| 1105 | {"multiplier":3} |
| 1106 | — |
| 1108 | {"expansion":3} |
| 1112 | {"expansion":3} |
| 1125 | {"multiplier":3} |
| 1127 | {"hidden_size":1536} |
| 1130 | — |
| 1139 | {"hidden_dim":1920} |
| 1141 | {"mlp_multiplier":3} |
| 1142 | {"multiplier":3} |
| 1144 | {"multiplier":3} |
| 1148 | — |
| 1167 | {"mlp_mult":3} |
| 1170 | {"multiplier":3} |
| 1174 | {"num_layers":5,"model_dim":512,"mlp_mult":4,"num_heads":8,"num_kv_heads":4,"bigram_vocab_size":4096,"bigram_dim":1024} |
| 1183 | — |
| 1185 | — |
| 1186 | {"multiplier":3.25} |
| 1190 | {"layers":10,"mlp_expansion":3} |
| 1202 | {"multiplier":3} |
| 1209 | — |
| 1216 | {"multiplier":3,"activation":"LeakyReLU"} |
| 1218 | {"mlp_mult":4} |
| 1226 | {"hidden":1536} |
| 1227 | {"multiplier":2} |
| 1228 | — |
| 1240 | — |
| 1254 | {"expansion":3} |
| 1256 | — |
| 1276 | {"activation":"LeakyReLU","activation_slope":0.5} |
| 1280 | — |
| 1298 | {"activation":"LeakyReLU²"} |
| 1312 | {"multiplier":3} |
| 1313 | {"layers":3} |
| 1321 | {"layers":3} |
| 1337 | {"multiplier":3} |
| 1359 | {"multiplier":3} |
| 1361 | {"activation":"LeakyReLU²"} |
| 1366 | — |
| 1368 | {"hidden":1536} |
| 1378 | {"layers":11,"mlp_multiplier":3,"hidden_size":1536} |
| 1379 | {"multiplier":3} |
| 1411 | {"mlp_layers":3,"activation":"ReLU²"} |
| 1417 | {"expansion":3} |
| 1422 | {"multiplier":3} |
| 1431 | {"multiplier":3.5,"hidden_dim":1792} |
| 1436 | — |
| 1444 | — |
| 1446 | {"width":1536} |
| 1453 | {"multiplier":3,"hidden_dim":2640} |
| 1463 | {"hidden":1536} |
| 1473 | {"width_multiplier":3} |
| 1479 | {"activation":"LeakyReLU","multiplier":3} |
| 1488 | — |
| 1492 | {"multiplier":4} |
| 1505 | {"mlp_multiplier":3,"hidden_dim":1536} |
| 1508 | {"mlp_multiplier":3,"activation":"LeakyReLU"} |
| 1527 | {"multiplier":3} |
| 1536 | {"expansion":4} |
| 1556 | {"multiplier":3} |
| 1557 | {"layers":11,"dimensions":512,"heads":8,"kv_heads":4,"mlp_multiplier":4} |
| 1559 | {"multiplier":3} |
| 1561 | {"hidden_dim":2048} |
| 1568 | {"expansion":3} |
| 1577 | {"mlp_mult":1.9} |
| 1581 | {"mlp_mult":3} |
| 1617 | {"multiplier":3} |
| 1626 | {"hidden_multiplier":4} |
| 1630 | {"multiplier":3} |
| 1647 | {"multiplier":4} |
| 1649 | — |
| 1666 | {"multiplier":3} |
| 1715 | {"multiplier":4} |
| 1722 | {"hidden_multiplier":3} |
| 1751 | — |