← Back to Architecture

Hybrid

Architecture
Used in
12 PRs
Best BPB
0.5755
Avg BPB
1.3649

Hyperparameters Across PRs

| pr_number | parameters |
| --- | --- |
| 992 | — |
| 998 | — |
| 1007 | `{"layers":11,"attention_layers":3,"token_shift_layers":8}` |
| 1044 | `{"layers":9,"d_model":512,"heads":8,"kv_heads":4,"chunk_ratio":0.25}` |
| 1061 | `{"oscillators":192,"layers":12,"heads":16}` |
| 1142 | `{"lfb_layers":6,"lfb_dim":80,"lfb_bigram_vocab_size":2048}` |
| 1198 | `{"diffusion_num_steps":8,"diffusion_block_min":24,"diffusion_block_max":128,"diffusion_min_mask_frac":0.1,"diffusion_max_mask_frac":0.6,"diffusion_block_start_min_frac":0.25,"diffusion_block_start_max_frac":0.9,"diffusion_time_scale":0.05,"diffusion_refine_last_n":5,"diffusion_batch_shared_block":1}` |
| 1346 | `{"layers":15,"mlp_only":true,"tail_block":"blocks.14.mlp"}` |
| 1491 | `{"layers":4,"model_dim":256,"num_heads":4,"num_kv_heads":4,"num_streams":4,"num_fracs":1}` |
| 1644 | `{"layers":7,"ssm_blocks":5,"attention_layers":2,"dim":512,"d_state":64,"expand":2,"headdim":64,"chunk_size":64,"mlp_mult":3}` |
| 1665 | `{"layers":8,"mamba_blocks":6,"attention_blocks":2,"attention_positions":[2,5],"dim":512,"d_state":128,"ngroups":1,"expand":2}` |
| 1685 | `{"layers":9,"dimensions":384,"heads":6,"predictor_mlp_layers":2}` |