44# Contact: qubitium@modelcloud.ai, x.com/qubitium
55import torch
66from transformers import AutoConfig , AutoModelForCausalLM , AutoModelForImageTextToText
7+ from transformers .models .qwen2_moe .modeling_qwen2_moe import Qwen2MoeConfig , Qwen2MoeForCausalLM
8+ from transformers .models .qwen3_next .modeling_qwen3_next import Qwen3NextConfig , Qwen3NextForCausalLM
79
810from defuser import convert_model
911from defuser .modeling .replace_modules import materialize_model
1012
1113
14+ def _tiny_moe_config (config_cls ):
15+ return config_cls (
16+ num_hidden_layers = 1 ,
17+ hidden_size = 64 ,
18+ intermediate_size = 128 ,
19+ moe_intermediate_size = 32 ,
20+ num_attention_heads = 4 ,
21+ num_key_value_heads = 4 ,
22+ num_experts = 4 ,
23+ num_experts_per_tok = 2 ,
24+ vocab_size = 128 ,
25+ )
26+
27+
28+ def _assert_unfused_expert_module (experts ):
29+ assert hasattr (experts , "0" )
30+ expert0 = getattr (experts , "0" )
31+ assert hasattr (expert0 , "gate_proj" )
32+ assert hasattr (expert0 , "up_proj" )
33+ assert hasattr (expert0 , "down_proj" )
34+
35+
def test_qwen2_moe():
    """Converting a tiny Qwen2-MoE model should unfuse layer-0's experts."""
    config = _tiny_moe_config(Qwen2MoeConfig)
    model = Qwen2MoeForCausalLM(config)
    assert model.config.model_type == "qwen2_moe"

    # convert_model returns a truthy value on success.
    assert convert_model(model, max_layers=1)

    first_layer = model.model.layers[0]
    _assert_unfused_expert_module(first_layer.mlp.experts)
44+
45+
1246def test_qwen3_moe ():
1347 model_id = "Qwen/Qwen3-30B-A3B"
1448 config = AutoConfig .from_pretrained (model_id )
@@ -24,12 +58,17 @@ def test_qwen3_moe():
2458 converted = convert_model (model , max_layers = 1 )
2559 assert converted
2660
27- experts = model .model .layers [0 ].mlp .experts
28- assert hasattr (experts , "0" )
29- expert0 = getattr (experts , "0" )
30- assert hasattr (expert0 , "gate_proj" )
31- assert hasattr (expert0 , "up_proj" )
32- assert hasattr (expert0 , "down_proj" )
61+ _assert_unfused_expert_module (model .model .layers [0 ].mlp .experts )
62+
63+
def test_qwen3_next():
    """Converting a tiny Qwen3-Next model should unfuse layer-0's experts."""
    config = _tiny_moe_config(Qwen3NextConfig)
    model = Qwen3NextForCausalLM(config)
    assert model.config.model_type == "qwen3_next"

    # convert_model returns a truthy value on success.
    assert convert_model(model, max_layers=1)

    first_layer = model.model.layers[0]
    _assert_unfused_expert_module(first_layer.mlp.experts)
3372
3473
3574def test_qwen3_5_moe ():
@@ -60,11 +99,8 @@ def test_qwen3_5_moe():
6099 moe_block = model .model .language_model .layers [0 ].mlp
61100 experts = moe_block .experts
62101
63- assert hasattr (experts , "0" )
102+ _assert_unfused_expert_module (experts )
64103 expert0 = getattr (experts , "0" )
65- assert hasattr (expert0 , "gate_proj" )
66- assert hasattr (expert0 , "up_proj" )
67- assert hasattr (expert0 , "down_proj" )
68104
69105 materialize_model (model .model .language_model .layers [0 ])
70106
@@ -102,14 +138,11 @@ def test_mixtral():
102138 moe_block = model .model .layers [0 ].mlp
103139 experts = moe_block .experts
104140
105- assert hasattr (experts , "0" )
141+ _assert_unfused_expert_module (experts )
106142 expert0 = getattr (experts , "0" )
107- assert hasattr (expert0 , "gate_proj" )
108- assert hasattr (expert0 , "up_proj" )
109- assert hasattr (expert0 , "down_proj" )
110143
111144 materialize_model (model .model .layers [0 ])
112145
113146 torch .testing .assert_close (expert0 .gate_proj .weight , expected_gate )
114147 torch .testing .assert_close (expert0 .up_proj .weight , expected_up )
115- torch .testing .assert_close (expert0 .down_proj .weight , expected_down )
148+ torch .testing .assert_close (expert0 .down_proj .weight , expected_down )
0 commit comments