Qwen3.5-35B-A3B LoRA SFT with mcore-adapter: error when saving a checkpoint #411

@Unofish

Description


Training script:
```bash
workdir=$(cd $(dirname $0); pwd)

parent_dir=$(dirname "$workdir")

WORLD_SIZE=4
TENSOR_MODEL_PARALLEL_SIZE=1
EXPERT_MODEL_PARALLEL_SIZE=4
PIPELINE_MODEL_PARALLEL_SIZE=1

MODEL_NAME="/workspace/mount/Models/Qwen/Qwen3.5-35B-A3B"

export DISABLE_VERSION_CHECK=1

USE_MCA=true

mca_options="
--tensor_model_parallel_size ${TENSOR_MODEL_PARALLEL_SIZE}
--sequence_parallel
--pipeline_model_parallel_size ${PIPELINE_MODEL_PARALLEL_SIZE}
--expert_model_parallel_size ${EXPERT_MODEL_PARALLEL_SIZE}
--use_distributed_optimizer
--bias_activation_fusion
--apply_rope_fusion
--overlap_param_gather
--overlap_grad_reduce
--moe_grouped_gemm true
--moe_token_dispatcher_type alltoall"

llama_factory_options="
--deepspeed=./config/ds_zero2.json"

options="
--do_train
--stage=sft
--finetuning_type=lora
--lora_rank 64
--dataset_dir=./mcore_adapter/examples/data
--dataset=tulu-if-3w-alc
--preprocessing_num_workers=8
--cutoff_len=2048
--template=qwen3_5
--model_name_or_path=$MODEL_NAME
--output_dir=/workspace/mount/output3
--per_device_train_batch_size=1
--gradient_accumulation_steps=2
--calculate_per_token_loss=True
--num_train_epochs 3
--learning_rate=2e-5
--logging_steps=1
--save_steps=50
--save_strategy steps
--lr_scheduler_type=cosine
--bf16
--recompute_granularity=full
--max_steps=2"

if [ "$USE_MCA" = true ]; then
    options="$options $mca_options --use_mca"
else
    WORLD_SIZE=$(($WORLD_SIZE / $TENSOR_MODEL_PARALLEL_SIZE / $PIPELINE_MODEL_PARALLEL_SIZE))
    options="$options $llama_factory_options --use_mca=False"
fi

torchrun --nproc_per_node=$WORLD_SIZE ./mcore_adapter/examples/train/run_train.py $options
```
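
A quick way to check whether the LoRA wrapping renamed the `in_proj` parameter key (the cause suggested by the `KeyError` in the logs below) is to dump the parameter names around layer 0 before training. This is a hedged diagnostic sketch, not part of the original report: `model` is assumed to be the mcore-adapter model object built in `run_train.py`, and the filter substrings are taken from the key in the traceback.

```python
import torch.nn as nn

def dump_in_proj_keys(model: nn.Module) -> None:
    """Print every parameter name under layer 0 that mentions in_proj,
    so a rename like in_proj.weight -> in_proj.base_layer.weight is visible."""
    for name, _ in model.named_parameters():
        if "layers.0." in name and "in_proj" in name:
            print(name)
```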


Error Logs

```
[rank3]: Traceback (most recent call last):
[rank3]:   File "/mnt/home/ROLL/./mcore_adapter/examples/train/run_train.py", line 325, in <module>
[rank3]:     main()
[rank3]:   File "/mnt/home/ROLL/./mcore_adapter/examples/train/run_train.py", line 319, in main
[rank3]:     mca_train(training_args, model_args, data_args, finetuning_args)
[rank3]:   File "/mnt/home/ROLL/./mcore_adapter/examples/train/run_train.py", line 292, in mca_train
[rank3]:     sft_mca_train(training_args, model_args, data_args, finetuning_args)
[rank3]:   File "/mnt/home/ROLL/./mcore_adapter/examples/train/run_train.py", line 232, in sft_mca_train
[rank3]:     trainer.train(training_args.resume_from_checkpoint)
[rank3]:   File "/usr/local/lib/python3.12/site-packages/transformers/trainer.py", line 1412, in train
[rank3]:     return inner_training_loop(
[rank3]:            ^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/home/ROLL/mcore_adapter/src/mcore_adapter/trainer/trainer.py", line 889, in _inner_training_loop
[rank3]:     self._maybe_log_save_evaluate(
[rank3]:   File "/mnt/home/ROLL/mcore_adapter/src/mcore_adapter/trainer/trainer.py", line 1015, in _maybe_log_save_evaluate
[rank3]:     self._save_checkpoint(model, trial)
[rank3]:   File "/usr/local/lib/python3.12/site-packages/transformers/trainer.py", line 3058, in _save_checkpoint
[rank3]:     self._save_optimizer_and_scheduler(output_dir)
[rank3]:   File "/mnt/home/ROLL/mcore_adapter/src/mcore_adapter/trainer/trainer.py", line 1067, in _save_optimizer_and_scheduler
[rank3]:     model_shared_state_dict = self.model.sharded_state_dict()
[rank3]:                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/home/ROLL/mcore_adapter/src/mcore_adapter/models/model_factory.py", line 193, in sharded_state_dict
[rank3]:     state_dict["model"] = self.models[0].sharded_state_dict(prefix, *args, **kwargs)
[rank3]:                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/usr/local/lib/python3.12/site-packages/megatron/core/models/gpt/gpt_model.py", line 819, in sharded_state_dict
[rank3]:     sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata)
[rank3]:                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/usr/local/lib/python3.12/site-packages/megatron/core/models/common/language_module/language_module.py", line 290, in sharded_state_dict
[rank3]:     sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata)
[rank3]:                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/usr/local/lib/python3.12/site-packages/megatron/core/transformer/module.py", line 99, in sharded_state_dict
[rank3]:     sharded_state_dict_default(
[rank3]:   File "/usr/local/lib/python3.12/site-packages/megatron/core/transformer/utils.py", line 241, in sharded_state_dict_default
[rank3]:     module_sharded_sd = module.sharded_state_dict(
[rank3]:                         ^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/usr/local/lib/python3.12/site-packages/megatron/core/transformer/transformer_block.py", line 874, in sharded_state_dict
[rank3]:     layer_sharded_state_dict = layer.sharded_state_dict(
[rank3]:                                ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/usr/local/lib/python3.12/site-packages/megatron/core/transformer/transformer_layer.py", line 851, in sharded_state_dict
[rank3]:     sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata)
[rank3]:                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/usr/local/lib/python3.12/site-packages/megatron/core/transformer/module.py", line 99, in sharded_state_dict
[rank3]:     sharded_state_dict_default(
[rank3]:   File "/usr/local/lib/python3.12/site-packages/megatron/core/transformer/utils.py", line 241, in sharded_state_dict_default
[rank3]:     module_sharded_sd = module.sharded_state_dict(
[rank3]:                         ^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/usr/local/lib/python3.12/site-packages/megatron/core/ssm/gated_delta_net.py", line 470, in sharded_state_dict
[rank3]:     assert sharded_state_dict[f"{prefix}in_proj.weight"].data.size(0) == in_proj_dim_local_tp, (
[rank3]:            ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: KeyError: 'decoder.layers.0.self_attention.in_proj.weight'
```
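
The assertion in `gated_delta_net.sharded_state_dict` expects the base key `in_proj.weight` to be present, but LoRA wrappers typically move the original weight under a sub-module (peft, for instance, uses `base_layer`), which renames every state-dict key underneath it. Below is a minimal, self-contained sketch of that renaming effect; the `LoRALinear` class is purely illustrative and is not mcore-adapter's actual implementation:

```python
# Illustrative sketch (assumption, not mcore-adapter code): shows how
# wrapping a layer for LoRA changes its state-dict keys, which would
# produce exactly the KeyError in the traceback above.
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Minimal LoRA wrapper in the style of peft: the original layer
    moves under `base_layer`, so its state-dict keys are renamed."""
    def __init__(self, base: nn.Linear, rank: int = 8):
        super().__init__()
        self.base_layer = base
        self.lora_A = nn.Parameter(torch.zeros(rank, base.in_features))
        self.lora_B = nn.Parameter(torch.zeros(base.out_features, rank))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Frozen base output plus the low-rank update.
        return self.base_layer(x) + x @ self.lora_A.t() @ self.lora_B.t()

in_proj = nn.Linear(64, 128)
print(sorted(in_proj.state_dict()))
# -> ['bias', 'weight']

wrapped = LoRALinear(in_proj)
print(sorted(wrapped.state_dict()))
# -> ['base_layer.bias', 'base_layer.weight', 'lora_A', 'lora_B']
# "in_proj.weight" no longer exists under the wrapped module, so a
# sharded_state_dict() that indexes it raises a KeyError.
```

If that is what happens here, checkpointing would need either `sharded_state_dict` to look up the wrapped key or the LoRA modules to be merged/unwrapped before saving.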
