-
Notifications
You must be signed in to change notification settings - Fork 261
Error: Qwen3.5-35B-A3B lora sft with mcore-adapter, error occurs when saving a ckpt. #411
Description
Training script:
```bash
#!/usr/bin/env bash
# Launch script for Qwen3.5-35B-A3B LoRA SFT via mcore-adapter.
# FIX(review): in the original paste the `workdir` assignment was commented
# out, but `$workdir` is dereferenced on the next line — `parent_dir` was
# silently computed from an empty string. The assignment is restored here,
# and expansions are quoted so paths with spaces survive.
workdir=$(cd "$(dirname "$0")" && pwd)
parent_dir=$(dirname "$workdir")   # NOTE(review): currently unused below; kept for parity with the original

WORLD_SIZE=4
TENSOR_MODEL_PARALLEL_SIZE=1
EXPERT_MODEL_PARALLEL_SIZE=4
PIPELINE_MODEL_PARALLEL_SIZE=1
MODEL_NAME="/workspace/mount/Models/Qwen/Qwen3.5-35B-A3B"
export DISABLE_VERSION_CHECK=1
USE_MCA=true

# Megatron-Core (mcore-adapter) parallelism / fusion options.
mca_options="
--tensor_model_parallel_size ${TENSOR_MODEL_PARALLEL_SIZE}
--sequence_parallel
--pipeline_model_parallel_size ${PIPELINE_MODEL_PARALLEL_SIZE}
--expert_model_parallel_size ${EXPERT_MODEL_PARALLEL_SIZE}
--use_distributed_optimizer
--bias_activation_fusion
--apply_rope_fusion
--overlap_param_gather
--overlap_grad_reduce
--moe_grouped_gemm true
--moe_token_dispatcher_type alltoall"

# Options used only on the non-MCA (LLaMA-Factory / DeepSpeed) path.
llama_factory_options="
--deepspeed=./config/ds_zero2.json"

# Common training options (shared by both paths).
options="
--do_train
--stage=sft
--finetuning_type=lora
--lora_rank 64
--dataset_dir=./mcore_adapter/examples/data
--dataset=tulu-if-3w-alc
--preprocessing_num_workers=8
--cutoff_len=2048
--template=qwen3_5
--model_name_or_path=$MODEL_NAME
--output_dir=/workspace/mount/output3
--per_device_train_batch_size=1
--gradient_accumulation_steps=2
--calculate_per_token_loss=True
--num_train_epochs 3
--learning_rate=2e-5
--logging_steps=1
--save_steps=50
--save_strategy steps
--lr_scheduler_type=cosine
--bf16
--recompute_granularity=full
--max_steps=2"

if [ "$USE_MCA" = true ]; then
    options="$options $mca_options --use_mca"
else
    # Without MCA, TP/PP ranks collapse into data parallelism, so shrink the
    # per-node process count accordingly.
    WORLD_SIZE=$((WORLD_SIZE / TENSOR_MODEL_PARALLEL_SIZE / PIPELINE_MODEL_PARALLEL_SIZE))
    options="$options $llama_factory_options --use_mca=False"
fi

torchrun --nproc_per_node="$WORLD_SIZE" ./mcore_adapter/examples/train/run_train.py $options
```
Error Logs
[rank3]: Traceback (most recent call last):
[rank3]: File "/mnt/home/ROLL/./mcore_adapter/examples/train/run_train.py", line 325, in
[rank3]: main()
[rank3]: File "/mnt/home/ROLL/./mcore_adapter/examples/train/run_train.py", line 319, in main
[rank3]: mca_train(training_args, model_args, data_args, finetuning_args)
[rank3]: File "/mnt/home/ROLL/./mcore_adapter/examples/train/run_train.py", line 292, in mca_train
[rank3]: sft_mca_train(training_args, model_args, data_args, finetuning_args)
[rank3]: File "/mnt/home/ROLL/./mcore_adapter/examples/train/run_train.py", line 232, in sft_mca_train
[rank3]: trainer.train(training_args.resume_from_checkpoint)
[rank3]: File "/usr/local/lib/python3.12/site-packages/transformers/trainer.py", line 1412, in train
[rank3]: return inner_training_loop(
[rank3]: ^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/mnt/home/ROLL/mcore_adapter/src/mcore_adapter/trainer/trainer.py", line 889, in _inner_training_loop
[rank3]: self._maybe_log_save_evaluate(
[rank3]: File "/mnt/home/ROLL/mcore_adapter/src/mcore_adapter/trainer/trainer.py", line 1015, in _maybe_log_save_evaluate
[rank3]: self._save_checkpoint(model, trial)
[rank3]: File "/usr/local/lib/python3.12/site-packages/transformers/trainer.py", line 3058, in _save_checkpoint
[rank3]: self._save_optimizer_and_scheduler(output_dir)
[rank3]: File "/mnt/home/ROLL/mcore_adapter/src/mcore_adapter/trainer/trainer.py", line 1067, in _save_optimizer_and_scheduler
[rank3]: model_shared_state_dict = self.model.sharded_state_dict()
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/mnt/home/ROLL/mcore_adapter/src/mcore_adapter/models/model_factory.py", line 193, in sharded_state_dict
[rank3]: state_dict["model"] = self.models[0].sharded_state_dict(prefix, *args, **kwargs)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/usr/local/lib/python3.12/site-packages/megatron/core/models/gpt/gpt_model.py", line 819, in sharded_state_dict
[rank3]: sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/usr/local/lib/python3.12/site-packages/megatron/core/models/common/language_module/language_module.py", line 290, in sharded_state_dict
[rank3]: sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/usr/local/lib/python3.12/site-packages/megatron/core/transformer/module.py", line 99, in sharded_state_dict
[rank3]: sharded_state_dict_default(
[rank3]: File "/usr/local/lib/python3.12/site-packages/megatron/core/transformer/utils.py", line 241, in sharded_state_dict_default
[rank3]: module_sharded_sd = module.sharded_state_dict(
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/usr/local/lib/python3.12/site-packages/megatron/core/transformer/transformer_block.py", line 874, in sharded_state_dict
[rank3]: layer_sharded_state_dict = layer.sharded_state_dict(
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/usr/local/lib/python3.12/site-packages/megatron/core/transformer/transformer_layer.py", line 851, in sharded_state_dict
[rank3]: sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata)
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/usr/local/lib/python3.12/site-packages/megatron/core/transformer/module.py", line 99, in sharded_state_dict
[rank3]: sharded_state_dict_default(
[rank3]: File "/usr/local/lib/python3.12/site-packages/megatron/core/transformer/utils.py", line 241, in sharded_state_dict_default
[rank3]: module_sharded_sd = module.sharded_state_dict(
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: File "/usr/local/lib/python3.12/site-packages/megatron/core/ssm/gated_delta_net.py", line 470, in sharded_state_dict
[rank3]: assert sharded_state_dict[f"{prefix}in_proj.weight"].data.size(0) == in_proj_dim_local_tp, (
[rank3]: ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]: KeyError: 'decoder.layers.0.self_attention.in_proj.weight'