diff --git a/lpm_kernel/L2/dpo/dpo_train.py b/lpm_kernel/L2/dpo/dpo_train.py index fb5d0bc1..1d9d3dfd 100644 --- a/lpm_kernel/L2/dpo/dpo_train.py +++ b/lpm_kernel/L2/dpo/dpo_train.py @@ -44,11 +44,11 @@ def training_data_processor(args, SYS = "You are a helpful assistant.\n\n"): def train(args): tokenizer = AutoTokenizer.from_pretrained(args.base_model_path, padding_side="left") model = AutoModelForCausalLM.from_pretrained( - args.base_model_path, - trust_remote_code=True, - ignore_mismatched_sizes=True, - torch_dtype=torch.float32, # CPU doesn't support bfloat16 -) + args.base_model_path, + trust_remote_code=True, + ignore_mismatched_sizes=True, + torch_dtype="auto", # Use auto detection instead of hardcoding float32 + ) time_str = get_east_eight_time_formatted() # merged_model = model.merge_and_unload() diff --git a/lpm_kernel/L2/memory_manager.py b/lpm_kernel/L2/memory_manager.py index b4c8950a..01e2ee0d 100644 --- a/lpm_kernel/L2/memory_manager.py +++ b/lpm_kernel/L2/memory_manager.py @@ -74,14 +74,12 @@ def get_optimal_training_config(self) -> Dict[str, Any]: "gradient_accumulation_steps": 1, } - # Enable mixed precision based on hardware support + # Let PyTorch automatically decide the best dtype if CUDA is available if self.cuda_available: - capability = torch.cuda.get_device_capability() - if capability[0] >= 8: # Ampere or newer (supports BF16) - config["bf16"] = True - elif capability[0] >= 7: # Volta or newer (supports FP16) - config["fp16"] = True - + # Instead of manually checking capabilities, use "auto" dtype + # PyTorch will automatically select the best precision for the hardware + config["dtype"] = "auto" + # Adjust accumulation steps based on available memory vram_gb = self.get_memory_info().get("vram_total_gb", 0) if vram_gb < 8: # Small GPUs diff --git a/lpm_kernel/L2/merge_lora_weights.py b/lpm_kernel/L2/merge_lora_weights.py index 14e892f4..b38bd677 100644 --- a/lpm_kernel/L2/merge_lora_weights.py +++ b/lpm_kernel/L2/merge_lora_weights.py @@ -56,16 +56,15 @@ def merge_lora_weights(base_model_path, lora_adapter_path, output_model_path): # Clean up memory before starting memory_manager.cleanup_memory(force=True) - # Explicitly set device configuration based on available hardware + # Use auto dtype selection instead of manually choosing based on hardware device_map = "auto" if use_cuda else None - dtype = torch.float16 if use_cuda else torch.float32 - logger.info(f"Loading base model from {base_model_path} with device_map={device_map}, dtype={dtype}") + logger.info(f"Loading base model from {base_model_path} with device_map={device_map}, using auto dtype") - # Use explicit configuration for GPU utilization + # Use auto dtype configuration for optimal hardware utilization base_model = AutoModelForCausalLM.from_pretrained( base_model_path, - torch_dtype=dtype, + torch_dtype="auto", device_map=device_map ) diff --git a/lpm_kernel/L2/train.py b/lpm_kernel/L2/train.py index 54a00ee6..8aa472dc 100644 --- a/lpm_kernel/L2/train.py +++ b/lpm_kernel/L2/train.py @@ -242,8 +242,20 @@ def main(model_args, data_args, training_args): # Configure quantization if requested if model_args.use_4bit_quantization: from transformers import BitsAndBytesConfig - compute_dtype = getattr(torch, model_args.bnb_4bit_compute_dtype) - quant_storage_dtype = getattr(torch, model_args.bnb_4bit_quant_storage_dtype) + + # Handle "auto" dtype appropriately + if model_args.bnb_4bit_compute_dtype == "auto": + # Let BitsAndBytesConfig handle the dtype automatically + compute_dtype = "auto" + else: + # Use the specified dtype + compute_dtype = getattr(torch, model_args.bnb_4bit_compute_dtype) + + # Storage dtype follows the same pattern + if model_args.bnb_4bit_quant_storage_dtype == "auto": + quant_storage_dtype = "auto" + else: + quant_storage_dtype = getattr(torch, model_args.bnb_4bit_quant_storage_dtype) model_kwargs["quantization_config"] = BitsAndBytesConfig( load_in_4bit=model_args.use_4bit_quantization, diff --git a/lpm_kernel/L2/train_for_user.sh b/lpm_kernel/L2/train_for_user.sh index 50846923..6d145612 100755 --- a/lpm_kernel/L2/train_for_user.sh +++ b/lpm_kernel/L2/train_for_user.sh @@ -5,7 +5,6 @@ LEARNING_RATE="2e-4" NUM_TRAIN_EPOCHS="3" CONCURRENCY_THREADS="2" DATA_SYNTHESIS_MODE="low" -HALF=False USE_CUDA=False # Default to False, will be overridden by parameter IS_COT=False @@ -71,19 +70,11 @@ if [ "$CONCURRENCY_THREADS" != "1" ]; then echo "Set thread environment variables to $CONCURRENCY_THREADS" fi -# Add BF16 option based on the platform and CUDA availability -if [ "$PLATFORM" != "apple" ] && [ "$USE_CUDA" == "True" ]; then - HALF=True - echo "Enabling BF16 half precision for non-Apple platform with CUDA" -else - echo "Using standard precision (not using BF16)" -fi - # Print environment for debugging echo "Environment configuration:" echo " CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}" echo " PYTORCH_CUDA_ALLOC_CONF: ${PYTORCH_CUDA_ALLOC_CONF}" -echo " Using half precision: ${HALF}" +echo " Using automatic mixed precision" # Execute training script with parameters from environment variables python lpm_kernel/L2/train.py \ @@ -103,7 +94,6 @@ python lpm_kernel/L2/train.py \ --save_strategy "steps" \ --save_steps 5 \ --push_to_hub False \ - --bf16 $HALF \ --packing False \ --learning_rate $LEARNING_RATE \ --lr_scheduler_type "cosine" \ @@ -121,7 +111,8 @@ python lpm_kernel/L2/train.py \ --lora_target_modules "all-linear" \ --use_4bit_quantization False \ --use_nested_quant False \ - --bnb_4bit_compute_dtype "bfloat16" \ + --bnb_4bit_compute_dtype "auto" \ + --bnb_4bit_quant_storage_dtype "auto" \ --is_cot $IS_COT \ --use_cuda $USE_CUDA diff --git a/lpm_kernel/L2/utils.py b/lpm_kernel/L2/utils.py index 000debc9..5e3f5c37 100644 --- a/lpm_kernel/L2/utils.py +++ b/lpm_kernel/L2/utils.py @@ -310,6 +310,9 @@ def create_and_prepare_model(args, data_args, training_args, model_kwargs=None): if cuda_available and use_cuda_requested: device = "cuda" model_kwargs["device_map"] = "auto" + # Use auto dtype instead of hardcoded dtype + if "torch_dtype" not in model_kwargs: + model_kwargs["torch_dtype"] = "auto" else: if use_cuda_requested and not cuda_available: logger.warning("⚠️ CUDA was requested but is not available on this system. Falling back to CPU.") @@ -326,8 +329,19 @@ def create_and_prepare_model(args, data_args, training_args, model_kwargs=None): # Use model_kwargs quantization_config if provided, otherwise build it if "quantization_config" not in model_kwargs: if args.use_4bit_quantization: - compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype) - quant_storage_dtype = getattr(torch, args.bnb_4bit_quant_storage_dtype) + # Handle "auto" dtype appropriately + if args.bnb_4bit_compute_dtype == "auto": + # Let BitsAndBytesConfig handle the dtype automatically + compute_dtype = "auto" + else: + # Use the specified dtype + compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype) + + # Storage dtype follows the same pattern + if args.bnb_4bit_quant_storage_dtype == "auto": + quant_storage_dtype = "auto" + else: + quant_storage_dtype = getattr(torch, args.bnb_4bit_quant_storage_dtype) bnb_config = BitsAndBytesConfig( load_in_4bit=args.use_4bit_quantization, @@ -337,11 +351,6 @@ def create_and_prepare_model(args, data_args, training_args, model_kwargs=None): bnb_4bit_quant_storage=quant_storage_dtype, ) model_kwargs["quantization_config"] = bnb_config - - if compute_dtype == torch.float16 and args.use_4bit_quantization: - major, _ = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0) - if major >= 8: - logger.info("Your GPU supports bfloat16, you can accelerate training with the argument --bf16") elif args.use_8bit_quantization: bnb_config = BitsAndBytesConfig(load_in_8bit=args.use_8bit_quantization) model_kwargs["quantization_config"] = bnb_config @@ -358,7 +367,7 @@ def create_and_prepare_model(args, data_args, training_args, model_kwargs=None): unsloth_kwargs = { "model_name": args.model_name_or_path, "max_seq_length": data_args.max_seq_length, - "dtype": None, + "dtype": "auto", # Use auto dtype for automatic precision selection "load_in_4bit": args.use_4bit_quantization, "load_in_8bit": args.use_8bit_quantization, "trust_remote_code": True, @@ -383,6 +392,10 @@ def create_and_prepare_model(args, data_args, training_args, model_kwargs=None): # Set default device_map if not specified if "device_map" not in load_kwargs and args.use_cuda and torch.cuda.is_available(): load_kwargs["device_map"] = "auto" + + # Ensure automatic dtype selection + if "torch_dtype" not in load_kwargs and args.use_cuda and torch.cuda.is_available(): + load_kwargs["torch_dtype"] = "auto" logger.info(f"Loading model with parameters: {load_kwargs}") model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, **load_kwargs) @@ -396,17 +409,17 @@ def create_and_prepare_model(args, data_args, training_args, model_kwargs=None): memory_manager.cleanup_memory(force=True) try: - # Try with simpler configuration - float16 instead of bfloat16 - logger.info("Attempting to load with float16 precision...") + # Try with simpler configuration - use auto dtype instead of float16 + logger.info("Attempting to load with auto precision...") model = AutoModelForCausalLM.from_pretrained( args.model_name_or_path, device_map="auto" if torch.cuda.is_available() and args.use_cuda else None, - torch_dtype=torch.float16 if torch.cuda.is_available() and args.use_cuda else None, + torch_dtype="auto" if torch.cuda.is_available() and args.use_cuda else None, trust_remote_code=True ) except (RuntimeError, torch.cuda.OutOfMemoryError, MemoryError) as e: # If that fails too, try even more conservative loading - logger.warning(f"Float16 loading failed: {str(e)}") + logger.warning(f"Auto dtype loading failed: {str(e)}") memory_manager.cleanup_memory(force=True) try: @@ -417,7 +430,7 @@ def create_and_prepare_model(args, data_args, training_args, model_kwargs=None): device_map="auto", offload_folder="offload_folder", offload_state_dict=True, - torch_dtype=torch.float16 if torch.cuda.is_available() else None, + torch_dtype="auto" if torch.cuda.is_available() else None, trust_remote_code=True, low_cpu_mem_usage=True )