diff --git a/lpm_kernel/L2/dpo/dpo_train.py b/lpm_kernel/L2/dpo/dpo_train.py
index fb5d0bc1..1d9d3dfd 100644
--- a/lpm_kernel/L2/dpo/dpo_train.py
+++ b/lpm_kernel/L2/dpo/dpo_train.py
@@ -44,11 +44,11 @@ def training_data_processor(args, SYS = "You are a helpful assistant.\n\n"):
 def train(args):
     tokenizer = AutoTokenizer.from_pretrained(args.base_model_path, padding_side="left")
     model = AutoModelForCausalLM.from_pretrained(
-    args.base_model_path, 
-    trust_remote_code=True,
-    ignore_mismatched_sizes=True, 
-    torch_dtype=torch.float32,  # CPU doesn't support bfloat16
-)
+        args.base_model_path, 
+        trust_remote_code=True,
+        ignore_mismatched_sizes=True, 
+        torch_dtype="auto",  # Load in the dtype recorded in the checkpoint/config instead of hardcoding float32
+    )
     time_str = get_east_eight_time_formatted()
 
     # merged_model = model.merge_and_unload()
diff --git a/lpm_kernel/L2/memory_manager.py b/lpm_kernel/L2/memory_manager.py
index b4c8950a..01e2ee0d 100644
--- a/lpm_kernel/L2/memory_manager.py
+++ b/lpm_kernel/L2/memory_manager.py
@@ -74,14 +74,12 @@ def get_optimal_training_config(self) -> Dict[str, Any]:
             "gradient_accumulation_steps": 1,
         }
         
-        # Enable mixed precision based on hardware support
+        # Defer dtype selection to model loading when CUDA is available
         if self.cuda_available:
-            capability = torch.cuda.get_device_capability()
-            if capability[0] >= 8:  # Ampere or newer (supports BF16)
-                config["bf16"] = True
-            elif capability[0] >= 7:  # Volta or newer (supports FP16)
-                config["fp16"] = True
-                
+            # Instead of manually checking capabilities, record an "auto" dtype hint.
+            # NOTE(review): "auto" is usually resolved from the checkpoint, not the GPU - confirm the consumer of this key
+            config["dtype"] = "auto"
+            
             # Adjust accumulation steps based on available memory
             vram_gb = self.get_memory_info().get("vram_total_gb", 0)
             if vram_gb < 8:  # Small GPUs
diff --git a/lpm_kernel/L2/merge_lora_weights.py b/lpm_kernel/L2/merge_lora_weights.py
index 14e892f4..b38bd677 100644
--- a/lpm_kernel/L2/merge_lora_weights.py
+++ b/lpm_kernel/L2/merge_lora_weights.py
@@ -56,16 +56,15 @@ def merge_lora_weights(base_model_path, lora_adapter_path, output_model_path):
         # Clean up memory before starting
         memory_manager.cleanup_memory(force=True)
         
-        # Explicitly set device configuration based on available hardware
+        # Use auto dtype selection instead of manually choosing based on hardware
         device_map = "auto" if use_cuda else None
-        dtype = torch.float16 if use_cuda else torch.float32
         
-        logger.info(f"Loading base model from {base_model_path} with device_map={device_map}, dtype={dtype}")
+        logger.info(f"Loading base model from {base_model_path} with device_map={device_map}, using auto dtype")
         
-        # Use explicit configuration for GPU utilization
+        # Use auto dtype configuration for optimal hardware utilization
         base_model = AutoModelForCausalLM.from_pretrained(
             base_model_path,
-            torch_dtype=dtype,
+            torch_dtype="auto",
             device_map=device_map
         )
         
diff --git a/lpm_kernel/L2/train.py b/lpm_kernel/L2/train.py
index 54a00ee6..8aa472dc 100644
--- a/lpm_kernel/L2/train.py
+++ b/lpm_kernel/L2/train.py
@@ -242,8 +242,20 @@ def main(model_args, data_args, training_args):
     # Configure quantization if requested
     if model_args.use_4bit_quantization:
         from transformers import BitsAndBytesConfig
-        compute_dtype = getattr(torch, model_args.bnb_4bit_compute_dtype)
-        quant_storage_dtype = getattr(torch, model_args.bnb_4bit_quant_storage_dtype)
+        
+        # Handle "auto" dtype appropriately
+        if model_args.bnb_4bit_compute_dtype == "auto":
+            # NOTE(review): passes the literal "auto" through - confirm BitsAndBytesConfig accepts it
+            compute_dtype = "auto"
+        else:
+            # Use the specified dtype
+            compute_dtype = getattr(torch, model_args.bnb_4bit_compute_dtype)
+        
+        # Storage dtype follows the same pattern
+        if model_args.bnb_4bit_quant_storage_dtype == "auto":
+            quant_storage_dtype = "auto"
+        else:
+            quant_storage_dtype = getattr(torch, model_args.bnb_4bit_quant_storage_dtype)
         
         model_kwargs["quantization_config"] = BitsAndBytesConfig(
             load_in_4bit=model_args.use_4bit_quantization,
diff --git a/lpm_kernel/L2/train_for_user.sh b/lpm_kernel/L2/train_for_user.sh
index 50846923..6d145612 100755
--- a/lpm_kernel/L2/train_for_user.sh
+++ b/lpm_kernel/L2/train_for_user.sh
@@ -5,7 +5,6 @@ LEARNING_RATE="2e-4"
 NUM_TRAIN_EPOCHS="3"
 CONCURRENCY_THREADS="2"
 DATA_SYNTHESIS_MODE="low"
-HALF=False
 USE_CUDA=False  # Default to False, will be overridden by parameter
 IS_COT=False
 
@@ -71,19 +70,11 @@ if [ "$CONCURRENCY_THREADS" != "1" ]; then
   echo "Set thread environment variables to $CONCURRENCY_THREADS"
 fi
 
-# Add BF16 option based on the platform and CUDA availability
-if [ "$PLATFORM" != "apple" ] && [ "$USE_CUDA" == "True" ]; then
-  HALF=True
-  echo "Enabling BF16 half precision for non-Apple platform with CUDA"
-else
-  echo "Using standard precision (not using BF16)"
-fi
-
 # Print environment for debugging
 echo "Environment configuration:"
 echo "  CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}"
 echo "  PYTORCH_CUDA_ALLOC_CONF: ${PYTORCH_CUDA_ALLOC_CONF}"
-echo "  Using half precision: ${HALF}"
+echo "  Using automatic mixed precision"
 
 # Execute training script with parameters from environment variables
 python lpm_kernel/L2/train.py \
@@ -103,7 +94,6 @@ python lpm_kernel/L2/train.py \
   --save_strategy "steps" \
   --save_steps 5 \
   --push_to_hub False \
-  --bf16 $HALF \
   --packing False \
   --learning_rate $LEARNING_RATE \
   --lr_scheduler_type "cosine" \
@@ -121,7 +111,8 @@ python lpm_kernel/L2/train.py \
   --lora_target_modules "all-linear" \
   --use_4bit_quantization False \
   --use_nested_quant False \
-  --bnb_4bit_compute_dtype "bfloat16" \
+  --bnb_4bit_compute_dtype "auto" \
+  --bnb_4bit_quant_storage_dtype "auto" \
   --is_cot $IS_COT \
   --use_cuda $USE_CUDA
 
diff --git a/lpm_kernel/L2/utils.py b/lpm_kernel/L2/utils.py
index 000debc9..5e3f5c37 100644
--- a/lpm_kernel/L2/utils.py
+++ b/lpm_kernel/L2/utils.py
@@ -310,6 +310,9 @@ def create_and_prepare_model(args, data_args, training_args, model_kwargs=None):
     if cuda_available and use_cuda_requested:
         device = "cuda"
         model_kwargs["device_map"] = "auto"
+        # Use auto dtype instead of hardcoded dtype
+        if "torch_dtype" not in model_kwargs:
+            model_kwargs["torch_dtype"] = "auto"
     else:
         if use_cuda_requested and not cuda_available:
             logger.warning("⚠️ CUDA was requested but is not available on this system. Falling back to CPU.")
@@ -326,8 +329,19 @@ def create_and_prepare_model(args, data_args, training_args, model_kwargs=None):
     # Use model_kwargs quantization_config if provided, otherwise build it
     if "quantization_config" not in model_kwargs:
         if args.use_4bit_quantization:
-            compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)
-            quant_storage_dtype = getattr(torch, args.bnb_4bit_quant_storage_dtype)
+            # Handle "auto" dtype appropriately
+            if args.bnb_4bit_compute_dtype == "auto":
+                # NOTE(review): passes the literal "auto" through - confirm BitsAndBytesConfig accepts it
+                compute_dtype = "auto"
+            else:
+                # Use the specified dtype
+                compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)
+            
+            # Storage dtype follows the same pattern
+            if args.bnb_4bit_quant_storage_dtype == "auto":
+                quant_storage_dtype = "auto"
+            else:
+                quant_storage_dtype = getattr(torch, args.bnb_4bit_quant_storage_dtype)
 
             bnb_config = BitsAndBytesConfig(
                 load_in_4bit=args.use_4bit_quantization,
@@ -337,11 +351,6 @@ def create_and_prepare_model(args, data_args, training_args, model_kwargs=None):
                 bnb_4bit_quant_storage=quant_storage_dtype,
             )
             model_kwargs["quantization_config"] = bnb_config
-
-            if compute_dtype == torch.float16 and args.use_4bit_quantization:
-                major, _ = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)
-                if major >= 8:
-                    logger.info("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
         elif args.use_8bit_quantization:
             bnb_config = BitsAndBytesConfig(load_in_8bit=args.use_8bit_quantization)
             model_kwargs["quantization_config"] = bnb_config
@@ -358,7 +367,7 @@ def create_and_prepare_model(args, data_args, training_args, model_kwargs=None):
             unsloth_kwargs = {
                 "model_name": args.model_name_or_path,
                 "max_seq_length": data_args.max_seq_length,
-                "dtype": None,
+                "dtype": "auto",  # NOTE(review): was None; confirm unsloth accepts the string "auto" here
                 "load_in_4bit": args.use_4bit_quantization,
                 "load_in_8bit": args.use_8bit_quantization,
                 "trust_remote_code": True,
@@ -383,6 +392,10 @@ def create_and_prepare_model(args, data_args, training_args, model_kwargs=None):
             # Set default device_map if not specified
             if "device_map" not in load_kwargs and args.use_cuda and torch.cuda.is_available():
                 load_kwargs["device_map"] = "auto"
+            
+            # Ensure automatic dtype selection
+            if "torch_dtype" not in load_kwargs and args.use_cuda and torch.cuda.is_available():
+                load_kwargs["torch_dtype"] = "auto"
                             
             logger.info(f"Loading model with parameters: {load_kwargs}")
             model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, **load_kwargs)
@@ -396,17 +409,17 @@ def create_and_prepare_model(args, data_args, training_args, model_kwargs=None):
         memory_manager.cleanup_memory(force=True)
         
         try:
-            # Try with simpler configuration - float16 instead of bfloat16
-            logger.info("Attempting to load with float16 precision...")
+            # Try with simpler configuration - use auto dtype instead of float16
+            logger.info("Attempting to load with auto precision...")
             model = AutoModelForCausalLM.from_pretrained(
                 args.model_name_or_path,
                 device_map="auto" if torch.cuda.is_available() and args.use_cuda else None,
-                torch_dtype=torch.float16 if torch.cuda.is_available() and args.use_cuda else None,
+                torch_dtype="auto" if torch.cuda.is_available() and args.use_cuda else None,
                 trust_remote_code=True
             )
         except (RuntimeError, torch.cuda.OutOfMemoryError, MemoryError) as e:
             # If that fails too, try even more conservative loading
-            logger.warning(f"Float16 loading failed: {str(e)}")
+            logger.warning(f"Auto dtype loading failed: {str(e)}")
             memory_manager.cleanup_memory(force=True)
             
             try:
@@ -417,7 +430,7 @@ def create_and_prepare_model(args, data_args, training_args, model_kwargs=None):
                     device_map="auto",
                     offload_folder="offload_folder",
                     offload_state_dict=True,
-                    torch_dtype=torch.float16 if torch.cuda.is_available() else None,
+                    torch_dtype="auto" if torch.cuda.is_available() else None,
                     trust_remote_code=True,
                     low_cpu_mem_usage=True
                 )