diff --git a/lpm_frontend/src/components/ModelStatus/index.tsx b/lpm_frontend/src/components/ModelStatus/index.tsx
index 9a96d54a..8f730b09 100644
--- a/lpm_frontend/src/components/ModelStatus/index.tsx
+++ b/lpm_frontend/src/components/ModelStatus/index.tsx
@@ -1,14 +1,16 @@
 import { Status, statusRankMap, useTrainingStore } from '@/store/useTrainingStore';
-import { startService, stopService } from '@/service/train';
+import { startService, stopService, checkCudaAvailability } from '@/service/train';
 import { StatusBar } from '../StatusBar';
 import { useRef, useEffect, useState, useMemo } from 'react';
-import { message } from 'antd';
+import { message, Modal, Switch, Tooltip } from 'antd';
 import {
   CloudUploadOutlined,
   CheckCircleOutlined,
   PlayCircleOutlined,
   PauseCircleOutlined,
-  LoadingOutlined
+  LoadingOutlined,
+  ThunderboltOutlined,
+  RocketOutlined
 } from '@ant-design/icons';
 import RegisterUploadModal from '../upload/RegisterUploadModal';
 
@@ -34,6 +36,9 @@ export function ModelStatus() {
   const isTraining = useTrainingStore((state) => state.isTraining);
 
   const [messageApi, contextHolder] = message.useMessage();
+  const [useGpu, setUseGpu] = useState(true);
+  const [cudaAvailable, setCudaAvailable] = useState(false);
+  const [showStartModal, setShowStartModal] = useState(false);
 
   const loadInfo = useLoadInfoStore((state) => state.loadInfo);
   const isRegistered = useMemo(() => {
@@ -43,6 +48,25 @@ export function ModelStatus() {
   const [showRegisterModal, setShowRegisterModal] = useState(false);
   const [showtrainingModal, setShowtrainingModal] = useState(false);
 
+  useEffect(() => {
+    // Probe the backend once on mount. Anything short of a confirmed CUDA
+    // device (a non-zero response code, cuda_available === false, or a failed
+    // request) defaults the selection to CPU.
+    checkCudaAvailability()
+      .then((res) => {
+        const isCudaAvailable = res.data.code === 0 && Boolean(res.data.data.cuda_available);
+        setCudaAvailable(isCudaAvailable);
+        if (!isCudaAvailable) {
+          setUseGpu(false);
+        }
+      })
+      .catch((error) => {
+        console.error('Error checking CUDA availability:', error);
+        setCudaAvailable(false);
+        setUseGpu(false);
+      });
+  }, []);
+
   const handleRegistryClick = () => {
     if (!serviceStarted) {
       messageApi.info({
@@ -137,15 +161,18 @@ export function ModelStatus() {
 
     if (!config.model_name) {
       message.error('Please train a base model first');
-
       return;
     }
 
     setServiceStarting(true);
-    startService({ model_name: config.model_name })
+    startService({ 
+      model_name: config.model_name,
+      use_gpu: useGpu 
+    })
       .then((res) => {
         if (res.data.code === 0) {
-          messageApi.success({ content: 'Service starting...', duration: 1 });
+          const modeText = useGpu ? 'GPU acceleration' : 'CPU-only mode';
+          messageApi.success({ content: `Service starting with ${modeText}...`, duration: 2 });
           startPolling();
         } else {
           setServiceStarting(false);
@@ -190,11 +217,11 @@ export function ModelStatus() {
     } else {
       if (isTraining) {
         setShowtrainingModal(true);
-
         return;
       }
 
-      handleStartService();
+      // Show the start modal with GPU/CPU selection
+      setShowStartModal(true);
     }
   };
 
@@ -260,11 +287,71 @@ export function ModelStatus() {
         </div>
       </div>
 
+      {/* Modal for selecting GPU/CPU mode */}
+      <Modal
+        title="Start Service"
+        open={showStartModal}
+        onOk={() => {
+          setShowStartModal(false);
+          handleStartService();
+        }}
+        onCancel={() => setShowStartModal(false)}
+        okText="Start"
+        cancelText="Cancel"
+      >
+        <div className="py-4">
+          <div className="mb-4">
+            <p>Choose the inference mode for your model:</p>
+          </div>
+          
+          <div className="flex items-center justify-between mb-6 bg-gray-50 p-4 rounded-lg">
+            <div className="flex items-center">
+              {useGpu ? (
+                <ThunderboltOutlined style={{ fontSize: '24px', color: '#1890ff', marginRight: '12px' }} />
+              ) : (
+                <RocketOutlined style={{ fontSize: '24px', color: '#52c41a', marginRight: '12px' }} />
+              )}
+              <div>
+                <div className="font-medium">{useGpu ? 'GPU Acceleration' : 'CPU Mode'}</div>
+                <div className="text-sm text-gray-500">
+                  {useGpu
+                    ? 'Faster inference but requires compatible NVIDIA GPU'
+                    : 'Compatible with all systems, but slower inference'}
+                </div>
+              </div>
+            </div>
+            
+            <Tooltip title={!cudaAvailable ? "CUDA GPU not available on this system" : ""}>
+              <Switch
+                checked={useGpu}
+                onChange={setUseGpu}
+                disabled={!cudaAvailable}
+                checkedChildren="GPU"
+                unCheckedChildren="CPU"
+              />
+            </Tooltip>
+          </div>
+          
+          {!cudaAvailable && (
+            <div className="text-amber-500 text-sm mb-2">
+              <p>
+                No CUDA-compatible GPU detected. Running in CPU-only mode.
+              </p>
+            </div>
+          )}
+          
+          <p className="text-gray-500 text-sm">
+            GPU acceleration requires a compatible NVIDIA graphics card with CUDA support.
+            CPU mode works on all systems but may be slower.
+          </p>
+        </div>
+      </Modal>
+
       <RegisterUploadModal onClose={() => setShowRegisterModal(false)} open={showRegisterModal} />
       <TrainingTipModal
         confirm={() => {
-          handleStartService();
           setShowtrainingModal(false);
+          setShowStartModal(true);
         }}
         onClose={() => setShowtrainingModal(false)}
         open={showtrainingModal}
diff --git a/lpm_kernel/api/domains/trainprocess/routes.py b/lpm_kernel/api/domains/trainprocess/routes.py
index af6425e4..c8a12f2e 100644
--- a/lpm_kernel/api/domains/trainprocess/routes.py
+++ b/lpm_kernel/api/domains/trainprocess/routes.py
@@ -288,7 +288,6 @@ def retrain():
         data_synthesis_mode: Mode for data synthesis (optional)
         use_cuda: Whether to use CUDA for training (optional)
         is_cot: Whether to use Chain of Thought (optional)
-        use_previous_params: Whether to use previous training parameters (optional, default True)
     
     Returns:
         Response: JSON response
@@ -318,7 +317,7 @@ def retrain():
         is_cot = data.get("is_cot", None)
         
         # Log the received parameters
-        logger.info(f"Retrain parameters: model_name={model_name}, learning_rate={learning_rate}, number_of_epochs={number_of_epochs}, concurrency_threads={concurrency_threads}, data_synthesis_mode={data_synthesis_mode}, use_cuda={use_cuda}, is_cot={is_cot}, use_previous_params={use_previous_params}")
+        logger.info(f"Retrain parameters: model_name={model_name}, learning_rate={learning_rate}, number_of_epochs={number_of_epochs}, concurrency_threads={concurrency_threads}, data_synthesis_mode={data_synthesis_mode}, use_cuda={use_cuda}, is_cot={is_cot}")
         
         # Create training service instance
         train_service = TrainProcessService(current_model_name=model_name)
diff --git a/lpm_kernel/api/services/local_llm_service.py b/lpm_kernel/api/services/local_llm_service.py
index 04abd9a5..52e32bcf 100644
--- a/lpm_kernel/api/services/local_llm_service.py
+++ b/lpm_kernel/api/services/local_llm_service.py
@@ -59,7 +59,6 @@ def start_server(self, model_path: str, use_gpu: bool = True) -> bool:
 
             # Check for CUDA availability if GPU was requested
             cuda_available = torch.cuda.is_available() if use_gpu else False
-            cuda_available = False
             gpu_info = ""
             
             if use_gpu and cuda_available:
@@ -81,7 +80,6 @@ def start_server(self, model_path: str, use_gpu: bool = True) -> bool:
                 logger.info("Using CPU for inference (GPU not requested)")
 
             # Check for GPU optimization marker
-            gpu_optimized = False
             model_dir = os.path.dirname(model_path)
             gpu_marker_path = os.path.join(model_dir, "gpu_optimized.json")
             if os.path.exists(gpu_marker_path):
@@ -118,66 +116,41 @@ def start_server(self, model_path: str, use_gpu: bool = True) -> bool:
                 "--cont-batching"         # Enable continuous batching
             ]
             
-            # Set up environment with CUDA variables to ensure GPU detection
             env = os.environ.copy()
+            # Default: do not expose GPU
             env["CUDA_VISIBLE_DEVICES"] = ""
-            
-            # Add GPU-related parameters if CUDA is available
-            if cuda_available and use_gpu:
-                # Force GPU usage with optimal parameters for faster loads
+
+            if use_gpu and cuda_available:
+                # --- GPU/CUDA setup ---
+                # Add GPU-specific llama.cpp arguments
                 cmd.extend([
                     "--n-gpu-layers", "999",  # Use all layers on GPU
                     "--tensor-split", "0",    # Use the first GPU for all operations
                     "--main-gpu", "0",        # Use GPU 0 as the primary device
-                    "--mlock"                 # Lock memory to prevent swapping during inference
+                    "--flash-attn"
                 ])
-                
-                # Set CUDA environment variables to help with GPU detection
-                env["CUDA_VISIBLE_DEVICES"] = "0"  # Force using first GPU
-                
-                # Ensure comprehensive library paths for CUDA
-                cuda_lib_paths = [
-                    "/usr/local/cuda/lib64",
-                    "/usr/lib/cuda/lib64",
-                    "/usr/local/lib",
-                    "/usr/lib/x86_64-linux-gnu",
-                    "/usr/lib/wsl/lib"  # For Windows WSL environments
-                ]
-                
-                # Build a comprehensive LD_LIBRARY_PATH
-                current_ld_path = env.get("LD_LIBRARY_PATH", "")
-                for path in cuda_lib_paths:
-                    if os.path.exists(path) and path not in current_ld_path:
-                        current_ld_path = f"{path}:{current_ld_path}" if current_ld_path else path
-                
-                env["LD_LIBRARY_PATH"] = current_ld_path
-                logger.info(f"Setting LD_LIBRARY_PATH to: {current_ld_path}")
-                
-                # If this is Windows, use different approach for CUDA libraries
-                if os.name == 'nt':
-                    # Windows typically has CUDA in PATH already if installed
-                    logger.info("Windows system detected, using system CUDA libraries")
-                else:
-                    # On Linux, try to find CUDA libraries in common locations
-                    for cuda_path in [
-                        # Common CUDA paths
+                # Set CUDA environment variables
+                env["CUDA_VISIBLE_DEVICES"] = "0"  # Use first GPU
+
+                # Set up LD_LIBRARY_PATH for CUDA (Linux/WSL only)
+                if os.name != 'nt':
+                    cuda_lib_paths = [
                         "/usr/local/cuda/lib64",
                         "/usr/lib/cuda/lib64",
-                        "/usr/local/lib/python3.12/site-packages/nvidia/cuda_runtime/lib",
-                        "/usr/local/lib/python3.10/site-packages/nvidia/cuda_runtime/lib",
-                    ]:
-                        if os.path.exists(cuda_path):
-                            # Add CUDA path to library path
-                            env["LD_LIBRARY_PATH"] = f"{cuda_path}:{env.get('LD_LIBRARY_PATH', '')}"
-                            env["CUDA_HOME"] = os.path.dirname(cuda_path)
-                            logger.info(f"Found CUDA at {cuda_path}, setting environment variables")
-                            break
-
-                # NOTE: CUDA support and rebuild should be handled at build/setup time (e.g., Docker build or setup script).
-                # The runtime check and rebuild logic has been removed for efficiency and reliability.
-                # Ensure llama.cpp is built with CUDA support before running the server if GPU is required.
+                        "/usr/local/lib",
+                        "/usr/lib/x86_64-linux-gnu",
+                        "/usr/lib/wsl/lib"
+                    ]
+                    current_ld_path = env.get("LD_LIBRARY_PATH", "")
+                    for path in cuda_lib_paths:
+                        if os.path.exists(path) and path not in current_ld_path.split(":"):  # match whole entries, not substrings
+                            current_ld_path = f"{path}:{current_ld_path}" if current_ld_path else path
+                    env["LD_LIBRARY_PATH"] = current_ld_path
+                    logger.info(f"Setting LD_LIBRARY_PATH to: {current_ld_path}")
+                else:
+                    logger.info("Windows system detected, using system CUDA libraries")
 
-                # Pre-heat GPU to ensure faster initial response
+                # Pre-heat GPU for faster initial response
                 if torch.cuda.is_available():
                     logger.info("Pre-warming GPU to reduce initial latency...")
                     dummy_tensor = torch.zeros(1, 1).cuda()
@@ -185,10 +158,9 @@ def start_server(self, model_path: str, use_gpu: bool = True) -> bool:
                     torch.cuda.synchronize()
                     torch.cuda.empty_cache()
                     logger.info("GPU warm-up complete")
-                
                 logger.info("Using GPU acceleration for inference with optimized settings")
             else:
-                # If GPU isn't available or supported, optimize for CPU
+                # --- CPU setup ---
                 cmd.extend([
                     "--threads", str(max(1, os.cpu_count() - 1)),  # Use all CPU cores except one
                 ])