diff --git a/lpm_frontend/src/components/ModelStatus/index.tsx b/lpm_frontend/src/components/ModelStatus/index.tsx index 9a96d54a..8f730b09 100644 --- a/lpm_frontend/src/components/ModelStatus/index.tsx +++ b/lpm_frontend/src/components/ModelStatus/index.tsx @@ -1,14 +1,16 @@ import { Status, statusRankMap, useTrainingStore } from '@/store/useTrainingStore'; -import { startService, stopService } from '@/service/train'; +import { startService, stopService, checkCudaAvailability } from '@/service/train'; import { StatusBar } from '../StatusBar'; import { useRef, useEffect, useState, useMemo } from 'react'; -import { message } from 'antd'; +import { message, Modal, Switch, Tooltip } from 'antd'; import { CloudUploadOutlined, CheckCircleOutlined, PlayCircleOutlined, PauseCircleOutlined, - LoadingOutlined + LoadingOutlined, + ThunderboltOutlined, + RocketOutlined } from '@ant-design/icons'; import RegisterUploadModal from '../upload/RegisterUploadModal'; @@ -34,6 +36,9 @@ export function ModelStatus() { const isTraining = useTrainingStore((state) => state.isTraining); const [messageApi, contextHolder] = message.useMessage(); + const [useGpu, setUseGpu] = useState(true); + const [cudaAvailable, setCudaAvailable] = useState(false); + const [showStartModal, setShowStartModal] = useState(false); const loadInfo = useLoadInfoStore((state) => state.loadInfo); const isRegistered = useMemo(() => { @@ -43,6 +48,25 @@ export function ModelStatus() { const [showRegisterModal, setShowRegisterModal] = useState(false); const [showtrainingModal, setShowtrainingModal] = useState(false); + useEffect(() => { + // Check if CUDA is available + checkCudaAvailability().then(res => { + if (res.data.code === 0) { + const isCudaAvailable = res.data.data.cuda_available; + setCudaAvailable(isCudaAvailable); + + // If CUDA is not available, default to CPU + if (!isCudaAvailable && useGpu) { + setUseGpu(false); + } + } + }).catch(error => { + console.error('Error checking CUDA availability:', error); + // Default to CPU if error checking CUDA + setUseGpu(false); + }); + }, []); + const handleRegistryClick = () => { if (!serviceStarted) { messageApi.info({ @@ -137,15 +161,18 @@ export function ModelStatus() { if (!config.model_name) { message.error('Please train a base model first'); - return; } setServiceStarting(true); - startService({ model_name: config.model_name }) + startService({ + model_name: config.model_name, + use_gpu: useGpu + }) .then((res) => { if (res.data.code === 0) { - messageApi.success({ content: 'Service starting...', duration: 1 }); + const modeText = useGpu ? 'GPU acceleration' : 'CPU-only mode'; + messageApi.success({ content: `Service starting with ${modeText}...`, duration: 2 }); startPolling(); } else { setServiceStarting(false); @@ -190,11 +217,11 @@ export function ModelStatus() { } else { if (isTraining) { setShowtrainingModal(true); - return; } - handleStartService(); + // Show the start modal with GPU/CPU selection + setShowStartModal(true); } }; @@ -260,11 +287,71 @@ export function ModelStatus() { + {/* Modal for selecting GPU/CPU mode */} + { + setShowStartModal(false); + handleStartService(); + }} + onCancel={() => setShowStartModal(false)} + okText="Start" + cancelText="Cancel" + > +
+
+

Choose the inference mode for your model:

+
+ +
+
+ {useGpu ? ( + + ) : ( + + )} +
+
{useGpu ? 'GPU Acceleration' : 'CPU Mode'}
+
+ {useGpu + ? 'Faster inference but requires compatible NVIDIA GPU' + : 'Compatible with all systems, but slower inference'} +
+
+
+ + + + +
+ + {!cudaAvailable && ( +
+

+ No CUDA-compatible GPU detected. Running in CPU-only mode. +

+
+ )} + +

+ GPU acceleration requires a compatible NVIDIA graphics card with CUDA support. + CPU mode works on all systems but may be slower. +

+
+
+ setShowRegisterModal(false)} open={showRegisterModal} /> { - handleStartService(); setShowtrainingModal(false); + setShowStartModal(true); }} onClose={() => setShowtrainingModal(false)} open={showtrainingModal} diff --git a/lpm_kernel/api/domains/trainprocess/routes.py b/lpm_kernel/api/domains/trainprocess/routes.py index af6425e4..c8a12f2e 100644 --- a/lpm_kernel/api/domains/trainprocess/routes.py +++ b/lpm_kernel/api/domains/trainprocess/routes.py @@ -288,7 +288,6 @@ def retrain(): data_synthesis_mode: Mode for data synthesis (optional) use_cuda: Whether to use CUDA for training (optional) is_cot: Whether to use Chain of Thought (optional) - use_previous_params: Whether to use previous training parameters (optional, default True) Returns: Response: JSON response @@ -318,7 +317,7 @@ def retrain(): is_cot = data.get("is_cot", None) # Log the received parameters - logger.info(f"Retrain parameters: model_name={model_name}, learning_rate={learning_rate}, number_of_epochs={number_of_epochs}, concurrency_threads={concurrency_threads}, data_synthesis_mode={data_synthesis_mode}, use_cuda={use_cuda}, is_cot={is_cot}, use_previous_params={use_previous_params}") + logger.info(f"Retrain parameters: model_name={model_name}, learning_rate={learning_rate}, number_of_epochs={number_of_epochs}, concurrency_threads={concurrency_threads}, data_synthesis_mode={data_synthesis_mode}, use_cuda={use_cuda}, is_cot={is_cot}") # Create training service instance train_service = TrainProcessService(current_model_name=model_name) diff --git a/lpm_kernel/api/services/local_llm_service.py b/lpm_kernel/api/services/local_llm_service.py index 04abd9a5..52e32bcf 100644 --- a/lpm_kernel/api/services/local_llm_service.py +++ b/lpm_kernel/api/services/local_llm_service.py @@ -59,7 +59,6 @@ def start_server(self, model_path: str, use_gpu: bool = True) -> bool: # Check for CUDA availability if GPU was requested cuda_available = torch.cuda.is_available() if use_gpu else False - cuda_available = False gpu_info = "" if use_gpu and cuda_available: @@ -81,7 +80,6 @@ def start_server(self, model_path: str, use_gpu: bool = True) -> bool: logger.info("Using CPU for inference (GPU not requested)") # Check for GPU optimization marker - gpu_optimized = False model_dir = os.path.dirname(model_path) gpu_marker_path = os.path.join(model_dir, "gpu_optimized.json") if os.path.exists(gpu_marker_path): @@ -118,66 +116,41 @@ def start_server(self, model_path: str, use_gpu: bool = True) -> bool: "--cont-batching" # Enable continuous batching ] - # Set up environment with CUDA variables to ensure GPU detection env = os.environ.copy() + # Default: do not expose GPU env["CUDA_VISIBLE_DEVICES"] = "" - - # Add GPU-related parameters if CUDA is available - if cuda_available and use_gpu: - # Force GPU usage with optimal parameters for faster loads + + if use_gpu and cuda_available: + # --- GPU/CUDA setup --- + # Add GPU-specific llama.cpp arguments cmd.extend([ "--n-gpu-layers", "999", # Use all layers on GPU "--tensor-split", "0", # Use the first GPU for all operations "--main-gpu", "0", # Use GPU 0 as the primary device - "--mlock" # Lock memory to prevent swapping during inference + "--flash-attn" ]) - - # Set CUDA environment variables to help with GPU detection - env["CUDA_VISIBLE_DEVICES"] = "0" # Force using first GPU - - # Ensure comprehensive library paths for CUDA - cuda_lib_paths = [ - "/usr/local/cuda/lib64", - "/usr/lib/cuda/lib64", - "/usr/local/lib", - "/usr/lib/x86_64-linux-gnu", - "/usr/lib/wsl/lib" # For Windows WSL environments - ] - - # Build a comprehensive LD_LIBRARY_PATH - current_ld_path = env.get("LD_LIBRARY_PATH", "") - for path in cuda_lib_paths: - if os.path.exists(path) and path not in current_ld_path: - current_ld_path = f"{path}:{current_ld_path}" if current_ld_path else path - - env["LD_LIBRARY_PATH"] = current_ld_path - logger.info(f"Setting LD_LIBRARY_PATH to: {current_ld_path}") - - # If this is Windows, use different approach for CUDA libraries - if os.name == 'nt': - # Windows typically has CUDA in PATH already if installed - logger.info("Windows system detected, using system CUDA libraries") - else: - # On Linux, try to find CUDA libraries in common locations - for cuda_path in [ - # Common CUDA paths + # Set CUDA environment variables + env["CUDA_VISIBLE_DEVICES"] = "0" # Use first GPU + + # Set up LD_LIBRARY_PATH for CUDA (Linux/WSL only) + if os.name != 'nt': + cuda_lib_paths = [ "/usr/local/cuda/lib64", "/usr/lib/cuda/lib64", - "/usr/local/lib/python3.12/site-packages/nvidia/cuda_runtime/lib", - "/usr/local/lib/python3.10/site-packages/nvidia/cuda_runtime/lib", - ]: - if os.path.exists(cuda_path): - # Add CUDA path to library path - env["LD_LIBRARY_PATH"] = f"{cuda_path}:{env.get('LD_LIBRARY_PATH', '')}" - env["CUDA_HOME"] = os.path.dirname(cuda_path) - logger.info(f"Found CUDA at {cuda_path}, setting environment variables") - break - - # NOTE: CUDA support and rebuild should be handled at build/setup time (e.g., Docker build or setup script). - # The runtime check and rebuild logic has been removed for efficiency and reliability. - # Ensure llama.cpp is built with CUDA support before running the server if GPU is required. + "/usr/local/lib", + "/usr/lib/x86_64-linux-gnu", + "/usr/lib/wsl/lib" + ] + current_ld_path = env.get("LD_LIBRARY_PATH", "") + for path in cuda_lib_paths: + if os.path.exists(path) and path not in current_ld_path: + current_ld_path = f"{path}:{current_ld_path}" if current_ld_path else path + env["LD_LIBRARY_PATH"] = current_ld_path + logger.info(f"Setting LD_LIBRARY_PATH to: {current_ld_path}") + else: + logger.info("Windows system detected, using system CUDA libraries") - # Pre-heat GPU to ensure faster initial response + # Pre-heat GPU for faster initial response if torch.cuda.is_available(): logger.info("Pre-warming GPU to reduce initial latency...") dummy_tensor = torch.zeros(1, 1).cuda() @@ -185,10 +158,9 @@ def start_server(self, model_path: str, use_gpu: bool = True) -> bool: torch.cuda.synchronize() torch.cuda.empty_cache() logger.info("GPU warm-up complete") - logger.info("Using GPU acceleration for inference with optimized settings") else: - # If GPU isn't available or supported, optimize for CPU + # --- CPU setup --- cmd.extend([ "--threads", str(max(1, os.cpu_count() - 1)), # Use all CPU cores except one ])