diff --git a/lpm_kernel/L2/utils.py b/lpm_kernel/L2/utils.py index 6a0a00ab..626c9173 100644 --- a/lpm_kernel/L2/utils.py +++ b/lpm_kernel/L2/utils.py @@ -38,6 +38,8 @@ import gc import requests +progress_file = os.path.join(os.path.dirname(TRAIN_LOG_FILE), "train_progress.json") + # Initialize the logger logger = logging.getLogger(__name__) @@ -595,6 +597,15 @@ def setup_logger(log_path, logger_name="download_logger"): return logger +def _write_progress_to_file(progress_data,progress_file): + try: + import json + with open(progress_file, 'w', encoding='utf-8') as f: + json_str = json.dumps(progress_data) + f.write(json_str) + except Exception as e: + logger.error(f"Error writing progress to file: {str(e)}") + def save_hf_model(model_name=None, log_file_path=None) -> str: """Saves a Hugging Face model locally. @@ -708,10 +719,21 @@ def progress_callback(current, total): if current % (1024 * 1024 * 10) < 8192: if total and total > 0: percent = current / total * 100 + progress_data ={ + "file_name": filename, + "file_size": total_size / 1024 / 1024 if total_size >0 else None, + "downloaded_mb": current/1024/1024, + "total_mb": total/1024/1024, + "percentage": percent, + "completed": False + } + _write_progress_to_file(progress_data, progress_file) logger.info(f"File {filename}: Downloaded {current/1024/1024:.2f} MB / {total/1024/1024:.2f} MB ({percent:.2f}%)") + else: logger.info(f"File {filename}: Downloaded {current/1024/1024:.2f} MB (total size unknown)") - + + # Download file with progress tracking response = requests.get(url, stream=True) if response.status_code == 200: @@ -789,6 +811,18 @@ def progress_callback(current, total): logger.info(f"Model {model_name} downloaded with {file_count} files.") except Exception: logger.info(f"Download completed for model: {model_name}.") + + progress_data = { + "file_name": "ALL_FILES", + "file_size": 100, + "downloaded_mb": 100, + "total_mb": 100, + "percentage": 100.0, + "completed": True, + } + + _write_progress_to_file(progress_data, progress_file) + except requests.RequestException: try: from modelscope.hub.snapshot_download import snapshot_download diff --git a/lpm_kernel/api/domains/trainprocess/trainprocess_service.py b/lpm_kernel/api/domains/trainprocess/trainprocess_service.py index f05850b2..35bea1ae 100644 --- a/lpm_kernel/api/domains/trainprocess/trainprocess_service.py +++ b/lpm_kernel/api/domains/trainprocess/trainprocess_service.py @@ -34,6 +34,7 @@ import subprocess from lpm_kernel.configs.logging import get_train_process_logger, TRAIN_LOG_FILE logger = get_train_process_logger() +progress_file = os.path.join(os.path.dirname(TRAIN_LOG_FILE), "train_progress.json") class TrainProcessService: """Training process service (singleton pattern)""" @@ -808,86 +809,65 @@ def _monitor_model_download(self) -> bool: last_position = 0 # Variables to track download status - current_file = "" + file_name = "" file_size = 0 total_size = 0 # Total size of all files file_sizes = {} # Dictionary to store file sizes last_update_time = time.time() + completed = False while True: - try: - # Read new log content - with open(log_file, 'r') as f: - f.seek(last_position) - new_lines = f.readlines() - last_position = f.tell() - - for line in new_lines: - line = line.strip() - - # Check for download start - if "Starting download of model:" in line: - logger.info("Model download started") - continue - - # Get file size information when a download starts - if "Starting download of file:" in line: - match = re.search(r"Starting download of file: (.+) \(Size: ([\d\.]+) MB\)", line) - if match: - current_file = match.group(1) - file_size = float(match.group(2)) - file_sizes[current_file] = file_size - total_size = sum(file_sizes.values()) - # logger.info(f"Starting download of {current_file} ({file_size} MB)") - - # Track file download progress - if "Downloaded" in line and "MB /" in line: - match = re.search(r"File (.+): Downloaded ([\d\.]+) MB / ([\d\.]+) MB \(([\d\.]+)%\)", line) - if match: - file_name = match.group(1) - downloaded_mb = float(match.group(2)) - total_mb = float(match.group(3)) - percentage = float(match.group(4)) - - # Update file size if it was updated (especially for model.safetensors) - if total_mb > file_sizes.get(file_name, 0): - file_sizes[file_name] = total_mb - total_size = sum(file_sizes.values()) - - # Calculate overall progress - if total_size > 0: - # Sum up all downloaded data - completed_files_size = sum([file_sizes.get(f, 0) for f in file_sizes if f != file_name]) - current_file_downloaded = (percentage / 100.0) * total_mb - overall_downloaded = completed_files_size + current_file_downloaded - current_progress = (overall_downloaded / total_size) * 100 - current_progress = min(99.0, current_progress) # Cap at 99% until fully complete - # Update progress at most once per second - current_time = time.time() - if current_time - last_update_time >= 3.0: + if os.path.exists(progress_file): + try: + import json + with open(progress_file, 'r') as f: + progress_data = json.load(f) + file_name = progress_data.get("file_name") + file_size = float(progress_data.get("file_size")) + downloaded_mb = float(progress_data.get("downloaded_mb")) + total_mb = float(progress_data.get("total_mb")) + percentage = float(progress_data.get("percentage")) + completed = progress_data.get("completed") - self._update_progress( - "downloading_the_base_model", - "model_download", - current_progress, - f"Overall: {current_progress:.1f}% - Downloading {file_name}: {percentage}% ({downloaded_mb:.1f}/{total_mb:.1f} MB)", - file_name - ) - last_update_time = current_time + except Exception as e: + logger.error(f"Error reading progress file: {str(e)}") + if completed: + self.progress.mark_step_status(ProcessStep.MODEL_DOWNLOAD, Status.COMPLETED) + logger.info("Model download completed") + return True + + file_sizes[file_name] = file_size + total_size = sum(file_sizes.values()) + + # Update file size if it was updated (especially for model.safetensors) + if total_mb > file_sizes.get(file_name, 0): + file_sizes[file_name] = total_mb + total_size = sum(file_sizes.values()) + + # Calculate overall progress + if total_size > 0: + # Sum up all downloaded data + completed_files_size = sum( + [file_sizes.get(f, 0) for f in file_sizes if f != file_name]) + current_file_downloaded = (percentage / 100.0) * total_mb + overall_downloaded = completed_files_size + current_file_downloaded + current_progress = (overall_downloaded / total_size) * 100 + current_progress = min(99.0, current_progress) # Cap at 99% until fully complete + # Update progress at most once per second + current_time = time.time() + if current_time - last_update_time >= 3.0: + self._update_progress( + "downloading_the_base_model", + "model_download", + current_progress, + f"Overall: {current_progress:.1f}% - Downloading {file_name}: {percentage}% ({downloaded_mb:.1f}/{total_mb:.1f} MB)", + file_name + ) + last_update_time = current_time + + # Briefly pause to avoid excessive CPU usage + time.sleep(0.1) - if "Model downloaded successfully" in line: - self.progress.mark_step_status(ProcessStep.MODEL_DOWNLOAD, Status.COMPLETED) - logger.info("Model download completed") - return True - - # Briefly pause to avoid excessive CPU usage - time.sleep(0.1) - - except IOError as e: - logger.error(f"Failed to read log file: {str(e)}") - time.sleep(0.1) - continue - except Exception as e: logger.error(f"Failed to monitor model download progress: {str(e)}") return False