From fa94eef59bbad1dbc9b51747b941894f19cd14cf Mon Sep 17 00:00:00 2001 From: yanmuyuan <2216646664@qq.com> Date: Thu, 15 May 2025 15:08:39 +0800 Subject: [PATCH 1/3] Fix model download progress bar dependency log --- lpm_kernel/L2/utils.py | 37 +++++- .../trainprocess/trainprocess_service.py | 123 ++++++++---------- 2 files changed, 87 insertions(+), 73 deletions(-) diff --git a/lpm_kernel/L2/utils.py b/lpm_kernel/L2/utils.py index 5105c7bc..4f4d4d77 100644 --- a/lpm_kernel/L2/utils.py +++ b/lpm_kernel/L2/utils.py @@ -656,6 +656,15 @@ def save_hf_model(model_name=None, log_file_path=None) -> str: logger.info(f"Will be saved to: {save_path}") hf_model_name = f"Qwen/{model_name}" + progress_file = os.path.join(os.path.dirname(TRAIN_LOG_FILE), "train_progress.json") + def _write_progress_to_file(progress_data,progress_file): + try: + import json + with open(progress_file, 'w', encoding='utf-8') as f: + json_str = json.dumps(progress_data) + f.write(json_str) + except Exception as e: + logger.error(f"Error writing progress to file: {str(e)}") try: # Get list of files to download @@ -702,15 +711,27 @@ def download_file_with_progress(filename): # Define progress callback def progress_callback(current, total): progress_bar.update(current - progress_bar.n) - + # Log progress every ~1MB if current % (1024 * 1024) < 8192: if total and total > 0: percent = current / total * 100 + progress_data ={ + "file_name": filename, + "file_size": total_size / 1024 / 1024 if total_size >0 else None, + "downloaded_mb": current/1024/1024, + "total_mb": total/1024/1024, + "percentage": percent, + "completed": False + } + _write_progress_to_file(progress_data, progress_file) logger.info(f"File {filename}: Downloaded {current/1024/1024:.2f} MB / {total/1024/1024:.2f} MB ({percent:.2f}%)") + else: logger.info(f"File {filename}: Downloaded {current/1024/1024:.2f} MB (total size unknown)") - + + + # Download file with progress tracking response = requests.get(url, stream=True) if response.status_code == 200: @@ -788,6 +809,18 @@ def progress_callback(current, total): logger.info(f"Model {model_name} downloaded with {file_count} files.") except Exception: logger.info(f"Download completed for model: {model_name}.") + + progress_data = { + "file_name": "ALL_FILES", + "file_size": 100, + "downloaded_mb": 100, + "total_mb": 100, + "percentage": 100.0, + "completed": True, + } + + _write_progress_to_file(progress_data, progress_file) + except requests.RequestException: try: from modelscope.hub.snapshot_download import snapshot_download diff --git a/lpm_kernel/api/domains/trainprocess/trainprocess_service.py b/lpm_kernel/api/domains/trainprocess/trainprocess_service.py index 9bfc318f..5acae17b 100644 --- a/lpm_kernel/api/domains/trainprocess/trainprocess_service.py +++ b/lpm_kernel/api/domains/trainprocess/trainprocess_service.py @@ -833,85 +833,66 @@ def _monitor_model_download(self) -> bool: last_position = 0 # Variables to track download status - current_file = "" + file_name = "" file_size = 0 total_size = 0 # Total size of all files file_sizes = {} # Dictionary to store file sizes last_update_time = time.time() + completed = False while True: - try: - # Read new log content - with open(log_file, 'r') as f: - f.seek(last_position) - new_lines = f.readlines() - last_position = f.tell() - - for line in new_lines: - line = line.strip() - - # Check for download start - if "Starting download of model:" in line: - logger.info("Model download started") - continue - - # Get file size information when a download starts - if "Starting download of file:" in line: - match = re.search(r"Starting download of file: (.+) \(Size: ([\d\.]+) MB\)", line) - if match: - current_file = match.group(1) - file_size = float(match.group(2)) - file_sizes[current_file] = file_size - total_size = sum(file_sizes.values()) - # logger.info(f"Starting download of {current_file} ({file_size} MB)") - - # Track file download progress - if "Downloaded" in line and "MB /" in line: - match = re.search(r"File (.+): Downloaded ([\d\.]+) MB / ([\d\.]+) MB \(([\d\.]+)%\)", line) - if match: - file_name = match.group(1) - downloaded_mb = float(match.group(2)) - total_mb = float(match.group(3)) - percentage = float(match.group(4)) - - # Update file size if it was updated (especially for model.safetensors) - if total_mb > file_sizes.get(file_name, 0): - file_sizes[file_name] = total_mb - total_size = sum(file_sizes.values()) - - # Calculate overall progress - if total_size > 0: - # Sum up all downloaded data - completed_files_size = sum([file_sizes.get(f, 0) for f in file_sizes if f != file_name]) - current_file_downloaded = (percentage / 100.0) * total_mb - overall_downloaded = completed_files_size + current_file_downloaded - current_progress = (overall_downloaded / total_size) * 100 - current_progress = min(99.0, current_progress) # Cap at 99% until fully complete - # Update progress at most once per second - current_time = time.time() - if current_time - last_update_time >= 3.0: + progress_file = os.path.join(os.path.dirname(TRAIN_LOG_FILE), "train_progress.json") + if os.path.exists(progress_file): + try: + import json + with open(progress_file, 'r') as f: + progress_data = json.load(f) + file_name = progress_data.get("file_name") + file_size = float(progress_data.get("file_size")) + downloaded_mb = float(progress_data.get("downloaded_mb")) + total_mb = float(progress_data.get("total_mb")) + percentage = float(progress_data.get("percentage")) + completed = progress_data.get("completed") + + except Exception as e: + logger.error(f"Error reading progress file: {str(e)}") + if completed: + self.progress.mark_step_status(ProcessStep.MODEL_DOWNLOAD, Status.COMPLETED) + logger.info("Model download completed") + return True + + file_sizes[file_name] = file_size + total_size = sum(file_sizes.values()) + + # Update file size if it was updated (especially for model.safetensors) + if total_mb > file_sizes.get(file_name, 0): + file_sizes[file_name] = total_mb + total_size = sum(file_sizes.values()) + + # Calculate overall progress + if total_size > 0: + # Sum up all downloaded data + completed_files_size = sum( + [file_sizes.get(f, 0) for f in file_sizes if f != file_name]) + current_file_downloaded = (percentage / 100.0) * total_mb + overall_downloaded = completed_files_size + current_file_downloaded + current_progress = (overall_downloaded / total_size) * 100 + current_progress = min(99.0, current_progress) # Cap at 99% until fully complete + # Update progress at most once per second + current_time = time.time() + if current_time - last_update_time >= 3.0: + self._update_progress( + "downloading_the_base_model", + "model_download", + current_progress, + f"Overall: {current_progress:.1f}% - Downloading {file_name}: {percentage}% ({downloaded_mb:.1f}/{total_mb:.1f} MB)" + ) + last_update_time = current_time + + # Briefly pause to avoid excessive CPU usage + time.sleep(0.1) - self._update_progress( - "downloading_the_base_model", - "model_download", - current_progress, - f"Overall: {current_progress:.1f}% - Downloading {file_name}: {percentage}% ({downloaded_mb:.1f}/{total_mb:.1f} MB)" - ) - last_update_time = current_time - if "Model downloaded successfully" in line: - self.progress.mark_step_status(ProcessStep.MODEL_DOWNLOAD, Status.COMPLETED) - logger.info("Model download completed") - return True - - # Briefly pause to avoid excessive CPU usage - time.sleep(0.1) - - except IOError as e: - logger.error(f"Failed to read log file: {str(e)}") - time.sleep(0.1) - continue - except Exception as e: logger.error(f"Failed to monitor model download progress: {str(e)}") return False From aabeebf881b8194777c606a13ac64ca4bc99abfb Mon Sep 17 00:00:00 2001 From: yanmuyuan <2216646664@qq.com> Date: Thu, 15 May 2025 15:36:22 +0800 Subject: [PATCH 2/3] indentation problem --- lpm_kernel/L2/utils.py | 3 ++- lpm_kernel/api/domains/trainprocess/trainprocess_service.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lpm_kernel/L2/utils.py b/lpm_kernel/L2/utils.py index 4f4d4d77..c989d161 100644 --- a/lpm_kernel/L2/utils.py +++ b/lpm_kernel/L2/utils.py @@ -38,6 +38,8 @@ import gc import requests +progress_file = os.path.join(os.path.dirname(TRAIN_LOG_FILE), "train_progress.json") + # Initialize the logger logger = logging.getLogger(__name__) @@ -656,7 +658,6 @@ def save_hf_model(model_name=None, log_file_path=None) -> str: logger.info(f"Will be saved to: {save_path}") hf_model_name = f"Qwen/{model_name}" - progress_file = os.path.join(os.path.dirname(TRAIN_LOG_FILE), "train_progress.json") def _write_progress_to_file(progress_data,progress_file): try: import json diff --git a/lpm_kernel/api/domains/trainprocess/trainprocess_service.py b/lpm_kernel/api/domains/trainprocess/trainprocess_service.py index 5acae17b..1f612911 100644 --- a/lpm_kernel/api/domains/trainprocess/trainprocess_service.py +++ b/lpm_kernel/api/domains/trainprocess/trainprocess_service.py @@ -34,6 +34,7 @@ import subprocess from lpm_kernel.configs.logging import get_train_process_logger, TRAIN_LOG_FILE logger = get_train_process_logger() +progress_file = os.path.join(os.path.dirname(TRAIN_LOG_FILE), "train_progress.json") class TrainProcessService: """Training process service (singleton pattern)""" @@ -841,7 +842,6 @@ def _monitor_model_download(self) -> bool: completed = False while True: - progress_file = os.path.join(os.path.dirname(TRAIN_LOG_FILE), "train_progress.json") if os.path.exists(progress_file): try: import json From 0006ac6a6e4ada9b89e2ef4d3d26b337fc962d1a Mon Sep 17 00:00:00 2001 From: yanmuyuan <2216646664@qq.com> Date: Thu, 15 May 2025 15:44:04 +0800 Subject: [PATCH 3/3] Abnormal indentation issue --- lpm_kernel/L2/utils.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lpm_kernel/L2/utils.py b/lpm_kernel/L2/utils.py index c989d161..7c7c0208 100644 --- a/lpm_kernel/L2/utils.py +++ b/lpm_kernel/L2/utils.py @@ -597,6 +597,15 @@ def setup_logger(log_path, logger_name="download_logger"): return logger +def _write_progress_to_file(progress_data,progress_file): + try: + import json + with open(progress_file, 'w', encoding='utf-8') as f: + json_str = json.dumps(progress_data) + f.write(json_str) + except Exception as e: + logger.error(f"Error writing progress to file: {str(e)}") + def save_hf_model(model_name=None, log_file_path=None) -> str: """Saves a Hugging Face model locally. @@ -658,14 +667,6 @@ def save_hf_model(model_name=None, log_file_path=None) -> str: logger.info(f"Will be saved to: {save_path}") hf_model_name = f"Qwen/{model_name}" - def _write_progress_to_file(progress_data,progress_file): - try: - import json - with open(progress_file, 'w', encoding='utf-8') as f: - json_str = json.dumps(progress_data) - f.write(json_str) - except Exception as e: - logger.error(f"Error writing progress to file: {str(e)}") try: # Get list of files to download @@ -731,7 +732,6 @@ def progress_callback(current, total): else: logger.info(f"File {filename}: Downloaded {current/1024/1024:.2f} MB (total size unknown)") - # Download file with progress tracking response = requests.get(url, stream=True)