From 70513e3b443cefc98a436888bad0cf7a9cd5d121 Mon Sep 17 00:00:00 2001
From: Walnutes
Date: Tue, 2 Dec 2025 11:04:46 +0800
Subject: [PATCH] add online data mixing (ODM) doc

---
 docs/.vuepress/notes/en/guide.ts |   1 +
 docs/.vuepress/notes/zh/guide.ts |   1 +
 docs/en/notes/guide/mixer/odm.md | 207 ++++++++++++++++++++++++++++++
 docs/zh/notes/guide/mixer/odm.md | 208 +++++++++++++++++++++++++++++++
 4 files changed, 417 insertions(+)
 create mode 100644 docs/en/notes/guide/mixer/odm.md
 create mode 100644 docs/zh/notes/guide/mixer/odm.md

diff --git a/docs/.vuepress/notes/en/guide.ts b/docs/.vuepress/notes/en/guide.ts
index ce20914..339ac1a 100644
--- a/docs/.vuepress/notes/en/guide.ts
+++ b/docs/.vuepress/notes/en/guide.ts
@@ -39,6 +39,7 @@ export const Guide: ThemeNote = defineNoteConfig({
       'quickstart',
       'tutorial',
       'doremi',
+      'odm',
     ],
   },
   {
diff --git a/docs/.vuepress/notes/zh/guide.ts b/docs/.vuepress/notes/zh/guide.ts
index 5f56c23..c5a56d0 100644
--- a/docs/.vuepress/notes/zh/guide.ts
+++ b/docs/.vuepress/notes/zh/guide.ts
@@ -39,6 +39,7 @@ export const Guide: ThemeNote = defineNoteConfig({
       'quickstart',
       'tutorial',
       'doremi',
+      'odm',
     ],
   },
   {
diff --git a/docs/en/notes/guide/mixer/odm.md b/docs/en/notes/guide/mixer/odm.md
new file mode 100644
index 0000000..8ba77ff
--- /dev/null
+++ b/docs/en/notes/guide/mixer/odm.md
@@ -0,0 +1,207 @@
+---
+title: ODM Data Mixer
+createTime: 2025/01/27 10:00:00
+icon: material-symbols:casino
+permalink: /en/guide/mixer/odm/
+---
+
+# ODM Data Mixer
+
+ODM (Online Data Mixing) is an algorithm for dynamically optimizing multi-domain data mixing ratios during training. It uses the Exp3 algorithm from the multi-armed bandit literature to adaptively adjust domain weights, with the training loss serving as the reward signal. Unlike DoReMi, ODM requires no reference model, which makes it cheaper to run and easier to deploy.
+
+## Algorithm Overview
+
+ODM uses the Exp3 algorithm (Exponential-weight algorithm for Exploration and Exploitation) to adjust domain weights on the fly:
+
+1. **Warmup Phase**: Train with the initial proportions (uniform or user-specified) for a fixed number of steps
+2. **Evaluation Phase**: Estimate the current loss of each domain by sampling evaluation batches from it
+3. **Reward Update**: Update the cumulative estimated rewards using importance weighting and a moving average
+4. **Policy Update**: Update the sampling policy with the Exp3 rule under a decaying exploration rate
+
+Through its time-decaying exploration rate, the algorithm automatically balances exploration (trying all domains) against exploitation (focusing on high-reward domains).
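+
+To make the update concrete, here is a minimal sketch of a single Exp3 update for two domains. It is illustrative only, not the DataFlex implementation; in particular, the `scaling_factor` that appears in the update formulas later on this page is taken here to be `(1 - K*ε_t) / Σ_j exp(...)`, the standard Exp3 choice that makes the probabilities sum to 1:
+
+```python
+import math
+
+def exp3_update(cum_rewards, losses, probs, step):
+    """One Exp3-style update: losses -> rewards -> importance weighting -> policy."""
+    K = len(losses)
+    # Decaying exploration rate: eps_t = min(1/K, sqrt(ln(K) / (K * t)))
+    eps = lambda t: min(1.0 / K, math.sqrt(math.log(K) / (K * t)))
+
+    # Rewards are scaled losses (the /10.0 keeps the exponentials from exploding).
+    rewards = [loss / 10.0 for loss in losses]
+
+    # Importance weighting: R_hat_i += reward_i / pi_i
+    cum_rewards = [r_hat + r / p for r_hat, r, p in zip(cum_rewards, rewards, probs)]
+
+    # Exp3 policy: softmax of eps_{t-1} * R_hat, mixed with an eps_t exploration floor
+    # (eps_{t-1} is clamped to eps_1 at the very first update).
+    logits = [eps(max(step - 1, 1)) * r_hat for r_hat in cum_rewards]
+    max_logit = max(logits)  # subtract the max for numerical stability
+    exps = [math.exp(x - max_logit) for x in logits]
+    eps_t = eps(step)
+    scale = (1.0 - K * eps_t) / sum(exps)
+    new_probs = [w * scale + eps_t for w in exps]
+    return cum_rewards, new_probs
+
+# Two domains, uniform start; the higher-loss domain ends up slightly upweighted.
+r_hat, pi = [0.0, 0.0], [0.5, 0.5]
+r_hat, pi = exp3_update(r_hat, losses=[2.8, 3.4], probs=pi, step=5)
+print(pi)
+```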
+
+## Key Features
+
+- **No Reference Model Required**: Unlike DoReMi, ODM works directly with the model being trained
+- **Online Learning**: Adapts the domain weights continuously during training
+- **Importance Weighting**: Uses importance-weighted rewards to correct for non-uniform sampling
+- **Decaying Exploration**: The exploration rate decreases over time for better convergence
+
+## Configuration
+
+### Basic Configuration
+
+**Configuration File**: `odm_dynamic_qwen_pt_full.yaml`
+
+```yaml
+### dynamic_train - ODM: Online Data Mixing with Multi-Armed Bandits
+train_type: dynamic_mix
+components_cfg_file: src/dataflex/configs/components.yaml
+component_name: odm                   # Use the ODM mixer
+mixture_sample_rule: mixture
+init_mixture_proportions: [0.5, 0.5]  # Initial weights
+warmup_step: 2000                     # Warmup steps before ODM starts
+update_step: 500                      # Frequency of weight updates
+update_times: -1                      # -1 means continuous updates until training ends
+```
+
+**Configuration in components.yaml**:
+
+```yaml
+mixers:
+  odm:
+    name: odm
+    params:
+      # Smoothing parameter for the exponential moving average (0 to 1)
+      alpha: 0.9
+
+      # Number of warmup steps using the initial proportions
+      warmup_steps: 2000
+
+      # Number of samples to evaluate per domain when computing rewards
+      num_eval_samples: 500
+
+      # Batch size for evaluation
+      eval_batch_size: 8
+
+      # Initial proportions for the warmup period
+      initial_proportions: [0.5, 0.5]
+      # initial_proportions: null  # Use a uniform distribution
+```
+
+### Training Configuration Parameters
+
+In the training configuration file:
+
+- **`warmup_step`**: Number of steps before ODM starts (should match or exceed `warmup_steps` in components.yaml)
+- **`update_step`**: Frequency of weight updates (every N steps)
+- **`update_times`**: Number of weight updates; use `-1` for continuous updates until training ends
+- **`train_step`**: Optional explicit total number of training steps (overrides `num_train_epochs`)
+
+## Training Process
+
+### Single-Step Training
+
+Unlike DoReMi's three-step pipeline, ODM requires only a single training run:
+
+```bash
+# Single training run with ODM
+llamafactory-cli train examples/train_full/mixers/odm_dynamic_qwen_pt_full.yaml
+```
+
+### Weight Update Process
+
+During training, ODM performs the following steps at each update (a sketch of the evaluation step follows the list):
+
+1. **Domain Evaluation**: Sample batches from each domain and evaluate them to obtain the current per-domain losses
+2. **Reward Computation**: Convert losses to rewards (reward = loss / 10.0), so higher-loss domains earn larger rewards; the division by 10.0 keeps the exponentials from exploding
+3. **Importance Weighting**: Update the cumulative estimated rewards using importance weighting: `R̂_i += reward_i / π_i`
+4. **Policy Update**: Update the domain weights with the Exp3 rule:
+   - Compute the exploration rate: `ε_t = min{1/K, sqrt(ln(K) / (K * t))}`
+   - Update the weights: `w_i = exp(ε_{t-1} * R̂_i) * scaling_factor + ε_t`
+   - Normalize to obtain probabilities: `π_i = w_i / Σ_j w_j`
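+
+As a rough illustration of step 1, a per-domain loss estimate might look like the following. This is a hedged sketch, not the DataFlex code: `estimate_domain_loss` and its arguments are hypothetical names, and it assumes a Hugging Face-style model whose forward pass returns a `.loss` and batches that are dicts of tensors:
+
+```python
+import torch
+
+@torch.no_grad()
+def estimate_domain_loss(model, domain_loader, num_eval_samples=500):
+    """Mean loss over roughly num_eval_samples examples from one domain
+    (mirrors the num_eval_samples / eval_batch_size params above)."""
+    model.eval()
+    total_loss, seen = 0.0, 0
+    for batch in domain_loader:       # batches of size eval_batch_size
+        outputs = model(**batch)      # assumes outputs.loss is the mean batch loss
+        n = batch["input_ids"].size(0)
+        total_loss += outputs.loss.item() * n
+        seen += n
+        if seen >= num_eval_samples:
+            break
+    model.train()
+    return total_loss / max(seen, 1)
+```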
+
+### Weight Logging
+
+During training, an `odm_weights.jsonl` file is generated automatically, recording the details of every weight update:
+
+```json
+{"step": 2000, "timestamp": "2025-01-27 10:00:00", "domain_names": ["wiki", "c4"], "domain_weights": [0.3, 0.7], "cumulative_estimated_rewards": [25.3, 45.8], "exploration_rate": 0.0141, "alpha": 0.9, "warmup_steps": 2000, "is_warmup": false}
+{"step": 2500, "timestamp": "2025-01-27 10:10:00", "domain_names": ["wiki", "c4"], "domain_weights": [0.25, 0.75], "cumulative_estimated_rewards": [28.1, 52.3], "exploration_rate": 0.0126, "alpha": 0.9, "warmup_steps": 2000, "is_warmup": false}
+```
+
+## Weight Extraction and Analysis
+
+Extract the optimized weights from the training output directory:
+
+```python
+import json
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+# Read the weight log
+weights_history = []
+with open('odm_result/odm_weights.jsonl', 'r') as f:
+    for line in f:
+        weights_history.append(json.loads(line))
+
+domain_names = weights_history[0]['domain_names']
+
+# Get the final weights (skip warmup entries)
+non_warmup_entries = [e for e in weights_history if not e.get('is_warmup', False)]
+assert non_warmup_entries, "no post-warmup weight updates logged yet"
+
+final_weights = non_warmup_entries[-1]['domain_weights']
+print("Final optimized domain weights:")
+for name, weight in zip(domain_names, final_weights):
+    print(f"  {name}: {weight:.4f}")
+
+# Visualize the weight evolution
+non_warmup_steps = [e['step'] for e in non_warmup_entries]
+weights_matrix = np.array([e['domain_weights'] for e in non_warmup_entries])
+
+plt.figure(figsize=(10, 6))
+for i, name in enumerate(domain_names):
+    plt.plot(non_warmup_steps, weights_matrix[:, i], label=name, marker='o')
+plt.xlabel('Training Step')
+plt.ylabel('Domain Weight')
+plt.title('ODM Domain Weight Evolution')
+plt.legend()
+plt.grid(True)
+plt.savefig('odm_weights_evolution.png')
+plt.show()
+
+# Plot the exploration-rate decay
+exploration_rates = [e['exploration_rate'] for e in non_warmup_entries]
+plt.figure(figsize=(10, 4))
+plt.plot(non_warmup_steps, exploration_rates, label='Exploration Rate ε_t')
+plt.xlabel('Training Step')
+plt.ylabel('Exploration Rate')
+plt.title('ODM Exploration Rate Decay')
+plt.legend()
+plt.grid(True)
+plt.savefig('odm_exploration_rate.png')
+plt.show()
+```
+
+## Complete Training Example
+
+```bash
+llamafactory-cli train examples/train_full/mixers/odm_dynamic_qwen_pt_full.yaml
+```
+
+## Comparison with DoReMi
+
+| Aspect | ODM | DoReMi |
+|--------|-----|--------|
+| **Reference Model** | Not required | Required (Step 1) |
+| **Training Runs** | One | Three (reference → proxy → target) |
+| **Computation Cost** | Lower (no reference model) | Higher (reference + proxy + target) |
+| **Adaptation** | Continuous online adaptation | Offline optimization on a proxy model |
+| **Loss Signal** | Training loss | Excess loss (vs. reference) |
+| **Algorithm** | Exp3 (Multi-Armed Bandits) | Exponentiated Gradient Ascent |
+
+## FAQ
+
+### Q: How does ODM differ from DoReMi?
+
+A: ODM uses the training loss directly as its reward signal and adapts online during training, whereas DoReMi requires a reference model and optimizes domain weights on a proxy model before the target model is trained. ODM is simpler to use, but DoReMi's excess-loss objective may offer stronger theoretical guarantees.
+
+### Q: How is the exploration rate computed?
+
+A: The exploration rate decays over time: `ε_t = min{1/K, sqrt(ln(K) / (K * t))}`, where K is the number of domains and t is the update step. This makes the algorithm explore more in the early stages and exploit more later on.
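+
+A quick numeric check of this schedule (illustrative, with K = 2):
+
+```python
+import math
+
+K = 2
+for t in [1, 500, 2000, 10000]:
+    eps_t = min(1 / K, math.sqrt(math.log(K) / (K * t)))
+    print(f"t={t:>5}  eps_t={eps_t:.4f}")
+# t=1 is capped at 1/K = 0.5; by t=10000 the exploration floor has decayed to ~0.0059
+```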
+
+### Q: What if a domain gets a very low weight?
+
+A: The exploration floor guarantees every domain a sampling probability of at least `ε_t`, so no domain is ever ignored completely. As exploration decays, domains with consistently low loss (and hence low reward) naturally receive lower weights, but they are still sampled occasionally.
+
+### Q: How to choose between uniform and custom initial proportions?
+
+A: Use the uniform distribution (`initial_proportions: null`) for unbiased exploration. Use custom proportions if you have prior knowledge about domain importance or want to start from a specific distribution. The algorithm adapts from either starting point.
+
+## References
+
+- Paper: [Efficient Online Data Mixing for Language Model Pre-training](https://arxiv.org/abs/2312.02406)
+- Official Implementation: [Online Data Mixing GitHub](https://github.com/alon-albalak/online-data-mixing)
+- Project: [DataFlex GitHub](https://github.com/OpenDCAI/DataFlex)
diff --git a/docs/zh/notes/guide/mixer/odm.md b/docs/zh/notes/guide/mixer/odm.md
new file mode 100644
index 0000000..e06d41d
--- /dev/null
+++ b/docs/zh/notes/guide/mixer/odm.md
@@ -0,0 +1,208 @@
+---
+title: ODM 数据混合器
+createTime: 2025/01/27 10:00:00
+icon: material-symbols:casino
+permalink: /zh/guide/mixer/odm/
+---
+
+# ODM 数据混合器
+
+ODM (Online Data Mixing) 是一种在训练过程中动态优化多领域数据混合比例的算法。它使用多臂老虎机(Multi-Armed Bandits)中的 Exp3 算法,以训练损失作为奖励信号,自适应地调整领域权重。与 DoReMi 不同,ODM 不需要参考模型,因此更加高效且易于部署。
+
+## 算法概述
+
+ODM 使用 Exp3 算法(Exponential-weight algorithm for Exploration and Exploitation,面向探索与利用的指数加权算法)在训练过程中动态调整领域权重:
+
+1. **预热阶段**:使用初始比例(均匀分布或指定值)训练固定步数
+2. **评估阶段**:通过采样批次评估每个领域的当前损失
+3. **奖励更新**:使用重要性加权和移动平均更新累积估计奖励
+4. **策略更新**:在衰减探索率下,使用 Exp3 规则更新采样策略
+
+算法通过随时间衰减的探索率,自动在探索(尝试所有领域)与利用(关注高奖励领域)之间取得平衡。
+
+## 核心特性
+
+- **无需参考模型**:与 DoReMi 不同,ODM 直接作用于正在训练的模型
+- **在线学习**:在训练过程中持续调整领域权重
+- **重要性加权**:使用重要性加权的奖励来校正非均匀采样
+- **衰减探索**:探索率随时间衰减,以获得更好的收敛性
+
+## 配置说明
+
+### 基础配置
+
+**配置文件**: `odm_dynamic_qwen_pt_full.yaml`
+
+```yaml
+### dynamic_train - ODM: Online Data Mixing with Multi-Armed Bandits
+train_type: dynamic_mix
+components_cfg_file: src/dataflex/configs/components.yaml
+component_name: odm                   # 使用 ODM 混合器
+mixture_sample_rule: mixture
+init_mixture_proportions: [0.5, 0.5]  # 初始权重
+warmup_step: 2000                     # ODM 开始前的预热步数
+update_step: 500                      # 权重更新频率
+update_times: -1                      # -1 表示持续更新直到训练结束
+```
+
+**在 components.yaml 中的配置**:
+
+```yaml
+mixers:
+  odm:
+    name: odm
+    params:
+      # 指数移动平均的平滑参数(0 到 1)
+      alpha: 0.9
+
+      # 使用初始比例的预热步数
+      warmup_steps: 2000
+
+      # 计算奖励时每个领域评估的样本数
+      num_eval_samples: 500
+
+      # 评估时的批次大小
+      eval_batch_size: 8
+
+      # 预热期的初始比例
+      initial_proportions: [0.5, 0.5]
+      # initial_proportions: null  # 使用均匀分布
+```
+
+### 训练配置参数
+
+在训练配置文件中:
+
+- **`warmup_step`**: 开始 ODM 前的步数(应不小于 components.yaml 中的 `warmup_steps`)
+- **`update_step`**: 权重更新的频率(每 N 步一次)
+- **`update_times`**: 权重更新次数;使用 `-1` 表示持续更新直到训练结束
+- **`train_step`**: 可选的显式总训练步数(会覆盖 `num_train_epochs`)
+
+## 训练流程
+
+### 单步训练
+
+与 DoReMi 的三步流程不同,ODM 只需要一次训练运行:
+
+```bash
+# 使用 ODM 的单次训练运行
+llamafactory-cli train examples/train_full/mixers/odm_dynamic_qwen_pt_full.yaml
+```
+
+### 权重更新过程
+
+在训练过程中,ODM 在每次更新时执行以下步骤(列表后附有一段示意代码):
+
+1. **领域评估**:从每个领域采样批次并进行评估,得到各领域的当前损失
+2. **奖励计算**:将损失转换为奖励(reward = loss / 10.0),损失越高的领域获得的奖励越大;除以 10.0 是为了防止指数爆炸
+3. **重要性加权**:使用重要性加权更新累积估计奖励:`R̂_i += reward_i / π_i`
+4. **策略更新**:使用 Exp3 规则更新领域权重:
+   - 计算探索率:`ε_t = min{1/K, sqrt(ln(K) / (K * t))}`
+   - 更新权重:`w_i = exp(ε_{t-1} * R̂_i) * scaling_factor + ε_t`
+   - 归一化得到概率:`π_i = w_i / Σ_j w_j`
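+
+下面是上述更新过程的一段最小示意代码(仅作说明,并非 DataFlex 的实际实现;其中 `scaling_factor` 按标准 Exp3 的做法取 `(1 - K*ε_t) / Σ_j exp(...)`,使概率之和为 1):
+
+```python
+import math
+
+def exp3_update(cum_rewards, losses, probs, step):
+    """一次 Exp3 更新:损失 -> 奖励 -> 重要性加权 -> 策略。"""
+    K = len(losses)
+    # 衰减探索率:eps_t = min(1/K, sqrt(ln(K) / (K * t)))
+    eps = lambda t: min(1.0 / K, math.sqrt(math.log(K) / (K * t)))
+
+    rewards = [loss / 10.0 for loss in losses]      # 缩放损失,防止指数爆炸
+    cum_rewards = [r_hat + r / p                    # R_hat_i += reward_i / pi_i
+                   for r_hat, r, p in zip(cum_rewards, rewards, probs)]
+
+    logits = [eps(max(step - 1, 1)) * r_hat for r_hat in cum_rewards]
+    max_logit = max(logits)                         # 减去最大值以保证数值稳定
+    exps = [math.exp(x - max_logit) for x in logits]
+    eps_t = eps(step)
+    scale = (1.0 - K * eps_t) / sum(exps)
+    new_probs = [w * scale + eps_t for w in exps]   # 混入 eps_t 探索下限
+    return cum_rewards, new_probs
+```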
+
+### 权重日志
+
+训练过程中会自动生成 `odm_weights.jsonl` 文件,记录每次权重更新的详细信息:
+
+```json
+{"step": 2000, "timestamp": "2025-01-27 10:00:00", "domain_names": ["wiki", "c4"], "domain_weights": [0.3, 0.7], "cumulative_estimated_rewards": [25.3, 45.8], "exploration_rate": 0.0141, "alpha": 0.9, "warmup_steps": 2000, "is_warmup": false}
+{"step": 2500, "timestamp": "2025-01-27 10:10:00", "domain_names": ["wiki", "c4"], "domain_weights": [0.25, 0.75], "cumulative_estimated_rewards": [28.1, 52.3], "exploration_rate": 0.0126, "alpha": 0.9, "warmup_steps": 2000, "is_warmup": false}
+```
+
+## 权重提取和分析
+
+从训练输出目录中提取优化后的权重:
+
+```python
+import json
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+# 读取权重日志
+weights_history = []
+with open('odm_result/odm_weights.jsonl', 'r') as f:
+    for line in f:
+        weights_history.append(json.loads(line))
+
+domain_names = weights_history[0]['domain_names']
+
+# 获取最终权重(跳过预热条目)
+non_warmup_entries = [e for e in weights_history if not e.get('is_warmup', False)]
+assert non_warmup_entries, "尚未记录任何预热之后的权重更新"
+
+final_weights = non_warmup_entries[-1]['domain_weights']
+print("最终优化后的领域权重:")
+for name, weight in zip(domain_names, final_weights):
+    print(f"  {name}: {weight:.4f}")
+
+# 可视化权重变化趋势
+non_warmup_steps = [e['step'] for e in non_warmup_entries]
+weights_matrix = np.array([e['domain_weights'] for e in non_warmup_entries])
+
+plt.figure(figsize=(10, 6))
+for i, name in enumerate(domain_names):
+    plt.plot(non_warmup_steps, weights_matrix[:, i], label=name, marker='o')
+plt.xlabel('Training Step')
+plt.ylabel('Domain Weight')
+plt.title('ODM Domain Weight Evolution')
+plt.legend()
+plt.grid(True)
+plt.savefig('odm_weights_evolution.png')
+plt.show()
+
+# 分析探索率衰减
+exploration_rates = [e['exploration_rate'] for e in non_warmup_entries]
+plt.figure(figsize=(10, 4))
+plt.plot(non_warmup_steps, exploration_rates, label='Exploration Rate ε_t')
+plt.xlabel('Training Step')
+plt.ylabel('Exploration Rate')
+plt.title('ODM Exploration Rate Decay')
+plt.legend()
+plt.grid(True)
+plt.savefig('odm_exploration_rate.png')
+plt.show()
+```
+
+## 完整训练示例
+
+```bash
+llamafactory-cli train examples/train_full/mixers/odm_dynamic_qwen_pt_full.yaml
+```
+
+## 与 DoReMi 的对比
+
+| 方面 | ODM | DoReMi |
+|------|-----|--------|
+| **参考模型** | 不需要 | 需要(步骤 1) |
+| **训练次数** | 一次训练 | 三次训练(参考 → 代理 → 目标) |
+| **计算成本** | 较低(无参考模型) | 较高(参考 + 代理 + 目标) |
+| **适应方式** | 持续在线适应 | 在代理模型上离线优化 |
+| **损失信号** | 训练损失 | 超额损失(相对于参考模型) |
+| **算法** | Exp3(多臂老虎机) | 指数梯度上升 |
+
+## 常见问题
+
+### Q: ODM 与 DoReMi 有何不同?
+
+A: ODM 直接使用训练损失作为奖励信号,在训练过程中在线调整权重;而 DoReMi 需要参考模型,并在训练目标模型之前先在代理模型上优化权重。ODM 更简单易用,但 DoReMi 基于超额损失的目标可能提供更强的理论保证。
+
+### Q: 探索率是如何计算的?
+
+A: 探索率随时间衰减:`ε_t = min{1/K, sqrt(ln(K) / (K * t))}`,其中 K 是领域数量,t 是更新步数。这确保算法在早期阶段更多探索,在后期阶段更多利用。
+
+### Q: 如果某个领域获得非常低的权重怎么办?
+
+A: 探索率确保每个领域至少保持 `ε_t` 的采样概率,防止任何领域被完全忽略。随着探索率衰减,损失持续较低(因而奖励较低)的领域自然会获得较低权重,但仍会偶尔被采样。
+
+### Q: 如何在均匀分布和自定义初始比例之间选择?
+
+A: 若希望无偏探索,使用均匀分布(`initial_proportions: null`);若对领域重要性有先验知识,或想从特定分布开始,则使用自定义比例。无论从哪个起点出发,算法都会自行适应。
+
+## 参考资料
+
+- 论文: [Efficient Online Data Mixing for Language Model Pre-training](https://arxiv.org/abs/2312.02406)
+- 官方实现: [Online Data Mixing GitHub](https://github.com/alon-albalak/online-data-mixing)
+- 项目地址: [DataFlex GitHub](https://github.com/OpenDCAI/DataFlex)