ModelEngine-Group · starlight6336 · May 18, 2026 · May 18, 2026
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/classify/0095_0Fzrv1GXHPI.flac b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/classify/0095_0Fzrv1GXHPI.flac
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/classify/0096_SPsOscw70ns.flac b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/classify/0096_SPsOscw70ns.flac
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/classify/0097_Qtpn66PvyUA.flac b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/classify/0097_Qtpn66PvyUA.flac
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/classify/0098_NH_WrDj9kAI.flac b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/classify/0098_NH_WrDj9kAI.flac
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/classify/0099_W2uSJ0YfDyI.flac b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/classify/0099_W2uSJ0YfDyI.flac
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/classify/audioset_macro_map_v1.json b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/classify/audioset_macro_map_v1.json
@@ -0,0 +1,133 @@
+{
+  "HumanSpeech": [
+    "Speech",
+    "Male speech, man speaking",
+    "Female speech, woman speaking",
+    "Child speech, kid speaking",
+    "Conversation",
+    "Narration, monologue",
+    "Whispering",
+    "Shout",
+    "Yell",
+    "Screaming",
+    "Laughter",
+    "Crying, sobbing",
+    "Singing",
+    "Rapping",
+    "Humming",
+    "Breathing",
+    "Cough",
+    "Sneeze"
+  ],
+  "Music": [
+    "Music",
+    "Musical instrument",
+    "Vocal music",
+    "Song",
+    "Background music",
+    "Electronic music",
+    "Rock music",
+    "Classical music",
+    "Jazz",
+    "Hip hop music",
+    "Techno",
+    "House music",
+    "Dance music"
+  ],
+  "Animal": [
+    "Animal",
+    "Domestic animals, pets",
+    "Dog",
+    "Cat",
+    "Bird",
+    "Insect",
+    "Livestock, farm animals, working animals"
+  ],
+  "Vehicle": [
+    "Vehicle",
+    "Car",
+    "Truck",
+    "Bus",
+    "Train",
+    "Aircraft",
+    "Motorcycle",
+    "Traffic noise, roadway noise",
+    "Vehicle horn, car horn, honking"
+  ],
+  "EngineMachinery": [
+    "Engine",
+    "Idling",
+    "Accelerating, revving, vroom",
+    "Medium engine (mid frequency)",
+    "Heavy engine (low frequency)",
+    "Mechanical fan",
+    "Air conditioning",
+    "Vacuum cleaner",
+    "Tools",
+    "Power tool",
+    "Drill",
+    "Jackhammer"
+  ],
+  "AlarmSiren": [
+    "Siren",
+    "Buzzer",
+    "Alarm",
+    "Car alarm",
+    "Fire alarm",
+    "Smoke detector, smoke alarm",
+    "Telephone bell ringing",
+    "Ringtone"
+  ],
+  "ImpactClatter": [
+    "Clang",
+    "Clatter",
+    "Chink, clink",
+    "Ding",
+    "Bang",
+    "Smash, crash",
+    "Breaking",
+    "Door",
+    "Doorbell",
+    "Knock",
+    "Tap"
+  ],
+  "GunshotExplosion": [
+    "Explosion",
+    "Gunshot, gunfire",
+    "Machine gun",
+    "Fireworks",
+    "Firecracker"
+  ],
+  "Crowd": [
+    "Crowd",
+    "Chatter",
+    "Cheering",
+    "Applause",
+    "Hubbub, speech noise, speech babble",
+    "Cacophony"
+  ],
+  "WindWater": [
+    "Wind",
+    "Wind noise (microphone)",
+    "Thunderstorm",
+    "Thunder",
+    "Water",
+    "Rain",
+    "Waves, surf",
+    "Stream",
+    "Waterfall"
+  ],
+  "Silence": [
+    "Silence"
+  ],
+  "Noise": [
+    "Noise",
+    "Environmental noise",
+    "Static",
+    "Mains hum",
+    "White noise",
+    "Pink noise",
+    "Distortion"
+  ]
+}
+
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/emotion/03-01-01-01-01-01-04.wav b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/emotion/03-01-01-01-01-01-04.wav
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/emotion/03-01-01-01-01-02-02.wav b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/emotion/03-01-01-01-01-02-02.wav
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/emotion/03-01-02-01-02-01-15.wav b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/emotion/03-01-02-01-02-01-15.wav
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/emotion/03-01-02-02-01-02-15.wav b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/emotion/03-01-02-02-01-02-15.wav
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/emotion/03-01-03-01-02-01-02.wav b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/emotion/03-01-03-01-02-01-02.wav
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/emotion/03-01-03-02-01-02-17.wav b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/emotion/03-01-03-02-01-02-17.wav
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/emotion/03-01-04-01-02-01-10.wav b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/emotion/03-01-04-01-02-01-10.wav
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/summary/84-121123-0000.flac b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/summary/84-121123-0000.flac
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/summary/84-121123-0001.flac b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/summary/84-121123-0001.flac
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/summary/84-121123-0002.flac b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/summary/84-121123-0002.flac
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/summary/84-121123-0003.flac b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/summary/84-121123-0003.flac
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/summary/84-121123-0004.flac b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/summary/84-121123-0004.flac
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/summary/BAC009S0002W0122.wav b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/summary/BAC009S0002W0122.wav
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/summary/BAC009S0002W0123.wav b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/summary/BAC009S0002W0123.wav
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/summary/BAC009S0002W0124.wav b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/summary/BAC009S0002W0124.wav
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/summary/BAC009S0002W0125.wav b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/summary/BAC009S0002W0125.wav
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/audio/summary/BAC009S0002W0126.wav b/runtime/ops/mapper/__audioOps_Test_Cases__/audio/summary/BAC009S0002W0126.wav
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/humanSpeech/en/.txt b/runtime/ops/mapper/__audioOps_Test_Cases__/humanSpeech/en/.txt
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/humanSpeech/en/librispeech_0000.wav b/runtime/ops/mapper/__audioOps_Test_Cases__/humanSpeech/en/librispeech_0000.wav
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/humanSpeech/en/librispeech_0001.wav b/runtime/ops/mapper/__audioOps_Test_Cases__/humanSpeech/en/librispeech_0001.wav
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/humanSpeech/en/librispeech_0002.wav b/runtime/ops/mapper/__audioOps_Test_Cases__/humanSpeech/en/librispeech_0002.wav
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/humanSpeech/zh/.txt b/runtime/ops/mapper/__audioOps_Test_Cases__/humanSpeech/zh/.txt
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/humanSpeech/zh/aishell_0000.wav b/runtime/ops/mapper/__audioOps_Test_Cases__/humanSpeech/zh/aishell_0000.wav
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/humanSpeech/zh/aishell_0001.wav b/runtime/ops/mapper/__audioOps_Test_Cases__/humanSpeech/zh/aishell_0001.wav
diff --git a/runtime/ops/mapper/__audioOps_Test_Cases__/humanSpeech/zh/aishell_0002.wav b/runtime/ops/mapper/__audioOps_Test_Cases__/humanSpeech/zh/aishell_0002.wav
diff --git a/runtime/ops/mapper/__init__.py b/runtime/ops/mapper/__init__.py
@@ -47,7 +47,30 @@ def _import_operators():
     from . import remove_duplicate_sentences
     from . import knowledge_relation_slice
     from . import pii_ner_detection
-        # ===== Video operators (PR1-PR5) =====
+
+    # ===== Audio operators =====
+    from . import audio_anomaly_filter
+    from . import audio_asr_pipeline
+    from . import audio_asr_transcribe
+    from . import audio_dc_offset_removal
+    from . import audio_emotion_recognize
+    from . import audio_fast_lang_id
+    from . import audio_fast_lang_id_text
+    from . import audio_format_convert
+    from . import audio_gtcrn_denoise
+    from . import audio_hum_notch
+    from . import audio_noise_gate
+    from . import audio_pre_emphasis
+    from . import audio_quantize_encode
+    from . import audio_rms_loudness_normalize
+    from . import audio_simple_agc
+    from . import audio_soft_peak_limiter
+    from . import audio_sound_classify
+    from . import audio_telephony_bandpass
+    from . import audio_text_summarize
+    from . import audio_trim_silence_edges
+
+    # ===== Video operators (PR1-PR5) =====
     from . import _video_common
     from . import video_format_convert
     from . import video_sensitive_detect

diff --git a/runtime/ops/mapper/audio_anomaly_filter/README.md b/runtime/ops/mapper/audio_anomaly_filter/README.md
@@ -0,0 +1,41 @@
+# AudioAnomalyFilter 异常语音检测与过滤算子
+
+## 概述
+
+AudioAnomalyFilter 用于对音频做快速质量检测，计算时长、静音帧比例与音频可读性，并给出 `quality_flag`。算子不再通过清空 `text/data` 模拟删除文件，而是写入结构化质量标签；下游音频算子可根据标签软跳过异常样本。
+
+## 功能特性
+
+- **时长检测**：支持最小时长/最大时长阈值
+- **静音比例检测**：基于短时 RMS 统计静音帧占比
+- **可读性检测**：文本文件强行改成 `.wav` 等不可读取音频会被标记为 `invalid`
+- **下游门控**：支持让后续音频算子跳过异常样本，符合 DataMate 一文件一输出链路
+- **结果结构化输出**：报告写入 `ext_params.audio_quality`
+
+## 参数说明
+
+| 参数 | 类型 | 默认值 | 说明 |
+|---|---|---:|---|
+| minDur | inputNumber | 1.0 | 最小时长（秒），小于该值视为异常 |
+| maxDur | inputNumber | 20000.0 | 最大时长（秒），大于该值视为异常 |
+| silenceRatioTh | slider | 0.8 | 静音帧比例阈值（0~1），>= 阈值视为异常 |
+| silenceRmsRatioTh | slider | 0.05 | 静音判定阈值 = global_rms * 该比例 |
+| skipInvalidDownstream | switch | true | true=后续音频算子遇到 invalid 软跳过；false=仅打标并继续处理 |
+
+## 输入输出
+
+- **输入**：`sample["filePath"]`（音频文件路径）
+- **输出**：
+  - `sample["ext_params"]["audio_quality"]`：
+    - `quality_flag`: `ok/invalid`
+    - `duration/silence_ratio/global_rms/reason/read_error/skip_downstream`
+  - 如果该算子为链路最后一个算子：导出当前音频，质量报告写入 `ext_params.audio_quality`
+  - 如果该算子位于链路中间：保持当前音频，后续音频算子按 `skip_downstream` 决定是否软跳过
+
+## 依赖说明
+
+- **Python 依赖**：优先 `torchaudio`，兜底 `soundfile`
+
+## 版本历史
+
+- **v1.0.0**：支持时长/静音比例/可读性检测，按 DataMate 链路语义写质量标签并门控下游
diff --git a/runtime/ops/mapper/audio_anomaly_filter/__init__.py b/runtime/ops/mapper/audio_anomaly_filter/__init__.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+from datamate.core.base_op import OPERATORS
+
+# Register this operator with the shared OPERATORS registry under the name
+# 'AudioAnomalyFilter'; the implementation module is given as a dotted path string.
+OPERATORS.register_module(
+    module_name='AudioAnomalyFilter',
+    module_path="ops.mapper.audio_anomaly_filter.process",
+)
diff --git a/runtime/ops/mapper/audio_anomaly_filter/audio_skip.py b/runtime/ops/mapper/audio_anomaly_filter/audio_skip.py
@@ -0,0 +1,114 @@
+# -- encoding: utf-8 --
+
+from pathlib import Path
+from typing import Any, Dict
+
+from loguru import logger
+
+
+# Lower-case file extensions (without the leading dot) that is_audio_sample()
+# accepts as audio when a sample carries no raw bytes.
+AUDIO_EXTS = {
+    "aac", "aif", "aiff", "amr", "au",
+    "flac", "m4a", "mp3", "oga", "ogg",
+    "opus", "snd", "wav", "webm", "wma",
+}
+
+
+def _parts(path_value: str) -> set[str]:
+    """Return the lower-cased components of ``path_value`` as a set.
+
+    Any exception raised while building the ``Path`` or reading its parts is
+    swallowed, in which case an empty set is returned.
+    """
+    try:
+        components = Path(path_value).parts
+        return set(map(str.lower, components))
+    except Exception:
+        return set()
+
+
+def is_reference_sample(sample: Dict[str, Any], filepath_key: str = "filePath") -> bool:
+    """Tell whether the sample's file path has a component named ``references``.
+
+    The comparison is case-insensitive because ``_parts`` lower-cases every
+    path component. A missing or falsy path value is treated as an empty path.
+    """
+    raw_path = sample.get(filepath_key)
+    components = _parts(str(raw_path) if raw_path else "")
+    return "references" in components
+
+
+def _ext_from_sample(
+    sample: Dict[str, Any],
+    filepath_key: str = "filePath",
+    filetype_key: str = "fileType",
+    target_type_key: str = "target_type",
+) -> str:
+    """Resolve the sample's file extension, lower-cased and without leading dots.
+
+    Lookup order: ``sample[target_type_key]``, then ``sample[filetype_key]``,
+    then the suffix of ``sample[filepath_key]``. A field that is missing, falsy,
+    or empty after trimming whitespace and leading dots is ignored.
+
+    Returns:
+        The extension string, or ``""`` when none of the sources yields one.
+    """
+    # Generator keeps the lookups lazy: fileType is only read if target_type is empty.
+    declared = (
+        str(sample.get(field) or "").strip().lower().lstrip(".")
+        for field in (target_type_key, filetype_key)
+    )
+    explicit = next((candidate for candidate in declared if candidate), "")
+    if explicit:
+        return explicit
+
+    location = str(sample.get(filepath_key) or "").strip()
+    if not location:
+        return ""
+    return Path(location).suffix.lower().lstrip(".")
+
+
+def is_audio_sample(
+    sample: Dict[str, Any],
+    filepath_key: str = "filePath",
+    filetype_key: str = "fileType",
+    target_type_key: str = "target_type",
+    data_key: str = "data",
+) -> bool:
+    """Decide whether ``sample`` should be handled as audio.
+
+    Rules, applied in order:
+      1. A reference sample (see ``is_reference_sample``) is never audio.
+      2. A sample whose ``data`` field holds non-empty ``bytes``/``bytearray``
+         is audio, regardless of its declared extension.
+      3. Otherwise the extension resolved by ``_ext_from_sample`` must be one
+         of ``AUDIO_EXTS``.
+    """
+    if is_reference_sample(sample, filepath_key):
+        return False
+
+    payload = sample.get(data_key)
+    carries_bytes = isinstance(payload, (bytes, bytearray)) and bool(payload)
+    if carries_bytes:
+        return True
+
+    extension = _ext_from_sample(sample, filepath_key, filetype_key, target_type_key)
+    return extension in AUDIO_EXTS
+
+
+def invalid_quality_reason(sample: Dict[str, Any], ext_params_key: str = "ext_params") -> str:
+    """Return why ``sample`` should be skipped as invalid audio, or ``""``.
+
+    Two sources are consulted, in order:
+
+    1. File names: if the lower-cased stem of ``fileName``, ``sourceFileName``
+       or ``filePath`` contains ``__quality_invalid``, the text after the marker
+       (with surrounding underscores stripped) is used as the reason, falling
+       back to ``invalid_audio`` when nothing follows the marker.
+    2. ``sample[ext_params_key]["audio_quality"]``: a reason is produced only
+       when ``quality_flag`` equals ``invalid`` (case-insensitive) and
+       ``skip_downstream`` is truthy. A string ``skip_downstream`` counts as
+       true only for ``1/true/yes/y/on``; a missing key defaults to true.
+
+    Returns:
+        ``"invalid_audio_quality:<reason>"`` or an empty string.
+    """
+    marker = "__quality_invalid"
+    for name_key in ("fileName", "sourceFileName", "filePath"):
+        stem = Path(str(sample.get(name_key) or "")).stem.lower()
+        _, found, tail = stem.partition(marker)
+        if found:
+            reason = tail.strip("_") or "invalid_audio"
+            return f"invalid_audio_quality:{reason}"
+
+    ext = sample.get(ext_params_key, {})
+    quality = ext.get("audio_quality", {}) if isinstance(ext, dict) else None
+    if not isinstance(quality, dict):
+        return ""
+
+    flag = str(quality.get("quality_flag") or "").strip().lower()
+    if flag != "invalid":
+        return ""
+
+    skip = quality.get("skip_downstream", True)
+    if isinstance(skip, str):
+        skip = skip.strip().lower() in {"1", "true", "yes", "y", "on"}
+    if skip:
+        reason = str(quality.get("reason") or "invalid_audio").strip()
+        return f"invalid_audio_quality:{reason}"
+    return ""
+
+
+def mark_skipped_sample(
+    sample: Dict[str, Any],
+    reason: str,
+    op_name: str,
+    text_key: str = "text",
+    data_key: str = "data",
+    filetype_key: str = "fileType",
+    target_type_key: str = "target_type",
+    ext_params_key: str = "ext_params",
+) -> Dict[str, Any]:
+    """Record that ``op_name`` skipped ``sample`` and blank its payload fields.
+
+    The reason is stored at ``sample[ext_params_key]["audio_skip"][op_name]``.
+    A non-dict ``ext_params`` value is preserved under ``"_raw"`` in a new dict.
+    A non-dict ``audio_skip`` value is handled the same way (``None`` is simply
+    replaced by an empty dict) instead of raising ``TypeError`` on assignment.
+
+    Args:
+        sample: Sample dict; it is mutated in place.
+        reason: Why the sample was skipped.
+        op_name: Name of the operator that skipped it; used as the key in
+            the ``audio_skip`` mapping and in the log line.
+        text_key: Key whose value is reset to ``""``.
+        data_key: Key whose value is reset to ``b""``.
+        filetype_key: Key whose value is reset to ``""``.
+        target_type_key: Key whose value is reset to ``""``.
+        ext_params_key: Key of the extension-parameter dict.
+
+    Returns:
+        The same ``sample`` object that was passed in.
+    """
+    ext = sample.get(ext_params_key, {})
+    if not isinstance(ext, dict):
+        ext = {"_raw": ext}
+
+    skips = ext.get("audio_skip")
+    if not isinstance(skips, dict):
+        # A stale or malformed value must not abort the skip bookkeeping.
+        skips = {} if skips is None else {"_raw": skips}
+        ext["audio_skip"] = skips
+    skips[op_name] = reason
+
+    sample[ext_params_key] = ext
+    sample[text_key] = ""
+    sample[data_key] = b""
+    sample[filetype_key] = ""
+    sample[target_type_key] = ""
+    logger.info(f"fileName: {sample.get('fileName')}, method: {op_name} skipped: {reason}")
+    return sample
diff --git a/runtime/ops/mapper/audio_anomaly_filter/metadata.yml b/runtime/ops/mapper/audio_anomaly_filter/metadata.yml
@@ -0,0 +1,66 @@
+name: 'audioOps-异常语音检测与过滤'
+name_en: 'audioOps-Audio Anomaly Detect & Filter'
+description: '对音频做快速异常检测：时长范围、静音帧比例与可读性。结果写入 ext_params.audio_quality；可控制下游音频算子是否跳过异常样本。'
+description_en: 'Fast audio anomaly detection (duration, silence ratio and readability). Writes ext_params.audio_quality and can make downstream audio ops skip invalid samples.'
+language: 'python'
+vendor: 'huawei'
+raw_id: 'AudioAnomalyFilter'
+version: '1.0.0'
+types:
+  - 'cleaning'
+modal: 'audio'
+inputs: 'audio'
+outputs: 'audio'
+settings:
+  minDur:
+    name: '最小时长(秒)'
+    type: 'inputNumber'
+    description: '小于该值视为异常。'
+    defaultVal: 1.0
+    min: 0
+    max: 36000
+    step: 0.1
+  maxDur:
+    name: '最大时长(秒)'
+    type: 'inputNumber'
+    description: '大于该值视为异常。'
+    defaultVal: 20000.0
+    min: 0
+    max: 360000
+    step: 1
+  silenceRatioTh:
+    name: '静音帧比例阈值'
+    type: 'slider'
+    description: '静音帧比例 >= 阈值 时视为异常。'
+    defaultVal: 0.8
+    min: 0
+    max: 1
+    step: 0.01
+  silenceRmsRatioTh:
+    name: '静音判定比例'
+    type: 'slider'
+    description: '静音判定阈值 = global_rms * 该比例。'
+    defaultVal: 0.05
+    min: 0
+    max: 1
+    step: 0.01
+  skipInvalidDownstream:
+    name: '下游跳过异常音频'
+    description: '开启后，后续音频算子遇到 quality_flag=invalid 会软跳过；关闭后仅打标并继续处理。不可读取的伪 wav 会被标为 invalid。'
+    type: 'switch'
+    defaultVal: 'true'
+    required: false
+    checkedLabel: '跳过'
+    unCheckedLabel: '继续'
+runtime:
+  memory: 104857600
+  cpu: 0.2
+  gpu: 0
+  npu: 0
+  storage: 10MB
+
+metrics:
+  - name: '处理耗时'
+    metric: '依输入音频长度与运行环境而定'
+release:
+  - '首次发布'