Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
{
"HumanSpeech": [
"Speech",
"Male speech, man speaking",
"Female speech, woman speaking",
"Child speech, kid speaking",
"Conversation",
"Narration, monologue",
"Whispering",
"Shout",
"Yell",
"Screaming",
"Laughter",
"Crying, sobbing",
"Singing",
"Rapping",
"Humming",
"Breathing",
"Cough",
"Sneeze"
],
"Music": [
"Music",
"Musical instrument",
"Vocal music",
"Song",
"Background music",
"Electronic music",
"Rock music",
"Classical music",
"Jazz",
"Hip hop music",
"Techno",
"House music",
"Dance music"
],
"Animal": [
"Animal",
"Domestic animals, pets",
"Dog",
"Cat",
"Bird",
"Insect",
"Livestock, farm animals, working animals"
],
"Vehicle": [
"Vehicle",
"Car",
"Truck",
"Bus",
"Train",
"Aircraft",
"Motorcycle",
"Traffic noise, roadway noise",
"Vehicle horn, car horn, honking"
],
"EngineMachinery": [
"Engine",
"Idling",
"Accelerating, revving, vroom",
"Medium engine (mid frequency)",
"Heavy engine (low frequency)",
"Mechanical fan",
"Air conditioning",
"Vacuum cleaner",
"Tools",
"Power tool",
"Drill",
"Jackhammer"
],
"AlarmSiren": [
"Siren",
"Buzzer",
"Alarm",
"Car alarm",
"Fire alarm",
"Smoke detector, smoke alarm",
"Telephone bell ringing",
"Ringtone"
],
"ImpactClatter": [
"Clang",
"Clatter",
"Chink, clink",
"Ding",
"Bang",
"Smash, crash",
"Breaking",
"Door",
"Doorbell",
"Knock",
"Tap"
],
"GunshotExplosion": [
"Explosion",
"Gunshot, gunfire",
"Machine gun",
"Fireworks",
"Firecracker"
],
"Crowd": [
"Crowd",
"Chatter",
"Cheering",
"Applause",
"Hubbub, speech noise, speech babble",
"Cacophony"
],
"WindWater": [
"Wind",
"Wind noise (microphone)",
"Thunderstorm",
"Thunder",
"Water",
"Rain",
"Waves, surf",
"Stream",
"Waterfall"
],
"Silence": [
"Silence"
],
"Noise": [
"Noise",
"Environmental noise",
"Static",
"Mains hum",
"White noise",
"Pink noise",
"Distortion"
]
}

Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
100 changes: 100 additions & 0 deletions runtime/ops/mapper/__audioOps_Test_Cases__/humanSpeech/en/.txt

Large diffs are not rendered by default.

Binary file not shown.
Binary file not shown.
Binary file not shown.
100 changes: 100 additions & 0 deletions runtime/ops/mapper/__audioOps_Test_Cases__/humanSpeech/zh/.txt

Large diffs are not rendered by default.

Binary file not shown.
Binary file not shown.
Binary file not shown.
25 changes: 24 additions & 1 deletion runtime/ops/mapper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,30 @@ def _import_operators():
from . import remove_duplicate_sentences
from . import knowledge_relation_slice
from . import pii_ner_detection
# ===== Video operators (PR1-PR5) =====

# ===== Audio operators =====
from . import audio_anomaly_filter
from . import audio_asr_pipeline
from . import audio_asr_transcribe
from . import audio_dc_offset_removal
from . import audio_emotion_recognize
from . import audio_fast_lang_id
from . import audio_fast_lang_id_text
from . import audio_format_convert
from . import audio_gtcrn_denoise
from . import audio_hum_notch
from . import audio_noise_gate
from . import audio_pre_emphasis
from . import audio_quantize_encode
from . import audio_rms_loudness_normalize
from . import audio_simple_agc
from . import audio_soft_peak_limiter
from . import audio_sound_classify
from . import audio_telephony_bandpass
from . import audio_text_summarize
from . import audio_trim_silence_edges

# ===== Video operators (PR1-PR5) =====
from . import _video_common
from . import video_format_convert
from . import video_sensitive_detect
Expand Down
41 changes: 41 additions & 0 deletions runtime/ops/mapper/audio_anomaly_filter/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# AudioAnomalyFilter 异常语音检测与过滤算子

## 概述

AudioAnomalyFilter 用于对音频做快速质量检测,计算时长、静音帧比例与音频可读性,并给出 `quality_flag`。算子不再通过清空 `text/data` 模拟删除文件,而是写入结构化质量标签;下游音频算子可根据标签软跳过异常样本。

## 功能特性

- **时长检测**:支持最小时长/最大时长阈值
- **静音比例检测**:基于短时 RMS 统计静音帧占比
- **可读性检测**:无法被解码读取的音频(例如被强行改名为 `.wav` 的文本文件)会被标记为 `invalid`
- **下游门控**:支持让后续音频算子跳过异常样本,符合 DataMate 一文件一输出链路
- **结果结构化输出**:报告写入 `ext_params.audio_quality`

## 参数说明

| 参数 | 类型 | 默认值 | 说明 |
|---|---|---:|---|
| minDur | inputNumber | 1.0 | 最小时长(秒),小于该值视为异常 |
| maxDur | inputNumber | 20000.0 | 最大时长(秒),大于该值视为异常 |
| silenceRatioTh | slider | 0.8 | 静音帧比例阈值(0~1),>= 阈值视为异常 |
| silenceRmsRatioTh | slider | 0.05 | 静音判定阈值 = global_rms * 该比例 |
| skipInvalidDownstream | switch | true | true=后续音频算子遇到 invalid 软跳过;false=仅打标并继续处理 |

## 输入输出

- **输入**:`sample["filePath"]`(音频文件路径)
- **输出**:
- `sample["ext_params"]["audio_quality"]`:
- `quality_flag`: `ok/invalid`
- `duration/silence_ratio/global_rms/reason/read_error/skip_downstream`
- 如果该算子为链路最后一个算子:导出当前音频,质量报告写入 `ext_params.audio_quality`
- 如果该算子位于链路中间:保持当前音频,后续音频算子按 `skip_downstream` 决定是否软跳过

## 依赖说明

- **Python 依赖**:优先 `torchaudio`,兜底 `soundfile`

## 版本历史

- **v1.0.0**:支持时长/静音比例/可读性检测,按 DataMate 链路语义写质量标签并门控下游
6 changes: 6 additions & 0 deletions runtime/ops/mapper/audio_anomaly_filter/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

# Register the AudioAnomalyFilter operator, pointing the registry at its
# ``process`` module.
OPERATORS.register_module(
    module_name="AudioAnomalyFilter",
    module_path="ops.mapper.audio_anomaly_filter.process",
)
114 changes: 114 additions & 0 deletions runtime/ops/mapper/audio_anomaly_filter/audio_skip.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-

from pathlib import Path
from typing import Any, Dict

from loguru import logger


# Lower-case file extensions, without the leading dot, that count as audio
# when a sample's type is decided from its extension.
AUDIO_EXTS = {
    "aac", "aif", "aiff", "amr", "au",
    "flac", "m4a", "mp3", "oga", "ogg",
    "opus", "snd", "wav", "webm", "wma",
}


def _parts(path_value: str) -> set[str]:
try:
return {part.lower() for part in Path(path_value).parts}
except Exception:
return set()


def is_reference_sample(sample: Dict[str, Any], filepath_key: str = "filePath") -> bool:
    """Tell whether the sample's path has a component named ``references``.

    The comparison is case-insensitive; a missing or empty path is never a
    reference sample.
    """
    raw_path = sample.get(filepath_key)
    path_text = str(raw_path or "")
    components = _parts(path_text)
    return "references" in components


def _ext_from_sample(
sample: Dict[str, Any],
filepath_key: str = "filePath",
filetype_key: str = "fileType",
target_type_key: str = "target_type",
) -> str:
for key in (target_type_key, filetype_key):
value = str(sample.get(key) or "").strip().lower().lstrip(".")
if value:
return value
path_value = str(sample.get(filepath_key) or "").strip()
return Path(path_value).suffix.lower().lstrip(".") if path_value else ""


def is_audio_sample(
    sample: Dict[str, Any],
    filepath_key: str = "filePath",
    filetype_key: str = "fileType",
    target_type_key: str = "target_type",
    data_key: str = "data",
) -> bool:
    """Decide whether ``sample`` should be handled as audio.

    Reference samples are never audio. Otherwise a non-empty ``bytes`` or
    ``bytearray`` payload under ``data_key`` is enough; failing that, the
    sample's extension must be one of ``AUDIO_EXTS``.
    """
    if is_reference_sample(sample, filepath_key):
        return False

    payload = sample.get(data_key)
    has_bytes = isinstance(payload, (bytes, bytearray)) and len(payload) > 0
    if has_bytes:
        return True

    extension = _ext_from_sample(sample, filepath_key, filetype_key, target_type_key)
    return extension in AUDIO_EXTS


def invalid_quality_reason(sample: Dict[str, Any], ext_params_key: str = "ext_params") -> str:
    """Return a skip reason if the sample is flagged as invalid audio, else ``""``.

    Two sources are checked in order:

    1. The stem of ``fileName``, ``sourceFileName`` and ``filePath``: a
       ``__quality_invalid`` marker (case-insensitive) flags the sample, and
       whatever follows the marker becomes the reason.
    2. ``sample[ext_params_key]["audio_quality"]``: the sample is flagged when
       ``quality_flag`` is ``invalid`` and ``skip_downstream`` is truthy
       (default ``True``; strings such as ``"true"``/``"yes"`` are accepted).

    The result always has the form ``invalid_audio_quality:<reason>``.
    """
    marker = "__quality_invalid"
    for name_key in ("fileName", "sourceFileName", "filePath"):
        stem = Path(str(sample.get(name_key) or "")).stem.lower()
        _, found, tail = stem.partition(marker)
        if found:
            return f"invalid_audio_quality:{tail.strip('_') or 'invalid_audio'}"

    ext = sample.get(ext_params_key, {})
    quality = ext.get("audio_quality", {}) if isinstance(ext, dict) else None
    if not isinstance(quality, dict):
        return ""

    flag = str(quality.get("quality_flag") or "").strip().lower()
    if flag != "invalid":
        return ""

    skip = quality.get("skip_downstream", True)
    if isinstance(skip, str):
        # Serialized switches arrive as text; only these spellings mean "on".
        skip = skip.strip().lower() in {"1", "true", "yes", "y", "on"}
    if not skip:
        return ""

    detail = str(quality.get("reason") or "invalid_audio").strip()
    return f"invalid_audio_quality:{detail}"


def mark_skipped_sample(
    sample: Dict[str, Any],
    reason: str,
    op_name: str,
    text_key: str = "text",
    data_key: str = "data",
    filetype_key: str = "fileType",
    target_type_key: str = "target_type",
    ext_params_key: str = "ext_params",
) -> Dict[str, Any]:
    """Blank out a sample's payload and record why ``op_name`` skipped it.

    The reason is stored under ``sample[ext_params_key]["audio_skip"][op_name]``.
    The text, data, file-type and target-type fields are then reset to empty
    values, an info line is logged, and the same (mutated) ``sample`` dict is
    returned.

    Args:
        sample: Sample dict to update in place.
        reason: Why the operator skipped this sample.
        op_name: Name of the skipping operator; used as the key in
            ``audio_skip`` and in the log line.
        text_key: Key whose value is reset to ``""``.
        data_key: Key whose value is reset to ``b""``.
        filetype_key: Key whose value is reset to ``""``.
        target_type_key: Key whose value is reset to ``""``.
        ext_params_key: Key holding the extension-parameters dict.

    Returns:
        The ``sample`` dict that was passed in.
    """
    ext = sample.get(ext_params_key, {})
    if not isinstance(ext, dict):
        # Keep the unexpected value around rather than dropping it silently.
        ext = {"_raw": ext}

    skips = ext.get("audio_skip")
    if not isinstance(skips, dict):
        # An existing non-dict value (e.g. None) cannot take an item
        # assignment; start a fresh mapping and keep any real value under
        # "_raw", mirroring how a non-dict ext_params is handled above.
        skips = {} if skips is None else {"_raw": skips}
        ext["audio_skip"] = skips
    skips[op_name] = reason

    sample[ext_params_key] = ext
    sample[text_key] = ""
    sample[data_key] = b""
    sample[filetype_key] = ""
    sample[target_type_key] = ""
    logger.info(f"fileName: {sample.get('fileName')}, method: {op_name} skipped: {reason}")
    return sample
66 changes: 66 additions & 0 deletions runtime/ops/mapper/audio_anomaly_filter/metadata.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
name: 'audioOps-异常语音检测与过滤'
name_en: 'audioOps-Audio Anomaly Detect & Filter'
description: '对音频做快速异常检测:时长范围、静音帧比例与可读性。结果写入 ext_params.audio_quality;可控制下游音频算子是否跳过异常样本。'
description_en: 'Fast audio anomaly detection (duration, silence ratio and readability). Writes ext_params.audio_quality and can make downstream audio ops skip invalid samples.'
language: 'python'
vendor: 'huawei'
raw_id: 'AudioAnomalyFilter'
version: '1.0.0'
types:
- 'cleaning'
modal: 'audio'
inputs: 'audio'
outputs: 'audio'
settings:
minDur:
name: '最小时长(秒)'
type: 'inputNumber'
description: '小于该值视为异常。'
defaultVal: 1.0
min: 0
max: 36000
step: 0.1
maxDur:
name: '最大时长(秒)'
type: 'inputNumber'
description: '大于该值视为异常。'
defaultVal: 20000.0
min: 0
max: 360000
step: 1
silenceRatioTh:
name: '静音帧比例阈值'
type: 'slider'
description: '静音帧比例 >= 阈值 时视为异常。'
defaultVal: 0.8
min: 0
max: 1
step: 0.01
silenceRmsRatioTh:
name: '静音判定比例'
type: 'slider'
description: '静音判定阈值 = global_rms * 该比例。'
defaultVal: 0.05
min: 0
max: 1
step: 0.01
skipInvalidDownstream:
name: '下游跳过异常音频'
description: '开启后,后续音频算子遇到 quality_flag=invalid 会软跳过;关闭后仅打标并继续处理。不可读取的伪 wav 会被标为 invalid。'
type: 'switch'
defaultVal: 'true'
required: false
checkedLabel: '跳过'
unCheckedLabel: '继续'
runtime:
memory: 104857600
cpu: 0.2
gpu: 0
npu: 0
storage: 10MB

metrics:
- name: '处理耗时'
metric: '依输入音频长度与运行环境而定'
release:
- '首次发布'
Loading