diff --git a/CLAUDE.md b/CLAUDE.md index 367ab34..bc4f9f2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -31,8 +31,8 @@ noisekit/ ## CLI ```bash -noisekit generate --dataset --samples N --presets P1 P2 --output ./out --seed 42 -noisekit generate ... --presets noise --noise-dir /path/to/noise_wavs +noisekit generate --dataset --samples N --preset P1 --preset P2 --output ./out --seed 42 +noisekit generate ... --preset noise --noise-dir /path/to/noise_wavs noisekit generate ... --no-nisqa # skip NISQA (no model download, faster) noisekit score ./audio_dir [--reference-dir ./ref] [--output scores.json] noisekit score ./audio_dir --no-nisqa # skip NISQA for standalone scoring @@ -179,44 +179,44 @@ uv run noisekit list-presets --verbose uv run noisekit generate \ --dataset google/fleurs \ --config en_us --split test \ - --samples 3 --presets clean_reference telecom low_bitrate \ + --samples 3 --preset clean_reference --preset telecom --preset low_bitrate \ --output ./test_out --seed 42 cat test_out/metadata.jsonl # New atomic presets — no external dependencies uv run noisekit generate \ --dataset google/fleurs --config en_us --split test \ - --samples 3 --presets clipping \ + --samples 3 --preset clipping \ --no-nisqa --output ./test_atomic --seed 42 # noise — auto-downloads MUSAN noise-only clips on first run uv run noisekit generate \ --dataset google/fleurs --config en_us --split test \ - --samples 3 --presets noise \ + --samples 3 --preset noise \ --output ./test_noise --seed 42 # Compound presets (auto-downloads MUSAN noise on first run) uv run noisekit generate \ --dataset google/fleurs --config en_us --split test \ - --samples 3 --presets noise_telecom \ + --samples 3 --preset noise_telecom \ --no-nisqa --output ./test_compound --seed 42 # clipping_telecom — no noise dir needed uv run noisekit generate \ --dataset google/fleurs --config en_us --split test \ - --samples 3 --presets clipping_telecom \ + --samples 3 --preset clipping_telecom \ --no-nisqa --output ./test_clipping_telecom --seed 42 # Far-field reverb uv run noisekit generate \ --dataset google/fleurs --config en_us --split test \ - --samples 3 --presets reverb noise_reverb \ + --samples 3 --preset reverb --preset noise_reverb \ --no-nisqa --output ./test_reverb --seed 42 # noise with your own noise corpus (skips auto-download) uv run noisekit generate \ --dataset google/fleurs --config en_us --split test \ - --samples 3 --presets noise \ + --samples 3 --preset noise \ --noise-dir ~/datasets/musan/noise \ --output ./test_noise --seed 42 ``` diff --git a/README.md b/README.md index 475038c..a0eb8ac 100644 --- a/README.md +++ b/README.md @@ -56,17 +56,20 @@ uvx noisekit generate \ --config en_us \ --split test \ --samples 300 \ - --presets telecom low_bitrate \ + --preset telecom \ + --preset low_bitrate \ --output ./benchmark_dataset \ --seed 42 ``` +`--preset` is repeatable: pass it once per preset. + For `noise`, you can supply your own background-noise WAVs with `--noise-dir` (e.g. [MUSAN](https://www.openslr.org/17/), [DEMAND](https://zenodo.org/record/1227121), or [FSD50K](https://zenodo.org/record/4060432)): ```bash uvx noisekit generate \ --dataset google/fleurs --config en_us --split test \ - --samples 300 --presets noise \ + --samples 300 --preset noise \ --noise-dir ~/datasets/musan/noise \ --output ./benchmark_dataset --seed 42 ``` diff --git a/noisekit/cli.py b/noisekit/cli.py index 3ed524d..d553514 100644 --- a/noisekit/cli.py +++ b/noisekit/cli.py @@ -20,7 +20,10 @@ def generate( dataset: Annotated[str, typer.Option(help="HuggingFace dataset name (e.g. google/fleurs)")], samples: Annotated[int, typer.Option(help="Number of source samples to process")] = 100, - presets: Annotated[list[str] | None, typer.Option(help="Preset name(s) to apply. Repeatable.")] = None, + preset: Annotated[ + list[str] | None, + typer.Option(help="Preset to apply. Repeatable: --preset telecom --preset clean_reference."), + ] = None, output: Annotated[ Path | None, typer.Option(help="Output directory. Omit to auto-create ./output//") ] = None, @@ -58,7 +61,7 @@ def generate( run_generate( dataset=dataset, samples=samples, - presets=list(presets) if presets else [], + presets=list(preset) if preset else [], output=resolved_output, seed=seed, split=split,