Skip to content

Commit c616299

Browse files
committed
Add shuffle_after_epoch_seed argument to file-based readers.
* Adds an optional `shuffle_after_epoch_seed` (int32) argument to all readers that support `shuffle_after_epoch`: readers.file, readers.numpy, readers.fits, readers.coco, and readers.nemo_asr.
* Previously the per-epoch shuffle was always seeded with a fixed constant, making the dataset order identical across training runs. The new argument lets users supply a custom base seed so that different training runs produce statistically independent orderings, while still guaranteeing a consistent global permutation across shards.
* When omitted, the old fixed seed is used, preserving backward compatibility.
* Tests added in test_shuffling.py and test_numpy.py.

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
1 parent 87643a0 commit c616299

11 files changed

Lines changed: 295 additions & 3 deletions

dali/operators/reader/coco_reader_op.cc

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -93,6 +93,22 @@ This readers produces the following outputs::
9393
.AddOptionalArg("shuffle_after_epoch",
9494
R"code(If set to True, the reader shuffles the entire dataset after each epoch.)code",
9595
false)
96+
.AddOptionalArg<int32_t>("shuffle_after_epoch_seed",
97+
R"code(Random seed for the dataset shuffling performed after each epoch.
98+
99+
If not provided, a fixed default seed is used, which results in the same shuffling
100+
pattern across different training runs. Providing a custom seed allows for different
101+
shuffle patterns across training runs, which may be desirable for better statistical
102+
properties.
103+
104+
.. note::
105+
When using multiple DALI pipelines (e.g., for multi-GPU training), all pipeline
106+
instances should use the same ``shuffle_after_epoch_seed`` to ensure a consistent
107+
global shuffle across all shards.
108+
109+
.. note::
110+
This argument has no effect unless ``shuffle_after_epoch`` is set to ``True``.)code",
111+
nullptr, false)
96112
.AddOptionalArg<string>("file_root",
97113
R"code(Path to a directory that contains the data files.
98114

dali/operators/reader/file_reader_op.cc

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -98,6 +98,22 @@ This argument is mutually exclusive with `files`.)", nullptr)
9898
9999
`stick_to_shard` and `random_shuffle` cannot be used when this argument is set to True.)",
100100
false)
101+
.AddOptionalArg<int32_t>("shuffle_after_epoch_seed",
102+
R"(Random seed for the dataset shuffling performed after each epoch.
103+
104+
If not provided, a fixed default seed is used, which results in the same shuffling
105+
pattern across different training runs. Providing a custom seed allows for different
106+
shuffle patterns across training runs, which may be desirable for better statistical
107+
properties.
108+
109+
.. note::
110+
When using multiple DALI pipelines (e.g., for multi-GPU training), all pipeline
111+
instances should use the same ``shuffle_after_epoch_seed`` to ensure a consistent
112+
global shuffle across all shards.
113+
114+
.. note::
115+
This argument has no effect unless ``shuffle_after_epoch`` is set to ``True``.)",
116+
nullptr, false)
101117
.AddOptionalArg<vector<string>>("files", R"(A list of file paths to read the data from.
102118
103119
If `file_root` is provided, the paths are treated as being relative to it.

dali/operators/reader/fits_reader_op.cc

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -71,6 +71,22 @@ This argument is mutually exclusive with `files`.)",
7171
7272
`stick_to_shard` and `random_shuffle` cannot be used when this argument is set to True.)",
7373
false)
74+
.AddOptionalArg<int32_t>("shuffle_after_epoch_seed",
75+
R"(Random seed for the dataset shuffling performed after each epoch.
76+
77+
If not provided, a fixed default seed is used, which results in the same shuffling
78+
pattern across different training runs. Providing a custom seed allows for different
79+
shuffle patterns across training runs, which may be desirable for better statistical
80+
properties.
81+
82+
.. note::
83+
When using multiple DALI pipelines (e.g., for multi-GPU training), all pipeline
84+
instances should use the same ``shuffle_after_epoch_seed`` to ensure a consistent
85+
global shuffle across all shards.
86+
87+
.. note::
88+
This argument has no effect unless ``shuffle_after_epoch`` is set to ``True``.)",
89+
nullptr, false)
7490
.AddOptionalArg<vector<string>>("files", R"(A list of file paths to read the data from.
7591
7692
If `file_root` is provided, the paths are treated as being relative to it.

dali/operators/reader/loader/file_label_loader.h

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,9 @@ class DLL_PUBLIC FileLabelLoaderBase : public Loader<CPUBackend, ImageLabelWrapp
5858
shuffle_after_epoch_(shuffle_after_epoch),
5959
current_index_(0),
6060
current_epoch_(0) {
61+
int32_t seed_arg = kDaliDataloaderSeed;
62+
spec.TryGetArgument(seed_arg, "shuffle_after_epoch_seed");
63+
shuffle_after_epoch_seed_ = seed_arg;
6164

6265
vector<string> files;
6366
vector<int> labels;
@@ -227,7 +230,7 @@ class DLL_PUBLIC FileLabelLoaderBase : public Loader<CPUBackend, ImageLabelWrapp
227230
// the random distribution.
228231
file_label_entries_ = backup_file_label_entries_;
229232
}
230-
std::mt19937 g(kDaliDataloaderSeed + current_epoch_);
233+
std::mt19937 g(static_cast<uint32_t>(shuffle_after_epoch_seed_ + current_epoch_));
231234
std::shuffle(file_label_entries_.begin(), file_label_entries_.end(), g);
232235
}
233236
}
@@ -261,6 +264,7 @@ class DLL_PUBLIC FileLabelLoaderBase : public Loader<CPUBackend, ImageLabelWrapp
261264
bool has_file_root_arg_ = false;
262265

263266
bool shuffle_after_epoch_;
267+
int32_t shuffle_after_epoch_seed_;
264268
Index current_index_;
265269
int current_epoch_;
266270
FileStream::MappingReserver mmap_reserver_;

dali/operators/reader/loader/file_loader.h

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,9 @@ class FileLoader : public Loader<Backend, Target, true> {
5050
shuffle_after_epoch_(shuffle_after_epoch),
5151
current_index_(0),
5252
current_epoch_(0) {
53+
int32_t seed_arg = kDaliDataloaderSeed;
54+
spec.TryGetArgument(seed_arg, "shuffle_after_epoch_seed");
55+
shuffle_after_epoch_seed_ = seed_arg;
5356
vector<string> files;
5457

5558
file_discovery_opts_.label_from_subdir = false;
@@ -161,7 +164,7 @@ class FileLoader : public Loader<Backend, Target, true> {
161164
// reduce the randomness.
162165
file_entries_ = backup_file_entries_;
163166
}
164-
std::mt19937 g(kDaliDataloaderSeed + current_epoch_);
167+
std::mt19937 g(static_cast<uint32_t>(shuffle_after_epoch_seed_ + current_epoch_));
165168
std::shuffle(file_entries_.begin(), file_entries_.end(), g);
166169
}
167170
}
@@ -195,6 +198,7 @@ class FileLoader : public Loader<Backend, Target, true> {
195198
bool has_file_root_arg_ = false;
196199

197200
bool shuffle_after_epoch_;
201+
int32_t shuffle_after_epoch_seed_;
198202
Index current_index_;
199203
int current_epoch_;
200204
typename InputStream::MappingReserver mmap_reserver_;

dali/operators/reader/loader/nemo_asr_loader.cc

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -104,7 +104,7 @@ void NemoAsrLoader::Reset(bool wrap_to_shard) {
104104
// reduce the randomness.
105105
std::iota(shuffled_indices_.begin(), shuffled_indices_.end(), 0);
106106
}
107-
std::mt19937 g(kDaliDataloaderSeed + current_epoch_);
107+
std::mt19937 g(static_cast<uint32_t>(shuffle_after_epoch_seed_ + current_epoch_));
108108
std::shuffle(shuffled_indices_.begin(), shuffled_indices_.end(), g);
109109
}
110110
}

dali/operators/reader/loader/nemo_asr_loader.h

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -113,6 +113,11 @@ class DLL_PUBLIC NemoAsrLoader : public Loader<CPUBackend, AsrSample, true> {
113113
: Loader<CPUBackend, AsrSample, true>(spec),
114114
manifest_filepaths_(spec.GetRepeatedArgument<std::string>("manifest_filepaths")),
115115
shuffle_after_epoch_(spec.GetArgument<bool>("shuffle_after_epoch")),
116+
shuffle_after_epoch_seed_([&spec]() {
117+
int32_t seed = kDaliDataloaderSeed;
118+
spec.TryGetArgument(seed, "shuffle_after_epoch_seed");
119+
return seed;
120+
}()),
116121
sample_rate_(spec.GetArgument<float>("sample_rate")),
117122
quality_(spec.GetArgument<float>("quality")),
118123
downmix_(spec.GetArgument<bool>("downmix")),
@@ -172,6 +177,7 @@ class DLL_PUBLIC NemoAsrLoader : public Loader<CPUBackend, AsrSample, true> {
172177
std::vector<size_t> shuffled_indices_;
173178

174179
bool shuffle_after_epoch_;
180+
int32_t shuffle_after_epoch_seed_;
175181
Index current_index_ = 0;
176182
int current_epoch_ = 0;
177183

dali/operators/reader/nemo_asr_reader_op.cc

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -86,6 +86,22 @@ This reader produces between 1 and 3 outputs:
8686
.AddOptionalArg("shuffle_after_epoch",
8787
"If true, reader shuffles whole dataset after each epoch",
8888
false)
89+
.AddOptionalArg<int32_t>("shuffle_after_epoch_seed",
90+
R"code(Random seed for the dataset shuffling performed after each epoch.
91+
92+
If not provided, a fixed default seed is used, which results in the same shuffling
93+
pattern across different training runs. Providing a custom seed allows for different
94+
shuffle patterns across training runs, which may be desirable for better statistical
95+
properties.
96+
97+
.. note::
98+
When using multiple DALI pipelines (e.g., for multi-GPU training), all pipeline
99+
instances should use the same ``shuffle_after_epoch_seed`` to ensure a consistent
100+
global shuffle across all shards.
101+
102+
.. note::
103+
This argument has no effect unless ``shuffle_after_epoch`` is set to ``True``.)code",
104+
nullptr, false)
89105
.AddOptionalArg("sample_rate",
90106
"If specified, the target sample rate, in Hz, to which the audio is resampled.",
91107
-1.0f)

dali/operators/reader/numpy_reader_op.cc

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -137,6 +137,22 @@ This argument is mutually exclusive with `files`.)", nullptr)
137137
138138
`stick_to_shard` and `random_shuffle` cannot be used when this argument is set to True.)",
139139
false)
140+
.AddOptionalArg<int32_t>("shuffle_after_epoch_seed",
141+
R"(Random seed for the dataset shuffling performed after each epoch.
142+
143+
If not provided, a fixed default seed is used, which results in the same shuffling
144+
pattern across different training runs. Providing a custom seed allows for different
145+
shuffle patterns across training runs, which may be desirable for better statistical
146+
properties.
147+
148+
.. note::
149+
When using multiple DALI pipelines (e.g., for multi-GPU training), all pipeline
150+
instances should use the same ``shuffle_after_epoch_seed`` to ensure a consistent
151+
global shuffle across all shards.
152+
153+
.. note::
154+
This argument has no effect unless ``shuffle_after_epoch`` is set to ``True``.)",
155+
nullptr, false)
140156
.AddOptionalArg<vector<string>>("files", R"(A list of file paths to read the data from.
141157
142158
If `file_root` is provided, the paths are treated as being relative to it.

dali/test/python/reader/test_numpy.py

Lines changed: 89 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1132,3 +1132,92 @@ def my_pipeline(files):
11321132
p = my_pipeline(files=[fname])
11331133
# shouldn't throw
11341134
assert_array_equal(p.run()[0][0], data)
1135+
1136+
1137+
def _collect_numpy_epoch_order(filenames, seed=None):
1138+
"""Run a numpy reader pipeline for one epoch and return the list of sample values (int).
1139+
1140+
Each file stores a unique integer value, so the returned list represents the read order.
1141+
"""
1142+
num_samples = len(filenames)
1143+
1144+
@pipeline_def(batch_size=1, num_threads=1, device_id=0)
1145+
def make_pipe():
1146+
data = fn.readers.numpy(
1147+
device="cpu",
1148+
files=filenames,
1149+
shuffle_after_epoch=True,
1150+
shard_id=0,
1151+
num_shards=1,
1152+
shuffle_after_epoch_seed=seed,
1153+
)
1154+
return data
1155+
1156+
pipe = make_pipe()
1157+
pipe.build()
1158+
1159+
order = []
1160+
for _ in range(num_samples):
1161+
out = pipe.run()
1162+
order.append(int(to_array(out[0])[0][0]))
1163+
1164+
return order
1165+
1166+
1167+
def test_shuffle_after_epoch_seed_numpy_reproducible():
1168+
"""Same shuffle_after_epoch_seed should produce the same order across runs."""
1169+
with tempfile.TemporaryDirectory() as test_data_root:
1170+
num_samples = 20
1171+
filenames = []
1172+
for i in range(num_samples):
1173+
fname = os.path.join(test_data_root, "sample_{:03d}.npy".format(i))
1174+
np.save(fname, np.array([i], dtype=np.float32))
1175+
filenames.append(fname)
1176+
1177+
seed = 42
1178+
order1 = _collect_numpy_epoch_order(filenames, seed=seed)
1179+
order2 = _collect_numpy_epoch_order(filenames, seed=seed)
1180+
1181+
assert (
1182+
order1 == order2
1183+
), "Same shuffle_after_epoch_seed should produce the same reading order"
1184+
1185+
1186+
def test_shuffle_after_epoch_seed_numpy_different_seeds():
1187+
"""Different shuffle_after_epoch_seed values should produce different orders."""
1188+
with tempfile.TemporaryDirectory() as test_data_root:
1189+
num_samples = 20
1190+
filenames = []
1191+
for i in range(num_samples):
1192+
fname = os.path.join(test_data_root, "sample_{:03d}.npy".format(i))
1193+
np.save(fname, np.array([i], dtype=np.float32))
1194+
filenames.append(fname)
1195+
1196+
order_seed1 = _collect_numpy_epoch_order(filenames, seed=11111)
1197+
order_seed2 = _collect_numpy_epoch_order(filenames, seed=99999)
1198+
1199+
assert (
1200+
order_seed1 != order_seed2
1201+
), "Different shuffle_after_epoch_seed values should produce different orders"
1202+
# Both orderings should contain all samples
1203+
assert (
1204+
sorted(order_seed1) == sorted(order_seed2) == list(range(num_samples))
1205+
), "All samples should be present regardless of seed"
1206+
1207+
1208+
def test_shuffle_after_epoch_seed_numpy_default_reproducible():
1209+
"""Without explicit seed, two pipelines should produce the same default order."""
1210+
with tempfile.TemporaryDirectory() as test_data_root:
1211+
num_samples = 20
1212+
filenames = []
1213+
for i in range(num_samples):
1214+
fname = os.path.join(test_data_root, "sample_{:03d}.npy".format(i))
1215+
np.save(fname, np.array([i], dtype=np.float32))
1216+
filenames.append(fname)
1217+
1218+
order1 = _collect_numpy_epoch_order(filenames, seed=None)
1219+
order2 = _collect_numpy_epoch_order(filenames, seed=None)
1220+
1221+
assert (
1222+
order1 == order2
1223+
), "Without explicit seed, default behavior should be reproducible (backward compat)"

0 commit comments

Comments
 (0)