<!-- Source: IntelligentRoboticsLab/hslvision on GitHub, index.html at main -->
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>HSLVision: A Multimodal Vision Dataset for RoboCup Humanoid Soccer</title>
  <meta name="description" content="HSLVision: the first public RGB-D object detection dataset and benchmark for the RoboCup Humanoid Soccer League - 6,068 images, 65,309 annotations, 7 classes, multiple venues and robot embodiments.">
  <meta name="keywords" content="HSLVision, RoboCup, Humanoid Soccer League, Object Detection, RGB-D, Dataset, Benchmark, Monocular Depth, DepthAnythingV3, YOLO, DETR">

  <!-- Open Graph / Twitter: link-preview metadata (type, title, description, preview image) -->
  <!-- NOTE(review): og:image and twitter:image below are relative paths. Link-preview crawlers
       generally expect absolute URLs; confirm the deployed site URL and make both absolute. -->
  <meta property="og:type" content="website">
  <meta property="og:title" content="HSLVision: A Multimodal Vision Dataset for RoboCup Humanoid Soccer">
  <meta property="og:description" content="The first public RGB-D object detection dataset and benchmark for the RoboCup Humanoid Soccer League.">
  <meta property="og:image" content="./static/images/teaser_lab42_rgb.jpg">
  <meta name="twitter:card" content="summary_large_image">
  <meta name="twitter:title" content="HSLVision: A Multimodal Vision Dataset for RoboCup Humanoid Soccer">
  <meta name="twitter:description" content="The first public RGB-D object detection dataset and benchmark for the RoboCup Humanoid Soccer League.">
  <meta name="twitter:image" content="./static/images/teaser_lab42_rgb.jpg">

  <!-- single SVG favicon that adapts to dark mode via an internal prefers-color-scheme media query -->
  <link rel="icon" type="image/svg+xml" href="./static/images/favicon.svg">
  <link rel="preconnect" href="https://fonts.googleapis.com">
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
  <link href="https://fonts.googleapis.com/css2?family=Noto+Sans:wght@400;500;600;700&display=swap" rel="stylesheet">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.4/css/bulma.min.css">
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.1/css/all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/hslv.css">
</head>
<body>

<!-- ===== Title / hero ===== -->
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-2 publication-title">
            HSL<span class="dnerf">Vision</span>: A Multimodal Vision Dataset for RoboCup Humanoid Soccer
          </h1>
          <div class="is-size-5 publication-authors">
            <span class="author-block">D. M. Xavier Catarrinho<sup>1,2</sup>,</span>
            <span class="author-block">G. de Jong<sup>1,2</sup>,</span>
            <span class="author-block">M. J. Meijer<sup>1,2</sup>,</span>
            <span class="author-block">H. Ruiter<sup>1,2</sup>,</span>
            <span class="author-block">M. Honkoop<sup>1,2</sup></span>
          </div>
          <div class="is-size-6 publication-affil">
            <span class="author-block"><sup>1</sup>University of Amsterdam,</span>
            <span class="author-block"><sup>2</sup>whIRLwind Amsterdam</span>
          </div>
          <div class="is-size-6 publication-venue">RoboCup 2026 · Incheon, South Korea</div>
          <div class="team-logo">
            <a class="uva-logo" href="https://uva.nl" target="_blank" rel="noopener">
              <img src="./static/images/uva_logo.svg" alt="University of Amsterdam" width="426" height="95">
            </a>
            <a class="irl-logo" href="https://www.intelligentroboticslab.nl" target="_blank" rel="noopener">
              <img src="./static/images/irl_logo.svg" alt="Intelligent Robotics Lab" width="299" height="262">
            </a>
            <a class="whirlwind-logo" href="https://whirlwind.team" target="_blank" rel="noopener">
              <img src="./static/images/text_logo.svg" alt="whIRLwind Amsterdam" width="391" height="141">
            </a>
          </div>

          <div class="publication-links">
            <span class="link-block">
              <!-- Placeholder for the paper link; kept disabled so it cannot be activated. -->
              <button class="button is-dark is-rounded" type="button" disabled>
                <span class="icon"><i class="fas fa-file-pdf" aria-hidden="true"></i></span><span>Paper (coming soon)</span>
              </button>
            </span>
            <span class="link-block">
              <a href="https://huggingface.co/datasets/whirlwind-ams/hslvision" class="button is-dark is-rounded">
                <span class="icon"><img class="hf-logo" src="./static/images/hf-logo.svg" alt=""></span><span>Dataset</span>
              </a>
            </span>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- ===== Teaser gallery - RGB↔depth sliders ===== -->
<section class="hero teaser">
  <div class="container is-max-desktop teaser-container">
    <div class="slider-row">
      <div class="ba">
        <img class="bottom" src="./static/images/teaser_cologne_rgb.jpg" alt="Annotated frame, German Open 2026 Cologne">
        <img class="top" src="./static/images/teaser_cologne_depth.jpg" alt="Estimated depth, Cologne">
        <span class="lbl l">RGB</span>
        <span class="lbl r">Depth</span>
        <div class="handle"><div class="knob"><i class="fas fa-arrows-left-right"></i></div></div>
      </div>
      <div class="ba">
        <img class="bottom" src="./static/images/teaser_lab42_rgb.jpg" alt="Annotated frame, LAB42">
        <img class="top" src="./static/images/teaser_lab42_depth.jpg" alt="Estimated depth, LAB42">
        <span class="lbl l">RGB</span>
        <span class="lbl r">Depth</span>
        <div class="handle"><div class="knob"><i class="fas fa-arrows-left-right"></i></div></div>
      </div>
      <div class="ba">
        <img class="bottom" src="./static/images/teaser_beijing_rgb.jpg" alt="Annotated frame, RCAP Beijing Masters 2025">
        <img class="top" src="./static/images/teaser_beijing_depth.jpg" alt="Estimated depth, Beijing">
        <span class="lbl l">RGB</span>
        <span class="lbl r">Depth</span>
        <div class="handle"><div class="knob"><i class="fas fa-arrows-left-right"></i></div></div>
      </div>
    </div>
    <h2 class="subtitle has-text-centered">
      Every RGB frame is paired with a pixel-aligned metric depth map; drag the slider to compare.
    </h2>
  </div>
</section>

<!-- ===== Dataset ===== -->
<section class="section" id="dataset">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3 has-text-centered">The Dataset</h2>
        <div class="content has-text-justified">
          <p>
            Images in HSLVision were collected across six locations: LAB42 (whIRLwind), German Open 2026
            Cologne (whIRLwind), RCAP Beijing Masters 2025 (whIRLwind, HTWK), RoboCup 2025 Salvador (HTWK),
            RoboCup 2019 Sydney (BitBots), and RoboCup 2017 Nagoya (BitBots). It spans a range of field
            layouts, goalpost designs, ball sizes, and robot platforms - the Booster Robotics K1 and T1, as
            well as several non-standard KidSize humanoids - recorded across the small, middle, and large
            HSL field sizes. An additional 770 annotated images from RCAP Abu Dhabi 2025 are held out to
            evaluate performance on unseen venues.
          </p>
        </div>

        <div class="sample-gallery">
          <figure><img src="./static/images/sample_lab42.jpg" alt="Annotated frame, LAB42"><figcaption>LAB42 · whIRLwind</figcaption></figure>
          <figure><img src="./static/images/sample_cologne.jpg" alt="Annotated frame, German Open 2026 Cologne"><figcaption>German Open 2026 · Cologne</figcaption></figure>
          <figure><img src="./static/images/sample_beijing.jpg" alt="Annotated frame, RCAP Beijing Masters 2025"><figcaption>RCAP Beijing Masters 2025</figcaption></figure>
          <figure><img src="./static/images/sample_salvador.jpg" alt="Annotated frame, RoboCup 2025 Salvador"><figcaption>RoboCup 2025 · Salvador</figcaption></figure>
          <figure><img src="./static/images/sample_sydney.jpg" alt="Annotated frame, RoboCup 2019 Sydney"><figcaption>RoboCup 2019 · Sydney</figcaption></figure>
          <figure><img src="./static/images/sample_nagoya.jpg" alt="Annotated frame, RoboCup 2017 Nagoya"><figcaption>RoboCup 2017 · Nagoya</figcaption></figure>
        </div>
        <div class="legend">
          <span><i style="background:#e8403a"></i>Ball</span>
          <span><i style="background:#f5c518"></i>Goalpost</span>
          <span><i style="background:#ee7b1a"></i>Robot</span>
          <span><i style="background:#2aa5e0"></i>L-intersection</span>
          <span><i style="background:#7d5cf0"></i>T-intersection</span>
          <span><i style="background:#d24be0"></i>X-intersection</span>
          <span><i style="background:#16c08a"></i>Penalty mark</span>
        </div>
        <p class="has-text-centered is-size-7 has-text-grey">
          One annotated frame per recording location, with ground-truth boxes drawn in the class palette.
        </p>

        <h3 class="title is-4">Data selection</h3>
        <div class="content has-text-justified">
          <p>
            Raw footage is subsampled at 3 fps and filtered to remove motion blur, occlusion, and low-quality
            frames. To avoid near-identical, redundant images, we embed each frame with a pretrained
            <strong>DINOv3 ViT-S/16</strong> encoder (384-dimensional pooler token, L2-normalized so cosine
            similarity reflects distance) and iteratively remove the image with the most above-threshold
            neighbors until no pair exceeds a similarity threshold&nbsp;τ. This yields a diverse subset without
            depending on frame order, while a pretrained encoder avoids per-dataset training and provides a
            semantically meaningful embedding.
          </p>
        </div>
        <figure class="image">
          <img src="./static/images/pipeline.png" alt="Data selection pipeline" width="2280" height="1145">
        </figure>
        <p class="is-size-7 has-text-grey">
          <strong>Figure 1.</strong> Overview of the data-selection pipeline. Bottom: embedding-space
          pruning iteratively removes points with many nearby neighbors, and the effect of varying τ on
          the diversity of the selected subset.
        </p>
      </div>
    </div>
  </div>
</section>

<!-- ===== Depth ===== -->
<section class="section" id="depth">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3 has-text-centered">Pixel-Aligned Depth</h2>
        <div class="content has-text-justified">
          <p>
            Onboard stereo depth is not available across all recording platforms, so every color frame is
            instead paired with a depth map estimated using <strong>DepthAnythingV3</strong>, which predicts
            per-pixel metric depth from a single RGB image. The depth maps are aligned with the RGB images and
            share the same pixel coordinates, so annotations apply directly to both modalities without any
            transformation.
          </p>
        </div>

        <figure class="image">
          <img src="./static/images/depth_comparison.png" alt="Depth comparison" width="1668" height="984">
        </figure>
        <p class="is-size-7 has-text-grey">
          <strong>Figure 2.</strong> RGB input, Hobot StereoNet, and DepthAnythingV3 predictions across
          two scenes. DepthAnythingV3 produces smoother, more spatially consistent depth with sharp,
          well-localized object boundaries.
        </p>
      </div>
    </div>
  </div>
</section>

<!-- ===== Release ===== -->
<section class="section" id="release">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Release</h2>
        <div class="content has-text-justified">
          <p>
            HSLVision is released under a <strong>CC BY 4.0</strong> license and available on
            <a href="https://huggingface.co/datasets/whirlwind-ams/hslvision">Hugging Face</a>. The release
            includes annotations, RGB images, and depth maps, divided into train / validation / test splits.
            All code for model evaluation, dataset tooling, and depth-map generation will be released publicly
            on GitHub upon acceptance.
          </p>
        </div>
        <div class="publication-links">
          <span class="link-block">
            <!-- Placeholder for the paper link; kept disabled so it cannot be activated. -->
            <button class="button is-dark is-rounded" type="button" disabled>
              <span class="icon"><i class="fas fa-file-pdf" aria-hidden="true"></i></span><span>Paper (coming soon)</span>
            </button>
          </span>
          <span class="link-block">
            <a href="https://huggingface.co/datasets/whirlwind-ams/hslvision" class="button is-dark is-rounded">
              <span class="icon"><img class="hf-logo" src="./static/images/hf-logo.svg" alt=""></span><span>Get the dataset</span>
            </a>
          </span>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- ===== Cite ===== -->
<section class="section" id="cite">
  <div class="container is-max-desktop">
    <h2 class="title is-3">BibTeX</h2>
    <div class="bib">
      <!-- Copies the BibTeX entry below to the clipboard; the click handler is bound by id in the inline script. -->
      <button class="copy" id="copyBib" type="button"><i class="far fa-copy" aria-hidden="true"></i> Copy</button>
      <pre><code id="bibtex">@inproceedings{catarrinho2026hslvision,
  author    = {Xavier Catarrinho, D. M. and de Jong, G. and Meijer, M. J. and Ruiter, H. and Honkoop, M.},
  title     = {HSLVision: A Multimodal Vision Dataset for RoboCup Humanoid Soccer},
  booktitle = {RoboCup 2026: Robot World Cup XXIX},
  year      = {2026},
  location  = {Incheon, South Korea},
}</code></pre>
    </div>
  </div>
</section>

<!-- ===== Hungry for more? ===== -->
<section class="section" id="more">
  <div class="container is-max-desktop has-text-centered">
    <h2 class="title is-3">Hungry for more?</h2>
    <div class="content">
      <p>
        HSLVision is a whIRLwind project. Our mission is to push the boundaries of robotics and AI in
        robot football. We are always looking for interested students or collaboration partners to join us.
        Learn more at <a href="https://whirlwind.team">whirlwind.team</a>.
      </p>
    </div>
    <a href="https://whirlwind.team" class="more-logo">
      <img src="./static/images/text_logo.svg" alt="whIRLwind Amsterdam" width="391" height="141">
    </a>
  </div>
</section>

<!-- ===== Footer ===== -->
<footer class="footer">
  <div class="container">
    <div class="columns is-centered">
      <div class="column is-8 has-text-centered content">
        <p>
          Thanks to all members of team whIRLwind Amsterdam for their annotation efforts, and to team HTWK for
          providing some of the raw recordings used in this work.
        </p>
        <p>
          This website was built using the <a href="https://nerfies.github.io/">Nerfies</a> research page template;
          the template's <a href="https://github.com/nerfies/nerfies.github.io">source code</a> is available on GitHub.
        </p>
      </div>
    </div>
  </div>
</footer>

<script>
  // Before/after RGB↔depth comparison sliders: wires up every `.ba` element on the page.
  // Inside each `.ba`, the `.top` image is clipped from its left edge and the `.handle` is
  // positioned at the split. The split is tracked as a percentage (0–100) of the element width.
  document.querySelectorAll('.ba').forEach((el) => {
    const top = el.querySelector('.top'), handle = el.querySelector('.handle');
    if (!top || !handle) return; // incomplete markup: nothing to drive

    const KEY_STEP = 2, PAGE_STEP = 10; // keyboard increments, in percent
    let pos = 50;     // current split position, 0–100
    let drag = false; // true while a primary-button mouse drag is in progress

    // Clamp to [0, 100], clip the top image, move the handle, and mirror the value to ARIA.
    const set = (pct) => {
      if (!Number.isFinite(pct)) return;
      pos = Math.max(0, Math.min(100, pct));
      top.style.clipPath = 'inset(0 0 0 ' + pos + '%)';
      handle.style.left = pos + '%';
      handle.setAttribute('aria-valuenow', String(Math.round(pos)));
    };
    // Convert a viewport x-coordinate into a percentage of this slider's width.
    const fromEvent = (clientX) => {
      const r = el.getBoundingClientRect();
      if (r.width > 0) set(((clientX - r.left) / r.width) * 100); // skip while the element has no width
    };

    // --- Mouse ---
    el.addEventListener('mousedown', (e) => {
      if (e.button !== 0) return; // primary button only: a right-click must not start a drag
      e.preventDefault();         // suppress native image drag / text selection, which can swallow mouseup
      drag = true;
      fromEvent(e.clientX);
    });
    window.addEventListener('mousemove', (e) => {
      if (!drag) return;
      // If the button was released somewhere we never saw a mouseup, end the drag here.
      if (e.buttons !== undefined && !(e.buttons & 1)) { drag = false; return; }
      fromEvent(e.clientX);
    });
    window.addEventListener('mouseup', () => { drag = false; });

    // --- Touch (passive: the handlers never call preventDefault) ---
    const fromTouch = (e) => { if (e.touches.length) fromEvent(e.touches[0].clientX); };
    el.addEventListener('touchstart', fromTouch, { passive: true });
    el.addEventListener('touchmove', fromTouch, { passive: true });

    // --- Keyboard: expose the handle as an ARIA slider ---
    handle.setAttribute('role', 'slider');
    handle.setAttribute('tabindex', '0');
    handle.setAttribute('aria-label', 'RGB / depth comparison slider');
    handle.setAttribute('aria-valuemin', '0');
    handle.setAttribute('aria-valuemax', '100');
    handle.addEventListener('keydown', (e) => {
      let next;
      switch (e.key) {
        case 'ArrowLeft':
        case 'ArrowDown': next = pos - KEY_STEP; break;
        case 'ArrowRight':
        case 'ArrowUp': next = pos + KEY_STEP; break;
        case 'PageDown': next = pos - PAGE_STEP; break;
        case 'PageUp': next = pos + PAGE_STEP; break;
        case 'Home': next = 0; break;
        case 'End': next = 100; break;
        default: return; // leave every other key to the browser
      }
      e.preventDefault(); // keep arrow / page keys from scrolling the page while adjusting
      set(next);
    });

    set(50);
  });

  // Copy BibTeX: puts the trimmed text of #bibtex on the clipboard when #copyBib is clicked and
  // shows a short confirmation on the button. Uses the async Clipboard API when it is available
  // (it is only exposed in secure contexts); otherwise, or if that call is rejected, selects the
  // entry and tries the legacy copy command. If that also fails, the entry stays selected so the
  // user can copy it manually.
  const copyBtn = document.getElementById('copyBib');
  if (copyBtn) {
    const IDLE_HTML = '<i class="far fa-copy" aria-hidden="true"></i> Copy';
    const DONE_HTML = '<i class="fas fa-check" aria-hidden="true"></i> Copied';
    const FAIL_HTML = '<i class="fas fa-triangle-exclamation" aria-hidden="true"></i> Copy manually';
    let resetTimer;

    // Show a temporary label, then restore the idle label; repeated clicks restart the timer.
    const flash = (html) => {
      copyBtn.innerHTML = html;
      clearTimeout(resetTimer);
      resetTimer = setTimeout(() => { copyBtn.innerHTML = IDLE_HTML; }, 1800);
    };

    // Select the entry and try document.execCommand('copy'). Returns true on success.
    // On failure the selection is left in place so a manual copy still works.
    const legacyCopy = (node) => {
      const selection = window.getSelection();
      if (!selection) return false;
      const range = document.createRange();
      range.selectNodeContents(node);
      selection.removeAllRanges();
      selection.addRange(range);
      let ok = false;
      try { ok = document.execCommand('copy'); } catch (err) { ok = false; }
      if (ok) selection.removeAllRanges();
      return ok;
    };

    copyBtn.addEventListener('click', () => {
      const source = document.getElementById('bibtex');
      if (!source) return; // nothing to copy
      const succeed = () => flash(DONE_HTML);
      const fallback = () => flash(legacyCopy(source) ? DONE_HTML : FAIL_HTML);
      if (navigator.clipboard && navigator.clipboard.writeText) {
        navigator.clipboard.writeText(source.textContent.trim()).then(succeed, fallback);
      } else {
        fallback();
      }
    });
  }
</script>
</body>
</html>