Address the case where the output region for the repeat operation is too big

ipdemes · ipdemes · commit 4b13a6c3048b · 2022-05-31T15:11:57.000-06:00
diff --git a/cunumeric/module.py b/cunumeric/module.py
@@ -2046,7 +2046,6 @@ def repeat(a, repeats, axis=None):
     --------
     Multiple GPUs, Multiple CPUs
     """
-
     # when array is a scalar
     if np.ndim(a) == 0:
         if np.ndim(repeats) == 0:
@@ -2100,11 +2099,36 @@ def repeat(a, repeats, axis=None):
                 category=UserWarning,
             )
         repeats = np.int64(repeats)
-        result = array._thunk.repeat(
-            repeats=repeats,
-            axis=axis,
-            scalar_repeats=True,
-        )
+        if repeats < 0:
+            return ValueError(
+                "'repeats' should not be negative: {}".format(repeats)
+            )
+
+        # check the output shape (whether it will fit on the GPU or not)
+        out_shape = list(array.shape)
+        out_shape[axis] *= repeats
+        out_shape = tuple(out_shape)
+        size = sum(out_shape) * array.itemsize
+        # check if the size of the output array is less than half of 8GB.
+        # In this case we can use output regions; otherwise we will use a
+        # statically allocated array
+        if size < 8589934592 / 2:
+
+            result = array._thunk.repeat(
+                repeats=repeats, axis=axis, scalar_repeats=True
+            )
+        else:
+            # this implementation is taken from CuPy
+            result = ndarray(shape=out_shape, dtype=array.dtype)
+            a_index = [slice(None)] * len(out_shape)
+            res_index = list(a_index)
+            offset = 0
+            for i in range(a._shape[axis]):
+                a_index[axis] = slice(i, i + 1)
+                res_index[axis] = slice(offset, offset + repeats)
+                result[res_index] = array[a_index]
+                offset += repeats
+            return result
     # repeats is an array
     else:
         # repeats should be integer type
@@ -2116,9 +2140,31 @@ def repeat(a, repeats, axis=None):
         repeats = repeats.astype(np.int64)
         if repeats.shape[0] != array.shape[axis]:
             return ValueError("incorrect shape of repeats array")
-        result = array._thunk.repeat(
-            repeats=repeats._thunk, axis=axis, scalar_repeats=False
-        )
+
+        # check the output shape (whether it will fit on the GPU or not)
+        out_shape = list(array.shape)
+        n_repeats = sum(repeats)
+        out_shape[axis] = n_repeats
+        out_shape = tuple(out_shape)
+        size = sum(out_shape) * array.itemsize
+        # check if the size of the output array is less than half of 8GB.
+        # In this case we can use output regions; otherwise we will use a
+        # statically allocated array
+        if size < 8589934592 / 2:
+            result = array._thunk.repeat(
+                repeats=repeats._thunk, axis=axis, scalar_repeats=False
+            )
+        else:  # this implementation is taken from CuPy
+            result = ndarray(shape=out_shape, dtype=array.dtype)
+            a_index = [slice(None)] * len(out_shape)
+            res_index = list(a_index)
+            offset = 0
+            for i in range(a._shape[axis]):
+                a_index[axis] = slice(i, i + 1)
+                res_index[axis] = slice(offset, offset + repeats[i])
+                result[res_index] = array[a_index]
+                offset += repeats[i]
+            return result
     return ndarray(shape=result.shape, thunk=result)
 
 
diff --git a/src/cunumeric/index/repeat.cc b/src/cunumeric/index/repeat.cc
@@ -69,6 +69,8 @@ struct RepeatImplBody<VariantKind::CPU, CODE, DIM> {
     int64_t out_idx = 0;
     for (size_t in_idx = 0; in_idx < volume; ++in_idx) {
       auto p = in_pitches.unflatten(in_idx, in_rect.lo);
+      // TODO: replace assert with the Legate exception handling interface when available
+      assert(repeats[p] >= 0);
       for (size_t r = 0; r < repeats[p]; r++) out[out_idx++] = in[p];
     }
   }
@@ -88,6 +90,8 @@ struct RepeatImplBody<VariantKind::CPU, CODE, DIM> {
     for (int64_t idx = in_rect.lo[axis]; idx <= in_rect.hi[axis]; ++idx) {
       p[axis]            = idx;
       offsets[off_idx++] = sum;
+      // TODO: replace assert with the Legate exception handling interface when available
+      assert(repeats[p] >= 0);
       sum += repeats[p];
     }
 
diff --git a/src/cunumeric/index/repeat.cu b/src/cunumeric/index/repeat.cu
@@ -41,6 +41,8 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
     if (offset < extent) {
       auto p = origin;
       p[axis] += offset;
+      // TODO: replace assert with the Legate exception handling interface when available
+      assert(repeats[p] >= 0);
       auto val        = repeats[p];
       offsets[offset] = val;
       SumReduction<int64_t>::fold<true>(value, val);
diff --git a/src/cunumeric/index/repeat_omp.cc b/src/cunumeric/index/repeat_omp.cc
@@ -77,7 +77,9 @@ struct RepeatImplBody<VariantKind::OMP, CODE, DIM> {
       int64_t axis_lo = p[axis];
 #pragma omp for schedule(static) private(p)
       for (int64_t idx = 0; idx < axis_extent; ++idx) {
-        p[axis]      = axis_lo + idx;
+        p[axis] = axis_lo + idx;
+        // TODO: replace assert with the Legate exception handling interface when available
+        assert(repeats[p] >= 0);
         auto val     = repeats[p];
         offsets[idx] = val;
         local_sums[tid] += val;

Original file line number	Diff line number	Diff line change
`@@ -69,6 +69,8 @@ struct RepeatImplBody<VariantKind::CPU, CODE, DIM> {`
`69`	`69`	`int64_t out_idx = 0;`
`70`	`70`	`for (size_t in_idx = 0; in_idx < volume; ++in_idx) {`
`71`	`71`	`auto p = in_pitches.unflatten(in_idx, in_rect.lo);`
	`72`	`+ // TODO: replace assert with the Legate exception handling interface when available`
	`73`	`+ assert(repeats[p] >= 0);`
`72`	`74`	`for (size_t r = 0; r < repeats[p]; r++) out[out_idx++] = in[p];`
`73`	`75`	`}`
`74`	`76`	`}`
`@@ -88,6 +90,8 @@ struct RepeatImplBody<VariantKind::CPU, CODE, DIM> {`
`88`	`90`	`for (int64_t idx = in_rect.lo[axis]; idx <= in_rect.hi[axis]; ++idx) {`
`89`	`91`	`p[axis] = idx;`
`90`	`92`	`offsets[off_idx++] = sum;`
	`93`	`+ // TODO: replace assert with the Legate exception handling interface when available`
	`94`	`+ assert(repeats[p] >= 0);`
`91`	`95`	`sum += repeats[p];`
`92`	`96`	`}`
`93`	`97`