Can't groupby two or more keys #17

@natemcintosh

Description

When attempting to groupby two or more keys, I get an AttributeError. The same operation works fine when grouping by a single key.
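
For reference, here is a minimal sketch that should reproduce it. The column names mirror the traceback; the data itself is made up, and the Cache registration reflects that dask's opportunistic cache (which routes results through cachey) was active when this happened:

```python
import pandas as pd
import dask.dataframe as dd
from dask.cache import Cache

# The opportunistic cache is what sends task results through cachey's
# nbytes(), as seen in the traceback below.
Cache(2e9).register()

# Made-up data; the column names just mirror the traceback.
pdf = pd.DataFrame({'path': ['a', 'a', 'b', 'b'],
                    'time': [1, 2, 1, 2],
                    'alt': [10.0, 20.0, 30.0, 40.0]})
df = dd.from_pandas(pdf, npartitions=2)

df.groupby('path').alt.mean().compute()            # fine
df.groupby(['path', 'time']).alt.mean().compute()  # AttributeError
```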

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-11-c4a4f7e033ab> in <module>
----> 1 df.groupby(['path','time']).alt.mean().compute()

~/anaconda3/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
    154         dask.base.compute
    155         """
--> 156         (result,) = compute(self, traverse=False, **kwargs)
    157         return result
    158 

~/anaconda3/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
    396     keys = [x.__dask_keys__() for x in collections]
    397     postcomputes = [x.__dask_postcompute__() for x in collections]
--> 398     results = schedule(dsk, keys, **kwargs)
    399     return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
    400 

~/anaconda3/lib/python3.6/site-packages/dask/threaded.py in get(dsk, result, cache, num_workers, pool, **kwargs)
     74     results = get_async(pool.apply_async, len(pool._pool), dsk, result,
     75                         cache=cache, get_id=_thread_get_id,
---> 76                         pack_exception=pack_exception, **kwargs)
     77 
     78     # Cleanup pools associated to dead threads

~/anaconda3/lib/python3.6/site-packages/dask/local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
    465                 finish_task(dsk, key, state, results, keyorder.get)
    466                 for f in posttask_cbs:
--> 467                     f(key, res, dsk, state, worker_id)
    468 
    469                 while state['ready'] and len(state['running']) < num_workers:

~/anaconda3/lib/python3.6/site-packages/dask/cache.py in _posttask(self, key, value, dsk, state, id)
     59             duration += max(self.durations.get(k, 0) for k in deps)
     60         self.durations[key] = duration
---> 61         nb = self._nbytes(value) + overhead + sys.getsizeof(key) * 4
     62         self.cache.put(key, value, cost=duration / nb / 1e9, nbytes=nb)
     63 

~/anaconda3/lib/python3.6/site-packages/cachey/nbytes.py in nbytes(o)
     27 
     28     if name == 'pandas.core.series.Series':
---> 29         return _array(o._data.blocks[0].values) + _array(o.index._data)
     30     elif name == 'pandas.core.frame.DataFrame':
     31         return _array(o.index) + sum([_array(blk.values)

~/anaconda3/lib/python3.6/site-packages/cachey/nbytes.py in _array(x)
      3 
      4 def _array(x):
----> 5     if x.dtype == 'O':
      6         return sys.getsizeof('0'*100) * x.size
      7     elif str(x.dtype) == 'category':

AttributeError: 'NoneType' object has no attribute 'dtype'

Is this a known problem? Or a bug?
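
From the traceback this looks like it lives in cachey rather than dask: a multi-key groupby returns a Series whose index is a MultiIndex, and nbytes() calls _array(o.index._data), but a MultiIndex apparently keeps no single backing ndarray, so _data is None and the dtype lookup fails. Unregistering the opportunistic cache should sidestep it in the meantime. Below is a rough sketch of the kind of guard that would avoid the crash; this is my guess, not the library's actual fix, and returning 0 simply under-counts the index:

```python
import sys

def _array(x):
    # Sketch of a defensive version of cachey's _array helper.
    # A MultiIndex (what a multi-key groupby yields) has no single
    # backing ndarray, so nbytes() ends up passing None here.
    if x is None:
        return 0  # under-counts the index, but avoids the AttributeError
    if x.dtype == 'O':
        # rough per-element size estimate for object arrays
        return sys.getsizeof('0' * 100) * x.size
    return x.nbytes
```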
