GPU backend
This example reads its data from a parquet file, "s.parquet", which can be created with the following function, e.g. by calling make_data("s.parquet").
import awkward as ak
def make_data(fn, N=1000000):
    """Write the example dataset to the parquet file ``fn``.

    A fixed two-row pattern of nested integer lists is repeated ``N`` times
    (``2 * N`` rows in total) and stored under the single field ``"a"``.
    """
    pattern = [
        [[1, 2, 3], [], [4, 5]],
        [[6, 7]],
    ]
    records = ak.Array({"a": pattern * N})
    ak.to_parquet(records, fn, extensionarray=False)
The file cuda-env.yaml can be used to create a functional environment using conda:
$ conda env create -f example/cuda-env.yaml
[3]:
import awkward as ak
import cupy as cp
import cudf
import numpy as np
import akimbo.cudf
import subprocess
def gpu_mem():
    """Print a GPU memory figure taken from ``nvidia-smi``.

    Runs ``nvidia-smi | grep py`` in a shell and prints the second-to-last
    whitespace-separated token of its output (e.g. ``256MiB``).

    If the command fails, nothing matches, or the output is too short to
    index, nothing is printed, so the notebook still runs top to bottom on a
    machine without a GPU. Always returns ``None``.
    """
    try:
        listing = subprocess.check_output(
            "nvidia-smi | grep py",
            shell=True,
            stderr=subprocess.DEVNULL,  # keep shell error chatter out of the cell output
        )
        print(listing.split()[-2].decode())
    except (subprocess.CalledProcessError, OSError, IndexError):
        # the pipeline exits non-zero when grep matches nothing (including
        # when nvidia-smi is absent); treat all of these as "nothing to report"
        return
ak.__version__, akimbo.__version__
[3]:
('2.7.1', '2024.10.1.dev9+g9f64d31')
[4]:
df = cudf.read_parquet("s.parquet")
gpu_mem()
[5]:
df.dtypes
[5]:
a list
dtype: object
[6]:
df.iloc[0] # each element is list-of-lists
[6]:
a [[1, 2, 3], [], [4, 5]]
Name: 0, dtype: list
[7]:
# allows all ak.* namespace, many identical to numpy equivalents
dir(df.a.ak)
[7]:
['Mask',
'all',
'almost_equal',
'angle',
'annotations',
'any',
'apply',
'argcartesian',
'argcombinations',
'argmax',
'argmin',
'argsort',
'array',
'array_equal',
'attrs',
'awkward',
'backend',
'behavior',
'behaviors',
'broadcast_arrays',
'broadcast_fields',
'builder',
'cartesian',
'categories',
'combinations',
'concatenate',
'contents',
'copy',
'corr',
'count',
'count_nonzero',
'covar',
'cpp_type',
'cppyy',
'drop_none',
'dt',
'enforce_type',
'errors',
'explode',
'fields',
'fill_none',
'firsts',
'flatten',
'forms',
'forth',
'from_arrow',
'from_arrow_schema',
'from_avro_file',
'from_buffers',
'from_categorical',
'from_cupy',
'from_dlpack',
'from_feather',
'from_iter',
'from_jax',
'from_json',
'from_numpy',
'from_parquet',
'from_raggedtensor',
'from_rdataframe',
'from_regular',
'from_tensorflow',
'from_torch',
'full_like',
'highlevel',
'imag',
'index',
'is_categorical',
'is_none',
'is_tuple',
'is_valid',
'isclose',
'jax',
'layout',
'linear_fit',
'local_index',
'mask',
'max',
'mean',
'merge_option_of_records',
'merge_union_of_records',
'metadata_from_parquet',
'min',
'mixin_class',
'mixin_class_method',
'moment',
'named_axis',
'nan_to_none',
'nan_to_num',
'nanargmax',
'nanargmin',
'nanmax',
'nanmean',
'nanmin',
'nanprod',
'nanstd',
'nansum',
'nanvar',
'nbytes',
'ndim',
'num',
'numba',
'numba_type',
'ones_like',
'operations',
'pad_none',
'parameters',
'positional_axis',
'prettyprint',
'prod',
'ptp',
'ravel',
'real',
'record',
'round',
'run_lengths',
'show',
'singletons',
'softmax',
'sort',
'std',
'str',
'strings_astype',
'sum',
'to_arrow',
'to_arrow_table',
'to_backend',
'to_buffers',
'to_cudf',
'to_cupy',
'to_dataframe',
'to_feather',
'to_jax',
'to_json',
'to_layout',
'to_list',
'to_numpy',
'to_packed',
'to_parquet',
'to_parquet_dataset',
'to_parquet_row_groups',
'to_raggedtensor',
'to_rdataframe',
'to_regular',
'to_tensorflow',
'to_torch',
'tolist',
'transform',
'type',
'types',
'typestr',
'typetracer',
'unflatten',
'unpack',
'unzip',
'validity_error',
'values_astype',
'var',
'where',
'with_field',
'with_name',
'with_named_axis',
'with_parameter',
'without_field',
'without_named_axis',
'without_parameters',
'zeros_like',
'zip']
[8]:
df.a.ak.sum(axis=None)
[8]:
array(28000000)
[9]:
# if output was array-like, it stays on the GPU
# (the sum is recomputed here instead of reading IPython's `_`, whose value
# depends on whichever cell was executed last)
type(df.a.ak.sum(axis=None))
[9]:
cupy.ndarray
[11]:
# fast reduction across three levels of nesting
%timeit df.a.ak.sum(axis=None)
12.6 ms ± 779 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
[12]:
# ufunc maintains structure
np.negative(df.a.ak)
[12]:
0 [[-1, -2, -3], [], [-4, -5]]
1 [[-6, -7]]
2 [[-1, -2, -3], [], [-4, -5]]
3 [[-6, -7]]
4 [[-1, -2, -3], [], [-4, -5]]
...
1999995 [[-6, -7]]
1999996 [[-1, -2, -3], [], [-4, -5]]
1999997 [[-6, -7]]
1999998 [[-1, -2, -3], [], [-4, -5]]
1999999 [[-6, -7]]
Length: 2000000, dtype: list
[10]:
gpu_mem() # created new arrays on GPU, made new cuDF series
256MiB
[13]:
# operator overload
print((df.a.ak + 1).head())
0 [[2, 3, 4], [], [5, 6]]
1 [[7, 8]]
2 [[2, 3, 4], [], [5, 6]]
3 [[7, 8]]
4 [[2, 3, 4], [], [5, 6]]
dtype: list
numba
[14]:
import numba.cuda
ak.numba.register_and_check()
@numba.cuda.jit(extensions=[ak.numba.cuda])
def inner_sum(array, out):
    """CUDA kernel: put into ``out[i]`` the total of all numbers nested under ``array[i]``.

    Each thread handles the entry at its own 1-d grid position; threads whose
    position is not below ``len(array)`` do nothing.
    """
    row = numba.cuda.grid(1)
    if row >= len(array):
        return
    out[row] = 0
    for sublist in array[row]:
        for value in sublist:
            out[row] += value
# one int32 result slot per row of the column, held in a cupy array
n_rows = len(df.a)
out = cp.empty(n_rows, dtype="int32")

# round the block count up so that every row is covered by a thread
blocksize = 256
numblocks = (n_rows + blocksize - 1) // blocksize


def launch_inner_sum(x):
    """Run the ``inner_sum`` kernel on ``x`` (outer-level None dropped), filling ``out``."""
    return inner_sum[numblocks, blocksize](ak.drop_none(x, axis=0), out)


df.a.ak.apply(launch_inner_sum)
out
[14]:
array([15, 13, 15, ..., 13, 15, 13], dtype=int32)
[15]:
%timeit df.a.ak.apply(lambda x: inner_sum[numblocks, blocksize](ak.drop_none(x, axis=0), out))
16.7 ms ± 233 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
[14]:
gpu_mem()
264MiB
slice
[16]:
# pick the first number of the innermost lists, if there is one
df.a.ak[:, :, :1]
[16]:
0 [[1], [], [4]]
1 [[6]]
2 [[1], [], [4]]
3 [[6]]
4 [[1], [], [4]]
...
1999995 [[6]]
1999996 [[1], [], [4]]
1999997 [[6]]
1999998 [[1], [], [4]]
1999999 [[6]]
Length: 2000000, dtype: list
[17]:
# pick the first inner list of each row
df.a.ak[:, 0, :]
[17]:
0 [1, 2, 3]
1 [6, 7]
2 [1, 2, 3]
3 [6, 7]
4 [1, 2, 3]
...
1999995 [6, 7]
1999996 [1, 2, 3]
1999997 [6, 7]
1999998 [1, 2, 3]
1999999 [6, 7]
Length: 2000000, dtype: list
[ ]: