GPU backend¶

This example depends on data in a parquet file, s.parquet, which can be made by calling make_data("s.parquet") with the function below.

import awkward as ak

def make_data(fn, N=1000000):
    """Write the nested-list sample dataset for this example to a parquet file.

    Parameters
    ----------
    fn : destination handed straight to ``ak.to_parquet``.
    N : how many times the two-row pattern is repeated; the file therefore
        holds ``2 * N`` rows in a single field named ``"a"``.
    """
    # two rows of list-of-lists: one with three inner lists (one of them
    # empty), one with a single inner list
    pattern = [
        [[1, 2, 3], [], [4, 5]],
        [[6, 7]],
    ]
    records = ak.Array({"a": pattern * N})
    # NOTE(review): extensionarray=False presumably keeps the file readable by
    # non-Awkward parquet readers such as cuDF - confirm against the awkward docs
    ak.to_parquet(records, fn, extensionarray=False)

The file example/cuda-env.yaml can be used to create a functional environment using conda:

$ conda env create -f example/cuda-env.yaml

[3]:

import awkward as ak
import cupy as cp
import cudf
import numpy as np
import akimbo.cudf
import subprocess

def gpu_mem():
    """Print the GPU memory that ``nvidia-smi`` reports for a python process.

    Best-effort diagnostic: runs ``nvidia-smi | grep py`` and prints the
    second-to-last whitespace-separated token of its output (for example
    ``256MiB``). Always returns ``None``. When the pipeline fails or yields
    nothing parseable (no ``nvidia-smi`` available, no matching process
    line), nothing is printed and no exception escapes, so the notebook
    still runs from top to bottom.
    """
    try:
        report = subprocess.check_output(
            "nvidia-smi | grep py", shell=True, stderr=subprocess.DEVNULL
        )
        # the usage figure is expected to be the token just before the
        # trailing "|" of the last matching table row
        print(report.split()[-2].decode())
    except (subprocess.CalledProcessError, OSError, IndexError):
        # the measurement is optional: skip it rather than break the notebook
        return

ak.__version__, akimbo.__version__

[3]:

('2.7.1', '2024.10.1.dev9+g9f64d31')

[4]:

df = cudf.read_parquet("s.parquet")
gpu_mem()

[5]:

df.dtypes

[5]:

a    list
dtype: object

[6]:

df.iloc[0]  # each element is list-of-lists

[6]:

a    [[1, 2, 3], [], [4, 5]]
Name: 0, dtype: list

[7]:

# the .ak accessor exposes the whole ak.* namespace; many of these are identical to their numpy equivalents
dir(df.a.ak)

[7]:

['Mask',
 'all',
 'almost_equal',
 'angle',
 'annotations',
 'any',
 'apply',
 'argcartesian',
 'argcombinations',
 'argmax',
 'argmin',
 'argsort',
 'array',
 'array_equal',
 'attrs',
 'awkward',
 'backend',
 'behavior',
 'behaviors',
 'broadcast_arrays',
 'broadcast_fields',
 'builder',
 'cartesian',
 'categories',
 'combinations',
 'concatenate',
 'contents',
 'copy',
 'corr',
 'count',
 'count_nonzero',
 'covar',
 'cpp_type',
 'cppyy',
 'drop_none',
 'dt',
 'enforce_type',
 'errors',
 'explode',
 'fields',
 'fill_none',
 'firsts',
 'flatten',
 'forms',
 'forth',
 'from_arrow',
 'from_arrow_schema',
 'from_avro_file',
 'from_buffers',
 'from_categorical',
 'from_cupy',
 'from_dlpack',
 'from_feather',
 'from_iter',
 'from_jax',
 'from_json',
 'from_numpy',
 'from_parquet',
 'from_raggedtensor',
 'from_rdataframe',
 'from_regular',
 'from_tensorflow',
 'from_torch',
 'full_like',
 'highlevel',
 'imag',
 'index',
 'is_categorical',
 'is_none',
 'is_tuple',
 'is_valid',
 'isclose',
 'jax',
 'layout',
 'linear_fit',
 'local_index',
 'mask',
 'max',
 'mean',
 'merge_option_of_records',
 'merge_union_of_records',
 'metadata_from_parquet',
 'min',
 'mixin_class',
 'mixin_class_method',
 'moment',
 'named_axis',
 'nan_to_none',
 'nan_to_num',
 'nanargmax',
 'nanargmin',
 'nanmax',
 'nanmean',
 'nanmin',
 'nanprod',
 'nanstd',
 'nansum',
 'nanvar',
 'nbytes',
 'ndim',
 'num',
 'numba',
 'numba_type',
 'ones_like',
 'operations',
 'pad_none',
 'parameters',
 'positional_axis',
 'prettyprint',
 'prod',
 'ptp',
 'ravel',
 'real',
 'record',
 'round',
 'run_lengths',
 'show',
 'singletons',
 'softmax',
 'sort',
 'std',
 'str',
 'strings_astype',
 'sum',
 'to_arrow',
 'to_arrow_table',
 'to_backend',
 'to_buffers',
 'to_cudf',
 'to_cupy',
 'to_dataframe',
 'to_feather',
 'to_jax',
 'to_json',
 'to_layout',
 'to_list',
 'to_numpy',
 'to_packed',
 'to_parquet',
 'to_parquet_dataset',
 'to_parquet_row_groups',
 'to_raggedtensor',
 'to_rdataframe',
 'to_regular',
 'to_tensorflow',
 'to_torch',
 'tolist',
 'transform',
 'type',
 'types',
 'typestr',
 'typetracer',
 'unflatten',
 'unpack',
 'unzip',
 'validity_error',
 'values_astype',
 'var',
 'where',
 'with_field',
 'with_name',
 'with_named_axis',
 'with_parameter',
 'without_field',
 'without_named_axis',
 'without_parameters',
 'zeros_like',
 'zip']

[8]:

df.a.ak.sum(axis=None)

[8]:

array(28000000)

[9]:

# array-like results stay on the GPU; compute the reduction explicitly instead
# of reading IPython's `_`, which depends on which cell happened to run last
type(df.a.ak.sum(axis=None))

[9]:

cupy.ndarray

[11]:

# fast reduction across three levels of nesting
%timeit df.a.ak.sum(axis=None)

12.6 ms ± 779 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)

[12]:

# ufunc maintains structure
np.negative(df.a.ak)

[12]:

0          [[-1, -2, -3], [], [-4, -5]]
1                            [[-6, -7]]
2          [[-1, -2, -3], [], [-4, -5]]
3                            [[-6, -7]]
4          [[-1, -2, -3], [], [-4, -5]]
                       ...
1999995                      [[-6, -7]]
1999996    [[-1, -2, -3], [], [-4, -5]]
1999997                      [[-6, -7]]
1999998    [[-1, -2, -3], [], [-4, -5]]
1999999                      [[-6, -7]]
Length: 2000000, dtype: list

[10]:

gpu_mem()  # created new arrays on GPU, made new cuDF series

256MiB

[13]:

# operator overload
print((df.a.ak + 1).head())

0    [[2, 3, 4], [], [5, 6]]
1                   [[7, 8]]
2    [[2, 3, 4], [], [5, 6]]
3                   [[7, 8]]
4    [[2, 3, 4], [], [5, 6]]
dtype: list

numba¶

[14]:

import numba.cuda
ak.numba.register_and_check()

@numba.cuda.jit(extensions=[ak.numba.cuda])
def inner_sum(array, out):
    """CUDA kernel: store in ``out[i]`` the sum of every number in row ``i`` of ``array``.

    Each thread takes the row matching its 1-D grid index and walks both
    levels of nesting inside that row. Threads whose index lies past the end
    of ``array`` do nothing.
    """
    row = numba.cuda.grid(1)
    if row >= len(array):
        return
    # reset the slot first: ``out`` is not assumed to be zero-initialised
    out[row] = 0
    for sublist in array[row]:
        for value in sublist:
            out[row] += value

# one int32 result slot per row of the column; contents are uninitialised until the kernel runs
out = cp.empty(len(df.a), dtype="int32")
blocksize = 256
# ceiling division, so numblocks * blocksize is at least the number of rows
numblocks = (len(df.a) + blocksize - 1) // blocksize

# launch the kernel on the array that .ak.apply hands to the callable; results land in `out`
# NOTE(review): ak.drop_none(x, axis=0) presumably removes an option type at the outer level
# so the kernel sees plain lists - confirm
df.a.ak.apply(lambda x: inner_sum[numblocks, blocksize](ak.drop_none(x, axis=0), out))
out

[14]:

array([15, 13, 15, ..., 13, 15, 13], dtype=int32)

[15]:

%timeit df.a.ak.apply(lambda x: inner_sum[numblocks, blocksize](ak.drop_none(x, axis=0), out))

16.7 ms ± 233 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)

[14]:

gpu_mem()

264MiB

slice

[16]:

# pick the first number of the innermost lists, if there is one
df.a.ak[:, :, :1]

[16]:

0          [[1], [], [4]]
1                   [[6]]
2          [[1], [], [4]]
3                   [[6]]
4          [[1], [], [4]]
                ...
1999995             [[6]]
1999996    [[1], [], [4]]
1999997             [[6]]
1999998    [[1], [], [4]]
1999999             [[6]]
Length: 2000000, dtype: list

[17]:

# pick the first inner list of each row
df.a.ak[:, 0, :]

[17]:

0          [1, 2, 3]
1             [6, 7]
2          [1, 2, 3]
3             [6, 7]
4          [1, 2, 3]
             ...
1999995       [6, 7]
1999996    [1, 2, 3]
1999997       [6, 7]
1999998    [1, 2, 3]
1999999       [6, 7]
Length: 2000000, dtype: list

[ ]: