FFT benchmarks

Perform 2D FFT benchmarks (the dimensionality can be changed with the ``ndim`` parameter below) using the CUDA and OpenCL backends of pyvkfft, and compare with scikit-cuda (cuFFT) and gpyfft (clFFT) if they are present.

Note 1: this is now more easily done using the ``pyvkfft-benchmark`` command-line script

Note 2: in this example, we are using a fixed batch size for 1D and 2D transforms (16 arrays transformed in parallel for 2D, 16x16 arrays for 1D), while 3D transforms are not batched. This is different from the command-line benchmark, which keeps a fixed total array size (100’s of MB). This is why the throughput remains low for small sizes, and then becomes larger than the card’s nominal bandwidth because of caching effects, before reaching more normal throughput values.

Note 3: due to a bug, we cannot delete cuFFT plans (or this corrupts the cuda context), so the memory usage will continue to grow during execution. Either do not test cufft (skcuda) or limit the size to avoid this. The command-line script avoids this by using a separate process for each individual test (and is consequently pretty slow).

[1]:
%matplotlib notebook
gpu_name = None

import os
import platform
import gc
from itertools import permutations

# Optional GPU backends. Each group of imports below sets a ``has_*`` flag
# which the rest of the notebook uses to decide which benchmarks to run.

# pyvkfft, CUDA backend (through pycuda)
try:
    import pycuda.driver as cu_drv
    import pycuda.gpuarray as cua
    from pycuda import curandom
    import pyvkfft.cuda
    from pyvkfft.cuda import VkFFTApp as  cuVkFFTApp
    from pyvkfft.base import primes
    has_pyvkfft_cuda = True
except ImportError:
    has_pyvkfft_cuda = False

# pyvkfft, OpenCL backend (through pyopencl)
try:
    import pyopencl as cl
    import pyopencl.array as cla
    from pyopencl import clrandom
    import pyvkfft.opencl
    from pyvkfft.opencl import VkFFTApp as clVkFFTApp
    from pyvkfft.base import primes
    has_pyvkfft_opencl = True
except ImportError:
    has_pyvkfft_opencl = False

# scikit-cuda (cuFFT), used as the CUDA reference
try:
    import pycuda.autoinit
    import pycuda.driver as cu_drv
    import pycuda.gpuarray as cua
    from pycuda import curandom
    import skcuda.fft as cu_fft
    has_skcuda = True
except Exception:
    # Deliberately broad (the original used a bare except), presumably because
    # these imports can fail with more than ImportError, e.g. a missing shared
    # library. 'Exception' still lets KeyboardInterrupt/SystemExit propagate.
    has_skcuda = False

# gpyfft (clFFT), used as the OpenCL reference
try:
    import pyopencl as cl
    import pyopencl.array as cla
    from pyopencl import clrandom
    import gpyfft
    has_gpyfft = True
except Exception:
    # Same reasoning as for skcuda above: broad, but not a bare except
    has_gpyfft = False

import matplotlib.pyplot as plt
import numpy as np
import timeit


/home/esrf/favre/miniconda3/envs/pynx-py311-cu11.7/lib/python3.11/site-packages/skcuda/cublas.py:284: UserWarning: creating CUBLAS context to get version number
  warnings.warn('creating CUBLAS context to get version number')
[2]:
# Name of the GPU actually selected; used for the plot title and the file name
gpu_name_real = None
if has_pyvkfft_opencl or has_gpyfft:
    if 'PYOPENCL_CTX' in os.environ:
        # The user explicitly selected a device through the environment
        cl_ctx = cl.create_some_context()
        gpu_name_real = cl_ctx.devices[0].name
        print("Selected OpenCL device: ", gpu_name_real)
    else:
        cl_ctx = None
        # Find the first OpenCL GPU available (on any platform) and use it
        for p in cl.get_platforms():
            for d in p.get_devices():
                if d.type & cl.device_type.GPU == 0:
                    continue
                gpu_name_real = d.name
                print("Selected OpenCL device: ", d.name)
                cl_ctx = cl.Context(devices=(d,))
                break
            if cl_ctx is not None:
                break
        if cl_ctx is None:
            # Fail with a clear message rather than inside CommandQueue(None)
            raise RuntimeError("No OpenCL GPU device found - set PYOPENCL_CTX "
                               "to select a device manually")
    # Command queue shared by all the OpenCL benchmarks
    cq = cl.CommandQueue(cl_ctx)

if has_pyvkfft_cuda or has_skcuda:
    # Make sure the driver is initialised even when pycuda.autoinit was not
    # imported (i.e. scikit-cuda is missing).
    cu_drv.init()
    if gpu_name is None:
        # No preference: take the first CUDA device
        d = cu_drv.Device(0)
    else:
        # Take the first device whose name contains gpu_name (case-insensitive)
        for i in range(cu_drv.Device.count()):
            d = cu_drv.Device(i)
            if gpu_name.lower() in d.name().lower():
                break
        else:
            # Previously this left cu_ctx undefined and failed much later
            raise RuntimeError("No CUDA device matching gpu_name=%r was found"
                               % gpu_name)
    gpu_name_real = d.name()
    print("Selected  CUDA  device: ", d.name())
    cu_ctx = d.make_context()

Selected OpenCL device:  NVIDIA A40
Selected  CUDA  device:  NVIDIA A40
[3]:
# ---- Benchmark parameters ----
ndim = 2              # Dimensions for the FFT (1, 2 or 3)
nmax = 3072           # Maximum FFT size (e.g. 512 for 3D, 4096 for 2D,...) - nmax is included
dtype = np.complex64  # Data type
radix_max = 7         # Largest allowed prime factor: use 2 for quick tests or 7 (13 is also possible)
# If None, the first OpenCL platform with a GPU is selected. Otherwise match part of the platform name
# NOTE(review): cl_platform is not read by any of the cells visible here - confirm it is still needed
cl_platform = None


nb_repeat = 3         # Perform nb_repeat tests, keep best time

# number of parallel arrays for 2D (nz, n, n) and 1D (nz, nz, n) transforms
nz = 16

plt.figure(figsize=(9.5, 8))

# results["n"] holds the tested sizes. For each available backend,
# results[<name>] holds the throughput in Gbytes/s and results[<name>-dt]
# the best total time (for nb FFT+iFFT couples), one entry per tested size.
# The insertion order below also fixes the column order of the text output.
results = {"n": []}
if has_pyvkfft_opencl:
    results["vkFFT.opencl"] = []
    results["vkFFT.opencl-dt"] = []
if has_gpyfft:
    results["gpyfft[clFFT]"] = []
    results["gpyfft[clFFT]-dt"] = []
if has_pyvkfft_cuda:
    results["vkFFT.cuda"] = []
    results["vkFFT.cuda-dt"] = []
if has_skcuda:
    results["skcuda[cuFFT]"] = []
    results["skcuda[cuFFT]-dt"] = []
    # Keeps the cuFFT plans alive for the whole run (they are never destroyed)
    plans_skcuda = []

# Header line of the text table: array shape followed by one column per backend
if ndim == 1:
    header_results = "%4d x%4d x%4s [%dD]" % (nz, nz, "N", ndim)
elif ndim == 2:
    header_results = "%4d x%4s x%4s [%dD]" % (nz, "N", "N", ndim)
else:
    header_results = "%4s x%4s x%4s [%dD]" % ("N", "N", "N", ndim)
for b in results:
    if b != "n" and "-dt" not in b:
        header_results += "%17s  " % b


# Report the dtype actually benchmarked (was hard-coded to complex64)
print("Gbytes/s and time given for a couple (FFT, iFFT), dtype=%s" % np.dtype(dtype).name)
print()
print(header_results)


# Main benchmark loop: for every size n in [16, nmax] whose largest prime
# factor is <= radix_max, time nb (FFT, iFFT) couples with each available
# backend, keep the best of nb_repeat runs, print one table row and refresh
# the plot. The original comment relates the default limit of 7 to cuFFT.
for n in range(16, nmax+1):
    if max(primes(n)) > radix_max:
        continue
    results["n"].append(n)
    # Estimate number of repeats to last 0.1s with at least 100 GB/s
    nb = int(round(0.1 * 100 / (nz**(3-ndim) * n ** ndim * np.dtype(dtype).itemsize * ndim * 2 * 2 / 1024 ** 3)))
    nb = min(max(nb, 1), 1000)
    # print("%4d (nb=%4d)"%(n, nb))

    # Array shape: the leading axes are batch axes for 1D and 2D transforms
    if ndim == 1:
        sh = nz, nz, n
    elif ndim == 2:
        sh = nz, n, n
    else:
        sh = n, n, n

    # OpenCL backends
    if has_pyvkfft_opencl or has_gpyfft:
        d = clrandom.rand(cq, shape=sh, dtype=np.float32).astype(dtype)

    if has_pyvkfft_opencl:
        dt = 0
        try:
            app = clVkFFTApp(d.shape, d.dtype, queue=cq, ndim=ndim)
            for i in range(nb_repeat):
                cq.finish()
                t0 = timeit.default_timer()
                for _ in range(nb):
                    d = app.ifft(d)
                    d = app.fft(d)
                cq.finish()
                dt1 = timeit.default_timer() - t0
                # Keep the best (smallest) time; dt == 0 means "no timing yet"
                dt = dt1 if dt == 0 else min(dt, dt1)
            #print("%4d %4dx%4d 2D FFT+iFFT dt=%6.2f ms %7.2f Gbytes/s [pyvkfft.opencl]  [nb=%4d]" %
            #      (nz, n, n, dt / nb * 1000, gbps, nb))
            del app
            # Idealised throughput: one read + one write per transformed axis,
            # for both the FFT and the iFFT
            gbps = d.nbytes * nb * ndim * 2 * 2 / dt / 1024 ** 3
        except Exception:
            # Best effort: a failing size is reported with a zero throughput
            gbps = 0
        results["vkFFT.opencl"].append(gbps)
        results["vkFFT.opencl-dt"].append(dt)
        gc.collect()

    if has_gpyfft:
        dt = 0
        # Shuffle axes order to find fastest transform
        for axes in permutations([-1, -2, -3][:ndim]):
            gpyfft_plan = gpyfft.FFT(cl_ctx, cq, d, None, axes=axes)
            for i in range(nb_repeat):
                cq.finish()
                t0 = timeit.default_timer()
                for _ in range(nb):
                    gpyfft_plan.enqueue(forward=True)
                    gpyfft_plan.enqueue(forward=False)
                cq.finish()
                dt1 = timeit.default_timer() - t0
                dt = dt1 if dt == 0 else min(dt, dt1)
            del gpyfft_plan
        gbps = d.nbytes * nb * ndim * 2 * 2 / dt / 1024 ** 3
        #print("%4d %4dx%4d 2D FFT+iFFT dt=%6.2f ms %7.2f Gbytes/s [gpyfft[clFFT]]  [nb=%4d]" %
        #      (nz, n, n, dt / nb * 1000, gbps, nb))
        results["gpyfft[clFFT]"].append(gbps)
        results["gpyfft[clFFT]-dt"].append(dt)

    if has_pyvkfft_opencl or has_gpyfft:
        # Release the OpenCL buffer before allocating the CUDA array
        d.data.release()
        del d
        gc.collect()

    # CUDA backends
    # (the original tested has_pyvkfft_cuda twice, so the array was never
    # allocated when only scikit-cuda was available)
    if has_pyvkfft_cuda or has_skcuda:
        d = curandom.rand(shape=sh, dtype=np.float32).astype(dtype)

    if has_pyvkfft_cuda:
        # Reset before the try block, so that a failure while creating the
        # plan does not report the stale timing of the previous backend
        dt = 0
        try:
            app = cuVkFFTApp(d.shape, d.dtype, ndim=ndim)
            for i in range(nb_repeat):
                cu_ctx.synchronize()
                t0 = timeit.default_timer()
                for _ in range(nb):
                    d = app.ifft(d)
                    d = app.fft(d)
                cu_ctx.synchronize()
                dt1 = timeit.default_timer() - t0
                dt = dt1 if dt == 0 else min(dt, dt1)
            #print("%4d %4dx%4d 2D FFT+iFFT dt=%6.2f ms %7.2f Gbytes/s [pyvkfft.cuda]    [nb=%4d]" %
            #      (nz, n, n, dt / nb * 1000, gbps, nb))
            del app
            gbps = d.nbytes * nb * ndim * 2 * 2 / dt / 1024 ** 3
        except Exception:
            # Best effort: a failing size is reported with a zero throughput
            gbps = 0
        results["vkFFT.cuda"].append(gbps)
        results["vkFFT.cuda-dt"].append(dt)
        gc.collect()

    if has_skcuda:
        # The batch size is chosen to match the shape 'sh' used above
        if ndim == 1:
            plan = cu_fft.Plan(n, dtype, dtype, batch=nz*nz)
        elif ndim == 2:
            plan = cu_fft.Plan((n,n), dtype, dtype, batch=nz)
        else:
            plan = cu_fft.Plan((n,n,n), dtype, dtype, batch=1)
        dt = 0
        for i in range(nb_repeat):
            cu_ctx.synchronize()
            t0 = timeit.default_timer()
            for _ in range(nb):
                cu_fft.fft(d, d, plan)
                cu_fft.ifft(d, d, plan)
            cu_ctx.synchronize()
            dt1 = timeit.default_timer() - t0
            dt = dt1 if dt == 0 else min(dt, dt1)
        gbps = d.nbytes * nb * ndim * 2 * 2 / dt / 1024 ** 3
        #print("%4d %4dx%4d 2D FFT+iFFT dt=%6.2f ms %7.2f Gbytes/s [skcuda[cuFFT]]    [nb=%4d]" %
        #      (nz, n, n, dt / nb * 1000, gbps, nb))
        # del plan
        plans_skcuda.append(plan)  # We can't destroy skcuda plans (bug in cufft)
        results["skcuda[cuFFT]"].append(gbps)
        results["skcuda[cuFFT]-dt"].append(dt)


    if has_pyvkfft_cuda or has_skcuda:
        d.gpudata.free()
        del d
        gc.collect()

    # text output: one row per size, with throughput and time for a single
    # (FFT, iFFT) couple for each backend
    r = "%4d x%4d x %4d      " % sh
    for b in results:
        if b != "n" and "-dt" not in b:
            dt = results[b+'-dt'][-1] / nb
            if dt < 1e-3 :
                r += "%7.2f [%6.2f µs]" % (results[b][-1], dt * 1e6)
            elif dt > 1:
                r += "%7.2f [%6.2f  s]" % (results[b][-1], dt)
            else:
                r += "%7.2f [%6.2f ms]" % (results[b][-1], dt * 1000)
    print(r + "  [nb=%4d]"%nb)


    # Redraw the whole figure with the results accumulated so far
    plt.clf()
    x = results['n']
    for label, color, marker in (("gpyfft[clFFT]", '#00A000', 'v'),
                                 ("skcuda[cuFFT]", '#A00000', '^'),
                                 ("vkFFT.opencl", '#00FF00', 'o'),
                                 ("vkFFT.cuda", '#FF0000', 'o')):
        if label in results:
            y = results[label]
            plt.plot(x, y, color=color, marker=marker, markersize=3, linestyle='', label=label)

    plt.legend(loc='lower right', fontsize=10)
    plt.xlabel("FFT size", fontsize=12)
    plt.ylabel("idealised throughput [Gbytes/s]", fontsize=12)
    plt.suptitle("%dD FFT speed [%s, %s, %s]" % (ndim, gpu_name_real, platform.platform(),
                                                 platform.node()), fontsize=12)
    plt.title("'Ideal' throughput assumes one r+w operation per FFT axis", fontsize=10)
    plt.grid(which='both', alpha=0.3)
    plt.xlim(0)
    plt.ylim(0)
    plt.tight_layout()

    # Force refresh
    plt.draw()
    plt.gcf().canvas.draw()
    plt.pause(.001)

# Save the final figure. str() guards against gpu_name_real being None (no
# device name recorded), which would otherwise raise AttributeError here and
# lose the figure after the whole benchmark has run.
plt.savefig('benchmark-%dDFFT-%s-%s-%s.png'%(ndim, str(gpu_name_real).replace(' ','_'),
                                             platform.platform(), platform.node()))

Gbytes/s and time given for a couple (FFT, iFFT), dtype=complex64

  16 x   N x   N [2D]     vkFFT.opencl      gpyfft[clFFT]         vkFFT.cuda      skcuda[cuFFT]
  16 x  16 x   16        17.37 [ 14.05 µs]   7.43 [ 32.87 µs]  26.93 [  9.07 µs]  23.83 [ 10.24 µs]  [nb=1000]
  16 x  18 x   18        22.08 [ 13.99 µs]   9.27 [ 33.34 µs]  35.83 [  8.62 µs]  27.95 [ 11.06 µs]  [nb=1000]
  16 x  20 x   20        27.68 [ 13.78 µs]  11.35 [ 33.62 µs]  44.56 [  8.56 µs]  35.40 [ 10.77 µs]  [nb=1000]
  16 x  21 x   21        30.73 [ 13.69 µs]  12.28 [ 34.25 µs]  44.27 [  9.50 µs]  37.12 [ 11.33 µs]  [nb=1000]
  16 x  24 x   24        38.59 [ 14.23 µs]  16.41 [ 33.47 µs]  60.57 [  9.07 µs]  52.16 [ 10.53 µs]  [nb=1000]
  16 x  25 x   25        41.75 [ 14.28 µs]  17.21 [ 34.63 µs]  65.74 [  9.07 µs]  58.28 [ 10.23 µs]  [nb=1000]
  16 x  27 x   27        49.26 [ 14.11 µs]  19.91 [ 34.92 µs]  75.13 [  9.25 µs]  67.06 [ 10.37 µs]  [nb=1000]
  16 x  28 x   28        52.65 [ 14.20 µs]  21.57 [ 34.66 µs]  78.34 [  9.54 µs]  66.11 [ 11.31 µs]  [nb=1000]
  16 x  30 x   30        61.19 [ 14.03 µs]  25.50 [ 33.66 µs]  87.13 [  9.85 µs]  73.64 [ 11.66 µs]  [nb=1000]
  16 x  32 x   32        70.43 [ 13.87 µs]  28.57 [ 34.18 µs] 105.78 [  9.23 µs]  86.82 [ 11.25 µs]  [nb=1000]
  16 x  35 x   35        83.83 [ 13.94 µs]  34.04 [ 34.32 µs] 108.79 [ 10.74 µs]  94.36 [ 12.38 µs]  [nb=1000]
  16 x  36 x   36        88.17 [ 14.02 µs]  35.76 [ 34.56 µs] 124.87 [  9.90 µs] 114.77 [ 10.77 µs]  [nb=1000]
  16 x  40 x   40       109.30 [ 13.96 µs]  44.30 [ 34.45 µs] 135.87 [ 11.23 µs] 124.65 [ 12.24 µs]  [nb=1000]
  16 x  42 x   42       120.19 [ 14.00 µs]  49.04 [ 34.31 µs] 156.20 [ 10.77 µs] 137.46 [ 12.24 µs]  [nb=1000]
  16 x  45 x   45       139.18 [ 13.88 µs]  56.66 [ 34.08 µs] 148.36 [ 13.02 µs] 153.47 [ 12.58 µs]  [nb=1000]
  16 x  48 x   48       156.22 [ 14.07 µs]  63.33 [ 34.69 µs] 198.77 [ 11.05 µs] 179.30 [ 12.25 µs]  [nb=1000]
  16 x  49 x   49       161.70 [ 14.16 µs]  67.16 [ 34.09 µs] 200.96 [ 11.39 µs] 178.01 [ 12.86 µs]  [nb=1000]
  16 x  50 x   50       171.68 [ 13.89 µs]  69.07 [ 34.52 µs] 194.61 [ 12.25 µs] 184.67 [ 12.91 µs]  [nb=1000]
  16 x  54 x   54       197.31 [ 14.09 µs]  80.98 [ 34.34 µs] 245.09 [ 11.35 µs] 206.34 [ 13.48 µs]  [nb=1000]
  16 x  56 x   56       212.83 [ 14.05 µs]  86.92 [ 34.41 µs] 270.91 [ 11.04 µs] 238.15 [ 12.56 µs]  [nb=1000]
  16 x  60 x   60       245.59 [ 13.98 µs]  99.69 [ 34.44 µs] 266.54 [ 12.88 µs] 266.33 [ 12.89 µs]  [nb=1000]
  16 x  63 x   63       270.83 [ 13.98 µs] 109.18 [ 34.67 µs] 320.94 [ 11.79 µs] 269.14 [ 14.06 µs]  [nb=1000]
  16 x  64 x   64       277.46 [ 14.08 µs] 112.67 [ 34.67 µs] 384.08 [ 10.17 µs] 331.09 [ 11.80 µs]  [nb=1000]
  16 x  70 x   70       333.32 [ 14.02 µs] 121.70 [ 38.40 µs] 371.02 [ 12.60 µs] 330.69 [ 14.13 µs]  [nb=1000]
  16 x  72 x   72       347.05 [ 14.25 µs] 142.32 [ 34.74 µs] 366.59 [ 13.49 µs] 350.17 [ 14.12 µs]  [nb=1000]
  16 x  75 x   75       342.13 [ 15.68 µs] 156.06 [ 34.37 µs] 333.35 [ 16.09 µs] 181.28 [ 29.59 µs]  [nb=1000]
  16 x  80 x   80       440.13 [ 13.87 µs] 177.02 [ 34.48 µs] 461.22 [ 13.23 µs] 409.82 [ 14.89 µs]  [nb=1000]
  16 x  81 x   81       422.45 [ 14.81 µs] 181.59 [ 34.46 µs] 449.75 [ 13.91 µs] 414.34 [ 15.10 µs]  [nb=1000]
  16 x  84 x   84       431.07 [ 15.61 µs] 193.87 [ 34.71 µs] 446.61 [ 15.07 µs] 447.18 [ 15.05 µs]  [nb=1000]
  16 x  90 x   90       497.34 [ 15.53 µs] 221.91 [ 34.81 µs] 489.66 [ 15.78 µs] 413.43 [ 18.68 µs]  [nb=1000]
  16 x  96 x   96       566.17 [ 15.52 µs] 250.01 [ 35.15 µs] 560.65 [ 15.68 µs] 486.11 [ 18.08 µs]  [nb=1000]
  16 x  98 x   98       519.54 [ 17.63 µs] 258.84 [ 35.39 µs] 546.88 [ 16.75 µs] 249.31 [ 36.74 µs]  [nb=1000]
  16 x 100 x  100       607.65 [ 15.69 µs] 280.02 [ 34.06 µs] 639.92 [ 14.90 µs] 546.26 [ 17.46 µs]  [nb=1000]
  16 x 105 x  105       540.10 [ 19.47 µs] 167.52 [ 62.77 µs] 538.18 [ 19.54 µs] 514.01 [ 20.46 µs]  [nb= 951]
  16 x 108 x  108       616.07 [ 18.06 µs] 325.69 [ 34.15 µs] 648.63 [ 17.15 µs] 578.83 [ 19.22 µs]  [nb= 899]
  16 x 112 x  112       677.84 [ 17.65 µs] 340.48 [ 35.14 µs] 709.05 [ 16.87 µs] 657.14 [ 18.20 µs]  [nb= 836]
  16 x 120 x  120       717.43 [ 19.14 µs] 391.34 [ 35.09 µs] 782.99 [ 17.54 µs] 681.05 [ 20.16 µs]  [nb= 728]
  16 x 125 x  125       788.68 [ 18.89 µs] 369.43 [ 40.34 µs] 799.42 [ 18.64 µs] 722.87 [ 20.61 µs]  [nb= 671]
  16 x 126 x  126       722.78 [ 20.95 µs] 345.32 [ 43.85 µs] 759.39 [ 19.94 µs] 645.95 [ 23.44 µs]  [nb= 660]
  16 x 128 x  128       859.25 [ 18.18 µs] 416.98 [ 37.47 µs] 856.53 [ 18.24 µs] 836.59 [ 18.68 µs]  [nb= 640]
  16 x 135 x  135       712.28 [ 24.40 µs] 411.73 [ 42.21 µs] 733.55 [ 23.69 µs] 656.74 [ 26.47 µs]  [nb= 575]
  16 x 140 x  140       857.26 [ 21.80 µs] 358.17 [ 52.19 µs] 902.31 [ 20.72 µs] 766.14 [ 24.40 µs]  [nb= 535]
  16 x 144 x  144       976.17 [ 20.26 µs] 440.53 [ 44.89 µs]1018.93 [ 19.41 µs] 931.28 [ 21.23 µs]  [nb= 506]
  16 x 147 x  147       765.43 [ 26.92 µs] 434.85 [ 47.39 µs] 813.13 [ 25.34 µs] 374.65 [ 55.01 µs]  [nb= 485]
  16 x 150 x  150       944.70 [ 22.71 µs] 430.89 [ 49.80 µs] 985.92 [ 21.76 µs] 736.20 [ 29.15 µs]  [nb= 466]
  16 x 160 x  160      1049.86 [ 23.25 µs] 492.96 [ 49.53 µs]1105.16 [ 22.09 µs] 926.83 [ 26.34 µs]  [nb= 410]
  16 x 162 x  162       934.20 [ 26.79 µs] 355.70 [ 70.36 µs] 974.38 [ 25.69 µs] 765.53 [ 32.69 µs]  [nb= 400]
  16 x 168 x  168      1090.51 [ 24.68 µs] 396.20 [ 67.94 µs]1149.26 [ 23.42 µs] 803.34 [ 33.51 µs]  [nb= 372]
  16 x 175 x  175       943.61 [ 30.95 µs] 440.78 [ 66.26 µs] 966.92 [ 30.21 µs] 786.20 [ 37.15 µs]  [nb= 342]
  16 x 180 x  180      1017.56 [ 30.37 µs] 455.85 [ 67.78 µs]1114.08 [ 27.73 µs] 906.71 [ 34.08 µs]  [nb= 324]
  16 x 189 x  189       938.56 [ 36.30 µs] 377.63 [ 90.21 µs] 978.13 [ 34.83 µs] 828.66 [ 41.11 µs]  [nb= 294]
  16 x 192 x  192      1202.06 [ 29.25 µs] 385.26 [ 91.25 µs]1221.35 [ 28.78 µs]1018.30 [ 34.52 µs]  [nb= 284]
  16 x 196 x  196      1118.50 [ 32.75 µs] 383.94 [ 95.42 µs]1223.92 [ 29.93 µs] 999.29 [ 36.66 µs]  [nb= 273]
  16 x 200 x  200       991.16 [ 38.49 µs] 387.03 [ 98.56 µs]1039.61 [ 36.69 µs]1000.03 [ 38.15 µs]  [nb= 262]
  16 x 210 x  210       839.00 [ 50.13 µs] 266.28 [157.94 µs] 845.03 [ 49.77 µs] 947.95 [ 44.37 µs]  [nb= 238]
  16 x 216 x  216       945.92 [ 47.04 µs] 436.13 [102.02 µs] 950.91 [ 46.79 µs] 857.16 [ 51.91 µs]  [nb= 225]
  16 x 224 x  224       747.69 [ 64.00 µs] 481.54 [ 99.37 µs] 772.77 [ 61.92 µs] 747.68 [ 64.00 µs]  [nb= 209]
  16 x 225 x  225       796.05 [ 60.65 µs] 362.64 [133.14 µs] 819.09 [ 58.94 µs] 744.66 [ 64.83 µs]  [nb= 207]
  16 x 240 x  240       653.36 [ 84.08 µs] 503.35 [109.13 µs] 644.44 [ 85.24 µs] 515.99 [106.46 µs]  [nb= 182]
  16 x 243 x  243       532.06 [105.84 µs] 329.38 [170.97 µs] 529.90 [106.27 µs] 553.53 [101.74 µs]  [nb= 178]
  16 x 245 x  245       481.78 [118.82 µs] 340.85 [167.94 µs] 480.13 [119.23 µs] 228.43 [250.60 µs]  [nb= 175]
  16 x 250 x  250       516.05 [115.50 µs] 322.28 [184.95 µs] 519.26 [114.79 µs] 464.75 [128.25 µs]  [nb= 168]
  16 x 252 x  252       455.43 [132.98 µs] 363.80 [166.47 µs] 454.62 [133.22 µs] 489.95 [123.61 µs]  [nb= 165]
  16 x 256 x  256       455.62 [137.18 µs] 360.28 [173.48 µs] 539.26 [115.90 µs] 504.39 [123.91 µs]  [nb= 160]
  16 x 270 x  270       473.46 [146.84 µs] 348.35 [199.58 µs] 471.86 [147.34 µs] 397.18 [175.04 µs]  [nb= 144]
  16 x 280 x  280       446.95 [167.29 µs] 289.03 [258.68 µs] 442.96 [168.79 µs] 460.40 [162.40 µs]  [nb= 134]
  16 x 288 x  288       491.52 [160.93 µs] 294.61 [268.50 µs] 498.32 [158.74 µs] 475.06 [166.51 µs]  [nb= 126]
  16 x 294 x  294       466.36 [176.76 µs] 327.90 [251.39 µs] 466.68 [176.63 µs] 159.38 [517.20 µs]  [nb= 121]
  16 x 300 x  300       470.35 [182.48 µs] 313.36 [273.90 µs] 471.32 [182.11 µs] 471.74 [181.94 µs]  [nb= 117]
  16 x 315 x  315       416.85 [227.01 µs] 134.54 [703.34 µs] 417.98 [226.40 µs] 442.27 [213.96 µs]  [nb= 106]
  16 x 320 x  320       435.06 [224.46 µs] 358.69 [272.26 µs] 429.21 [227.52 µs] 473.13 [206.41 µs]  [nb= 102]
  16 x 324 x  324       480.68 [208.27 µs] 321.97 [310.94 µs] 481.39 [207.97 µs] 472.57 [211.85 µs]  [nb= 100]
  16 x 336 x  336       464.09 [232.00 µs] 307.76 [349.84 µs] 463.35 [232.37 µs] 476.69 [225.86 µs]  [nb=  93]
  16 x 343 x  343       413.13 [271.58 µs] 351.23 [319.45 µs] 415.23 [270.21 µs] 469.14 [239.16 µs]  [nb=  89]
  16 x 350 x  350       425.37 [274.64 µs] 285.35 [409.40 µs] 423.67 [275.74 µs] 456.04 [256.17 µs]  [nb=  86]
  16 x 360 x  360       448.57 [275.53 µs] 288.05 [429.07 µs] 449.07 [275.22 µs] 468.74 [263.68 µs]  [nb=  81]
  16 x 375 x  375       402.62 [333.09 µs] 238.22 [562.97 µs] 408.95 [327.94 µs] 223.25 [600.72 µs]  [nb=  75]
  16 x 378 x  378       424.27 [321.18 µs] 325.78 [418.27 µs] 424.91 [320.69 µs] 441.49 [308.65 µs]  [nb=  73]
  16 x 384 x  384       457.72 [307.23 µs] 264.06 [532.55 µs] 457.16 [307.60 µs] 474.51 [296.36 µs]  [nb=  71]
  16 x 392 x  392       458.17 [319.85 µs] 269.06 [544.65 µs] 458.65 [319.52 µs] 480.94 [304.71 µs]  [nb=  68]
  16 x 400 x  400       456.98 [333.91 µs] 257.47 [592.64 µs] 457.56 [333.48 µs] 480.03 [317.87 µs]  [nb=  66]
  16 x 405 x  405       430.97 [362.96 µs] 275.48 [567.83 µs] 433.71 [360.67 µs] 484.01 [323.19 µs]  [nb=  64]
  16 x 420 x  420       457.77 [367.50 µs] 149.30 [  1.13 ms] 459.37 [366.21 µs] 471.92 [356.48 µs]  [nb=  59]
  16 x 432 x  432       461.36 [385.77 µs] 262.55 [677.87 µs] 461.80 [385.40 µs] 485.23 [366.79 µs]  [nb=  56]
  16 x 441 x  441       364.29 [509.14 µs] 247.65 [748.91 µs] 360.70 [514.20 µs] 431.11 [430.22 µs]  [nb=  54]
  16 x 448 x  448       466.89 [409.96 µs] 264.33 [724.11 µs] 466.52 [410.28 µs] 481.45 [397.56 µs]  [nb=  52]
  16 x 450 x  450       464.98 [415.33 µs] 293.83 [657.25 µs] 463.17 [416.95 µs] 457.95 [421.71 µs]  [nb=  52]
  16 x 480 x  480       456.65 [481.17 µs] 347.30 [632.66 µs] 458.31 [479.43 µs] 479.60 [458.15 µs]  [nb=  46]
  16 x 486 x  486       449.32 [501.33 µs] 299.56 [751.94 µs] 450.85 [499.62 µs] 487.09 [462.45 µs]  [nb=  44]
  16 x 490 x  490       417.34 [548.65 µs] 298.64 [766.74 µs] 417.97 [547.83 µs] 455.13 [503.10 µs]  [nb=  44]
  16 x 500 x  500       464.45 [513.34 µs] 247.80 [962.16 µs] 450.03 [529.79 µs] 243.92 [977.46 µs]  [nb=  42]
  16 x 504 x  504       467.28 [518.42 µs] 263.23 [920.29 µs] 468.00 [517.63 µs] 480.56 [504.10 µs]  [nb=  41]
  16 x 512 x  512       464.95 [537.70 µs] 299.22 [835.49 µs] 462.94 [540.03 µs] 488.71 [511.56 µs]  [nb=  40]
  16 x 525 x  525       397.14 [661.87 µs] 136.24 [  1.93 ms] 398.96 [658.86 µs] 161.17 [  1.63 ms]  [nb=  38]
  16 x 540 x  540       468.98 [592.97 µs] 253.67 [  1.10 ms] 472.63 [588.40 µs] 479.60 [579.84 µs]  [nb=  36]
  16 x 560 x  560       474.31 [630.55 µs] 376.80 [793.71 µs] 475.26 [629.28 µs] 482.70 [619.58 µs]  [nb=  33]
  16 x 567 x  567       389.28 [787.59 µs] 244.83 [  1.25 ms] 396.56 [773.14 µs] 482.07 [636.00 µs]  [nb=  33]
  16 x 576 x  576       458.97 [689.38 µs] 229.94 [  1.38 ms] 464.17 [681.66 µs] 488.97 [647.08 µs]  [nb=  32]
  16 x 588 x  588       484.85 [680.06 µs] 267.01 [  1.23 ms] 486.10 [678.31 µs] 485.33 [679.39 µs]  [nb=  30]
  16 x 600 x  600       450.53 [762.04 µs] 227.06 [  1.51 ms] 449.97 [762.99 µs] 485.59 [707.02 µs]  [nb=  29]
  16 x 625 x  625       424.86 [876.83 µs] 286.49 [  1.30 ms] 423.72 [879.19 µs] 481.49 [773.70 µs]  [nb=  27]
  16 x 630 x  630       413.33 [915.77 µs] 123.92 [  3.05 ms] 429.37 [881.55 µs] 446.90 [846.97 µs]  [nb=  26]
  16 x 640 x  640       454.62 [859.24 µs] 223.61 [  1.75 ms] 456.78 [855.16 µs] 496.19 [787.25 µs]  [nb=  26]
  16 x 648 x  648       470.04 [851.95 µs] 231.02 [  1.73 ms] 472.95 [846.71 µs] 487.32 [821.75 µs]  [nb=  25]
  16 x 672 x  672       477.09 [902.69 µs] 249.19 [  1.73 ms] 479.85 [897.50 µs] 492.57 [874.32 µs]  [nb=  23]
  16 x 675 x  675       364.03 [  1.19 ms] 236.77 [  1.84 ms] 363.92 [  1.19 ms] 419.45 [  1.04 ms]  [nb=  23]
  16 x 686 x  686       407.91 [  1.10 ms] 240.23 [  1.87 ms] 415.34 [  1.08 ms] 241.77 [  1.86 ms]  [nb=  22]
  16 x 700 x  700       458.50 [  1.02 ms] 233.83 [  2.00 ms] 460.51 [  1.01 ms] 417.62 [  1.12 ms]  [nb=  21]
  16 x 720 x  720       457.21 [  1.08 ms] 206.65 [  2.39 ms] 459.59 [  1.08 ms] 485.25 [  1.02 ms]  [nb=  20]
  16 x 729 x  729       349.66 [  1.45 ms] 305.01 [  1.66 ms] 348.76 [  1.45 ms] 378.00 [  1.34 ms]  [nb=  20]
  16 x 735 x  735       381.74 [  1.35 ms] 126.07 [  4.09 ms] 380.20 [  1.36 ms] 160.66 [  3.21 ms]  [nb=  19]
  16 x 750 x  750       472.57 [  1.14 ms] 205.70 [  2.61 ms] 473.54 [  1.13 ms] 474.12 [  1.13 ms]  [nb=  19]
  16 x 756 x  756       460.02 [  1.18 ms] 229.85 [  2.37 ms] 466.71 [  1.17 ms] 484.06 [  1.13 ms]  [nb=  18]
  16 x 768 x  768       473.89 [  1.19 ms] 214.99 [  2.62 ms] 456.49 [  1.23 ms] 496.11 [  1.13 ms]  [nb=  18]
  16 x 784 x  784       472.43 [  1.24 ms] 216.92 [  2.70 ms] 473.86 [  1.24 ms] 489.36 [  1.20 ms]  [nb=  17]
  16 x 800 x  800       466.61 [  1.31 ms] 232.97 [  2.62 ms] 464.60 [  1.31 ms] 477.35 [  1.28 ms]  [nb=  16]
  16 x 810 x  810       472.88 [  1.32 ms] 230.09 [  2.72 ms] 472.90 [  1.32 ms] 442.94 [  1.41 ms]  [nb=  16]
  16 x 840 x  840       459.59 [  1.46 ms] 148.47 [  4.53 ms] 461.70 [  1.46 ms] 483.05 [  1.39 ms]  [nb=  15]
  16 x 864 x  864       470.25 [  1.51 ms] 222.85 [  3.19 ms] 470.23 [  1.51 ms] 482.55 [  1.48 ms]  [nb=  14]
  16 x 875 x  875       433.65 [  1.68 ms] 196.18 [  3.72 ms] 427.18 [  1.71 ms] 428.86 [  1.70 ms]  [nb=  14]
  16 x 882 x  882       452.61 [  1.64 ms] 218.67 [  3.39 ms] 451.57 [  1.64 ms] 159.85 [  4.64 ms]  [nb=  13]
  16 x 896 x  896       471.80 [  1.62 ms] 209.23 [  3.66 ms] 472.10 [  1.62 ms] 490.05 [  1.56 ms]  [nb=  13]
  16 x 900 x  900       461.07 [  1.68 ms] 208.75 [  3.70 ms] 463.44 [  1.67 ms] 483.46 [  1.60 ms]  [nb=  13]
  16 x 945 x  945       383.32 [  2.22 ms] 129.26 [  6.59 ms] 385.42 [  2.21 ms] 464.24 [  1.83 ms]  [nb=  12]
  16 x 960 x  960       460.95 [  1.91 ms] 214.25 [  4.10 ms] 461.48 [  1.90 ms] 485.00 [  1.81 ms]  [nb=  11]
  16 x 972 x  972       460.40 [  1.96 ms] 201.12 [  4.48 ms] 466.29 [  1.93 ms] 464.10 [  1.94 ms]  [nb=  11]
  16 x 980 x  980       480.05 [  1.91 ms] 208.41 [  4.39 ms] 482.06 [  1.90 ms] 484.93 [  1.89 ms]  [nb=  11]
  16 x1000 x 1000       463.97 [  2.06 ms] 209.39 [  4.55 ms] 465.85 [  2.05 ms] 489.52 [  1.95 ms]  [nb=  10]
  16 x1008 x 1008       473.54 [  2.05 ms] 211.16 [  4.59 ms] 472.48 [  2.05 ms] 375.41 [  2.58 ms]  [nb=  10]
  16 x1024 x 1024       463.64 [  2.16 ms] 221.84 [  4.51 ms] 481.30 [  2.08 ms] 501.26 [  1.99 ms]  [nb=  10]
  16 x1029 x 1029       419.99 [  2.40 ms] 148.83 [  6.78 ms] 420.84 [  2.40 ms] 234.40 [  4.31 ms]  [nb=  10]
  16 x1050 x 1050       425.48 [  2.47 ms] 102.56 [ 10.25 ms] 427.90 [  2.46 ms] 430.03 [  2.45 ms]  [nb=  10]
  16 x1080 x 1080       462.02 [  2.41 ms] 196.93 [  5.65 ms] 462.80 [  2.40 ms] 253.64 [  4.39 ms]  [nb=   9]
  16 x1120 x 1120       467.83 [  2.56 ms] 211.68 [  5.65 ms] 467.15 [  2.56 ms] 153.14 [  7.81 ms]  [nb=   8]
  16 x1125 x 1125       394.53 [  3.06 ms] 166.73 [  7.24 ms] 395.78 [  3.05 ms] 478.98 [  2.52 ms]  [nb=   8]
  16 x1134 x 1134       427.38 [  2.87 ms] 164.24 [  7.47 ms] 430.40 [  2.85 ms] 480.76 [  2.55 ms]  [nb=   8]
  16 x1152 x 1152       466.34 [  2.71 ms] 183.69 [  6.89 ms] 466.98 [  2.71 ms] 469.82 [  2.69 ms]  [nb=   8]
  16 x1176 x 1176       474.82 [  2.78 ms] 182.06 [  7.24 ms] 478.67 [  2.76 ms] 475.21 [  2.78 ms]  [nb=   8]
  16 x1200 x 1200       458.44 [  3.00 ms] 158.14 [  8.68 ms] 458.62 [  2.99 ms] 455.17 [  3.02 ms]  [nb=   7]
  16 x1215 x 1215       405.72 [  3.47 ms] 163.39 [  8.62 ms] 403.88 [  3.49 ms] 405.03 [  3.48 ms]  [nb=   7]
  16 x1225 x 1225       481.32 [  2.97 ms] 178.58 [  8.01 ms] 479.50 [  2.98 ms] 234.22 [  6.11 ms]  [nb=   7]
  16 x1250 x 1250       418.50 [  3.56 ms] 171.34 [  8.70 ms] 419.05 [  3.56 ms] 482.82 [  3.09 ms]  [nb=   7]
  16 x1260 x 1260       448.76 [  3.37 ms]  88.92 [ 17.03 ms] 448.34 [  3.38 ms] 480.82 [  3.15 ms]  [nb=   7]
  16 x1280 x 1280       451.26 [  3.46 ms] 161.48 [  9.68 ms] 453.08 [  3.45 ms] 452.22 [  3.46 ms]  [nb=   6]
  16 x1296 x 1296       480.61 [  3.33 ms] 170.00 [  9.42 ms] 482.20 [  3.32 ms] 339.56 [  4.72 ms]  [nb=   6]
  16 x1323 x 1323       377.04 [  4.43 ms] 159.84 [ 10.44 ms] 374.51 [  4.46 ms] 475.00 [  3.51 ms]  [nb=   6]
  16 x1344 x 1344       484.81 [  3.55 ms] 119.01 [ 14.47 ms] 489.38 [  3.52 ms] 484.19 [  3.56 ms]  [nb=   6]
  16 x1350 x 1350       421.71 [  4.12 ms] 164.06 [ 10.59 ms] 422.59 [  4.11 ms] 424.84 [  4.09 ms]  [nb=   6]
  16 x1372 x 1372       445.03 [  4.03 ms] 164.17 [ 10.93 ms] 448.89 [  4.00 ms] 292.32 [  6.14 ms]  [nb=   6]
  16 x1400 x 1400       470.34 [  3.97 ms] 135.43 [ 13.80 ms] 470.67 [  3.97 ms] 458.68 [  4.08 ms]  [nb=   5]
  16 x1440 x 1440       439.82 [  4.50 ms] 139.45 [ 14.18 ms] 438.70 [  4.51 ms] 463.12 [  4.27 ms]  [nb=   5]
  16 x1458 x 1458       474.52 [  4.27 ms] 200.25 [ 10.12 ms] 475.15 [  4.27 ms] 477.82 [  4.24 ms]  [nb=   5]
  16 x1470 x 1470       401.99 [  5.13 ms]  92.36 [ 22.31 ms] 408.39 [  5.05 ms] 471.70 [  4.37 ms]  [nb=   5]
  16 x1500 x 1500       433.46 [  4.95 ms] 113.83 [ 18.85 ms] 434.49 [  4.94 ms] 430.91 [  4.98 ms]  [nb=   5]
  16 x1512 x 1512       469.40 [  4.64 ms] 179.66 [ 12.14 ms] 473.55 [  4.60 ms] 472.05 [  4.62 ms]  [nb=   5]
  16 x1536 x 1536       336.38 [  6.69 ms] 157.13 [ 14.32 ms] 420.35 [  5.35 ms] 468.69 [  4.80 ms]  [nb=   4]
  16 x1568 x 1568       336.53 [  6.97 ms] 159.12 [ 14.74 ms] 480.03 [  4.88 ms] 341.18 [  6.87 ms]  [nb=   4]
  16 x1575 x 1575       302.73 [  7.81 ms] 165.06 [ 14.33 ms] 463.14 [  5.11 ms] 250.28 [  9.45 ms]  [nb=   4]
  16 x1600 x 1600       336.15 [  7.26 ms] 125.13 [ 19.51 ms] 475.61 [  5.13 ms] 476.01 [  5.13 ms]  [nb=   4]
  16 x1620 x 1620       328.32 [  7.62 ms] 125.82 [ 19.89 ms] 469.05 [  5.34 ms] 469.59 [  5.33 ms]  [nb=   4]
  16 x1680 x 1680       335.30 [  8.03 ms] 134.82 [ 19.97 ms] 470.27 [  5.72 ms] 467.35 [  5.76 ms]  [nb=   4]
  16 x1701 x 1701       305.72 [  9.03 ms] 113.54 [ 24.30 ms] 457.57 [  6.03 ms] 306.64 [  9.00 ms]  [nb=   4]
  16 x1715 x 1715       305.04 [  9.20 ms] 101.35 [ 27.68 ms] 450.96 [  6.22 ms] 236.72 [ 11.85 ms]  [nb=   4]
  16 x1728 x 1728       335.60 [  8.49 ms] 141.13 [ 20.18 ms] 473.28 [  6.02 ms] 343.60 [  8.29 ms]  [nb=   4]
  16 x1750 x 1750       319.08 [  9.15 ms] 129.90 [ 22.48 ms] 455.63 [  6.41 ms] 164.16 [ 17.79 ms]  [nb=   3]
  16 x1764 x 1764       329.40 [  9.01 ms] 132.94 [ 22.32 ms] 438.38 [  6.77 ms] 432.85 [  6.86 ms]  [nb=   3]
  16 x1792 x 1792       337.12 [  9.08 ms] 159.94 [ 19.15 ms] 475.56 [  6.44 ms] 332.43 [  9.21 ms]  [nb=   3]
  16 x1800 x 1800       331.75 [  9.31 ms]  99.97 [ 30.91 ms] 462.83 [  6.68 ms] 460.91 [  6.70 ms]  [nb=   3]
  16 x1875 x 1875       301.52 [ 11.12 ms] 101.57 [ 33.01 ms] 419.99 [  7.98 ms] 324.05 [ 10.35 ms]  [nb=   3]
  16 x1890 x 1890       319.61 [ 10.66 ms]  85.20 [ 39.98 ms] 458.96 [  7.42 ms] 366.57 [  9.29 ms]  [nb=   3]
  16 x1920 x 1920       337.12 [ 10.43 ms] 111.35 [ 31.57 ms] 463.68 [  7.58 ms] 259.65 [ 13.54 ms]  [nb=   3]
  16 x1944 x 1944       333.33 [ 10.81 ms] 131.67 [ 27.37 ms] 467.69 [  7.71 ms] 333.56 [ 10.80 ms]  [nb=   3]
  16 x1960 x 1960       332.88 [ 11.01 ms] 107.42 [ 34.11 ms] 460.95 [  7.95 ms] 355.90 [ 10.29 ms]  [nb=   3]
  16 x2000 x 2000       333.62 [ 11.43 ms] 104.34 [ 36.56 ms] 455.34 [  8.38 ms] 222.81 [ 17.12 ms]  [nb=   3]
  16 x2016 x 2016       335.99 [ 11.54 ms] 102.87 [ 37.68 ms] 463.87 [  8.36 ms] 307.27 [ 12.61 ms]  [nb=   3]
  16 x2025 x 2025       293.51 [ 13.32 ms] 105.01 [ 37.24 ms] 421.79 [  9.27 ms] 324.80 [ 12.04 ms]  [nb=   3]
  16 x2048 x 2048       253.37 [ 15.79 ms] 153.92 [ 25.99 ms] 479.09 [  8.35 ms] 480.63 [  8.32 ms]  [nb=   2]
  16 x2058 x 2058       321.07 [ 12.58 ms]  93.70 [ 43.11 ms] 429.18 [  9.41 ms] 433.28 [  9.32 ms]  [nb=   2]
  16 x2100 x 2100       327.38 [ 12.85 ms]  76.46 [ 55.00 ms] 436.49 [  9.64 ms] 418.56 [ 10.05 ms]  [nb=   2]
  16 x2160 x 2160       334.20 [ 13.31 ms]  94.12 [ 47.27 ms] 449.48 [  9.90 ms] 309.32 [ 14.38 ms]  [nb=   2]
  16 x2187 x 2187       301.78 [ 15.11 ms] 130.61 [ 34.92 ms] 419.17 [ 10.88 ms] 417.63 [ 10.92 ms]  [nb=   2]
  16 x2205 x 2205       297.07 [ 15.61 ms]  91.45 [ 50.70 ms] 410.05 [ 11.31 ms] 428.99 [ 10.81 ms]  [nb=   2]
  16 x2240 x 2240       332.98 [ 14.37 ms] 101.65 [ 47.07 ms] 452.66 [ 10.57 ms] 400.02 [ 11.96 ms]  [nb=   2]
  16 x2250 x 2250       318.07 [ 15.18 ms]  88.91 [ 54.30 ms] 410.10 [ 11.77 ms] 405.68 [ 11.90 ms]  [nb=   2]
  16 x2268 x 2268       331.10 [ 14.82 ms] 104.44 [ 46.97 ms] 443.30 [ 11.07 ms] 289.89 [ 16.92 ms]  [nb=   2]
  16 x2304 x 2304       336.49 [ 15.05 ms] 111.95 [ 45.22 ms] 456.53 [ 11.09 ms] 436.33 [ 11.60 ms]  [nb=   2]
  16 x2352 x 2352       335.38 [ 15.73 ms]  94.37 [ 55.90 ms] 436.33 [ 12.09 ms] 404.67 [ 13.04 ms]  [nb=   2]
  16 x2400 x 2400       331.52 [ 16.57 ms]  95.05 [ 57.79 ms] 434.29 [ 12.65 ms] 429.58 [ 12.79 ms]  [nb=   2]
  16 x2401 x 2401       296.94 [ 18.51 ms]  93.88 [ 58.56 ms] 401.97 [ 13.68 ms] 324.07 [ 16.96 ms]  [nb=   2]
  16 x2430 x 2430       320.60 [ 17.56 ms]  96.51 [ 58.35 ms] 414.31 [ 13.59 ms] 434.20 [ 12.97 ms]  [nb=   2]
  16 x2450 x 2450       312.62 [ 18.31 ms] 130.68 [ 43.80 ms] 425.27 [ 13.46 ms] 429.26 [ 13.34 ms]  [nb=   2]
  16 x2500 x 2500       318.23 [ 18.73 ms]  94.76 [ 62.90 ms] 431.42 [ 13.82 ms] 443.60 [ 13.44 ms]  [nb=   2]
  16 x2520 x 2520       331.55 [ 18.27 ms] 123.93 [ 48.87 ms] 435.57 [ 13.90 ms] 405.85 [ 14.92 ms]  [nb=   2]
  16 x2560 x 2560       338.19 [ 18.48 ms]  95.37 [ 65.54 ms] 430.70 [ 14.51 ms] 224.63 [ 27.82 ms]  [nb=   2]
  16 x2592 x 2592       334.79 [ 19.14 ms] 100.65 [ 63.66 ms] 437.05 [ 14.66 ms] 219.92 [ 29.13 ms]  [nb=   2]
  16 x2625 x 2625       303.64 [ 21.64 ms] 130.96 [ 50.18 ms] 388.86 [ 16.90 ms] 415.21 [ 15.83 ms]  [nb=   2]
  16 x2646 x 2646       322.12 [ 20.73 ms]  77.84 [ 85.77 ms] 413.50 [ 16.15 ms] 429.86 [ 15.53 ms]  [nb=   1]
  16 x2688 x 2688       336.35 [ 20.49 ms]  90.01 [ 76.55 ms] 437.98 [ 15.73 ms] 387.28 [ 17.79 ms]  [nb=   1]
  16 x2700 x 2700       324.08 [ 21.45 ms]  85.56 [ 81.26 ms] 425.73 [ 16.33 ms] 284.92 [ 24.40 ms]  [nb=   1]
  16 x2744 x 2744       332.12 [ 21.62 ms] 103.39 [ 69.45 ms] 436.37 [ 16.46 ms] 449.67 [ 15.97 ms]  [nb=   1]
  16 x2800 x 2800       335.37 [ 22.29 ms] 107.44 [ 69.59 ms] 408.93 [ 18.28 ms] 261.91 [ 28.55 ms]  [nb=   1]
  16 x2835 x 2835       300.28 [ 25.53 ms] 133.85 [ 57.27 ms] 381.55 [ 20.09 ms] 374.40 [ 20.47 ms]  [nb=   1]
  16 x2880 x 2880       337.41 [ 23.44 ms]  85.09 [ 92.97 ms] 440.19 [ 17.97 ms] 387.76 [ 20.40 ms]  [nb=   1]
  16 x2916 x 2916       330.85 [ 24.51 ms]  91.32 [ 88.80 ms] 415.56 [ 19.51 ms] 295.19 [ 27.47 ms]  [nb=   1]
  16 x2940 x 2940       328.67 [ 25.08 ms] 132.72 [ 62.11 ms] 410.95 [ 20.06 ms] 424.60 [ 19.41 ms]  [nb=   1]
  16 x3000 x 3000       330.08 [ 26.00 ms]  77.80 [110.33 ms] 428.07 [ 20.05 ms] 413.89 [ 20.74 ms]  [nb=   1]
  16 x3024 x 3024       336.70 [ 25.90 ms]  81.54 [106.95 ms] 408.02 [ 21.37 ms] 386.46 [ 22.57 ms]  [nb=   1]
  16 x3072 x 3072       336.18 [ 26.77 ms]  95.30 [ 94.44 ms] 387.19 [ 23.24 ms] 411.01 [ 21.90 ms]  [nb=   1]
[ ]: