pax_global_header00006660000000000000000000000064141417367010014516gustar00rootroot0000000000000052 comment=49c9a52bc7754dd8b18398c498e76e65c54ae3d6 compyle-release-0.8.1/000077500000000000000000000000001414173670100146125ustar00rootroot00000000000000compyle-release-0.8.1/.coveragerc000066400000000000000000000004011414173670100167260ustar00rootroot00000000000000[run] branch = True source = compyle omit = */tests/* compyle/api.py [report] exclude_lines = # Have to re-enable the standard pragma pragma: no cover except ImportError: raise NotImplementedError() if __name__ == .__main__.: compyle-release-0.8.1/.github/000077500000000000000000000000001414173670100161525ustar00rootroot00000000000000compyle-release-0.8.1/.github/workflows/000077500000000000000000000000001414173670100202075ustar00rootroot00000000000000compyle-release-0.8.1/.github/workflows/tests.yml000066400000000000000000000025331414173670100220770ustar00rootroot00000000000000name: Tests on: pull_request jobs: tests: strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] python-version: [3.8, 3.9] runs-on: ${{ matrix.os }} defaults: run: shell: bash -l {0} steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: conda-incubator/setup-miniconda@v2 with: auto-update-conda: true python-version: ${{ matrix.python-version }} channels: conda-forge - name: Install dependencies on Linux/MacOS run: | conda info conda install -c conda-forge pocl pyopencl python -c 'import pyopencl as cl' if: ${{ runner.os != 'Windows' }} - name: Install dependencies run: | conda info conda install -c conda-forge numpy cython python -m pip install -r requirements.txt python -m pip install coverage codecov python -m pip install -e ".[dev]" - name: Run tests run: | coverage erase coverage run -m pytest -v - name: Report if: ${{ success() }} run: coverage report - name: Upload Coverage to Codecov uses: codecov/codecov-action@v1 with: env_vars: ${{ matrix.os }}, ${{ matrix.python-version }} 
compyle-release-0.8.1/.gitignore000066400000000000000000000001021414173670100165730ustar00rootroot00000000000000*.pyc *.o *.c *.cpp *~ *.so build/ dist/ *.egg-info/ .pytest_cachecompyle-release-0.8.1/CHANGES.rst000066400000000000000000000027021414173670100164150ustar00rootroot000000000000000.8.1 ~~~~~~ * Release date: 7th November, 2021. * Fix issue with accidental file in sdist. 0.8 ~~~~ * Release date: 7th November, 2021. * Improve array module to support more numpy like functionality. * Improve profile output so it works in a distributed setting. * Add support for a configuration file in ~/.compyle/config.py * Added `atomic_dec` support. * Fix output capturing on jupyter notebooks. * Fix issues due to ast changes in Python 3.9.x. * Fix tests on 32bit architectures. * Fix several bugs and issues. 0.7 ~~~~ * Release date: 1st October, 2020. * Add convenient option to profile execution of code. * Add a convenient argument parser for scripts. * Add easy way to see generated sources. * Fix bug with installation of previous version. * Fix several bugs and issues. * Update the documentation. 0.6 ~~~~ * Release date: 15th June, 2020. * Add some non-trivial examples showcasing the package. * Document how one can use clang + OpenMP. * Add sorting, align, and other functions to array module. * Support for mapping structs on a GPU with CUDA. * Add address, cast, and address low-level functions. * Support for mako-templates for reducing repetitive code. * Bitwise operator support. * Attempt to auto-declare variables when possible. * Fix several bugs and issues. 0.5 ~~~~ * Release date: 3rd, December 2018 * First public release. * Support for elementwise, scan, and reduction operations on CPU and GPU using Cython, OpenCL and CUDA. 
compyle-release-0.8.1/LICENSE.txt000066400000000000000000000031111414173670100164310ustar00rootroot00000000000000Unless otherwise specified by LICENSE.txt files in individual directories, all code is Copyright (c) 2009-2018, the PySPH developers All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
compyle-release-0.8.1/MANIFEST.in000066400000000000000000000004071414173670100163510ustar00rootroot00000000000000include MANIFEST.in *.py *.rst *.yml *.txt *.toml recursive-include compyle *.pyx recursive-exclude compyle *.cpp recursive-include docs *.* recursive-include examples *.* recursive-exclude docs/build *.* recursive-exclude examples/ *.png __pycache__/* .DS_Store compyle-release-0.8.1/README.rst000066400000000000000000000100101414173670100162710ustar00rootroot00000000000000Compyle: execute a subset of Python on HPC platforms ====================================================== |CI Status| |Coverage Status| |Documentation Status| .. |CI Status| image:: https://github.com/pypr/compyle/actions/workflows/tests.yml/badge.svg :target: https://github.com/pypr/compyle/actions/workflows/tests.yml .. |Documentation Status| image:: https://readthedocs.org/projects/compyle/badge/?version=latest :target: https://compyle.readthedocs.io/en/latest/?badge=latest :alt: Documentation Status .. |Coverage Status| image:: https://codecov.io/gh/pypr/compyle/branch/master/graph/badge.svg :target: https://codecov.io/gh/pypr/compyle Compyle allows users to execute a restricted subset of Python (almost similar to C) on a variety of HPC platforms. Currently we support multi-core CPU execution using Cython, and for GPU devices we use OpenCL or CUDA. Users start with code implemented in a very restricted Python syntax, this code is then automatically transpiled, compiled and executed to run on either one CPU core, or multiple CPU cores (via OpenMP_) or on a GPU. Compyle offers source-to-source transpilation, making it a very convenient tool for writing HPC libraries. Some simple yet powerful parallel utilities are provided which can allow you to solve a remarkably large number of interesting HPC problems. Compyle also features JIT transpilation making it easy to use. 
Documentation and learning material is also available in the form of: - Documentation at: https://compyle.readthedocs.io - An introduction to compyle in the context of writing a parallel molecular dynamics simulator is in our `SciPy 2020 paper `_. - `Compyle poster presentation `_ - You may also try Compyle online for free on a `Google Colab notebook`_. While Compyle seems simple it is not a toy and is used heavily by the PySPH_ project where Compyle has its origins. .. _PySPH: https://github.com/pypr/pysph .. _Google Colab notebook: https://colab.research.google.com/drive/1SGRiArYXV1LEkZtUeg9j0qQ21MDqQR2U?usp=sharing Installation ------------- Compyle is itself largely pure Python but depends on numpy_ and requires either Cython_ or PyOpenCL_ or PyCUDA_ along with the respective backends of a C/C++ compiler, OpenCL and CUDA. If you are only going to execute code on a CPU then all you need is Cython. You should be able to install Compyle by doing:: $ pip install compyle .. _PyOpenCL: https://documen.tician.de/pyopencl/ .. _OpenCL: https://www.khronos.org/opencl/ .. _Cython: http://www.cython.org .. _numpy: http://www.numpy.org .. _OpenMP: http://openmp.org/ .. _PyCUDA: https://documen.tician.de/pycuda/ A simple example ---------------- Here is a very simple example:: from compyle.api import Elementwise, annotate, wrap, get_config import numpy as np @annotate def axpb(i, x, y, a, b): y[i] = a*sin(x[i]) + b x = np.linspace(0, 1, 10000) y = np.zeros_like(x) a, b = 2.0, 3.0 backend = 'cython' get_config().use_openmp = True x, y = wrap(x, y, backend=backend) e = Elementwise(axpb, backend=backend) e(x, y, a, b) This will execute the elementwise operation in parallel using OpenMP with Cython. The code is auto-generated, compiled and called for you transparently. The first time this runs, it will take a bit of time to compile everything but the next time, this is cached and will run much faster. 
If you just change the ``backend = 'opencl'``, the same exact code will be executed using PyOpenCL_ and if you change the backend to ``'cuda'``, it will execute via CUDA without any other changes to your code. This is obviously a very trivial example, there are more complex examples available as well. Examples --------- Some simple examples and benchmarks are available in the `examples `_ directory. You may also run these examples on the `Google Colab notebook`_ compyle-release-0.8.1/compyle/000077500000000000000000000000001414173670100162625ustar00rootroot00000000000000compyle-release-0.8.1/compyle/__init__.py000066400000000000000000000001121414173670100203650ustar00rootroot00000000000000# See PEP 440 for more on suitable version numbers. __version__ = '0.8.1' compyle-release-0.8.1/compyle/api.py000066400000000000000000000014721414173670100174110ustar00rootroot00000000000000from .array import Array, wrap from .ast_utils import (get_symbols, get_assigned, get_unknown_names_and_calls, has_return, has_node) from .config import get_config, set_config, use_config, Config from .cython_generator import ( CythonGenerator, get_func_definition ) from .ext_module import ExtModule from .extern import Extern from .low_level import Kernel, LocalMem, Cython, cast from .parallel import ( Elementwise, Reduction, Scan, elementwise ) from .profile import ( get_profile_info, named_profile, profile, profile_ctx, print_profile, profile_kernel, ProfileContext, profile2csv ) from .translator import ( CConverter, CStructHelper, OpenCLConverter, detect_type, ocl_detect_type, py2c ) from .types import KnownType, annotate, declare from .utils import ArgumentParser compyle-release-0.8.1/compyle/array.py000066400000000000000000001046611414173670100177620ustar00rootroot00000000000000import numpy as np import math import mako.template as mkt import time from pytools import memoize, memoize_method from .config import get_config from .types import (annotate, dtype_to_ctype, ctype_to_dtype, 
declare, dtype_to_knowntype, knowntype_to_ctype) from .template import Template from .sort import radix_sort from .profile import profile from .parallel import Elementwise try: import pycuda from .cuda import set_context set_context() def cu_bufint(arr, nbytes, offset): return arr.gpudata.as_buffer(nbytes, offset) except ImportError as e: pass def get_backend(backend=None): if not backend: cfg = get_config() if cfg.use_opencl: return 'opencl' elif cfg.use_cuda: return 'cuda' else: return 'cython' else: return backend minmax_tpl = """ WITHIN_KERNEL ${dtype} mmc_neutral() { ${dtype} result; % for prop in prop_names: % if not only_max: result.cur_min_${prop} = ${inf}; % endif % if not only_min: result.cur_max_${prop} = -${inf}; % endif % endfor return result; } WITHIN_KERNEL ${dtype} mmc_from_scalar(${args}) { ${dtype} result; % for prop in prop_names: % if not only_max: result.cur_min_${prop} = ${prop}; % endif % if not only_min: result.cur_max_${prop} = ${prop}; % endif % endfor return result; } WITHIN_KERNEL ${dtype} agg_mmc(${dtype} a, ${dtype} b) { ${dtype} result = a; % for prop in prop_names: % if not only_max: if (b.cur_min_${prop} < result.cur_min_${prop}) result.cur_min_${prop} = b.cur_min_${prop}; % endif % if not only_min: if (b.cur_max_${prop} > result.cur_max_${prop}) result.cur_max_${prop} = b.cur_max_${prop}; % endif % endfor return result; } """ minmax_operator_tpl = """ __device__ ${dtype} volatile &operator=( ${dtype} const &src) volatile { % for prop in prop_names: % if not only_max: this->cur_min_${prop} = src.cur_min_${prop}; % endif % if not only_min: this->cur_max_${prop} = src.cur_max_${prop}; % endif % endfor return *this; } """ def minmax_collector_key(device, dtype, props, name, *args): return (device, dtype, tuple(props), name) @memoize(key=minmax_collector_key) def make_collector_dtype(device, dtype, props, name, only_min, only_max, backend): fields = [("pad", np.int32)] for prop in props: if not only_min: fields.append(("cur_max_%s" % 
prop, dtype)) if not only_max: fields.append(("cur_min_%s" % prop, dtype)) custom_dtype = np.dtype(fields) if backend == 'opencl': from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct elif backend == 'cuda': from compyle.cuda import match_dtype_to_c_struct from pycuda.tools import get_or_register_dtype custom_dtype, c_decl = match_dtype_to_c_struct(device, name, custom_dtype) custom_dtype = get_or_register_dtype(name, custom_dtype) return custom_dtype, c_decl @memoize(key=lambda *args: (args[-3], args[-2], args[-1])) def get_minmax_kernel(ctx, dtype, inf, mmc_dtype, prop_names, only_min, only_max, name, mmc_c_decl, backend): tpl_args = ", ".join( ["%(dtype)s %(prop)s" % {'dtype': dtype, 'prop': prop} for prop in prop_names] ) if backend == 'cuda': # overload assignment operator in struct mmc_overload = mkt.Template(text=minmax_operator_tpl).render( prop_names=prop_names, dtype=name, only_min=only_min, only_max=only_max ) mmc_c_decl_lines = mmc_c_decl.splitlines() mmc_c_decl_lines = mmc_c_decl_lines[:-2] + \ mmc_overload.splitlines() + mmc_c_decl_lines[-2:] mmc_c_decl = '\n'.join(mmc_c_decl_lines) mmc_preamble = mmc_c_decl + minmax_tpl preamble = mkt.Template(text=mmc_preamble).render( args=tpl_args, prop_names=prop_names, dtype=name, only_min=only_min, only_max=only_max, inf=inf ) map_args = ", ".join( ["%(prop)s[i]" % {'dtype': dtype, 'prop': prop} for prop in prop_names] ) if backend == 'opencl': knl_args = ", ".join( ["__global %(dtype)s* %(prop)s" % {'dtype': dtype, 'prop': prop} for prop in prop_names] ) from pyopencl._cluda import CLUDA_PREAMBLE from pyopencl.reduction import ReductionKernel cluda_preamble = mkt.Template(text=CLUDA_PREAMBLE).render( double_support=True ) knl = ReductionKernel( ctx, mmc_dtype, neutral="mmc_neutral()", reduce_expr="agg_mmc(a, b)", map_expr="mmc_from_scalar(%s)" % map_args, arguments=knl_args, preamble='\n'.join([cluda_preamble, preamble]) ) elif backend == 'cuda': knl_args = ", ".join( ["%(dtype)s* 
%(prop)s" % {'dtype': dtype, 'prop': prop} for prop in prop_names] ) from pycuda._cluda import CLUDA_PREAMBLE from pycuda.reduction import ReductionKernel cluda_preamble = mkt.Template(text=CLUDA_PREAMBLE).render( double_support=True ) knl = ReductionKernel( mmc_dtype, neutral="mmc_neutral()", reduce_expr="agg_mmc(a, b)", map_expr="mmc_from_scalar(%s)" % map_args, arguments=knl_args, preamble='\n'.join([cluda_preamble, preamble]) ) return knl def wrap_array(arr, backend): wrapped_array = Array(arr.dtype, allocate=False, backend=backend) if isinstance(arr, np.ndarray): wrapped_array.data = arr if backend == 'opencl' or backend == 'cuda': use_double = get_config().use_double _dtype = np.float64 if use_double else np.float32 if np.issubdtype(arr.dtype, np.floating): wrapped_array.dtype = _dtype wrapped_array.data = arr.astype(_dtype) q = None if backend == 'opencl': from .opencl import get_queue from pyopencl.array import to_device q = get_queue() if arr is not None: dev_ary = to_device(q, wrapped_array.data) wrapped_array.set_data(dev_ary) elif backend == 'cuda': from .cuda import set_context set_context() from pycuda.gpuarray import to_gpu if arr is not None: dev_ary = to_gpu(wrapped_array.data) wrapped_array.set_data(dev_ary) else: wrapped_array.set_data(wrapped_array.data) elif backend == 'opencl': import pyopencl.array as gpuarray if isinstance(arr, gpuarray.Array): wrapped_array.set_data(arr) elif backend == 'cuda': import pycuda.gpuarray as gpuarray if isinstance(arr, gpuarray.GPUArray): wrapped_array.set_data(arr) return wrapped_array def wrap(*args, **kw): ''' Parameters ---------- *args: any numpy arrays to be wrapped. **kw: only one keyword arg called `backend` is supported. backend: str: use appropriate backend for arrays. 
''' backend = get_backend(kw.get('backend')) if len(args) == 1: return wrap_array(args[0], backend) else: return [wrap_array(x, backend) for x in args] def to_device(array, backend='cython'): if backend == 'cython': out = array elif backend == 'opencl': import pyopencl.array as gpuarray from .opencl import get_queue out = gpuarray.to_device(get_queue(), array) elif backend == 'cuda': import pycuda.gpuarray as gpuarray out = gpuarray.to_gpu(array) return wrap_array(out, backend) def ones_like(array, backend=None): if backend is None: backend = array.backend if backend == 'opencl': import pyopencl.array as gpuarray out = 1 + gpuarray.zeros_like(array.dev) elif backend == 'cuda': import pycuda.gpuarray as gpuarray out = gpuarray.ones_like(array.dev) else: out = np.ones_like(array.dev) return wrap_array(out, backend) def ones(n, dtype, backend='cython'): if backend == 'opencl': import pyopencl.array as gpuarray from .opencl import get_queue out = 1 + gpuarray.zeros(get_queue(), n, dtype) elif backend == 'cuda': import pycuda.gpuarray as gpuarray out = np.array(1, dtype=dtype) + gpuarray.zeros(n, dtype) else: out = np.ones(n, dtype=dtype) return wrap_array(out, backend) def empty(n, dtype, backend='cython'): if backend == 'opencl': import pyopencl.array as gpuarray from .opencl import get_queue out = gpuarray.empty(get_queue(), n, dtype) elif backend == 'cuda': import pycuda.gpuarray as gpuarray out = gpuarray.empty(n, dtype) else: out = np.empty(n, dtype=dtype) return wrap_array(out, backend) def empty_like(x): return empty(x.length, x.dtype, x.backend) def zeros(n, dtype, backend='cython'): if backend == 'opencl': import pyopencl.array as gpuarray from .opencl import get_queue out = gpuarray.zeros(get_queue(), n, dtype) elif backend == 'cuda': import pycuda.gpuarray as gpuarray out = gpuarray.zeros(n, dtype) else: out = np.zeros(n, dtype=dtype) return wrap_array(out, backend) def zeros_like(array, backend=None): if backend is None: backend = array.backend if backend 
== 'opencl': import pyopencl.array as gpuarray out = gpuarray.zeros_like(array.dev) elif backend == 'cuda': import pycuda.gpuarray as gpuarray out = gpuarray.zeros_like(array.dev) else: out = np.zeros_like(array.dev) return wrap_array(out, backend) def arange(start, stop, step, dtype=np.int32, backend='cython'): if backend == 'opencl': import pyopencl.array as gpuarray from .opencl import get_queue out = gpuarray.arange(get_queue(), start, stop, step, dtype=dtype) elif backend == 'cuda': import pycuda.gpuarray as gpuarray out = gpuarray.arange(start, stop, step, dtype=dtype) else: out = np.arange(start, stop, step, dtype=dtype) return wrap_array(out, backend) def linspace(start, stop, num, dtype=np.float64, backend='opencl', endpoint=True): if not type(num) == int: raise TypeError("num should be an integer but got %s" % type(num)) if num <= 0: raise ValueError("Number of samples, %s, must be positive." % num) if backend == 'opencl': import pyopencl.array as gpuarray from .opencl import get_queue if endpoint: delta = (stop-start)/(num-1) else: delta = (stop-start)/num out = gpuarray.arange(get_queue(), 0, num, 1, dtype=dtype) out = out * delta+start elif backend == 'cuda': import pycuda.gpuarray as gpuarray import pycuda.autoinit if endpoint: delta = (stop-start)/(num-1) else: delta = (stop-start)/num out = gpuarray.arange(0, num, 1, dtype=dtype) out = out*delta+start else: out = np.linspace(start, stop, num, endpoint=endpoint, dtype=dtype) return wrap_array(out, backend) @annotate def n_diff_elwise(i, y, x, b, lb): it = declare('int', 1) for it in range(lb): y[i] += x[it+i] * b[it] @memoize def choose(n, x): return math.factorial(n)/(math.factorial(n-x) * math.factorial(x)) @memoize def diff_kernel(backend, dtype): e = Elementwise(n_diff_elwise, backend=backend) return e def diff(a, n, backend=None): """ calculate the first discrete difference of the given array. 
The first difference is given by ``out[i] = a[i+1] - a[i]`` """ if n == 0: return a if n < 0: raise ValueError( "order must be non-negative but got " + repr(n)) if(len(a) < n+1): raise ValueError( "Array a should have length at least n+1, but got " + str(len(a))) if backend is None: backend = a.backend if backend == 'opencl' or backend == 'cuda': from compyle.api import Elementwise binom_coeff = np.zeros(n+1) sign_fac = 1 if (n % 2 == 0) else -1 for i in range(n+1): binom_coeff[i] = choose(n, i) * (-1)**i * sign_fac binom_coeff = wrap(binom_coeff, backend=backend) len_ar = len(a) y = zeros(len_ar - n, dtype=a.dtype, backend=backend) e = diff_kernel(backend, a.dtype) e(y, a, binom_coeff, len(binom_coeff)) return y else: return wrap_array(np.diff(a, n), backend=backend) def minimum(ary, backend=None): if backend is None: backend = ary.backend if backend == 'cython': return ary.dev.min() elif backend == 'opencl': import pyopencl.array as gpuarray return gpuarray.min(ary.dev).get() elif backend == 'cuda': import pycuda.gpuarray as gpuarray return gpuarray.min(ary.dev).get() def maximum(ary, backend=None): if backend is None: backend = ary.backend if backend == 'cython': return ary.dev.max() elif backend == 'opencl': import pyopencl.array as gpuarray return gpuarray.max(ary.dev).get() elif backend == 'cuda': import pycuda.gpuarray as gpuarray return gpuarray.max(ary.dev).get() def sum(ary, backend=None): if backend is None: backend = ary.backend if backend == 'cython': return np.sum(ary.dev) if backend == 'opencl': import pyopencl.array as gpuarray return gpuarray.sum(ary.dev).get() if backend == 'cuda': import pycuda.gpuarray as gpuarray return gpuarray.sum(ary.dev).get() def dot(a, b, backend=None): if backend is None: backend = a.backend if backend == 'cython': return np.dot(a.dev, b.dev) if backend == 'opencl': import pyopencl.array as gpuarray return gpuarray.dot(a.dev, b.dev).get() if backend == 'cuda': import pycuda.gpuarray as gpuarray return gpuarray.dot(a.dev, 
b.dev).get() def trapz(y, x=None, dx=1.0, backend=None): if backend is None: backend = y.backend if x is None: d = dx out = (sum(y, backend=backend) - 0.5 * (y[0] + y[-1])) * d else: if not len(x) == len(y): raise Exception('arrays x and y should be of the same size') d = diff(x, 1, backend=backend) sum_ar = (y[:-1] + y[1:]) out = dot(d, sum_ar) * 0.5 return out @annotate def where_elwise(i, condition, x, y, ans): if condition[i]: ans[i] = x[i] else: ans[i] = y[i] @memoize def where_kernel(backend, dtype): e = Elementwise(where_elwise, backend=backend) return e def where(condition, x, y, backend=None): if backend is None: backend = x.backend if y.backend is not x.backend: raise TypeError( 'x and y should have same backend, got ${x_bk} and ${y_bk}'. format(x_bk=x.backend, y_bk=y.backend)) if x.dtype is not y.dtype: raise TypeError( 'x and y should have same data type, got {} and {}'.format( x.dtype, y.dtype)) e = where_kernel(backend, x.dtype) ans = empty(x.length, dtype=x.dtype, backend=backend) e(condition, x, y, ans) return ans @memoize(key=lambda *args: tuple(args[0])) def get_cl_sort_kernel(arg_types, ary_list): import pyopencl as cl from pyopencl.scan import GenericScanKernel import pyopencl.algorithm from compyle.opencl import get_context, get_queue arg_names = ["ary_%s" % i for i in range(len(ary_list))] sort_args = ["%s %s" % (knowntype_to_ctype(ktype), name) for ktype, name in zip(arg_types, arg_names)] sort_args = [arg.replace('GLOBAL_MEM', '__global') for arg in sort_args] sort_knl = cl.algorithm.RadixSort( get_context(), sort_args, scan_kernel=GenericScanKernel, key_expr="ary_0[i]", sort_arg_names=arg_names ) return sort_knl @memoize(key=lambda q: q) def get_allocator(queue): import pyopencl as cl allocator = cl.tools.MemoryPool( cl.tools.ImmediateAllocator(queue) ) return allocator @profile def sort_by_keys(ary_list, out_list=None, key_bits=None, backend=None, use_radix_sort=False): # FIXME: Need to use returned values, cuda backend uses # thrust that 
will internally allocate a new array for storing # the sorted data so out_list will not have the sorted arrays # first arg of ary_list is the key if backend is None: backend = ary_list[0].backend if backend == 'opencl': from .jit import get_ctype_from_arg from compyle.opencl import get_queue if not out_list: out_list = [ Array(ary.dtype, allocate=False, backend=backend) for ary in ary_list ] arg_types = [get_ctype_from_arg(arg, backend=backend) for arg in ary_list] sort_knl = get_cl_sort_kernel(arg_types, ary_list) allocator = get_allocator(get_queue()) arg_list = [ary.dev for ary in ary_list] out_arrays, event = sort_knl(*arg_list, key_bits=key_bits, allocator=allocator) for i, out in enumerate(out_list): out.set_data(out_arrays[i]) return out_list elif backend == 'cython' and use_radix_sort: out_list, order = radix_sort(ary_list, out_list=out_list, max_key_bits=key_bits, backend=backend) return out_list elif backend == 'cython': order = wrap(np.argsort(ary_list[0].dev), backend=backend) out_list = align(ary_list, order, out_list=out_list, backend=backend) return out_list else: order = argsort(ary_list[0], backend=backend) modified_out_list = None if out_list: modified_out_list = out_list[1:] out_list = align(ary_list[1:], order, out_list=modified_out_list, backend=backend) return [ary_list[0]] + out_list def argsort(ary, backend=None): # FIXME: Implement an OpenCL backend and add tests # NOTE: argsort also sorts the array if backend is None: backend = ary.backend if backend == 'cython': result = np.argsort(ary.dev) ary.dev = np.take(ary.dev, result) return wrap_array(result, backend=backend) elif backend == 'cuda': from compyle.cuda import argsort return argsort(ary) else: raise ValueError("Only cython and cuda backends supported") def update_minmax_gpu(ary_list, only_min=False, only_max=False, backend=None): if not backend: backend = ary_list[0].backend if only_min and only_max: raise ValueError("Only one of only_min and only_max can be True") props = ['ary_%s' 
% i for i in range(len(ary_list))] dtype = ary_list[0].dtype ctype = dtype_to_ctype(dtype, backend=backend) op = 'min' if not only_max else '' op += 'max' if not only_min else '' name = "%s_collector_%s" % (op, ''.join([ctype] + props)) if backend == 'opencl': from compyle.opencl import get_context ctx = get_context() device = ctx.devices[0] elif backend == 'cuda': ctx = None device = None mmc_dtype, mmc_c_decl = make_collector_dtype(device, dtype, props, name, only_min, only_max, backend) if np.issubdtype(dtype, np.floating): inf = np.finfo(dtype).max else: inf = np.iinfo(dtype).max knl = get_minmax_kernel(ctx, ctype, inf, mmc_dtype, props, only_min, only_max, name, mmc_c_decl, backend) args = [ary.dev for ary in ary_list] result = knl(*args).get() for ary, prop in zip(ary_list, props): if not only_max: ary.minimum = result["cur_min_%s" % prop] if not only_min: ary.maximum = result["cur_max_%s" % prop] @annotate def take_elwise(i, indices, ary, out_ary): out_ary[i] = ary[indices[i]] def take(ary, indices, backend=None, out=None): import compyle.parallel as parallel if backend is None: backend = ary.backend if out is None: out = empty(indices.length, ary.dtype, backend=backend) if backend == 'opencl' or backend == 'cuda': take_knl = parallel.Elementwise(take_elwise, backend=backend) take_knl(indices, ary, out) elif backend == 'cython': np.take(ary.dev, indices.dev, out=out.dev) return out @annotate def inp_cumsum(i, ary): return ary[i] @annotate def out_cumsum(i, ary, out, item): out[i] = item @profile def cumsum(ary, backend=None, out=None): if backend is None: backend = ary.backend if backend == 'opencl' or backend == 'cuda': import compyle.parallel as parallel if out is None: out = empty(ary.length, ary.dtype, backend=backend) cumsum_scan = parallel.Scan( inp_cumsum, out_cumsum, 'a+b', dtype=ary.dtype, backend=backend ) cumsum_scan(ary=ary, out=out) return out elif backend == 'cython': _out = out.dev if out is not None else out output = np.cumsum(ary.dev, 
out=_out) return wrap_array(output, backend) @annotate def take_bool_elwise(i, condition, ary, cum_sum_ar, out_ar): if condition[i]: out_ar[cum_sum_ar[i]-1] = ary[i] @memoize def take_bool_kernel(backend, dtype_ar): e = Elementwise(take_bool_elwise, backend=backend) return e def take_bool(ary, condition, backend=None): if backend is None: backend = ary.backend cumsum_ar = cumsum(condition, backend=backend) out_ar = ones(cumsum_ar[-1], ary.dtype, backend=backend) e = take_bool_kernel(backend, ary.dtype) e(condition, ary, cumsum_ar, out_ar) return out_ar class AlignMultiple(Template): def __init__(self, name, num_arys): super(AlignMultiple, self).__init__(name=name) self.num_arys = num_arys def extra_args(self): args = ['inp_%s' % num for num in range(self.num_arys)] args += ['out_%s' % num for num in range(self.num_arys)] return args, {} def template(self, i, order): ''' % for num in range(obj.num_arys): out_${num}[i] = inp_${num}[order[i]] % endfor ''' def key_align_kernel(ary_list, order, backend=None): from .jit import get_ctype_from_arg key = [get_ctype_from_arg(ary, backend=backend) for ary in ary_list] key.append(backend) key.append(get_config().use_openmp) return tuple(key) @memoize(key=key_align_kernel) def get_align_kernel(ary_list, order, backend=None): import compyle.parallel as parallel align_multiple_knl = AlignMultiple('align_multiple_knl', len(ary_list)) align_multiple_elwise = parallel.Elementwise(align_multiple_knl.function, backend=backend) return align_multiple_elwise def align(ary_list, order, out_list=None, backend=None): if not ary_list: return [] if backend is None: backend = order.backend if not out_list: out_list = [] for ary in ary_list: out_list.append(empty(order.length, ary.dtype, backend=ary.backend)) args_list = [order] + ary_list + out_list align_multiple_elwise = get_align_kernel(ary_list, order, backend=backend) align_multiple_elwise(*args_list) return out_list def gt_elwise(i, x, val, ans): ans[i] = x[i] > val def lt_elwise(i, x, 
val, ans): ans[i] = x[i] < val def ge_elwise(i, x, val, ans): ans[i] = x[i] >= val def le_elwise(i, x, val, ans): ans[i] = x[i] <= val def eq_elwise(i, x, val, ans): ans[i] = x[i] == val def ne_elwise(i, x, val, ans): ans[i] = x[i] is not val @memoize def comparison_kernel(func, backend, ary_type, other_type): func_annotated = annotate(func, i='int', x=ary_type, val=other_type, ans='intp') e = Elementwise(func_annotated, backend=backend) return e def comparison_template(func, other, arr, backend=None): if backend is None: backend = arr.backend from compyle.parallel import Elementwise other_type = dtype_to_ctype(type(other)) ary_type = dtype_to_ctype(arr.dtype) + 'p' ans = empty(arr.length, dtype=np.int32, backend=arr.backend) e = comparison_kernel(func, arr.backend, ary_type, other_type) e(arr, other, ans) return ans @annotate def add_elwise(i, a, b, out): out[i] = a[i] + b[i] @memoize def add_kernel(backend, dtype): e = Elementwise(add_elwise, backend=backend) return e @annotate def sub_elwise(i, a, b, out): out[i] = a[i] - b[i] class Array(object): def __init__(self, dtype, n=0, allocate=True, backend=None): self.backend = get_backend(backend) if backend == 'cuda': from .cuda import set_context set_context() self.dtype = dtype self.gptr_type = dtype_to_knowntype(dtype, address='global', backend=backend) self.minimum = 0 self.maximum = 0 self.data = None self._data = None self.dev = None if allocate: length = n if n == 0: n = 16 data = empty(n, dtype, backend=self.backend) self.set_data(data) self.length = length self._update_array_ref() def __len__(self): return len(self.dev) def __getitem__(self, key): if isinstance(key, slice): return wrap_array(self.dev[key], self.backend) elif isinstance(key, Array): if key.length < self.length: return self.align(key) else: # it should be boolean array return take_bool(self, key, backend=self.backend) # NOTE: Not sure about this, done for PyCUDA compatibility if self.backend != 'cython': return self.dev[key].get().item() 
else: return self.dev[key] def __setitem__(self, key, value): if self.backend == 'cuda': if isinstance(key, slice): if isinstance(value, np.ndarray): self.dev[key] = np.asarray(value, dtype=self.dtype) else: self.dev[key].fill(value) else: self.dev[key] = np.asarray(value, dtype=self.dtype) else: self.dev[key] = value def __add__(self, other): if isinstance(other, Array): e = add_kernel(self.backend, self.dtype) out = empty_like(self) e(self, other, out) return out else: return NotImplemented def __sub__(self, other): if isinstance(other, Array): e = Elementwise(sub_elwise, backend=self.backend) out = empty_like(self) e(self, other, out) return out else: return NotImplemented def __radd__(self, other): if isinstance(other, Array): other = other.dev ans = other + self.dev return wrap_array(ans, self.backend) def __rsub__(self, other): if isinstance(other, Array): other = other.dev ans = other - self.dev return wrap_array(ans, self.backend) def __str__(self): return self.dev.__str__() def __gt__(self, other): return comparison_template(gt_elwise, other, self) def __lt__(self, other): return comparison_template(lt_elwise, other, self) def __ge__(self, other): return comparison_template(ge_elwise, other, self) def __le__(self, other): return comparison_template(le_elwise, other, self) def __eq__(self, other): return comparison_template(eq_elwise, other, self) def __ne__(self, other): return comparison_template(ne_elwise, other, self) def _update_array_ref(self): # For PyCUDA compatibility if self.length == 0 and len(self._data) == 0: self.dev = self._data else: self.dev = self._data[:self.length] def _get_np_data(self): return self.data def get_buff(self, offset=0, length=0): if not length: nbytes = int(self.dev.nbytes - offset * self.dev.itemsize) else: nbytes = length * self.dev.itemsize if self.backend == 'cython': length = nbytes // self.dev.itemsize return self.dev[offset:offset + length] elif self.backend == 'cuda': return cu_bufint(self._data, nbytes, 
int(offset)) def get(self): if self.backend == 'cython': return self.dev elif self.backend == 'opencl' or self.backend == 'cuda': return self.dev.get() def get_view(self, offset=0, length=None): if length is None: length = self.length - offset view_arr = Array(self.dtype, allocate=False, backend=self.backend) view_arr.set_data(self.dev[offset:offset + length]) return view_arr def set(self, nparr): if self.backend == 'cython': self.set_data(nparr) else: self.set_data(to_device(nparr, backend=self.backend)) def pull(self): if self.data is None: self.data = np.empty(len(self.dev), dtype=self.dtype) self.data[:] = self.get() def push(self): if self.backend == 'opencl' or self.backend == 'cuda': self._data.set(self._get_np_data()) self.set_data(self._data) def resize(self, size): self.reserve(size) self.length = size self._update_array_ref() def reserve(self, size): if size > self.alloc: new_data = empty(size, self.dtype, backend=self.backend) # For PyCUDA compatibility if self.length > 0: new_data.dev[:self.length] = self.dev self._data = new_data.dev self.alloc = size self._update_array_ref() def set_data(self, data): # data can be an Array instance or # a numpy/cl array/cuda array if isinstance(data, Array): data = data.dev self._data = data self.length = data.size self.alloc = data.size self.dtype = data.dtype self._update_array_ref() def get_array(self): return self[:self.length] def get_data(self): return self._data def copy(self): arr_copy = Array(self.dtype, backend=self.backend, allocate=False) arr_copy.set_data(self.dev.copy()) return arr_copy @profile def update_min_max(self, only_min=False, only_max=False): if self.backend == 'cython': self.minimum = minimum(self, backend=self.backend) self.maximum = maximum(self, backend=self.backend) self.minimum = self.minimum.astype(self.dtype) self.maximum = self.maximum.astype(self.dtype) else: update_minmax_gpu([self]) def fill(self, value): self.dev.fill(value) def append(self, value): if self.length >= self.alloc: 
self.reserve(2 * self.length) self._data[self.length] = np.asarray(value, dtype=self.dtype) self.length += 1 self._update_array_ref() def extend(self, ary): if self.length + len(ary.dev) > self.alloc: self.reserve(self.length + len(ary.dev)) self._data[-len(ary.dev):] = ary.dev self.length += len(ary.dev) self._update_array_ref() @memoize_method def _get_remove_kernels(self): import compyle.parallel as parallel @annotate(i='int', gintp='indices, if_remove') def fill_if_remove(i, indices, if_remove): if_remove[indices[i]] = 1 fill_if_remove_knl = parallel.Elementwise( fill_if_remove, backend=self.backend) @annotate(i='int', if_remove='gintp', return_='int') def remove_input_expr(i, if_remove): return if_remove[i] types = {'i': 'int', 'item': 'int', 'if_remove': 'gintp', 'new_array': self.gptr_type, 'old_array': self.gptr_type} @annotate(**types) def remove_output_expr(i, item, if_remove, new_array, old_array): if not if_remove[i]: new_array[i - item] = old_array[i] remove_knl = parallel.Scan(remove_input_expr, remove_output_expr, 'a+b', dtype=np.int32, backend=self.backend) return fill_if_remove_knl, remove_knl @profile def remove(self, indices, input_sorted=False): if len(indices) > self.length: msg = 'Number of indices to be removed is greater than' msg += 'number of indices in array' raise ValueError(msg) if_remove = Array(np.int32, n=self.length, backend=self.backend) if_remove.fill(0) new_array = self.copy() fill_if_remove_knl, remove_knl = self._get_remove_kernels() fill_if_remove_knl(indices, if_remove) remove_knl(if_remove=if_remove, old_array=self, new_array=new_array) self.set_data(new_array.dev[:-len(indices.dev)]) def align(self, indices, out=None): return take(self, indices, backend=self.backend, out=out) def squeeze(self): self.set_data(self._data[:self.length]) def copy_values(self, indices, dest): # indices and dest need to be Array instances if not isinstance(indices, Array) or \ not isinstance(dest, Array): raise TypeError('indices and dest need to 
be \
                            Array instances')

        # Gather self[indices] and store it at the front of ``dest``.
        dest.dev[:len(indices.dev)] = take(
            self, indices, backend=self.backend
        ).dev
compyle-release-0.8.1/compyle/ast_utils.py000066400000000000000000000116641414173670100206530ustar00rootroot00000000000000
"""Utilities to work with the Python AST.
"""

import ast
import sys

PY_VER = sys.version_info.major
# On Python 3 ``basestring`` does not exist, so alias it to ``str``.
basestring = str if PY_VER > 2 else basestring


class NameLister(ast.NodeVisitor):
    """Utility class to collect the Names in an AST.
    """
    def __init__(self, ctx=(ast.Load, ast.Store)):
        # ``ctx`` is the tuple of name contexts that should be collected.
        self.names = set()
        self.ctx = ctx

    def visit_Name(self, node):
        if isinstance(node.ctx, self.ctx):
            self.names.add(node.id)
        self.generic_visit(node)


class SymbolParser(ast.NodeVisitor):
    """Utility class to gather the used symbols in a block of code.  We look at
    assignments, augmented assignments, function calls, and any Names.  These
    are all parsed in one shot and collected.

    Note that this works best for a single function that is parsed rather than
    for a collection of functions.

    """
    def __init__(self):
        self.names = set()      # every Name seen in a Load/Store context
        self.assign = set()     # names that are (augmented-)assigned to
        self.calls = set()      # names of functions called by plain name
        self.funcargs = set()   # argument names of the visited function
        self.func_name = ''     # name of the last FunctionDef visited
        self.ctx = (ast.Load, ast.Store)

    def visit_Name(self, node):
        if isinstance(node.ctx, self.ctx):
            self.names.add(node.id)
        self.generic_visit(node)

    def visit_AugAssign(self, node):
        if isinstance(node.target, ast.Name):
            self.assign.add(node.target.id)
        elif isinstance(node.target, ast.Subscript):
            # Walk down ``a[i][j]...`` to the underlying name ``a``.
            v = node.target.value
            while not isinstance(v, ast.Name):
                v = v.value
            self.assign.add(v.id)
        self.generic_visit(node)

    def visit_Assign(self, node):
        for target in node.targets:
            if isinstance(target, ast.Name):
                self.assign.add(target.id)
            elif isinstance(target, ast.Subscript):
                # Walk down ``a[i][j]...`` to the underlying name ``a``.
                n = target.value
                while not isinstance(n, ast.Name):
                    n = n.value
                self.assign.add(n.id)
            elif isinstance(target, (ast.List, ast.Tuple)):
                # Unpacking: only plain names in the target are recorded.
                for n in target.elts:
                    if isinstance(n, ast.Name):
                        self.assign.add(n.id)
        self.generic_visit(node)

    def visit_Call(self, node):
        # Only calls through a bare name are recorded (not ``a.b()``).
        if isinstance(node.func, ast.Name):
            self.calls.add(node.func.id)
        self.generic_visit(node)

    def visit_FunctionDef(self, node):
        """Record the function name and its argument names, then visit the
        statements of the body."""
        self.func_name = node.name
        if PY_VER == 2:
            self.funcargs.update(x.id for x in node.args.args)
            if node.args.vararg:
                self.funcargs.add(node.args.vararg)
            if node.args.kwarg:
                self.funcargs.add(node.args.kwarg)
        else:
            self.funcargs.update(x.arg for x in node.args.args)
            if node.args.vararg:
                self.funcargs.add(node.args.vararg.arg)
            if node.args.kwarg:
                self.funcargs.add(node.args.kwarg.arg)
            if node.args.kwonlyargs:
                self.funcargs.update(x.arg for x in node.args.kwonlyargs)
        for arg in node.body:
            self.visit(arg)


def _get_tree(code):
    """Parse ``code`` when it is a string; otherwise return it unchanged."""
    return ast.parse(code) if isinstance(code, basestring) else code


def get_symbols(code, ctx=(ast.Load, ast.Store)):
    """Given an AST or code string return the symbols used therein.

    Parameters
    ----------

    code: A code string or the result of an ast.parse.

    ctx: The context of the names, can be one of ast.Load, ast.Store, ast.Del.

    """
    tree = _get_tree(code)
    n = NameLister(ctx=ctx)
    n.visit(tree)
    return n.names


def get_assigned(code):
    """Given an AST or code string return the symbols that are augmented
    assigned or assigned.

    Parameters
    ----------

    code: A code string or the result of an ast.parse.

    """
    tree = _get_tree(code)
    p = SymbolParser()
    p.visit(tree)
    return p.assign


def get_unknown_names_and_calls(code):
    """Given an AST or code string return the unknown variables and calls in
    the code. The function returns two sets, ``names, calls``.

    Parameters
    ----------

    code: A code string or the result of an ast.parse.

    """
    tree = ast.parse(code) if isinstance(code, basestring) else code
    p = SymbolParser()
    p.visit(tree)
    funcargs = p.funcargs
    # The function's own name is known inside its body.
    if len(p.func_name) > 0:
        funcargs.add(p.func_name)
    # Unknown names: used, but neither an argument, a call nor assigned.
    names = p.names - funcargs - p.calls - p.assign
    calls = p.calls
    return names, calls


def has_node(code, node):
    """Given an AST or code string returns True if the code contains
    any particular node statement.

    Parameters
    ----------

    code: A code string or the result of an ast.parse.

node: A node type or tuple of node types to check for. If a tuple is passed it returns True if any one of them is in the code. """ tree = _get_tree(code) for n in ast.walk(tree): if isinstance(n, node): return True return False def has_return(code): """Returns True of the node has a return statement. """ return has_node(code, ast.Return) compyle-release-0.8.1/compyle/capture_stream.py000066400000000000000000000072761414173670100216660ustar00rootroot00000000000000import io import os import sys from tempfile import mktemp def get_ipython_capture(): try: # This will work inside IPython but not outside it. name = get_ipython().__class__.__name__ if name.startswith('ZMQ'): from IPython.utils.capture import capture_output return capture_output else: return None except NameError: return None class CaptureStream(object): """A context manager which captures any errors on a given stream (like sys.stderr). The stream is captured and the outputs can be used. We treat sys.stderr and stdout specially as very often these are overridden by nose or IPython. We always wrap the underlying file descriptors in this case as this is the intent of this context manager. This is somewhat based on this question: http://stackoverflow.com/questions/7018879/disabling-output-when-compiling-with-distutils Examples -------- See the tests in tests/test_capture_stream.py for example usage. 
""" def __init__(self, stream=sys.stderr): self.stream = stream if stream is sys.stderr: self.fileno = 2 elif stream is sys.stdout: self.fileno = 1 else: self.fileno = stream.fileno() self.orig_stream = None self.tmp_stream = None self.tmp_path = '' self._cached_output = None def __enter__(self): if sys.platform.startswith('win32') and sys.version_info[:2] > (3, 5): return self self.orig_stream = os.dup(self.fileno) self.tmp_path = mktemp() self.tmp_stream = io.open(self.tmp_path, 'w+', encoding='utf-8') os.dup2(self.tmp_stream.fileno(), self.fileno) return self def __exit__(self, type, value, tb): if sys.platform.startswith('win32') and sys.version_info[:2] > (3, 5): return if self.orig_stream is not None: os.dup2(self.orig_stream, self.fileno) if self.tmp_stream is not None: self._cache_output() self.tmp_stream.close() os.remove(self.tmp_path) def _cache_output(self): if self._cached_output is not None: return tmp_stream = self.tmp_stream result = '' if tmp_stream is not None: tmp_stream.flush() tmp_stream.seek(0) result = tmp_stream.read() self._cached_output = result def get_output(self): """Return the captured output. """ if self._cached_output is None: self._cache_output() return self._cached_output class CaptureMultipleStreams(object): """This lets one capture multiple streams together. 
""" def __init__(self, streams=None): streams = (sys.stdout, sys.stderr) if streams is None else streams self.streams = streams self.captures = [CaptureStream(x) for x in streams] cap = get_ipython_capture() if cap: self.jcap = cap(stdout=True, stderr=True, display=True) else: self.jcap = None self.joutput = None def __enter__(self): for capture in self.captures: capture.__enter__() if self.jcap: self.joutput = self.jcap.__enter__() return self def __exit__(self, type, value, tb): for capture in self.captures: capture.__exit__(type, value, tb) if self.jcap: self.jcap.__exit__(type, value, tb) def get_output(self): out = list(x.get_output() for x in self.captures) if self.joutput: out[0] += self.joutput.stdout out[1] += self.joutput.stderr return out compyle-release-0.8.1/compyle/config.py000066400000000000000000000114601414173670100201030ustar00rootroot00000000000000"""Simple configuration options for PySPH. Do not import any PySPH specific extensions here, if you must, do the import inside the function/method. 
""" from contextlib import contextmanager class Config(object): def __init__(self): self._use_openmp = None self._use_opencl = None self._use_cuda = None self._use_double = None self._omp_schedule = None self._profile = None self._use_local_memory = None self._wgs = None self._suppress_warnings = None @property def suppress_warnings(self): if self._suppress_warnings is None: self._suppress_warnings = self._suppress_warnings_default() return self._suppress_warnings @suppress_warnings.setter def suppress_warnings(self, value): self._suppress_warnings = value def _suppress_warnings_default(self): return False @property def use_openmp(self): if self._use_openmp is None: self._use_openmp = self._use_openmp_default() return self._use_openmp @use_openmp.setter def use_openmp(self, value): self._use_openmp = value def _use_openmp_default(self): return False @property def omp_schedule(self): if self._omp_schedule is None: self._omp_schedule = self._omp_schedule_default() return self._omp_schedule @omp_schedule.setter def omp_schedule(self, value): if len(value) != 2 or \ value[0].lower() not in ("static", "dynamic", "guided"): raise ValueError("Invalid OpenMP Schedule: {}".format(value)) self._omp_schedule = value def set_omp_schedule(self, omp_schedule): """ Expects input to be in the format used by OMP_SCHEDULE i.e. 
"schedule_type, chunk_size" """ temp = omp_schedule.split(",") if len(temp) == 2: self.omp_schedule = (temp[0], int(temp[1])) else: self.omp_schedule = (temp[0], None) def _omp_schedule_default(self): return ("dynamic", 64) @property def use_opencl(self): if self._use_opencl is None: self._use_opencl = self._use_opencl_default() return self._use_opencl @use_opencl.setter def use_opencl(self, value): self._use_opencl = value def _use_opencl_default(self): return False @property def use_cuda(self): if self._use_cuda is None: self._use_cuda = self._use_cuda_default() return self._use_cuda @use_cuda.setter def use_cuda(self, value): self._use_cuda = value def _use_cuda_default(self): return False @property def use_double(self): """This is only used by OpenCL code. """ if self._use_double is None: self._use_double = self._use_double_default() return self._use_double @use_double.setter def use_double(self, value): """This is only used by OpenCL code. """ self._use_double = value def _use_double_default(self): return False @property def profile(self): if self._profile is None: self._profile = self._profile_default() return self._profile @profile.setter def profile(self, value): self._profile = value def _profile_default(self): return False @property def use_local_memory(self): if self._use_local_memory is None: self._use_local_memory = self._use_local_memory_default() return self._use_local_memory @use_local_memory.setter def use_local_memory(self, value): self._use_local_memory = value def _use_local_memory_default(self): return False @property def wgs(self): if self._wgs is None: self._wgs = self._wgs_default() return self._wgs @wgs.setter def wgs(self, value): self._wgs = value def _wgs_default(self): return 32 _config = None def get_config(): global _config if _config is None: _config = Config() return _config def set_config(config): global _config _config = config @contextmanager def use_config(**kw): """A context manager for the configuration. 
One can do the following::

        with use_config(use_openmp=True) as cfg:
            do_something()
            cfg.use_opencl = True
            do_something_else()

    The configuration will be restored to the original when one exits the
    context. Inside the scope of the with statement the configuration ``cfg``
    is the one operational and so can be changed.
    """
    orig_cfg = get_config()
    # Start from a fresh Config and apply only the requested overrides.
    cfg = Config()
    for k, v in kw.items():
        setattr(cfg, k, v)
    set_config(cfg)

    try:
        yield cfg
    finally:
        # Always restore the previous configuration, even on error.
        set_config(orig_cfg)
compyle-release-0.8.1/compyle/cuda.py000066400000000000000000001677361414173670100175740ustar00rootroot00000000000000
"""Common CUDA related functionality.
"""

from __future__ import print_function
import logging
from pytools import Record, RecordWithoutPickling
from pytools.persistent_dict import KeyBuilder as KeyBuilderBase
from pytools.persistent_dict import WriteOncePersistentDict
from pycuda._cluda import CLUDA_PREAMBLE
import pycuda._mymako as mako
from pycuda.tools import (dtype_to_ctype, bitlog2,
                          context_dependent_memoize, ScalarArg, VectorArg)
import pycuda.gpuarray as gpuarray
from compyle.thrust.sort import argsort
import pycuda.driver as drv
from pycuda.compiler import SourceModule as _SourceModule
from pytools import memoize
import numpy as np
import six

# Set to True once set_context() has imported pycuda.autoinit.
_cuda_ctx = False


def set_context():
    """Import ``pycuda.autoinit`` once; later calls do nothing."""
    global _cuda_ctx
    if not _cuda_ctx:
        import pycuda.autoinit
        _cuda_ctx = True


# The following code is taken from pyopencl for struct mapping.
# it should be ported over to pycuda eventually.
import pycuda.gpuarray as gpuarray # noqa class SourceModule(_SourceModule): def __getattr__(self, name): def kernel(*args, **kwargs): f = self.get_function(name) return f(*args, **kwargs) kernel.function_name = name return kernel class _CDeclList: def __init__(self, device): self.device = device self.declared_dtypes = set() self.declarations = [] self.saw_complex = False def add_dtype(self, dtype): dtype = np.dtype(dtype) if dtype.kind == "c": self.saw_complex = True if dtype.kind != "V": return if dtype in self.declared_dtypes: return for name, field_data in sorted(six.iteritems(dtype.fields)): field_dtype, offset = field_data[:2] self.add_dtype(field_dtype) _, cdecl = match_dtype_to_c_struct( self.device, dtype_to_ctype(dtype), dtype ) self.declarations.append(cdecl) self.declared_dtypes.add(dtype) def visit_arguments(self, arguments): for arg in arguments: dtype = arg.dtype if dtype.kind == "c": self.saw_complex = True def get_declarations(self): result = "\n\n".join(self.declarations) if self.saw_complex: result = ( "#include \n\n" + result) return result @memoize def match_dtype_to_c_struct(device, name, dtype, context=None, use_typedef=False): """Return a tuple `(dtype, c_decl)` such that the C struct declaration in `c_decl` and the structure :class:`numpy.dtype` instance `dtype` have the same memory layout. Note that *dtype* may be modified from the value that was passed in, for example to insert padding. (As a remark on implementation, this routine runs a small kernel on the given *device* to ensure that :mod:`numpy` and C offsets and sizes match.) This example explains the use of this function:: >>> import numpy as np >>> import pyopencl as cl >>> import pyopencl.tools >>> ctx = cl.create_some_context() >>> dtype = np.dtype([("id", np.uint32), ("value", np.float32)]) >>> dtype, c_decl = pyopencl.tools.match_dtype_to_c_struct( ... 
ctx.devices[0], 'id_val', dtype) >>> print c_decl typedef struct { unsigned id; float value; } id_val; >>> print dtype [('id', '>> cl.tools.get_or_register_dtype('id_val', dtype) As this example shows, it is important to call :func:`get_or_register_dtype` on the modified `dtype` returned by this function, not the original one. """ fields = sorted( six.iteritems(dtype.fields), key=lambda name_dtype_offset: name_dtype_offset[1][1] ) c_fields = [] for field_name, dtype_and_offset in fields: field_dtype, offset = dtype_and_offset[:2] c_fields.append(" %s %s;" % (dtype_to_ctype(field_dtype), field_name)) if use_typedef: c_decl = "typedef struct {\n%s\n} %s;\n\n" % ( "\n".join(c_fields), name ) else: c_decl = "struct %s {\n%s\n};\n\n" % ( name, "\n".join(c_fields) ) cdl = _CDeclList(device) for field_name, dtype_and_offset in fields: field_dtype, offset = dtype_and_offset[:2] cdl.add_dtype(field_dtype) pre_decls = cdl.get_declarations() offset_code = "\n".join( "result[%d] = pycuda_offsetof(%s, %s);" % (i + 1, name, field_name) for i, (field_name, _) in enumerate(fields)) src = r""" #define pycuda_offsetof(st, m) \ ((uint) ((char *) &(dummy_pycuda.m) \ - (char *)&dummy_pycuda )) %(pre_decls)s %(my_decl)s extern "C" __global__ void get_size_and_offsets(uint *result) { result[0] = sizeof(%(my_type)s); %(my_type)s dummy_pycuda; %(offset_code)s } """ % dict( pre_decls=pre_decls, my_decl=c_decl, my_type=name, offset_code=offset_code) prg = SourceModule(src) knl = prg.get_size_and_offsets result_buf = gpuarray.empty(1 + len(fields), np.uint32) e = drv.Event() knl(result_buf.gpudata, block=(1, 1, 1)) e.record() e.synchronize() size_and_offsets = result_buf.get() size = int(size_and_offsets[0]) from pytools import any offsets = size_and_offsets[1:] if any(ofs >= size for ofs in offsets): # offsets not plausible if dtype.itemsize == size: # If sizes match, use numpy's idea of the offsets. 
offsets = [dtype_and_offset[1] for field_name, dtype_and_offset in fields] else: raise RuntimeError( "OpenCL compiler reported offsetof() past sizeof() " "for struct layout on '%s'. " "This makes no sense, and it's usually indicates a " "compiler bug. " "Refusing to discover struct layout." % device) del knl del prg del context try: dtype_arg_dict = { 'names': [field_name for field_name, (field_dtype, offset) in fields], 'formats': [field_dtype for field_name, (field_dtype, offset) in fields], 'offsets': [int(x) for x in offsets], 'itemsize': int(size_and_offsets[0]), } dtype = np.dtype(dtype_arg_dict) if dtype.itemsize != size_and_offsets[0]: # "Old" versions of numpy (1.6.x?) silently ignore "itemsize". Boo. dtype_arg_dict["names"].append("_pycl_size_fixer") dtype_arg_dict["formats"].append(np.uint8) dtype_arg_dict["offsets"].append(int(size_and_offsets[0]) - 1) dtype = np.dtype(dtype_arg_dict) except NotImplementedError: def calc_field_type(): total_size = 0 padding_count = 0 for offset, (field_name, (field_dtype, _)) in zip(offsets, fields): if offset > total_size: padding_count += 1 yield ('__pycuda_padding%d' % padding_count, 'V%d' % offset - total_size) yield field_name, field_dtype total_size = field_dtype.itemsize + offset dtype = np.dtype(list(calc_field_type())) assert dtype.itemsize == size_and_offsets[0] return dtype, c_decl @memoize def dtype_to_c_struct(device, dtype): if dtype.fields is None: return "" import pyopencl.cltypes if dtype in pyopencl.cltypes.vec_type_to_scalar_and_count: # Vector types are built-in. Don't try to redeclare those. 
return "" matched_dtype, c_decl = match_dtype_to_c_struct( device, dtype_to_ctype(dtype), dtype) def dtypes_match(): result = len(dtype.fields) == len(matched_dtype.fields) for name, val in six.iteritems(dtype.fields): result = result and matched_dtype.fields[name] == val return result assert dtypes_match() return c_decl ##################################################################### # The GenericScanKernel is added here temporarily until the following # PR is merged into PyCUDA # https://github.com/inducer/pycuda/pull/188 ##################################################################### logger = logging.getLogger(__name__) ##################################################################### # The GenericScanKernel is added here temporarily until the following # PR is merged into PyCUDA # https://github.com/inducer/pycuda/pull/188 ##################################################################### def parse_arg_list(arguments): """Parse a list of kernel arguments. *arguments* may be a comma-separate list of C declarators in a string, a list of strings representing C declarators, or :class:`Argument` objects. 
""" if isinstance(arguments, str): arguments = arguments.split(",") def parse_single_arg(obj): if isinstance(obj, str): from pycuda.tools import parse_c_arg return parse_c_arg(obj) else: return obj return [parse_single_arg(arg) for arg in arguments] def get_arg_list_scalar_arg_dtypes(arg_types): result = [] for arg_type in arg_types: if isinstance(arg_type, ScalarArg): result.append(arg_type.dtype) elif isinstance(arg_type, VectorArg): result.append(None) else: raise RuntimeError("arg type not understood: %s" % type(arg_type)) return result def _process_code_for_macro(code): if "//" in code: raise RuntimeError("end-of-line comments ('//') may not be used in " "code snippets") return code.replace("\n", " \\\n") class _NumpyTypesKeyBuilder(KeyBuilderBase): def update_for_type(self, key_hash, key): if issubclass(key, np.generic): self.update_for_str(key_hash, key.__name__) return raise TypeError("unsupported type for persistent hash keying: %s" % type(key)) # {{{ preamble SHARED_PREAMBLE = CLUDA_PREAMBLE + """ #define WG_SIZE ${wg_size} #define SCAN_EXPR(a, b, across_seg_boundary) ${scan_expr} #define INPUT_EXPR(i) (${input_expr}) %if is_segmented: #define IS_SEG_START(i, a) (${is_segment_start_expr}) %endif ${preamble} typedef ${dtype_to_ctype(scan_dtype)} scan_type; typedef ${dtype_to_ctype(index_dtype)} index_type; // NO_SEG_BOUNDARY is the largest representable integer in index_type. // This assumption is used in code below. #define NO_SEG_BOUNDARY ${str(np.iinfo(index_dtype).max)} """ # }}} # {{{ main scan code # Algorithm: Each work group is responsible for one contiguous # 'interval'. There are just enough intervals to fill all compute # units. Intervals are split into 'units'. A unit is what gets # worked on in parallel by one work group. # # in index space: # interval > unit > local-parallel > k-group # # (Note that there is also a transpose in here: The data is read # with local ids along linear index order.) 
# # Each unit has two axes--the local-id axis and the k axis. # # unit 0: # | | | | | | | | | | ----> lid # | | | | | | | | | | # | | | | | | | | | | # | | | | | | | | | | # | | | | | | | | | | # # | # v k (fastest-moving in linear index) # # unit 1: # | | | | | | | | | | ----> lid # | | | | | | | | | | # | | | | | | | | | | # | | | | | | | | | | # | | | | | | | | | | # # | # v k (fastest-moving in linear index) # # ... # # At a device-global level, this is a three-phase algorithm, in # which first each interval does its local scan, then a scan # across intervals exchanges data globally, and the final update # adds the exchanged sums to each interval. # # Exclusive scan is realized by allowing look-behind (access to the # preceding item) in the final update, by means of a local shift. # # NOTE: All segment_start_in_X indices are relative to the start # of the array. SCAN_INTERVALS_SOURCE = SHARED_PREAMBLE + r""" #define K ${k_group_size} // #define DEBUG #ifdef DEBUG #define pycu_printf(ARGS) printf ARGS #else #define pycu_printf(ARGS) /* */ #endif KERNEL REQD_WG_SIZE(WG_SIZE, 1, 1) void ${kernel_name}( ${argument_signature}, GLOBAL_MEM scan_type* __restrict__ partial_scan_buffer, const index_type N, const index_type interval_size %if is_first_level: , GLOBAL_MEM scan_type* __restrict__ interval_results %endif %if is_segmented and is_first_level: // NO_SEG_BOUNDARY if no segment boundary in interval. , GLOBAL_MEM index_type* __restrict__ g_first_segment_start_in_interval %endif %if store_segment_start_flags: , GLOBAL_MEM char* __restrict__ g_segment_start_flags %endif ) { // index K in first dimension used for carry storage %if use_bank_conflict_avoidance: // Avoid bank conflicts by adding a single 32-bit value to the size of // the scan type. 
struct __attribute__ ((__packed__)) wrapped_scan_type { scan_type value; int dummy; }; %else: struct wrapped_scan_type { scan_type value; }; %endif // padded in WG_SIZE to avoid bank conflicts LOCAL_MEM struct wrapped_scan_type ldata[K + 1][WG_SIZE + 1]; %if is_segmented: LOCAL_MEM char l_segment_start_flags[K][WG_SIZE]; LOCAL_MEM index_type l_first_segment_start_in_subtree[WG_SIZE]; // only relevant/populated for local id 0 index_type first_segment_start_in_interval = NO_SEG_BOUNDARY; index_type first_segment_start_in_k_group, first_segment_start_in_subtree; %endif // {{{ declare local data for input_fetch_exprs if any of them are stenciled <% fetch_expr_offsets = {} for name, arg_name, ife_offset in input_fetch_exprs: fetch_expr_offsets.setdefault(arg_name, set()).add(ife_offset) local_fetch_expr_args = set( arg_name for arg_name, ife_offsets in fetch_expr_offsets.items() if -1 in ife_offsets or len(ife_offsets) > 1) %> %for arg_name in local_fetch_expr_args: LOCAL_MEM ${arg_ctypes[arg_name]} l_${arg_name}[WG_SIZE*K]; %endfor // }}} const index_type interval_begin = interval_size * GID_0; const index_type interval_end = min(interval_begin + interval_size, N); const index_type unit_size = K * WG_SIZE; index_type unit_base = interval_begin; %for is_tail in [False, True]: %if not is_tail: for(; unit_base + unit_size <= interval_end; unit_base += unit_size) %else: if (unit_base < interval_end) %endif { // {{{ carry out input_fetch_exprs // (if there are ones that need to be fetched into local) %if local_fetch_expr_args: for(index_type k = 0; k < K; k++) { const index_type offset = k*WG_SIZE + LID_0; const index_type read_i = unit_base + offset; %for arg_name in local_fetch_expr_args: %if is_tail: if (read_i < interval_end) %endif { l_${arg_name}[offset] = ${arg_name}[read_i]; } %endfor } local_barrier(); %endif pycu_printf(("after input_fetch_exprs\n")); // }}} // {{{ read a unit's worth of data from global for(index_type k = 0; k < K; k++) { const index_type offset 
= k*WG_SIZE + LID_0; const index_type read_i = unit_base + offset; %if is_tail: if (read_i < interval_end) %endif { %for name, arg_name, ife_offset in input_fetch_exprs: ${arg_ctypes[arg_name]} ${name}; %if arg_name in local_fetch_expr_args: if (offset + ${ife_offset} >= 0) ${name} = l_${arg_name}[offset + ${ife_offset}]; else if (read_i + ${ife_offset} >= 0) ${name} = ${arg_name}[read_i + ${ife_offset}]; /* else if out of bounds, name is left undefined */ %else: // ${arg_name} gets fetched directly from global ${name} = ${arg_name}[read_i]; %endif %endfor scan_type scan_value = INPUT_EXPR(read_i); const index_type o_mod_k = offset % K; const index_type o_div_k = offset / K; ldata[o_mod_k][o_div_k].value = scan_value; %if is_segmented: bool is_seg_start = IS_SEG_START(read_i, scan_value); l_segment_start_flags[o_mod_k][o_div_k] = is_seg_start; %endif %if store_segment_start_flags: g_segment_start_flags[read_i] = is_seg_start; %endif } } pycu_printf(("after read from global\n")); // }}} // {{{ carry in from previous unit, if applicable %if is_segmented: local_barrier(); first_segment_start_in_k_group = NO_SEG_BOUNDARY; if (l_segment_start_flags[0][LID_0]) first_segment_start_in_k_group = unit_base + K*LID_0; %endif if (LID_0 == 0 && unit_base != interval_begin) { scan_type tmp = ldata[K][WG_SIZE - 1].value; scan_type tmp_aux = ldata[0][0].value; ldata[0][0].value = SCAN_EXPR( tmp, tmp_aux, %if is_segmented: (l_segment_start_flags[0][0]) %else: false %endif ); } pycu_printf(("after carry-in\n")); // }}} local_barrier(); // {{{ scan along k (sequentially in each work item) scan_type sum = ldata[0][LID_0].value; %if is_tail: const index_type offset_end = interval_end - unit_base; %endif for(index_type k = 1; k < K; k++) { %if is_tail: if (K * LID_0 + k < offset_end) %endif { scan_type tmp = ldata[k][LID_0].value; %if is_segmented: index_type seq_i = unit_base + K*LID_0 + k; if (l_segment_start_flags[k][LID_0]) { first_segment_start_in_k_group = min( 
first_segment_start_in_k_group, seq_i); } %endif sum = SCAN_EXPR(sum, tmp, %if is_segmented: (l_segment_start_flags[k][LID_0]) %else: false %endif ); ldata[k][LID_0].value = sum; } } pycu_printf(("after scan along k\n")); // }}} // store carry in out-of-bounds (padding) array entry (index K) in // the K direction ldata[K][LID_0].value = sum; %if is_segmented: l_first_segment_start_in_subtree[LID_0] = first_segment_start_in_k_group; %endif local_barrier(); // {{{ tree-based local parallel scan // This tree-based scan works as follows: // - Each work item adds the previous item to its current state // - barrier // - Each work item adds in the item from two positions to the left // - barrier // - Each work item adds in the item from four positions to the left // ... // At the end, each item has summed all prior items. // across k groups, along local id // (uses out-of-bounds k=K array entry for storage) scan_type val = ldata[K][LID_0].value; <% scan_offset = 1 %> % while scan_offset <= wg_size: // {{{ reads from local allowed, writes to local not allowed if (LID_0 >= ${scan_offset}) { scan_type tmp = ldata[K][LID_0 - ${scan_offset}].value; % if is_tail: if (K*LID_0 < offset_end) % endif { val = SCAN_EXPR(tmp, val, %if is_segmented: (l_first_segment_start_in_subtree[LID_0] != NO_SEG_BOUNDARY) %else: false %endif ); } %if is_segmented: // Prepare for l_first_segment_start_in_subtree, below. // Note that this update must take place *even* if we're // out of bounds. 
first_segment_start_in_subtree = min( l_first_segment_start_in_subtree[LID_0], l_first_segment_start_in_subtree [LID_0 - ${scan_offset}]); %endif } %if is_segmented: else { first_segment_start_in_subtree = l_first_segment_start_in_subtree[LID_0]; } %endif // }}} local_barrier(); // {{{ writes to local allowed, reads from local not allowed ldata[K][LID_0].value = val; %if is_segmented: l_first_segment_start_in_subtree[LID_0] = first_segment_start_in_subtree; %endif // }}} local_barrier(); %if 0: if (LID_0 == 0) { printf("${scan_offset}: "); for (int i = 0; i < WG_SIZE; ++i) { if (l_first_segment_start_in_subtree[i] == NO_SEG_BOUNDARY) printf("- "); else printf("%d ", l_first_segment_start_in_subtree[i]); } printf("\n"); } %endif <% scan_offset *= 2 %> % endwhile pycu_printf(("after tree scan\n")); // }}} // {{{ update local values if (LID_0 > 0) { sum = ldata[K][LID_0 - 1].value; for(index_type k = 0; k < K; k++) { %if is_tail: if (K * LID_0 + k < offset_end) %endif { scan_type tmp = ldata[k][LID_0].value; ldata[k][LID_0].value = SCAN_EXPR(sum, tmp, %if is_segmented: (unit_base + K * LID_0 + k >= first_segment_start_in_k_group) %else: false %endif ); } } } %if is_segmented: if (LID_0 == 0) { // update interval-wide first-seg variable from current unit first_segment_start_in_interval = min( first_segment_start_in_interval, l_first_segment_start_in_subtree[WG_SIZE-1]); } %endif pycu_printf(("after local update\n")); // }}} local_barrier(); // {{{ write data { // work hard with index math to achieve contiguous 32-bit stores GLOBAL_MEM int *dest = (GLOBAL_MEM int *) (partial_scan_buffer + unit_base); <% assert scan_dtype.itemsize % 4 == 0 ints_per_wg = wg_size ints_to_store = scan_dtype.itemsize*wg_size*k_group_size // 4 %> const index_type scan_types_per_int = ${scan_dtype.itemsize//4}; %for store_base in range(0, ints_to_store, ints_per_wg): <% # Observe that ints_to_store is divisible by the work group # size already, so we won't go out of bounds that way. 
assert store_base + ints_per_wg <= ints_to_store %> %if is_tail: if (${store_base} + LID_0 < scan_types_per_int*(interval_end - unit_base)) %endif { index_type linear_index = ${store_base} + LID_0; index_type linear_scan_data_idx = linear_index / scan_types_per_int; index_type remainder = linear_index - linear_scan_data_idx * scan_types_per_int; int* src = (int*) &(ldata [linear_scan_data_idx % K] [linear_scan_data_idx / K].value); dest[linear_index] = src[remainder]; } %endfor } pycu_printf(("after write\n")); // }}} local_barrier(); } % endfor // write interval sum %if is_first_level: if (LID_0 == 0) { interval_results[GID_0] = partial_scan_buffer[interval_end - 1]; %if is_segmented: g_first_segment_start_in_interval[GID_0] = first_segment_start_in_interval; %endif } %endif } """ # }}} # {{{ update UPDATE_SOURCE = SHARED_PREAMBLE + r""" KERNEL REQD_WG_SIZE(WG_SIZE, 1, 1) void ${name_prefix}_final_update( ${argument_signature}, const index_type N, const index_type interval_size, GLOBAL_MEM scan_type* __restrict__ interval_results, GLOBAL_MEM scan_type* __restrict__ partial_scan_buffer %if is_segmented: , GLOBAL_MEM index_type* __restrict__ g_first_segment_start_in_interval %endif %if is_segmented and use_lookbehind_update: , GLOBAL_MEM char* __restrict__ g_segment_start_flags %endif ) { %if use_lookbehind_update: LOCAL_MEM scan_type ldata[WG_SIZE]; %endif %if is_segmented and use_lookbehind_update: LOCAL_MEM char l_segment_start_flags[WG_SIZE]; %endif const index_type interval_begin = interval_size * GID_0; const index_type interval_end = min(interval_begin + interval_size, N); // carry from last interval scan_type carry = ${neutral}; if (GID_0 != 0) carry = interval_results[GID_0 - 1]; %if is_segmented: const index_type first_seg_start_in_interval = g_first_segment_start_in_interval[GID_0]; %endif %if not is_segmented and 'last_item' in output_statement: scan_type last_item = interval_results[GDIM_0-1]; %endif %if not use_lookbehind_update: // {{{ no look-behind 
('prev_item' not in output_statement -> simpler) index_type update_i = interval_begin+LID_0; %if is_segmented: index_type seg_end = min(first_seg_start_in_interval, interval_end); %endif for(; update_i < interval_end; update_i += WG_SIZE) { scan_type partial_val = partial_scan_buffer[update_i]; scan_type item = SCAN_EXPR(carry, partial_val, %if is_segmented: (update_i >= seg_end) %else: false %endif ); index_type i = update_i; { ${output_statement}; } } // }}} %else: // {{{ allow look-behind ('prev_item' in output_statement -> complicated) // We are not allowed to branch across barriers at a granularity smaller // than the whole workgroup. Therefore, the for loop is group-global, // and there are lots of local ifs. index_type group_base = interval_begin; scan_type prev_item = carry; // (A) for(; group_base < interval_end; group_base += WG_SIZE) { index_type update_i = group_base+LID_0; // load a work group's worth of data if (update_i < interval_end) { scan_type tmp = partial_scan_buffer[update_i]; tmp = SCAN_EXPR(carry, tmp, %if is_segmented: (update_i >= first_seg_start_in_interval) %else: false %endif ); ldata[LID_0] = tmp; %if is_segmented: l_segment_start_flags[LID_0] = g_segment_start_flags[update_i]; %endif } local_barrier(); // find prev_item if (LID_0 != 0) prev_item = ldata[LID_0 - 1]; /* else prev_item = carry (see (A)) OR last tail (see (B)); */ if (update_i < interval_end) { %if is_segmented: if (l_segment_start_flags[LID_0]) prev_item = ${neutral}; %endif scan_type item = ldata[LID_0]; index_type i = update_i; { ${output_statement}; } } if (LID_0 == 0) prev_item = ldata[WG_SIZE - 1]; // (B) local_barrier(); } // }}} %endif } """ # }}} # {{{ driver # {{{ helpers def _round_down_to_power_of_2(val): result = 2**bitlog2(val) if result > val: result >>= 1 assert result <= val return result _PREFIX_WORDS = set(""" ldata partial_scan_buffer global scan_offset segment_start_in_k_group carry g_first_segment_start_in_interval IS_SEG_START tmp Z val 
l_first_segment_start_in_subtree unit_size index_type interval_begin interval_size offset_end K SCAN_EXPR do_update WG_SIZE first_segment_start_in_k_group scan_type segment_start_in_subtree offset interval_results interval_end first_segment_start_in_subtree unit_base first_segment_start_in_interval k INPUT_EXPR prev_group_sum prev pv value partial_val pgs is_seg_start update_i scan_item_at_i seq_i read_i l_ o_mod_k o_div_k l_segment_start_flags scan_value sum first_seg_start_in_interval g_segment_start_flags group_base seg_end my_val DEBUG ARGS ints_to_store ints_per_wg scan_types_per_int linear_index linear_scan_data_idx dest src store_base wrapped_scan_type dummy scan_tmp tmp_aux LID_2 LID_1 LID_0 LDIM_0 LDIM_1 LDIM_2 GDIM_0 GDIM_1 GDIM_2 GID_0 GID_1 GID_2 """.split()) _IGNORED_WORDS = set(""" 4 8 32 typedef for endfor if void while endwhile endfor endif else const printf None return bool n char true false ifdef pycu_printf str range assert np iinfo max itemsize __packed__ struct __restrict__ extern C set iteritems len setdefault GLOBAL_MEM LOCAL_MEM_ARG WITHIN_KERNEL LOCAL_MEM KERNEL REQD_WG_SIZE local_barrier __syncthreads pragma __attribute__ __global__ __device__ __shared__ __launch_bounds__ threadIdx blockIdx blockDim gridDim x y z barrier _final_update _debug_scan kernel_name positions all padded integer its previous write based writes 0 has local worth scan_expr to read cannot not X items False bank four beginning follows applicable item min each indices works side scanning right summed relative used id out index avoid current state boundary True across be This reads groups along Otherwise undetermined store of times prior s update first regardless Each number because array unit from segment conflicts two parallel 2 empty define direction CL padding work tree bounds values and adds scan is allowed thus it an as enable at in occur sequentially end no storage data 1 largest may representable uses entry Y meaningful computations interval At the left dimension 
know d A load B group perform shift tail see last OR this add fetched into are directly need gets them stenciled that undefined there up any ones or name only relevant populated even wide we Prepare int seg Note re below place take variable must intra Therefore find code assumption branch workgroup complicated granularity phase remainder than simpler We smaller look ifs lots self behind allow barriers whole loop after already Observe achieve contiguous stores hard go with by math size won t way divisible bit so Avoid declare adding single type is_tail is_first_level input_expr argument_signature preamble double_support neutral output_statement k_group_size name_prefix is_segmented index_dtype scan_dtype wg_size is_segment_start_expr fetch_expr_offsets arg_ctypes ife_offsets input_fetch_exprs def ife_offset arg_name local_fetch_expr_args update_body update_loop_lookbehind update_loop_plain update_loop use_lookbehind_update store_segment_start_flags update_loop first_seg scan_dtype dtype_to_ctype use_bank_conflict_avoidance a b prev_item i last_item prev_value N NO_SEG_BOUNDARY across_seg_boundary """.split()) def _make_template(s): leftovers = set() def replace_id(match): # avoid name clashes with user code by adding 'psc_' prefix to # identifiers. 
word = match.group(1) if word in _IGNORED_WORDS: return word elif word in _PREFIX_WORDS: return "psc_" + word else: leftovers.add(word) return word import re s = re.sub(r"\b([a-zA-Z0-9_]+)\b", replace_id, s) if leftovers: from warnings import warn warn("leftover words in identifier prefixing: " + " ".join(leftovers)) return mako.template.Template(s, strict_undefined=True) class _GeneratedScanKernelInfo(Record): __slots__ = [ "scan_src", "kernel_name", "scalar_arg_dtypes", "wg_size", "k_group_size"] def __init__(self, scan_src, kernel_name, scalar_arg_dtypes, wg_size, k_group_size): Record.__init__(self, scan_src=scan_src, kernel_name=kernel_name, scalar_arg_dtypes=scalar_arg_dtypes, wg_size=wg_size, k_group_size=k_group_size) def build(self, options): program = SourceModule(self.scan_src, options=options) kernel = program.get_function(self.kernel_name) kernel.prepare(self.scalar_arg_dtypes) return _BuiltScanKernelInfo( kernel=kernel, wg_size=self.wg_size, k_group_size=self.k_group_size) class _BuiltScanKernelInfo(RecordWithoutPickling): __slots__ = ["kernel", "wg_size", "k_group_size"] def __init__(self, kernel, wg_size, k_group_size): RecordWithoutPickling.__init__(self, kernel=kernel, wg_size=wg_size, k_group_size=k_group_size) class _GeneratedFinalUpdateKernelInfo(Record): def __init__(self, source, kernel_name, scalar_arg_dtypes, update_wg_size): Record.__init__(self, source=source, kernel_name=kernel_name, scalar_arg_dtypes=scalar_arg_dtypes, update_wg_size=update_wg_size) def build(self, options): program = SourceModule(self.source, options=options) kernel = program.get_function(self.kernel_name) kernel.prepare(self.scalar_arg_dtypes) return _BuiltFinalUpdateKernelInfo( kernel=kernel, update_wg_size=self.update_wg_size ) class _BuiltFinalUpdateKernelInfo(RecordWithoutPickling): __slots__ = ["kernel", "update_wg_size"] def __init__(self, kernel, update_wg_size): RecordWithoutPickling.__init__(self, kernel=kernel, update_wg_size=update_wg_size) # }}} class 
ScanPerformanceWarning(UserWarning): pass class _GenericScanKernelBase(object): # {{{ constructor, argument processing def __init__(self, dtype, arguments, input_expr, scan_expr, neutral, output_statement, is_segment_start_expr=None, input_fetch_exprs=[], index_dtype=np.int32, name_prefix="scan", options=None, preamble=""): """ :arg dtype: the :class:`numpy.dtype` with which the scan will be performed. May be a structured type if that type was registered through :func:`pycuda.tools.get_or_register_dtype`. :arg arguments: A string of comma-separated C argument declarations. If *arguments* is specified, then *input_expr* must also be specified. All types used here must be known to PyCUDA. (see :func:`pycuda.tools.get_or_register_dtype`). :arg scan_expr: The associative, binary operation carrying out the scan, represented as a C string. Its two arguments are available as `a` and `b` when it is evaluated. `b` is guaranteed to be the 'element being updated', and `a` is the increment. Thus, if some data is supposed to just propagate along without being modified by the scan, it should live in `b`. This expression may call functions given in the *preamble*. Another value available to this expression is `across_seg_boundary`, a C `bool` indicating whether this scan update is crossing a segment boundary, as defined by `is_segment_start_expr`. The scan routine does not implement segmentation semantics on its own. It relies on `scan_expr` to do this. This value is available (but always `false`) even for a non-segmented scan. .. note:: In early pre-releases of the segmented scan, segmentation semantics were implemented *without* relying on `scan_expr`. :arg input_expr: A C expression, encoded as a string, resulting in the values to which the scan is applied. This may be used to apply a mapping to values stored in *arguments* before being scanned. The result of this expression must match *dtype*. The index intended to be mapped is available as `i` in this expression. 
This expression may also use the variables defined by *input_fetch_expr*. This expression may also call functions given in the *preamble*. :arg output_statement: a C statement that writes the output of the scan. It has access to the scan result as `item`, the preceding scan result item as `prev_item`, and the current index as `i`. `prev_item` in a segmented scan will be the neutral element at a segment boundary, not the immediately preceding item. Using *prev_item* in output statement has a small run-time cost. `prev_item` enables the construction of an exclusive scan. For non-segmented scans, *output_statement* may also reference `last_item`, which evaluates to the scan result of the last array entry. :arg is_segment_start_expr: A C expression, encoded as a string, resulting in a C `bool` value that determines whether a new scan segments starts at index *i*. If given, makes the scan a segmented scan. Has access to the current index `i`, the result of *input_expr* as a, and in addition may use *arguments* and *input_fetch_expr* variables just like *input_expr*. If it returns true, then previous sums will not spill over into the item with index *i* or subsequent items. :arg input_fetch_exprs: a list of tuples *(NAME, ARG_NAME, OFFSET)*. An entry here has the effect of doing the equivalent of the following before input_expr:: ARG_NAME_TYPE NAME = ARG_NAME[i+OFFSET]; `OFFSET` is allowed to be 0 or -1, and `ARG_NAME_TYPE` is the type of `ARG_NAME`. :arg preamble: |preamble| The first array in the argument list determines the size of the index space over which the scan is carried out, and thus the values over which the index *i* occurring in a number of code fragments in arguments above will vary. All code fragments further have access to N, the number of elements being processed in the scan. 
""" dtype = self.dtype = np.dtype(dtype) if neutral is None: from warnings import warn warn("not specifying 'neutral' is deprecated and will lead to " "wrong results if your scan is not in-place or your " "'output_statement' does something otherwise non-trivial", stacklevel=2) if dtype.itemsize % 4 != 0: raise TypeError( "scan value type must have size divisible by 4 bytes") self.index_dtype = np.dtype(index_dtype) if np.iinfo(self.index_dtype).min >= 0: raise TypeError("index_dtype must be signed") self.options = options self.parsed_args = parse_arg_list(arguments) from pycuda.tools import VectorArg vector_args_indices = [i for i, arg in enumerate(self.parsed_args) if isinstance(arg, VectorArg)] self.first_array_idx = vector_args_indices[0] self.input_expr = input_expr self.is_segment_start_expr = is_segment_start_expr self.is_segmented = is_segment_start_expr is not None if self.is_segmented: is_segment_start_expr = _process_code_for_macro( is_segment_start_expr) self.output_statement = output_statement for name, arg_name, ife_offset in input_fetch_exprs: if ife_offset not in [0, -1]: raise RuntimeError( "input_fetch_expr offsets must either be 0 or -1") self.input_fetch_exprs = input_fetch_exprs arg_dtypes = {} arg_ctypes = {} for arg in self.parsed_args: arg_dtypes[arg.name] = arg.dtype arg_ctypes[arg.name] = dtype_to_ctype(arg.dtype) self.options = options self.name_prefix = name_prefix # {{{ set up shared code dict from pytools import all from pycuda.characterize import has_double_support self.code_variables = dict( np=np, dtype_to_ctype=dtype_to_ctype, preamble=preamble, name_prefix=name_prefix, index_dtype=self.index_dtype, scan_dtype=dtype, is_segmented=self.is_segmented, arg_dtypes=arg_dtypes, arg_ctypes=arg_ctypes, scan_expr=_process_code_for_macro(scan_expr), neutral=_process_code_for_macro(neutral), double_support=has_double_support(), ) index_typename = dtype_to_ctype(self.index_dtype) scan_typename = dtype_to_ctype(dtype) # This key is meant to 
uniquely identify the non-device parameters for # the scan kernel. self.kernel_key = ( self.dtype, tuple(arg.declarator() for arg in self.parsed_args), self.input_expr, scan_expr, neutral, output_statement, is_segment_start_expr, tuple(input_fetch_exprs), index_dtype, name_prefix, preamble, # These depend on dtype_to_ctype(), so their value is independent of # the other variables. index_typename, scan_typename, ) # }}} self.use_lookbehind_update = "prev_item" in self.output_statement self.store_segment_start_flags = ( self.is_segmented and self.use_lookbehind_update) self.finish_setup() # }}} generic_scan_kernel_cache = WriteOncePersistentDict( "pycuda-generated-scan-kernel-cache-v1", key_builder=_NumpyTypesKeyBuilder()) class GenericScanKernel(_GenericScanKernelBase): """Generates and executes code that performs prefix sums ("scans") on arbitrary types, with many possible tweaks. Usage example:: import pycuda.gpuarray as gpuarray from compyle.cuda import GenericScanKernel knl = GenericScanKernel( np.int32, arguments="int *ary", input_expr="ary[i]", scan_expr="a+b", neutral="0", output_statement="ary[i] = item;") a = gpuarray.arange(10000, dtype=np.int32) knl(a) """ def finish_setup(self): # Before generating the kernel, see if it's cached. cache_key = (self.kernel_key,) from_cache = False try: result = generic_scan_kernel_cache[cache_key] from_cache = True logger.debug( "cache hit for generated scan kernel '%s'" % self.name_prefix) ( self.first_level_scan_gen_info, self.second_level_scan_gen_info, self.final_update_gen_info) = result except KeyError: pass if not from_cache: logger.debug( "cache miss for generated scan kernel '%s'" % self.name_prefix) self._finish_setup_impl() result = (self.first_level_scan_gen_info, self.second_level_scan_gen_info, self.final_update_gen_info) generic_scan_kernel_cache.store_if_not_present(cache_key, result) # Build the kernels. 
self.first_level_scan_info = self.first_level_scan_gen_info.build( self.options) del self.first_level_scan_gen_info self.second_level_scan_info = self.second_level_scan_gen_info.build( self.options) del self.second_level_scan_gen_info self.final_update_info = self.final_update_gen_info.build( self.options) del self.final_update_gen_info def _finish_setup_impl(self): # {{{ find usable workgroup/k-group size, build first-level scan trip_count = 0 dev = drv.Context.get_device() avail_local_mem = dev.get_attribute( drv.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK) # not sure where these go, but roughly this much seems unavailable. avail_local_mem -= 0x400 max_scan_wg_size = dev.get_attribute( drv.device_attribute.MAX_THREADS_PER_BLOCK) wg_size_multiples = 64 use_bank_conflict_avoidance = ( self.dtype.itemsize > 4 and self.dtype.itemsize % 8 == 0) # k_group_size should be a power of two because of in-kernel # division by that number. solutions = [] for k_exp in range(0, 9): for wg_size in range(wg_size_multiples, max_scan_wg_size + 1, wg_size_multiples): k_group_size = 2**k_exp lmem_use = self.get_local_mem_use(wg_size, k_group_size, use_bank_conflict_avoidance) if lmem_use <= avail_local_mem: solutions.append( (wg_size * k_group_size, k_group_size, wg_size)) from pytools import any for wg_size_floor in [256, 192, 128]: have_sol_above_floor = any(wg_size >= wg_size_floor for _, _, wg_size in solutions) if have_sol_above_floor: # delete all solutions not meeting the wg size floor solutions = [(total, try_k_group_size, try_wg_size) for total, try_k_group_size, try_wg_size in solutions if try_wg_size >= wg_size_floor] break _, k_group_size, max_scan_wg_size = max(solutions) while True: candidate_scan_gen_info = self.generate_scan_kernel( max_scan_wg_size, self.parsed_args, _process_code_for_macro(self.input_expr), self.is_segment_start_expr, input_fetch_exprs=self.input_fetch_exprs, is_first_level=True, store_segment_start_flags=self.store_segment_start_flags, 
k_group_size=k_group_size, use_bank_conflict_avoidance=use_bank_conflict_avoidance) candidate_scan_info = candidate_scan_gen_info.build( self.options) # Will this device actually let us execute this kernel # at the desired work group size? Building it is the # only way to find out. kernel_max_wg_size = candidate_scan_info.kernel.get_attribute( drv.function_attribute.MAX_THREADS_PER_BLOCK) if candidate_scan_info.wg_size <= kernel_max_wg_size: break else: max_scan_wg_size = min(kernel_max_wg_size, max_scan_wg_size) trip_count += 1 assert trip_count <= 20 self.first_level_scan_gen_info = candidate_scan_gen_info assert (_round_down_to_power_of_2(candidate_scan_info.wg_size) == candidate_scan_info.wg_size) # }}} # {{{ build second-level scan from pycuda.tools import VectorArg second_level_arguments = self.parsed_args + [ VectorArg(self.dtype, "interval_sums")] second_level_build_kwargs = {} if self.is_segmented: second_level_arguments.append( VectorArg(self.index_dtype, "g_first_segment_start_in_interval_input")) # is_segment_start_expr answers the question "should previous sums # spill over into this item". And since # g_first_segment_start_in_interval_input answers the question if a # segment boundary was found in an interval of data, then if not, # it's ok to spill over. 
second_level_build_kwargs["is_segment_start_expr"] = \ "g_first_segment_start_in_interval_input[i] != NO_SEG_BOUNDARY" else: second_level_build_kwargs["is_segment_start_expr"] = None self.second_level_scan_gen_info = self.generate_scan_kernel( max_scan_wg_size, arguments=second_level_arguments, input_expr="interval_sums[i]", input_fetch_exprs=[], is_first_level=False, store_segment_start_flags=False, k_group_size=k_group_size, use_bank_conflict_avoidance=use_bank_conflict_avoidance, **second_level_build_kwargs) # }}} # {{{ generate final update kernel update_wg_size = min(max_scan_wg_size, 256) final_update_tpl = _make_template(UPDATE_SOURCE) final_update_src = str(final_update_tpl.render( wg_size=update_wg_size, output_statement=self.output_statement, argument_signature=", ".join( arg.declarator() for arg in self.parsed_args), is_segment_start_expr=self.is_segment_start_expr, input_expr=_process_code_for_macro(self.input_expr), use_lookbehind_update=self.use_lookbehind_update, **self.code_variables)) update_scalar_arg_dtypes = ( get_arg_list_scalar_arg_dtypes(self.parsed_args) + [self.index_dtype, self.index_dtype, None, None]) if self.is_segmented: # g_first_segment_start_in_interval update_scalar_arg_dtypes.append(None) if self.store_segment_start_flags: update_scalar_arg_dtypes.append(None) # g_segment_start_flags self.final_update_gen_info = _GeneratedFinalUpdateKernelInfo( final_update_src, self.name_prefix + "_final_update", update_scalar_arg_dtypes, update_wg_size) # }}} # {{{ scan kernel build/properties def get_local_mem_use(self, k_group_size, wg_size, use_bank_conflict_avoidance): arg_dtypes = {} for arg in self.parsed_args: arg_dtypes[arg.name] = arg.dtype fetch_expr_offsets = {} for name, arg_name, ife_offset in self.input_fetch_exprs: fetch_expr_offsets.setdefault(arg_name, set()).add(ife_offset) itemsize = self.dtype.itemsize if use_bank_conflict_avoidance: itemsize += 4 return ( # ldata itemsize * (k_group_size + 1) * (wg_size + 1) # 
def __call__(self, *args, **kwargs):
    """Run the scan on the given arguments.

    Positional arguments must match ``self.parsed_args`` one to one.
    Recognised keyword arguments are ``allocator``, ``size`` (the number
    of elements to process; defaults to the length of the argument at
    ``self.first_array_idx``) and ``stream``.

    Returns ``None`` immediately when ``size`` is zero, otherwise the
    result of launching the final update kernel.

    Raises ``TypeError`` when the number of positional arguments does not
    match ``self.parsed_args``.
    """
    # {{{ argument processing

    allocator = kwargs.get("allocator")
    n = kwargs.get("size")
    stream = kwargs.get("stream")

    if len(args) != len(self.parsed_args):
        raise TypeError("expected %d arguments, got %d"
                        % (len(self.parsed_args), len(args)))

    first_array = args[self.first_array_idx]
    allocator = allocator or first_array.allocator

    if n is None:
        n, = first_array.shape

    if n == 0:
        return

    # Vector arguments are passed to the kernels through their ``gpudata``
    # attribute, everything else is passed unchanged.
    data_args = []
    from pycuda.tools import VectorArg
    for arg_descr, arg_val in zip(self.parsed_args, args):
        if isinstance(arg_descr, VectorArg):
            data_args.append(arg_val.gpudata)
        else:
            data_args.append(arg_val)

    # }}}

    l1_info = self.first_level_scan_info
    l2_info = self.second_level_scan_info

    unit_size = l1_info.wg_size * l1_info.k_group_size
    dev = drv.Context.get_device()
    max_intervals = 3 * dev.get_attribute(
        drv.device_attribute.MULTIPROCESSOR_COUNT)

    from pytools import uniform_interval_splitting
    interval_size, num_intervals = uniform_interval_splitting(
        n, unit_size, max_intervals)

    # {{{ allocate some buffers

    interval_results = gpuarray.empty(
        num_intervals, dtype=self.dtype, allocator=allocator)

    partial_scan_buffer = gpuarray.empty(
        n, dtype=self.dtype, allocator=allocator)

    if self.store_segment_start_flags:
        # ``np.bool`` was only an alias of the builtin and has been removed
        # from NumPy; ``np.bool_`` is the actual boolean scalar type.
        segment_start_flags = gpuarray.empty(
            n, dtype=np.bool_, allocator=allocator)

    # }}}

    # {{{ first level scan of interval (one interval per block)

    scan1_args = data_args + [
        partial_scan_buffer.gpudata, n, interval_size,
        interval_results.gpudata,
    ]

    if self.is_segmented:
        first_segment_start_in_interval = gpuarray.empty(
            num_intervals, dtype=self.index_dtype, allocator=allocator)
        scan1_args.append(first_segment_start_in_interval.gpudata)

    if self.store_segment_start_flags:
        scan1_args.append(segment_start_flags.gpudata)

    l1_evt = l1_info.kernel.prepared_async_call(
        (num_intervals, 1), (l1_info.wg_size, 1, 1), stream,
        *scan1_args)

    # }}}

    # {{{ second level scan of per-interval results

    # can scan at most one interval
    assert interval_size >= num_intervals

    scan2_args = data_args + [
        interval_results.gpudata,  # interval_sums
    ]
    if self.is_segmented:
        scan2_args.append(first_segment_start_in_interval.gpudata)
    scan2_args = scan2_args + [
        interval_results.gpudata,  # partial_scan_buffer
        num_intervals, interval_size]

    l2_evt = l2_info.kernel.prepared_async_call(
        (1, 1), (l1_info.wg_size, 1, 1), stream,
        *scan2_args)

    # }}}

    # {{{ update intervals with result of interval scan

    upd_args = data_args + [n, interval_size,
                            interval_results.gpudata,
                            partial_scan_buffer.gpudata]
    if self.is_segmented:
        upd_args.append(first_segment_start_in_interval.gpudata)
    if self.store_segment_start_flags:
        upd_args.append(segment_start_flags.gpudata)

    return self.final_update_info.kernel.prepared_async_call(
        (num_intervals, 1),
        (self.final_update_info.update_wg_size, 1, 1), stream,
        *upd_args)

    # }}}
def get_parallel_range(start, stop=None, step=1, **kwargs):
    """Return the source text of a ``range``/``prange`` call.

    With a single positional argument the range starts at 0, as for the
    builtin ``range``.  When the configuration has OpenMP enabled a
    ``prange(...)`` call is produced; ``schedule`` and ``chunksize`` default
    to the configured ``omp_schedule`` pair and any other keyword argument
    is appended as ``name=repr(value)``.  Without OpenMP a plain
    ``range(...)`` call is produced and the keyword arguments are ignored.
    """
    config = get_config()
    if stop is None:
        start, stop = 0, start

    if not config.use_openmp:
        plain = "{start}, {stop}, {step}".format(
            start=start, stop=stop, step=step
        )
        return "range({})".format(plain)

    omp_schedule = config.omp_schedule
    schedule = kwargs.pop('schedule', omp_schedule[0])
    chunksize = kwargs.pop('chunksize', omp_schedule[1])

    # The template is assembled first and formatted once at the end.
    pieces = ["{start}, {stop}, {step}"]
    if schedule is not None:
        pieces.append(", schedule='{schedule}'")
    if chunksize is not None:
        pieces.append(", chunksize={chunksize}")
    pieces.extend(", %s=%r" % item for item in kwargs.items())

    call_args = ''.join(pieces).format(
        start=start, stop=stop, step=step,
        schedule=schedule, chunksize=chunksize
    )
    return "prange({})".format(call_args)
def all_numeric(seq):
    """Return True if every value in the given sequence is numeric.

    A value counts as numeric only when its exact type is ``int`` or
    ``float`` (or ``long`` where that name exists), so booleans and other
    subclasses are not numeric.  An empty sequence yields True.
    """
    try:
        numeric_types = (int, float, long)  # noqa: F821 (Python 2 only)
    except NameError:
        numeric_types = (int, float)

    for item in seq:
        if type(item) not in numeric_types:
            return False
    return True
def parse(self, obj, declarations=None, is_serial=False):
    """Generate code for the given function or instance.

    Plain Python functions are handed to ``_parse_function`` together with
    ``declarations`` and ``is_serial``; any other object that has a
    ``__class__`` is handed to ``_parse_instance``.  A ``TypeError`` is
    raised for anything else.
    """
    if isinstance(obj, types.FunctionType):
        self._parse_function(obj, declarations=declarations,
                             is_serial=is_serial)
        return

    if hasattr(obj, '__class__'):
        self._parse_instance(obj)
        return

    raise TypeError('Unsupported type to wrap: %s' % type(obj))
def _analyze_method(self, meth, lines):
    """Returns information about the method.

    Specifically it returns the method name, if it has a return value,
    and a list of [(arg_name, value),...].

    When the method has annotations, the value of each argument is its
    annotation (``Undefined`` when missing) and the return information is
    the ``'return'`` annotation (``False`` when missing).  Otherwise the
    values come from the argument defaults, overridden by
    ``self.known_types``, and the return flag from scanning the body
    ``lines`` for a return statement.  A leading ``self`` argument is
    reported as ``('self', None)``.
    """
    name = meth.__name__
    # Look up the fallback lazily: ``inspect.getargspec`` no longer exists
    # on recent Python versions, so naming it as the default of ``getattr``
    # would raise even though ``getfullargspec`` is available.
    getfullargspec = getattr(inspect, 'getfullargspec', None)
    if getfullargspec is None:
        getfullargspec = inspect.getargspec
    argspec = getfullargspec(meth)
    args = argspec.args

    is_method = False
    if args and args[0] == 'self':
        args = args[1:]
        is_method = True

    if hasattr(argspec, 'annotations'):
        annotations = argspec.annotations
    else:
        annotations = getattr(meth, '__annotations__', {})

    call_args = {}
    # Type annotations always take first precendence even over known
    # types.
    if len(annotations) > 0:
        for arg in args:
            call_args[arg] = annotations.get(arg, Undefined)
        returns = annotations.get('return', False)
    else:
        body = ''.join(lines)
        returns = has_return(dedent(body))
        defaults = argspec.defaults if argspec.defaults is not None else []

        # The call_args dict is filled up with the defaults to detect
        # the appropriate type of the arguments.
        for i in range(1, len(defaults) + 1):
            call_args[args[-i]] = defaults[-i]

        # Set the rest to Undefined
        for i in range(len(args) - len(defaults)):
            call_args[args[i]] = Undefined

        # Make sure any predefined quantities are suitably typed.
        call_args.update(self.known_types)

    new_args = [('self', None)] if is_method else []
    for arg in args:
        value = call_args[arg]
        new_args.append((arg, value))

    return name, returns, new_args
def _get_py_method_spec(self, name, returns, args, indent=' ' * 8):
    """Returns a Python friendly definition for the method along with the
    wrapper function.

    The result is a ``(definition, body)`` pair: a ``cpdef`` signature for
    ``py_<name>`` and a one line body that forwards the call to ``name``.
    Arguments whose detected C type is a pointer are forwarded as the
    address of their first element.
    """
    is_method = bool(args) and args[0][0] == 'self'
    if is_method:
        args = args[1:]

    signature = ['self'] if is_method else []
    forwarded = []
    for arg_name, value in args:
        c_type = self.detect_type(arg_name, value)
        py_type = self.ctype_to_python(c_type)
        signature.append('{type} {arg}'.format(type=py_type, arg=arg_name))
        template = '&{arg}[0]' if c_type.endswith('*') else '{arg}'
        forwarded.append(template.format(arg=arg_name))

    if isinstance(returns, KnownType):
        return_decl = returns.type + ' '
    elif returns:
        return_decl = 'double '
    else:
        return_decl = ''

    pydefn = 'cpdef {ret}py_{name}({arg_def}):'.format(
        ret=return_decl, name=name, arg_def=', '.join(signature)
    )
    body = indent + '{ret}{self}{name}({call})\n'.format(
        ret='return ' if returns else '',
        self='self.' if is_method else '',
        name=name,
        call=', '.join(forwarded)
    )
    return pydefn, body
def _handle_cast_statement(self, name, call):
    """Translate ``cast(expr, 'ctype')`` assigned to ``name`` into Cython.

    ``call`` is the text of the call, starting with ``cast(`` and ending
    with ``)``.  Returns the statement ``name = <ctype> (expr)``.
    """
    # FIXME: This won't handle casting to pointers
    # using something like 'intp'
    # The type is always the last argument, so split on the last comma
    # only; the expression being cast may itself contain commas, e.g.
    # ``cast(max(a, b), 'int')``.
    call_args = call[5:-1].rsplit(',', 1)
    expr = call_args[0].strip()
    # Strip the quotes around the type name.
    ctype = call_args[1].strip()[1:-1]
    return '%s = <%s> (%s)' % (name, ctype, expr)
def _process_body_line(self, line, is_serial=False):
    """Returns the name defined and the processed line itself.

    This hack primarily lets us declare variables from Python and inject
    them into Cython code.

    Only non-comment lines containing ``=`` whose right hand side starts
    with ``declare``, ``cast``, ``atomic_inc`` or ``atomic_dec`` are
    rewritten; every other line is returned untouched with an empty name.
    Only ``declare`` lines report a non-empty name.
    """
    untouched = ('', line)
    if '=' not in line or line.strip().startswith('#'):
        return untouched

    parts = [piece.strip() for piece in line.split('=')]
    target, rhs = parts[0], parts[1]
    # Whatever precedes the assigned name is reused as the indentation.
    pad = line[:line.index(target)]

    if rhs.startswith('declare'):
        defn = self._handle_declare_statement(target, rhs)
        return target, pad + defn + '\n'

    if rhs.startswith('cast'):
        stmt = self._handle_cast_statement(target, rhs)
        return '', pad + stmt + '\n'

    if rhs.startswith('atomic_inc'):
        handler = self._handle_atomic_statement_inc
    elif rhs.startswith('atomic_dec'):
        handler = self._handle_atomic_statement_dec
    else:
        return untouched

    stmts = handler(target, rhs, is_serial)
    return '', ''.join(pad + stmt + '\n' for stmt in stmts) + '\n'
def get_platform_dir():
    """Return the name of the per-interpreter, per-platform directory.

    The name has the form ``py<major>.<minor>-<platform>``.  The version is
    taken from ``sys.version_info`` rather than by slicing ``sys.version``,
    which would truncate two digit minor versions (``3.10`` -> ``3.1``).
    """
    return 'py{version}-{platform_dir}'.format(
        version='%d.%d' % sys.version_info[:2],
        platform_dir=get_platform()
    )
def _add_local_include(self):
    """Add ``/usr/local/include`` to the include directories on BSD.

    Nothing happens when the platform name does not contain ``bsd`` or
    when the directory is already listed in ``self.extra_inc_dirs``.
    """
    if 'bsd' not in platform.system().lower():
        return

    include_dirs = self.extra_inc_dirs
    local = '/usr/local/include'
    if local not in include_dirs:
        include_dirs.append(local)
@contextmanager
def _lock(self, timeout=90):
    """Context manager holding a directory based lock at ``self.lock_path``.

    The lock is taken by creating the directory.  While it already exists
    the call polls every 0.1 seconds; once ``timeout`` seconds have passed
    it stops waiting and proceeds anyway (``timeout=None`` waits forever).
    The directory is removed on exit.
    """
    t1 = time.time()

    def _is_timed_out():
        if timeout is None:
            return False
        else:
            return (time.time() - t1) > timeout

    def _try_to_lock():
        if not exists(self.lock_path):
            try:
                os.mkdir(self.lock_path)
            except OSError:
                # Somebody else created it between the check and mkdir.
                return False
            else:
                return True
        return False

    while not _try_to_lock():
        time.sleep(0.1)
        if _is_timed_out():
            break
    try:
        yield
    finally:
        try:
            os.rmdir(self.lock_path)
        except OSError:
            # After a timed out wait the lock may belong to another process
            # which has released it in the meantime; a missing directory
            # must not turn a successful build into a failure.
            pass
def write_and_build(self):
    """Write source and build the extension module.

    When the extension file already exists nothing is written or built and
    only a message is emitted; otherwise the source is written and built
    while holding the lock.
    """
    if exists(self.ext_path):
        self._message("Precompiled code from:", self.src_path)
        return

    with self._lock():
        self._write_source(self.src_path)
        self.build()
class _printf(Extern):
    """Extern wrapper exposing C's ``printf``.

    Calling the instance from pure Python prints the first argument
    %-formatted with the remaining arguments.
    """

    def code(self, backend):
        """Return the code needed to make ``printf`` available."""
        # Always available so no need but in Cython we explicitly define it as
        # an example.
        if backend == 'cython':
            # The C standard I/O declarations live in ``libc.stdio``.
            return 'from libc.stdio cimport printf'
        return ''

    def __call__(self, *args):
        print(args[0] % args[1:])
def memoize_kernel(key=lambda *args: args):
    """Decorator factory caching generated kernels per user function.

    The decorated method is called with the same positional arguments as
    the wrapper; its first argument must have a ``func`` attribute.  Results
    are stored in a ``cached_kernel`` dict keyed by ``key(*args)``.  The dict
    lives on ``func`` so that it is shared by every wrapper of the same
    function; when ``func`` cannot carry attributes (for instance when it is
    ``None`` or a bound method) the dict is kept on the first argument
    instead of failing.
    """
    def _cache_for(owner):
        for holder in (owner.func, owner):
            cache = getattr(holder, 'cached_kernel', None)
            if cache is not None:
                return cache
            try:
                holder.cached_kernel = cache = {}
            except AttributeError:
                # This holder cannot store attributes, try the next one.
                continue
            return cache
        return None

    def memoize_deco(method):
        def wrapper(*args):
            # Resolve the cache before calling ``method``, which may
            # replace ``func`` on the owner.
            cache = _cache_for(args[0])
            key_val = key(*args)
            if cache is None:
                # Nowhere to store the result: just generate it.
                return method(*args)
            if key_val not in cache:
                cache[key_val] = method(*args)
            return cache[key_val]
        return wrapper
    return memoize_deco
def getargspec(f):
    """Return the list of positional argument names of ``f``.

    ``inspect.getfullargspec`` is used when available.  The legacy
    ``inspect.getargspec`` is only looked up when it is actually needed,
    because it no longer exists on recent Python versions and naming it as
    the default of ``getattr`` would raise there.
    """
    getargspec_f = getattr(inspect, 'getfullargspec', None)
    if getargspec_f is None:
        getargspec_f = inspect.getargspec
    return getargspec_f(f)[0]
def get_missing_declarations(self, undecl_var_types):
    """Return the declarations needed for undeclared variables.

    The result maps function names to ``{variable: 'ctype variable;'}``
    dictionaries.  The entry for this helper's own function is built from
    ``undecl_var_types``; the entries collected from external functions are
    merged on top of it.
    """
    own_declarations = {
        var_name: '%s %s;' % (ctype, var_name)
        for var_name, ctype in undecl_var_types.items()
    }
    result = {self.func.__name__: own_declarations}
    result.update(self.external_missing_decl)
    return result
def visit_Call(self, node):
    """Return the type string of the value produced by a call.

    The intrinsics ``declare``, ``cast``, ``atomic_inc``, ``atomic_dec``
    and ``address`` are handled directly and ``annotate`` yields ``None``.
    Functions that were already annotated reuse their recorded return
    type.  Any other non-builtin name is looked up in the module of the
    function being annotated and annotated recursively.  When the callee
    cannot be resolved a warning is issued and ``'double'`` is returned.
    """
    # FIXME: External functions have to be at the module level
    # for this to work. Pass list of external functions to
    # make this work
    func = node.func
    if not isinstance(func, ast.Name):
        # For example ``obj.method(...)``: there is no plain name to look
        # up, so treat it like any other call that cannot be annotated.
        self.warn(dedent(self.warning_msg), func)
        return 'double'

    name = func.id
    if name == 'annotate':
        return
    if name == 'declare':
        return self.visit_declare(node)
    if name == 'cast':
        return self.visit_cast(node)
    if name in ('atomic_inc', 'atomic_dec'):
        return self.visit(node.args[0])
    if name == 'address':
        return self.visit_address(node)
    if name in self.external_funcs:
        return self.external_funcs[name].get_return_type()

    if name not in BUILTINS:
        mod = importlib.import_module(self.func.__module__)
        f = getattr(mod, name, None)
        if f is None or isinstance(f, Extern):
            # ``warn`` needs the node to report the offending line.
            self.warn("%s could not be found or is an external function "
                      "and cannot be handled by JIT" % name, node)
            return 'double'
        f_helper = self.recursive_annotate(f, node)
        return f_helper.get_return_type()

    self.warn(dedent(self.warning_msg), func)
    return 'double'
class ElementwiseJIT(parallel.ElementwiseBase):
    """Elementwise kernel whose argument types are inferred at call time.

    Construction only records the configuration; the kernel itself is
    generated by ``_generate_kernel`` on the first call with concrete
    arguments.
    """

    def __init__(self, func, backend=None):
        """Store ``func`` and set up the transpiler for ``backend``."""
        backend = array.get_backend(backend)
        self.tp = Transpiler(backend=backend)
        self.backend = backend
        self.name = 'elwise_%s' % func.__name__
        self.func = func
        self._config = get_config()
        self.cython_gen = CythonGenerator()
        # Placeholder text until the first call generates real source.
        placeholder = '# Code jitted, call the function to generate the code.'
        self.source = placeholder
        self.all_source = placeholder
        if backend == 'opencl':
            from .opencl import get_context, get_queue
            self.queue = get_queue()

    def get_type_info_from_args(self, *args):
        """Map each argument name of ``self.func`` to a C type name.

        The index argument ``i`` (when present) is always ``'int'``; every
        other argument is typed from the matching positional value, with
        ``'double'`` used when no type can be determined.
        """
        names = getargspec(self.func)
        inferred = {}
        if 'i' in names:
            names.remove('i')
            inferred['i'] = 'int'
        inferred.update(
            (name, get_ctype_from_arg(value, backend=self.backend) or 'double')
            for value, name in zip(args, names)
        )
        return inferred

    @memoize_kernel(key=kernel_cache_key_args)
    def _generate_kernel(self, *args):
        """Annotate ``self.func`` for these arguments and build the kernel."""
        if self.func is not None:
            helper = AnnotationHelper(
                self.func, self.get_type_info_from_args(*args)
            )
            declarations = helper.annotate()
            self.func = helper.func
        return self._generate(declarations=declarations)

    def _massage_arg(self, x):
        """Convert a user argument into what the compiled kernel accepts."""
        if isinstance(x, array.Array):
            return x.dev
        if self.backend == 'cuda' and not isinstance(x, np.ndarray):
            return np.asarray(x)
        return x

    @profile
    def __call__(self, *args, **kw):
        """Generate the kernel if needed, run it, and wait for completion."""
        kernel = self._generate_kernel(*args)
        call_args = [self._massage_arg(value) for value in args]
        target = self.backend
        if target == 'cython':
            # The Cython entry point takes the problem size first.
            kernel(len(call_args[0]), *call_args, **kw)
        elif target == 'opencl':
            kernel(*call_args, **kw)
            self.queue.finish()
        elif target == 'cuda':
            import pycuda.driver as drv
            done = drv.Event()
            kernel(*call_args, **kw)
            done.record()
            done.synchronize()
class ReductionJIT(parallel.ReductionBase):
    """Reduction whose map-function argument types are inferred at call time.

    Parameters
    ----------
    reduce_expr : str
        Expression combining the accumulators ``a`` and ``b``.
    map_func : callable, optional
        Function applied to every index before reducing. When ``None`` no
        annotation is performed and kernel generation is delegated to
        ``_generate`` with no declarations.
    dtype_out : numpy dtype
        Output type of the reduction.
    neutral : str
        Neutral element of the reduction, as source text.
    backend : str
        Execution backend, resolved through ``array.get_backend``.
    """

    def __init__(self, reduce_expr, map_func=None, dtype_out=np.float64,
                 neutral='0', backend='cython'):
        backend = array.get_backend(backend)
        self.tp = Transpiler(backend=backend)
        self.backend = backend
        self.func = map_func
        if map_func is not None:
            self.name = 'reduce_' + map_func.__name__
        else:
            self.name = 'reduce'
        self.reduce_expr = reduce_expr
        self.dtype_out = dtype_out
        self.type = dtype_to_ctype(dtype_out, backend)
        if backend == 'cython':
            # On Windows, INFINITY is not defined so we use INFTY which we
            # internally define.
            self.neutral = neutral.replace('INFINITY', 'INFTY')
        else:
            self.neutral = neutral
        self._config = get_config()
        self.cython_gen = CythonGenerator()
        # Placeholder text until the first call generates real source.
        self.source = '# Code jitted, call the function to generate the code.'
        self.all_source = self.source
        if backend == 'opencl':
            from .opencl import get_queue
            self.queue = get_queue()

    def get_type_info_from_args(self, *args):
        """Map each argument name of the map function to a C type name.

        ``i`` (when present) is always ``'int'``; other arguments are typed
        from the matching positional value, defaulting to ``'double'``.
        """
        type_info = {}
        arg_names = getargspec(self.func)
        if 'i' in arg_names:
            arg_names.remove('i')
            type_info['i'] = 'int'
        for arg, name in zip(args, arg_names):
            arg_type = get_ctype_from_arg(arg, backend=self.backend)
            if not arg_type:
                arg_type = 'double'
            type_info[name] = arg_type
        return type_info

    @memoize_kernel(key=kernel_cache_key_args)
    def _generate_kernel(self, *args):
        """Annotate the map function (if any) and build the kernel."""
        # Without a map function there is nothing to annotate. Start from
        # ``None`` so the call below does not fail on an unbound local.
        declarations = None
        if self.func is not None:
            arg_types = self.get_type_info_from_args(*args)
            helper = AnnotationHelper(self.func, arg_types)
            declarations = helper.annotate()
            self.func = helper.func
        return self._generate(declarations=declarations)

    def _massage_arg(self, x):
        """Convert a user argument into what the compiled kernel accepts."""
        if isinstance(x, array.Array):
            return x.dev
        elif self.backend != 'cuda' or isinstance(x, np.ndarray):
            return x
        else:
            return np.asarray(x)

    @profile
    def __call__(self, *args, **kw):
        """Run the reduction over ``args`` and return its result."""
        c_func = self._generate_kernel(*args)
        c_args = [self._massage_arg(x) for x in args]

        if self.backend == 'cython':
            # The Cython entry point takes the problem size first.
            size = len(c_args[0])
            c_args.insert(0, size)
            return c_func(*c_args, **kw)
        elif self.backend == 'opencl':
            result = c_func(*c_args, **kw)
            self.queue.finish()
            return result.get()
        elif self.backend == 'cuda':
            import pycuda.driver as drv
            event = drv.Event()
            result = c_func(*c_args, **kw)
            event.record()
            event.synchronize()
            return result.get()
def get_type_info_from_kwargs(self, func, **kwargs):
    """Map every argument name of ``func`` to a C type name.

    Names listed in ``self.builtin_types`` take their type from that
    table; all others are typed from the keyword value of the same name.
    ``'double'`` is used whenever no type could be determined.

    Raises
    ------
    ValueError
        If a non-builtin argument is missing from ``kwargs`` (or is
        ``None``).
    """
    known = self.builtin_types
    resolved = {}
    for param in getargspec(func):
        is_builtin = param in known
        value = kwargs.get(param, None)
        if not is_builtin and value is None:
            raise ValueError("Argument %s not found" % param)
        if is_builtin:
            ctype = known[param]
        else:
            ctype = get_ctype_from_arg(value, backend=self.backend)
        resolved[param] = ctype or 'double'
    return resolved
class LocalMem(object):
    '''A local memory specification for a GPU kernel.

    An example illustrates this best::

        >>> l = LocalMem(2)
        >>> m = l.get('double', 128)
        >>> m.size
        2048

    Note that this is basically ``sizeof(double) * 128 * 2``

    '''

    def __init__(self, size, backend=None):
        '''Constructor

        Parameters
        ----------

        size: int: a multiple of the current work group size.
        backend: str: one of 'opencl', 'cuda'
        '''
        self.backend = get_backend(backend)
        # NOTE(review): this tests the raw argument, not the resolved
        # ``self.backend``, so a 'cython' default picked by ``get_backend``
        # is not rejected here -- confirm whether that is intended.
        if backend == 'cython':
            raise NotImplementedError(
                'LocalMem is only meaningful for the opencl/cuda backends.'
            )
        self.size = size
        # Allocations are memoised per (c_type, workgroup_size) pair.
        self._cache = {}

    def get(self, c_type, workgroup_size):
        """Return the local memory required given the type and work group
        size.

        Results are cached per ``(c_type, workgroup_size)``. Only the
        'opencl' backend can allocate; any other backend raises
        ``NotImplementedError`` on a cache miss.
        """
        key = (c_type, workgroup_size)
        try:
            return self._cache[key]
        except KeyError:
            pass

        if self.backend != 'opencl':
            raise NotImplementedError(
                'Backend %s not implemented' % self.backend
            )

        import pyopencl as cl
        itemsize = ctype_to_dtype(c_type).itemsize
        allocation = cl.LocalMemory(itemsize * self.size * workgroup_size)
        self._cache[key] = allocation
        return allocation
def _get_func_info(self):
    """Collect argument and return type information for ``self.func``.

    Returns
    -------
    dict
        ``'args'``: list of ``(name, KnownType)`` pairs in signature order;
        ``'local_info'``: mapping of argument name to base type for every
        argument whose type contains ``'LOCAL_MEM'``;
        ``'return'``: the ``'return'`` annotation, or ``KnownType('void')``
        when there is none.

    When ``self._use_double`` is false, ``double`` is rewritten to
    ``float`` in each argument's type and base type. An argument without
    an annotation fails at the ``annotations[arg]`` lookup.
    """
    # ``inspect.getargspec`` was removed in Python 3.11. Passing it as the
    # (eagerly evaluated) default of ``getattr`` raised AttributeError
    # there, so only fall back to it when ``getfullargspec`` is missing.
    try:
        getfullargspec = inspect.getfullargspec
    except AttributeError:
        getfullargspec = inspect.getargspec
    argspec = getfullargspec(self.func)
    annotations = getattr(
        argspec, 'annotations', self.func.__annotations__
    )

    arg_info = []
    local_info = {}
    for arg in argspec.args:
        kt = annotations[arg]
        if not self._use_double:
            kt = KnownType(
                self._to_float(kt.type), self._to_float(kt.base_type)
            )
        if 'LOCAL_MEM' in kt.type:
            local_info[arg] = kt.base_type
        arg_info.append((arg, kt))
    func_info = {
        'args': arg_info,
        'local_info': local_info,
        'return': annotations.get('return', KnownType('void'))
    }
    return func_info
def _generate(self): self.tp.add(self.func) self._correct_opencl_address_space() self.tp.compile() self.source = self.tp.source if self.backend == 'opencl': self.knl = getattr(self.tp.mod, self.name) import pyopencl as cl self._max_work_group_size = self.knl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, self.queue.device ) elif self.backend == 'cuda': self.knl = self.tp.mod.get_function(self.name) def _correct_opencl_address_space(self): code = self.tp.blocks[-1].code.splitlines() # To remove WITHIN_KERNEL code[0] = 'KERNEL ' + code[0][13:] self.tp.blocks[-1].code = '\n'.join(code) def _massage_arg(self, x, type_info, workgroup_size): if isinstance(x, Array): if self.backend == 'opencl': return x.dev.data elif self.backend == 'cuda': return x.dev elif isinstance(x, LocalMem): if self.backend == 'opencl': return x.get(type_info.base_type, workgroup_size) elif self.backend == 'cuda': return np.array(workgroup_size, dtype=np.int32) else: dtype = ctype_to_dtype(type_info.type) return np.array([x], dtype=dtype) def _get_args(self, args, workgroup_size): arg_info = self._func_info['args'] c_args = [] for arg, a_info in zip(args, arg_info): c_args.append(self._massage_arg(arg, a_info[1], workgroup_size)) return c_args def _get_workgroup_size(self, global_size): if self.backend == 'opencl': gs, ls = splay_cl(self.queue, global_size, self._max_work_group_size) elif self.backend == 'cuda': from pycuda.gpuarray import splay gs, ls = splay(global_size) return gs, ls @profile def __call__(self, *args, **kw): size = args[0].data.shape gs = kw.pop('global_size', size) n = np.prod(gs) ls = kw.pop('local_size', None) if ls is not None: local_size = np.prod(ls) global_size = ((n + local_size - 1) // local_size) * local_size gs = (global_size, ) else: gs, ls = self._get_workgroup_size(n) c_args = self._get_args(args, ls[0]) if self.backend == 'opencl': prepend = [self.queue, gs, ls] c_args = prepend + c_args self.knl(*c_args) self.queue.finish() elif self.backend == 
class _parallel(Extern):
    """Extern stand-in for ``cython.parallel.parallel``.

    Calling it from Python does nothing and returns ``None``.
    """

    def code(self, backend):
        """Return the import line for the generated Cython code.

        Raises ``NotImplementedError`` for any backend other than
        'cython'.
        """
        if backend != 'cython':
            # The message previously named ``prange`` (copy-paste slip).
            raise NotImplementedError('parallel only available with Cython')
        return 'from cython.parallel import parallel'

    def __call__(self, *args, **kw):
        pass
class SimpleKernel(object):
    """ElementwiseKernel substitute that supports a custom work group size.

    Parameters
    ----------
    ctx
        OpenCL context the program is built in.
    args : str
        Comma separated C argument declarations; pointer arguments get a
        ``__global`` qualifier added.
    operation : str
        Kernel body source.
    wgs : int
        Work group size the caller intends to use. ``DeviceWGSException``
        is raised when the built kernel reports a smaller maximum.
    name : str
        Kernel name.
    preamble : str
        Source placed before the kernel definition.
    options : list, optional
        Build options passed to ``Program.build``; defaults to no options.
    """

    def __init__(self, ctx, args, operation, wgs, name="", preamble="",
                 options=None):
        # A fresh list per instance; the old ``options=[]`` default was one
        # list object shared by every kernel built without options.
        if options is None:
            options = []
        self.args = args
        self.operation = operation
        self.name = name
        self.preamble = preamble
        self.options = options
        self.prg = cl.Program(ctx, self._generate()).build(options)
        self.knl = getattr(self.prg, name)
        max_wgs = self.get_max_wgs()
        if max_wgs < wgs:
            raise DeviceWGSException(
                "Requested work group size %s exceeds the maximum of %s "
                "supported for kernel '%s'" % (wgs, max_wgs, name)
            )

    def _massage_arg(self, arg):
        """Prefix pointer argument declarations with ``__global``."""
        if '*' in arg:
            return "__global " + arg
        return arg

    def _generate(self):
        """Return the OpenCL source for the kernel."""
        args = [self._massage_arg(arg) for arg in self.args.split(",")]
        source = r"""
        %(preamble)s

        __kernel void %(name)s(%(args)s)
        {
            int lid = get_local_id(0);
            int gsize = get_global_size(0);
            int work_group_start = get_local_size(0)*get_group_id(0);
            long i = get_global_id(0);

            %(body)s
        }
        """ % {
            "args": ",".join(args),
            "name": self.name,
            "preamble": self.preamble,
            "body": self.operation
        }
        return source

    def get_max_wgs(self):
        """Return the kernel's maximum work group size on the queue's
        device."""
        return self.knl.get_work_group_info(
            cl.kernel_work_group_info.WORK_GROUP_SIZE,
            get_queue().device
        )

    def __call__(self, *args, **kwargs):
        """Enqueue the kernel and return the resulting event.

        ``queue``, ``gs`` (global size) and ``ls`` (local size) are
        required keyword arguments; ``wait_for`` is optional.

        Raises
        ------
        ValueError
            If ``queue``, ``gs`` or ``ls`` is missing.
        TypeError
            If any other keyword argument is given.
        """
        # Import the submodule explicitly: ``import pyopencl`` by itself is
        # not guaranteed to make ``cl.array`` available.
        import pyopencl.array as cl_array

        wait_for = kwargs.pop("wait_for", None)
        queue = kwargs.pop("queue", None)
        gs = kwargs.pop("gs", None)
        ls = kwargs.pop("ls", None)

        if queue is None or gs is None or ls is None:
            raise ValueError("queue, gs and ls can not be empty")

        if kwargs:
            raise TypeError("unknown keyword arguments: '%s'"
                            % ", ".join(kwargs))

        def unwrap(arg):
            # Device arrays are passed to the kernel as their raw buffer.
            return arg.data if isinstance(arg, cl_array.Array) else arg

        self.knl.set_args(*[unwrap(arg) for arg in args])

        return cl.enqueue_nd_range_kernel(queue, self.knl, gs, ls,
                                          wait_for=wait_for)
However, we also provide Cython implementations for these and unify the syntax in a transparent way which allows users to write the code once and have it run on different execution backends. """ from functools import wraps from textwrap import wrap from mako.template import Template import numpy as np from .config import get_config from .profile import profile from .cython_generator import get_parallel_range, CythonGenerator from .transpiler import Transpiler, convert_to_float_if_needed from .types import dtype_to_ctype from . import array elementwise_cy_template = ''' from cython.parallel import parallel, prange cdef c_${name}(${c_arg_sig}): cdef int i %if openmp: with nogil, parallel(): for i in ${get_parallel_range("SIZE")}: %else: if 1: for i in range(SIZE): %endif ${name}(${c_args}) cpdef py_${name}(${py_arg_sig}): c_${name}(${py_args}) ''' reduction_cy_template = ''' from cython.parallel import parallel, prange, threadid from libc.stdlib cimport abort, malloc, free from libc.math cimport INFINITY cimport openmp cdef double INFTY = float('inf') cpdef int get_number_of_threads(): % if openmp: cdef int i, n with nogil, parallel(): for i in prange(1): n = openmp.omp_get_num_threads() return n % else: return 1 % endif cdef int gcd(int a, int b): while b != 0: a, b = b, a%b return a cdef int get_stride(int sz, int itemsize): return sz//gcd(sz, itemsize) cdef ${type} c_${name}(${c_arg_sig}): cdef int i, n_thread, tid, scan_stride, sz cdef ${type} a, b n_thread = get_number_of_threads() sz = sizeof(${type}) # This striding is to do 64 bit alignment to prevent false sharing. 
scan_stride = get_stride(64, sz) cdef ${type}* buffer buffer = <${type}*>malloc(n_thread*scan_stride*sz) if buffer == NULL: raise MemoryError("Unable to allocate memory for reduction") %if openmp: with nogil, parallel(): % else: if 1: % endif tid = threadid() buffer[tid*scan_stride] = ${neutral} %if openmp: for i in ${get_parallel_range("SIZE")}: %else: for i in range(SIZE): %endif a = buffer[tid*scan_stride] b = ${map_expr} buffer[tid*scan_stride] = ${reduce_expr} a = ${neutral} for i in range(n_thread): b = buffer[i*scan_stride] a = ${reduce_expr} free(buffer) return a cpdef py_${name}(${py_arg_sig}): return c_${name}(${py_args}) ''' scan_cy_template = ''' from cython.parallel import parallel, prange, threadid from libc.stdlib cimport abort, malloc, free cimport openmp cimport numpy as np cpdef int get_number_of_threads(): % if openmp: cdef int i, n with nogil, parallel(): for i in prange(1): n = openmp.omp_get_num_threads() return n % else: return 1 % endif cdef int gcd(int a, int b): while b != 0: a, b = b, a % b return a cdef int get_stride(int sz, int itemsize): return sz // gcd(sz, itemsize) cdef void c_${name}(${c_arg_sig}): cdef int i, n_thread, tid, scan_stride, sz, N N = SIZE n_thread = get_number_of_threads() sz = sizeof(${type}) # This striding is to do 64 bit alignment to prevent false sharing. scan_stride = get_stride(64, sz) cdef ${type}* buffer buffer = <${type}*> malloc(n_thread * scan_stride * sz) if buffer == NULL: raise MemoryError("Unable to allocate memory for scan.") % if use_segment: cdef int* scan_seg_flags cdef int* chunk_new_segment scan_seg_flags = malloc(SIZE * sizeof(int)) chunk_new_segment = malloc(n_thread * scan_stride * sizeof(int)) if scan_seg_flags == NULL or chunk_new_segment == NULL: raise MemoryError("Unable to allocate memory for segmented scan") % endif % if complex_map: cdef ${type}* map_output map_output = <${type}*> malloc(SIZE * sz) if map_output == NULL: raise MemoryError("Unable to allocate memory for scan. 
(Recommended: Set complex_map=False.)") % endif cdef int buffer_idx, start, end, has_segment cdef ${type} a, b, temp # This chunksize would divide input data equally # between threads % if not calc_last_item: # A chunk of 1 MB per thread cdef int chunksize = 1048576 // sz % else: # Process all data together. Only then can we get # the last item immediately cdef int chunksize = (SIZE + n_thread - 1) // n_thread % endif cdef int offset = 0 cdef ${type} global_carry = ${neutral} cdef ${type} last_item cdef ${type} carry, item, prev_item while offset < SIZE: # Pass 1 with nogil, parallel(): tid = threadid() buffer_idx = tid * scan_stride start = offset + tid * chunksize end = min(offset + (tid + 1) * chunksize, SIZE) has_segment = 0 temp = ${neutral} for i in range(start, end): % if use_segment: # Generate segment flags scan_seg_flags[i] = ${is_segment_start_expr} if (scan_seg_flags[i]): has_segment = 1 % endif # Carry % if use_segment: if (scan_seg_flags[i]): a = ${neutral} else: a = temp % else: a = temp % endif # Map b = ${input_expr} % if complex_map: map_output[i] = b % endif # Scan temp = ${scan_expr} buffer[buffer_idx] = temp % if use_segment: chunk_new_segment[buffer_idx] = has_segment % endif # Pass 2: Aggregate chunks # Add previous carry to buffer[0] % if use_segment: if chunk_new_segment[0]: a = ${neutral} else: a = global_carry b = buffer[0] % else: a = global_carry b = buffer[0] % endif buffer[0] = ${scan_expr} for i in range(n_thread - 1): % if use_segment: # With segmented scan if chunk_new_segment[(i + 1) * scan_stride]: a = ${neutral} else: a = buffer[i * scan_stride] b = buffer[(i + 1) * scan_stride] buffer[(i + 1) * scan_stride] = ${scan_expr} % else: # Without segmented scan a = buffer[i * scan_stride] b = buffer[(i + 1) * scan_stride] buffer[(i + 1) * scan_stride] = ${scan_expr} % endif last_item = buffer[(n_thread - 1) * scan_stride] # Shift buffer to right by 1 unit for i in range(n_thread - 1, 0, -1): buffer[i * scan_stride] = buffer[(i - 1) * 
def drop_duplicates(arr):
    """Return the items of ``arr`` as a list with duplicates removed.

    The first occurrence of each item is kept and the original order is
    preserved. Hashable items are tracked in a set so the common case is
    linear; unhashable items (e.g. lists) fall back to an equality scan of
    the result.
    """
    result = []
    seen = set()
    for x in arr:
        try:
            if x in seen:
                continue
            seen.add(x)
        except TypeError:
            # Unhashable item: compare against what was kept so far.
            if x in result:
                continue
        result.append(x)
    return result
def _generate(self, declarations=None):
    """Transpile ``self.func`` and build the elementwise kernel.

    Sets ``self.source`` (the generated user code) and ``self.all_source``
    (everything that is compiled, where available).

    Parameters
    ----------
    declarations : optional
        Extra declarations forwarded to ``self.tp.add``.

    Returns
    -------
    The callable kernel: the ``py_<name>`` function of the compiled module
    for 'cython', or an ``ElementwiseKernel`` for 'opencl' / 'cuda'.
    Any other backend falls through and returns ``None``.
    """
    self.tp.add(self.func, declarations=declarations)
    if self.backend == 'cython':
        # FIXME: Handle the name of the kernel correctly
        py_data, c_data = self.cython_gen.get_func_signature(self.func)
        # The leading index argument is replaced by the problem size.
        py_defn = ['long SIZE'] + py_data[0][1:]
        c_defn = ['long SIZE'] + c_data[0][1:]
        py_args = ['SIZE'] + py_data[1][1:]
        template = Template(text=elementwise_cy_template)
        src = template.render(
            # self.name is 'elwise_<func name>'; [7:] drops the prefix.
            name=self.name[7:],
            c_arg_sig=', '.join(c_defn),
            c_args=', '.join(c_data[1]),
            py_arg_sig=', '.join(py_defn),
            py_args=', '.join(py_args),
            openmp=self._config.use_openmp and not getattr(
                self.func, 'is_serial', False),
            get_parallel_range=get_parallel_range
        )
        # This is the user code source.
        self.source = self.tp.get_code()
        self.tp.add_code(src)
        self.tp.compile()
        # All the source code for the elementwise
        self.all_source = self.tp.source
        return getattr(self.tp.mod, 'py_' + self.name[7:])
    elif self.backend == 'opencl':
        py_data, c_data = self.cython_gen.get_func_signature(self.func)
        self._correct_opencl_address_space(c_data)

        from .opencl import get_context, get_queue
        from pyopencl.elementwise import ElementwiseKernel
        from pyopencl._cluda import CLUDA_PREAMBLE
        ctx = get_context()
        self.queue = get_queue()
        name = self.func.__name__
        expr = '{func}({args})'.format(
            func=name,
            args=', '.join(c_data[1])
        )
        arguments = convert_to_float_if_needed(', '.join(c_data[0][1:]))
        preamble = convert_to_float_if_needed(self.tp.get_code())
        cluda_preamble = Template(text=CLUDA_PREAMBLE).render(
            double_support=True
        )
        full_preamble = "\n".join([cluda_preamble, preamble])
        knl = ElementwiseKernel(
            ctx,
            name=self.name,
            arguments=arguments,
            operation=expr,
            preamble=full_preamble
        )
        # only code we generate is saved here.
        self.source = full_preamble
        all_source = knl.get_kernel(False)[0].program.source
        self.all_source = all_source or self.source
        return knl
    elif self.backend == 'cuda':
        py_data, c_data = self.cython_gen.get_func_signature(self.func)
        self._correct_opencl_address_space(c_data)

        from .cuda import set_context
        set_context()
        from pycuda.elementwise import ElementwiseKernel
        from pycuda._cluda import CLUDA_PREAMBLE
        name = self.func.__name__
        expr = '{func}({args})'.format(
            func=name,
            args=', '.join(c_data[1])
        )
        arguments = convert_to_float_if_needed(', '.join(c_data[0][1:]))
        preamble = convert_to_float_if_needed(self.tp.get_code())
        cluda_preamble = Template(text=CLUDA_PREAMBLE).render(
            double_support=True
        )
        full_preamble = "\n".join([cluda_preamble, preamble])
        knl = ElementwiseKernel(
            name=self.name,
            arguments=arguments,
            operation=expr,
            preamble=full_preamble
        )
        # only code we generate is saved here. Keep it identical to the
        # preamble handed to the kernel: the previous plain concatenation
        # dropped the newline separating the two parts.
        self.source = full_preamble
        # FIXME: it is difficult to get the sources from pycuda.
        self.all_source = self.source
        return knl
self.all_source = self.source return knl def _correct_opencl_address_space(self, c_data): code = self.tp.blocks[-1].code.splitlines() header_idx = 1 for line in code: if line.rstrip().endswith(')'): break header_idx += 1 def _add_address_space(arg): if '*' in arg and 'GLOBAL_MEM' not in arg: return 'GLOBAL_MEM ' + arg else: return arg args = [_add_address_space(arg) for arg in c_data[0]] code[:header_idx] = wrap( 'WITHIN_KERNEL void {func}({args})'.format( func=self.func.__name__, args=', '.join(args) ), width=78, subsequent_indent=' ' * 4, break_long_words=False ) self.tp.blocks[-1].code = '\n'.join(code) def _massage_arg(self, x): if isinstance(x, array.Array): return x.dev elif self.backend != 'cuda' or isinstance(x, np.ndarray): return x else: return np.asarray(x) @profile def __call__(self, *args, **kw): c_args = [self._massage_arg(x) for x in args] if self.backend == 'cython': size = len(c_args[0]) c_args.insert(0, size) self.c_func(*c_args, **kw) elif self.backend == 'opencl': self.c_func(*c_args, **kw) self.queue.finish() elif self.backend == 'cuda': import pycuda.driver as drv event = drv.Event() self.c_func(*c_args, **kw) event.record() event.synchronize() class Elementwise(object): def __init__(self, func, backend=None): if getattr(func, '__annotations__', None) and not hasattr(func, 'is_jit'): self.elementwise = ElementwiseBase(func, backend=backend) else: from .jit import ElementwiseJIT self.elementwise = ElementwiseJIT(func, backend=backend) def __getattr__(self, name): return getattr(self.elementwise, name) def __dir__(self): return sorted(dir(self.elementwise) + ['elementwise']) def __call__(self, *args, **kwargs): self.elementwise(*args, **kwargs) def elementwise(func=None, backend=None): def _wrapper(function): return wraps(function)(Elementwise(function, backend=backend)) if func is None: return _wrapper else: return _wrapper(func) class ReductionBase(object): def __init__(self, reduce_expr, map_func=None, dtype_out=np.float64, neutral='0', 
backend='cython'): backend = array.get_backend(backend) self.tp = Transpiler(backend=backend) self.backend = backend self.func = map_func if map_func is not None: self.name = 'reduce_' + map_func.__name__ else: self.name = 'reduce' self.reduce_expr = reduce_expr self.dtype_out = dtype_out self.type = dtype_to_ctype(dtype_out, backend=backend) if backend == 'cython': # On Windows, INFINITY is not defined so we use INFTY which we # internally define. self.neutral = neutral.replace('INFINITY', 'INFTY') else: self.neutral = neutral self._config = get_config() self.cython_gen = CythonGenerator() self.queue = None # This is the source generated for the user code. self.source = '# Source not yet generated.' # This is all the source code used. self.all_source = '# Source not yet generated.' self.c_func = self._generate() def _generate(self, declarations=None): if self.backend == 'cython': if self.func is not None: self.tp.add(self.func, declarations=declarations) py_data, c_data = self.cython_gen.get_func_signature(self.func) self._correct_return_type(c_data) name = self.func.__name__ cargs = ', '.join(c_data[1]) map_expr = '{name}({cargs})'.format(name=name, cargs=cargs) else: py_data = (['int i', '{type}[:] inp'.format(type=self.type)], ['i', '&inp[0]']) c_data = (['int i', '{type}* inp'.format(type=self.type)], ['i', 'inp']) map_expr = 'inp[i]' py_defn = ['long SIZE'] + py_data[0][1:] c_defn = ['long SIZE'] + c_data[0][1:] py_args = ['SIZE'] + py_data[1][1:] template = Template(text=reduction_cy_template) src = template.render( name=self.name, type=self.type, map_expr=map_expr, reduce_expr=self.reduce_expr, neutral=self.neutral, c_arg_sig=', '.join(c_defn), py_arg_sig=', '.join(py_defn), py_args=', '.join(py_args), openmp=self._config.use_openmp, get_parallel_range=get_parallel_range ) # This is the user code source. 
self.source = self.tp.get_code() self.tp.add_code(src) self.tp.compile() self.all_source = self.tp.source return getattr(self.tp.mod, 'py_' + self.name) elif self.backend == 'opencl': if self.func is not None: self.tp.add(self.func, declarations=declarations) py_data, c_data = self.cython_gen.get_func_signature(self.func) self._correct_opencl_address_space(c_data) name = self.func.__name__ expr = '{func}({args})'.format( func=name, args=', '.join(c_data[1]) ) arguments = convert_to_float_if_needed( ', '.join(c_data[0][1:]) ) preamble = convert_to_float_if_needed(self.tp.get_code()) else: arguments = '{type} *in'.format(type=self.type) expr = None preamble = '' from .opencl import get_context, get_queue from pyopencl.reduction import ReductionKernel from pyopencl._cluda import CLUDA_PREAMBLE cluda_preamble = Template(text=CLUDA_PREAMBLE).render( double_support=True ) ctx = get_context() self.queue = get_queue() knl = ReductionKernel( ctx, dtype_out=self.dtype_out, neutral=self.neutral, reduce_expr=self.reduce_expr, map_expr=expr, arguments=arguments, preamble="\n".join([cluda_preamble, preamble]) ) # only code we generate is saved here. 
self.source = "\n".join([cluda_preamble, preamble]) if knl.stage_1_inf.source: self.all_source = "\n".join([ "// ------ stage 1 -----", knl.stage_1_inf.source, "// ------ stage 2 -----", knl.stage_2_inf.source, ]) else: self.all_source = self.source return knl elif self.backend == 'cuda': if self.func is not None: self.tp.add(self.func, declarations=declarations) py_data, c_data = self.cython_gen.get_func_signature(self.func) self._correct_opencl_address_space(c_data) name = self.func.__name__ expr = '{func}({args})'.format( func=name, args=', '.join(c_data[1]) ) arguments = convert_to_float_if_needed( ', '.join(c_data[0][1:]) ) preamble = convert_to_float_if_needed(self.tp.get_code()) else: arguments = '{type} *in'.format(type=self.type) expr = None preamble = '' from .cuda import set_context set_context() from pycuda.reduction import ReductionKernel from pycuda._cluda import CLUDA_PREAMBLE cluda_preamble = Template(text=CLUDA_PREAMBLE).render( double_support=True ) knl = ReductionKernel( dtype_out=self.dtype_out, neutral=self.neutral, reduce_expr=self.reduce_expr, map_expr=expr, arguments=arguments, preamble="\n".join([cluda_preamble, preamble]) ) # only code we generate is saved here. self.source = cluda_preamble + preamble # FIXME: it is difficult to get the sources from pycuda. 
class Reduction(object):
    """Callable front-end that selects a reduction implementation.

    ``ReductionBase`` is used when ``map_func`` is ``None`` or when it has
    non-empty ``__annotations__`` and no ``is_jit`` attribute; otherwise
    ``ReductionJIT`` (imported lazily from ``.jit``) is used.  The selected
    object is stored as ``self.reduction``; attribute access, ``dir()`` and
    calls are forwarded to it.

    All constructor arguments are passed through unchanged to the selected
    implementation.
    """

    def __init__(self, reduce_expr, map_func=None, dtype_out=np.float64,
                 neutral='0', backend='cython'):
        if map_func is None or getattr(map_func, '__annotations__', None) and \
                not hasattr(map_func, 'is_jit'):
            self.reduction = ReductionBase(reduce_expr, map_func=map_func,
                                           dtype_out=dtype_out,
                                           neutral=neutral,
                                           backend=backend)
        else:
            from .jit import ReductionJIT
            self.reduction = ReductionJIT(reduce_expr, map_func=map_func,
                                          dtype_out=dtype_out,
                                          neutral=neutral,
                                          backend=backend)

    def __dir__(self):
        """List the implementation's attributes plus ``reduction``."""
        return sorted(dir(self.reduction) + ['reduction'])

    def __getattr__(self, name):
        """Forward lookups that fail on this object to the implementation.

        Raises
        ------
        AttributeError
            If ``reduction`` itself has not been set yet.
        """
        # __getattr__ only runs after normal lookup has failed.  If the
        # missing name is 'reduction' (instance created without __init__,
        # e.g. by copy or unpickling), evaluating ``self.reduction`` below
        # would re-enter this method without end.
        if name == 'reduction':
            raise AttributeError(name)
        return getattr(self.reduction, name)

    def __call__(self, *args, **kwargs):
        """Run the implementation and return whatever it returns."""
        return self.reduction(*args, **kwargs)
self.cython_gen = CythonGenerator() self.queue = None self.c_func = self._generate() def _get_backend_key(self): return get_common_cache_key(self) def _correct_return_type(self, c_data, modifier): code = self.tp.blocks[-1].code.splitlines() if self._config.use_openmp: gil = " nogil" else: gil = "" code[0] = "cdef inline {type} {name}_{modifier}({args}){gil}:".format( type=self.type, name=self.name, modifier=modifier, args=', '.join(c_data[0]), gil=gil ) self.tp.blocks[-1].code = '\n'.join(code) def _include_prev_item(self): if 'prev_item' in self.tp.blocks[-1].code: return True else: return False def _include_last_item(self): if 'last_item' in self.tp.blocks[-1].code: return True else: return False def _not_ignored(self, args): ignore = ['item', 'prev_item', 'last_item', 'i', 'N'] return [i for (i, x) in enumerate(args) if x not in ignore] def _filter_ignored(self, args, indices): return [args[x] for x in indices] def _generate(self, declarations=None): if self.backend == 'opencl': return self._generate_opencl_kernel(declarations=declarations) elif self.backend == 'cuda': return self._generate_cuda_kernel(declarations=declarations) elif self.backend == 'cython': return self._generate_cython_code(declarations=declarations) def _default_cython_input_function(self): py_data = (['int i', '{type}[:] input'.format(type=self.type)], ['i', '&input[0]']) c_data = (['int i', '{type}* input'.format(type=self.type)], ['i', 'input']) input_expr = 'input[i]' return py_data, c_data, input_expr def _wrap_cython_code(self, func, func_type=None, declarations=None): name = self.name if func is not None: self.tp.add(func, declarations=declarations) py_data, c_data = self.cython_gen.get_func_signature(func) self._correct_return_type(c_data, func_type) cargs = ', '.join(c_data[1]) expr = '{name}_{modifier}({cargs})'.format(name=name, cargs=cargs, modifier=func_type) else: if func_type == 'input': py_data, c_data, expr = self._default_cython_input_function() else: py_data, c_data, expr = 
[], [], None return py_data, c_data, expr def _append_cython_arg_data(self, all_py_data, all_c_data, py_data, c_data): if len(c_data) > 0: select = self._not_ignored(c_data[1]) all_py_data[0].extend(self._filter_ignored(py_data[0], select)) all_py_data[1].extend(self._filter_ignored(py_data[1], select)) all_c_data[0].extend(self._filter_ignored(c_data[0], select)) all_c_data[1].extend(self._filter_ignored(c_data[1], select)) def _generate_cython_code(self, declarations=None): all_py_data = [[], []] all_c_data = [[], []] # Process input function py_data, c_data, input_expr = self._wrap_cython_code( self.input_func, func_type='input', declarations=declarations ) self._append_cython_arg_data(all_py_data, all_c_data, py_data, c_data) # Process segment function use_segment = True if self.is_segment_func is not None else False py_data, c_data, segment_expr = self._wrap_cython_code( self.is_segment_func, func_type='segment', declarations=declarations ) self._append_cython_arg_data(all_py_data, all_c_data, py_data, c_data) # Process output expression calc_last_item = False calc_prev_item = False py_data, c_data, output_expr = self._wrap_cython_code( self.output_func, func_type='output', declarations=declarations) if self.output_func is not None: calc_last_item = self._include_last_item() calc_prev_item = self._include_prev_item() self._append_cython_arg_data(all_py_data, all_c_data, py_data, c_data) # Add size argument py_defn = ['long SIZE'] + all_py_data[0] c_defn = ['long SIZE'] + all_c_data[0] py_args = ['SIZE'] + all_py_data[1] c_args = ['SIZE'] + all_c_data[1] # Only use unique arguments py_defn = drop_duplicates(py_defn) c_defn = drop_duplicates(c_defn) py_args = drop_duplicates(py_args) c_args = drop_duplicates(c_args) if not hasattr(self.output_func, 'arg_keys'): self.output_func.arg_keys = {} self.output_func.arg_keys[self._get_backend_key()] = c_args if self._config.use_openmp: template = Template(text=scan_cy_template) else: template = 
Template(text=scan_cy_single_thread_template) src = template.render( name=self.name, type=self.type, input_expr=input_expr, scan_expr=self.scan_expr, output_expr=output_expr, neutral=self.neutral, c_arg_sig=', '.join(c_defn), py_arg_sig=', '.join(py_defn), py_args=', '.join(py_args), openmp=self._config.use_openmp, calc_last_item=calc_last_item, calc_prev_item=calc_prev_item, use_segment=use_segment, is_segment_start_expr=segment_expr, complex_map=self.complex_map ) self.source = self.tp.get_code() self.tp.add_code(src) self.tp.compile() self.all_source = self.tp.source return getattr(self.tp.mod, 'py_' + self.name) def _wrap_ocl_function(self, func, func_type=None, declarations=None): if func is not None: self.tp.add(func, declarations=declarations) py_data, c_data = self.cython_gen.get_func_signature(func) self._correct_opencl_address_space(c_data, func, func_type) name = func.__name__ expr = '{func}({args})'.format( func=name, args=', '.join(c_data[1]) ) select = self._not_ignored(c_data[1]) arguments = self._filter_ignored(c_data[0], select) c_args = self._filter_ignored(c_data[1], select) else: if func_type == 'input': if self.backend == 'opencl': arguments = ['__global %(type)s *input' % {'type': self.type}] elif self.backend == 'cuda': arguments = ['%(type)s *input' % {'type': self.type}] expr = 'input[i]' c_args = ['input'] else: arguments = [] expr = None c_args = [] return expr, arguments, c_args def _get_scan_expr_opencl_cuda(self): if self.is_segment_func is not None: return '(across_seg_boundary ? 
def _get_opencl_cuda_code(self, declarations=None):
    """Collect everything the OpenCL/CUDA scan kernel constructor needs.

    The input, output and segment functions are wrapped, in that order,
    with ``self._wrap_ocl_function``.  Their argument declarations and
    argument names are then concatenated in the order input, segment,
    output and de-duplicated with ``drop_duplicates``.  The de-duplicated
    names are stored in ``self.output_func.arg_keys`` under
    ``self._get_backend_key()``.

    Returns the tuple ``(scan_expr, arg_defn, input_expr, output_expr,
    segment_expr, preamble)``.
    """
    input_expr, in_decls, in_names = self._wrap_ocl_function(
        self.input_func, func_type='input', declarations=declarations
    )
    output_expr, out_decls, out_names = self._wrap_ocl_function(
        self.output_func, func_type='output', declarations=declarations
    )
    segment_expr, seg_decls, seg_names = self._wrap_ocl_function(
        self.is_segment_func, declarations=declarations
    )

    scan_expr = self._get_scan_expr_opencl_cuda()
    preamble = convert_to_float_if_needed(self.tp.get_code())

    unique_decls = drop_duplicates(in_decls + seg_decls + out_decls)
    arg_defn = convert_to_float_if_needed(','.join(unique_decls))

    unique_names = drop_duplicates(in_names + seg_names + out_names)

    # The argument order is kept on the output function object, one entry
    # per backend key.
    if not hasattr(self.output_func, 'arg_keys'):
        self.output_func.arg_keys = {}
    self.output_func.arg_keys[self._get_backend_key()] = unique_names

    return (scan_expr, arg_defn, input_expr, output_expr,
            segment_expr, preamble)
scan_expr, arg_defn, input_expr, output_expr, \ segment_expr, preamble = self._get_opencl_cuda_code( declarations=declarations ) from .cuda import set_context, GenericScanKernel set_context() knl = GenericScanKernel( dtype=self.dtype, arguments=arg_defn, input_expr=input_expr, scan_expr=scan_expr, neutral=self.neutral, output_statement=output_expr, is_segment_start_expr=segment_expr, preamble=preamble ) self.source = preamble # FIXME: Difficult to get the pycuda sources self.all_source = self.source return knl def _add_address_space(self, arg): if '*' in arg and 'GLOBAL_MEM' not in arg: return 'GLOBAL_MEM ' + arg else: return arg def _correct_opencl_address_space(self, c_data, func, func_type): return_type = 'void' if func_type == 'output' else self.type code = self.tp.blocks[-1].code.splitlines() header_idx = 1 for line in code: if line.rstrip().endswith(')'): break header_idx += 1 args = [self._add_address_space(arg) for arg in c_data[0]] code[:header_idx] = wrap( 'WITHIN_KERNEL {type} {func}({args})'.format( type=return_type, func=func.__name__, args=', '.join(args) ), width=78, subsequent_indent=' ' * 4, break_long_words=False ) self.tp.blocks[-1].code = '\n'.join(code) def _massage_arg(self, x): if isinstance(x, array.Array): return x.dev elif self.backend != 'cuda' or isinstance(x, np.ndarray): return x else: return np.asarray(x) @profile def __call__(self, **kwargs): c_args_dict = {k: self._massage_arg(x) for k, x in kwargs.items()} if self._get_backend_key() in self.output_func.arg_keys: output_arg_keys = self.output_func.arg_keys[ self._get_backend_key() ] else: raise ValueError("No kernel arguments found for backend = %s, " "use_openmp = %s, use_double = %s" % self._get_backend_key()) if self.backend == 'cython': size = len(c_args_dict[output_arg_keys[1]]) c_args_dict['SIZE'] = size self.c_func(*[c_args_dict[k] for k in output_arg_keys]) elif self.backend == 'opencl': self.c_func(*[c_args_dict[k] for k in output_arg_keys]) self.queue.finish() elif 
class Scan(object):
    """Callable front-end that selects a scan implementation.

    ``ScanBase`` is used only when each of ``input``, ``output`` and
    ``is_segment`` is either ``None`` or a function with non-empty
    ``__annotations__`` and no ``is_jit`` attribute.  Otherwise ``ScanJIT``
    (imported lazily from ``.jit``) is used.  The selected object is stored
    as ``self.scan``; attribute access, ``dir()`` and calls are forwarded to
    it.

    All constructor arguments are passed through unchanged to the selected
    implementation.
    """

    def __init__(self, input=None, output=None, scan_expr="a+b",
                 is_segment=None, dtype=np.float64, neutral='0',
                 complex_map=False, backend=None):
        # Each function is judged on its own attributes.  Previously the
        # ``output`` and ``is_segment`` checks looked for ``is_jit`` on
        # ``input`` instead of on themselves.
        use_base = all(
            self._is_base_func(func)
            for func in (input, output, is_segment)
        )
        if use_base:
            self.scan = ScanBase(input=input, output=output,
                                 scan_expr=scan_expr, is_segment=is_segment,
                                 dtype=dtype, neutral=neutral,
                                 complex_map=complex_map, backend=backend)
        else:
            from .jit import ScanJIT
            self.scan = ScanJIT(input=input, output=output,
                                scan_expr=scan_expr, is_segment=is_segment,
                                dtype=dtype, neutral=neutral,
                                complex_map=complex_map, backend=backend)

    @staticmethod
    def _is_base_func(func):
        """Return True if ``func`` does not require the JIT implementation.

        That is the case when ``func`` is None, or when it has non-empty
        ``__annotations__`` and no ``is_jit`` attribute.
        """
        if func is None:
            return True
        annotated = bool(getattr(func, '__annotations__', None))
        return annotated and not hasattr(func, 'is_jit')

    def __dir__(self):
        """List the implementation's attributes plus ``scan``."""
        return sorted(dir(self.scan) + ['scan'])

    def __getattr__(self, name):
        """Forward lookups that fail on this object to the implementation.

        Raises
        ------
        AttributeError
            If ``scan`` itself has not been set yet.
        """
        # __getattr__ only runs after normal lookup has failed.  If the
        # missing name is 'scan' (instance created without __init__, e.g.
        # by copy or unpickling), evaluating ``self.scan`` below would
        # re-enter this method without end.
        if name == 'scan':
            raise AttributeError(name)
        return getattr(self.scan, name)

    def __call__(self, **kwargs):
        """Run the implementation with keyword arguments; returns None."""
        self.scan(**kwargs)
def profile(method=None, name=None):
    """Decorator for profiling a function.

    Can be used as follows::

        @profile
        def f():
            pass

    If explicitly passed a name, with ``@profile(name='some name')``, that
    name is used to record the profile.  Otherwise, if the first positional
    argument of the call has an attribute named like the function and also
    a ``name`` attribute, that ``name`` attribute is used.  Otherwise the
    function's qualified name is used.

    The returned wrapper keeps the metadata of the decorated function
    (``__name__``, ``__qualname__``, ``__doc__``, ``__module__``) and exposes
    the original as ``__wrapped__``.
    """
    # Local import so that this function does not rely on a module-level
    # import being present.
    from functools import wraps

    def make_wrapper(method):
        @wraps(method)
        def wrapper(*args, **kwargs):
            self = args[0] if len(args) else None
            if name is None:
                if hasattr(self, method.__name__) and hasattr(self, 'name'):
                    p_name = self.name
                else:
                    p_name = getattr(method, '__qualname__', method.__name__)
            else:
                p_name = name
            with profile_ctx(p_name):
                return method(*args, **kwargs)
        return wrapper

    if method is None:
        return make_wrapper
    else:
        return make_wrapper(method)
def print_profile():
    """Print the recorded profile data as a table on standard output.

    The output is framed by two 70-character rules.  With no recorded data
    a single notice is printed.  Otherwise at most the levels 0 and 1 are
    listed, each sorted by decreasing time, followed by the sum of the
    level-0 times.
    """
    global _profile_info
    rule = '-'*70
    print(rule)
    if not len(_profile_info):
        print("No profiling information available")
        print(rule)
        return

    print("Profiling info:")
    header = "{:<6} {:<40} {:<10} {:<10}".format(
        'Level', 'Function', 'N calls', 'Time')
    print(header)

    def _elapsed(entry):
        return entry[1]['time']

    total = 0
    n_levels = min(len(_profile_info), 2)
    for level in range(n_levels):
        rows = list(_profile_info[level].items())
        rows.sort(key=_elapsed, reverse=True)
        for func_name, stats in rows:
            line = "{:<6} {:<40} {:<10} {:<10.3g}".format(
                level, func_name, stats['calls'], stats['time'])
            print(line)
            # Only the outermost level contributes to the total.
            if level != 0:
                continue
            total += stats['time']
    print("Total profiled time: %g secs" % total)
    print(rule)
def profile_kernel(kernel, name, backend=None):
    """For profiling raw PyCUDA/PyOpenCL kernels or cython functions.

    When ``get_config().profile`` is false, ``kernel`` is returned unchanged.
    Otherwise a wrapper is returned that records the time of every call
    under ``name``:

    * ``'opencl'``: the call is timed until the returned event's ``wait()``
      completes; the event is returned.
    * ``'cuda'``: the kernel is called with ``time_kernel=True`` and the
      value it returns is recorded as the time and returned.
    * any other backend: the call is timed with ``time.time()`` and the
      callable's own result is returned.

    If ``kernel`` has a ``get_work_group_info`` attribute it is copied onto
    the wrapper.
    """
    from compyle.array import get_backend
    backend = get_backend(backend)

    def _profile_knl(*args, **kwargs):
        if backend == 'opencl':
            start = time.time()
            event = kernel(*args, **kwargs)
            event.wait()
            end = time.time()
            _record_profile(name, end - start)
            return event
        elif backend == 'cuda':
            exec_time = kernel(*args, **kwargs, time_kernel=True)
            _record_profile(name, exec_time)
            return exec_time
        else:
            start = time.time()
            result = kernel(*args, **kwargs)
            end = time.time()
            _record_profile(name, end - start)
            # Hand back the callable's result so that enabling profiling
            # does not change what the caller receives.
            return result

    if get_config().profile:
        wgi = getattr(kernel, 'get_work_group_info', None)
        if wgi is not None:
            _profile_knl.get_work_group_info = wgi
        return _profile_knl
    else:
        return kernel
This can be used on a function that returns a raw PyCUDA/PyOpenCL kernel For example:: @named_profile('prefix_sum') def _get_prefix_sum(ctx): return GenericScanKernel(ctx, np.int32, arguments="__global int *ary", input_expr="ary[i]", scan_expr="a+b", neutral="0", output_statement="ary[i] = prev_item") """ from compyle.array import get_backend backend = get_backend(backend) def _decorator(f): if name is None: n = f.__name__ else: n = name def _profiled_kernel_generator(*args, **kwargs): kernel = f(*args, **kwargs) return profile_kernel(kernel, n, backend=backend) return _profiled_kernel_generator return _decorator compyle-release-0.8.1/compyle/sort.py000066400000000000000000000054761414173670100176370ustar00rootroot00000000000000import numpy as np from .config import get_config from .cython_generator import get_parallel_range, CythonGenerator from .transpiler import Transpiler, convert_to_float_if_needed from .types import dtype_to_ctype, annotate from .parallel import Scan from .template import Template from . 
def radix_sort(ary_list, out_list=None, max_key_bits=None, backend=None):
    """Sort the arrays in ``ary_list`` by the integer keys in its first array.

    One ``Scan`` pass is run per key bit, starting at bit 0.  Each pass
    reads one set of arrays and writes another; the two sets are swapped
    after every pass.

    Parameters
    ----------
    ary_list : list
        Arrays to reorder together.  ``ary_list[0]`` holds the keys and must
        have an integer dtype.
    out_list : list, optional
        If given and non-empty, used as one of the two working sets instead
        of newly allocated arrays.
    max_key_bits : int, optional
        Number of low-order key bits to process.  Defaults to the full
        width of the key dtype (``8 * itemsize``).
    backend : str, optional
        Resolved with ``array.get_backend``.

    Returns
    -------
    tuple
        ``(arrays, indices)``: the arrays written by the final pass and the
        index array written by the final pass.

    Raises
    ------
    ValueError
        If the key array does not have an integer dtype.
    """
    keys = ary_list[0]
    backend = array.get_backend(backend)
    if not np.issubdtype(keys.dtype, np.integer):
        raise ValueError("RadixSort can only sort integer types")
    if max_key_bits is None:
        max_key_bits = 8 * keys.dtype.itemsize

    # temp arrays
    sorted_indices = array.zeros(keys.length, np.int32, backend=backend)
    temp_indices = array.zeros_like(sorted_indices)
    indices = array.arange(0, keys.length, 1, backend=backend)

    # allocate temp arrays
    if out_list:
        temp_ary_list = out_list
    else:
        temp_ary_list = [array.zeros_like(ary) for ary in ary_list]
    sorted_ary_list = [array.zeros_like(ary) for ary in ary_list]

    # kernel
    output_sort_bit = OutputSortBit('output_sort_bit', len(ary_list))
    sort_bit_knl = Scan(input_sort_bit, output_sort_bit.function,
                        'a+b', dtype=keys.dtype, backend=backend)

    for bit_number in range(max_key_bits):
        if bit_number == 0:
            inp_indices = indices
            inp_ary_list = ary_list
        else:
            inp_indices = temp_indices
            inp_ary_list = temp_ary_list
        # Each pass must read the index array written by the previous pass
        # (``inp_indices``), not the initial ``indices``; otherwise the
        # returned indices describe only the last pass instead of the
        # overall reordering.
        args = {'bit_number': bit_number, 'indices': inp_indices,
                'sorted_indices': sorted_indices}
        args.update({'inp_%i' % i: ary
                     for i, ary in enumerate(inp_ary_list)})
        args.update({'out_%i' % i: ary
                     for i, ary in enumerate(sorted_ary_list)})
        sort_bit_knl(**args)
        # The output of this pass becomes the input of the next one.
        temp_indices, sorted_indices = sorted_indices, temp_indices
        temp_ary_list, sorted_ary_list = sorted_ary_list, temp_ary_list
    return temp_ary_list, temp_indices
def extra_args(self):
    """Hook for subclasses that need additional, configurable arguments.

    Returns a pair: a list of argument names (strings) and a dictionary
    with their type annotations.  This default implementation adds nothing
    and returns an empty list and an empty dictionary.
    """
    argument_names = []
    argument_annotations = {}
    return argument_names, argument_annotations
from ..types import int_, declare def py3_f(x: int_) -> int_: y = declare('int') y = x + 1 return x*y compyle-release-0.8.1/compyle/tests/test_array.py000066400000000000000000000327341414173670100221640ustar00rootroot00000000000000import pytest import numpy as np from ..array import Array, wrap_array from ..config import Config, get_config import compyle.array as array from compyle import config check_all_backends = pytest.mark.parametrize('backend', ['cython', 'opencl', 'cuda']) check_all_dtypes = pytest.mark.parametrize('dtype', [np.int32, np.float32, np.float64]) def make_dev_array(backend, n=16): dev_array = Array(np.int32, n=n, backend=backend) dev_array.fill(0) dev_array[0] = 1 return dev_array def check_import(backend): if backend == 'opencl': pytest.importorskip('pyopencl') if backend == 'cuda': pytest.importorskip('pycuda') @check_all_backends def test_reserve(backend): check_import(backend) # Given dev_array = make_dev_array(backend) # When dev_array.reserve(64) # Then assert len(dev_array.get_data()) == 64 assert dev_array.length == 16 assert dev_array[0] == 1 @check_all_backends def test_resize_with_reallocation(backend): check_import(backend) # Given dev_array = make_dev_array(backend) # When dev_array.resize(64) # Then assert len(dev_array.get_data()) == 64 assert dev_array.length == 64 assert dev_array[0] == 1 @check_all_backends def test_resize_without_reallocation(backend): check_import(backend) # Given dev_array = make_dev_array(backend, n=128) # When dev_array.resize(64) # Then assert len(dev_array.get_data()) == 128 assert dev_array.length == 64 assert dev_array[0] == 1 @check_all_backends def test_copy(backend): check_import(backend) # Given dev_array = make_dev_array(backend) # When dev_array_copy = dev_array.copy() # Then print(dev_array.dev, dev_array_copy.dev) assert np.all(dev_array.get() == dev_array_copy.get()) dev_array_copy[0] = 2 assert dev_array[0] != dev_array_copy[0] @check_all_backends def test_append_with_reallocation(backend): 
check_import(backend) # Given dev_array = make_dev_array(backend) # When dev_array.append(2) # Then assert dev_array[-1] == 2 assert len(dev_array.get_data()) == 32 @check_all_backends def test_append_without_reallocation(backend): check_import(backend) # Given dev_array = make_dev_array(backend) dev_array.reserve(20) # When dev_array.append(2) # Then assert dev_array[-1] == 2 assert len(dev_array.get_data()) == 20 @check_all_backends def test_extend(backend): check_import(backend) # Given dev_array = make_dev_array(backend) new_array = 2 + array.zeros(64, dtype=np.int32, backend=backend) # When dev_array.extend(new_array) # Then old_nparr = dev_array.get() new_nparr = new_array.get() assert np.all(old_nparr[-len(new_array)] == new_nparr) @check_all_backends def test_remove(backend): check_import(backend) # Given dev_array = Array(np.int32, backend=backend) orig_array = array.arange(0, 16, 1, dtype=np.int32, backend=backend) dev_array.set_data(orig_array) indices = array.arange(0, 8, 1, dtype=np.int32, backend=backend) # When dev_array.remove(indices) # Then assert np.all(dev_array.get() == (8 + indices).get()) @check_all_backends def test_align(backend): check_import(backend) # Given dev_array = Array(np.int32, backend=backend) orig_array = array.arange(0, 16, 1, dtype=np.int32, backend=backend) dev_array.set_data(orig_array) indices = array.arange(15, -1, -1, dtype=np.int32, backend=backend) # When dev_array = dev_array.align(indices) # Then assert np.all(dev_array.get() == indices.get()) @check_all_backends def test_align_multiple(backend): check_import(backend) # Given dev_array_a = Array(np.uint32, backend=backend) dev_array_b = Array(np.float32, backend=backend) orig_array_a = array.arange(0, 1024, 1, dtype=np.uint32, backend=backend) orig_array_b = array.arange( 1024, 2048, 1, dtype=np.float32, backend=backend) dev_array_a.set_data(orig_array_a) dev_array_b.set_data(orig_array_b) indices = array.arange(1023, -1, -1, dtype=np.int64, backend=backend) # When 
dev_array_a, dev_array_b = array.align([dev_array_a, dev_array_b], indices) # Then assert np.all(dev_array_a.get() == indices.get()) assert np.all(dev_array_b.get() - 1024 == indices.get()) @check_all_backends def test_squeeze(backend): check_import(backend) # Given dev_array = make_dev_array(backend) dev_array.fill(2) dev_array.reserve(32) assert dev_array.alloc == 32 # When dev_array.squeeze() # Then assert dev_array.alloc == 16 @check_all_backends def test_copy_values(backend): check_import(backend) # Given dev_array = make_dev_array(backend) dev_array.fill(2) dest = array.empty(8, dtype=np.int32, backend=backend) indices = array.arange(0, 8, 1, dtype=np.int32, backend=backend) # When dev_array.copy_values(indices, dest) # Then assert np.all(dev_array[:len(indices)].get() == dest.get()) @check_all_backends def test_min_max(backend): check_import(backend) # Given dev_array = make_dev_array(backend) dev_array.fill(2) dev_array[0], dev_array[1] = 1, 10 # When dev_array.update_min_max() # Then assert dev_array.minimum == 1 assert dev_array.maximum == 10 @check_all_backends def test_sort_by_keys(backend): check_import(backend) # Given nparr1 = np.random.randint(0, 100, 16, dtype=np.int32) nparr2 = np.random.randint(0, 100, 16, dtype=np.int32) dev_array1, dev_array2 = array.wrap(nparr1, nparr2, backend=backend) # When out_array1, out_array2 = array.sort_by_keys([dev_array1, dev_array2]) # Then order = np.argsort(nparr1) act_result1 = np.take(nparr1, order) act_result2 = np.take(nparr2, order) assert np.all(out_array1.get() == act_result1) assert np.all(out_array2.get() == act_result2) def test_radix_sort_by_keys(): backend = 'cython' for use_openmp in [True, False]: get_config().use_openmp = use_openmp # Given nparr1 = np.random.randint(0, 100, 16, dtype=np.int32) nparr2 = np.random.randint(0, 100, 16, dtype=np.int32) dev_array1, dev_array2 = array.wrap(nparr1, nparr2, backend=backend) # When out_array1, out_array2 = array.sort_by_keys([dev_array1, dev_array2], 
use_radix_sort=True) # Then order = np.argsort(nparr1) act_result1 = np.take(nparr1, order) act_result2 = np.take(nparr2, order) assert np.all(out_array1.get() == act_result1) assert np.all(out_array2.get() == act_result2) get_config().use_openmp = False @pytest.mark.parametrize( 'backend', ['cython', 'opencl', pytest.param('cuda', marks=pytest.mark.xfail)]) def test_sort_by_keys_with_output(backend): check_import(backend) # Given nparr1 = np.random.randint(0, 100, 16, dtype=np.int32) nparr2 = np.random.randint(0, 100, 16, dtype=np.int32) dev_array1, dev_array2 = array.wrap(nparr1, nparr2, backend=backend) out_arrays = [ array.zeros_like(dev_array1), array.zeros_like(dev_array2)] # When array.sort_by_keys([dev_array1, dev_array2], out_list=out_arrays, use_radix_sort=False) # Then order = np.argsort(nparr1) act_result1 = np.take(nparr1, order) act_result2 = np.take(nparr2, order) assert np.all(out_arrays[0].get() == act_result1) assert np.all(out_arrays[1].get() == act_result2) @pytest.mark.parametrize( 'backend', ['cython', 'cuda', pytest.param('opencl', marks=pytest.mark.xfail)] ) def test_argsort(backend): check_import(backend) # Given nparr1 = np.random.randint(0, 100, 16, dtype=np.int32) devarr1 = array.wrap(nparr1, backend=backend) # When out = array.argsort(devarr1) # Then ans = np.argsort(nparr1) assert np.all(out.get() == ans) @check_all_backends def test_dot(backend): check_import(backend) # Given a = make_dev_array(backend) a.fill(1) b = make_dev_array(backend) b.fill(2) # When out_array = array.dot(a, b) # Then assert np.all(out_array == 32) @check_all_backends def test_cumsum(backend): check_import(backend) # Given a = array.ones(100, dtype=int, backend=backend) # When b = array.cumsum(a) # Then a.pull() b.pull() assert np.all(b.data == np.cumsum(a.data)) # Test cumsum with an out argument # Given out = array.zeros(100, dtype=int, backend=backend) # When b = array.cumsum(a, out=out) # Then out.pull() assert np.all(out.data == np.cumsum(a.data)) 
@check_all_backends def test_linspace(backend): check_import(backend) dev_array = array.linspace(2, 10, 100, backend=backend) assert(dev_array[-1] == 10) dev_array = array.linspace(2, 10, 100, endpoint=False, backend=backend) assert(dev_array[-1] < 10) dtype = dev_array.dtype assert(np.issubdtype(dtype, np.floating)) @check_all_backends @check_all_dtypes def test_diff(backend, dtype): check_import(backend) if dtype == np.float64: get_config().use_double = True dev_array = array.ones(1, dtype=dtype, backend=backend) with pytest.raises(ValueError): y = array.diff(dev_array, 1) y = array.diff(dev_array, 0) assert(y[0] == dev_array[0]) dev_array = array.ones(2, dtype=dtype, backend=backend) with pytest.raises(ValueError): y = array.diff(dev_array, -1) y = array.diff(dev_array, 1) assert(len(y) == 1) assert(y[0] == 0) dev_array = np.linspace(0, 10, 11, dtype=dtype)**2 yt = np.diff(dev_array, 2) dev_array = wrap_array(dev_array, backend=backend) y = array.diff(dev_array, 2) for i in range(8): assert(y[i] == yt[i]) @check_all_backends def test_trapz(backend): check_import(backend) x = array.linspace(0, 5, 6, dtype=np.float32, backend=backend) y = array.linspace(0, 5, 6, dtype=np.float32, backend=backend) xn = np.linspace(0, 5, 6, dtype=np.float32) yn = np.linspace(0, 5, 6, dtype=np.float32) assert(array.trapz(y) == np.trapz(yn)) assert(array.trapz(y, x,) == np.trapz(yn, xn)) assert(array.trapz(y, dx=3) == np.trapz(yn, dx=3)) x = array.linspace(0, 5, 5, dtype=np.float32, backend=backend) with pytest.raises(Exception): array.trapz(y, x) check_comparison_methods = pytest.mark.parametrize( 'method', ['__gt__', '__lt__', '__ge__', '__le__', '__ne__', '__eq__']) @check_all_backends @check_all_dtypes @check_comparison_methods def test_comparison(backend, dtype, method): check_import(backend) if dtype == np.float64: get_config().use_double = True # Given x = array.arange(0., 10., 1., dtype=dtype, backend=backend) # When out = getattr(x, method)(5) # Then x_np = np.arange(10, 
dtype=dtype) comp = [int(i) for i in getattr(x_np, method)(5)] assert np.all(out.get() == comp) @check_all_backends def test_where(backend): check_import(backend) # Given a = array.arange(0, 10, 1, backend=backend) b = array.arange(10, 20, 1, backend=backend) # When out = np.array([10, 11, 12, 13, 14, 15, 6, 7, 8, 9]) # Then ans = array.where(a > 5, a, b) assert np.all(ans.get() == out) def test_where_for_raised_errors(): check_import('opencl') check_import('cuda') # check errors a = array.arange(0, 10, 1, backend='opencl', dtype=np.int32) b = array.arange(10, 20, 1, backend='cuda', dtype=np.int32) with pytest.raises(TypeError): array.where(a > 5, a, b) b = array.arange(10, 20, 1, backend='opencl', dtype=np.float32) with pytest.raises(TypeError): array.where(a > 5, a, b) @check_all_backends def test_ones_like(backend): check_import(backend) # Given x = array.arange(1, 10, 1, dtype=np.int32) # When y = array.ones_like(x) z = array.zeros_like(x) # Then assert np.all(y.get() == np.ones_like(x)) assert np.all(z.get() == np.zeros_like(x)) @check_all_dtypes @check_all_backends def test_minimum(dtype, backend): check_import(backend) # Given x = array.arange(3, 5, 1, backend=backend, dtype=dtype) # When out = array.minimum(x) # Then assert (out == 3) @check_all_dtypes @check_all_backends def test_sum(dtype, backend): check_import(backend) # Given x = array.arange(0, 5, 1, backend=backend, dtype=dtype) # When out = array.sum(x) # Then assert (out == 10) @check_all_dtypes @check_all_backends def test_take_bool(dtype, backend): check_import(backend) if dtype == np.float64: get_config().use_double = True # Given x = array.arange(0, 10, 1, backend=backend, dtype=dtype) cond = x > 5 # When out = array.take_bool(x, cond) # Then ans = np.arange(6, 10, dtype=dtype) assert np.all(out.get() == ans) @check_all_backends def test_binary_op(backend): check_import(backend) # Given x = array.ones(10, dtype=np.float32, backend=backend) y = array.ones_like(x) x_np = np.ones(10, 
dtype=np.float32) # When out_add = x + y out_sub = x - y # Then assert np.all(out_add.get() == x_np + x_np) assert np.all(out_sub.get() == np.zeros_like(x_np)) compyle-release-0.8.1/compyle/tests/test_ast_utils.py000066400000000000000000000065701414173670100230540ustar00rootroot00000000000000 import ast import sys from textwrap import dedent import unittest from ..ast_utils import ( get_assigned, get_symbols, get_unknown_names_and_calls, has_node, has_return ) class TestASTUtils(unittest.TestCase): def test_get_symbols(self): code = ''' x = 1 d_x[d_idx] += s_x[s_idx] ''' tree = ast.parse(dedent(code)) result = list(get_symbols(tree)) result.sort() expect = ['d_idx', 'd_x', 's_idx', 's_x', 'x'] self.assertEqual(result, expect) # Test if it parses with the code itself instead of a tree. result = list(get_symbols(dedent(code))) result.sort() self.assertEqual(result, expect) result = list(get_symbols(tree, ctx=ast.Store)) result.sort() self.assertEqual(result, ['x']) def test_has_return(self): code = dedent(''' x = 1 ''') self.assertFalse(has_return(code)) code = dedent(''' def f(): pass ''') self.assertFalse(has_return(code)) code = dedent(''' def f(x): return x+1 ''') self.assertTrue(has_return(code)) def test_has_node(self): code = dedent(''' x = 1 ''') self.assertFalse(has_node(code, (ast.Return, ast.AugAssign))) code = dedent(''' def f(): pass ''') self.assertTrue(has_node(code, (ast.AugAssign, ast.FunctionDef))) def test_assigned_values(self): code = dedent(''' u[0] = 0.0 x = 1 y = sin(x)*theta z += 1 ''') assigned = list(sorted(get_assigned(code))) # sin or theta should not be detected. 
expect = ['u', 'x', 'y', 'z'] self.assertEqual(assigned, expect) def test_assigned_tuple_expansion(self): code = dedent(''' u, v = 0.0, 1.0 [x, y] = 0.0, 1.0 ''') assigned = list(sorted(get_assigned(code))) expect = ['u', 'v', 'x', 'y'] self.assertEqual(assigned, expect) def test_get_unknown_names_and_calls(self): code = dedent(''' def f(x): g(h(x)) y = x + SIZE for i in range(y): x += func(JUNK) sin(x) ''') # When names, calls = get_unknown_names_and_calls(code) # Then. e_names = {'SIZE', 'i', 'JUNK'} e_calls = {'g', 'h', 'range', 'func', 'sin'} self.assertSetEqual(names, e_names) self.assertSetEqual(calls, e_calls) @unittest.skipIf(sys.version_info < (3, 4), reason='Test requires Python 3.') def test_get_unknown_names_and_calls_with_py3_annotation(self): code = dedent(''' from compyle import types as T def f(x: T.doublep, n: T.int_)-> T.double: s = declare('double') for i in range(n): s += func(x) return s ''') # When names, calls = get_unknown_names_and_calls(code) # Then. e_names = {'i'} e_calls = {'declare', 'func', 'range'} self.assertSetEqual(names, e_names) self.assertSetEqual(calls, e_calls) if __name__ == '__main__': unittest.main() compyle-release-0.8.1/compyle/tests/test_capture_stream.py000066400000000000000000000042121414173670100240520ustar00rootroot00000000000000import subprocess import sys import unittest import pytest from ..capture_stream import CaptureMultipleStreams, CaptureStream if sys.platform.startswith("win32") and sys.version_info[:2] > (3, 5): pytest.skip("skipping capture tests on windows", allow_module_level=True) def write_stderr(): subprocess.call( [sys.executable, "-S", "-s", "-c", "import sys;sys.stderr.write('stderr')"] ) def write_stdout(): subprocess.call( [sys.executable, "-S", "-s", "-c", "import sys;sys.stdout.write('stdout')"] ) class TestCaptureStream(unittest.TestCase): def test_that_stderr_is_captured_by_default(self): # Given # When with CaptureStream() as stream: write_stderr() # Then 
self.assertEqual(stream.get_output(), "stderr") def test_that_stdout_can_be_captured(self): # Given # When with CaptureStream(sys.stdout) as stream: write_stdout() # Then self.assertEqual(stream.get_output(), "stdout") def test_that_output_is_available_in_context_and_outside(self): # Given # When with CaptureStream(sys.stderr) as stream: write_stderr() # Then self.assertEqual(stream.get_output(), "stderr") # Then self.assertEqual(stream.get_output(), "stderr") class TestCaptureMultipleStreams(unittest.TestCase): def test_that_stdout_stderr_are_captured_by_default(self): # Given # When with CaptureMultipleStreams() as stream: write_stderr() write_stdout() # Then outputs = stream.get_output() self.assertEqual(outputs[0], "stdout") self.assertEqual(outputs[1], "stderr") def test_that_order_is_preserved(self): # Given # When with CaptureMultipleStreams((sys.stderr, sys.stdout)) as stream: write_stderr() write_stdout() # Then outputs = stream.get_output() self.assertEqual(outputs[0], "stderr") self.assertEqual(outputs[1], "stdout") if __name__ == '__main__': unittest.main() compyle-release-0.8.1/compyle/tests/test_config.py000066400000000000000000000063621414173670100223110ustar00rootroot00000000000000"""Tests for the configuration. """ from unittest import TestCase, main from ..config import Config, get_config, set_config, use_config class ConfigTestCase(TestCase): def setUp(self): # Unset any default configuration. set_config(None) self.config = Config() def tearDown(self): # Unset any default configuration. 
set_config(None) def test_use_openmp_config_default(self): # Given config = self.config # When # Then self.assertFalse(config.use_openmp) def test_set_get_use_openmp_config(self): # Given config = self.config # When config.use_openmp = 10 # Then self.assertEqual(config.use_openmp, 10) def test_set_get_omp_schedule_config(self): # Given config = self.config # When config.omp_schedule = ("static", 10) # Then self.assertEqual(config.omp_schedule, ("static", 10)) def test_set_string_omp_schedule(self): # Given config = self.config # When config.set_omp_schedule("dynamic,20") # Then self.assertEqual(config.omp_schedule, ("dynamic", 20)) def test_set_omp_schedule_config_exception(self): # Given config = self.config # When # Then with self.assertRaises(ValueError): config.omp_schedule = ("random", 20) def test_use_opencl_config_default(self): # Given config = self.config # When # Then self.assertFalse(config.use_opencl) def test_set_get_use_opencl_config(self): # Given config = self.config # When config.use_opencl = 10 # Then self.assertEqual(config.use_opencl, 10) def test_use_double_config_default(self): # Given config = self.config # When # Then self.assertFalse(config.use_double) def test_set_get_use_double_config(self): # Given config = self.config # When config.use_double = 10 # Then self.assertEqual(config.use_double, 10) def test_default_global_config_is_really_global(self): # Given. config = get_config() self.assertTrue(isinstance(config, Config)) # When config.use_openmp = 100 # Then. config1 = get_config() self.assertEqual(config1.use_openmp, 100) def test_set_global(self): # Given. self.config.use_openmp = 200 set_config(self.config) # When config = get_config() # Then. 
self.assertEqual(config.use_openmp, 200) def test_use_config(self): # Given self.config.use_openmp = 200 set_config(self.config) # When/Then with use_config(use_openmp=300) as cfg: config = get_config() self.assertEqual(config.use_openmp, 300) self.assertEqual(cfg.use_openmp, 300) cfg.use_openmp = 100 cfg.use_double = False self.assertEqual(config.use_openmp, 100) self.assertEqual(config.use_double, False) # Then self.assertEqual(get_config().use_openmp, 200) if __name__ == '__main__': main() compyle-release-0.8.1/compyle/tests/test_cuda.py000066400000000000000000000005501414173670100217510ustar00rootroot00000000000000import pytest pytest.importorskip('pycuda') from compyle.array import wrap from compyle.thrust.sort import argsort import numpy as np def test_sort(): length = 100 a = np.array(np.random.rand(length), dtype=np.float32) b = wrap(a, backend='cuda') res_gpu = argsort(b).get() res_cpu = np.argsort(a) assert np.all(res_gpu == res_cpu) compyle-release-0.8.1/compyle/tests/test_cython_generator.py000066400000000000000000000341531414173670100244150ustar00rootroot00000000000000"""Test code for Cython code generation. 
""" import unittest from textwrap import dedent from math import pi, sin import sys from ..config import get_config, set_config, use_config from ..types import declare, KnownType, annotate from ..cython_generator import (CythonGenerator, CythonClassHelper, all_numeric, get_parallel_range) class BasicEq: def __init__(self, hidden=None, rho=0.0, c=0.0): self.rho = rho self.c = c self._hidden = ['a', 'b'] class EqWithMethod(BasicEq): def func(self, d_idx=0, d_x=[0.0, 0.0]): tmp = abs(self.rho*self.c)*sin(pi*self.c) d_x[d_idx] = d_x[d_idx]*tmp class EqWithReturn(BasicEq): def func(self, d_idx=0, d_x=[0.0, 0.0]): return d_x[d_idx] class EqWithKnownTypes: def some_func(self, d_idx, d_p, WIJ, DWIJ, user, d_user, s_user): d_p[d_idx] = WIJ*DWIJ[0] class EqWithMatrix: def func(self, d_idx, d_x=[0.0, 0.0]): mat = declare('matrix((2,2))') mat[0][0] = d_x[d_idx] vec, vec1 = declare('matrix(3, "float")', 2) vec[0] = d_x[d_idx] class EqWithDeclare: def func(self, d_idx, d_x=[0.0, 0.0]): val, val1 = declare('float', 2) # val1 = declare('double') val = d_x[d_idx] index = declare('unsigned int') index = d_idx def func_with_return(d_idx, d_x, x=0.0): x += 1 return d_x[d_idx] + x def simple_func(d_idx, d_x, x=0.0): d_x[d_idx] += x @annotate(i='int', y='floatp', return_='float') def annotated_f(i, y=[0.0]): x = declare('LOCAL_MEM matrix(64, "unsigned int")') return y[i] class TestBase(unittest.TestCase): def assert_code_equal(self, result, expect): expect = expect.strip() result = result.strip() msg = 'EXPECTED:\n%s\nGOT:\n%s' % (expect, result) self.assertEqual(expect, result, msg) class TestMiscUtils(TestBase): def test_all_numeric(self): x = [1, 2, 3.0] self.assertTrue(all_numeric(x)) try: x = [0.0, 1, long(3)] except NameError: x = [0.0, 1, 3] self.assertTrue(all_numeric(x)) x = [0.0, 1.0, ''] self.assertFalse(all_numeric(x)) def test_detect_type(self): cases = [ (('d_something', None), 'double*'), (('s_something', None), 'double*'), (('d_idx', 0), 'long'), (('x', 1), 'long'), 
(('s', 'asdas'), 'str'), (('junk', 1.0), 'double'), (('y', [0.0, 1]), 'double*'), (('y', [0, 1, 0]), 'double*'), (('y', None), 'object'), ] cg = CythonGenerator() for args, expect in cases: msg = 'detect_type(*%r) != %r' % (args, expect) self.assertEqual(cg.detect_type(*args), expect, msg) def test_cython_class_helper(self): code = ('def f(self, x):', ' x += 1\n return x+1') c = CythonClassHelper(name='A', public_vars={'x': 'double'}, methods=[code]) expect = dedent(""" cdef class A: cdef public double x def __init__(self, **kwargs): for key, value in kwargs.items(): setattr(self, key, value) def f(self, x): x += 1 return x+1 """) self.assert_code_equal(c.generate().strip(), expect.strip()) def test_get_parallel_range_without_openmp(self): with use_config(use_openmp=False): # Given/When res = get_parallel_range('NP') # Then self.assertEqual(res, 'range(0, NP, 1)') # Given/When res = get_parallel_range('START', 'NP') # Then self.assertEqual(res, 'range(START, NP, 1)') # Given/When res = get_parallel_range('NP', step=2) # Then self.assertEqual(res, 'range(0, NP, 2)') # Given/When res = get_parallel_range(1, 'NP+1', 2) # Then self.assertEqual(res, 'range(1, NP+1, 2)') def test_get_parallel_range_with_openmp(self): with use_config(use_openmp=True): cfg = get_config() sched, chunk = cfg.omp_schedule # Given/When res = get_parallel_range('NP') # Then expect = "prange(0, NP, 1, schedule='{}', chunksize={})".format( sched, chunk ) self.assertEqual(res, expect) # Given/When res = get_parallel_range('START', 'NP', 2) # Then expect = ( "prange(START, NP, 2, schedule='{}', chunksize={})".format( sched, chunk ) ) self.assertEqual(res, expect) # Given/When res = get_parallel_range('NP', nogil=True) # Then expect = ( "prange(0, NP, 1, schedule='{}', chunksize={}, " "nogil=True)".format( sched, chunk ) ) self.assertEqual(res, expect) # Given/When res = get_parallel_range('NP', nogil=True, num_threads=4) # Then expect = ( "prange(0, NP, 1, schedule='{}', chunksize={}, " 
"nogil=True, num_threads=4)".format( sched, chunk ) ) self.assertEqual(res, expect) with use_config(use_openmp=True, omp_schedule=('static', 32)): # Given/When res = get_parallel_range('NP') # Then expect = "prange(0, NP, 1, schedule='{}', chunksize={})".format( 'static', 32 ) self.assertEqual(res, expect) class TestCythonCodeGenerator(TestBase): def setUp(self): get_config().use_openmp = False def tearDown(self): set_config(None) def test_simple_constructor(self): cg = CythonGenerator() cg.parse(BasicEq()) expect = dedent(""" cdef class BasicEq: cdef public list _hidden cdef public double c cdef public double rho def __init__(self, **kwargs): for key, value in kwargs.items(): setattr(self, key, value) """) self.assert_code_equal(cg.get_code().strip(), expect.strip()) def test_func_signature(self): # Given def f(x=1, y=[1.0]): pass cg = CythonGenerator() # When py_data, c_data = cg.get_func_signature(f) # Then self.assertEqual(py_data[0], ['long x', 'double[:] y']) self.assertEqual(py_data[1], ['x', '&y[0]']) self.assertEqual(c_data[0], ['long x', 'double* y']) self.assertEqual(c_data[1], ['x', 'y']) def test_function_with_annotation(self): # Given cg = CythonGenerator() # When cg.parse(annotated_f) # Then expect = dedent(''' cdef inline float annotated_f(int i, float* y): cdef unsigned int x[64] return y[i] ''') self.assert_code_equal(cg.get_code().strip(), expect.strip()) @unittest.skipIf(sys.version_info < (3, 4), reason='Requires Python3.') def test_python3_annotation(self): # Given from .py3_code import py3_f cg = CythonGenerator() # When cg.parse(py3_f) expect = dedent(''' cdef inline int py3_f(int x): cdef int y y = x + 1 return x*y ''') self.assert_code_equal(cg.get_code().strip(), expect.strip()) def test_simple_method(self): cg = CythonGenerator() cg.parse(EqWithMethod()) expect = dedent(""" cdef class EqWithMethod: cdef public list _hidden cdef public double c cdef public double rho def __init__(self, **kwargs): for key, value in kwargs.items(): 
setattr(self, key, value) cdef inline void func(self, long d_idx, double* d_x): cdef double tmp tmp = abs(self.rho*self.c)*sin(pi*self.c) d_x[d_idx] = d_x[d_idx]*tmp """) self.assert_code_equal(cg.get_code().strip(), expect.strip()) def test_honors_use_openmp_setting(self): # When get_config().use_openmp = True # Then cg = CythonGenerator() cg.parse(EqWithMethod()) expect = dedent(""" cdef class EqWithMethod: cdef public list _hidden cdef public double c cdef public double rho def __init__(self, **kwargs): for key, value in kwargs.items(): setattr(self, key, value) cdef inline void func(self, long d_idx, double* d_x) nogil: cdef double tmp tmp = abs(self.rho*self.c)*sin(pi*self.c) d_x[d_idx] = d_x[d_idx]*tmp """) self.assert_code_equal(cg.get_code().strip(), expect.strip()) def test_python_methods(self): cg = CythonGenerator(python_methods=True) cg.parse(EqWithMethod()) expect = dedent(""" cdef class EqWithMethod: cdef public list _hidden cdef public double c cdef public double rho def __init__(self, **kwargs): for key, value in kwargs.items(): setattr(self, key, value) cdef inline void func(self, long d_idx, double* d_x): cdef double tmp tmp = abs(self.rho*self.c)*sin(pi*self.c) d_x[d_idx] = d_x[d_idx]*tmp cpdef py_func(self, long d_idx, double[:] d_x): self.func(d_idx, &d_x[0]) """) self.assert_code_equal(cg.get_code().strip(), expect.strip()) cg.parse(EqWithReturn()) expect = dedent(""" cdef class EqWithReturn: cdef public list _hidden cdef public double c cdef public double rho def __init__(self, **kwargs): for key, value in kwargs.items(): setattr(self, key, value) cdef inline double func(self, long d_idx, double* d_x): return d_x[d_idx] cpdef double py_func(self, long d_idx, double[:] d_x): return self.func(d_idx, &d_x[0]) """) self.assert_code_equal(cg.get_code().strip(), expect.strip()) cg.parse(func_with_return) expect = dedent(""" cdef inline double func_with_return(long d_idx, double* d_x, double x): x += 1 return d_x[d_idx] + x cpdef double 
py_func_with_return(long d_idx, double[:] d_x, double x): return func_with_return(d_idx, &d_x[0], x) """) self.assert_code_equal(cg.get_code().strip(), expect.strip()) def test_method_with_return(self): cg = CythonGenerator() cg.parse(EqWithReturn()) expect = dedent(""" cdef class EqWithReturn: cdef public list _hidden cdef public double c cdef public double rho def __init__(self, **kwargs): for key, value in kwargs.items(): setattr(self, key, value) cdef inline double func(self, long d_idx, double* d_x): return d_x[d_idx] """) self.assert_code_equal(cg.get_code().strip(), expect.strip()) def test_method_with_matrix(self): cg = CythonGenerator() cg.parse(EqWithMatrix()) expect = dedent(""" cdef class EqWithMatrix: def __init__(self, **kwargs): for key, value in kwargs.items(): setattr(self, key, value) cdef inline void func(self, long d_idx, double* d_x): cdef double mat[2][2] mat[0][0] = d_x[d_idx] cdef float vec[3], vec1[3] vec[0] = d_x[d_idx] """) self.assert_code_equal(cg.get_code().strip(), expect.strip()) def test_method_with_declare(self): cg = CythonGenerator() cg.parse(EqWithDeclare()) expect = dedent(""" cdef class EqWithDeclare: def __init__(self, **kwargs): for key, value in kwargs.items(): setattr(self, key, value) cdef inline void func(self, long d_idx, double* d_x): cdef float val, val1 # val1 = declare('double') val = d_x[d_idx] cdef unsigned int index index = d_idx """) self.assert_code_equal(cg.get_code().strip(), expect.strip()) def test_method_with_known_types(self): # noqa cg = CythonGenerator( known_types={'WIJ': 0.0, 'DWIJ': [0.0, 0.0, 0.0], 'user': KnownType('ndarray'), 'd_user': KnownType('long*'), 's_user': KnownType('int*')} ) cg.parse(EqWithKnownTypes()) expect = dedent(""" cdef class EqWithKnownTypes: def __init__(self, **kwargs): for key, value in kwargs.items(): setattr(self, key, value) cdef inline void some_func(self, long d_idx, double* d_p, double WIJ, double* DWIJ, ndarray user, long* d_user, int* s_user): d_p[d_idx] = 
WIJ*DWIJ[0] """) self.assert_code_equal(cg.get_code().strip(), expect.strip()) def test_wrap_function(self): cg = CythonGenerator() cg.parse(func_with_return) expect = dedent(""" cdef inline double func_with_return(long d_idx, double* d_x, double x): x += 1 return d_x[d_idx] + x """) self.assert_code_equal(cg.get_code().strip(), expect.strip()) cg.parse(simple_func) expect = dedent(""" cdef inline void simple_func(long d_idx, double* d_x, double x): d_x[d_idx] += x """) self.assert_code_equal(cg.get_code().strip(), expect.strip()) if __name__ == '__main__': unittest.main() compyle-release-0.8.1/compyle/tests/test_ext_module.py000066400000000000000000000154441414173670100232120ustar00rootroot00000000000000from contextlib import contextmanager from io import open as io_open import os from os.path import join, exists import shutil import sys import tempfile from textwrap import dedent from multiprocessing import Pool from unittest import TestCase, main try: from unittest import mock except ImportError: import mock import compyle.ext_module from ..ext_module import (get_md5, ExtModule, get_ext_extension, get_config_file_opts, get_openmp_flags) def _check_write_source(root): """Used to create an ExtModule and test if a file was opened. It returns the number of times "open" was called. """ m = mock.mock_open() orig_side_effect = m.side_effect def _side_effect(*args, **kw): with io_open(*args, **kw) as fp: fp.write("junk") return orig_side_effect(*args, **kw) m.side_effect = _side_effect with mock.patch('compyle.ext_module.io.open', m, create=True): s = ExtModule("print('hello')", root=root) s.write_source() return m.call_count def _check_compile(root): with mock.patch('shutil.copy') as m: s = ExtModule("print('hello')", root=root) s.write_and_build() if m.called: # If it was called, do the copy to mimic the action. 
shutil.copy(*m.call_args[0]) return m.call_count def test_get_config_file_opts(): # Given cfg = dedent(''' OMP_CFLAGS = ['-fxxx'] OMP_LINK = ['-fyyy'] ''') m = mock.mock_open(read_data=cfg) with mock.patch('compyle.ext_module.open', m), \ mock.patch('compyle.ext_module.exists') as mock_exists: # When mock_exists.return_value = False opts = get_config_file_opts() print(opts) # Then assert 'OMP_CFLAGS' not in opts assert 'OMP_LINK' not in opts # When mock_exists.return_value = True opts = get_config_file_opts() # Then assert opts['OMP_CFLAGS'] == ['-fxxx'] assert opts['OMP_LINK'] == ['-fyyy'] def test_get_openmp_flags(): # Given/When f = get_openmp_flags() # Then assert f[0] != ['-fxxx'] assert f[1] != ['-fyyy'] assert len(f[0]) > 0 # Given m = dict(OMP_CFLAGS=['-fxxx'], OMP_LINK=['-fyyy']) with mock.patch.object(compyle.ext_module, 'CONFIG_OPTS', m): # When f = get_openmp_flags() # Then assert f[0] == ['-fxxx'] assert f[1] == ['-fyyy'] class TestMiscExtMod(TestCase): def test_md5(self): data = "hello world" # Two calls with same data produce same result self.assertEqual(get_md5(data), get_md5(data)) # Two calls with different data produce different md5sums. 
self.assertNotEqual(get_md5(data), get_md5(data + ' ')) class TestExtModule(TestCase): def setUp(self): self.root = tempfile.mkdtemp() self.data = dedent('''\ # cython: language_level=3 def f(): return "hello world" ''') def tearDown(self): if sys.platform.startswith('win'): try: shutil.rmtree(self.root) except WindowsError: pass else: shutil.rmtree(self.root) def test_constructor(self): data = self.data s = ExtModule(data, root=self.root) self.assertTrue(exists(join(self.root, 'build'))) self.assertEqual(s.hash, get_md5(data)) self.assertEqual(s.code, data) expect_name = 'm_%s' % (s.hash) self.assertEqual(s.name, expect_name) self.assertEqual(s.src_path, join(self.root, expect_name + '.pyx')) self.assertEqual(s.ext_path, join(self.root, expect_name + get_ext_extension())) s.write_source() self.assertTrue(exists(s.src_path)) self.assertEqual(data, open(s.src_path).read()) def test_default_root(self): try: data = self.data s = ExtModule(data) s.write_source() self.assertTrue(exists(join(s.root, 'build'))) self.assertEqual(s.hash, get_md5(data)) self.assertEqual(s.code, data) self.assertTrue(exists(s.src_path)) self.assertEqual(data, open(s.src_path).read()) finally: os.unlink(s.src_path) def test_load_module(self): data = self.data s = ExtModule(data, root=self.root) mod = s.load() self.assertEqual(mod.f(), "hello world") self.assertTrue(exists(s.ext_path)) def _create_dummy_module(self): code = "# cython: language_level=3\ndef hello(): return 'hello'" modname = 'test_rebuild.py' f = join(self.root, modname) with open(f, 'w') as fp: fp.write(code) return f @contextmanager def _add_root_to_sys_path(self): import sys if self.root not in sys.path: sys.path.insert(0, self.root) try: yield finally: sys.path.remove(self.root) def test_rebuild_when_dependencies_change(self): # Given. 
data = self.data depends = ["test_rebuild"] s = ExtModule(data, root=self.root, depends=depends) fname = self._create_dummy_module() f_stat = os.stat(fname) with self._add_root_to_sys_path(): # When self.assertTrue(s.should_recompile()) s.write_and_build() # Then. self.assertFalse(s.should_recompile()) # Now lets re-create the module and try again. # When. fname = self._create_dummy_module() # Update the timestamp to make it newer, otherwise we need to # sleep. os.utime(fname, (f_stat.st_atime, f_stat.st_mtime + 10)) # Then. self.assertTrue(s.should_recompile()) def test_that_multiple_writes_do_not_occur_for_same_source(self): # Given n_proc = 5 p = Pool(n_proc) # When # Note that _create_extension cannot be defined here or even in the # class as a nested function or instance method cannot be pickled. result = p.map(_check_write_source, [self.root]*n_proc) p.close() # Then # The file should have been opened only once. self.assertEqual(sum(result), 1) def test_that_multiple_compiles_do_not_occur_for_same_source(self): # Given n_proc = 5 p = Pool(n_proc) # When # Note that _check_compile cannot be defined here or even in the # class as a nested function or instance method cannot be pickled. result = p.map(_check_compile, [self.root]*n_proc) p.close() # Then # The shutil.copy should have been run only once. 
self.assertEqual(sum(result), 1) if __name__ == '__main__': main() compyle-release-0.8.1/compyle/tests/test_gpu_struct.py000066400000000000000000000017001414173670100232320ustar00rootroot00000000000000import unittest import pytest import numpy as np class TestStructMapping(unittest.TestCase): @classmethod def setUpClass(cls): print("SetupClass") pytest.importorskip("pycuda") from compyle.cuda import set_context set_context() def test_cuda_struct_mapping(self): from compyle.cuda import match_dtype_to_c_struct from pycuda import gpuarray # Given dtype = np.dtype([('l', np.int64), ('i', np.uint8), ('x', np.float32)]) a = np.empty(1, dtype) a['l'] = 1.0 a['i'] = 2 a['x'] = 1.23 # When gs1, code1 = match_dtype_to_c_struct(None, "junk", a.dtype) a_ga = a.astype(gs1) ga = gpuarray.to_gpu(a_ga) # Then result = ga.get() np.testing.assert_almost_equal(result.tolist(), a.tolist()) self.assertFalse(a.dtype.fields == gs1.fields) compyle-release-0.8.1/compyle/tests/test_jit.py000066400000000000000000000331521414173670100216270ustar00rootroot00000000000000from math import sin import unittest import numpy as np from pytest import importorskip from ..config import get_config, use_config from ..array import wrap from ..jit import get_binop_return_type, AnnotationHelper from ..types import annotate from ..parallel import Elementwise, Reduction, Scan @annotate def g(x): return x @annotate(x='long', return_='long') def g_nonjit(x): return x + 1 @annotate def h(a, b): return g(a) * g(b) @annotate def undeclared_f(a, b): h_ab = h(a, b) return g(h_ab) class TestAnnotationHelper(unittest.TestCase): def test_const_as_call_arg(self): # Given @annotate def int_f(a): return g(1) # When types = {'a': 'int'} helper = AnnotationHelper(int_f, types) helper.annotate() # Then assert helper.external_funcs['g'].arg_types['x'] == 'int' # Given @annotate def long_f(a): return g(10000000000) # When types = {'a': 'int'} helper = AnnotationHelper(long_f, types) helper.annotate() # Then assert 
helper.external_funcs['g'].arg_types['x'] == 'long' # Given @annotate def double_f(a): return g(1.) # When types = {'a': 'int'} helper = AnnotationHelper(double_f, types) helper.annotate() # Then assert helper.external_funcs['g'].arg_types['x'] == 'double' def test_declare_multiple_variables(self): # Given @annotate def f(x): a, b = declare('int', 2) a = 0 b = 1 return x + a + b # When types = {'x': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.get_var_type('a') == 'int' assert helper.get_var_type('b') == 'int' def test_variable_as_call_arg(self): # Given @annotate def f(a, b): x = declare('int') x = a + b return g(x) # When types = {'a': 'int', 'b': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.external_funcs['g'].arg_types['x'] == 'int' def test_variable_as_call_arg_nonjit(self): # Given @annotate def f(a, b): x = declare('int') x = a + b return g_nonjit(x) # When types = {'a': 'int', 'b': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.external_funcs['g_nonjit'].arg_types['x'] == 'int' # Should not clobber the nonjit function annotations. 
assert g_nonjit.__annotations__['x'].type == 'long' assert g_nonjit.__annotations__['return'].type == 'long' def test_subscript_as_call_arg(self): # Given @annotate def f(i, a): return g(a[i]) # When types = {'i': 'int', 'a': 'intp'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.external_funcs['g'].arg_types['x'] == 'int' def test_binop_as_call_arg(self): # Given @annotate def f(a, b): return g(a + b) # When types = {'a': 'int', 'b': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.external_funcs['g'].arg_types['x'] == 'int' def test_compare_as_call_arg(self): # Given @annotate def f(a, b): return g(a == b) # When types = {'a': 'int', 'b': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.external_funcs['g'].arg_types['x'] == 'int' def test_call_as_call_arg(self): # Given @annotate def f(a, b): return g(h(a, b)) # When types = {'a': 'int', 'b': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.external_funcs['g'].arg_types['x'] == 'int' def test_binop_with_call_as_call_arg(self): # Given @annotate def f(a, b): return g(h(a, b) + h(b, a)) # When types = {'a': 'int', 'b': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.external_funcs['g'].arg_types['x'] == 'int' def test_non_jit_call_as_call_arg(self): # Given @annotate def f(a, b): return g(sin(a)) # When types = {'a': 'int', 'b': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.external_funcs['g'].arg_types['x'] == 'double' def test_if_exp_as_call_arg(self): # Given @annotate def f(a, b): return g(g(a) if a > b else g(b)) # When types = {'a': 'int', 'b': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.external_funcs['g'].arg_types['x'] == 'int' def test_variable_in_return(self): # Given @annotate def f(a): return a # When types = {'a': 'int'} helper = AnnotationHelper(f, types) 
helper.annotate() # Then assert helper.arg_types['return_'] == 'int' def test_subscript_in_return(self): # Given @annotate def f(i, a): return a[i] # When types = {'i': 'int', 'a': 'intp'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.arg_types['return_'] == 'int' def test_const_in_return(self): # Given @annotate def int_f(a, b): return 1 # When types = {'a': 'int', 'b': 'int'} helper = AnnotationHelper(int_f, types) helper.annotate() # Then assert helper.arg_types['return_'] == 'int' # Given @annotate def long_f(a, b): return 10000000000 # When types = {'a': 'int', 'b': 'int'} helper = AnnotationHelper(long_f, types) helper.annotate() # Then assert helper.arg_types['return_'] == 'long' # Given @annotate def double_f(a, b): return 1. # When types = {'a': 'int', 'b': 'int'} helper = AnnotationHelper(double_f, types) helper.annotate() # Then assert helper.arg_types['return_'] == 'double' def test_binop_in_return(self): # Given @annotate def f(a, b): return a + b # When types = {'a': 'int', 'b': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.arg_types['return_'] == 'int' def test_call_in_return(self): # Given @annotate def f(a, b): return g(a) # When types = {'a': 'int', 'b': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert 'g' in helper.external_funcs assert helper.arg_types['return_'] == 'int' def test_binop_with_call_in_return(self): # Given @annotate def f(a, b): return g(a) + g(b) # When types = {'a': 'int', 'b': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.arg_types['return_'] == 'int' def test_multi_level_call_in_return(self): # Given @annotate def f(a, b): return h(a, b) # When types = {'a': 'int', 'b': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert 'h' in helper.external_funcs assert 'g' in helper.external_funcs['h'].external_funcs assert helper.arg_types['return_'] == 'int' def 
test_non_jit_call_in_return(self): # Given @annotate def f(a): return sin(a) # When types = {'a': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.arg_types['return_'] == 'double' def test_if_exp_in_return(self): # Given @annotate def f(a, b): return g(a) if a > b else g(b) # When types = {'a': 'int', 'b': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.arg_types['return_'] == 'int' def test_binop_return_type(self): # Given @annotate def f(a, b): return a + b # When types = {'a': 'long', 'b': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.arg_types['return_'] == 'long' # When types = {'a': 'int', 'b': 'double'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.arg_types['return_'] == 'double' # When types = {'a': 'uint', 'b': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.arg_types['return_'] == 'int' # When types = {'a': 'uint', 'b': 'ulong'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.arg_types['return_'] == 'ulong' # When types = {'a': 'intp', 'b': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.arg_types['return_'] == 'intp' # When types = {'a': 'gdoublep', 'b': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.arg_types['return_'] == 'gdoublep' # When types = {'a': 'int', 'b': 'intp'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.arg_types['return_'] == 'intp' # When types = {'a': 'int', 'b': 'guintp'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.arg_types['return_'] == 'guintp' # When types = {'a': 'uint', 'b': 'guintp'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.arg_types['return_'] == 'guintp' def test_cast_return_type(self): # Given @annotate def f(a): return cast(a, "int") # When types = {'a': 'double'} helper = 
AnnotationHelper(f, types) helper.annotate() # Then assert helper.get_return_type() == 'int' def test_address_type(self): # Given @annotate def f(a): b = address(a[0]) return b[0] # When types = {'a': 'gintp'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.get_var_type('b') == 'gintp' assert helper.get_return_type() == 'int' def test_undeclared_variable_declaration(self): # Given @annotate def f(a, b): h_ab = h(a, b) return g(h_ab) # When types = {'a': 'int', 'b': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.undecl_var_types['h_ab'] == 'int' assert helper.external_funcs['g'].arg_types['x'] == 'int' def test_undeclared_variable_declaration_in_external_func(self): # Given @annotate def f(a, b): return undeclared_f(a, b) # When types = {'a': 'int', 'b': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then external_f = helper.external_funcs['undeclared_f'] assert external_f.undecl_var_types['h_ab'] == 'int' assert external_f.external_funcs['g'].arg_types['x'] == 'int' def test_undeclared_variable_declaration_in_if_exp(self): # Given @annotate def f(a, b): g_ab = g(a) if a > b else g(b) return g(g_ab) # When types = {'a': 'int', 'b': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.undecl_var_types['g_ab'] == 'int' assert helper.external_funcs['g'].arg_types['x'] == 'int' def test_undeclared_variable_declaration_in_for(self): # Given @annotate def f(a, b): for i in range(a): b += 1 return b # When types = {'a': 'int', 'b': 'int'} helper = AnnotationHelper(f, types) helper.annotate() # Then assert helper.undecl_var_types['i'] == 'int' def test_no_return_value(self): # Given @annotate def f_no_return(a, n): for i in range(n): a[i] += 1 return # When types = {'a': 'guintp', 'n': 'int'} helper = AnnotationHelper(f_no_return, types) helper.annotate() # Then assert 'return_' not in helper.arg_types # Given @annotate def f_return(a, n): for i in range(n): a[i] += 1 
return n # When helper = AnnotationHelper(f_return, types) helper.annotate() # Then assert 'return_' in helper.arg_types and \ helper.arg_types['return_'] == 'int' compyle-release-0.8.1/compyle/tests/test_low_level.py000066400000000000000000000110231414173670100230220ustar00rootroot00000000000000import unittest import numpy as np from pytest import importorskip from ..config import use_config from ..array import wrap from ..types import annotate, declare from ..low_level import ( Cython, Kernel, LocalMem, local_barrier, GID_0, LDIM_0, LID_0, nogil, prange, parallel, cast ) class TestKernel(unittest.TestCase): def test_simple_kernel_opencl(self): importorskip('pyopencl') # Given @annotate(gdoublep='x, y', a='float', size='int') def knl(x, y, a, size): i = declare('int') i = GID_0*LDIM_0 + LID_0 if i < size: y[i] = x[i]*a x = np.linspace(0, 1, 1000) y = np.zeros_like(x) x, y = wrap(x, y, backend='opencl') # When k = Kernel(knl, backend='opencl') a = 21.0 k(x, y, a, 1000) # Then y.pull() self.assertTrue(np.allclose(y.data, x.data * a)) def test_simple_kernel_cuda(self): importorskip('pycuda') # Given @annotate(gdoublep='x, y', a='float', size='int') def knl(x, y, a, size): i = declare('int') i = GID_0*LDIM_0 + LID_0 if i < size: y[i] = x[i]*a x = np.linspace(0, 1, 1000) y = np.zeros_like(x) x, y = wrap(x, y, backend='cuda') # When k = Kernel(knl, backend='cuda') a = 21.0 k(x, y, a, 1000) # Then y.pull() self.assertTrue(np.allclose(y.data, x.data * a)) def test_kernel_with_local_memory_opencl(self): importorskip('pyopencl') # Given @annotate(gdoublep='x, y', xc='ldoublep', a='float') def knl(x, y, xc, a): i, lid = declare('int', 2) lid = LID_0 i = GID_0 * LDIM_0 + lid xc[lid] = x[i] local_barrier() y[i] = xc[lid] * a x = np.linspace(0, 1, 1024) y = np.zeros_like(x) xc = LocalMem(1, backend='opencl') x, y = wrap(x, y, backend='opencl') # When k = Kernel(knl, backend='opencl') a = 21.0 k(x, y, xc, a) # Then y.pull() self.assertTrue(np.allclose(y.data, x.data * a)) def 
test_kernel_with_local_memory_cuda(self): importorskip('pycuda') # Given @annotate(gdoublep='x, y', xc='ldoublep', a='float') def knl(x, y, xc, a): i, lid = declare('int', 2) lid = LID_0 i = GID_0 * LDIM_0 + lid xc[lid] = x[i] local_barrier() y[i] = xc[lid] * a x = np.linspace(0, 1, 1024) y = np.zeros_like(x) xc = LocalMem(1, backend='cuda') x, y = wrap(x, y, backend='cuda') # When k = Kernel(knl, backend='cuda') a = 21.0 k(x, y, xc, a) # Then y.pull() self.assertTrue(np.allclose(y.data, x.data * a)) @annotate(double='x, y, a', return_='double') def func(x, y, a): return x * y * a @annotate(doublep='x, y', a='double', n='int', return_='double') def knl(x, y, a, n): i = declare('int') s = declare('double') s = 0.0 for i in range(n): s += func(x[i], y[i], a) return s @annotate(n='int', doublep='x, y', a='double') def cy_extern(x, y, a, n): i = declare('int') with nogil, parallel(): for i in prange(n): y[i] = x[i] * a @annotate(int='num, return_') def _factorial(num): if num == 0: return 1 else: return num*_factorial(num - 1) class TestCython(unittest.TestCase): def test_cython_code_with_return_and_nested_call(self): # Given n = 1000 x = np.linspace(0, 1, n) y = x.copy() a = 2.0 # When cy = Cython(knl) result = cy(x, y, a, n) # Then self.assertAlmostEqual(result, np.sum(x * y * a)) def test_cython_with_externs(self): # Given n = 1000 x = np.linspace(0, 1, n) y = np.zeros_like(x) a = 2.0 # When with use_config(use_openmp=True): cy = Cython(cy_extern) cy(x, y, a, n) # Then self.assertTrue(np.allclose(y, x * a)) def test_recursive_function(self): # Given/when fac = Cython(_factorial) # Then self.assertEqual(fac(0), 1) self.assertEqual(fac(1), 1) self.assertEqual(fac(3), 6) def test_cast_works_in_pure_python(): x = cast(1.23, "int") assert x == 1 y = cast(2, "float") assert y == 2.0 compyle-release-0.8.1/compyle/tests/test_parallel.py000066400000000000000000000616641414173670100226460ustar00rootroot00000000000000from math import sin import unittest import numpy as np from 
pytest import importorskip from ..config import get_config, use_config from ..array import wrap, zeros from ..types import annotate, declare from ..parallel import Elementwise, Reduction, Scan from ..low_level import atomic_inc, atomic_dec from .test_jit import g MY_CONST = 42 @annotate(x='int', return_='int') def external(x): return x class ParallelUtilsBase(object): def test_elementwise_works_with_cython(self): self._check_simple_elementwise(backend='cython') def test_elementwise_works_with_opencl(self): importorskip('pyopencl') self._check_simple_elementwise(backend='opencl') def test_elementwise_works_with_cuda(self): importorskip('pycuda') self._check_simple_elementwise(backend='cuda') def test_elementwise_works_with_global_constant_cython(self): self._check_elementwise_with_constant(backend='cython') def test_elementwise_works_with_global_constant_opencl(self): importorskip('pyopencl') self._check_elementwise_with_constant(backend='opencl') def test_elementwise_works_with_global_constant_cuda(self): importorskip('pycuda') self._check_elementwise_with_constant(backend='cuda') def test_reduction_works_without_map_cython(self): self._check_simple_reduction(backend='cython') def test_reduction_works_with_map_cython(self): self._check_reduction_with_map(backend='cython') def test_reduction_works_with_external_func_cython(self): self._check_reduction_with_external_func(backend='cython') def test_reduction_works_neutral_cython(self): self._check_reduction_min(backend='cython') def test_reduction_works_without_map_opencl(self): importorskip('pyopencl') self._check_simple_reduction(backend='opencl') def test_reduction_works_with_map_opencl(self): importorskip('pyopencl') self._check_reduction_with_map(backend='opencl') def test_reduction_works_with_external_func_opencl(self): importorskip('pyopencl') self._check_reduction_with_external_func(backend='opencl') def test_reduction_works_neutral_opencl(self): importorskip('pyopencl') 
self._check_reduction_min(backend='opencl') def test_reduction_works_without_map_cuda(self): importorskip('pycuda') self._check_simple_reduction(backend='cuda') def test_reduction_works_with_map_cuda(self): importorskip('pycuda') self._check_reduction_with_map(backend='cuda') def test_reduction_works_with_external_func_cuda(self): importorskip('pycuda') self._check_reduction_with_external_func(backend='cuda') def test_reduction_works_neutral_cuda(self): importorskip('pycuda') self._check_reduction_min(backend='cuda') def test_scan_works_cython(self): self._test_scan(backend='cython') def test_scan_works_cython_parallel(self): with use_config(use_openmp=True): self._test_scan(backend='cython') def test_large_scan_works_cython_parallel(self): with use_config(use_openmp=True): self._test_large_scan(backend='cython') def test_scan_works_opencl(self): importorskip('pyopencl') self._test_scan(backend='opencl') def test_scan_works_cuda(self): importorskip('pycuda') self._test_scan(backend='cuda') def test_scan_works_with_external_func_cython(self): self._test_scan_with_external_func(backend='cython') def test_scan_works_with_external_func_cython_parallel(self): with use_config(use_openmp=True): self._test_scan_with_external_func(backend='cython') def test_scan_works_with_external_func_opencl(self): importorskip('pyopencl') self._test_scan_with_external_func(backend='opencl') def test_scan_works_with_external_func_cuda(self): importorskip('pycuda') self._test_scan_with_external_func(backend='cuda') def test_unique_scan_cython(self): self._test_unique_scan(backend='cython') def test_unique_scan_cython_parallel(self): with use_config(use_openmp=True): self._test_unique_scan(backend='cython') def test_unique_scan_opencl(self): importorskip('pyopencl') self._test_unique_scan(backend='opencl') def test_unique_scan_cuda(self): importorskip('pycuda') self._test_unique_scan(backend='cuda') def _get_segmented_scan_actual(self, a, segment_flags): output_actual = np.zeros_like(a) for 
i in range(len(a)): if segment_flags[i] == 0 and i != 0: output_actual[i] = output_actual[i - 1] + a[i] else: output_actual[i] = a[i] return output_actual def test_segmented_scan_cython(self): self._test_segmented_scan(backend='cython') def test_segmented_scan_cython_parallel(self): with use_config(use_openmp=True): self._test_segmented_scan(backend='cython') def test_segmented_scan_opencl(self): importorskip('pyopencl') self._test_segmented_scan(backend='opencl') def test_segmented_scan_cuda(self): importorskip('pycuda') self._test_segmented_scan(backend='cuda') def test_scan_last_item_cython_parallel(self): with use_config(use_openmp=True): self._test_scan_last_item(backend='cython') def test_scan_last_item_cython_serial(self): self._test_scan_last_item(backend='cython') def test_scan_last_item_opencl(self): importorskip('pyopencl') self._test_scan_last_item(backend='opencl') def test_scan_last_item_cuda(self): importorskip('pycuda') self._test_scan_last_item(backend='cuda') def test_atomic_inc_cython(self): self._test_atomic_inc(backend='cython') def test_atomic_inc_cython_parallel(self): with use_config(use_openmp=True): self._test_atomic_inc(backend='cython') def test_atomic_inc_opencl(self): importorskip('pyopencl') self._test_atomic_inc(backend='opencl') def test_atomic_inc_cuda(self): importorskip('pycuda') self._test_atomic_inc(backend='cuda') def test_atomic_dec_cython(self): self._test_atomic_dec(backend='cython') def test_atomic_dec_cython_parallel(self): with use_config(use_openmp=True): self._test_atomic_dec(backend='cython') def test_atomic_dec_opencl(self): importorskip('pyopencl') self._test_atomic_dec(backend='opencl') def test_atomic_dec_cuda(self): importorskip('pycuda') self._test_atomic_dec(backend='cuda') def test_repeated_scans_with_different_settings(self): importorskip('pyopencl') with use_config(use_double=False): self._test_unique_scan(backend='opencl') with use_config(use_double=True): self._test_unique_scan(backend='opencl') with 
use_config(use_openmp=False): self._test_unique_scan(backend='cython') with use_config(use_openmp=True): self._test_unique_scan(backend='cython') class TestParallelUtils(ParallelUtilsBase, unittest.TestCase): def setUp(self): cfg = get_config() self._use_double = cfg.use_double cfg.use_double = True def tearDown(self): get_config().use_double = self._use_double def _check_simple_elementwise(self, backend): # Given @annotate(i='int', x='doublep', y='doublep', double='a,b') def axpb(i, x, y, a, b): y[i] = a * sin(x[i]) + b x = np.linspace(0, 1, 10000) y = np.zeros_like(x) a = 2.0 b = 3.0 x, y = wrap(x, y, backend=backend) # When e = Elementwise(axpb, backend=backend) e(x, y, a, b) # Then y.pull() self.assertTrue(np.allclose(y.data, a * np.sin(x.data) + b)) self.assertTrue(len(e.source) > 100) self.assertTrue(len(e.all_source) > 100) self.assertTrue(len(e.all_source) >= len(e.source)) def _check_elementwise_with_constant(self, backend): # Given @annotate(i='int', x='doublep') def set_const(i, x): x[i] = MY_CONST x = np.zeros(100) x = wrap(x, backend=backend) # When e = Elementwise(set_const, backend=backend) e(x) # Then x.pull() np.testing.assert_almost_equal(x.data, MY_CONST) def _check_simple_reduction(self, backend): x = np.linspace(0, 1, 1000) / 1000 x = wrap(x, backend=backend) # When r = Reduction('a+b', backend=backend) result = r(x) # Then self.assertAlmostEqual(result, 0.5, 6) self.assertTrue(len(r.source) > 100) self.assertTrue(len(r.all_source) > 100) self.assertTrue(len(r.all_source) >= len(r.source)) def _check_reduction_min(self, backend): x = np.linspace(0, 1, 1000) / 1000 x = wrap(x, backend=backend) # When r = Reduction('min(a, b)', neutral='INFINITY', backend=backend) result = r(x) # Then self.assertAlmostEqual(result, 0.0, 6) def _check_reduction_with_map(self, backend): # Given from math import cos, sin x = np.linspace(0, 1, 1000) / 1000 y = x.copy() x, y = wrap(x, y, backend=backend) @annotate(i='int', doublep='x, y') def map(i=0, x=[0.0], 
y=[0.0]): return cos(x[i]) * sin(y[i]) # When r = Reduction('a+b', map_func=map, backend=backend) result = r(x, y) # Then self.assertAlmostEqual(result, 0.5, 6) def _check_reduction_with_external_func(self, backend): # Given x = np.arange(1000, dtype=np.int32) x = wrap(x, backend=backend) @annotate(i='int', x='intp') def map(i=0, x=[0]): return external(x[i]) # When r = Reduction('a+b', map_func=map, backend=backend) result = r(x) # Then self.assertAlmostEqual(result, 499500) def _test_scan(self, backend): # Given a = np.arange(10000, dtype=np.int32) data = a.copy() expect = np.cumsum(data) a = wrap(a, backend=backend) @annotate(i='int', ary='intp', return_='int') def input_f(i, ary): return ary[i] @annotate(int='i, item', ary='intp') def output_f(i, item, ary): ary[i] = item # When scan = Scan(input_f, output_f, 'a+b', dtype=np.int32, backend=backend) scan(ary=a) a.pull() result = a.data # Then np.testing.assert_equal(expect, result) self.assertTrue(len(scan.source) > 100) self.assertTrue(len(scan.all_source) > 100) self.assertTrue(len(scan.all_source) >= len(scan.source)) def _test_large_scan(self, backend): # Given a = np.ones(3000000, dtype=np.int32) data = a.copy() expect = np.cumsum(data) a = wrap(a, backend=backend) @annotate(i='int', ary='intp', return_='int') def input_f(i, ary): return ary[i] @annotate(int='i, item', ary='intp') def output_f(i, item, ary): ary[i] = item # When scan = Scan(input_f, output_f, 'a+b', dtype=np.int32, backend=backend) scan(ary=a) a.pull() result = a.data # Then np.testing.assert_equal(expect, result) def _test_scan_with_external_func(self, backend): # Given a = np.arange(10000, dtype=np.int32) data = a.copy() expect = np.cumsum(data) a = wrap(a, backend=backend) @annotate(i='int', ary='intp', return_='int') def input_f(i, ary): return external(ary[i]) @annotate(int='i, item', ary='intp') def output_f(i, item, ary): ary[i] = item # When scan = Scan(input_f, output_f, 'a+b', dtype=np.int32, backend=backend) scan(ary=a) a.pull() 
result = a.data # Then np.testing.assert_equal(expect, result) def _test_unique_scan(self, backend): # Given a = np.random.randint(0, 100, 100, dtype=np.int32) a = np.sort(a) data = a.copy() unique_ary_actual = np.sort(np.unique(data)) unique_count_actual = len(np.unique(data)) a = wrap(a, backend=backend) unique_ary = np.zeros(len(a.data), dtype=np.int32) unique_ary = wrap(unique_ary, backend=backend) unique_count = np.zeros(1, dtype=np.int32) unique_count = wrap(unique_count, backend=backend) @annotate(i='int', ary='intp', return_='int') def input_f(i, ary): if i == 0 or ary[i] != ary[i - 1]: return 1 else: return 0 @annotate(int='i, prev_item, item, N', ary='intp', unique='intp', unique_count='intp') def output_f(i, N, ary, unique, unique_count, item, prev_item): if item != prev_item: unique[item - 1] = ary[i] if i == N - 1: unique_count[0] = item # When scan = Scan(input_f, output_f, 'a+b', dtype=np.int32, backend=backend) scan(ary=a, unique=unique_ary, unique_count=unique_count) unique_ary.pull() unique_count.pull() unique_count = unique_count.data[0] # Then self.assertTrue(unique_count == unique_count_actual) np.testing.assert_equal(unique_ary_actual, unique_ary.data[:unique_count]) def _test_segmented_scan(self, backend): # Given a = np.random.randint(0, 100, 50000, dtype=np.int32) a_copy = a.copy() seg = np.random.randint(0, 100, 50000, dtype=np.int32) seg = (seg == 0).astype(np.int32) seg_copy = seg.copy() a = wrap(a, backend=backend) seg = wrap(seg, backend=backend) @annotate(i='int', ary='intp', return_='int') def input_f(i, ary): return ary[i] @annotate(i='int', seg_flag='intp', return_='int') def segment_f(i, seg_flag): return seg_flag[i] @annotate(int='i, item', ary='intp') def output_f(i, item, ary): ary[i] = item output_actual = self._get_segmented_scan_actual(a_copy, seg_copy) # When scan = Scan(input_f, output_f, 'a+b', dtype=np.int32, backend=backend, is_segment=segment_f) scan(ary=a, seg_flag=seg) a.pull() # Then 
np.testing.assert_equal(output_actual, a.data) def _test_scan_last_item(self, backend): # Given a = np.random.randint(0, 100, 50000, dtype=np.int32) a_copy = a.copy() a = wrap(a, backend=backend) @annotate(int='i, last_item, item', ary='intp') def output_f(i, last_item, item, ary): ary[i] = item + last_item expect = np.cumsum(a_copy) + np.cumsum(a_copy)[-1] # When scan = Scan(output=output_f, scan_expr='a+b', dtype=np.int32, backend=backend) scan(input=a, ary=a) a.pull() # Then np.testing.assert_equal(expect, a.data) def _test_atomic_inc(self, backend): # Given a = np.random.randint(0, 100, 50000, dtype=np.int32) result = zeros(1, dtype=np.int32, backend=backend) a = wrap(a, backend=backend) @annotate(gintp='x, result', i='int') def reduce_knl(i, x, result): b = declare('int') b = atomic_inc(result[0]) # When knl = Elementwise(reduce_knl, backend=backend) knl(a, result) # Then self.assertTrue(result[0] == 50000) def _test_atomic_dec(self, backend): # Given a = np.random.randint(0, 100, 50000, dtype=np.int32) result = zeros(1, dtype=np.int32, backend=backend) a = wrap(a, backend=backend) @annotate(gintp='x, result', i='int') def reduce_knl(i, x, result): b = declare('int') b = atomic_dec(result[0]) # When knl = Elementwise(reduce_knl, backend=backend) knl(a, result) # Then self.assertTrue(result[0] == -50000) class TestParallelUtilsJIT(ParallelUtilsBase, unittest.TestCase): def setUp(self): cfg = get_config() self._use_double = cfg.use_double cfg.use_double = True def tearDown(self): get_config().use_double = self._use_double def _check_simple_elementwise(self, backend): # Given @annotate def axpb(i, x, y, a, b): y[i] = a * sin(x[i]) + b x = np.linspace(0, 1, 10000) y = np.zeros_like(x) a = 2.0 b = 3.0 x, y = wrap(x, y, backend=backend) # When e = Elementwise(axpb, backend=backend) e(x, y, a, b) # Then y.pull() self.assertTrue(np.allclose(y.data, a * np.sin(x.data) + b)) self.assertTrue(len(e.source) > 100, e.source) self.assertTrue(len(e.all_source) > 100, 
e.all_source) self.assertTrue(len(e.all_source) >= len(e.source)) def _check_elementwise_with_constant(self, backend): # Given @annotate def set_const(i, x): x[i] = MY_CONST x = np.zeros(100) x = wrap(x, backend=backend) # When e = Elementwise(set_const, backend=backend) e(x) # Then x.pull() np.testing.assert_almost_equal(x.data, MY_CONST) def _check_simple_reduction(self, backend): x = np.linspace(0, 1, 1000) / 1000 x = wrap(x, backend=backend) # When r = Reduction('a+b', backend=backend) result = r(x) # Then self.assertAlmostEqual(result, 0.5, 6) self.assertTrue(len(r.source) > 100) self.assertTrue(len(r.all_source) > 100, r.all_source) self.assertTrue(len(r.all_source) >= len(r.source)) def _check_reduction_min(self, backend): x = np.linspace(0, 1, 1000) / 1000 x = wrap(x, backend=backend) # When r = Reduction('min(a, b)', neutral='INFINITY', backend=backend) result = r(x) # Then self.assertAlmostEqual(result, 0.0, 6) def _check_reduction_with_map(self, backend): # Given from math import cos, sin x = np.linspace(0, 1, 1000) / 1000 y = x.copy() x, y = wrap(x, y, backend=backend) @annotate def map(i=0, x=[0.0], y=[0.0]): result = declare('double') result = cos(x[i]) * sin(y[i]) return result # When r = Reduction('a+b', map_func=map, backend=backend) result = r(x, y) # Then self.assertAlmostEqual(result, 0.5, 6) def _check_reduction_with_external_func(self, backend): # Given x = np.arange(1000, dtype=np.int32) x = wrap(x, backend=backend) @annotate def map(i=0, x=[0]): return g(x[i]) # When r = Reduction('a+b', map_func=map, backend=backend) result = r(x) # Then self.assertAlmostEqual(result, 499500) def _test_scan(self, backend): # Given a = np.arange(10000, dtype=np.int32) data = a.copy() expect = np.cumsum(data) a = wrap(a, backend=backend) @annotate def input_f(i, ary): return ary[i] @annotate def output_f(i, item, ary): ary[i] = item # When scan = Scan(input_f, output_f, 'a+b', dtype=np.int32, backend=backend) scan(ary=a) a.pull() result = a.data # Then 
np.testing.assert_equal(expect, result) self.assertTrue(len(scan.source) > 100, scan.source) self.assertTrue(len(scan.all_source) > 100, scan.all_source) self.assertTrue(len(scan.all_source) >= len(scan.source)) def _test_large_scan(self, backend): # Given a = np.ones(3000000, dtype=np.int32) data = a.copy() expect = np.cumsum(data) a = wrap(a, backend=backend) @annotate def input_f(i, ary): return ary[i] @annotate def output_f(i, item, ary): ary[i] = item # When scan = Scan(input_f, output_f, 'a+b', dtype=np.int32, backend=backend) scan(ary=a) a.pull() result = a.data # Then np.testing.assert_equal(expect, result) def _test_scan_with_external_func(self, backend): # Given a = np.arange(10000, dtype=np.int32) data = a.copy() expect = np.cumsum(data) a = wrap(a, backend=backend) @annotate def input_f(i, ary): return g(ary[i]) @annotate def output_f(i, item, ary): ary[i] = item # When scan = Scan(input_f, output_f, 'a+b', dtype=np.int32, backend=backend) scan(ary=a) a.pull() result = a.data # Then np.testing.assert_equal(expect, result) def _test_unique_scan(self, backend): # Given a = np.random.randint(0, 100, 100, dtype=np.int32) a = np.sort(a) data = a.copy() unique_ary_actual = np.sort(np.unique(data)) unique_count_actual = len(np.unique(data)) a = wrap(a, backend=backend) unique_ary = np.zeros(len(a.data), dtype=np.int32) unique_ary = wrap(unique_ary, backend=backend) unique_count = np.zeros(1, dtype=np.int32) unique_count = wrap(unique_count, backend=backend) @annotate def input_f(i, ary): if i == 0 or ary[i] != ary[i - 1]: return 1 else: return 0 @annotate def output_f(i, prev_item, item, N, ary, unique, unique_count): if item != prev_item: unique[item - 1] = ary[i] if i == N - 1: unique_count[0] = item # When scan = Scan(input_f, output_f, 'a+b', dtype=np.int32, backend=backend) scan(ary=a, unique=unique_ary, unique_count=unique_count) unique_ary.pull() unique_count.pull() unique_count = unique_count.data[0] # Then self.assertTrue(unique_count == 
unique_count_actual) np.testing.assert_equal(unique_ary_actual, unique_ary.data[:unique_count]) def _test_segmented_scan(self, backend): # Given a = np.random.randint(0, 100, 50000, dtype=np.int32) a_copy = a.copy() seg = np.random.randint(0, 100, 50000, dtype=np.int32) seg = (seg == 0).astype(np.int32) seg_copy = seg.copy() a = wrap(a, backend=backend) seg = wrap(seg, backend=backend) @annotate def input_f(i, ary): return ary[i] @annotate def segment_f(i, seg_flag): return seg_flag[i] @annotate def output_f(i, item, ary): ary[i] = item output_actual = self._get_segmented_scan_actual(a_copy, seg_copy) # When scan = Scan(input_f, output_f, 'a+b', dtype=np.int32, backend=backend, is_segment=segment_f) scan(ary=a, seg_flag=seg) a.pull() # Then np.testing.assert_equal(output_actual, a.data) def _test_scan_last_item(self, backend): # Given a = np.random.randint(0, 100, 50000, dtype=np.int32) a_copy = a.copy() a = wrap(a, backend=backend) @annotate def output_f(i, last_item, item, ary): ary[i] = item + last_item expect = np.cumsum(a_copy) + np.cumsum(a_copy)[-1] # When scan = Scan(output=output_f, scan_expr='a+b', dtype=np.int32, backend=backend) scan(input=a, ary=a) a.pull() # Then np.testing.assert_equal(expect, a.data) def _test_atomic_inc(self, backend): # Given a = np.random.randint(0, 100, 50000, dtype=np.int32) result = zeros(1, dtype=np.int32, backend=backend) a = wrap(a, backend=backend) @annotate def reduce_knl(i, x, result): b = declare('int') b = atomic_inc(result[0]) # When knl = Elementwise(reduce_knl, backend=backend) knl(a, result) # Then self.assertTrue(result[0] == 50000) def _test_atomic_dec(self, backend): # Given a = np.random.randint(0, 100, 50000, dtype=np.int32) result = zeros(1, dtype=np.int32, backend=backend) a = wrap(a, backend=backend) @annotate def reduce_knl(i, x, result): b = declare('int') b = atomic_dec(result[0]) # When knl = Elementwise(reduce_knl, backend=backend) knl(a, result) # Then self.assertTrue(result[0] == -50000) 
compyle-release-0.8.1/compyle/tests/test_profile.py000066400000000000000000000047141414173670100225030ustar00rootroot00000000000000import unittest import numpy as np from pytest import importorskip from ..config import get_config, use_config from ..array import wrap, zeros, ones from ..profile import ( get_profile_info, named_profile, profile, profile_ctx, ProfileContext ) def axpb(): a, b = 7, 13 x = np.random.rand(1000) return a * x + b class A: @profile def f(self): pass class B: def __init__(self): self.name = 'my_name' @profile def f(self): pass @profile(name='explicit_name') def named(self): pass @profile def profiled_axpb(): axpb() @profile def nested(): profiled_axpb() @named_profile('prefix_sum', backend='opencl') def get_prefix_sum_knl(): from ..opencl import get_queue, get_context from pyopencl.scan import GenericScanKernel ctx = get_context() queue = get_queue() return GenericScanKernel(ctx, np.int32, arguments="__global int *ary", input_expr="ary[i]", scan_expr="a+b", neutral="0", output_statement="ary[i] = prev_item") def test_profile_ctx(): with profile_ctx('axpb'): axpb() profile_info = get_profile_info() assert profile_info[0]['axpb']['calls'] == 1 def test_profile(): for i in range(100): profiled_axpb() profile_info = get_profile_info() assert profile_info[0]['profiled_axpb']['calls'] == 100 def test_profile_method(): # Given a = A() b = B() # When for i in range(5): a.f() b.f() b.named() # Then profile_info = get_profile_info() assert profile_info[0]['A.f']['calls'] == 5 # For b.f(), b.name is my_name. 
assert profile_info[0]['my_name']['calls'] == 5 # profile was given an explicit name for b.named() assert profile_info[0]['explicit_name']['calls'] == 5 def test_named_profile(): importorskip('pyopencl') get_config().profile = True knl = get_prefix_sum_knl() x = ones(100, np.int32, backend='opencl') knl(x.dev) profile_info = get_profile_info() assert profile_info[0]['prefix_sum']['calls'] == 1 def test_nesting_and_context(): # When p = ProfileContext('main') nested() p.stop() # Then prof = get_profile_info() assert len(prof) == 3 assert prof[0]['main']['calls'] == 1 assert prof[1]['nested']['calls'] == 1 assert prof[2]['profiled_axpb']['calls'] == 1 compyle-release-0.8.1/compyle/tests/test_template.py000066400000000000000000000056451414173670100226620ustar00rootroot00000000000000from textwrap import dedent import numpy as np from ..array import wrap from ..types import annotate, KnownType from ..template import Template from ..parallel import Elementwise class SimpleTemplate(Template): def __init__(self, name, cond=False): super(SimpleTemplate, self).__init__(name=name) self.cond = cond def template(self, x, y): '''Docstring text''' ''' % for i in range(5): print(${i}) % endfor % if obj.cond: return 'hello' % else: return 'bye' % endif ''' class Dummy(Template): def template(self): '''Docs''' ''' print(123) ''' class ParallelExample(Template): @annotate(i='int', x='doublep', y='doublep') def template(self, i, x, y): ''' y[i] = x[i]*2.0 ''' class ExtraArgs(Template): def extra_args(self): return ['x'], {'x': 'int'} def template(self): ''' return x + 1 ''' def test_simple_template(): # Given t = SimpleTemplate(name='simple') # When simple = t.function x = simple(1, 2) # Then assert x == 'bye' # Given t = SimpleTemplate(name='simple', cond=True) # When simple = t.function x = simple(1, 2) # Then assert x == 'hello' def test_that_source_code_is_available(): # Given/When dummy = Dummy('dummy').function # Then expect = dedent('''\ def dummy(): """Docs """ print(123) ''') 
assert dummy.source.strip() == expect.strip() assert dummy.is_jit is True def test_template_usable_in_code_generation(): # Given twice = ParallelExample('twice').function x = np.linspace(0, 1, 10) y = np.zeros_like(x) x, y = wrap(x, y) # When e = Elementwise(twice) e(x, y) # Then y.pull() np.testing.assert_almost_equal(y, 2.0*x.data) assert twice.is_jit is False def test_template_with_extra_args(): # Given extra = ExtraArgs('extra').function # When result = extra(1) # Then assert result == 2 assert extra.__annotations__ == {'x': KnownType('int')} def test_template_inject_works(): # Given def f(x): '''Docs ''' for i in range(5): x += i return x + 1 # When t = Template('t') result = t.inject(f, indent=1) # Then lines = ['for i in range(5):\n', ' x += i\n', 'return x + 1\n'] expect = ''.join([' '*4 + x for x in lines]) assert result == expect # When result = t.inject(f, indent=2) # Then lines = ['for i in range(5):\n', ' x += i\n', 'return x + 1\n'] expect = ''.join([' '*8 + x for x in lines]) assert result == expect # When result = t.inject(f, indent=0) # Then lines = ['for i in range(5):\n', ' x += i\n', 'return x + 1\n'] expect = ''.join(lines) assert result == expect compyle-release-0.8.1/compyle/tests/test_translator.py000066400000000000000000000626051414173670100232370ustar00rootroot00000000000000from textwrap import dedent import pytest import numpy as np import sys from ..config import get_config from ..types import annotate, declare from ..translator import ( CConverter, CodeGenerationError, CStructHelper, KnownType, OpenCLConverter, CUDAConverter, py2c ) @annotate(i='int', y='floatp', return_='float') def annotated_f(i, y): x = declare('LOCAL_MEM matrix(64)') return y[i] def test_simple_assignment_expression(): # Given src = dedent(''' b = (2*a + 1)*(-a/1.5)%2 ''') # When code = py2c(src) # Then expect = dedent(''' double a; double b; b = ((((2 * a) + 1) * (-a / 1.5)) % 2); ''') assert code == expect.strip() def test_multiple_assignment_expressions(): # 
Given src = dedent(''' a = 21.5 b = (2*a + 1)*(a/1.5)%2 ''') # When code = py2c(src) # Then expect = dedent(''' double a; double b; a = 21.5; b = ((((2 * a) + 1) * (a / 1.5)) % 2); ''') assert code == expect.strip() def test_if_block(): # Given src = dedent(''' a = 21.5 if a > 20: b = a - 1 elif a < 20: b = a + 1 else: b = a ''') # When code = py2c(src) # Then expect = dedent(''' double a; double b; a = 21.5; if ((a > 20)) { b = (a - 1); } else { if ((a < 20)) { b = (a + 1); } else { b = a; } } ''') assert code.strip() == expect.strip() def test_conditionals(): # Given src = dedent(''' if (x > 10 and x < 20) or not (x >= 10 and x <= 20): y ''') # When code = py2c(src) # Then expect = dedent(''' double x; double y; if ((((x > 10) && (x < 20)) || !((x >= 10) && (x <= 20)))) { y; } ''') assert code.strip() == expect.strip() # Given src = dedent(''' if x != 10 and x is 100 or (x == 20 and x is not 1): pass ''') # When code = py2c(src) # Then expect = dedent(''' double x; if ((((x != 10) && (x == 100)) || ((x == 20) && (x != 1)))) { ; } ''') assert code.strip() == expect.strip() # Given src = dedent(''' if x != 10 and x is 100 or (x == 20 and x is not 1): pass ''') # When code = py2c(src) # Then expect = dedent(''' double x; if ((((x != 10) && (x == 100)) || ((x == 20) && (x != 1)))) { ; } ''') assert code.strip() == expect.strip() def test_ternary_operator(): # Given src = dedent(''' y = 2.0 x = 1.0 if y >= 2.0 else 0.0 ''') # When code = py2c(src) # Then expect = dedent(''' double x; double y; y = 2.0; x = (y >= 2.0) ? 
1.0 : 0.0; ''') assert code.strip() == expect.strip() def test_multiple_boolops(): # Given src = dedent(''' if x % 2 == 0 or x % 2 == 1 or x > 0: pass ''') # When code = py2c(src) # Then expect = dedent(''' double x; if ((((x % 2) == 0) || ((x % 2) == 1) || (x > 0))) { ; } ''') assert code.strip() == expect.strip() def test_multiple_bitwise_ops(): # Given src = dedent(''' x = 1 << 5 y = x >> 2 z = (x | y) ^ (x & y) ''') # When code = py2c(src) # Then expect = dedent(''' double x; double y; double z; x = (1 << 5); y = (x >> 2); z = ((x | y) ^ (x & y)); ''') assert code.strip() == expect.strip() def test_power(): # Given src = dedent(''' 1.5*x**2 ''') # When code = py2c(src) # Then expect = dedent(''' double x; (1.5 * pow(x, 2)); ''') assert code.strip() == expect.strip() def test_only_two_operands_supported_for_comparisons(): # Given src = dedent(''' if 10 < x < 20: pass ''') # When with pytest.raises(NotImplementedError): py2c(src) def test_calling_function(): # Given src = dedent(''' sin(23.2 + 1) ''') # When code = py2c(src) # Then expect = dedent(''' sin((23.2 + 1)); ''') assert code == expect.strip() def test_calling_printf_with_string(): # Given src = dedent(r''' printf('%s %d %f\n', 'hello', 1, 2.0) ''') # When code = py2c(src) # Then expect = dedent(''' printf("%s %d %f\n", "hello", 1, 2.0); ''') assert code == expect.strip() def test_subscript(): # Given src = dedent(''' x[1] ''') # When code = py2c(src) # Then expect = dedent(''' double x; x[1]; ''') assert code == expect.strip() def test_known_math_constants(): # Given src = dedent(''' x = M_E + M_LOG2E + M_LOG10E + M_LN2 + M_LN10 x += M_PI + M_PI_2 + M_PI_4 + M_1_PI * M_2_PI x += M_2_SQRTPI * M_SQRT2 * M_SQRT1_2 x = INFINITY x = NAN x = HUGE_VALF ''') # When code = py2c(src) # Then expect = dedent(''' double x; x = ((((M_E + M_LOG2E) + M_LOG10E) + M_LN2) + M_LN10); x += (((M_PI + M_PI_2) + M_PI_4) + (M_1_PI * M_2_PI)); x += ((M_2_SQRTPI * M_SQRT2) * M_SQRT1_2); x = INFINITY; x = NAN; x = HUGE_VALF; ''') 
assert code == expect.strip() def test_simple_function_with_return(): # Given src = dedent(''' def f(x=0.0): 'docstring' y = x + 1 return y ''') # When code = py2c(src) # Then expect = dedent(''' double f(double x) { double y; y = (x + 1); return y; } ''') assert code.strip() == expect.strip() def test_simple_function_without_return(): # Given src = dedent(''' def f(y=0.0, x=0.0): z = y + x y = z ''') # When code = py2c(src) # Then expect = dedent(''' void f(double y, double x) { double z; z = (y + x); y = z; } ''') assert code.strip() == expect.strip() def test_function_argument_types(): # Given src = dedent(''' def f(s_idx, s_p, d_idx, d_p, J=0, t=0.0, l=[0,0], xx=(0, 0)): pass ''') # When code = py2c(src) # Then expect = dedent(''' void f(long s_idx, double* s_p, long d_idx, double* d_p, long J, double t, double* l, double* xx) { ; } ''') assert code.strip() == expect.strip() def test_known_types_in_funcargs(): # Given src = dedent(''' def f(x, xx, cond=True): pass ''') # When known_types = {'xx': KnownType('foo*'), 'x': KnownType('float32')} code = py2c(src, known_types=known_types) # Then expect = dedent(''' void f(float32 x, foo* xx, int cond) { ; } ''') assert code.strip() == expect.strip() def test_annotated_function(): # Given/When t = CConverter() code = t.parse_function(annotated_f) # Then expect = dedent(''' float annotated_f(int i, float* y) { LOCAL_MEM double x[64]; return y[i]; } ''') assert code.strip() == expect.strip() @pytest.mark.skipif(sys.version_info < (3, 4), reason='Requires Python3') def test_py3_annotations(): # Given/When from .py3_code import py3_f t = CConverter() code = t.parse_function(py3_f) # Then expect = dedent(''' int py3_f(int x) { int y; y = (x + 1); return (x * y); } ''') assert code.strip() == expect.strip() def test_calling_method_of_known_type(): # Given src = dedent(''' obj.method(1, 2) obj.meth() ''') known = {'obj': KnownType('SomeClass*', base_type='SomeClass')} # When code = py2c(src, known_types=known) # Then expect 
= dedent(''' SomeClass_method(obj, 1, 2); SomeClass_meth(obj); ''') assert code.strip() == expect.strip() def test_calling_method_of_known_type_in_method(): # Given src = dedent(''' class Foo(object): def g(self): pass def f(self, obj): obj.method(1, 2) self.g() ''') # When known = {'obj': KnownType('SomeClass*', base_type='SomeClass')} code = py2c(src, known_types=known) # Then expect = dedent(''' void Foo_g(Foo* self) { ; } void Foo_f(Foo* self, SomeClass* obj) { SomeClass_method(obj, 1, 2); Foo_g(self); } ''') assert code.strip() == expect.strip() def test_raises_error_when_unknown_args_are_given(): # Given src = dedent(''' def f(x): pass ''') # When/Then with pytest.raises(CodeGenerationError): py2c(src) # Given # Unsupported default arg. src = dedent(''' def f(x=''): pass ''') # When/Then with pytest.raises(CodeGenerationError): py2c(src) # Given # Unsupported default arg list. src = dedent(''' def f(x=(1, '')): pass ''') # When/Then with pytest.raises(CodeGenerationError): py2c(src) def test_user_supplied_detect_type(): # Given src = dedent(''' def f(x, xx=[1,2,3], cond=True): pass ''') # When def dt(name, value): return 'double' code = py2c(src, detect_type=dt) # Then expect = dedent(''' void f(double x, double xx, double cond) { ; } ''') assert code.strip() == expect.strip() def test_while(): # Given src = dedent(''' while x < 21: do(x) do1(x) ''') # When code = py2c(src) # Then expect = dedent(''' double x; while ((x < 21)) { do(x); do1(x); } ''') assert code.strip() == expect.strip() def test_bool_true_false_and_none(): # Given src = dedent(''' while True: pass if False: pass if x is None or x is not None: pass ''') # When code = py2c(src) # Then expect = dedent(''' double x; while (1) { ; } if (0) { ; } if (((x == NULL) || (x != NULL))) { ; } ''') assert code.strip() == expect.strip() def test_for(): # Given src = dedent(''' for i in range(5): do(i) ''') # When code = py2c(src) # Then expect = dedent(''' for (long i=0; i<5; i+=1) { do(i); } ''') assert 
code.strip() == expect.strip() # Given src = dedent(''' for i in range(2, 5): pass ''') # When code = py2c(src) # Then expect = dedent(''' for (long i=2; i<5; i+=1) { ; } ''') assert code.strip() == expect.strip() # Given src = dedent(''' for i in range(2, 10, 2): pass ''') # When code = py2c(src) # Then expect = dedent(''' for (long i=2; i<10; i+=2) { ; } ''') assert code.strip() == expect.strip() def test_for_with_decreasing_range(): # Given src = dedent(''' for i in range(10, -1, -1): pass ''') # When code = py2c(src) # Then expect = dedent(''' for (long i=10; i>-1; i+=-1) { ; } ''') assert code.strip() == expect.strip() def test_for_with_declare(): # Given src = dedent(''' i = declare('int') for i in range(5): do(i) ''') # When code = py2c(src) # Then expect = dedent(''' int i; for (i=0; i<5; i+=1) { do(i); } ''') assert code.strip() == expect.strip() def test_two_fors(): # Given src = dedent(''' for i in range(5): do(i) for i in range(5): pass ''') # When code = py2c(src) # Then expect = dedent(''' for (long i=0; i<5; i+=1) { do(i); } for (long i=0; i<5; i+=1) { ; } ''') assert code.strip() == expect.strip() def test_for_with_symbols(): # Given src = dedent(''' n = declare('int') n = 25 for i in range(n): pass for i in range(0, n+1, step()): pass ''') # When code = py2c(src) # Then expect = dedent(''' int n; n = 25; long __cpy_stop_0 = n; for (long i=0; i<__cpy_stop_0; i+=1) { ; } long __cpy_stop_1 = (n + 1); long __cpy_step_1 = step(); if (__cpy_step_1 < 0) { for (long i=0; i>__cpy_stop_1; i+=__cpy_step_1) { ; } } else { for (long i=0; i<__cpy_stop_1; i+=__cpy_step_1) { ; } } ''') assert code.strip() == expect.strip() def test_nested_for_with_symbols(): # Given src = dedent(''' n = declare('int') n = 25 for i in range(n): for j in range(0, n+1, step()): pass for i in range(n+1): for j in range(0, n+2, step()): pass ''') # When code = py2c(src) # Then expect = dedent(''' int n; n = 25; long __cpy_stop_0 = n; for (long i=0; i<__cpy_stop_0; i+=1) { long 
__cpy_stop_1 = (n + 1); long __cpy_step_1 = step(); if (__cpy_step_1 < 0) { for (long j=0; j>__cpy_stop_1; j+=__cpy_step_1) { ; } } else { for (long j=0; j<__cpy_stop_1; j+=__cpy_step_1) { ; } } } long __cpy_stop_2 = (n + 1); for (long i=0; i<__cpy_stop_2; i+=1) { long __cpy_stop_3 = (n + 2); long __cpy_step_3 = step(); if (__cpy_step_3 < 0) { for (long j=0; j>__cpy_stop_3; j+=__cpy_step_3) { ; } } else { for (long j=0; j<__cpy_stop_3; j+=__cpy_step_3) { ; } } } ''') assert code.strip() == expect.strip() def test_with_two_functions(): # Given src = dedent(''' def f(): n = declare('int') n = 20 for i in range(n): pass for i in range(n): pass def g(): n = declare('int') n = 20 for i in range(n): pass for i in range(n): pass ''') code = py2c(src) expect = dedent(''' void f() { int n; n = 20; long __cpy_stop_0 = n; for (long i=0; i<__cpy_stop_0; i+=1) { ; } long __cpy_stop_1 = n; for (long i=0; i<__cpy_stop_1; i+=1) { ; } } void g() { int n; n = 20; long __cpy_stop_0 = n; for (long i=0; i<__cpy_stop_0; i+=1) { ; } long __cpy_stop_1 = n; for (long i=0; i<__cpy_stop_1; i+=1) { ; } } ''') assert code.strip() == expect.strip() def test_for_with_break_continue(): # Given src = dedent(''' for i in range(10): if i%7 == 0: break if i%2 == 0: continue do(i) ''') # When code = py2c(src) # Then expect = dedent(''' for (long i=0; i<10; i+=1) { if (((i % 7) == 0)) { break; } if (((i % 2) == 0)) { continue; } do(i); } ''') assert code.strip() == expect.strip() def test_for_not_range_and_else_fails(): # Given src = dedent(''' for i in something(): pass ''') # When/Then with pytest.raises(NotImplementedError): py2c(src) # Given src = dedent(''' for i in range(5): pass else: pass ''') # When/Then with pytest.raises(NotImplementedError): py2c(src) # Given src = dedent(''' for i in range(0, 5, 2, 3): pass ''') # When/Then with pytest.raises(NotImplementedError): py2c(src) def test_while_else_raises_error(): # Given src = dedent(''' while 1: do() else: do() ''') # When/Then with 
pytest.raises(NotImplementedError): py2c(src) def test_try_block_raises_error(): # Given src = dedent(''' try: do() except ImportError: pass ''') # When/Then with pytest.raises(NotImplementedError): py2c(src) def test_attribute_access(): # Given src = dedent(''' self.x = 1 ''') # When code = py2c(src) # Then expect = dedent(''' double self; self->x = 1; ''') assert code.strip() == expect.strip() def test_declare_call_declares_variable(): # Given src = dedent(''' x = declare('int') x += 1 ''') # When code = py2c(src) # Then expect = dedent(''' int x; x += 1; ''') assert code.strip() == expect.strip() def test_declare_matrix(): # Given src = dedent(''' x = declare('matrix((3,))') do(x[0]) ''') # When code = py2c(src) # Then expect = dedent(''' double x[3]; do(x[0]); ''') assert code.strip() == expect.strip() # Given src = dedent(''' x = declare('matrix((2, 3))') do(x[0][1]) ''') # When code = py2c(src) # Then expect = dedent(''' double x[2][3]; do(x[0][1]); ''') assert code.strip() == expect.strip() # Given src = dedent(''' x = declare('matrix((2, 3), "int")') do(x[0][1]) ''') # When code = py2c(src) # Then expect = dedent(''' int x[2][3]; do(x[0][1]); ''') assert code.strip() == expect.strip() def test_declare_call_declares_multiple_variables(): # Given src = dedent(''' x, y = declare('int', 2) u, v = declare('matrix(3)', 2) A = declare('matrix((2,2), "long")') ''') # When code = py2c(src) # Then expect = dedent(''' int x, y; double u[3], v[3]; long A[2][2]; ''') assert code.strip() == expect.strip() def test_class(): # Given src = dedent(''' class Foo(object): def g(self, x=0.0): return x*2.0 def f(self, x=0.0): y = x + 1 do(self.a, x) z = self.g(y) ''') # When code = py2c(src) # Then expect = dedent(''' double Foo_g(Foo* self, double x) { return (x * 2.0); } void Foo_f(Foo* self, double x) { double y; double z; y = (x + 1); do(self->a, x); z = Foo_g(self, y); } ''') assert code.strip() == expect.strip() def test_unsupported_method(): # Given src = dedent(''' 
np.identity(25) ''') # When with pytest.raises(NotImplementedError): py2c(src) def test_c_struct_helper(): # Given class Fruit(object): pass f = Fruit() f.apple = 1 f.banana = 2.0 f.pear = 1.5 h = CStructHelper(f) # When result = h.get_code() # Then expect = dedent(''' typedef struct Fruit { int apple; double banana; double pear; } Fruit; ''') assert result.strip() == expect.strip() # When/Then array = h.get_array() use_double = get_config().use_double fdtype = np.float64 if use_double else np.float32 expect = np.dtype([('apple', np.int32), ('banana', fdtype), ('pear', fdtype)]) assert array.dtype == expect assert array['apple'] == 1 assert array['banana'] == 2.0 assert array['pear'] == 1.5 def test_c_struct_helper_empty_object(): # Given class Fruit(object): pass f = Fruit() h = CStructHelper(f) # When result = h.get_code() # Then expect = dedent(''' typedef struct Fruit { } Fruit; ''') assert result.strip() == expect.strip() # When/Then assert h.get_array() is None def test_wrapping_class(): # Given class Dummy(object): '''Class Docstring''' def __init__(self, x=0, f=0.0, s=''): "Constructor docstring" self.x = x self.f = f self.s = s self._private = 1 def method(self): '''Method docstring. 
''' pass obj = Dummy() # When c = CConverter() result = c.parse_instance(obj) # Then expect = dedent(''' typedef struct Dummy { double f; int x; } Dummy; void Dummy_method(Dummy* self) { ; } ''') assert result.strip() == expect.strip() # When h = CStructHelper(obj) use_double = get_config().use_double fdtype = np.float64 if use_double else np.float32 dtype = np.dtype([('f', fdtype), ('x', np.int32)]) expect = np.zeros(1, dtype) assert h.get_array() == expect def test_wrapping_class_with_ignore_methods(): # Given class Dummy1(object): '''Class Docstring''' def f(self): pass def not_me(self): pass obj = Dummy1() # When c = CConverter() result = c.parse_instance(obj, ignore_methods=['not_me']) # Then expect = dedent(''' typedef struct Dummy1 { } Dummy1; void Dummy1_f(Dummy1* self) { ; } ''') assert result.strip() == expect.strip() def check_opencl_cuda_conversion(converter_obj): # Note that LID_0 etc. are predefined symbols when we include the CLUDA # preamble, therefore should be known. src = dedent(''' def f(s_idx, s_p, d_idx, d_p, J=0, t=0.0, l=[0,0], xx=(0, 0)): s_p[s_idx] = LID_0*GID_0 ''') # When known_types = {'d_p': KnownType('GLOBAL_MEM int*')} converter = converter_obj(known_types=known_types) code = converter.convert(src) # Then expect = dedent(''' WITHIN_KERNEL void f(long s_idx, GLOBAL_MEM double* s_p, long d_idx, GLOBAL_MEM int* d_p, long J, double t, double* l, double* xx) { s_p[s_idx] = (LID_0 * GID_0); } ''') assert code.strip() == expect.strip() def test_cuda_conversion(): check_opencl_cuda_conversion(CUDAConverter) def test_opencl_conversion(): check_opencl_cuda_conversion(OpenCLConverter) def test_opencl_class(): src = dedent(''' class Foo(object): def g(self, x=0.0): pass ''') # When converter = OpenCLConverter() code = converter.convert(src) # Then expect = dedent(''' WITHIN_KERNEL void Foo_g(GLOBAL_MEM Foo* self, double x) { ; } ''') assert code.strip() == expect.strip() def test_cuda_local_conversion(): @annotate(xc='ldoublep', yc='lintp') def 
knl(xc, yc): xc[LID_0] = 1 yc[LID_0] = 1 # When converter = CUDAConverter() code = converter.parse(knl) # Then expect_1 = dedent(''' WITHIN_KERNEL void knl(int size_xc, int size_yc) { extern LOCAL_MEM float shared_buff[]; double* xc = (double*) shared_buff; int* yc = (int*) &xc[size_xc]; xc[LID_0] = 1; yc[LID_0] = 1; } ''') expect_2 = dedent(''' WITHIN_KERNEL void knl(int size_xc, int size_yc) { extern LOCAL_MEM float shared_buff[]; int* yc = (int*) shared_buff; double* xc = (double*) &yc[size_yc]; xc[LID_0] = 1; yc[LID_0] = 1; } ''') assert code.strip() == expect_1.strip() or code.strip() == expect_2.strip() def test_handles_parsing_functions(): # Given def f(x=1.0): return x + 1 # When t = CConverter() code = t.parse_function(f) # Then expect = dedent(''' double f(double x) { return (x + 1); } ''') assert code.strip() == expect.strip() # Given class A(object): def f(self, x=1.0): return x + 1.0 # When t = CConverter() code = t.parse_function(A) # Then expect = dedent(''' double A_f(A* self, double x) { return (x + 1.0); } ''') assert code.strip() == expect.strip() def test_address_works(): # Given def f(x=1.0): return address(x) # When t = CConverter() code = t.parse_function(f) # Then expect = dedent(''' double f(double x) { return (&x); } ''') assert code.strip() == expect.strip() def test_atomic_inc_works(): # Given def f(x=1.0): return atomic_inc(x) # When t = OpenCLConverter() code = t.parse_function(f) # Then expect = dedent(''' WITHIN_KERNEL double f(double x) { return atomic_inc(&x); } ''') assert code.strip() == expect.strip() # When t = CUDAConverter() code = t.parse_function(f) # Then expect = dedent(''' WITHIN_KERNEL double f(double x) { return atomicAdd(&x, 1); } ''') assert code.strip() == expect.strip() def test_atomic_dec_works(): # Given def f(x=1.0): return atomic_dec(x) # When t = OpenCLConverter() code = t.parse_function(f) # Then expect = dedent(''' WITHIN_KERNEL double f(double x) { return atomic_dec(&x); } ''') assert code.strip() == 
expect.strip() # When t = CUDAConverter() code = t.parse_function(f) # Then expect = dedent(''' WITHIN_KERNEL double f(double x) { return atomicAdd(&x, -1); } ''') assert code.strip() == expect.strip() def test_cast_works(): # Given def f(x=1.0): return cast(x, "float") # When t = OpenCLConverter() code = t.parse_function(f) # Then expect = dedent(''' WITHIN_KERNEL double f(double x) { return (float) (x); } ''') assert code.strip() == expect.strip() # When t = CUDAConverter() code = t.parse_function(f) # Then expect = dedent(''' WITHIN_KERNEL double f(double x) { return (float) (x); } ''') assert code.strip() == expect.strip() compyle-release-0.8.1/compyle/tests/test_transpiler.py000066400000000000000000000041601414173670100232210ustar00rootroot00000000000000from math import sin import unittest from ..transpiler import get_external_symbols_and_calls, Transpiler from ..extern import printf SIZE = 10 my_printf = printf def h(x=0.0): return sin(x) + 1 def f(x=0.0): return h(x*2+1) def g(x=0.0): return f(x*2) def implicit_f(x, y): # These should be ignored. j = LID_0 + GID_0 + LDIM_0 + GDIM_0 s = y[SIZE-1] for i in range(SIZE): s += sin(x[i]) my_printf("%f", s) return s def undefined_call(x): # An intentional error that should be caught. 
class TestDeclare(unittest.TestCase):
    """Tests for the pure-Python ``declare`` helper."""

    def test_declare(self):
        """Scalars come back as zeros, ``num > 1`` as tuples, matrices as
        zero-filled arrays."""
        self.assertEqual(declare('int'), 0)
        self.assertEqual(declare('long'), 0)
        self.assertEqual(declare('double'), 0.0)
        self.assertEqual(declare('float'), 0.0)

        self.assertEqual(declare('int', 2), (0, 0))
        self.assertEqual(declare('long', 3), (0, 0, 0))
        self.assertEqual(declare('double', 2), (0.0, 0.0))
        self.assertEqual(declare('float', 3), (0.0, 0.0, 0.0))

        res = declare('matrix(3)')
        self.assertTrue(np.all(res == np.zeros(3)))

        res = declare('matrix(3)', 3)
        self.assertEqual(len(res), 3)
        for i in range(3):
            # Check every returned matrix, not just the first one.
            self.assertTrue(np.all(res[i] == np.zeros(3)))

        res = declare('matrix((3,))')
        self.assertTrue(np.all(res == np.zeros(3)))
        res = declare('matrix((3, 3))')
        self.assertTrue(np.all(res == np.zeros((3, 3))))

    def test_declare_with_type(self):
        """An explicit element type in ``matrix(...)`` sets the dtype."""
        res = declare('matrix(3, "int")')
        self.assertTrue(np.all(res == np.zeros(3)))
        self.assertEqual(res.dtype, np.int32)

        res = declare('matrix((2, 2), "unsigned int")')
        self.assertTrue(np.all(res == np.zeros((2, 2))))
        self.assertEqual(res.dtype, np.uint32)

        res = declare('matrix((3,), "float")')
        self.assertTrue(np.all(res == np.zeros((3,))))
        self.assertEqual(res.dtype, np.float32)

    def test_declare_with_address_space(self):
        """A leading address-space qualifier does not change the value."""
        self.assertEqual(declare('LOCAL_MEM int', 2), (0, 0))
        self.assertEqual(declare('GLOBAL_MEM float', 2), (0.0, 0.0))

        res = declare('LOCAL_MEM matrix(3)')
        self.assertTrue(np.all(res == np.zeros(3)))
        res = declare('GLOBAL_MEM matrix(3)')
        self.assertTrue(np.all(res == np.zeros(3)))
cpdef argsort(array, keys=None):
    """Argsort a CUDA-backed array through ``cupy.cuda.thrust.argsort``.

    Parameters
    ----------

    array: the array to sort; its ``length``, ``dtype`` and ``dev.ptr``
        attributes are read.
    keys: optional array whose ``dev.ptr`` is forwarded to thrust. When it
        is None a null pointer (0) is passed instead.

    Returns
    -------

    A new ``np.intp`` array of ``array.length`` items allocated on the
    'cuda' backend, filled in by thrust.
    """
    idx_array = carr.empty(array.length, np.intp, backend='cuda')

    cdef vector[int] shape
    shape.push_back(array.length)

    cdef size_t keys_ptr
    # Test for "not supplied" explicitly rather than relying on the
    # truthiness of an array-like object.
    if keys is not None:
        keys_ptr = keys.dev.ptr
    else:
        keys_ptr = 0

    thrust.argsort(array.dtype, idx_array.dev.ptr, array.dev.ptr, keys_ptr,
                   shape)

    return idx_array
compyle-release-0.8.1/compyle/translator.py000066400000000000000000000700331414173670100210300ustar00rootroot00000000000000'''Simple Python to C converter. While this is a fresh implementation, it is highly inspired from https://github.com/mdipierro/ocl This code does not use meta and uses the standard ast visitor, it is also tested and modified to be suitable for use with PySPH. ''' from __future__ import absolute_import import ast import re import sys from textwrap import dedent, wrap import types import numpy as np from mako.template import Template from .config import get_config from .types import get_declare_info from .cython_generator import ( CodeGenerationError, KnownType, Undefined, all_numeric ) from .utils import getsource PY_VER = sys.version_info.major def detect_type(name, value): if isinstance(value, KnownType): return value.type if name.startswith(('s_', 'd_')) and name not in ['s_idx', 'd_idx']: return 'double*' if name in ['s_idx', 'd_idx']: return 'long' if value is Undefined or isinstance(value, Undefined): raise CodeGenerationError('Unknown type, for %s' % name) if isinstance(value, bool): return 'int' elif isinstance(value, int): return 'long' elif isinstance(value, float): return 'double' elif isinstance(value, (list, tuple)): if all_numeric(value): # We don't deal with integer lists for now. 
class CStructHelper(object):
    """Describe the public numeric attributes of an instance as a C struct.

    Only attributes found in the instance ``__dict__`` whose names do not
    start with an underscore and whose values are int, bool or float are
    considered.
    """

    def __init__(self, obj):
        self._use_double = get_config().use_double
        self.parse(obj)

    def _get_public_vars(self):
        """Return a dict mapping each public attribute name to its C type."""
        public = {}
        for attr, val in self.obj.__dict__.items():
            if attr.startswith('_'):
                continue
            if isinstance(val, (int, bool)):
                public[attr] = 'int'
            elif isinstance(val, float):
                public[attr] = 'double'
        return public

    def parse(self, obj):
        """Record the class name, the instance and its public variables."""
        self.name = obj.__class__.__name__
        self.obj = obj
        self.vars = self._get_public_vars()

    def get_array(self):
        """Return a one-element structured array holding the instance data.

        Fields are ordered by name. Returns None when there are no public
        variables.
        """
        if not self.vars:
            return None

        float_type = np.float64 if self._use_double else np.float32
        np_types = {'int': np.int32, 'double': float_type, 'long': np.int64}

        names = sorted(self.vars)
        layout = np.dtype([(n, np_types[self.vars[n]]) for n in names])
        record = np.empty(1, layout)
        for n in names:
            record[n][0] = getattr(self.obj, n)
        return record

    def get_code(self):
        """Render the ``typedef struct`` declaration for the instance."""
        template = dedent("""
        typedef struct ${class_name} {
        %for name, type in sorted(vars.items()):
            ${type} ${name};
        %endfor
        } ${class_name};
        """)
        return Template(text=template).render(
            class_name=self.name, vars=self.vars
        )
self._class_name = '' self._src = '' self._for_count = 0 self._added_loop_vars = set() self._annotations = {} self._declarations = None self._ignore_methods = [] self._replacements = { 'True': '1', 'False': '0', 'None': 'NULL', True: '1', False: '0', None: 'NULL', } self.function_address_space = '' def _body_has_return(self, body): return re.search(r'\breturn\b', body) is not None def _get_return_type(self, body, node): annotations = self._annotations.get(node.name) if annotations: kt = annotations.get('return') return kt.type if kt is not None else 'void' else: return 'double' if self._body_has_return(body) else 'void' def _get_self_type(self): return KnownType('%s*' % self._class_name) def _get_local_arg(self, arg, type): return arg, type def _get_function_args(self, node): node_args = node.args.args if PY_VER == 2: args = [x.id for x in node_args] else: args = [x.arg for x in node_args] annotations = self._annotations.get(node.name) call_args = {} if annotations: for arg in args: call_args[arg] = annotations.get(arg, Undefined) else: defaults = [ast.literal_eval(x) for x in node.args.defaults] # Fill up the call_args dict with the defaults. for i in range(1, len(defaults) + 1): call_args[args[-i]] = defaults[-i] # Set the rest to Undefined. 
for i in range(len(args) - len(defaults)): call_args[args[i]] = Undefined call_args.update(self._known_types) if len(self._class_name) > 0: call_args['self'] = self._get_self_type() call_sig = [] for arg in args: value = call_args[arg] type = self._detect_type(arg, value) if 'LOCAL_MEM' in type: arg, type = self._get_local_arg(arg, type) call_sig.append('{type} {arg}'.format(type=type, arg=arg)) return ', '.join(call_sig) def _get_variable_declaration(self, type_str, names): kind, address_space, ctype, shape = get_declare_info(type_str) if address_space: address_space += ' ' if kind == 'matrix': if not isinstance(shape, tuple): shape = (shape,) sz = ''.join('[%d]' % x for x in shape) vars = ['%s%s' % (x, sz) for x in names] return '{address}{type} {vars};'.format( address=address_space, type=ctype, vars=', '.join(vars) ) else: return '{address}{type} {vars};'.format( address=address_space, type=ctype, vars=', '.join(names) ) def _indent_block(self, code): lines = code.splitlines() pad = ' ' * 4 return '\n'.join(pad + x for x in lines) def _remove_docstring(self, body): if body and isinstance(body[0], ast.Expr) and \ isinstance(body[0].value, ast.Str): return body[1:] else: return body def _get_local_info(self, obj): return None def _get_local_declarations(self): return '' def add_known(self, names): '''Add a known name that should not be auto-declared. This is useful when we are declaring global constants for which we should not redeclare the types in the generated code. 
def convert(self, src, ignore_methods=None):
    """Translate Python source text and return the generated code.

    Parameters
    ----------

    src: str: Python source to translate.
    ignore_methods: optional list of method names that
        ``visit_FunctionDef`` should skip inside classes for this call.

    Returns
    -------

    Whatever ``self.visit`` returns for the parsed module.

    The source lines are kept in ``self._src`` so that ``error`` can quote
    them. ``self._ignore_methods`` is always reset to an empty list on the
    way out, also when parsing or translation raises, so that a failed
    conversion cannot affect the next one.
    """
    if ignore_methods is not None:
        self._ignore_methods = ignore_methods
    self._src = src.splitlines()
    try:
        return self.visit(ast.parse(src))
    finally:
        self._ignore_methods = []
NotImplementedError( "Atomics only supported by CUDA/OpenCL backends") def visit_LShift(self, node): return '<<' def visit_RShift(self, node): return '>>' def visit_BitOr(self, node): return '|' def visit_BitXor(self, node): return '^' def visit_BitAnd(self, node): return '&' def visit_Add(self, node): return '+' def visit_And(self, node): return '&&' def visit_Assign(self, node): if len(node.targets) != 1: self.error("Assignments can have only one target.", node) left, right = node.targets[0], node.value if isinstance(right, ast.Call) and \ isinstance(right.func, ast.Name) and right.func.id == 'declare': if not isinstance(right.args[0], ast.Str): self.error("Argument to declare should be a string.", node) type = right.args[0].s if isinstance(left, ast.Name): self._known.add(left.id) return self._get_variable_declaration(type, [self.visit(left)]) elif isinstance(left, ast.Tuple): names = [x.id for x in left.elts] self._known.update(names) return self._get_variable_declaration(type, names) return '%s = %s;' % (self.visit(left), self.visit(right)) def visit_Attribute(self, node): return '%s->%s' % (self.visit(node.value), node.attr) def visit_AugAssign(self, node): return '%s %s= %s;' % (self.visit(node.target), self.visit(node.op), self.visit(node.value)) def visit_BinOp(self, node): if isinstance(node.op, ast.Pow): return 'pow(%s, %s)' % ( self.visit(node.left), self.visit(node.right) ) else: result = tuple(self.visit(x) for x in (node.left, node.op, node.right)) return '(%s %s %s)' % result def visit_BoolOp(self, node): op = ' %s ' % self.visit(node.op) return '(%s)' % (op.join(self.visit(x) for x in node.values)) def visit_Break(self, node): return 'break;' def visit_Call(self, node): if isinstance(node.func, ast.Name): if node.func.id == 'address': return '(&%s)' % self.visit(node.args[0]) elif 'atomic' in node.func.id: return self.render_atomic(node.func.id, node.args[0]) elif node.func.id == 'cast': return '(%s) (%s)' % (node.args[1].s, 
self.visit(node.args[0])) else: return '{func}({args})'.format( func=node.func.id, args=', '.join(self.visit(x) for x in node.args) ) elif isinstance(node.func, ast.Attribute): if node.func.value.id in self._known_types: name = node.func.value.id cls = self._known_types[name].base_type args = [name] + [self.visit(x) for x in node.args] return '{func}({args})'.format( func='%s_%s' % (cls, node.func.attr), args=', '.join(args) ) elif len(self._class_name) > 0: args = ['self'] + [self.visit(x) for x in node.args] return '{func}({args})'.format( func='%s_%s' % (self._class_name, node.func.attr), args=', '.join(args) ) else: self.error('Unsupported function call', node) else: self.error('Unsupported function call', node) def visit_ClassDef(self, node): self._class_name = node.name # FIXME: Does not handle base class methods. code = [self.visit(x) for x in self._remove_docstring(node.body)] self._class_name = '' return '\n'.join(code) def visit_Compare(self, node): if len(node.ops) != 1 or len(node.comparators) != 1: self.error('Only simple comparisons are allowed.', node) return '(%s %s %s)' % (self.visit(node.left), self.visit(node.ops[0]), self.visit(node.comparators[0])) def visit_Continue(self, node): return 'continue;' def visit_Div(self, node): return '/' def visit_Eq(self, node): return '==' def visit_Expr(self, node): return self.visit(node.value) + ';' def _check_if_integer(self, s): try: int(ast.literal_eval(s)) except ValueError: return False else: return True def visit_For(self, node): if node.iter.func.id != 'range': self.error( 'Only for var in range syntax supported.', node.iter ) if node.orelse: self.error('For/else not supported.', node.orelse[0]) args = node.iter.args # If the stop or step elements are not numbers, then the semantics of a # for i in range can be very different from the translated C as in C, # one could change the stop or increment at each step. This is not # possible in Python. 
simple = True positive_step = True int_step = True int_stop = True if len(args) == 1: start, stop, incr = 0, self.visit(args[0]), 1 int_stop = simple = self._check_if_integer(stop) elif len(args) == 2: start, stop, incr = self.visit(args[0]), self.visit(args[1]), 1 int_stop = simple = self._check_if_integer(stop) elif len(args) == 3: start, stop, incr = [self.visit(x) for x in args] int_step = self._check_if_integer(incr) int_stop = self._check_if_integer(stop) simple = (int_stop and int_step) if int_step: positive_step = ast.literal_eval(incr) > 0 else: self.error('range should have either 1, 2, or 3 args', node.iter) local_scope = False if isinstance(node.target, ast.Name): if node.target.id not in self._known: target_type = 'long ' self._known.add(node.target.id) local_scope = True else: target_type = '' target = self.visit(node.target) if simple: comparator = '<' if positive_step else '>' r = ('for ({type}{i}={start}; {i}{comp}{stop}; {i}+={incr})' ' {{\n{block}\n}}\n').format( i=target, type=target_type, start=start, stop=stop, incr=incr, comp=comparator, block='\n'.join( self._indent_block(self.visit(x)) for x in node.body ) ) else: count = self._for_count self._for_count += 1 r = '' if not int_stop: stop_var = '__cpy_stop_{count}'.format(count=count) type = 'long ' if stop_var not in self._known else '' self._known.add(stop_var) if count > 0: self._added_loop_vars.add(stop_var) r += '{type}{stop_var} = {stop};\n'.format( type=type, stop_var=stop_var, stop=stop ) stop = stop_var if int_step: comparator = '<' if positive_step else '>' block = '\n'.join( self._indent_block(self.visit(x)) for x in node.body ) r += ('for ({type}{i}={start}; {i}{comp}{stop}; {i}+={incr})' ' {{\n{block}\n}}\n').format( i=target, type=target_type, start=start, stop=stop, incr=incr, comp=comparator, block=block ) else: step_var = '__cpy_step_{count}'.format(count=count) type = 'long ' if step_var not in self._known else '' self._known.add(step_var) if count > 0: 
self._added_loop_vars.add(step_var) r += '{type}{step_var} = {incr};\n'.format( type=type, step_var=step_var, incr=incr ) incr = step_var block = '\n'.join( self._indent_block(self.visit(x)) for x in node.body ) r += dedent('''\ if ({incr} < 0) {{ for ({type}{i}={start}; {i}>{stop}; {i}+={incr}) {{ {block} }} }} else {{ for ({type}{i}={start}; {i}<{stop}; {i}+={incr}) {{ {block} }} }} ''').format( i=target, type=target_type, start=start, stop=stop, incr=incr, block=block ) if count == 0: self._known -= self._added_loop_vars self._added_loop_vars = set() if local_scope: self._known.remove(node.target.id) return r def visit_FunctionDef(self, node): assert node.args.vararg is None, \ "Functions with varargs nor supported in line %d." % node.lineno assert node.args.kwarg is None, \ "Functions with kwargs not supported in line %d." % node.lineno if self._class_name and (node.name.startswith(('_', 'py_')) or node.name in self._ignore_methods): return '' self._for_count = 0 orig_declares = self._declares self._declares = {} if not self._declarations else self._declarations orig_known = set(self._known) if PY_VER == 2: self._known.update(x.id for x in node.args.args) else: self._known.update(x.arg for x in node.args.args) args = self._get_function_args(node) body = '\n'.join(self._indent_block(self.visit(item)) for item in self._remove_docstring(node.body)) local_decl = self._get_local_declarations() if len(self._class_name) > 0: func_name = self._class_name + '_' + node.name else: func_name = node.name return_type = self._get_return_type(body, node) sig = self.function_address_space + '{ret} {name}({args})'.format( ret=return_type, name=func_name, args=args ) declares = self._indent_block(self.get_declarations()) if len(declares) > 0: declares += '\n' sig = '\n'.join(wrap( sig, width=78, subsequent_indent=' ' * 4, break_long_words=False )) self._known = orig_known self._declares = orig_declares return sig + '\n{\n' + local_decl + declares + body + '\n}\n' def 
visit_Gt(self, node): return '>' def visit_GtE(self, node): return '>=' def visit_If(self, node): code = 'if ({cond}) {{\n{block}\n}}\n'.format( cond=self.visit(node.test), block='\n'.join( self._indent_block(self.visit(x)) for x in node.body ) ) if node.orelse: code += 'else {{\n{block}\n}}\n'.format( block='\n'.join( self._indent_block(self.visit(x)) for x in node.orelse ) ) return code def visit_IfExp(self, node): code = '{cond} ? {true} : {false}'.format( cond=self.visit(node.test), true=self.visit(node.body), false=self.visit(node.orelse) ) return code def visit_Is(self, node): return '==' def visit_IsNot(self, node): return '!=' def visit_Lt(self, node): return '<' def visit_LtE(self, node): return '<=' def visit_Mod(self, node): return '%' def visit_Module(self, node): return '\n'.join( self.visit(item) for item in node.body ) def visit_Mult(self, node): return '*' def visit_Name(self, node): assert isinstance(node.ctx, self._name_ctx) id = node.id if id in self._replacements: return self._replacements[id] if id not in self._declares and id not in self._known: self._declares[id] = 'double %s;' % id return id def visit_NameConstant(self, node): value = node.value if value in self._replacements: return self._replacements[value] else: return value def visit_Not(self, node): return '!' 
def ocl_detect_pointer_base_type(name, value):
    """Return the element type that a pointer argument points to.

    Parameters
    ----------

    name: str: the argument name.
    value: the value or ``KnownType`` associated with the argument.

    Returns
    -------

    For a ``KnownType`` its ``base_type`` when set; otherwise the ``type``
    string with the address-space qualifiers and the trailing ``*`` removed
    and surrounding whitespace stripped. For other values, names starting
    with ``s_``/``d_`` (except ``s_idx``/``d_idx``) give ``'double'``.

    Raises
    ------

    Exception: if the ``KnownType`` type string is not a pointer type.
    NotImplementedError: if the base type cannot be determined.
    """
    if isinstance(value, KnownType):
        if value.base_type:
            return value.base_type

        # A valid pointer type ends with a '*'. Exceptions like `int a[]`
        # are possible but such definitions are not generated by the
        # translator.
        pointer_type = value.type
        for qualifier in ('__global', 'GLOBAL_MEM', '__local', 'LOCAL_MEM'):
            pointer_type = pointer_type.replace(qualifier, '')
        # Strip after removing the qualifiers so the whitespace that
        # separated them from the type does not end up in the result.
        pointer_type = pointer_type.strip()
        if not pointer_type.endswith('*'):
            raise Exception("Invalid pointer type: %s" % value.type)
        return pointer_type[:-1].strip()
    elif name.startswith(('s_', 'd_')) and name not in ['s_idx', 'd_idx']:
        return 'double'
    else:
        raise NotImplementedError()
'GLOBAL_MEM double*' else: return detect_type(name, value) class OpenCLConverter(CConverter): def __init__(self, detect_type=ocl_detect_type, known_types=None): super(OpenCLConverter, self).__init__(detect_type, known_types) self.function_address_space = 'WITHIN_KERNEL ' self._known.update(( 'LID_0', 'LID_1', 'LID_2', 'GID_0', 'GID_1', 'GID_2', 'LDIM_0', 'LDIM_1', 'LDIM_2', 'GDIM_0', 'GDIM_1', 'GDIM_2' )) def _get_self_type(self): return KnownType('GLOBAL_MEM %s*' % self._class_name) def render_atomic(self, func, arg): if func == 'atomic_inc': return 'atomic_inc(&%s)' % self.visit(arg) elif func == 'atomic_dec': return 'atomic_dec(&%s)' % self.visit(arg) else: raise NotImplementedError( "Only atomic_inc, atomic_dec supported right now") class CUDAConverter(OpenCLConverter): def __init__(self, detect_type=ocl_detect_type, known_types=None): super(CUDAConverter, self).__init__(detect_type, known_types) self._local_decl = None def _get_local_arg(self, arg, type): return 'size_%s' % arg, 'int' def _get_local_info(self, obj): fname = obj.__name__ annotations = self._annotations[fname] local_info = {} for arg, kt in annotations.items(): if 'LOCAL_MEM' in kt.type: local_info[arg] = kt.base_type if local_info: return local_info return None def _get_local_declarations(self): local_decl = '' if self._local_decl: decls = ['extern LOCAL_MEM float shared_buff[];'] # Reference: # https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared for arg, dtype in self._local_decl.items(): if len(decls) == 1: local_decl = ('%(dtype)s* %(arg)s = ' '(%(dtype)s*) shared_buff;') local_decl = local_decl % {'dtype': dtype, 'arg': arg} decls.append(local_decl) prev_arg = arg else: local_decl = ('%(dtype)s* %(arg)s = (%(dtype)s*) ' '&%(prev_arg)s[size_%(prev_arg)s];') local_decl = local_decl % {'dtype': dtype, 'arg': arg, 'prev_arg': prev_arg} decls.append(local_decl) prev_arg = arg local_decl = self._indent_block('\n'.join(decls)) local_decl += '\n' return local_decl def 
render_atomic(self, func, arg): if func == 'atomic_inc': return 'atomicAdd(&%s, 1)' % self.visit(arg) elif func == 'atomic_dec': return 'atomicAdd(&%s, -1)' % self.visit(arg) else: raise NotImplementedError( "Only atomic_inc, atomic_dec supported right now") compyle-release-0.8.1/compyle/transpiler.py000066400000000000000000000231351414173670100210230ustar00rootroot00000000000000import importlib import math import re from textwrap import dedent from mako.template import Template from .config import get_config from .ast_utils import get_unknown_names_and_calls from .cython_generator import CythonGenerator, CodeGenerationError from .translator import OpenCLConverter, CUDAConverter from .ext_module import ExtModule from .extern import Extern, get_extern_code from .utils import getsourcelines BUILTINS = set( [x for x in dir(math) if not x.startswith('_')] + ['max', 'abs', 'min', 'range', 'declare', 'local_barrier', 'annotate', 'printf'] ) BUILTIN_SYMBOLS = set( 'LID_0 LID_1 LID_2 GID_0 GID_1 GID_2 LDIM_0 LDIM_1 LDIM_2 ' 'GDIM_0 GDIM_1 GDIM_2 ' 'M_E M_LOG2E M_LOG10E M_LN2 M_LN10 M_PI M_PI_2 M_PI_4 ' 'M_1_PI M_2_PI M_2_SQRTPI M_SQRT2 M_SQRT1_2 ' 'INFINITY NAN HUGE_VALF'.split() ) CY_BUILTIN_SYMBOLS = BUILTIN_SYMBOLS | set( ['HUGE_VAL', 'HUGE_VALL', 'e', 'pi'] ) OCL_BUILTIN_SYMBOLS = BUILTIN_SYMBOLS | set(['MAXFLOAT']) def filter_calls(calls): '''Given a set of calls filter out the math and other builtin functions. ''' return [x for x in calls if x not in BUILTINS] def get_external_symbols_and_calls(func, backend): '''Given a function, return a dictionary of all external names (with their values), a set of implicitly defined names, a list of functions that it calls ignoring standard math functions and a few other standard ones, and a list of Extern instances. If a function is not defined it will raise a ``NameError``. Parameters ---------- func: Function to look at. backend: str: The backend being used. 
class CodeBlock(object):
    """Pair an object with the code generated for it.

    A block compares equal to another block wrapping an equal object and
    also to the wrapped object itself, which is what lets
    ``obj in transpiler.blocks`` work.
    """

    def __init__(self, obj, code):
        self.obj = obj
        self.code = code

    def __eq__(self, other):
        # Unwrap the other side if it is a block; the code is not compared.
        target = other.obj if isinstance(other, CodeBlock) else other
        return self.obj == target
self.source = '' if backend == 'cython': self._cgen = CythonGenerator() self.header = dedent(''' # cython: language_level=3 from libc.stdio cimport printf from libc.math cimport * from libc.math cimport fabs as abs from cython.parallel import parallel, prange ''') if get_config().use_openmp: self.header += dedent(''' cimport openmp cdef openmp.omp_lock_t cy_lock openmp.omp_init_lock(&cy_lock) ''') elif backend == 'opencl': from pyopencl._cluda import CLUDA_PREAMBLE self._cgen = OpenCLConverter() cluda = '' if incl_cluda: cluda = Template(text=CLUDA_PREAMBLE).render( double_support=True ) self.header = cluda + dedent(''' #define max(x, y) fmax((double)(x), (double)(y)) #ifdef __APPLE__ #ifndef M_PI #define M_PI 3.14159265358979323846 #endif #endif ''') elif backend == 'cuda': from pycuda._cluda import CLUDA_PREAMBLE self._cgen = CUDAConverter() cluda = '' if incl_cluda: cluda = Template(text=CLUDA_PREAMBLE).render( double_support=True ) self.header = cluda + dedent(''' #define max(x, y) fmax((double)(x), (double)(y)) ''') def _handle_symbol(self, name, value): backend = self.backend value_type = type(value) if isinstance(value, int): if value > 2147483648: ctype = 'long' else: ctype = 'int' elif isinstance(value, float): ctype = 'double' elif isinstance(value, bool): ctype = 'bint' if backend == 'cython' else 'int' if backend == 'opencl' or backend == 'cuda': value = str(value).lower() else: msg = 'Unsupported type (%s) of variable "%s"' % ( value_type, name ) raise CodeGenerationError(msg) if self.backend == 'cython': return 'cdef {type} {name} = {value}'.format( type=ctype, name=name, value=value ) elif self.backend == 'opencl' or self.backend == 'cuda': return '#define {name} {value}'.format( name=name, value=value ) def _get_comment(self): return '#' if self.backend == 'cython' else '//' def _handle_symbols(self, syms): lines = [] comment = self._get_comment() if len(syms): hline = '{com} {line}'.format(com=comment, line='-' * 70) code = '{com} Global constants 
def add(self, obj, declarations=None):
    """Transpile ``obj`` and append the generated code as a new block.

    Parameters
    ----------

    obj: the function (or other object accepted by the code generator)
        to transpile.
    declarations: dict: optional mapping from an object's ``__name__``
        to the declarations handed to the code generator for it.
    """
    if obj in self.blocks:
        # NOTE(review): ``self.blocks`` holds CodeBlock instances, so this
        # relies on how CodeBlock compares with ``obj`` -- confirm.
        return

    # Emit externs and global constants and add the functions that
    # ``obj`` calls before ``obj`` itself.
    self._handle_external(obj, declarations=declarations)

    backend = self.backend
    if backend == 'cython':
        serial = getattr(obj, 'is_serial', False)
        self._cgen.parse(
            obj,
            declarations=(
                declarations.get(obj.__name__) if declarations else None
            ),
            is_serial=serial
        )
        generated = self._cgen.get_code()
    elif backend in ('opencl', 'cuda'):
        generated = self._cgen.parse(
            obj,
            declarations=(
                declarations.get(obj.__name__) if declarations else None
            )
        )

    self.blocks.append(CodeBlock(obj, generated))
def declare(type, num=1):
    """Return placeholder value(s) for a variable of the given type.

    In compiled code ``declare`` only declares a variable.  This is the
    pure Python counterpart, provided so that the very same code can also
    be executed by the Python interpreter.

    Parameters
    ----------

    type: str: String representing the type.
    num: int: the number of values to return

    Returns
    -------

    A single value when ``num`` is 1, otherwise a tuple with ``num``
    values.

    Examples
    --------

    >>> declare('int')
    0
    >>> declare('int', 3)
    (0, 0, 0)
    """
    if num != 1:
        return tuple(_declare(type) for _ in range(num))
    return _declare(type)
class KnownType(object):
    """Simple object to specify a known type as a string.

    Smells but is convenient as the type may be one available only inside
    Cython without a corresponding Python type.

    Two instances are equal when both their ``type`` and ``base_type``
    strings are equal; equal instances also hash equally so they can be
    used as dictionary keys or set members.
    """

    def __init__(self, type_str, base_type=''):
        """Constructor

        The ``base_type`` argument is optional and used to represent the
        base type, i.e. the type_str may be 'Foo*' but the base type will
        be 'Foo' if specified.

        Parameters
        ----------

        type_str: str: A string representation of how the type is declared.
        base_type: str: The base type of this entity. (optional)
        """
        self.type = type_str
        self.base_type = base_type

    def __repr__(self):
        if self.base_type:
            return 'KnownType("%s", "%s")' % (self.type, self.base_type)
        else:
            return 'KnownType("%s")' % self.type

    def __eq__(self, other):
        # Let Python fall back to its default handling (i.e. "not equal")
        # for foreign objects instead of failing with an AttributeError.
        if not isinstance(other, KnownType):
            return NotImplemented
        return self.type == other.type and self.base_type == other.base_type

    def __hash__(self):
        # Defining __eq__ alone makes a class unhashable on Python 3, so
        # hash the same fields that __eq__ compares.
        return hash((self.type, self.base_type))
# Mapping from numpy dtypes to the C type names used in generated code.
# ``np.bool_`` is used instead of the ``np.bool`` alias: the alias was
# deprecated in NumPy 1.20 and removed in 1.24, whereas ``np.bool_`` is
# available on every release and yields the very same dtype.
NP_C_TYPE_MAP = {
    np.dtype(np.bool_): 'char',
    np.dtype(np.float32): 'float',
    np.dtype(np.float64): 'double',
    np.dtype(np.int8): 'char',
    np.dtype(np.uint8): 'unsigned char',
    np.dtype(np.int16): 'short',
    np.dtype(np.uint16): 'unsigned short',
    np.dtype(np.int32): 'int',
    np.dtype(np.uint32): 'unsigned int',
    np.dtype(np.int64): 'long',
    np.dtype(np.uint64): 'unsigned long'
}

# Reverse mapping from C type names to numpy scalar types.
C_NP_TYPE_MAP = {
    'bool': np.bool_,
    'char': np.int8,
    'double': np.float64,
    'float': np.float32,
    'int': np.int32,
    'long': np.int64,
    'short': np.int16,
    'unsigned char': np.uint8,
    'unsigned int': np.uint32,
    'unsigned long': np.uint64,
    'unsigned short': np.uint16
}
def annotate(func=None, **kw):
    """Decorator that records the argument/return types of a function.

    The types end up in the function's ``__annotations__`` attribute.
    Either name each argument and give its type::

        @annotate(i='int', x='floatp', return_='float')
        def f(i, x):
            return x[i]*2.0

    or name a type and list the arguments having that type::

        @annotate(i='int', floatp='x, y', return_='float')
        def f(i, x, y):
            return x[i]*y[i]

    When no types are given at all the function is only flagged with
    ``is_jit = True``.
    """
    if kw:
        # Convert eagerly so that an unknown type is reported when the
        # decorator is created and not when it is applied.
        data = kwtype_to_annotation(kw)

        def wrapper(target):
            # A function that was already marked for JIT keeps its flag
            # when explicit annotations are layered on top of it.
            target.is_jit = getattr(target, 'is_jit', False)
            try:
                target.__annotations__ = data
            except AttributeError:
                target.im_func.__annotations__ = data
            return target
    else:
        def wrapper(target):
            target.is_jit = True
            return target

    # ``@annotate`` passes the function directly, ``@annotate(...)`` does
    # not and expects a decorator back.
    return wrapper if func is None else wrapper(func)
class ArgumentParser(argparse.ArgumentParser):
    '''Argument parser pre-populated with the standard compyle options.

    It adds the backend selection together with the openmp, use-double,
    suppress-warnings and profile switches, and copies the parsed values
    into the compyle configuration every time arguments are parsed.
    '''

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.add_argument(
            '-b', '--backend', action='store', dest='backend',
            default='cython', choices=['cython', 'opencl', 'cuda'],
            help='Choose the backend.'
        )
        # The remaining standard options are plain on/off switches, added
        # in this order so that the generated help lists them the same way.
        switches = (
            ('--openmp', 'openmp', 'Use OpenMP.'),
            ('--use-double', 'use_double',
             'Use double precision on the GPU.'),
            ('--suppress-warnings', 'suppress_warnings',
             'Suppress warnings'),
            ('--profile', 'profile', 'Print profiling info'),
        )
        for flag, dest, text in switches:
            self.add_argument(
                flag, action='store_true', dest=dest,
                default=False, help=text
            )

        # Guards against registering the atexit profile printer twice.
        self.profile_registered = False

    def _set_config_options(self, options):
        '''Copy the parsed ``options`` into the compyle configuration.
        '''
        cfg = get_config()
        cfg.use_openmp = options.openmp
        cfg.use_double = options.use_double
        cfg.suppress_warnings = options.suppress_warnings

        backend = options.backend
        if backend == 'opencl':
            cfg.use_opencl = True
        elif backend == 'cuda':
            cfg.use_cuda = True

        if options.profile and not self.profile_registered:
            cfg.profile = True
            # Print the collected profile when the interpreter exits.
            atexit.register(print_profile)
            self.profile_registered = True

    def parse_args(self, *args, **kwargs):
        '''Parse the arguments, update the configuration and return the
        parsed options.
        '''
        parsed = super().parse_args(*args, **kwargs)
        self._set_config_options(parsed)
        return parsed

    def parse_known_args(self, *args, **kwargs):
        '''Like ``parse_args`` but also return the unrecognised arguments.
        '''
        parsed, extra = super().parse_known_args(*args, **kwargs)
        self._set_config_options(parsed)
        return parsed, extra
%: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)compyle-release-0.8.1/docs/source/000077500000000000000000000000001414173670100170425ustar00rootroot00000000000000compyle-release-0.8.1/docs/source/conf.py000066400000000000000000000116441414173670100203470ustar00rootroot00000000000000#!/usr/bin/env python3 # -*- coding: utf-8 -*- # # Compyle documentation build configuration file, created by # sphinx-quickstart on Sun Dec 2 14:26:18 2018. # # This file is execfile()d with the current directory set to its # containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # import os from os.path import join # import sys # sys.path.insert(0, os.path.abspath('.')) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. # # needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = ['sphinx.ext.autodoc', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode'] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] source_suffix = '.rst' # The master toctree document. master_doc = 'index' # General information about the project. 
project = 'Compyle' copyright = '2018-2021, PySPH Developers' author = 'PySPH Developers' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # _d = {} fname = join(os.pardir, os.pardir, 'compyle', '__init__.py') exec(compile(open(fname).read(), fname, 'exec'), _d) version = release = _d['__version__'] # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path exclude_patterns = [] # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # # html_theme_options = {} # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". 
html_static_path = ['_static'] # -- Options for HTMLHelp output ------------------------------------------ # Output file base name for HTML help builder. htmlhelp_basename = 'Compyledoc' # -- Options for LaTeX output --------------------------------------------- latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', # Additional stuff for the LaTeX preamble. # # 'preamble': '', # Latex figure (float) alignment # # 'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ (master_doc, 'Compyle.tex', 'Compyle Documentation', 'PySPH Developers', 'manual'), ] # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ (master_doc, 'compyle', 'Compyle Documentation', [author], 1) ] # -- Options for Texinfo output ------------------------------------------- # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ (master_doc, 'Compyle', 'Compyle Documentation', author, 'Compyle', 'One line description of project.', 'Miscellaneous'), ] compyle-release-0.8.1/docs/source/details.rst000066400000000000000000000727261414173670100212370ustar00rootroot00000000000000Using Compyle ============== In this section we provide more details on the compyle package and how it can be used. An overview of functionality ----------------------------- The functionality provided falls into two broad categories, - Common parallel algorithms that will work across backends. This includes, elementwise operations, reductions, and prefix-sums/scans. 
- Specific support to run code on a particular backend. This is for code that will only work on one backend by definition. This is necessary in order to best use different hardware and also use differences in the particular backend implementations. For example, the notion of local (or shared) memory only has meaning on a GPGPU. In this category we provide support to compile and execute Cython code, and also create and execute a GPU kernel. In addition there is common functionality to perform type annotations. At a lower level, there are code translators (transpilers) that handle generation of Cython and C code from annotated Python code. Technically these transpilers can be reused by users to do other things but we only go over the higher level tools in this documentation. All the code is fairly extensively tested and developed using a test-driven approach. In fact, a good place to see examples are the tests. We now go into the details of each of these so as to provide a high-level overview of what can be done with Compyle. Annotating functions --------------------- The first step in getting started using Compyle is to annotate your functions and also declare variables in code. Annotation is provided by a simple decorator, called ``annotate``. One can declare local variables inside these functions using ``declare``. A simple example serves to illustrate these:: @annotate(i='int', x='floatp', return_='float') def f(i, x): return x[i]*2.0 @annotate(i='int', floatp='x, y', return_='float') def g(i, x, y): return f(i, x)*y[i] Note that for convenience ``annotate``, accepts types and variable names in two different ways, which you can use interchangeably. 1. You can simply use ``var_name=type_str``, or ``var_name=type`` where the type is from the ``compyle.types`` module. 2. You can instead use ``type_name='x, y, z'``, which is often very convenient. The order of the variables is not important and need not match with the order in which they are declared. 
You can use ``return_=type``, where ``type`` is an appropriate type or standard string representing one of the types. If the return type is not specified it assumes a ``void`` return. The definitions of the various standard types are in ``compyle.types.TYPES``. Some are listed below: - ``'float', 'double', 'int', 'long', 'uint', 'ulong'``: etc. are exactly as you would expect. - ``'doublep'`` would refer to a double pointer, i.e. ``double*`` and similarly for anything with a ``p`` at the end. - ``gdoublep`` would be a ``global doublep``, which makes sense with OpenCL where you would have ``__global double* xx``. The global address space specification is ignored when Cython code is generated, so this is safe to use with Cython code too. - ``ldoublep`` would be equivalent to ``__local double*`` in OpenCL, for local memory. Again this address space qualifier is ignored in Cython. All these types are available in the ``compyle.types`` module namespace also for your convenience. The ``int, float, long`` types are accessible as ``int_, float_, long_`` so as not to override the default Python types. For example the function ``f`` in the above could also have been declared like so:: from compyle.types import floatp, float_, int_ @annotate(i=int_, x=floatp, return_=float_) def f(i, x): return x[i]*2.0 One can also use custom types (albeit with care) by using the ``compyle.types.KnownType`` class. This is convenient in other scenarios where you could potentially pass instances/structs to a function. We will discuss this later but all of the basic types discussed above are instances of ``KnownType``. Compyle actually supports Python3 style annotations but only for the function arguments and NOT for the local variables. The only caveat is you must use the types in ``compyle.types``, i.e. you must use ``KnownType`` instances as the types for things to work. 
JIT transpilation ----------------- Compyle also supports just-in-time transpilation when annotations of a function are not provided. These functions are annotated at runtime when the call arguments are passed. The generated kernel and annotated functions are then cached with the types of the call arguments as key. Thus, the function ``f`` defined in the previous section can also be defined as follows:: @annotate def f(i, x): return x[i]*2.0 While using in-built functions such as ``sin``, ``cos``, ``abs`` etc. it is recommended that you store the value in a variable of appropriate type before returning it. If not, the return type will default to ``double``. For example,:: @annotate def f(i, x): return abs(x[i]) This will set the return type of function ``f`` to the default type, ``double`` even when ``x`` is an array of integers. To avoid this problem, one could define ``f`` instead as,:: @annotate def f(i, x): y = declare('int') y = abs(x[i]) return y Currently JIT support is limited to the common parallel algorithms explained in a later section. Declaring variables ------------------- In addition to annotating the function arguments and return types, it is important to be able to declare the local variables. We provide a simple ``declare`` function that lets us do this. Once again, a few examples serve to illustrate this:: i = declare('int') x = declare('float') u, v = declare('double', 2) Notice the last one where we passed an additional argument of the number of variables we want. This is really done to keep this functional in pure Python so that your code executes on Python also. In Cython these would produce:: cdef int i cdef float x cdef double u, v On OpenCL this would produce the equivalent:: int i; float x; double u, v; Technically one could also write:: f = declare('float4') but clearly this would only work on OpenCL, however, you can definitely declare other variables too! 
Note that in OpenCL/Cython code if you do not declare a variable, it is automatically declared as a ``double`` to prevent compilation errors. We often also require arrays; ``declare`` supports this as well. For example, consider the following:: r = declare('matrix(3)') a = declare('matrix((2, 2))') u, v = declare('matrix(2)', 2) This reduces to the following on OpenCL:: double r[3]; double a[2][2]; double u[2], v[2]; Note that this will only work with fixed sizes, and not with dynamic sizes. As we pointed out earlier, dynamic memory allocation is not allowed. Of course you could easily do this with Cython code but the declare syntax does not allow this. If you want non-double matrices, you can simply pass a type as in:: a = declare('matrix((2, 2), "int")') Which would result in:: int a[2][2]; As you can imagine, just being able to do this opens up quite a few possibilities. You could also do things like this:: xloc = declare('LOCAL_MEM matrix(128)') which will become in OpenCL:: LOCAL_MEM double xloc[128]; The ``LOCAL_MEM`` is a special variable that expands to the appropriate flag on OpenCL or CUDA to allow you to write kernels once and have them run on either OpenCL or CUDA. These special variables are discussed later below. Writing the functions ---------------------- All of basic Python is supported. As you may have seen in the examples, you can write code that uses the following: - Indexing (only positive indices please). - Conditionals with if/elif/else. - While loops. - For loops with the ``for var in range(...)`` construction. - Nested fors. - Ternary operators. This allows us to write most numerical code. Fancy slicing etc. are not supported, numpy based slicing and striding are not supported. You are supposed to write these out elementwise. The idea is to keep things simple. Yes, this may make things verbose but it does keep our life simple and eventually yours too. 
Do not create any Python data structures in the code unless you do not want to run the code on a GPU. No numpy arrays can be created; also avoid calling any numpy functions as these will NOT translate to any GPU code. You have to write what you need by hand. Having said that, all the basic math functions and symbols are automatically available. Essentially all of ``math`` is available. All of the ``math.h`` constants are also available for use. If you declare a global constant it will be automatically defined in the generated code. For example:: MY_CONST = 42 @annotate(x='double', return_='double') def f(x): return x + MY_CONST The ``MY_CONST`` will be automatically injected in your generated code. Now you may wonder about how you can call an external library that is not in ``math.h``. Let's say you have an external CUDA library, how do you call that? We have a simple approach for this, which we call an ``Extern`` and discuss later. .. _PyOpenCL: https://documen.tician.de/pyopencl/ .. _Cython: http://www.cython.org .. _PySPH: http://pysph.readthedocs.io Common parallel algorithms --------------------------- Compyle provides a few very powerful parallel algorithms. These are all directly motivated by Andreas Kloeckner's PyOpenCL_ package. On the GPU they are wrappers on top of the functionality provided there. These algorithms make it possible to implement scalable algorithms for a variety of common numerical problems. In PySPH_ for example all of the GPU based nearest neighbor finding algorithms are written with these fundamental primitives and scale very well. All of the following parallel algorithms allow choice of a suitable backend and take a keyword argument to specify this backend. If no backend is provided a default is chosen from the ``compyle.config`` module. You can get the global config using:: from compyle.config import get_config cfg = get_config() cfg.use_openmp = True cfg.use_opencl = True etc. 
The following are the parallel algorithms available from the ``compyle.parallel`` module. ``Elementwise`` ~~~~~~~~~~~~~~~ This is also available as a decorator ``elementwise``. One can pass it an annotated function and an optional backend. The elementwise processes every element in the second argument to the function. The elementwise basically passes the function an index of the element it is processing and parallelizes the calls to this automatically. If you are familiar with writing GPU kernels, this is the same thing except the index is passed along to you. Here is a very simple example that shows how this works for a case where we compute ``y = a*sin(x) + b`` where ``y, a, x, b`` are all numpy arrays but let us say we want to do this in parallel:: import numpy as np from compyle.api import annotate, Elementwise, get_config @annotate(i='int', doublep='x, y, a, b') def axpb(i, x, y, a, b): y[i] = a[i]*sin(x[i]) + b[i] # Setup the input data n = 1000000 x = np.linspace(0, 1, n) y = np.zeros_like(x) a = np.random.random(n) b = np.random.random(n) # Use OpenMP get_config().use_openmp = True # Now run this in parallel with Cython. backend = 'cython' e = Elementwise(axpb, backend=backend) e(x, y, a, b) This will call the ``axpb`` function in parallel and if your problem is large enough will effectively scale on all your cores. It's as simple as that. Now let us say we want to run this with OpenCL. The only issue with OpenCL is that the data needs to be sent to the GPU. This is transparently handled by a simple ``Array`` wrapper that handles this for us automatically. Here is a simple example building on the above:: from compyle.api import wrap backend = 'opencl' x, y, a, b = wrap(x, y, a, b, backend=backend) What this does is to wrap each of the arrays and also send the data to the device. ``x`` is now an instance of ``compyle.array.Array``; this simple class has two attributes, ``data`` and ``dev``. 
The first is the original data and the second is a suitable device array from PyOpenCL/PyCUDA depending on the backend. To get data from the device to the host you can call ``x.pull()``; to push data to the device you can call ``x.push()``. Now that we have this wrapped we can simply do:: e = Elementwise(axpb, backend=backend) e(x, y, a, b) We do not need to change any of our other code. As you can see this is very convenient. Here is all the code put together:: import numpy as np from compyle.api import annotate, Elementwise, get_config, wrap @annotate(i='int', doublep='x, y, a, b') def axpb(i, x, y, a, b): y[i] = a[i]*sin(x[i]) + b[i] # Setup the input data n = 1000000 x = np.linspace(0, 1, n) y = np.zeros_like(x) a = np.random.random(n) b = np.random.random(n) # Turn on OpenMP for Cython. get_config().use_openmp = True for backend in ('cython', 'opencl'): xa, ya, aa, ba = wrap(x, y, a, b, backend=backend) e = Elementwise(axpb, backend=backend) e(xa, ya, aa, ba) This will run the code on both backends! We use the for loop just to show that this will run on all backends! The ``axpb.py`` example shows this for a variety of array sizes and plots the performance. ``Reduction`` ~~~~~~~~~~~~~~~ The ``compyle.parallel`` module also provides a ``Reduction`` class which can be used fairly easily. Using it is a bit complex; a good starting point for this is the documentation of PyOpenCL_, here https://documen.tician.de/pyopencl/algorithm.html#module-pyopencl.reduction The difference from the PyOpenCL implementation is that the ``map_expr`` is a function rather than a string. We provide a couple of simple examples to illustrate the above. 
The first example is to find the sum of all elements of an array:: x = np.linspace(0, 1, 1000)/1000 x = wrap(x, backend=backend) r = Reduction('a+b', backend=backend) result = r(x) Here is an example of a function to find the minimum of an array:: x = np.linspace(0, 1, 1000)/1000 x = wrap(x, backend=backend) r = Reduction('min(a, b)', neutral='INFINITY', backend=backend) result = r(x) Here is a final one with a map expression thrown in:: from math import cos, sin x = np.linspace(0, 1, 1000)/1000 y = x.copy() x, y = wrap(x, y, backend=backend) @annotate(i='int', doublep='x, y') def map(i=0, x=[0.0], y=[0.0]): return cos(x[i])*sin(y[i]) r = Reduction('a+b', map_func=map, backend=backend) result = r(x, y) As you can see this is faithful to the PyOpenCL implementation with the only difference that the ``map_expr`` is actually a nice function. Further, this works on all backends, even on Cython. ``Scan`` ~~~~~~~~~~ Scans are generalizations of prefix sums / cumulative sums and can be used as building blocks to construct a number of parallel algorithms. These include but are not limited to sorting, polynomial evaluation, and tree operations. Blelloch's literature on prefix sums (`Prefix Sums and Their Applications <https://www.cs.cmu.edu/~guyb/papers/Ble93.pdf>`_) has many more examples and is a recommended read before using scans. The ``compyle.parallel`` module provides a ``Scan`` class which can be used to develop and execute such scans. The scans can be run on GPUs using the OpenCL or CUDA backend or on CPUs using either the OpenCL or Cython backend. The scan semantics in compyle are similar to those of the GenericScanKernel in PyOpenCL (https://documen.tician.de/pyopencl/algorithm.html#pyopencl.scan.GenericScanKernel). Similar to the case for reduction, the main difference from the PyOpenCL implementation is that the expressions (``input_expr``, ``segment_expr``, ``output_expr``) are all functions rather than strings. The following examples demonstrate how scans can be used in compyle. 
The first example is to find the cumulative sum of all elements of an array:: ary = np.arange(10000, dtype=np.int32) ary = wrap(ary, backend=backend) @annotate(i='int', ary='intp', return_='int') def input_expr(i, ary): return ary[i] @annotate(int='i, item', ary='intp') def output_expr(i, item, ary): ary[i] = item scan = Scan(input_expr, output_expr, 'a+b', dtype=np.int32, backend=backend) scan(ary=ary) ary.pull() # Result = ary.data Here is a more complex example of a function that finds the unique elements in an array:: ary = np.random.randint(0, 100, 1000, dtype=np.int32) unique_ary = np.zeros(len(ary.data), dtype=np.int32) unique_ary = wrap(unique_ary, backend=backend) unique_count = np.zeros(1, dtype=np.int32) unique_count = wrap(unique_count, backend=backend) ary = wrap(ary, backend=backend) @annotate(i='int', ary='intp', return_='int') def input_expr(i, ary): if i == 0 or ary[i] != ary[i - 1]: return 1 else: return 0 @annotate(int='i, prev_item, item, N', ary='intp', unique='intp', unique_count='intp') def output_expr(i, prev_item, item, N, ary, unique, unique_count): if item != prev_item: unique[item - 1] = ary[i] if i == N - 1: unique_count[0] = item scan = Scan(input_expr, output_expr, 'a+b', dtype=np.int32, backend=backend) scan(ary=ary, unique=unique_ary, unique_count=unique_count) unique_ary.pull() unique_count.pull() unique_count = unique_count.data[0] unique_ary = unique_ary.data[:unique_count] # Result = unique_ary The following points highlight some important details and quirks about using scans in compyle: 1. The scan call does not return anything. All output must be handled manually. Usually this involves writing the results available in ``output_expr`` (``prev_item``, ``item`` and ``last_item``) to an array. 2. ``input_expr`` might be evaluated multiple times. However, it can be assumed that ``input_expr`` for an element or index ``i`` is not evaluated again after the output expression ``output_expr`` for that element is evaluated. 
Therefore, it is safe to write the output of a scan back to an array also used for the input like in the first example. 3. (For PyOpenCL users) If a segmented scan is used, unlike PyOpenCL where the ``across_seg_boundary`` is used to handle the segment logic in the scan expression, in compyle the logic is handled automatically. More specifically, using ``a + b`` as the scan expression in compyle is equivalent to using ``(across_seg_boundary ? b : a + b)`` in PyOpenCL. Debugging ---------- Debugging can be a bit difficult with multiple different architectures and backends. One convenience that compyle provides is that the generated sources can be inspected. All the parallel algorithms (``Elementwise, Reduction, Scan``) provide a ``.source`` or ``.all_source`` attribute that contains the source. For example say you have the following:: e = Elementwise(axpb, backend=backend) e(x, y, a, b) You can examine the source generated for your functions using:: e.source This is probably most useful for end users. For those more curious, all of the source generated and used for the complete elementwise (or other) parallel algorithm can be seen using:: e.all_source This code can be rather long and difficult to read so use this only if you really need to see the underlying code from PyOpenCL or PyCUDA. On the GPU this will often include multiple kernels as well. Note that on CUDA the ``all_source`` does not show all of the sources as PyCUDA currently does not make it easy to inspect the code. Abstracting out arrays ----------------------- As discussed in the section on Elementwise operations, different backends need to do different things with arrays. With OpenCL/CUDA one needs to send the array to the device. This is transparently managed by the ``compyle.array.Array`` class. 
It is easiest to use this transparently with the ``wrap`` convenience function as below:: x = np.linspace(0, 1, 1000)/1000 y = x.copy() x, y = wrap(x, y, backend=backend) Thus, these new arrays can be passed to any operation and are handled transparently. Choice of backend and configuration ------------------------------------ The ``compyle.config`` module provides a simple ``Configuration`` class that is used internally in Compyle to set things like the backend (Cython, OpenCL/CUDA), and some common options like profiling, turning on OpenMP, using double on the GPU, etc. Here is an example of the various options:: from compyle.config import get_config cfg = get_config() cfg.use_double cfg.profile cfg.use_opencl cfg.use_openmp If one wants to temporarily set an option and perform an action, one can do:: from compyle.config import use_config with use_config(use_openmp=False): ... Here everything within the ``with`` clause will be executed using the specified option and once the clause is exited, the previous settings will be restored. This can be convenient. Templates ---------- When creating libraries, it is useful to be able to write a function as a "template" where the code can be generated depending on various user options. Compyle facilitates this by using Mako_ templates. We provide a convenient ``compyle.template.Template`` class which can be used for this purpose. A trivial and contrived example demonstrates its use below. The example sets any number of given arrays to a constant value:: from compyle.types import annotate from compyle.template import Template class SetConstant(Template): def __init__(self, name, arrays): super(SetConstant, self).__init__(name=name) self.arrays = arrays def my_func(self, value): '''The contents of this function are directly injected. 
''' tmp = sin(value) def extra_args(self): return self.arrays, {'doublep': ','.join(self.arrays)} @annotate(i='int', value='double') def template(self, i, value): '''Set the arrays to a constant value.''' ''' ${obj.inject(obj.my_func)} % for arr in obj.arrays: ${arr}[i] = tmp % endfor ''' set_const = SetConstant('set_const', ['x', 'y', 'z']).function print(set_const.source) This will print the following:: def set_const(i, value, x, y, z): """Set arrays to constant. """ tmp = sin(value) x[i] = tmp y[i] = tmp z[i] = tmp This is obviously a trivial example but the idea is that one can create fairly complex templated functions that can then be transpiled and used in different cases. The key point here is the ``template`` method which should simply create a string which is rendered using Mako_ and then put into a function. The ``extra_args`` method allows us to configure the arguments used by the function. The mako template can use the name ``obj`` which is ``self``. The ``obj.inject`` method allows one to literally inject any function into the body of the code with a suitable level of indentation. Of course normal mako functionality is available to do a variety of things. .. _Mako: https://www.makotemplates.org/ Low level functionality ----------------------- In addition to the above, there is also powerful low-level functionality provided in ``compyle.low_level``. ``Kernel`` ~~~~~~~~~~~ The ``Kernel`` class allows one to execute a pure GPU kernel. Unlike the Elementwise functionality above, this is specific to OpenCL/CUDA and will not execute via Cython. What this class lets one do is write low-level kernels which are often required to extract the best performance from your hardware. Most of the functionality is exactly the same: one declares functions and annotates them and then passes a function to the ``Kernel`` which calls this just as we would a normal OpenCL kernel for example. The major advantage is that all your code is pure Python. 
Here is a simple example:: from compyle.api import annotate, wrap, get_config from compyle.low_level import Kernel, LID_0, LDIM_0, GID_0 import numpy as np @annotate(x='doublep', y='doublep', double='a,b') def axpb(x, y, a, b): i = declare('int') i = LDIM_0*GID_0 + LID_0 y[i] = a*sin(x[i]) + b x = np.linspace(0, 1, 10000) y = np.zeros_like(x) a = 2.0 b = 3.0 get_config().use_opencl = True x, y = wrap(x, y) k = Kernel(axpb) k(x, y, a, b) This is the same Elementwise kernel equivalent from the first example at the top but written as a raw kernel. Notice that ``i`` is not passed but computed using ``LDIM_0, GID_0 and LID_0`` which are automatically made available on OpenCL/CUDA. In addition to these the function ``local_barrier`` is also available. Internally these are ``#defines`` that are like so on OpenCL:: #define LID_0 get_local_id(0) #define LID_1 get_local_id(1) #define LID_2 get_local_id(2) #define GID_0 get_group_id(0) #define GID_1 get_group_id(1) #define GID_2 get_group_id(2) #define LDIM_0 get_local_size(0) #define LDIM_1 get_local_size(1) #define LDIM_2 get_local_size(2) #define GDIM_0 get_num_groups(0) #define GDIM_1 get_num_groups(1) #define GDIM_2 get_num_groups(2) #define local_barrier() barrier(CLK_LOCAL_MEM_FENCE); On CUDA, these are mapped to the equivalent :: #define LID_0 threadIdx.x #define LID_1 threadIdx.y #define LID_2 threadIdx.z #define GID_0 blockIdx.x #define GID_1 blockIdx.y #define GID_2 blockIdx.z #define LDIM_0 blockDim.x #define LDIM_1 blockDim.y #define LDIM_2 blockDim.z #define GDIM_0 gridDim.x #define GDIM_1 gridDim.y #define GDIM_2 gridDim.z #define local_barrier() __syncthreads(); In fact these are all provided by the ``_cluda.py`` in PyOpenCL and PyCUDA. These allow us to write CUDA/OpenCL agnostic code from Python. 
One may also pass local memory to such a kernel; this trivial example demonstrates this:: from compyle.api import annotate, wrap, get_config from compyle.low_level import ( Kernel, LID_0, LDIM_0, GID_0, LocalMem, local_barrier ) import numpy as np @annotate(gdoublep='x', ldoublep='xl') def f(x, xl): i, thread_id = declare('int', 2) thread_id = LID_0 i = GID_0*LDIM_0 + thread_id xl[thread_id] = x[i] local_barrier() x = np.linspace(0, 1, 10000) get_config().use_opencl = True x = wrap(x) xl = LocalMem(1) k = Kernel(f) k(x, xl) This kernel does nothing useful and is just meant to demonstrate how one can allocate and use local memory. Note that here we "allocated" the local memory on the host and are passing it in to the Kernel. The local memory is allocated as ``LocalMem(1)``; this implicitly means allocate the required size in multiples of the size of the type and the work group size. Thus the allocated memory is ``work_group_size * sizeof(double) * 1``. This is convenient as very often the exact work group size is not known. A more complex and meaningful example is the ``vm_kernel.py`` example that is included with Compyle. ``Cython`` ~~~~~~~~~~~ Just like the ``Kernel`` we also have a ``Cython`` class to run pure Cython code. Here is an example of its usage:: from compyle.config import use_config from compyle.types import annotate from compyle.low_level import Cython, nogil, parallel, prange import numpy as np @annotate(n='int', doublep='x, y', a='double') def cy_ex(x, y, a, n): i = declare('int') with nogil, parallel(): for i in prange(n): y[i] = x[i]*a n = 1000 x = np.linspace(0, 1, n) y = np.zeros_like(x) a = 2.0 with use_config(use_openmp=True): cy = Cython(cy_ex) cy(x, y, a, n) If you look at the above code, we are effectively writing Cython code but compiling it and calling it in the last two lines. Note the use of the ``nogil, parallel`` and ``prange`` functions which are also provided in the ``low_level`` module. 
As you can see it is just as easy to write Cython code and have it execute in parallel. Externs ~~~~~~~ The ``nogil, parallel`` and ``prange`` functions we see in the previous section are examples of external functionality. Note that these have no straight-forward Python analog or implementation. They are implemented as Externs. This functionality allows us to link to external code opening up many interesting possibilities. Note that as far as Compyle is concerned, we need to know if a function needs to be wrapped or somehow injected. Externs offer us a way to cleanly inject external function definitions and use them. This is useful for example when you need to include an external CUDA library. Let us see how the ``prange`` extern is internally defined:: from compyle.extern import Extern class _prange(Extern): def link(self, backend): # We don't need to link to anything to get prange working. return [] def code(self, backend): if backend != 'cython': raise NotImplementedError('prange only available with Cython') return 'from cython.parallel import prange' def __call__(self, *args, **kw): # Ignore the kwargs. return range(*args) prange = _prange() The Extern object has two important methods, ``link`` and ``code``. The ``__call__`` interface is provided just so this can be executed with pure Python. The link returns a list of link args, these are currently ignored until we figure out a good test/example for this. The ``code`` method returns a suitable line of code inserted into the generated code. Note that in this case it just performs a suitable import. Thus, with this feature we are able to connect Compyle with other libraries. This functionality will probably evolve a little more as we gain more experience linking with other libraries. However, we have a clean mechanism for doing so already in-place. compyle-release-0.8.1/docs/source/index.rst000066400000000000000000000015321414173670100207040ustar00rootroot00000000000000.. 
Compyle documentation master file, created by sphinx-quickstart on Sun Dec 2 14:26:18 2018. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. Welcome to Compyle's documentation! =================================== Compyle allows users to execute a restricted subset of Python (almost similar to C) on a variety of HPC platforms. Currently we support multi-core CPU execution using Cython, and support GPU devices using OpenCL and CUDA. You can try Compyle online on a `Google Colab notebook `_. .. toctree:: :maxdepth: 2 :caption: Contents: overview.rst installation.rst details.rst Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` compyle-release-0.8.1/docs/source/installation.rst000066400000000000000000000166631414173670100223110ustar00rootroot00000000000000Installation ============== ComPyle is itself pure Python but depends on numpy_ and requires either Cython_ or PyOpenCL_ or PyCUDA_ along with the respective backends of a C/C++ compiler, OpenCL and CUDA. If you are only going to execute code on a CPU then all you need is Cython_. The full list of requirements is shown in the ``requirements.txt`` file on the repository. You should be able to install ComPyle by doing:: $ pip install compyle Note that when executing code on a CPU, you will need to have a C/C++ compiler that is compatible with your Python installation. In addition, if you need to use OpenMP you will need to make sure your compiler is compatible with that. Some additional information on this is included below. Installing the bleeding edge ---------------------------- Note that if you want the latest bleeding edge of compyle, clone the repository and install compyle like so:: $ git clone https://github.com/pypr/compyle $ cd compyle $ python setup.py develop # Or $ pip install -e . 
If you just want the latest version and do not want to clone the repository, you can also do:: $ pip install https://github.com/pypr/compyle/zipball/master .. _PyOpenCL: https://documen.tician.de/pyopencl/ .. _OpenCL: https://www.khronos.org/opencl/ .. _Cython: http://www.cython.org .. _numpy: http://www.numpy.org .. _PyCUDA: https://documen.tician.de/pycuda .. _OpenMP: http://openmp.org/ .. _CuPy: https://cupy.chainer.org/ Setting up on GNU/Linux ------------------------- This is usually very simple, just installing the standard gcc/g++ packages ought to work. OpenMP_ is typically available but if it is not, it can be installed with (on apt-compatible systems):: $ sudo apt-get install libgomp1 Installation with conda on MacOS --------------------------------- Recent conda_ packages make the process of setup very easy on MacOS assuming that you have the `XCode command line utilities`_ installed. Please make sure you install this. For example with conda-forge_ the following creates a new Python 3.8 environment with compyle installed and working with both OpenMP and OpenCL:: $ conda create -c conda-forge -n py38 python=3.8 numpy pyopencl $ conda activate py38 # or a suitable such invocation $ pip install compyle Note that the above implicitly installs the ``llvm-openmp`` package in the environment which works out of the box with clang and provides OpenMP support. .. _conda: https://docs.conda.io/ .. _conda-forge: https://conda-forge.org/ .. _XCode command line utilities: http://stackoverflow.com/questions/12228382/after-install-xcode-where-is-clang Possible issues on MacOS -------------------------- Ensure that you have gcc or clang installed by installing XCode. See installing `XCode command line utilities`_ if you installed XCode but can't find clang or gcc. 
If you are getting strange errors of the form:: clang: warning: libstdc++ is deprecated; move to libc++ with a minimum deployment target of OS X 10.9 [-Wdeprecated] ld: library not found for -lstdc++ clang: error: linker command failed with exit code 1 (use -v to see invocation) Then try this (on a bash shell):: $ export MACOSX_DEPLOYMENT_TARGET=10.9 And run your command again (replace the above with a suitable line on other shells). This is necessary because your Python was compiled with an older deployment target and the current version of XCode that you have installed is not compatible with that. By setting the environment variable you allow compyle to use a newer version. If this works, it is a good idea to set this in your default environment (``.bashrc`` for bash shells) so you do not have to do this every time. You may also do this in the compyle configuration file, see :ref:`config`. OpenMP on MacOS ~~~~~~~~~~~~~~~~ These instructions are a bit old and are only needed if you are not using conda as discussed above. The default clang compiler available on MacOS uses an LLVM backend and does not support OpenMP_ out of the box. There are two ways to support OpenMP. The first involves installing the OpenMP support for clang. This can be done with brew_ using:: $ brew install libomp Once that is done, it should "just work". If you get strange errors, try setting the ``MACOSX_DEPLOYMENT_TARGET`` as shown in the previous section. Another option is to install GCC for MacOS available on brew_ using :: $ brew install gcc Once this is done, you need to use this as your default compiler. The ``gcc`` formula on brew currently ships with gcc version 9. Therefore, you can tell Python to use the GCC installed by brew by setting:: $ export CC=gcc-9 $ export CXX=g++-9 Note that you still do need to have the command-line-tools for XCode installed, otherwise the important header files are not available. See `how-to-install-xcode-command-line-tools `_ for more details. 
You may also want to set these environment variables in your ``.bashrc`` so you don't have to do this every time. You may also do this in the compyle configuration file, see :ref:`config`. Once you do this, compyle will automatically use this version of GCC and will also work with OpenMP. Note that on some preliminary benchmarks, GCC's OpenMP implementation seems about 10% or so faster than the LLVM version. Your mileage may vary. .. _brew: http://brew.sh/ Setting up on Windows ---------------------- Windows will work but you need to make sure you have the right compiler installed. See this page for the details of what you need installed. https://wiki.python.org/moin/WindowsCompilers OpenMP will work if you have this installed. For recent Python versions (>=3.5), install the `Microsoft Build Tools for Visual Studio 2019 `_ Setting up OpenCL/CUDA ----------------------- This is too involved a topic to discuss here, instead look at the appropriate documentation for PyOpenCL_ and PyCUDA_. Once those packages work correctly, you should be all set. Note that if you are only using OpenCL/CUDA you do not need to have Cython or a C/C++ compiler. Some features on CUDA require the use of the CuPy_ library. If you want to use OpenCL support, you will need to install the ``pyopencl`` package (``conda install -c conda-forge pyopencl`` or ``pip install pyopencl``). For CUDA Support, you will need to install ``pycuda`` and ``cupy``. Of course this assumes you have the required hardware for this. .. _config: Using the configuration file ----------------------------- Instead of setting environment variables and build options on the shell you can have them setup using a simple configuration file. The file is located in ``~/.compyle/config.py``. Here ``~`` is your home directory which on Linux is ``/home/username``, on MacOS ``/Users/username`` and on Windows the location is likely ``\Users\username``. This file is executed and certain options may be set there. 
For example if you wish to set the environment variables ``CC`` and ``CXX`` you could do this in the ``config.py``:: import os os.environ['CC'] = 'gcc-9' os.environ['CXX'] = 'g++-9' If you are using an atypical compiler like icc, Cray, or PGI, you can set these up here too. You may also setup custom OpenMP related flags. For example, on a Cray system you may do the following:: OMP_CFLAGS = ['-homp'] OMP_LINK = ['-homp'] The ``OMP_CFLAGS`` and ``OMP_LINK`` parameters should be lists. Other packages like pyzoltan or pysph may also use this file for customizations. compyle-release-0.8.1/docs/source/overview.rst000066400000000000000000000202621414173670100214440ustar00rootroot00000000000000An overview ============== Compyle allows users to execute a restricted subset of Python (almost similar to C) on a variety of HPC platforms. Currently we support multi-core execution using Cython, and OpenCL and CUDA for GPU devices. An introduction to compyle in the context of writing a molecular dynamics simulator is available in our `SciPy 2020 paper`_. You may also `try Compyle`_ online on a Google Colab notebook if you wish. Users start with code implemented in a very restricted Python syntax, this code is then automatically transpiled, compiled and executed to run on either one CPU core, or multiple CPU cores or on a GPU. Compyle offers source-to-source transpilation, making it a very convenient tool for writing HPC libraries. Compyle is not a magic bullet, - Do not expect that you may get a tremendous speedup. - Performance optimization can be hard and is platform specific. What works on the CPU may not work on the GPU and vice-versa. Compyle does not do anything to make this aspect easier. All the issues with memory bandwidth, cache, false sharing etc. still remain. Differences between memory architectures of CPUs and GPUs are not avoided at all -- you still have to deal with it. But you can do so from the comfort of one simple programming language, Python. 
- Compyle makes it easy to write everything in pure Python and generate the platform specific code from Python. It provides a low-level tool to make it easy for you to generate whatever appropriate code. - The restrictions Compyle imposes make it easy for you to think about your algorithms in that context and thereby allow you to build functionality that exploits the hardware as you see fit. - Compyle hides the details of the backend to the extent possible. You can write your code in Python, you can reuse your functions and decompose your problem to maximize reuse. Traditionally you would end up implementing some code in C, some in Python, some in OpenCL/CUDA, some in string fragments that you put together. Then you'd have to manage each of the runtimes yourself, worry about compilation etc. Compyle minimizes that pain. - By being written in Python, we make it easy to assemble these building blocks together to do fairly sophisticated things relatively easily from the same language. - Compyle is fairly simple and does source translation making it generally easier to understand and debug. The core code-base is less than 7k lines of code. - Compyle has relatively simple dependencies, for CPU support it requires Cython_ and a C-compiler which supports OpenMP_. On the GPU you need either PyOpenCL_ or PyCUDA_. In addition it depends on NumPy_ and Mako_. .. _Cython: http://www.cython.org .. _OpenMP: http://openmp.org/ .. _PyOpenCL: https://documen.tician.de/pyopencl/ .. _PyCUDA: https://documen.tician.de/pycuda/ .. _OpenCL: https://www.khronos.org/opencl/ .. _NumPy: http://numpy.scipy.org .. _Mako: https://pypi.python.org/pypi/Mako .. _SciPy 2020 paper: http://conference.scipy.org/proceedings/scipy2020/compyle_pr_ab.html .. _try Compyle: https://colab.research.google.com/drive/1SGRiArYXV1LEkZtUeg9j0qQ21MDqQR2U?usp=sharing While Compyle is simple and modest, it is quite powerful and convenient. 
In fact, Compyle has its origins in PySPH_ which is a powerful Python package supporting SPH, molecular dynamics, and other particle-based algorithms. The basic elements of Compyle are used in PySPH_ to automatically generate HPC code from code written in pure Python and execute it on multiple cores, and on GPUs without the user having to change any of their code. Compyle generalizes this code generation to make it available as a general tool. .. _PySPH: http://pysph.readthedocs.io These are the restrictions on the Python language that Compyle imposes: - Functions with a C-syntax. - Function arguments must be declared using either type annotation or with a decorator or with default arguments. - No Python data structures, i.e. no lists, tuples, or dictionaries. - Contiguous Numpy arrays are supported but must be one dimensional. - No memory allocation is allowed inside these functions. - On OpenCL no recursion is supported. - All function calls must not use dotted names, i.e. don't use ``math.sin``, instead just use ``sin``. This is because we do not perform any kind of name mangling of the generated code to make it easier to read. Basically think of it as good old FORTRAN. Technically we do support structs internally (we use them heavily in PySPH_) but this is not yet exposed at the high level and is very likely to be supported in the future. Simple example -------------- Enough talk, let's look at some code. Here is a very simple example:: from compyle.api import Elementwise, annotate, wrap, get_config import numpy as np @annotate(i='int', x='doublep', y='doublep', double='a,b') def axpb(i, x, y, a, b): y[i] = a*sin(x[i]) + b x = np.linspace(0, 1, 10000) y = np.zeros_like(x) a = 2.0 b = 3.0 backend = 'cython' get_config().use_openmp = True x, y = wrap(x, y, backend=backend) e = Elementwise(axpb, backend=backend) e(x, y, a, b) This will execute the elementwise operation in parallel using OpenMP with Cython. 
The code is auto-generated, compiled and called for you transparently. The first time this runs, it will take a bit of time to compile everything but the next time, this is cached and will run much faster. If you just change the ``backend = 'opencl'``, the same exact code will be executed using PyOpenCL_ and if you change the backend to ``'cuda'``, it will execute via CUDA without any other changes to your code. This is obviously a very trivial example, there are more complex examples available as well. To see the source code that is automatically generated for the above elementwise operation example use:: e.source This will contain the sources that are generated based on the user code alone. To see all the sources created, use:: e.all_source A word of warning though that this can be fairly long especially on a GPU and for other kind of operations may actually include multiple GPU kernels. This is largely for reference and debugging. More examples -------------- More complex examples (but still fairly simple) are available in the `examples `_ directory. - `axpb.py `_: the above example but for openmp and opencl compared with serial showing that in some cases serial is actually faster than parallel! - `vm_elementwise.py `_: shows a simple N-body code with two-dimensional point vortices. The code uses a simple elementwise operation and works with OpenMP and OpenCL. - `vm_numba.py `_: shows the same code written in numba for comparison. In our benchmarks, Compyle is actually faster even in serial and in parallel it can be much faster when you use all cores. - `vm_kernel.py `_: shows how one can write a low-level OpenCL kernel in pure Python and use that. This also shows how you can allocate and use local (or shared) memory which is often very important for performance on GPGPUs. This code will only run via PyOpenCL. - `bench_vm.py `_: Benchmarks the various vortex method results above for a comparison with numba. Read on for more details about Compyle. 
Citing Compyle --------------- If you find Compyle useful or just want to read a paper on it, please see: - Aditya Bhosale and Prabhu Ramachandran, "Compyle: Python once, parallel computing anywhere", Proceedings of the 19th Python in Science Conference (SciPy 2020), July, 2020, Austin, Texas, USA. `doi:10.25080/Majora-342d178e-005 `_ **Won best poster** `SciPy 2020 Paper`_. Accompanying the paper is the - `Compyle poster presentation `_ - and the `Compyle poster video `_ compyle-release-0.8.1/examples/000077500000000000000000000000001414173670100164305ustar00rootroot00000000000000compyle-release-0.8.1/examples/axpb.py000066400000000000000000000032741414173670100177420ustar00rootroot00000000000000from compyle.api import Elementwise, annotate, wrap, get_config import numpy as np from numpy import sin import time @annotate(i='int', doublep='x, y, a, b') def axpb(i, x, y, a, b): y[i] = a[i]*sin(x[i]) + b[i] def setup(backend, openmp=False): get_config().use_openmp = openmp e = Elementwise(axpb, backend=backend) return e def data(n, backend): x = np.linspace(0, 1, n) y = np.zeros_like(x) a = x*x b = np.sqrt(x + 1) return wrap(x, y, a, b, backend=backend) def compare(m=20): N = 2**np.arange(1, 25) backends = [['cython', False], ['cython', True]] try: import pyopencl backends.append(['opencl', False]) except ImportError as e: pass try: import pycuda backends.append(['cuda', False]) except ImportError as e: pass timing = [] for backend in backends: e = setup(*backend) times = [] for n in N: args = data(n, backend[0]) t = [] for j in range(m): start = time.time() e(*args) secs = time.time() - start t.append(secs) times.append(np.average(t)) timing.append(times) return N, backends, np.array(timing) def plot_timing(n, timing, backends): from matplotlib import pyplot as plt backends[1][0] = 'openmp' for t, backend in zip(timing[1:], backends[1:]): plt.semilogx(n, timing[0]/t, label='serial/' + backend[0], marker='+') plt.grid() plt.xlabel('N') plt.ylabel('Speedup') 
plt.legend() plt.show() if __name__ == '__main__': n, backends, times = compare() plot_timing(n, times, backends) compyle-release-0.8.1/examples/axpb_jit.py000066400000000000000000000011021414173670100205740ustar00rootroot00000000000000"""Shows the use of annotate without any type information. The type information is extracted from the arguments passed and the function is annotated and compiled at runtime. """ from compyle.api import annotate, Elementwise, wrap, get_config, declare import numpy as np from numpy import sin @annotate def axpb(i, x, y, a, b): xi = x[i] y[i] = a * sin(xi) + b x = np.linspace(0, 1, 10000) y = np.zeros_like(x) a = 2.0 b = 3.0 backend = 'opencl' get_config().use_openmp = True x, y = wrap(x, y, backend=backend) e = Elementwise(axpb, backend=backend) e(x, y, a, b) compyle-release-0.8.1/examples/bench_vm.py000066400000000000000000000035411414173670100205660ustar00rootroot00000000000000import numpy as np import time from compyle.config import get_config import vm_numba as VN import vm_elementwise as VE import vm_kernel as VK def setup(mod, backend, openmp): get_config().use_openmp = openmp if mod == VE: e = VE.Elementwise(VE.velocity, backend) elif mod == VN: e = VN.velocity elif mod == VK: e = VK.Kernel(VK.velocity, backend) return e def data(n, mod, backend): if mod == VN: args = mod.make_vortices(n) else: args = mod.make_vortices(n, backend) return args def compare(m=5): # Warm up the jit to prevent the timing from going off for the first point. 
VN.velocity(*VN.make_vortices(100)) N = np.array([10, 50, 100, 200, 500, 1000, 2000, 4000, 6000, 8000, 10000, 12000]) backends = [(VN, '', False), (VE, 'cython', False), (VE, 'cython', True), (VE, 'opencl', False), (VK, 'opencl', False)] timing = [] for backend in backends: e = setup(*backend) times = [] for n in N: args = data(n, backend[0], backend[1]) t = [] for j in range(m): start = time.time() e(*args) t.append(time.time() - start) times.append(np.average(t)) timing.append(times) return N, np.array(timing) def plot_timing(n, timing): from matplotlib import pyplot as plt plt.plot(n, timing[0]/timing[1], label='numba/cython', marker='+') plt.plot(n, timing[0]/timing[2], label='numba/openmp', marker='+') plt.plot(n, timing[0]/timing[3], label='numba/opencl', marker='+') plt.plot(n, timing[0]/timing[4], label='numba/opencl local', marker='+') plt.grid() plt.xlabel('N') plt.ylabel('Speedup') plt.legend() plt.show() if __name__ == '__main__': n, t = compare() plot_timing(n, t) compyle-release-0.8.1/examples/julia_set.py000066400000000000000000000051621414173670100207650ustar00rootroot00000000000000import time from math import cos, sin import numpy as np from compyle.api import annotate, elementwise, get_config, wrap @annotate def julia(i, z, xa, ya, t): c0 = 0.7885*cos(t) c1 = 0.7885*sin(t) x = xa[i] y = ya[i] iters = 0 while (x*x + y*y) < 400 and iters < 50: xn = x*x - y*y + c0 y = x*y*2.0 + c1 x = xn iters += 1 z[i] = 1.0 - iters*0.02 def timer(x, y, z): s = time.perf_counter() n = 2000 dt = 4*np.pi/n for i in range(n): julia(z, x, y, -dt*i) print("Took", time.perf_counter() - s, "seconds") def plot(x, y, z, nx, ny): from mayavi import mlab mlab.figure(size=(600, 600)) xmin, xmax = np.min(x.data), np.max(x.data) ymin, ymax = np.min(y.data), np.max(y.data) s = mlab.imshow(z.data.reshape((nx, ny)), extent=[xmin, xmax, ymin, ymax, 0, 0], colormap='jet') s.scene.z_plus_view() n = 2000 dt = 4*np.pi/n for i in range(n): julia(z, x, y, -dt*i) z.pull() 
s.mlab_source.scalars = z.data.reshape((nx, ny)) if i % 3 == 0: mlab.process_ui_events() mlab.show() def save(x, y, z, gif_path='julia_set.gif'): import imageio as iio n = 250 dt = 2*np.pi/n print(f"Writing {gif_path}") with iio.get_writer(gif_path, mode='I') as writer: for i in range(n): julia(z, x, y, -dt*i) z.pull() writer.append_data( (z.data.reshape((nx, ny))*255).astype(np.uint8) ) print(f"{i}/{n}", end='\r') print("Done. ") try: from pygifsicle import optimize optimize(gif_path) except ImportError: print("Install pygifsicle for an optimized GIF") if __name__ == '__main__': from compyle.utils import ArgumentParser p = ArgumentParser() p.add_argument('-n', action='store', type=int, dest='n', default=512, help='Number of grid points in y.') p.add_argument( '--show', action='store_true', dest='show', default=False, help='Show animation (requires mayavi)' ) p.add_argument( '--gif', action='store_true', default=False, help='Make a gif animation (requires imageio)' ) cfg = get_config() cfg.suppress_warnings = True o = p.parse_args() julia = elementwise(julia) ny = o.n nx = int(4*ny//3) x, y = np.mgrid[-2:2:nx*1j, -1.5:1.5:ny*1j] x, y = x.ravel(), y.ravel() z = np.zeros_like(x) x, y, z = wrap(x, y, z) timer(x, y, z) if o.show: plot(x, y, z, nx, ny) if o.gif: save(x, y, z) compyle-release-0.8.1/examples/laplace.py000066400000000000000000000071221414173670100204050ustar00rootroot00000000000000import numpy as np from math import pi import time from compyle.config import get_config from compyle.api import declare, annotate from compyle.parallel import Elementwise from compyle.array import get_backend, wrap from compyle.low_level import cast import compyle.array as carr def bc(x, y): return np.sin(np.pi * (x + y)) @annotate def laplace_step(i, u, res, err, nx, ny, dx2, dy2, dnr_inv): xid = cast(i % nx, "int") yid = cast(i / nx, "int") if xid == 0 or xid == nx - 1 or yid == 0 or yid == ny - 1: return res[i] = ((u[i - 1] + u[i + 1]) * dx2 + (u[i - nx] + u[i + nx]) * dy2) * 
dnr_inv diff = res[i] - u[i] err[i] = diff * diff class Grid(object): def __init__(self, nx=10, ny=10, xmin=0., xmax=1., ymin=0., ymax=1., bc=lambda x: 0, backend=None): self.backend = get_backend(backend) self.xmin, self.xmax, self.ymin, self.ymax = xmin, xmax, ymin, ymax self.nx, self.ny = nx, ny self.dx = (xmax - xmin) / (nx - 1) self.dy = (ymax - ymin) / (ny - 1) self.x = np.arange(self.xmin, self.xmax + self.dx * 0.5, self.dx) self.y = np.arange(self.ymin, self.ymax + self.dy * 0.5, self.dy) self.bc = bc self.setup() def setup(self): u_host = np.zeros((self.nx, self.ny)).astype(np.float32) u_host[0, :] = self.bc(self.xmin, self.y) u_host[-1, :] = self.bc(self.xmax, self.y) u_host[:, 0] = self.bc(self.x, self.ymin) u_host[:, -1] = self.bc(self.x, self.ymax) self.u = wrap(u_host.flatten(), backend=self.backend) self.err = carr.zeros_like(self.u) def get(self): u_host = self.u.get() return np.resize(u_host, (self.nx, self.ny)) def compute_err(self): return np.sqrt(carr.dot(self.err, self.err)) def plot(self): import matplotlib.pyplot as plt plt.imshow(self.get()) plt.show() class LaplaceSolver(object): def __init__(self, grid, backend=None): self.grid = grid self.backend = get_backend(backend) self.step_method = Elementwise(laplace_step, backend=self.backend) self.res = self.grid.u.copy() def solve(self, max_iter=None, eps=1.0e-8): err = np.inf g = self.grid dx2 = g.dx ** 2 dy2 = g.dy ** 2 dnr_inv = 0.5 / (dx2 + dy2) count = 0 while err > eps: if max_iter and count >= max_iter: return err, count self.step_method(g.u, self.res, g.err, g.nx, g.ny, dx2, dy2, dnr_inv) err = g.compute_err() tmp = g.u g.u = self.res self.res = tmp count += 1 return err, count if __name__ == '__main__': from compyle.utils import ArgumentParser p = ArgumentParser() p.add_argument('--nx', action='store', type=int, dest='nx', default=100, help='Number of grid points in x.') p.add_argument('--ny', action='store', type=int, dest='ny', default=100, help='Number of grid points in y.') 
p.add_argument( '--show', action='store_true', dest='show', default=False, help='Show plot at the end of simulation' ) o = p.parse_args() grid = Grid(nx=o.nx, ny=o.ny, bc=bc, backend=o.backend) solver = LaplaceSolver(grid, backend=o.backend) start = time.time() err, count = solver.solve(eps=1e-6) end = time.time() print("Number of iterations = %s" % count) print("Time taken = %g secs" % (end - start)) if o.show: solver.grid.plot() compyle-release-0.8.1/examples/molecular_dynamics/000077500000000000000000000000001414173670100223025ustar00rootroot00000000000000compyle-release-0.8.1/examples/molecular_dynamics/3D/000077500000000000000000000000001414173670100225505ustar00rootroot00000000000000compyle-release-0.8.1/examples/molecular_dynamics/3D/compare_results.py000066400000000000000000000023011414173670100263250ustar00rootroot00000000000000from hoomd_periodic import simulate from md_nnps_periodic import MDNNPSSolverPeriodic import numpy as np import matplotlib.pyplot as plt def run_simulations(num_particles, tf, dt): # run hoomd simulation simulate(num_particles, dt, tf, log=True) # run compyle simulation solver = MDNNPSSolverPeriodic(num_particles) solver.solve(tf, dt, log_output=True) solver.write_log('compyle-output.log') def plot_props(hoomd_fname, comp_fname): data_hoomd = np.genfromtxt(fname=hoomd_fname, skip_header=True) data_compyle = np.genfromtxt(fname=comp_fname) plt.plot(data_hoomd[:,0], data_hoomd[:,1], label="HooMD") plt.plot(data_hoomd[:,0], data_compyle[:,1], label="Compyle") plt.xlabel("Timestep") plt.ylabel("Potential Energy") plt.legend() plt.savefig("hoomd_pe.png", dpi=300) plt.clf() plt.plot(data_hoomd[:,0], data_hoomd[:,2], label="HooMD") plt.plot(data_hoomd[:,0], data_compyle[:,2], label="Compyle") plt.xlabel("Timestep") plt.ylabel("Kinetic Energy") plt.legend() plt.savefig("hoomd_ke.png", dpi=300) if __name__ == '__main__': run_simulations(2000, 200, 0.02) plot_props('hoomd-output.log', 'compyle-output.log') 
compyle-release-0.8.1/examples/molecular_dynamics/3D/hoomd_periodic.py000066400000000000000000000041341414173670100261100ustar00rootroot00000000000000import hoomd import hoomd.md import numpy as np import time def setup_positions(num_particles, dx): ndim = np.ceil(num_particles ** (1 / 3.)) dim_length = ndim * dx xmax = 3 * (1 + round(dim_length * 1.5 / 3.)) ymax = 3 * (1 + round(dim_length * 1.5 / 3.)) zmax = 3 * (1 + round(dim_length * 1.5 / 3.)) print(dim_length, xmax) xmin_eff = (xmax - dim_length) / 2. xmax_eff = (xmax + dim_length) / 2. x, y, z = np.mgrid[xmin_eff:xmax_eff:dx, xmin_eff:xmax_eff:dx, xmin_eff:xmax_eff:dx] x = x.ravel().astype(np.float32)[:num_particles] y = y.ravel().astype(np.float32)[:num_particles] z = z.ravel().astype(np.float32)[:num_particles] return x, y, z, xmax def simulate(num_particles, dt, tf, profile=False, log=False): x, y, z, L = setup_positions(num_particles, 2.) positions = np.array((x, y, z)).T hoomd.context.initialize("") snapshot = hoomd.data.make_snapshot(N=len(positions), box=hoomd.data.boxdim( Lx=L, Ly=L, Lz=L), particle_types=['A'], ) # need to get automated positions... 
snapshot.particles.position[:] = positions - 0.5 * L snapshot.particles.typeid[:] = 0 hoomd.init.read_snapshot(snapshot) nl = hoomd.md.nlist.cell(r_buff=0) lj = hoomd.md.pair.lj(r_cut=3.0, nlist=nl) lj.pair_coeff.set('A', 'A', epsilon=1.0, sigma=1.0) if log: hoomd.analyze.log(filename="hoomd-output.log", quantities=['potential_energy', 'kinetic_energy'], period=100, overwrite=True) # Create integrator and forces hoomd.md.integrate.mode_standard(dt=dt) hoomd.md.integrate.nve(group=hoomd.group.all()) nsteps = int(tf // dt) start = time.time() hoomd.run(nsteps, profile=profile) end = time.time() return end - start if __name__ == '__main__': import sys print(simulate(int(sys.argv[1]), 0.02, 200., profile=True, log=True)) compyle-release-0.8.1/examples/molecular_dynamics/3D/md_nnps.py000066400000000000000000000121721414173670100245630ustar00rootroot00000000000000import numpy as np from math import pi import time from compyle.config import get_config from compyle.api import declare, annotate from compyle.parallel import Elementwise, Reduction from compyle.array import get_backend, wrap import compyle.array as carr from nnps import NNPSCountingSort, NNPSRadixSort from md_simple import integrate_step1, integrate_step2, \ boundary_condition, MDSolverBase @annotate def calculate_force(i, x, y, z, fx, fy, fz, pe, nbr_starts, nbr_lengths, nbrs): start_idx = nbr_starts[i] length = nbr_lengths[i] for k in range(start_idx, start_idx + length): j = nbrs[k] if i == j: continue xij = x[i] - x[j] yij = y[i] - y[j] zij = z[i] - z[j] rij2 = xij * xij + yij * yij + zij * zij irij2 = 1.0 / rij2 irij6 = irij2 * irij2 * irij2 irij12 = irij6 * irij6 pe[i] += (2 * (irij12 - irij6)) f_base = 24 * irij2 * (2 * irij12 - irij6) fx[i] += f_base * xij fy[i] += f_base * yij fz[i] += f_base * zij @annotate def step_method1(i, x, y, z, vx, vy, vz, fx, fy, fz, pe, xmin, xmax, ymin, ymax, zmin, zmax, m, dt, nbr_starts, nbr_lengths, nbrs): integrate_step1(i, m, dt, x, y, z, vx, vy, vz, fx, fy, fz) 
boundary_condition(i, x, y, z, vx, vy, vz, fx, fy, fz, pe, xmin, xmax, ymin, ymax, zmin, zmax) @annotate def step_method2(i, x, y, z, vx, vy, vz, fx, fy, fz, pe, xmin, xmax, ymin, ymax, zmin, zmax, m, dt, nbr_starts, nbr_lengths, nbrs): calculate_force(i, x, y, z, fx, fy, fz, pe, nbr_starts, nbr_lengths, nbrs) integrate_step2(i, m, dt, x, y, z, vx, vy, vz, fx, fy, fz) class MDNNPSSolver(MDSolverBase): def __init__(self, num_particles, x=None, y=None, z=None, vx=None, vy=None, vz=None, xmax=100., ymax=100., zmax=100., dx=2., init_T=0., backend=None, use_count_sort=False): super().__init__(num_particles, x=x, y=y, z=z, vx=vx, vy=vy, vz=vz, xmax=xmax, ymax=ymax, zmax=zmax, dx=dx, init_T=init_T, backend=backend) self.nnps_algorithm = NNPSCountingSort \ if use_count_sort else NNPSRadixSort self.nnps = self.nnps_algorithm(self.x, self.y, self.z, 3., 0.01, self.xmax, self.ymax, self.zmax, backend=self.backend) self.init_forces = Elementwise(calculate_force, backend=self.backend) self.step1 = Elementwise(step_method1, backend=self.backend) self.step2 = Elementwise(step_method2, backend=self.backend) def solve(self, t, dt, log_output=False): num_steps = int(t // dt) self.nnps.build() self.nnps.get_neighbors() self.init_forces(self.x, self.y, self.z, self.fx, self.fy, self.fz, self.pe, self.nnps.nbr_starts, self.nnps.nbr_lengths, self.nnps.nbrs) for i in range(num_steps): self.step1(self.x, self.y, self.z, self.vx, self.vy, self.vz, self.fx, self.fy, self.fz, self.pe, self.xmin, self.xmax, self.ymin, self.ymax, self.zmin, self.zmax, self.m, dt, self.nnps.nbr_starts, self.nnps.nbr_lengths, self.nnps.nbrs) self.nnps.build() self.nnps.get_neighbors() self.step2(self.x, self.y, self.z, self.vx, self.vy, self.vz, self.fx, self.fy, self.fz, self.pe, self.xmin, self.xmax, self.ymin, self.ymax, self.zmin, self.zmax, self.m, dt, self.nnps.nbr_starts, self.nnps.nbr_lengths, self.nnps.nbrs) if i % 100 == 0: self.post_step(i, log_output=log_output) if __name__ == '__main__': from 
compyle.utils import ArgumentParser p = ArgumentParser() p.add_argument( '--use-count-sort', action='store_true', dest='use_count_sort', default=False, help='Use count sort instead of radix sort' ) p.add_argument( '--show', action='store_true', dest='show', default=False, help='Show plot' ) p.add_argument( '--log-output', action='store_true', dest='log_output', default=False, help='Log output' ) p.add_argument('-n', action='store', type=int, dest='n', default=100, help='Number of particles') p.add_argument('--tf', action='store', type=float, dest='t', default=40., help='Final time') p.add_argument('--dt', action='store', type=float, dest='dt', default=0.02, help='Time step') o = p.parse_args() solver = MDNNPSSolver( o.n, backend=o.backend, use_count_sort=o.use_count_sort) start = time.time() solver.solve(o.t, o.dt, log_output=o.log_output) end = time.time() print("Time taken for N = %i is %g secs" % (o.n, (end - start))) if o.log_output: solver.write_log('nnps_log.log') if o.show: solver.pull() solver.plot() compyle-release-0.8.1/examples/molecular_dynamics/3D/md_nnps_periodic.py000066400000000000000000000141661414173670100264460ustar00rootroot00000000000000import numpy as np from math import pi import time from compyle.config import get_config from compyle.api import declare, annotate from compyle.parallel import Elementwise, Reduction from compyle.array import get_backend, wrap from compyle.low_level import cast import compyle.array as carr from nnps import NNPSCountingSortPeriodic, NNPSRadixSortPeriodic from md_simple import integrate_step1, integrate_step2, MDSolverBase @annotate def calculate_force(i, x, y, z, xmax, ymax, zmax, fx, fy, fz, pe, nbr_starts, nbr_lengths, nbrs): start_idx = nbr_starts[i] length = nbr_lengths[i] halfx = 0.5 * xmax halfy = 0.5 * ymax halfz = 0.5 * zmax for k in range(start_idx, start_idx + length): j = nbrs[k] if i == j: continue xij = x[i] - x[j] yij = y[i] - y[j] zij = z[i] - z[j] signx = 1 if xij > 0 else -1 signy = 1 if yij > 0 
else -1 signz = 1 if zij > 0 else -1 xij = xij if abs(xij) < halfx else xij - signx * xmax yij = yij if abs(yij) < halfy else yij - signy * ymax zij = zij if abs(zij) < halfz else zij - signz * zmax rij2 = xij * xij + yij * yij + zij * zij irij2 = 1.0 / rij2 irij6 = irij2 * irij2 * irij2 irij12 = irij6 * irij6 pe[i] += (2 * (irij12 - irij6)) f_base = 24 * irij2 * (2 * irij12 - irij6) fx[i] += f_base * xij fy[i] += f_base * yij fz[i] += f_base * zij @annotate def step_method1(i, x, y, z, vx, vy, vz, fx, fy, fz, pe, xmin, xmax, ymin, ymax, zmin, zmax, m, dt, nbr_starts, nbr_lengths, nbrs): integrate_step1(i, m, dt, x, y, z, vx, vy, vz, fx, fy, fz) boundary_condition(i, x, y, z, fx, fy, fz, pe, xmin, xmax, ymin, ymax, zmin, zmax) @annotate def step_method2(i, x, y, z, vx, vy, vz, fx, fy, fz, pe, xmin, xmax, ymin, ymax, zmin, zmax, m, dt, nbr_starts, nbr_lengths, nbrs): calculate_force(i, x, y, z, xmax, ymax, zmax, fx, fy, fz, pe, nbr_starts, nbr_lengths, nbrs) integrate_step2(i, m, dt, x, y, z, vx, vy, vz, fx, fy, fz) @annotate def boundary_condition(i, x, y, z, fx, fy, fz, pe, xmin, xmax, ymin, ymax, zmin, zmax): fx[i] = 0. fy[i] = 0. fz[i] = 0. pe[i] = 0. 
xwidth = xmax - xmin ywidth = ymax - ymin zwidth = zmax - zmin xoffset = cast(floor(x[i] / xmax), "int") yoffset = cast(floor(y[i] / ymax), "int") zoffset = cast(floor(z[i] / zmax), "int") x[i] -= xoffset * xwidth y[i] -= yoffset * ywidth z[i] -= zoffset * zwidth class MDNNPSSolverPeriodic(MDSolverBase): def __init__(self, num_particles, x=None, y=None, z=None, vx=None, vy=None, vz=None, xmax=100., ymax=100., zmax=100., dx=2., init_T=0., backend=None, use_count_sort=False): super().__init__(num_particles, x=x, y=y, z=z, vx=vx, vy=vy, vz=vz, xmax=xmax, ymax=ymax, zmax=zmax, dx=dx, init_T=init_T, backend=backend) self.nnps_algorithm = NNPSCountingSortPeriodic \ if use_count_sort else NNPSRadixSortPeriodic self.nnps = self.nnps_algorithm(self.x, self.y, self.z, 3., 0.01, self.xmax, self.ymax, self.zmax, backend=self.backend) self.init_forces = Elementwise(calculate_force, backend=self.backend) self.step1 = Elementwise(step_method1, backend=self.backend) self.step2 = Elementwise(step_method2, backend=self.backend) def solve(self, t, dt, log_output=False): num_steps = int(t // dt) self.nnps.build() self.nnps.get_neighbors() self.init_forces(self.x, self.y, self.z, self.xmax, self.ymax, self.zmax, self.fx, self.fy, self.fz, self.pe, self.nnps.nbr_starts, self.nnps.nbr_lengths, self.nnps.nbrs) for i in range(num_steps): self.step1(self.x, self.y, self.z, self.vx, self.vy, self.vz, self.fx, self.fy, self.fz, self.pe, self.xmin, self.xmax, self.ymin, self.ymax, self.zmin, self.zmax, self.m, dt, self.nnps.nbr_starts, self.nnps.nbr_lengths, self.nnps.nbrs) self.nnps.build() self.nnps.get_neighbors() self.step2(self.x, self.y, self.z, self.vx, self.vy, self.vz, self.fx, self.fy, self.fz, self.pe, self.xmin, self.xmax, self.ymin, self.ymax, self.zmin, self.zmax, self.m, dt, self.nnps.nbr_starts, self.nnps.nbr_lengths, self.nnps.nbrs) if i % 100 == 0: self.post_step(i, log_output=log_output) if __name__ == '__main__': from compyle.utils import ArgumentParser p = ArgumentParser() 
p.add_argument( '--use-count-sort', action='store_true', dest='use_count_sort', default=False, help='Use count sort instead of radix sort' ) p.add_argument( '--show', action='store_true', dest='show', default=False, help='Show plot' ) p.add_argument( '--log-output', action='store_true', dest='log_output', default=False, help='Log output' ) p.add_argument('-n', action='store', type=int, dest='n', default=100, help='Number of particles') p.add_argument('--tf', action='store', type=float, dest='t', default=40., help='Final time') p.add_argument('--dt', action='store', type=float, dest='dt', default=0.02, help='Time step') o = p.parse_args() solver = MDNNPSSolverPeriodic( o.n, backend=o.backend, use_count_sort=o.use_count_sort) start = time.time() solver.solve(o.t, o.dt, o.log_output) end = time.time() print("Time taken for N = %i is %g secs" % (o.n, (end - start))) if o.log_output: solver.write_log('nnps_periodic.log') if o.show: solver.pull() solver.plot() compyle-release-0.8.1/examples/molecular_dynamics/3D/md_simple.py000066400000000000000000000224351414173670100251010ustar00rootroot00000000000000import numpy as np from math import pi import time from compyle.config import get_config from compyle.api import declare, annotate from compyle.parallel import Elementwise, Reduction from compyle.array import get_backend, wrap import compyle.array as carr @annotate def calculate_energy(i, vx, vy, vz, pe, num_particles): ke = 0.5 * (vx[i] * vx[i] + vy[i] * vy[i] + vz[i] * vz[i]) return pe[i] + ke @annotate def calculate_kinetic_energy(i, vx, vy, vz): return 0.5 * (vx[i] * vx[i] + vy[i] * vy[i] + vz[i] * vz[i]) @annotate def calculate_force(i, x, y, z, fx, fy, fz, pe, num_particles): force_cutoff = 3. 
force_cutoff2 = force_cutoff * force_cutoff for j in range(num_particles): if i == j: continue xij = x[i] - x[j] yij = y[i] - y[j] zij = z[i] - z[j] rij2 = xij * xij + yij * yij + zij * zij if rij2 > force_cutoff2: continue irij2 = 1.0 / rij2 irij6 = irij2 * irij2 * irij2 irij12 = irij6 * irij6 pe[i] += (2 * (irij12 - irij6)) f_base = 24 * irij2 * (2 * irij12 - irij6) fx[i] += f_base * xij fy[i] += f_base * yij fz[i] += f_base * zij @annotate def step_method1(i, x, y, z, vx, vy, vz, fx, fy, fz, pe, xmin, xmax, ymin, ymax, zmin, zmax, m, dt, num_particles): integrate_step1(i, m, dt, x, y, z, vx, vy, vz, fx, fy, fz) boundary_condition(i, x, y, z, vx, vy, vz, fx, fy, fz, pe, xmin, xmax, ymin, ymax, zmin, zmax) @annotate def step_method2(i, x, y, z, vx, vy, vz, fx, fy, fz, pe, xmin, xmax, ymin, ymax, zmin, zmax, m, dt, num_particles): calculate_force(i, x, y, z, fx, fy, fz, pe, num_particles) integrate_step2(i, m, dt, x, y, z, vx, vy, vz, fx, fy, fz) @annotate def integrate_step1(i, m, dt, x, y, z, vx, vy, vz, fx, fy, fz): x[i] += vx[i] * dt + 0.5 * fx[i] * dt * dt y[i] += vy[i] * dt + 0.5 * fy[i] * dt * dt z[i] += vz[i] * dt + 0.5 * fz[i] * dt * dt vx[i] += 0.5 * fx[i] * dt vy[i] += 0.5 * fy[i] * dt vz[i] += 0.5 * fz[i] * dt @annotate def integrate_step2(i, m, dt, x, y, z, vx, vy, vz, fx, fy, fz): vx[i] += 0.5 * fx[i] * dt vy[i] += 0.5 * fy[i] * dt vz[i] += 0.5 * fz[i] * dt @annotate def boundary_condition(i, x, y, z, vx, vy, vz, fx, fy, fz, pe, xmin, xmax, ymin, ymax, zmin, zmax): xwidth = xmax - xmin ywidth = ymax - ymin zwidth = zmax - zmin stiffness = 50. pe[i] = 0. if x[i] < 0.5: fx[i] = stiffness * (0.5 - x[i]) pe[i] += 0.5 * stiffness * (0.5 - x[i]) * (0.5 - x[i]) elif x[i] > xwidth - 0.5: fx[i] = stiffness * (xwidth - 0.5 - x[i]) pe[i] += 0.5 * stiffness * (xwidth - 0.5 - x[i]) * (xwidth - 0.5 - x[i]) else: fx[i] = 0. 
if y[i] < 0.5: fy[i] = stiffness * (0.5 - y[i]) pe[i] += 0.5 * stiffness * (0.5 - y[i]) * (0.5 - y[i]) elif y[i] > ywidth - 0.5: fy[i] = stiffness * (ywidth - 0.5 - y[i]) pe[i] += 0.5 * stiffness * (ywidth - 0.5 - y[i]) * (ywidth - 0.5 - y[i]) else: fy[i] = 0. if z[i] < 0.5: fz[i] = stiffness * (0.5 - z[i]) pe[i] += 0.5 * stiffness * (0.5 - z[i]) * (0.5 - z[i]) elif z[i] > zwidth - 0.5: fz[i] = stiffness * (zwidth - 0.5 - z[i]) pe[i] += 0.5 * stiffness * (zwidth - 0.5 - z[i]) * (zwidth - 0.5 - z[i]) else: fz[i] = 0. class MDSolverBase(object): def __init__(self, num_particles, x=None, y=None, z=None, vx=None, vy=None, vz=None, xmax=100., ymax=100., zmax=100., dx=2., init_T=0., backend=None): self.backend = get_backend(backend) self.num_particles = num_particles self.xmin, self.xmax = 0., xmax self.ymin, self.ymax = 0., ymax self.zmin, self.zmax = 0., zmax self.log_data = [] self.m = 1. if x is None and y is None and z is None: self.x, self.y, self.z = self.setup_positions(num_particles, dx) if vx is None and vy is None and vz is None: self.vx, self.vy, self.vz = self.setup_velocities( init_T, num_particles) self.fx = carr.zeros_like(self.x, backend=self.backend) self.fy = carr.zeros_like(self.y, backend=self.backend) self.fz = carr.zeros_like(self.z, backend=self.backend) self.pe = carr.zeros_like(self.x, backend=self.backend) self.energy_calc = Reduction("a+b", map_func=calculate_energy, backend=self.backend) self.kinetic_energy_calc = Reduction( "a+b", map_func=calculate_kinetic_energy, backend=self.backend) def setup_velocities(self, T, num_particles): np.random.seed(123) vx = np.random.uniform(0, 1., size=num_particles).astype(np.float64) vy = np.random.uniform(0, 1., size=num_particles).astype(np.float64) vz = np.random.uniform(0, 1., size=num_particles).astype(np.float64) T_current = np.average(vx ** 2 + vy ** 2 + vz ** 2) scaling_factor = (T / T_current) ** 0.5 vx = vx * scaling_factor vy = vy * scaling_factor vz = vz * scaling_factor return wrap(vx, vy, vz, 
backend=self.backend) def setup_positions(self, num_particles, dx): ndim = np.ceil(num_particles ** (1 / 3.)) dim_length = ndim * dx self.xmax = 3 * (1 + round(dim_length * 1.5 / 3.)) self.ymax = 3 * (1 + round(dim_length * 1.5 / 3.)) self.zmax = 3 * (1 + round(dim_length * 1.5 / 3.)) xmin_eff = ((self.xmax - self.xmin) - dim_length) / 2. xmax_eff = ((self.xmax - self.xmin) + dim_length) / 2. x, y, z = np.mgrid[xmin_eff:xmax_eff:dx, xmin_eff:xmax_eff:dx, xmin_eff:xmax_eff:dx] x = x.ravel().astype(np.float64)[:num_particles] y = y.ravel().astype(np.float64)[:num_particles] z = z.ravel().astype(np.float64)[:num_particles] return wrap(x, y, z, backend=self.backend) def post_step(self, step, log_output=False): energy = self.energy_calc(self.vx, self.vy, self.vz, self.pe, self.num_particles) if log_output: self.log_data.append([step, carr.sum(self.pe), self.kinetic_energy_calc(self.vx, self.vy, self.vz)]) print("Energy at time step =", step, "is", energy) def write_log(self, fname): np.savetxt(fname, np.array(self.log_data), header="timestep\tpotential_energy\tkinetic_energy") def pull(self): self.x.pull() self.y.pull() self.z.pull() def plot(self): import matplotlib.pyplot as plt fig = plt.figure() ax = fig.add_subplot(111, projection='3d') ax.set_xlim(self.xmin, self.xmax) ax.set_ylim(self.ymin, self.ymax) ax.set_zlim(self.zmin, self.zmax) ax.scatter(self.x.data, self.y.data, self.z.data) plt.show() class MDSolver(MDSolverBase): def __init__(self, num_particles, x=None, y=None, z=None, vx=None, vy=None, vz=None, xmax=100., ymax=100., zmax=100., dx=2., init_T=0., backend=None): super().__init__(num_particles, x=x, y=y, z=z, vx=vx, vy=vy, vz=vz, xmax=xmax, ymax=ymax, zmax=zmax, dx=dx, init_T=init_T, backend=backend) self.init_forces = Elementwise(calculate_force, backend=self.backend) self.step1 = Elementwise(step_method1, backend=self.backend) self.step2 = Elementwise(step_method2, backend=self.backend) def solve(self, t, dt, log_output=False): num_steps = int(t // dt) 
self.init_forces(self.x, self.y, self.z, self.fx, self.fy, self.fz, self.pe, self.num_particles) for i in range(num_steps): self.step1(self.x, self.y, self.z, self.vx, self.vy, self.vz, self.fx, self.fy, self.fz, self.pe, self.xmin, self.xmax, self.ymin, self.ymax, self.zmin, self.zmax, self.m, dt, self.num_particles) self.step2(self.x, self.y, self.z, self.vx, self.vy, self.vz, self.fx, self.fy, self.fz, self.pe, self.xmin, self.xmax, self.ymin, self.ymax, self.zmin, self.zmax, self.m, dt, self.num_particles) if i % 100 == 0: self.post_step(i, log_output=log_output) if __name__ == '__main__': from compyle.utils import ArgumentParser p = ArgumentParser() p.add_argument( '--show', action='store_true', dest='show', default=False, help='Show plot' ) p.add_argument('-n', action='store', type=int, dest='n', default=100, help='Number of particles') p.add_argument('--tf', action='store', type=float, dest='t', default=40., help='Final time') p.add_argument('--dt', action='store', type=float, dest='dt', default=0.02, help='Time step') o = p.parse_args() solver = MDSolver(o.n, backend=o.backend) start = time.time() solver.solve(o.t, o.dt) end = time.time() print("Time taken for N = %i is %g secs" % (o.n, (end - start))) if o.show: solver.pull() solver.plot() compyle-release-0.8.1/examples/molecular_dynamics/3D/nnps.py000066400000000000000000000253101414173670100241010ustar00rootroot00000000000000from nnps_kernels import * from compyle.config import get_config from compyle.api import declare, annotate from compyle.parallel import serial, Elementwise, Reduction, Scan from compyle.array import get_backend, wrap from compyle.low_level import atomic_inc, cast from math import floor from time import time import numpy as np import compyle.array as carr class NNPS(object): def __init__(self, x, y, z, h, eps, xmax, ymax, zmax, backend=None): self.backend = backend self.num_particles = x.length self.eps = eps self.x, self.y, self.z = x, y, z self.xmax, self.ymax, self.zmax = xmax, 
ymax, zmax self.h = h + 2 * eps cmax = np.array([floor((xmax + eps) / self.h), floor((ymax + eps) / self.h), floor((zmax + eps) / self.h)], dtype=np.int32) self.max_key = 1 + \ flatten(cmax[0], cmax[1], cmax[2], 1 + cmax[1], 1 + cmax[2]) self.pmax = 1 + cmax[0] self.qmax = 1 + cmax[1] self.rmax = 1 + cmax[2] self.setup_kernels() self.init_arrays() def setup_kernels(self): # neighbor kernels self.find_neighbor_lengths = Elementwise(find_neighbor_lengths_knl, backend=self.backend) self.find_neighbors = Elementwise(find_neighbors_knl, backend=self.backend) self.scan_start_indices = Scan(input=input_start_indices, output=output_start_indices, scan_expr="a+b", dtype=np.int32, backend=self.backend) def init_arrays(self): # sort arrays self.bin_counts = carr.zeros(self.max_key, dtype=np.int32, backend=self.backend) self.start_indices = carr.zeros(self.max_key, dtype=np.int32, backend=self.backend) self.keys = carr.zeros(self.num_particles, dtype=np.int32, backend=self.backend) self.sorted_indices = carr.zeros(self.num_particles, dtype=np.int32, backend=self.backend) # neighbor arrays self.nbr_lengths = carr.zeros(self.num_particles, dtype=np.int32, backend=self.backend) self.nbr_starts = carr.zeros(self.num_particles, dtype=np.int32, backend=self.backend) self.nbrs = carr.zeros(2 * self.num_particles, dtype=np.int32, backend=self.backend) def reset_arrays(self): # sort arrays self.bin_counts.fill(0) self.start_indices.fill(0) self.sorted_indices.fill(0) # neighbors array self.nbr_lengths.fill(0) self.nbr_starts.fill(0) def get_neighbors(self): self.find_neighbor_lengths(self.x, self.y, self.z, self.h, self.eps, self.qmax, self.rmax, self.start_indices, self.sorted_indices, self.bin_counts, self.nbr_lengths, self.max_key) self.scan_start_indices(counts=self.nbr_lengths, indices=self.nbr_starts) self.total_neighbors = int(self.nbr_lengths[-1] + self.nbr_starts[-1]) self.nbrs.resize(self.total_neighbors) self.find_neighbors(self.x, self.y, self.z, self.h, self.eps, 
self.qmax, self.rmax, self.start_indices, self.sorted_indices, self.bin_counts, self.nbr_starts, self.nbrs, self.max_key) def find_nearest_neighbors(self, qid): start_idx = self.nbr_starts.dev[qid] length = self.nbr_lengths.dev[qid] return self.nbrs.dev[start_idx:start_idx + length] class NNPSCountingSort(NNPS): def __init__(self, x, y, z, h, eps, xmax, ymax, zmax, backend=None): super().__init__(x, y, z, h, eps, xmax, ymax, zmax, backend=backend) # sort kernels self.count_bins = Elementwise(count_bins, backend=self.backend) self.sort_indices = Elementwise(sort_indices, backend=self.backend) def init_arrays(self): super().init_arrays() self.sort_offsets = carr.zeros(self.num_particles, dtype=np.int32, backend=self.backend) def reset_arrays(self): super().reset_arrays() # sort arrays self.sort_offsets.fill(0) def build(self): self.reset_arrays() self.count_bins(self.x, self.y, self.z, self.h, self.eps, self.qmax, self.rmax, self.keys, self.bin_counts, self.sort_offsets) self.scan_start_indices(counts=self.bin_counts, indices=self.start_indices) self.sort_indices(self.keys, self.sort_offsets, self.start_indices, self.sorted_indices) class NNPSRadixSort(NNPS): def __init__(self, x, y, z, h, eps, xmax, ymax, zmax, backend=None): super().__init__(x, y, z, h, eps, xmax, ymax, zmax, backend=backend) self.max_bits = np.ceil(np.log2(self.max_key)) # sort kernels self.fill_keys = Elementwise(fill_keys, backend=self.backend) self.fill_bin_counts = Elementwise(fill_bin_counts, backend=self.backend) self.scan_keys = Scan(input=input_scan_keys, output=output_scan_keys, scan_expr="a+b", dtype=np.int32, backend=self.backend) def init_arrays(self): super().init_arrays() # sort arrays self.sorted_keys = carr.zeros(self.num_particles, dtype=np.int32, backend=self.backend) self.indices = carr.zeros(self.num_particles, dtype=np.int32, backend=self.backend) def reset_arrays(self): super().reset_arrays() self.sorted_keys.fill(0) def build(self): self.reset_arrays() self.fill_keys(self.x, 
self.y, self.z, self.h, self.eps, self.qmax, self.rmax, self.indices, self.keys) self.sorted_keys, self.sorted_indices = carr.sort_by_keys( [self.keys, self.indices], key_bits=self.max_bits, backend=self.backend) self.scan_keys(keys=self.sorted_keys, start_indices=self.start_indices) self.fill_bin_counts(self.sorted_keys, self.start_indices, self.bin_counts, self.num_particles) class NNPSCountingSortPeriodic(NNPSCountingSort): def setup_kernels(self): # neighbor kernels self.find_neighbor_lengths = Elementwise( find_neighbor_lengths_periodic_knl, backend=self.backend) self.find_neighbors = Elementwise( find_neighbors_periodic_knl, backend=self.backend) self.scan_start_indices = Scan(input=input_start_indices, output=output_start_indices, scan_expr="a+b", dtype=np.int32, backend=self.backend) def get_neighbors(self): self.find_neighbor_lengths(self.x, self.y, self.z, self.h, self.eps, self.xmax, self.ymax, self.zmax, self.pmax, self.qmax, self.rmax, self.start_indices, self.sorted_indices, self.bin_counts, self.nbr_lengths, self.max_key) self.scan_start_indices(counts=self.nbr_lengths, indices=self.nbr_starts) self.total_neighbors = int(self.nbr_lengths[-1] + self.nbr_starts[-1]) self.nbrs.resize(self.total_neighbors) self.find_neighbors(self.x, self.y, self.z, self.h, self.eps, self.xmax, self.ymax, self.zmax, self.pmax, self.qmax, self.rmax, self.start_indices, self.sorted_indices, self.bin_counts, self.nbr_starts, self.nbrs, self.max_key) class NNPSRadixSortPeriodic(NNPSRadixSort): def setup_kernels(self): # neighbor kernels self.find_neighbor_lengths = Elementwise( find_neighbor_lengths_periodic_knl, backend=self.backend) self.find_neighbors = Elementwise( find_neighbors_periodic_knl, backend=self.backend) self.scan_start_indices = Scan(input=input_start_indices, output=output_start_indices, scan_expr="a+b", dtype=np.int32, backend=self.backend) def get_neighbors(self): self.find_neighbor_lengths(self.x, self.y, self.z, self.h, self.eps, self.xmax, self.ymax, 
self.zmax, self.pmax, self.qmax, self.rmax, self.start_indices, self.sorted_indices, self.bin_counts, self.nbr_lengths, self.max_key) self.scan_start_indices(counts=self.nbr_lengths, indices=self.nbr_starts) self.total_neighbors = int(self.nbr_lengths[-1] + self.nbr_starts[-1]) self.nbrs.resize(self.total_neighbors) self.find_neighbors(self.x, self.y, self.z, self.h, self.eps, self.xmax, self.ymax, self.zmax, self.pmax, self.qmax, self.rmax, self.start_indices, self.sorted_indices, self.bin_counts, self.nbr_starts, self.nbrs, self.max_key) if __name__ == "__main__": import sys import matplotlib.pyplot as plt backend = sys.argv[1] if len(sys.argv) > 1 else 'cython' np.random.seed(123) num_particles = 10000 x, y, z = np.mgrid[0:30:50j, 0:30:50j, 0:30:50j] x = x.ravel().astype(np.float64) y = y.ravel().astype(np.float64) z = z.ravel().astype(np.float64) x, y, z = wrap(x, y, z, backend=backend) nnps = NNPSRadixSortPeriodic( x, y, z, 3., 0.01, 30., 30., 30., backend=backend) nnps.build() nnps.get_neighbors() nbrs = nnps.find_nearest_neighbors(75000) fig = plt.figure() ax = fig.add_subplot(111, projection='3d') ax.set_xlim(30) ax.set_ylim(30) ax.set_zlim(30) ax.scatter(x[nbrs], y[nbrs], z[nbrs]) plt.show() compyle-release-0.8.1/examples/molecular_dynamics/3D/nnps_kernels.py000066400000000000000000000170601414173670100256270ustar00rootroot00000000000000from compyle.api import declare, annotate from compyle.parallel import serial from compyle.low_level import atomic_inc, cast from math import floor import numpy as np @annotate def find_cell_id(x, y, z, h, eps, c): c[0] = cast(floor((x + eps) / h), "int") c[1] = cast(floor((y + eps) / h), "int") c[2] = cast(floor((z + eps) / h), "int") @annotate def flatten(p, q, r, qmax, rmax): return (p * qmax + q) * rmax + r @serial @annotate def count_bins(i, x, y, z, h, eps, qmax, rmax, keys, bin_counts, sort_offsets): c = declare('matrix(3, "int")') find_cell_id(x[i], y[i], z[i], h, eps, c) key = flatten(c[0], c[1], c[2], qmax, rmax) 
keys[i] = key idx = atomic_inc(bin_counts[key]) sort_offsets[i] = idx @annotate def sort_indices(i, keys, sort_offsets, start_indices, sorted_indices): key = keys[i] offset = sort_offsets[i] start_idx = start_indices[key] sorted_indices[start_idx + offset] = i @annotate def input_start_indices(i, counts): return 0 if i == 0 else counts[i - 1] @annotate def output_start_indices(i, item, indices): indices[i] = item @annotate def fill_keys(i, x, y, z, h, eps, qmax, rmax, indices, keys): c = declare('matrix(3, "int")') find_cell_id(x[i], y[i], z[i], h, eps, c) key = flatten(c[0], c[1], c[2], qmax, rmax) keys[i] = key indices[i] = i @annotate def input_scan_keys(i, keys): return 1 if i == 0 or keys[i] != keys[i - 1] else 0 @annotate def output_scan_keys(i, item, prev_item, keys, start_indices): key = keys[i] if item != prev_item: start_indices[key] = i @annotate def fill_bin_counts(i, keys, start_indices, bin_counts, num_particles): if i == num_particles - 1: last_key = keys[num_particles - 1] bin_counts[last_key] = num_particles - start_indices[last_key] if i == 0 or keys[i] == keys[i - 1]: return key = keys[i] prev_key = keys[i - 1] bin_counts[prev_key] = start_indices[key] - start_indices[prev_key] @annotate def find_neighbor_lengths_knl(i, x, y, z, h, eps, qmax, rmax, start_indices, sorted_indices, bin_counts, nbr_lengths, max_key): d = h * h q_c = declare('matrix(3, "int")') find_cell_id(x[i], y[i], z[i], h, eps, q_c) for p in range(-1, 2): for q in range(-1, 2): for r in range(-1, 2): cx = q_c[0] + p cy = q_c[1] + q cz = q_c[2] + r key = flatten(cx, cy, cz, qmax, rmax) if key >= max_key or key < 0: continue start_idx = start_indices[key] np = bin_counts[key] for k in range(np): j = sorted_indices[start_idx + k] xij = x[i] - x[j] yij = y[i] - y[j] zij = z[i] - z[j] rij2 = xij * xij + yij * yij + zij * zij if rij2 < d: nbr_lengths[i] += 1 @annotate def find_neighbors_knl(i, x, y, z, h, eps, qmax, rmax, start_indices, sorted_indices, bin_counts, nbr_starts, nbrs, 
max_key): d = h * h q_c = declare('matrix(3, "int")') find_cell_id(x[i], y[i], z[i], h, eps, q_c) length = 0 nbr_start_idx = nbr_starts[i] for p in range(-1, 2): for q in range(-1, 2): for r in range(-1, 2): cx = q_c[0] + p cy = q_c[1] + q cz = q_c[2] + r key = flatten(cx, cy, cz, qmax, rmax) if key >= max_key or key < 0: continue start_idx = start_indices[key] np = bin_counts[key] for k in range(np): j = sorted_indices[start_idx + k] xij = x[i] - x[j] yij = y[i] - y[j] zij = z[i] - z[j] rij2 = xij * xij + yij * yij + zij * zij if rij2 < d: nbrs[nbr_start_idx + length] = j length += 1 @annotate def find_neighbor_lengths_periodic_knl(i, x, y, z, h, eps, xmax, ymax, zmax, pmax, qmax, rmax, start_indices, sorted_indices, bin_counts, nbr_lengths, max_key): d = h * h q_c = declare('matrix(3, "int")') xij, yij, zij = declare('double', 3) find_cell_id(x[i], y[i], z[i], h, eps, q_c) for p in range(-1, 2): for q in range(-1, 2): for r in range(-1, 2): cx = q_c[0] + p cy = q_c[1] + q cz = q_c[2] + r cx_f = cast(cx, "float") cy_f = cast(cy, "float") cz_f = cast(cz, "float") xoffset = cast(floor(cx_f / pmax), "int") yoffset = cast(floor(cy_f / qmax), "int") zoffset = cast(floor(cz_f / rmax), "int") cx -= xoffset * pmax cy -= yoffset * qmax cz -= zoffset * rmax key = flatten(cx, cy, cz, qmax, rmax) if key >= max_key or key < 0: continue start_idx = start_indices[key] np = bin_counts[key] for k in range(np): j = sorted_indices[start_idx + k] xij = abs(x[i] - x[j]) yij = abs(y[i] - y[j]) zij = abs(z[i] - z[j]) xij = min(xij, xmax - xij) yij = min(yij, ymax - yij) zij = min(zij, zmax - zij) rij2 = xij * xij + yij * yij + zij * zij if rij2 < d: nbr_lengths[i] += 1 @annotate def find_neighbors_periodic_knl(i, x, y, z, h, eps, xmax, ymax, zmax, pmax, qmax, rmax, start_indices, sorted_indices, bin_counts, nbr_starts, nbrs, max_key): d = h * h q_c = declare('matrix(3, "int")') xij, yij, zij = declare('double', 3) find_cell_id(x[i], y[i], z[i], h, eps, q_c) length = 0 nbr_start_idx = 
nbr_starts[i] for p in range(-1, 2): for q in range(-1, 2): for r in range(-1, 2): cx = q_c[0] + p cy = q_c[1] + q cz = q_c[2] + r cx_f = cast(cx, "float") cy_f = cast(cy, "float") cz_f = cast(cz, "float") xoffset = cast(floor(cx_f / pmax), "int") yoffset = cast(floor(cy_f / qmax), "int") zoffset = cast(floor(cz_f / rmax), "int") cx -= xoffset * pmax cy -= yoffset * qmax cz -= zoffset * rmax key = flatten(cx, cy, cz, qmax, rmax) if key >= max_key or key < 0: continue start_idx = start_indices[key] np = bin_counts[key] for k in range(np): j = sorted_indices[start_idx + k] xij = abs(x[i] - x[j]) yij = abs(y[i] - y[j]) zij = abs(z[i] - z[j]) xij = min(xij, xmax - xij) yij = min(yij, ymax - yij) zij = min(zij, zmax - zij) rij2 = xij * xij + yij * yij + zij * zij if rij2 < d: nbrs[nbr_start_idx + length] = j length += 1 compyle-release-0.8.1/examples/molecular_dynamics/3D/performance_comparison.py000066400000000000000000000053561414173670100276660ustar00rootroot00000000000000import numpy as np import time from md_nnps_periodic import MDNNPSSolverPeriodic from compyle.config import get_config from hoomd_periodic import simulate def solve(n, backend, tf=4., dt=0.02, use_count_sort=False): if backend == 'hoomd': return simulate(n, dt, tf) else: solver = MDNNPSSolverPeriodic( n, dx=2., backend=backend, use_count_sort=use_count_sort) start = time.time() solver.solve(tf, dt) end = time.time() print("Time taken for backend = %s, N = %i is %g secs" % (backend, n, (end - start))) return end - start def compare(backends, n_list, niter=3, use_count_sort=False): t_list = {b: [] for b in backends} speedups = {b: [] for b in backends} for backend in backends: for n in n_list: print("Running for N = %i" % n) t = 1e9 for it in range(niter): t = min(t, solve(n, backend, use_count_sort=use_count_sort)) t_list[backend].append(t) if 'hoomd' in backends: for backend in backends: for i, n in enumerate(n_list): speedups[backend].append( t_list['hoomd'][i] / t_list[backend][i]) else: speedups 
= None return speedups, t_list def plot(n_list, speedups, t_list, label): backend_label_map = {'hoomd': 'HooMD', 'opencl': 'OpenCL', 'cuda': 'CUDA'} import matplotlib.pyplot as plt plt.figure() if speedups: for backend, arr in speedups.items(): if backend == "hoomd": continue plt.semilogx(n_list, arr, 'x-', label=backend_label_map[backend]) plt.xlabel("Number of particles") plt.ylabel("Speedup") plt.legend() plt.grid(True) plt.savefig("%s_speedup_%s.png" % (label, "_".join(speedups.keys())), dpi=300) plt.clf() for backend, arr in t_list.items(): plt.loglog(n_list, arr, 'x-', label=backend_label_map[backend]) plt.xlabel("Number of particles") plt.ylabel("Time (secs)") plt.legend() plt.grid(True) plt.savefig("%s_time_%s.png" % (label, "_".join(t_list.keys())), dpi=300) if __name__ == "__main__": from argparse import ArgumentParser p = ArgumentParser() p.add_argument( '--use-count-sort', action='store_true', dest='use_count_sort', default=False, help='Use count sort instead of radix sort' ) o = p.parse_args() n_list = [1000 * (2 ** i) for i in range(11)] backends = ["cuda", "hoomd"] print("Running for", n_list) speedups, t_list = compare(backends, n_list, use_count_sort=o.use_count_sort) plot(n_list, speedups, t_list, "hoomd") compyle-release-0.8.1/examples/molecular_dynamics/README.rst000066400000000000000000000041721414173670100237750ustar00rootroot00000000000000Molecular Dynamics Example -------------------------- We have 3 implementations of a simple molecular dynamics simulation of an N body problem in Lennard Jones potential. The first implementation is a simple :math:`O(N^2)` implementation that can be found in :code:`md_simple.py`. The second implementation is using nearest neighbor searching to reduce the complexity to :math:`O(N)` and can be found in :code:`md_nnps.py`. 
We also have two different implementations of nearest neighbor search algorithms, one using a radix sort on the GPU and NumPy sort on the CPU and the other using a native counting sort implementation. The counting sort version is about 30% faster. Both of these implementations can be found in :code:`nnps.py`. This example has been discussed at length in `this `_ SciPy 2020 paper. The following commands can be used to reproduce the performance results shown in the paper. +------------------+---------------------------------------------------------------+ | Figure 2 | `python performance_comparison.py -c omp_comp --nnps simple` | +------------------+---------------------------------------------------------------+ | Figure 3 | `python performance_comparison.py -c gpu_comp --nnps simple` | +------------------+---------------------------------------------------------------+ | Figure 4 & 5 | `python performance_comparison.py -c gpu_comp` | +------------------+---------------------------------------------------------------+ | Figure 6 & 7 | `python performance_comparison.py -c comp_algo` | +------------------+---------------------------------------------------------------+ | Figure 8 | `cd 3D && python performance_comparison.py --use-count-sort` | +------------------+---------------------------------------------------------------+ To generate energy plots for HooMD and Compyle implementations, run the script :code:`3D/compare_results.py`. Users can use the Google Colab notebook `here `_ to play around with the example. 
compyle-release-0.8.1/examples/molecular_dynamics/md_nnps.py000066400000000000000000000106661414173670100243230ustar00rootroot00000000000000import numpy as np from math import pi import time from compyle.config import get_config from compyle.api import declare, annotate from compyle.parallel import Elementwise, Reduction from compyle.array import get_backend, wrap import compyle.array as carr from nnps import NNPSCountingSort, NNPSRadixSort from md_simple import integrate_step1, integrate_step2, \ boundary_condition, MDSolverBase @annotate def calculate_force(i, x, y, fx, fy, pe, nbr_starts, nbr_lengths, nbrs): start_idx = nbr_starts[i] length = nbr_lengths[i] for k in range(start_idx, start_idx + length): j = nbrs[k] if i == j: continue xij = x[i] - x[j] yij = y[i] - y[j] rij2 = xij * xij + yij * yij irij2 = 1.0 / rij2 irij6 = irij2 * irij2 * irij2 irij12 = irij6 * irij6 pe[i] += (2 * (irij12 - irij6)) f_base = 24 * irij2 * (2 * irij12 - irij6) fx[i] += f_base * xij fy[i] += f_base * yij @annotate def step_method1(i, x, y, vx, vy, fx, fy, pe, xmin, xmax, ymin, ymax, m, dt, nbr_starts, nbr_lengths, nbrs): integrate_step1(i, m, dt, x, y, vx, vy, fx, fy) boundary_condition(i, x, y, vx, vy, fx, fy, pe, xmin, xmax, ymin, ymax) @annotate def step_method2(i, x, y, vx, vy, fx, fy, pe, xmin, xmax, ymin, ymax, m, dt, nbr_starts, nbr_lengths, nbrs): calculate_force(i, x, y, fx, fy, pe, nbr_starts, nbr_lengths, nbrs) integrate_step2(i, m, dt, x, y, vx, vy, fx, fy) class MDNNPSSolver(MDSolverBase): def __init__(self, num_particles, x=None, y=None, vx=None, vy=None, xmax=100., ymax=100., dx=1.5, init_T=0., backend=None, use_count_sort=False): super().__init__(num_particles, x=x, y=y, vx=vx, vy=vy, xmax=xmax, ymax=ymax, dx=dx, init_T=init_T, backend=backend) self.init_forces = Elementwise(calculate_force, backend=self.backend) self.step1 = Elementwise(step_method1, backend=self.backend) self.step2 = Elementwise(step_method2, backend=self.backend) self.nnps_algorithm = 
NNPSCountingSort \ if use_count_sort else NNPSRadixSort self.nnps = self.nnps_algorithm(self.x, self.y, 3., self.xmax, self.ymax, backend=self.backend) def solve(self, t, dt): num_steps = int(t // dt) self.nnps.build() self.nnps.get_neighbors() self.init_forces(self.x, self.y, self.fx, self.fy, self.pe, self.nnps.nbr_starts, self.nnps.nbr_lengths, self.nnps.nbrs) for i in range(num_steps): self.step1(self.x, self.y, self.vx, self.vy, self.fx, self.fy, self.pe, self.xmin, self.xmax, self.ymin, self.ymax, self.m, dt, self.nnps.nbr_starts, self.nnps.nbr_lengths, self.nnps.nbrs) self.nnps.build() self.nnps.get_neighbors() self.step2(self.x, self.y, self.vx, self.vy, self.fx, self.fy, self.pe, self.xmin, self.xmax, self.ymin, self.ymax, self.m, dt, self.nnps.nbr_starts, self.nnps.nbr_lengths, self.nnps.nbrs) if i % 100 == 0: self.post_step(i) if __name__ == '__main__': from compyle.utils import ArgumentParser p = ArgumentParser() p.add_argument( '--use-count-sort', action='store_true', dest='use_count_sort', default=False, help='Use count sort instead of radix sort' ) p.add_argument( '--show', action='store_true', dest='show', default=False, help='Show plot at end of simulation' ) p.add_argument('-n', action='store', type=int, dest='n', default=100, help='Number of particles') p.add_argument('--tf', action='store', type=float, dest='t', default=40., help='Final time') p.add_argument('--dt', action='store', type=float, dest='dt', default=0.02, help='Time step') o = p.parse_args() solver = MDNNPSSolver( o.n, backend=o.backend, use_count_sort=o.use_count_sort) start = time.time() solver.solve(o.t, o.dt) end = time.time() print("Time taken for N = %i is %g secs" % (o.n, (end - start))) if o.show: solver.pull() solver.plot() compyle-release-0.8.1/examples/molecular_dynamics/md_simple.py000066400000000000000000000161061414173670100246310ustar00rootroot00000000000000import numpy as np from math import pi import time from compyle.config import get_config from compyle.api import 
declare, annotate from compyle.parallel import Elementwise, Reduction from compyle.array import get_backend, wrap import compyle.array as carr @annotate def calculate_energy(i, vx, vy, pe, num_particles): ke = 0.5 * (vx[i] * vx[i] + vy[i] * vy[i]) return pe[i] + ke @annotate def calculate_force(i, x, y, fx, fy, pe, num_particles): force_cutoff = 3. force_cutoff2 = force_cutoff * force_cutoff for j in range(num_particles): if i == j: continue xij = x[i] - x[j] yij = y[i] - y[j] rij2 = xij * xij + yij * yij if rij2 > force_cutoff2: continue irij2 = 1.0 / rij2 irij6 = irij2 * irij2 * irij2 irij12 = irij6 * irij6 pe[i] += (2 * (irij12 - irij6)) f_base = 24 * irij2 * (2 * irij12 - irij6) fx[i] += f_base * xij fy[i] += f_base * yij @annotate def step_method1(i, x, y, vx, vy, fx, fy, pe, xmin, xmax, ymin, ymax, m, dt, num_particles): integrate_step1(i, m, dt, x, y, vx, vy, fx, fy) boundary_condition(i, x, y, vx, vy, fx, fy, pe, xmin, xmax, ymin, ymax) @annotate def step_method2(i, x, y, vx, vy, fx, fy, pe, xmin, xmax, ymin, ymax, m, dt, num_particles): calculate_force(i, x, y, fx, fy, pe, num_particles) integrate_step2(i, m, dt, x, y, vx, vy, fx, fy) @annotate def integrate_step1(i, m, dt, x, y, vx, vy, fx, fy): axi = fx[i] ayi = fy[i] x[i] += vx[i] * dt + 0.5 * axi * dt * dt y[i] += vy[i] * dt + 0.5 * ayi * dt * dt vx[i] += 0.5 * axi * dt vy[i] += 0.5 * ayi * dt @annotate def integrate_step2(i, m, dt, x, y, vx, vy, fx, fy): axi = fx[i] ayi = fy[i] vx[i] += 0.5 * axi * dt vy[i] += 0.5 * ayi * dt @annotate def boundary_condition(i, x, y, vx, vy, fx, fy, pe, xmin, xmax, ymin, ymax): xwidth = xmax - xmin ywidth = ymax - ymin stiffness = 50. pe[i] = 0. if x[i] < 0.5: fx[i] = stiffness * (0.5 - x[i]) pe[i] += 0.5 * stiffness * (0.5 - x[i]) * (0.5 - x[i]) elif x[i] > xwidth - 0.5: fx[i] = stiffness * (xwidth - 0.5 - x[i]) pe[i] += 0.5 * stiffness * (xwidth - 0.5 - x[i]) * (xwidth - 0.5 - x[i]) else: fx[i] = 0. 
if y[i] < 0.5: fy[i] = stiffness * (0.5 - y[i]) pe[i] += 0.5 * stiffness * (0.5 - y[i]) * (0.5 - y[i]) elif y[i] > ywidth - 0.5: fy[i] = stiffness * (ywidth - 0.5 - y[i]) pe[i] += 0.5 * stiffness * (ywidth - 0.5 - y[i]) * (ywidth - 0.5 - y[i]) else: fy[i] = 0. class MDSolverBase(object): def __init__(self, num_particles, x=None, y=None, vx=None, vy=None, xmax=100., ymax=100., dx=1.5, init_T=0., backend=None): self.backend = get_backend(backend) self.num_particles = num_particles self.xmin, self.xmax = 0., xmax self.ymin, self.ymax = 0., ymax self.m = 1. if x is None and y is None: self.x, self.y = self.setup_positions(num_particles, dx) if vx is None and vy is None: self.vx, self.vy = self.setup_velocities(init_T, num_particles) self.fx = carr.zeros_like(self.x, backend=self.backend) self.fy = carr.zeros_like(self.x, backend=self.backend) self.pe = carr.zeros_like(self.x, backend=self.backend) self.energy_calc = Reduction("a+b", map_func=calculate_energy, backend=self.backend) def setup_velocities(self, T, num_particles): np.random.seed(123) vx = np.random.uniform(0, 1., size=num_particles).astype(np.float64) vy = np.random.uniform(0, 1., size=num_particles).astype(np.float64) T_current = np.average(vx ** 2 + vy ** 2) scaling_factor = (T / T_current) ** 0.5 vx = vx * scaling_factor vy = vy * scaling_factor return wrap(vx, vy, backend=self.backend) def setup_positions(self, num_particles, dx): ndim = np.ceil(num_particles ** 0.5) dim_length = ndim * dx self.xmax = dim_length * 3 self.ymax = dim_length * 3 xmin_eff = ((self.xmax - self.xmin) - dim_length) / 2. xmax_eff = ((self.xmax - self.xmin) + dim_length) / 2. 
x, y = np.mgrid[xmin_eff:xmax_eff:dx, xmin_eff:xmax_eff:dx] x = x.ravel().astype(np.float64)[:num_particles] y = y.ravel().astype(np.float64)[:num_particles] return wrap(x, y, backend=self.backend) def post_step(self, step): energy = self.energy_calc(self.vx, self.vy, self.pe, self.num_particles) print("Energy at time step =", step, "is", energy) def pull(self): self.x.pull() self.y.pull() def plot(self): import matplotlib.pyplot as plt plt.xlim(self.xmin, self.xmax) plt.ylim(self.ymin, self.ymax) plt.scatter(self.x.data, self.y.data, 4.2) plt.show() class MDSolver(MDSolverBase): def __init__(self, num_particles, x=None, y=None, vx=None, vy=None, xmax=100., ymax=100., dx=1.5, init_T=0., backend=None): super().__init__(num_particles, x=x, y=y, vx=vx, vy=vy, xmax=xmax, ymax=ymax, dx=dx, init_T=init_T, backend=backend) self.init_forces = Elementwise(calculate_force, backend=self.backend) self.step1 = Elementwise(step_method1, backend=self.backend) self.step2 = Elementwise(step_method2, backend=self.backend) def solve(self, t, dt): num_steps = int(t // dt) self.init_forces(self.x, self.y, self.fx, self.fy, self.pe, self.num_particles) for i in range(num_steps): self.step1(self.x, self.y, self.vx, self.vy, self.fx, self.fy, self.pe, self.xmin, self.xmax, self.ymin, self.ymax, self.m, dt, self.num_particles) self.step2(self.x, self.y, self.vx, self.vy, self.fx, self.fy, self.pe, self.xmin, self.xmax, self.ymin, self.ymax, self.m, dt, self.num_particles) if i % 100 == 0: self.post_step(i) if __name__ == '__main__': from compyle.utils import ArgumentParser p = ArgumentParser() p.add_argument( '--show', action='store_true', dest='show', default=False, help='Show plot at end of simulation' ) p.add_argument('-n', action='store', type=int, dest='n', default=100, help='Number of particles') p.add_argument('--tf', action='store', type=float, dest='t', default=40., help='Final time') p.add_argument('--dt', action='store', type=float, dest='dt', default=0.02, help='Time step') o = 
p.parse_args() solver = MDSolver(o.n, backend=o.backend) start = time.time() solver.solve(o.t, o.dt) end = time.time() print("Time taken for N = %i is %g secs" % (o.n, (end - start))) if o.show: solver.pull() solver.plot() compyle-release-0.8.1/examples/molecular_dynamics/nnps.py000066400000000000000000000146351414173670100236430ustar00rootroot00000000000000from nnps_kernels import * from compyle.config import get_config from compyle.api import declare, annotate from compyle.parallel import serial, Elementwise, Reduction, Scan from compyle.array import get_backend, wrap from compyle.low_level import atomic_inc, cast from math import floor from time import time import numpy as np import compyle.array as carr class NNPS(object): def __init__(self, x, y, h, xmax, ymax, backend=None): self.backend = backend self.num_particles = x.length self.x, self.y = x, y self.h = h cmax = np.array([floor(xmax / h), floor(ymax / h)], dtype=np.int32) self.max_key = 1 + flatten(cmax[0], cmax[1], 1 + cmax[1]) self.qmax = 1 + cmax[1] # neighbor kernels self.find_neighbor_lengths = Elementwise(find_neighbor_lengths_knl, backend=self.backend) self.find_neighbors = Elementwise(find_neighbors_knl, backend=self.backend) self.scan_start_indices = Scan(input=input_start_indices, output=output_start_indices, scan_expr="a+b", dtype=np.int32, backend=self.backend) self.init_arrays() def init_arrays(self): # sort arrays self.bin_counts = carr.zeros(self.max_key, dtype=np.int32, backend=self.backend) self.start_indices = carr.zeros(self.max_key, dtype=np.int32, backend=self.backend) self.keys = carr.zeros(self.num_particles, dtype=np.int32, backend=self.backend) self.sorted_indices = carr.zeros(self.num_particles, dtype=np.int32, backend=self.backend) # neighbor arrays self.nbr_lengths = carr.zeros(self.num_particles, dtype=np.int32, backend=self.backend) self.nbr_starts = carr.zeros(self.num_particles, dtype=np.int32, backend=self.backend) self.nbrs = carr.zeros(2 * self.num_particles, 
dtype=np.int32, backend=self.backend) def reset_arrays(self): # sort arrays self.bin_counts.fill(0) self.start_indices.fill(0) self.sorted_indices.fill(0) # neighbors array self.nbr_lengths.fill(0) self.nbr_starts.fill(0) def get_neighbors(self): self.find_neighbor_lengths(self.x, self.y, self.h, self.qmax, self.start_indices, self.sorted_indices, self.bin_counts, self.nbr_lengths, self.max_key) self.scan_start_indices(counts=self.nbr_lengths, indices=self.nbr_starts) self.total_neighbors = int(self.nbr_lengths[-1] + self.nbr_starts[-1]) self.nbrs.resize(self.total_neighbors) self.find_neighbors(self.x, self.y, self.h, self.qmax, self.start_indices, self.sorted_indices, self.bin_counts, self.nbr_starts, self.nbrs, self.max_key) class NNPSCountingSort(NNPS): def __init__(self, x, y, h, xmax, ymax, backend=None): super().__init__(x, y, h, xmax, ymax, backend=backend) # sort kernels self.count_bins = Elementwise(count_bins, backend=self.backend) self.sort_indices = Elementwise(sort_indices, backend=self.backend) def init_arrays(self): super().init_arrays() self.sort_offsets = carr.zeros(self.num_particles, dtype=np.int32, backend=self.backend) def reset_arrays(self): super().reset_arrays() # sort arrays self.sort_offsets.fill(0) def build(self): self.reset_arrays() self.count_bins(self.x, self.y, self.h, self.qmax, self.keys, self.bin_counts, self.sort_offsets) self.scan_start_indices(counts=self.bin_counts, indices=self.start_indices) self.sort_indices(self.keys, self.sort_offsets, self.start_indices, self.sorted_indices) class NNPSRadixSort(NNPS): def __init__(self, x, y, h, xmax, ymax, backend=None): super().__init__(x, y, h, xmax, ymax, backend=backend) self.max_bits = np.ceil(np.log2(self.max_key)) # sort kernels self.fill_keys = Elementwise(fill_keys, backend=self.backend) self.fill_bin_counts = Elementwise(fill_bin_counts, backend=self.backend) self.scan_keys = Scan(input=input_scan_keys, output=output_scan_keys, scan_expr="a+b", dtype=np.int32, 
backend=self.backend) def init_arrays(self): super().init_arrays() # sort arrays self.sorted_keys = carr.zeros(self.num_particles, dtype=np.int32, backend=self.backend) self.indices = carr.zeros(self.num_particles, dtype=np.int32, backend=self.backend) def reset_arrays(self): super().reset_arrays() self.sorted_keys.fill(0) def build(self): self.reset_arrays() self.fill_keys(self.x, self.y, self.h, self.qmax, self.indices, self.keys) self.sorted_keys, self.sorted_indices = carr.sort_by_keys( [self.keys, self.indices], key_bits=self.max_bits, backend=self.backend) self.scan_keys(keys=self.sorted_keys, start_indices=self.start_indices) self.fill_bin_counts(self.sorted_keys, self.start_indices, self.bin_counts, self.num_particles) if __name__ == "__main__": import sys backend = sys.argv[1] if len(sys.argv) > 1 else 'cython' np.random.seed(123) num_particles = 20 x = np.random.uniform(0, 10., size=num_particles).astype(np.float32) y = np.random.uniform(0, 10., size=num_particles).astype(np.float32) x, y = wrap(x, y, backend=backend) nnps = NNPSRadixSort(x, y, 3., 10., 10., backend=backend) nnps.build() nnps.get_neighbors() print(nnps.start_indices) print(nnps.bin_counts) print(nnps.nbr_lengths) compyle-release-0.8.1/examples/molecular_dynamics/nnps_kernels.py000066400000000000000000000067721414173670100253710ustar00rootroot00000000000000from compyle.api import declare, annotate from compyle.parallel import serial from compyle.low_level import atomic_inc, cast from math import floor import numpy as np @annotate def find_cell_id(x, y, h, c): c[0] = cast(floor((x) / h), "int") c[1] = cast(floor((y) / h), "int") @annotate def flatten(p, q, qmax): return p * qmax + q @serial @annotate def count_bins(i, x, y, h, cmax, keys, bin_counts, sort_offsets): c = declare('matrix(2, "int")') find_cell_id(x[i], y[i], h, c) key = flatten(c[0], c[1], cmax) keys[i] = key idx = atomic_inc(bin_counts[key]) sort_offsets[i] = idx @annotate def sort_indices(i, keys, sort_offsets, start_indices, 
sorted_indices): key = keys[i] offset = sort_offsets[i] start_idx = start_indices[key] sorted_indices[start_idx + offset] = i @annotate def input_start_indices(i, counts): return 0 if i == 0 else counts[i - 1] @annotate def output_start_indices(i, item, indices): indices[i] = item @annotate def fill_keys(i, x, y, h, cmax, indices, keys): c = declare('matrix(2, "int")') find_cell_id(x[i], y[i], h, c) key = flatten(c[0], c[1], cmax) keys[i] = key indices[i] = i @annotate def input_scan_keys(i, keys): return 1 if i == 0 or keys[i] != keys[i - 1] else 0 @annotate def output_scan_keys(i, item, prev_item, keys, start_indices): key = keys[i] if item != prev_item: start_indices[key] = i @annotate def fill_bin_counts(i, keys, start_indices, bin_counts, num_particles): if i == num_particles - 1: last_key = keys[num_particles - 1] bin_counts[last_key] = num_particles - start_indices[last_key] if i == 0 or keys[i] == keys[i - 1]: return key = keys[i] prev_key = keys[i - 1] bin_counts[prev_key] = start_indices[key] - start_indices[prev_key] @annotate def find_neighbor_lengths_knl(i, x, y, h, cmax, start_indices, sorted_indices, bin_counts, nbr_lengths, max_key): d = h * h q_c = declare('matrix(2, "int")') find_cell_id(x[i], y[i], h, q_c) for p in range(-1, 2): for q in range(-1, 2): cx = q_c[0] + p cy = q_c[1] + q key = flatten(cx, cy, cmax) if key >= max_key or key < 0: continue start_idx = start_indices[key] np = bin_counts[key] for k in range(np): j = sorted_indices[start_idx + k] xij = x[i] - x[j] yij = y[i] - y[j] rij2 = xij * xij + yij * yij if rij2 < d: nbr_lengths[i] += 1 @annotate def find_neighbors_knl(i, x, y, h, cmax, start_indices, sorted_indices, bin_counts, nbr_starts, nbrs, max_key): d = h * h q_c = declare('matrix(2, "int")') find_cell_id(x[i], y[i], h, q_c) length = 0 nbr_start_idx = nbr_starts[i] for p in range(-1, 2): for q in range(-1, 2): cx = q_c[0] + p cy = q_c[1] + q key = flatten(cx, cy, cmax) if key >= max_key or key < 0: continue start_idx = 
start_indices[key] np = bin_counts[key] for k in range(np): j = sorted_indices[start_idx + k] xij = x[i] - x[j] yij = y[i] - y[j] rij2 = xij * xij + yij * yij if rij2 < d: nbrs[nbr_start_idx + length] = j length += 1 compyle-release-0.8.1/examples/molecular_dynamics/performance_comparison.py000066400000000000000000000111131414173670100274040ustar00rootroot00000000000000import numpy as np import time import md_simple import md_nnps from compyle.config import get_config def solve(n, backend, solver_algo, tf=0.5, dt=0.02, use_count_sort=False): solver = solver_algo(n, backend=backend.replace("_omp", "")) start = time.time() solver.solve(tf, dt) end = time.time() print("Time taken for backend = %s, N = %i is %g secs" % (backend, n, (end - start))) return end - start def compare(backends, n_list, solver_algo, niter=3): t_list = {b: [] for b in backends} speedups = {b: [] for b in backends} for n in n_list: print("Running for N = %i" % n) for backend in backends: if "omp" in backend: get_config().use_openmp = True t = 1e9 for it in range(niter): t = min(t, solve(n, backend, solver_algo)) t_list[backend].append(t) if "omp" in backend: get_config().use_openmp = False if 'cython' in backends: for backend in backends: for i, n in enumerate(n_list): speedups[backend].append( t_list["cython"][i] / t_list[backend][i]) else: speedups = None return speedups, t_list def compare_implementations(backend, n_list, niter=3): import matplotlib.pyplot as plt sp, nnps_tlist = compare([backend], n_list, md_nnps.MDSolver, niter=niter) sp, simple_tlist = compare([backend], n_list, md_simple.MDSolver, niter=niter) speedup = [simple_tlist[backend][i] / nnps_tlist[backend][i] for i in range(len(n_list))] plt.loglog(n_list, nnps_tlist[backend], 'x-', label="Linear") plt.loglog(n_list, simple_tlist[backend], 'x-', label="Simple") plt.xlabel("Number of particles") plt.ylabel("Time (secs)") plt.legend() plt.grid(True) plt.savefig("time_comp_impl.png", dpi=300) plt.clf() plt.loglog(n_list, speedup, 
'x-') plt.xlabel("Number of particles") plt.ylabel("Speedup") plt.grid(True) plt.savefig("speedup_comp_impl.png", dpi=300) def plot(n_list, speedups, t_list, label): backend_label_map = {'cython': 'Cython', 'cython_omp': 'OpenMP', 'opencl': 'OpenCL', 'cuda': 'CUDA'} import matplotlib.pyplot as plt plt.figure() if speedups: for backend, arr in speedups.items(): if backend == "cython": continue plt.semilogx(n_list, arr, 'x-', label=backend_label_map[backend]) plt.xlabel("Number of particles") plt.ylabel("Speedup") plt.legend() plt.grid(True) plt.savefig("%s_speedup_%s.png" % (label, "_".join(speedups.keys())), dpi=300) plt.clf() for backend, arr in t_list.items(): plt.loglog(n_list, arr, 'x-', label=backend_label_map[backend]) plt.xlabel("Number of particles") plt.ylabel("Time (secs)") plt.legend() plt.grid(True) plt.savefig("%s_time_%s.png" % (label, "_".join(t_list.keys())), dpi=300) if __name__ == "__main__": from argparse import ArgumentParser p = ArgumentParser() p.add_argument( '-c', '--comparison', action='store', dest='comp', default='gpu_comp', choices=['gpu_comp', 'omp_comp', 'comp_algo'], help='Choose the comparison.' ) p.add_argument( '--nnps', action='store', dest='nnps', default='linear', choices=['linear', 'simple'], help='Choose algorithm.' ) p.add_argument( '--use-double', action='store_true', dest='use_double', default=False, help='Use double precision on the GPU.' 
) o = p.parse_args() get_config().use_double = o.use_double solver_algo = (md_nnps.MDNNPSSolver if o.nnps == 'linear' else md_simple.MDSolver) n_list = [10000 * (2 ** i) for i in range(10)] if o.nnps == 'linear' else \ [500 * (2 ** i) for i in range(8)] if o.comp == "gpu_comp": backends = ["opencl", "cuda", "cython"] print("Running for", n_list) speedups, t_list = compare(backends, n_list, solver_algo) plot(n_list, speedups, t_list, o.nnps) elif o.comp == "omp_comp": backends = ["cython_omp", "cython"] print("Running for", n_list) speedups, t_list = compare(backends, n_list, solver_algo) plot(n_list, speedups, t_list, o.nnps) elif o.comp == "comp_algo": backend = "cython" n_list = [500, 1000, 2000, 4000, 8000, 16000, 32000] print("Running for", n_list) compare_implementations(backend, n_list) compyle-release-0.8.1/examples/vm_elementwise.py000066400000000000000000000031331414173670100220250ustar00rootroot00000000000000import numpy as np from math import pi import time from compyle.config import get_config from compyle.api import declare, annotate from compyle.parallel import Elementwise from compyle.array import wrap @annotate(double='xi, yi, xj, yj, gamma', result='doublep') def point_vortex(xi, yi, xj, yj, gamma, result): xij = xi - xj yij = yi - yj r2ij = xij*xij + yij*yij if r2ij < 1e-14: result[0] = 0.0 result[1] = 0.0 else: tmp = gamma/(2.0*pi*r2ij) result[0] = -tmp*yij result[1] = tmp*xij @annotate(int='i, nv', gdoublep='x, y, gamma, u, v') def velocity(i, x, y, gamma, u, v, nv): j = declare('int') tmp = declare('matrix(2)') xi = x[i] yi = y[i] u[i] = 0.0 v[i] = 0.0 for j in range(nv): point_vortex(xi, yi, x[j], y[j], gamma[j], tmp) u[i] += tmp[0] v[i] += tmp[1] def make_vortices(nv, backend): x = np.linspace(-1, 1, nv) y = x.copy() gamma = np.ones(nv) u = np.zeros_like(x) v = np.zeros_like(x) x, y, gamma, u, v = wrap(x, y, gamma, u, v, backend=backend) return x, y, gamma, u, v, nv def run(nv, backend): e = Elementwise(velocity, backend=backend) args = 
make_vortices(nv, backend) t1 = time.time() e(*args) print(time.time() - t1) u = args[-3] u.pull() return e, args if __name__ == '__main__': from compyle.utils import ArgumentParser p = ArgumentParser() p.add_argument('-n', action='store', type=int, dest='n', default=10000, help='Number of particles.') o = p.parse_args() run(o.n, o.backend) compyle-release-0.8.1/examples/vm_elementwise_jit.py000066400000000000000000000027471414173670100227050ustar00rootroot00000000000000import numpy as np from math import pi import time from compyle.config import get_config from compyle.api import declare, annotate from compyle.parallel import Elementwise from compyle.array import wrap @annotate def point_vortex(xi, yi, xj, yj, gamma, result): xij = xi - xj yij = yi - yj r2ij = xij*xij + yij*yij if r2ij < 1e-14: result[0] = 0.0 result[1] = 0.0 else: tmp = gamma/(2.0*pi*r2ij) result[0] = -tmp*yij result[1] = tmp*xij @annotate def velocity(i, x, y, gamma, u, v, nv): tmp = declare('matrix(2)') xi = x[i] yi = y[i] u[i] = 0.0 v[i] = 0.0 for j in range(nv): point_vortex(xi, yi, x[j], y[j], gamma[j], tmp) u[i] += tmp[0] v[i] += tmp[1] def make_vortices(nv, backend): x = np.linspace(-1, 1, nv) y = x.copy() gamma = np.ones(nv) u = np.zeros_like(x) v = np.zeros_like(x) x, y, gamma, u, v = wrap(x, y, gamma, u, v, backend=backend) return x, y, gamma, u, v, nv def run(nv, backend): e = Elementwise(velocity, backend=backend) args = make_vortices(nv, backend) t1 = time.time() e(*args) print(time.time() - t1) u = args[-3] u.pull() return e, args if __name__ == '__main__': from compyle.utils import ArgumentParser p = ArgumentParser() p.add_argument('-n', action='store', type=int, dest='n', default=10000, help='Number of particles.') o = p.parse_args() run(o.n, o.backend) compyle-release-0.8.1/examples/vm_kernel.py000066400000000000000000000054541414173670100207740ustar00rootroot00000000000000"""Shows the use of a raw opencl Kernel but written using pure Python. 
It makes use of local memory allocated on the host. Note that the local memory is allocated as a multiple of workgroup size times the size of the data type automatically. This is a raw opencl kernel so will not work on Cython! """ import numpy as np from math import pi import time from compyle.api import annotate, declare, get_config, wrap from compyle.low_level import (Kernel, LocalMem, local_barrier, LID_0, LDIM_0, GDIM_0) @annotate(double='xi, yi, xj, yj, gamma', result='doublep') def point_vortex(xi, yi, xj, yj, gamma, result): xij = xi - xj yij = yi - yj r2ij = xij*xij + yij*yij if r2ij < 1e-14: result[0] = 0.0 result[1] = 0.0 else: tmp = gamma/(2.0*pi*r2ij) result[0] = -tmp*yij result[1] = tmp*xij @annotate(nv='int', gdoublep='x, y, gamma, u, v', ldoublep='xc, yc, gc') def velocity(x, y, gamma, u, v, xc, yc, gc, nv): i, gid, nb = declare('int', 3) j, ti, nt, jb = declare('int', 4) ti = LID_0 nt = LDIM_0 gid = GID_0 i = gid*nt + ti idx = declare('int') tmp = declare('matrix(2)') uj, vj = declare('double', 2) nb = GDIM_0 if i < nv: xi = x[i] yi = y[i] uj = 0.0 vj = 0.0 for jb in range(nb): idx = jb*nt + ti if idx < nv: xc[ti] = x[idx] yc[ti] = y[idx] gc[ti] = gamma[idx] else: gc[ti] = 0.0 local_barrier() if i < nv: for j in range(nt): point_vortex(xi, yi, xc[j], yc[j], gc[j], tmp) uj += tmp[0] vj += tmp[1] local_barrier() if i < nv: u[i] = uj v[i] = vj def make_vortices(nv, backend): x = np.linspace(-1, 1, nv) y = x.copy() gamma = np.ones(nv) u = np.zeros_like(x) v = np.zeros_like(x) x, y, gamma, u, v = wrap(x, y, gamma, u, v, backend=backend) xc, yc, gc = (LocalMem(1, backend), LocalMem(1, backend), LocalMem(1, backend)) return x, y, gamma, u, v, xc, yc, gc, nv def run(nv, backend): e = Kernel(velocity, backend=backend) args = make_vortices(nv, backend) t1 = time.time() gs = ((nv + 128 - 1)//128)*128 e(*args, global_size=(gs,)) print(time.time() - t1) u = args[3] u.pull() print(u.data) return e, args if __name__ == '__main__': from compyle.utils import 
ArgumentParser p = ArgumentParser() p.add_argument('-n', action='store', type=int, dest='n', default=10000, help='Number of particles.') o = p.parse_args() assert o.backend in ['opencl', 'cuda'], ("Only OpenCL/CUDA backend is " "supported.") run(o.n, o.backend) compyle-release-0.8.1/examples/vm_numba.py000066400000000000000000000023351414173670100206110ustar00rootroot00000000000000import numpy as np from math import pi import time from numba import jit @jit def point_vortex(xi, yi, xj, yj, gamma, result): xij = xi - xj yij = yi - yj r2ij = xij*xij + yij*yij if r2ij < 1e-14: result[0] = 0.0 result[1] = 0.0 else: tmp = gamma/(2.0*pi*r2ij) result[0] = -tmp*yij result[1] = tmp*xij @jit def velocity(x, y, gamma, u, v, nv): tmp = np.zeros(2) for i in range(nv): xi = x[i] yi = y[i] u[i] = 0.0 v[i] = 0.0 for j in range(nv): point_vortex(xi, yi, x[j], y[j], gamma[j], tmp) u[i] += tmp[0] v[i] += tmp[1] def make_vortices(nv): x = np.linspace(-1, 1, nv) y = x.copy() gamma = np.ones(nv) u = np.zeros_like(x) v = np.zeros_like(x) return x, y, gamma, u, v, nv def run(nv): args = make_vortices(nv) t1 = time.time() velocity(*args) print(time.time() - t1) u = args[-3] print(u) return velocity, args if __name__ == '__main__': from argparse import ArgumentParser p = ArgumentParser() p.add_argument('-n', action='store', type=int, dest='n', default=10000) o = p.parse_args() run(o.n) compyle-release-0.8.1/pyproject.toml000066400000000000000000000002301414173670100175210ustar00rootroot00000000000000[build-system] requires = [ "wheel>=0.29.0", "setuptools>=42.0.0", "oldest-supported-numpy", "Cython>=0.20", "mako", "pytools" ]compyle-release-0.8.1/requirements.txt000066400000000000000000000000411414173670100200710ustar00rootroot00000000000000mako pytools cython numpy pytest compyle-release-0.8.1/setup.py000066400000000000000000000042271414173670100163310ustar00rootroot00000000000000import sys from setuptools import setup, find_packages try: from Cython.Distutils import Extension from 
Cython.Build import cythonize except ImportError: from distutils.core import Extension def cythonize(*args, **kw): return args[0] def get_version(): import os data = {} fname = os.path.join('compyle', '__init__.py') exec(compile(open(fname).read(), fname, 'exec'), data) return data.get('__version__') install_requires = ['mako', 'pytools', 'cython', 'numpy'] tests_require = ['pytest'] if sys.version_info[0] < 3: tests_require += ['mock>=1.0'] docs_require = ['sphinx'] cuda_require = ['pycuda', 'cupy'] opencl_require = ['pyopencl'] classes = ''' Development Status :: 4 - Beta Intended Audience :: Developers Intended Audience :: Science/Research License :: OSI Approved :: BSD License Natural Language :: English Operating System :: MacOS :: MacOS X Operating System :: Microsoft :: Windows Operating System :: POSIX Operating System :: Unix Programming Language :: Python Programming Language :: Python :: 2.7 Programming Language :: Python :: 3 Topic :: Scientific/Engineering Topic :: Software Development :: Code Generators Topic :: Software Development :: Compilers Topic :: Software Development :: Libraries Topic :: Utilities ''' classifiers = [x.strip() for x in classes.splitlines() if x] ext_modules = [ Extension( name="compyle.thrust.sort", sources=["compyle/thrust/sort.pyx"], language="c++" ), ] setup( name='compyle', version=get_version(), author='Prabhu Ramachandran', author_email='prabhu@aero.iitb.ac.in', description='Execute a subset of Python on HPC platforms', long_description=open('README.rst').read(), license="BSD", url='https://github.com/pypr/compyle', classifiers=classifiers, packages=find_packages(), ext_modules=cythonize(ext_modules, language="c++"), install_requires=install_requires, extras_require={ "docs": docs_require, "tests": tests_require, "dev": docs_require + tests_require, "cuda": cuda_require, "opencl": opencl_require, }, )