pyopencl-2013.2/0002755000175000000500000000000012245716343012153 5ustar tomussrcpyopencl-2013.2/aksetup_helper.py0000644000175000000500000006174612245716340015551 0ustar tomussrcimport setuptools # noqa from setuptools import Extension def count_down_delay(delay): from time import sleep import sys while delay: sys.stdout.write("Continuing in %d seconds... \r" % delay) sys.stdout.flush() delay -= 1 sleep(1) print("") DASH_SEPARATOR = 75 * "-" def setup(*args, **kwargs): from setuptools import setup try: setup(*args, **kwargs) except KeyboardInterrupt: raise except SystemExit: raise except: print(DASH_SEPARATOR) print("Sorry, your build failed. Try rerunning configure.py with " "different options.") print(DASH_SEPARATOR) raise class NumpyExtension(Extension): # nicked from # http://mail.python.org/pipermail/distutils-sig/2007-September/008253.html # solution by Michael Hoffmann def __init__(self, *args, **kwargs): Extension.__init__(self, *args, **kwargs) self._include_dirs = self.include_dirs del self.include_dirs # restore overwritten property def get_numpy_incpath(self): from imp import find_module # avoid actually importing numpy, it screws up distutils file, pathname, descr = find_module("numpy") from os.path import join return join(pathname, "core", "include") def get_include_dirs(self): return self._include_dirs + [self.get_numpy_incpath()] def set_include_dirs(self, value): self._include_dirs = value def del_include_dirs(self): pass include_dirs = property(get_include_dirs, set_include_dirs, del_include_dirs) class PyUblasExtension(NumpyExtension): def get_module_include_path(self, name): from pkg_resources import Requirement, resource_filename return resource_filename(Requirement.parse(name), "%s/include" % name) @property def include_dirs(self): return self._include_dirs + [ self.get_numpy_incpath(), self.get_module_include_path("pyublas"), ] class HedgeExtension(PyUblasExtension): @property def include_dirs(self): return self._include_dirs + [ self.get_numpy_incpath(), self.get_module_include_path("pyublas"), self.get_module_include_path("hedge"), ] # {{{ tools def flatten(list): """For an iterable of sub-iterables, generate each member of each sub-iterable in turn, i.e. a flattened version of that super-iterable. Example: Turn [[a,b,c],[d,e,f]] into [a,b,c,d,e,f]. """ for sublist in list: for j in sublist: yield j def humanize(sym_str): words = sym_str.lower().replace("_", " ").split(" ") return " ".join([word.capitalize() for word in words]) # }}} # {{{ siteconf handling def get_config(schema=None, warn_about_no_config=True): if schema is None: from setup import get_config_schema schema = get_config_schema() if (not schema.have_config() and not schema.have_global_config() and warn_about_no_config): print("*************************************************************") print("*** I have detected that you have not run configure.py.") print("*************************************************************") print("*** Additionally, no global config files were found.") print("*** I will go ahead with the default configuration.") print("*** In all likelihood, this will not work out.") print("*** ") print("*** See README_SETUP.txt for more information.") print("*** ") print("*** If the build does fail, just re-run configure.py with the") print("*** correct arguments, and then retry. 
Good luck!") print("*************************************************************") print("*** HIT Ctrl-C NOW IF THIS IS NOT WHAT YOU WANT") print("*************************************************************") count_down_delay(delay=10) return expand_options(schema.read_config()) def hack_distutils(debug=False, fast_link=True, what_opt=3): # hack distutils.sysconfig to eliminate debug flags # stolen from mpi4py def remove_prefixes(optlist, bad_prefixes): for bad_prefix in bad_prefixes: for i, flag in enumerate(optlist): if flag.startswith(bad_prefix): optlist.pop(i) break return optlist import sys if not sys.platform.lower().startswith("win"): from distutils import sysconfig cvars = sysconfig.get_config_vars() cflags = cvars.get('OPT') if cflags: cflags = remove_prefixes(cflags.split(), ['-g', '-O', '-Wstrict-prototypes', '-DNDEBUG']) if debug: cflags.append("-g") else: if what_opt is None: pass else: cflags.append("-O%s" % what_opt) cflags.append("-DNDEBUG") cvars['OPT'] = str.join(' ', cflags) cvars["CFLAGS"] = cvars["BASECFLAGS"] + " " + cvars["OPT"] if fast_link: for varname in ["LDSHARED", "BLDSHARED"]: ldsharedflags = cvars.get(varname) if ldsharedflags: ldsharedflags = remove_prefixes(ldsharedflags.split(), ['-Wl,-O']) cvars[varname] = str.join(' ', ldsharedflags) # }}} # {{{ configure guts def default_or(a, b): if a is None: return b else: return a def expand_str(s, options): import re def my_repl(match): sym = match.group(1) try: repl = options[sym] except KeyError: from os import environ repl = environ[sym] return expand_str(repl, options) return re.subn(r"\$\{([a-zA-Z0-9_]+)\}", my_repl, s)[0] def expand_value(v, options): if isinstance(v, str): return expand_str(v, options) elif isinstance(v, list): result = [] for i in v: try: exp_i = expand_value(i, options) except: pass else: result.append(exp_i) return result else: return v def expand_options(options): return dict( (k, expand_value(v, options)) for k, v in options.items()) class ConfigSchema: def __init__(self, options, conf_file="siteconf.py", conf_dir="."): self.optdict = dict((opt.name, opt) for opt in options) self.options = options self.conf_dir = conf_dir self.conf_file = conf_file from os.path import expanduser self.user_conf_file = expanduser("~/.aksetup-defaults.py") import sys if not sys.platform.lower().startswith("win"): self.global_conf_file = "/etc/aksetup-defaults.py" else: self.global_conf_file = None def get_conf_file(self): import os return os.path.join(self.conf_dir, self.conf_file) def set_conf_dir(self, conf_dir): self.conf_dir = conf_dir def get_default_config(self): return dict((opt.name, opt.default) for opt in self.options) def read_config_from_pyfile(self, filename): result = {} filevars = {} infile = open(filename, "r") try: contents = infile.read() finally: infile.close() exec(compile(contents, filename, "exec"), filevars) for key, value in filevars.items(): if key in self.optdict: result[key] = value return result def update_conf_file(self, filename, config): result = {} filevars = {} try: exec(compile(open(filename, "r").read(), filename, "exec"), filevars) except IOError: pass if "__builtins__" in filevars: del filevars["__builtins__"] for key, value in config.items(): if value is not None: filevars[key] = value keys = filevars.keys() keys.sort() outf = open(filename, "w") for key in keys: outf.write("%s = %s\n" % (key, repr(filevars[key]))) outf.close() return result def update_user_config(self, config): self.update_conf_file(self.user_conf_file, config) def update_global_config(self, 
config): if self.global_conf_file is not None: self.update_conf_file(self.global_conf_file, config) def get_default_config_with_files(self): result = self.get_default_config() import os confignames = [] if self.global_conf_file is not None: confignames.append(self.global_conf_file) confignames.append(self.user_conf_file) for fn in confignames: if os.access(fn, os.R_OK): result.update(self.read_config_from_pyfile(fn)) return result def have_global_config(self): import os result = os.access(self.user_conf_file, os.R_OK) if self.global_conf_file is not None: result = result or os.access(self.global_conf_file, os.R_OK) return result def have_config(self): import os return os.access(self.get_conf_file(), os.R_OK) def read_config(self, warn_if_none=True): import os cfile = self.get_conf_file() result = self.get_default_config_with_files() if os.access(cfile, os.R_OK): filevars = {} exec(compile(open(cfile, "r").read(), cfile, "exec"), filevars) for key, value in filevars.items(): if key in self.optdict: result[key] = value elif key == "__builtins__": pass else: raise KeyError("invalid config key in %s: %s" % ( cfile, key)) return result def add_to_configparser(self, parser, def_config=None): if def_config is None: def_config = self.get_default_config_with_files() for opt in self.options: default = default_or(def_config.get(opt.name), opt.default) opt.add_to_configparser(parser, default) def get_from_configparser(self, options): result = {} for opt in self.options: result[opt.name] = opt.take_from_configparser(options) return result def write_config(self, config): outf = open(self.get_conf_file(), "w") for opt in self.options: value = config[opt.name] if value is not None: outf.write("%s = %s\n" % (opt.name, repr(config[opt.name]))) outf.close() def make_substitutions(self, config): return dict((opt.name, opt.value_to_str(config[opt.name])) for opt in self.options) class Option(object): def __init__(self, name, default=None, help=None): self.name = name self.default = default self.help = help def as_option(self): return self.name.lower().replace("_", "-") def metavar(self): last_underscore = self.name.rfind("_") return self.name[last_underscore+1:] def get_help(self, default): result = self.help if self.default: result += " (default: %s)" % self.value_to_str( default_or(default, self.default)) return result def value_to_str(self, default): return default def add_to_configparser(self, parser, default=None): default = default_or(default, self.default) default_str = self.value_to_str(default) parser.add_option( "--" + self.as_option(), dest=self.name, default=default_str, metavar=self.metavar(), help=self.get_help(default)) def take_from_configparser(self, options): return getattr(options, self.name) class Switch(Option): def add_to_configparser(self, parser, default=None): if not isinstance(self.default, bool): raise ValueError("Switch options must have a default") if default is None: default = self.default option_name = self.as_option() if default: option_name = "no-" + option_name action = "store_false" else: action = "store_true" parser.add_option( "--" + option_name, dest=self.name, help=self.get_help(default), default=default, action=action) class StringListOption(Option): def value_to_str(self, default): if default is None: return None return ",".join([str(el).replace(",", r"\,") for el in default]) def get_help(self, default): return Option.get_help(self, default) + " (several ok)" def take_from_configparser(self, options): opt = getattr(options, self.name) if opt is None: return None else: 
if opt: import re sep = re.compile(r"(? #include #include #include "bitlog.hpp" namespace PYGPU_PACKAGE { template inline T signed_left_shift(T x, signed shift_amount) { if (shift_amount < 0) return x >> -shift_amount; else return x << shift_amount; } template inline T signed_right_shift(T x, signed shift_amount) { if (shift_amount < 0) return x << -shift_amount; else return x >> shift_amount; } template class memory_pool { public: typedef typename Allocator::pointer_type pointer_type; typedef typename Allocator::size_type size_type; private: typedef boost::uint32_t bin_nr_t; typedef std::vector bin_t; typedef boost::ptr_map container_t; container_t m_container; typedef typename container_t::value_type bin_pair_t; std::auto_ptr m_allocator; // A held block is one that's been released by the application, but that // we are keeping around to dish out again. unsigned m_held_blocks; // An active block is one that is in use by the application. unsigned m_active_blocks; bool m_stop_holding; int m_trace; public: memory_pool(Allocator const &alloc=Allocator()) : m_allocator(alloc.copy()), m_held_blocks(0), m_active_blocks(0), m_stop_holding(false), m_trace(false) { if (m_allocator->is_deferred()) { PyErr_WarnEx(PyExc_UserWarning, "Memory pools expect non-deferred " "semantics from their allocators. You passed a deferred " "allocator, i.e. an allocator whose allocations can turn out to " "be unavailable long after allocation.", 1); } } virtual ~memory_pool() { free_held(); } static const unsigned mantissa_bits = 2; static const unsigned mantissa_mask = (1 << mantissa_bits) - 1; static bin_nr_t bin_number(size_type size) { signed l = bitlog2(size); size_type shifted = signed_right_shift(size, l-signed(mantissa_bits)); if (size && (shifted & (1 << mantissa_bits)) == 0) throw std::runtime_error("memory_pool::bin_number: bitlog2 fault"); size_type chopped = shifted & mantissa_mask; return l << mantissa_bits | chopped; } void set_trace(bool flag) { if (flag) ++m_trace; else --m_trace; } static size_type alloc_size(bin_nr_t bin) { bin_nr_t exponent = bin >> mantissa_bits; bin_nr_t mantissa = bin & mantissa_mask; size_type ones = signed_left_shift(1, signed(exponent)-signed(mantissa_bits) ); if (ones) ones -= 1; size_type head = signed_left_shift( (1<second; } void inc_held_blocks() { if (m_held_blocks == 0) start_holding_blocks(); ++m_held_blocks; } void dec_held_blocks() { --m_held_blocks; if (m_held_blocks == 0) stop_holding_blocks(); } virtual void start_holding_blocks() { } virtual void stop_holding_blocks() { } public: pointer_type allocate(size_type size) { bin_nr_t bin_nr = bin_number(size); bin_t &bin = get_bin(bin_nr); if (bin.size()) { if (m_trace) std::cout << "[pool] allocation of size " << size << " served from bin " << bin_nr << " which contained " << bin.size() << " entries" << std::endl; return pop_block_from_bin(bin, size); } size_type alloc_sz = alloc_size(bin_nr); assert(bin_number(alloc_sz) == bin_nr); if (m_trace) std::cout << "[pool] allocation of size " << size << " required new memory" << std::endl; try { return get_from_allocator(alloc_sz); } catch (PYGPU_PACKAGE::error &e) { if (!e.is_out_of_memory()) throw; } if (m_trace) std::cout << "[pool] allocation triggered OOM, running GC" << std::endl; m_allocator->try_release_blocks(); if (bin.size()) return pop_block_from_bin(bin, size); if (m_trace) std::cout << "[pool] allocation still OOM after GC" << std::endl; while (try_to_free_memory()) { try { return get_from_allocator(alloc_sz); } catch (PYGPU_PACKAGE::error &e) { if 
(!e.is_out_of_memory()) throw; } } throw PYGPU_PACKAGE::error( "memory_pool::allocate", #ifdef PYGPU_PYCUDA CUDA_ERROR_OUT_OF_MEMORY, #endif #ifdef PYGPU_PYOPENCL CL_MEM_OBJECT_ALLOCATION_FAILURE, #endif "failed to free memory for allocation"); } void free(pointer_type p, size_type size) { --m_active_blocks; bin_nr_t bin_nr = bin_number(size); if (!m_stop_holding) { inc_held_blocks(); get_bin(bin_nr).push_back(p); if (m_trace) std::cout << "[pool] block of size " << size << " returned to bin " << bin_nr << " which now contains " << get_bin(bin_nr).size() << " entries" << std::endl; } else m_allocator->free(p); } void free_held() { BOOST_FOREACH(bin_pair_t bin_pair, m_container) { bin_t &bin = *bin_pair.second; while (bin.size()) { m_allocator->free(bin.back()); bin.pop_back(); dec_held_blocks(); } } assert(m_held_blocks == 0); } void stop_holding() { m_stop_holding = true; free_held(); } unsigned active_blocks() { return m_active_blocks; } unsigned held_blocks() { return m_held_blocks; } bool try_to_free_memory() { BOOST_FOREACH(bin_pair_t bin_pair, // free largest stuff first std::make_pair(m_container.rbegin(), m_container.rend())) { bin_t &bin = *bin_pair.second; if (bin.size()) { m_allocator->free(bin.back()); bin.pop_back(); dec_held_blocks(); return true; } } return false; } private: pointer_type get_from_allocator(size_type alloc_sz) { pointer_type result = m_allocator->allocate(alloc_sz); ++m_active_blocks; return result; } pointer_type pop_block_from_bin(bin_t &bin, size_type size) { pointer_type result = bin.back(); bin.pop_back(); dec_held_blocks(); ++m_active_blocks; return result; } }; template class pooled_allocation : public boost::noncopyable { public: typedef Pool pool_type; typedef typename Pool::pointer_type pointer_type; typedef typename Pool::size_type size_type; private: boost::shared_ptr m_pool; pointer_type m_ptr; size_type m_size; bool m_valid; public: pooled_allocation(boost::shared_ptr p, size_type size) : m_pool(p), m_ptr(p->allocate(size)), m_size(size), m_valid(true) { } ~pooled_allocation() { if (m_valid) free(); } void free() { if (m_valid) { m_pool->free(m_ptr, m_size); m_valid = false; } else throw PYGPU_PACKAGE::error( "pooled_device_allocation::free", #ifdef PYGPU_PYCUDA CUDA_ERROR_INVALID_HANDLE #endif #ifdef PYGPU_PYOPENCL CL_INVALID_VALUE #endif ); } pointer_type ptr() const { return m_ptr; } size_type size() const { return m_size; } }; } #endif pyopencl-2013.2/src/wrapper/wrap_cl.cpp0000644000175000000500000000060212245716340016546 0ustar tomussrc#include "wrap_cl.hpp" using namespace pyopencl; extern void pyopencl_expose_constants(); extern void pyopencl_expose_part_1(); extern void pyopencl_expose_part_2(); extern void pyopencl_expose_mempool(); BOOST_PYTHON_MODULE(_cl) { pyopencl_expose_constants(); pyopencl_expose_part_1(); pyopencl_expose_part_2(); pyopencl_expose_mempool(); } // vim: foldmethod=marker pyopencl-2013.2/src/wrapper/wrap_helpers.hpp0000644000175000000500000001167012245716340017626 0ustar tomussrc#ifndef PYCUDA_WRAP_HELPERS_HEADER_SEEN #define PYCUDA_WRAP_HELPERS_HEADER_SEEN #include #include #include namespace py = boost::python; #if (BOOST_VERSION/100) < 1035 #warning ******************************************************************* #warning **** Your version of Boost C++ is likely too old for PyOpenCL. 
**** #warning ******************************************************************* #endif #define PYTHON_ERROR(TYPE, REASON) \ { \ PyErr_SetString(PyExc_##TYPE, REASON); \ throw boost::python::error_already_set(); \ } #define ENUM_VALUE(NAME) \ value(#NAME, NAME) #define DEF_SIMPLE_METHOD(NAME) \ def(#NAME, &cls::NAME) #define DEF_SIMPLE_METHOD_WITH_ARGS(NAME, ARGS) \ def(#NAME, &cls::NAME, boost::python::args ARGS) #define DEF_SIMPLE_FUNCTION(NAME) \ boost::python::def(#NAME, &NAME) #define DEF_SIMPLE_FUNCTION_WITH_ARGS(NAME, ARGS) \ boost::python::def(#NAME, &NAME, boost::python::args ARGS) #define DEF_SIMPLE_RO_MEMBER(NAME) \ def_readonly(#NAME, &cls::m_##NAME) #define DEF_SIMPLE_RW_MEMBER(NAME) \ def_readwrite(#NAME, &cls::m_##NAME) #define PYTHON_FOREACH(NAME, ITERABLE) \ BOOST_FOREACH(boost::python::object NAME, \ std::make_pair( \ boost::python::stl_input_iterator(ITERABLE), \ boost::python::stl_input_iterator())) #define COPY_PY_LIST(TYPE, NAME) \ std::copy( \ boost::python::stl_input_iterator(py_##NAME), \ boost::python::stl_input_iterator(), \ std::back_inserter(NAME)); #define COPY_PY_COORD_TRIPLE(NAME) \ size_t NAME[3] = {0, 0, 0}; \ { \ size_t my_len = len(py_##NAME); \ if (my_len > 3) \ throw error("transfer", CL_INVALID_VALUE, #NAME "has too many components"); \ for (size_t i = 0; i < my_len; ++i) \ NAME[i] = py::extract(py_##NAME[i])(); \ } #define COPY_PY_PITCH_TUPLE(NAME) \ size_t NAME[2] = {0, 0}; \ if (py_##NAME.ptr() != Py_None) \ { \ size_t my_len = len(py_##NAME); \ if (my_len > 2) \ throw error("transfer", CL_INVALID_VALUE, #NAME "has too many components"); \ for (size_t i = 0; i < my_len; ++i) \ NAME[i] = py::extract(py_##NAME[i])(); \ } #define COPY_PY_REGION_TRIPLE(NAME) \ size_t NAME[3] = {1, 1, 1}; \ { \ size_t my_len = len(py_##NAME); \ if (my_len > 3) \ throw error("transfer", CL_INVALID_VALUE, #NAME "has too many components"); \ for (size_t i = 0; i < my_len; ++i) \ NAME[i] = py::extract(py_##NAME[i])(); \ } #define PYOPENCL_PARSE_NUMPY_ARRAY_SPEC \ PyArray_Descr *tp_descr; \ if (PyArray_DescrConverter(dtype.ptr(), &tp_descr) != NPY_SUCCEED) \ throw py::error_already_set(); \ \ py::extract shape_as_int(py_shape); \ std::vector shape; \ \ if (shape_as_int.check()) \ shape.push_back(shape_as_int()); \ else \ COPY_PY_LIST(npy_intp, shape); \ \ NPY_ORDER order = PyArray_CORDER; \ PyArray_OrderConverter(py_order.ptr(), &order); \ \ int ary_flags = 0; \ if (order == PyArray_FORTRANORDER) \ ary_flags |= NPY_FARRAY; \ else if (order == PyArray_CORDER) \ ary_flags |= NPY_CARRAY; \ else \ throw std::runtime_error("unrecognized order specifier"); \ \ std::vector strides; \ if (py_strides.ptr() != Py_None) \ { \ COPY_PY_LIST(npy_intp, strides); \ } #define PYOPENCL_RETURN_VECTOR(ITEMTYPE, NAME) \ { \ py::list pyopencl_result; \ BOOST_FOREACH(ITEMTYPE item, NAME) \ pyopencl_result.append(item); \ return pyopencl_result; \ } namespace { template inline boost::python::handle<> handle_from_new_ptr(T *ptr) { return boost::python::handle<>( typename boost::python::manage_new_object::apply::type()(ptr)); } template inline T *from_int_ptr(intptr_t obj_ref) { ClType clobj = (ClType) obj_ref; return new T(clobj, /* retain */ true); } template inline intptr_t to_int_ptr(T const &obj) { return (intptr_t) obj.data(); } } #define PYOPENCL_EXPOSE_TO_FROM_INT_PTR(CL_TYPENAME) \ .def("from_int_ptr", from_int_ptr, \ py::return_value_policy(), \ py::arg("int_ptr_value"), \ "(static method) Return a new Python object referencing the C-level " \ ":c:type:`" #CL_TYPENAME "` object at the 
location pointed to " \ "by *int_ptr_value*. The relevant :c:func:`clRetain*` function " \ "will be called." \ "\n\n.. versionadded:: 2013.2\n") \ .staticmethod("from_int_ptr") \ .add_property("int_ptr", to_int_ptr, \ "Return an integer corresponding to the pointer value " \ "of the underlying :c:type:`" #CL_TYPENAME "`. " \ "Use :meth:`from_int_ptr` to turn back into a Python object." \ "\n\n.. versionadded:: 2013.2\n") \ #endif pyopencl-2013.2/src/wrapper/wrap_cl_part_2.cpp0000644000175000000500000002551612245716340020030 0ustar tomussrc#include "wrap_cl.hpp" namespace pyopencl { #if PYOPENCL_CL_VERSION >= 0x1020 py::object image_desc_dummy_getter(cl_image_desc &desc) { return py::object(); } void image_desc_set_shape(cl_image_desc &desc, py::object py_shape) { COPY_PY_REGION_TRIPLE(shape); desc.image_width = shape[0]; desc.image_height = shape[1]; desc.image_depth = shape[2]; desc.image_array_size = shape[2]; } void image_desc_set_pitches(cl_image_desc &desc, py::object py_pitches) { COPY_PY_PITCH_TUPLE(pitches); desc.image_row_pitch = pitches[0]; desc.image_slice_pitch = pitches[1]; } void image_desc_set_buffer(cl_image_desc &desc, memory_object *mobj) { if (mobj) desc.buffer = mobj->data(); else desc.buffer = 0; } #endif } using namespace pyopencl; void pyopencl_expose_part_2() { py::docstring_options doc_op; doc_op.disable_cpp_signatures(); // {{{ image #if PYOPENCL_CL_VERSION >= 0x1020 { typedef cl_image_desc cls; py::class_("ImageDescriptor") .def_readwrite("image_type", &cls::image_type) .add_property("shape", &image_desc_dummy_getter, image_desc_set_shape) .def_readwrite("array_size", &cls::image_array_size) .add_property("pitches", &image_desc_dummy_getter, image_desc_set_pitches) .def_readwrite("num_mip_levels", &cls::num_mip_levels) .def_readwrite("num_samples", &cls::num_samples) .add_property("buffer", &image_desc_dummy_getter, image_desc_set_buffer) ; } #endif { typedef image cls; py::class_, boost::noncopyable>( "Image", py::no_init) .def("__init__", make_constructor(create_image, py::default_call_policies(), (py::args("context", "flags", "format"), py::arg("shape")=py::object(), py::arg("pitches")=py::object(), py::arg("hostbuf")=py::object() ))) #if PYOPENCL_CL_VERSION >= 0x1020 .def("__init__", make_constructor(create_image_from_desc, py::default_call_policies(), (py::args("context", "flags", "format", "desc"), py::arg("hostbuf")=py::object()))) #endif .DEF_SIMPLE_METHOD(get_image_info) ; } { typedef cl_image_format cls; py::class_("ImageFormat") .def("__init__", py::make_constructor(make_image_format)) .def_readwrite("channel_order", &cls::image_channel_order) .def_readwrite("channel_data_type", &cls::image_channel_data_type) .add_property("channel_count", &get_image_format_channel_count) .add_property("dtype_size", &get_image_format_channel_dtype_size) .add_property("itemsize", &get_image_format_item_size) ; } DEF_SIMPLE_FUNCTION(get_supported_image_formats); py::def("_enqueue_read_image", enqueue_read_image, (py::args("queue", "mem", "origin", "region", "hostbuf"), py::arg("row_pitch")=0, py::arg("slice_pitch")=0, py::arg("wait_for")=py::object(), py::arg("is_blocking")=true ), py::return_value_policy()); py::def("_enqueue_write_image", enqueue_write_image, (py::args("queue", "mem", "origin", "region", "hostbuf"), py::arg("row_pitch")=0, py::arg("slice_pitch")=0, py::arg("wait_for")=py::object(), py::arg("is_blocking")=true ), py::return_value_policy()); py::def("_enqueue_copy_image", enqueue_copy_image, (py::args("queue", "src", "dest", "src_origin", "dest_origin", 
"region"), py::arg("wait_for")=py::object()), py::return_value_policy()); py::def("_enqueue_copy_image_to_buffer", enqueue_copy_image_to_buffer, (py::args("queue", "src", "dest", "origin", "region", "offset"), py::arg("wait_for")=py::object()), py::return_value_policy()); py::def("_enqueue_copy_buffer_to_image", enqueue_copy_buffer_to_image, (py::args("queue", "src", "dest", "offset", "origin", "region"), py::arg("wait_for")=py::object()), py::return_value_policy()); #if PYOPENCL_CL_VERSION >= 0x1020 py::def("enqueue_fill_image", enqueue_write_image, (py::args("queue", "mem", "color", "origin", "region"), py::arg("wait_for")=py::object()), py::return_value_policy()); #endif // }}} // {{{ memory_map { typedef memory_map cls; py::class_("MemoryMap", py::no_init) .def("release", &cls::release, (py::arg("queue")=0, py::arg("wait_for")=py::object()), py::return_value_policy()) ; } py::def("enqueue_map_buffer", enqueue_map_buffer, (py::args("queue", "buf", "flags", "offset", "shape", "dtype"), py::arg("order")="C", py::arg("strides")=py::object(), py::arg("wait_for")=py::object(), py::arg("is_blocking")=true)); py::def("enqueue_map_image", enqueue_map_image, (py::args("queue", "img", "flags", "origin", "region", "shape", "dtype"), py::arg("order")="C", py::arg("strides")=py::object(), py::arg("wait_for")=py::object(), py::arg("is_blocking")=true)); // }}} // {{{ sampler { typedef sampler cls; py::class_("Sampler", py::init()) .DEF_SIMPLE_METHOD(get_info) .def(py::self == py::self) .def(py::self != py::self) .def("__hash__", &cls::hash) PYOPENCL_EXPOSE_TO_FROM_INT_PTR(cl_sampler) ; } // }}} // {{{ program { typedef program cls; py::enum_("program_kind") .value("UNKNOWN", cls::KND_UNKNOWN) .value("SOURCE", cls::KND_SOURCE) .value("BINARY", cls::KND_BINARY) ; py::class_("_Program", py::no_init) .def("__init__", make_constructor( create_program_with_source, py::default_call_policies(), py::args("context", "src"))) .def("__init__", make_constructor( create_program_with_binary, py::default_call_policies(), py::args("context", "devices", "binaries"))) #if (PYOPENCL_CL_VERSION >= 0x1020) && \ ((PYOPENCL_CL_VERSION >= 0x1030) && defined(__APPLE__)) .def("create_with_built_in_kernels", create_program_with_built_in_kernels, py::args("context", "devices", "kernel_names"), py::return_value_policy()) .staticmethod("create_with_built_in_kernels") #endif .DEF_SIMPLE_METHOD(kind) .DEF_SIMPLE_METHOD(get_info) .DEF_SIMPLE_METHOD(get_build_info) .def("_build", &cls::build, (py::arg("options")="", py::arg("devices")=py::object())) #if PYOPENCL_CL_VERSION >= 0x1020 .def("compile", &cls::compile, (py::arg("options")="", py::arg("devices")=py::object(), py::arg("headers")=py::list())) .def("link", &link_program, (py::arg("context"), py::arg("programs"), py::arg("options")="", py::arg("devices")=py::object()), py::return_value_policy()) .staticmethod("link") #endif .def(py::self == py::self) .def(py::self != py::self) .def("__hash__", &cls::hash) .def("all_kernels", create_kernels_in_program) PYOPENCL_EXPOSE_TO_FROM_INT_PTR(cl_program) ; } #if PYOPENCL_CL_VERSION >= 0x1020 py::def("unload_platform_compiler", unload_platform_compiler); #endif // }}} // {{{ kernel { typedef kernel cls; py::class_("Kernel", py::init()) .DEF_SIMPLE_METHOD(get_info) .DEF_SIMPLE_METHOD(get_work_group_info) .DEF_SIMPLE_METHOD(set_arg) #if PYOPENCL_CL_VERSION >= 0x1020 .DEF_SIMPLE_METHOD(get_arg_info) #endif .def(py::self == py::self) .def(py::self != py::self) .def("__hash__", &cls::hash) PYOPENCL_EXPOSE_TO_FROM_INT_PTR(cl_kernel) ; } { 
typedef local_memory cls; py::class_("LocalMemory", py::init(py::arg("size"))) .add_property("size", &cls::size) ; } py::def("enqueue_nd_range_kernel", enqueue_nd_range_kernel, (py::args("queue", "kernel"), py::arg("global_work_size"), py::arg("local_work_size"), py::arg("global_work_offset")=py::object(), py::arg("wait_for")=py::object(), py::arg("g_times_l")=false ), py::return_value_policy()); py::def("enqueue_task", enqueue_task, (py::args("queue", "kernel"), py::arg("wait_for")=py::object() ), py::return_value_policy()); // TODO: clEnqueueNativeKernel // }}} // {{{ GL interop DEF_SIMPLE_FUNCTION(have_gl); #ifdef HAVE_GL #ifdef __APPLE__ DEF_SIMPLE_FUNCTION(get_apple_cgl_share_group); #endif /* __APPLE__ */ { typedef gl_buffer cls; py::class_, boost::noncopyable>( "GLBuffer", py::no_init) .def("__init__", make_constructor(create_from_gl_buffer, py::default_call_policies(), (py::args("context", "flags", "bufobj")))) .def("get_gl_object_info", get_gl_object_info) ; } { typedef gl_renderbuffer cls; py::class_, boost::noncopyable>( "GLRenderBuffer", py::no_init) .def("__init__", make_constructor(create_from_gl_renderbuffer, py::default_call_policies(), (py::args("context", "flags", "bufobj")))) .def("get_gl_object_info", get_gl_object_info) ; } { typedef gl_texture cls; py::class_, boost::noncopyable>( "GLTexture", py::no_init) .def("__init__", make_constructor(create_from_gl_texture, py::default_call_policies(), (py::args("context", "flags", "texture_target", "miplevel", "texture", "dims")))) .def("get_gl_object_info", get_gl_object_info) .DEF_SIMPLE_METHOD(get_gl_texture_info) ; } py::def("enqueue_acquire_gl_objects", enqueue_acquire_gl_objects, (py::args("queue", "mem_objects"), py::arg("wait_for")=py::object() ), py::return_value_policy()); py::def("enqueue_release_gl_objects", enqueue_release_gl_objects, (py::args("queue", "mem_objects"), py::arg("wait_for")=py::object() ), py::return_value_policy()); #if defined(cl_khr_gl_sharing) && (cl_khr_gl_sharing >= 1) py::def("get_gl_context_info_khr", get_gl_context_info_khr, (py::args("properties", "param_name"), py::arg("platform")=py::object())); #endif #endif // }}} } // vim: foldmethod=marker pyopencl-2013.2/src/wrapper/tools.hpp0000644000175000000500000000114112245716340016263 0ustar tomussrc#ifndef _ASDFDAFVVAFF_PYCUDA_HEADER_SEEN_TOOLS_HPP #define _ASDFDAFVVAFF_PYCUDA_HEADER_SEEN_TOOLS_HPP #include #include #include "numpy_init.hpp" namespace pyopencl { inline npy_intp size_from_dims(int ndim, const npy_intp *dims) { if (ndim != 0) return std::accumulate(dims, dims+ndim, 1, std::multiplies()); else return 1; } inline void run_python_gc() { namespace py = boost::python; py::object gc_mod( py::handle<>( PyImport_ImportModule("gc"))); gc_mod.attr("collect")(); } } #endif pyopencl-2013.2/src/wrapper/bitlog.hpp0000644000175000000500000000142012245716340016403 0ustar tomussrc// Base-2 logarithm bithack. 
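// [Editor's note -- illustrative comment, not part of the original pyopencl
// source.] bitlog2() below returns floor(log2(v)); memory_pool (mempool.hpp
// above) combines it with mantissa_bits = 2 to group allocation sizes into
// bins. A worked example for a 100-byte request:
//   bitlog2(100)            = 6               (64 <= 100 < 128)
//   shifted  = 100 >> (6-2) = 6  (0b110)      (keep the top 1+2 bits)
//   chopped  = 6 & 0b11     = 2
//   bin_nr   = 6 << 2 | 2   = 26
//   alloc_size(26)          = (0b110 << 4) | 0b1111 = 111
// So a 100-byte allocation is served from the 111-byte bin, and sizes within
// one bin differ by at most roughly 25% thanks to the two mantissa bits,
// which is what the assert(bin_number(alloc_sz) == bin_nr) above relies on.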
#ifndef _AFJDFJSDFSD_PYOPENCL_HEADER_SEEN_BITLOG_HPP #define _AFJDFJSDFSD_PYOPENCL_HEADER_SEEN_BITLOG_HPP #include #include namespace pyopencl { extern const char log_table_8[]; inline unsigned bitlog2_16(boost::uint16_t v) { if (unsigned long t = v >> 8) return 8+log_table_8[t]; else return log_table_8[v]; } inline unsigned bitlog2_32(boost::uint32_t v) { if (boost::uint16_t t = v >> 16) return 16+bitlog2_16(t); else return bitlog2_16(v); } inline unsigned bitlog2(unsigned long v) { #if (ULONG_MAX != 4294967295) if (boost::uint32_t t = v >> 32) return 32+bitlog2_32(t); else #endif return bitlog2_32(v); } } #endif pyopencl-2013.2/src/wrapper/wrap_cl.hpp0000644000175000000500000032555612245716340016575 0ustar tomussrc#ifndef _AFJHAYYTA_PYOPENCL_HEADER_SEEN_WRAP_CL_HPP #define _AFJHAYYTA_PYOPENCL_HEADER_SEEN_WRAP_CL_HPP // CL 1.2 undecided: // clSetPrintfCallback // {{{ includes #define CL_USE_DEPRECATED_OPENCL_1_1_APIS #ifdef __APPLE__ // Mac ------------------------------------------------------------------------ #include #ifdef HAVE_GL #define PYOPENCL_GL_SHARING_VERSION 1 #include #include #include #endif #else // elsewhere ------------------------------------------------------------------ #include #include #if defined(_WIN32) #define NOMINMAX #include #endif #ifdef HAVE_GL #include #include #endif #if defined(cl_khr_gl_sharing) && (cl_khr_gl_sharing >= 1) #define PYOPENCL_GL_SHARING_VERSION cl_khr_gl_sharing #endif #endif #include #include #include #include #include #include #include #include #include "wrap_helpers.hpp" #include "numpy_init.hpp" #include "tools.hpp" #ifdef PYOPENCL_PRETEND_CL_VERSION #define PYOPENCL_CL_VERSION PYOPENCL_PRETEND_CL_VERSION #else #if defined(CL_VERSION_1_2) #define PYOPENCL_CL_VERSION 0x1020 #elif defined(CL_VERSION_1_1) #define PYOPENCL_CL_VERSION 0x1010 #else #define PYOPENCL_CL_VERSION 0x1000 #endif #endif // }}} // {{{ tools #if PY_VERSION_HEX >= 0x02050000 typedef Py_ssize_t PYOPENCL_BUFFER_SIZE_T; #else typedef int PYOPENCL_BUFFER_SIZE_T; #endif #define PYOPENCL_CAST_BOOL(B) ((B) ? CL_TRUE : CL_FALSE) #define PYOPENCL_DEPRECATED(WHAT, KILL_VERSION, EXTRA_MSG) \ { \ PyErr_Warn( \ PyExc_DeprecationWarning, \ WHAT " is deprecated and will stop working in PyOpenCL " KILL_VERSION". " \ EXTRA_MSG); \ } #if PYOPENCL_CL_VERSION >= 0x1020 #define PYOPENCL_GET_EXT_FUN(PLATFORM, NAME, VAR) \ NAME##_fn VAR \ = (NAME##_fn) \ clGetExtensionFunctionAddressForPlatform(PLATFORM, #NAME); \ \ if (!VAR) \ throw error(#NAME, CL_INVALID_VALUE, #NAME \ "not available"); #else #define PYOPENCL_GET_EXT_FUN(PLATFORM, NAME, VAR) \ NAME##_fn VAR \ = (NAME##_fn) \ clGetExtensionFunctionAddress(#NAME); \ \ if (!VAR) \ throw error(#NAME, CL_INVALID_VALUE, #NAME \ "not available"); #endif #define PYOPENCL_PARSE_PY_DEVICES \ std::vector devices_vec; \ cl_uint num_devices; \ cl_device_id *devices; \ \ if (py_devices.ptr() == Py_None) \ { \ num_devices = 0; \ devices = 0; \ } \ else \ { \ PYTHON_FOREACH(py_dev, py_devices) \ devices_vec.push_back( \ py::extract(py_dev)().data()); \ num_devices = devices_vec.size(); \ devices = devices_vec.empty( ) ? NULL : &devices_vec.front(); \ } \ #define PYOPENCL_RETRY_RETURN_IF_MEM_ERROR(OPERATION) \ try \ { \ OPERATION \ } \ catch (pyopencl::error &e) \ { \ if (!e.is_out_of_memory()) \ throw; \ } \ \ /* If we get here, we got an error from CL. * We should run the Python GC to try and free up * some memory references. */ \ run_python_gc(); \ \ /* Now retry the allocation. If it fails again, * let it fail. 
*/ \ { \ OPERATION \ } #define PYOPENCL_RETRY_IF_MEM_ERROR(OPERATION) \ { \ bool failed_with_mem_error = false; \ try \ { \ OPERATION \ } \ catch (pyopencl::error &e) \ { \ failed_with_mem_error = true; \ if (!e.is_out_of_memory()) \ throw; \ } \ \ if (failed_with_mem_error) \ { \ /* If we get here, we got an error from CL. * We should run the Python GC to try and free up * some memory references. */ \ run_python_gc(); \ \ /* Now retry the allocation. If it fails again, * let it fail. */ \ { \ OPERATION \ } \ } \ } // }}} // {{{ tracing and error reporting #ifdef PYOPENCL_TRACE #define PYOPENCL_PRINT_CALL_TRACE(NAME) \ std::cerr << NAME << std::endl; #define PYOPENCL_PRINT_CALL_TRACE_INFO(NAME, EXTRA_INFO) \ std::cerr << NAME << " (" << EXTRA_INFO << ')' << std::endl; #else #define PYOPENCL_PRINT_CALL_TRACE(NAME) /*nothing*/ #define PYOPENCL_PRINT_CALL_TRACE_INFO(NAME, EXTRA_INFO) /*nothing*/ #endif #define PYOPENCL_CALL_GUARDED_THREADED_WITH_TRACE_INFO(NAME, ARGLIST, TRACE_INFO) \ { \ PYOPENCL_PRINT_CALL_TRACE_INFO(#NAME, TRACE_INFO); \ cl_int status_code; \ Py_BEGIN_ALLOW_THREADS \ status_code = NAME ARGLIST; \ Py_END_ALLOW_THREADS \ if (status_code != CL_SUCCESS) \ throw pyopencl::error(#NAME, status_code);\ } #define PYOPENCL_CALL_GUARDED_WITH_TRACE_INFO(NAME, ARGLIST, TRACE_INFO) \ { \ PYOPENCL_PRINT_CALL_TRACE_INFO(#NAME, TRACE_INFO); \ cl_int status_code; \ status_code = NAME ARGLIST; \ if (status_code != CL_SUCCESS) \ throw pyopencl::error(#NAME, status_code);\ } #define PYOPENCL_CALL_GUARDED_THREADED(NAME, ARGLIST) \ { \ PYOPENCL_PRINT_CALL_TRACE(#NAME); \ cl_int status_code; \ Py_BEGIN_ALLOW_THREADS \ status_code = NAME ARGLIST; \ Py_END_ALLOW_THREADS \ if (status_code != CL_SUCCESS) \ throw pyopencl::error(#NAME, status_code);\ } #define PYOPENCL_CALL_GUARDED(NAME, ARGLIST) \ { \ PYOPENCL_PRINT_CALL_TRACE(#NAME); \ cl_int status_code; \ status_code = NAME ARGLIST; \ if (status_code != CL_SUCCESS) \ throw pyopencl::error(#NAME, status_code);\ } #define PYOPENCL_CALL_GUARDED_CLEANUP(NAME, ARGLIST) \ { \ PYOPENCL_PRINT_CALL_TRACE(#NAME); \ cl_int status_code; \ status_code = NAME ARGLIST; \ if (status_code != CL_SUCCESS) \ std::cerr \ << "PyOpenCL WARNING: a clean-up operation failed (dead context maybe?)" \ << std::endl \ << #NAME " failed with code " << status_code \ << std::endl; \ } // }}} // {{{ get_info helpers #define PYOPENCL_GET_OPAQUE_INFO(WHAT, FIRST_ARG, SECOND_ARG, CL_TYPE, TYPE) \ { \ CL_TYPE param_value; \ PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \ (FIRST_ARG, SECOND_ARG, sizeof(param_value), ¶m_value, 0)); \ if (param_value) \ return py::object(handle_from_new_ptr( \ new TYPE(param_value, /*retain*/ true))); \ else \ return py::object(); \ } #define PYOPENCL_GET_VEC_INFO(WHAT, FIRST_ARG, SECOND_ARG, RES_VEC) \ { \ size_t size; \ PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \ (FIRST_ARG, SECOND_ARG, 0, 0, &size)); \ \ RES_VEC.resize(size / sizeof(RES_VEC.front())); \ \ PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \ (FIRST_ARG, SECOND_ARG, size, \ RES_VEC.empty( ) ? NULL : &RES_VEC.front(), &size)); \ } #define PYOPENCL_GET_STR_INFO(WHAT, FIRST_ARG, SECOND_ARG) \ { \ size_t param_value_size; \ PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \ (FIRST_ARG, SECOND_ARG, 0, 0, ¶m_value_size)); \ \ std::vector param_value(param_value_size); \ PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \ (FIRST_ARG, SECOND_ARG, param_value_size, \ param_value.empty( ) ? NULL : ¶m_value.front(), ¶m_value_size)); \ \ return py::object( \ param_value.empty( ) ? 
"" : std::string(¶m_value.front(), param_value_size-1)); \ } #define PYOPENCL_GET_INTEGRAL_INFO(WHAT, FIRST_ARG, SECOND_ARG, TYPE) \ { \ TYPE param_value; \ PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \ (FIRST_ARG, SECOND_ARG, sizeof(param_value), ¶m_value, 0)); \ return py::object(param_value); \ } // }}} // {{{ event helpers -------------------------------------------------------------- #define PYOPENCL_PARSE_WAIT_FOR \ cl_uint num_events_in_wait_list = 0; \ std::vector event_wait_list; \ \ if (py_wait_for.ptr() != Py_None) \ { \ event_wait_list.resize(len(py_wait_for)); \ PYTHON_FOREACH(evt, py_wait_for) \ event_wait_list[num_events_in_wait_list++] = \ py::extract(evt)().data(); \ } #define PYOPENCL_WAITLIST_ARGS \ num_events_in_wait_list, event_wait_list.empty( ) ? NULL : &event_wait_list.front() #define PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, obj) \ try \ { \ return new nanny_event(evt, false, obj); \ } \ catch (...) \ { \ clReleaseEvent(evt); \ throw; \ } #define PYOPENCL_RETURN_NEW_EVENT(evt) \ try \ { \ return new event(evt, false); \ } \ catch (...) \ { \ clReleaseEvent(evt); \ throw; \ } // }}} // {{{ equality testing #define PYOPENCL_EQUALITY_TESTS(cls) \ bool operator==(cls const &other) const \ { return data() == other.data(); } \ bool operator!=(cls const &other) const \ { return data() != other.data(); } \ long hash() const \ { return (long) (intptr_t) data(); } // }}} namespace pyopencl { // {{{ error class error : public std::runtime_error { private: const char *m_routine; cl_int m_code; public: error(const char *rout, cl_int c, const char *msg="") : std::runtime_error(msg), m_routine(rout), m_code(c) { } const char *routine() const { return m_routine; } cl_int code() const { return m_code; } bool is_out_of_memory() const { return (code() == CL_MEM_OBJECT_ALLOCATION_FAILURE || code() == CL_OUT_OF_RESOURCES || code() == CL_OUT_OF_HOST_MEMORY); } }; // }}} inline py::tuple get_cl_header_version() { return py::make_tuple( PYOPENCL_CL_VERSION >> (3*4), (PYOPENCL_CL_VERSION >> (1*4)) & 0xff ); } // {{{ platform class platform : boost::noncopyable { private: cl_platform_id m_platform; public: platform(cl_platform_id pid) : m_platform(pid) { } platform(cl_platform_id pid, bool /*retain (ignored)*/) : m_platform(pid) { } cl_platform_id data() const { return m_platform; } PYOPENCL_EQUALITY_TESTS(platform); py::object get_info(cl_platform_info param_name) const { switch (param_name) { case CL_PLATFORM_PROFILE: case CL_PLATFORM_VERSION: case CL_PLATFORM_NAME: case CL_PLATFORM_VENDOR: #if !(defined(CL_PLATFORM_NVIDIA) && CL_PLATFORM_NVIDIA == 0x3001) case CL_PLATFORM_EXTENSIONS: #endif PYOPENCL_GET_STR_INFO(Platform, m_platform, param_name); default: throw error("Platform.get_info", CL_INVALID_VALUE); } } py::list get_devices(cl_device_type devtype); }; inline py::list get_platforms() { cl_uint num_platforms = 0; PYOPENCL_CALL_GUARDED(clGetPlatformIDs, (0, 0, &num_platforms)); std::vector platforms(num_platforms); PYOPENCL_CALL_GUARDED(clGetPlatformIDs, (num_platforms, platforms.empty( ) ? 
NULL : &platforms.front(), &num_platforms)); py::list result; BOOST_FOREACH(cl_platform_id pid, platforms) result.append(handle_from_new_ptr( new platform(pid))); return result; } // }}} // {{{ device class device : boost::noncopyable { public: enum reference_type_t { REF_NOT_OWNABLE, REF_FISSION_EXT, #if PYOPENCL_CL_VERSION >= 0x1020 REF_CL_1_2, #endif }; private: cl_device_id m_device; reference_type_t m_ref_type; public: device(cl_device_id did) : m_device(did), m_ref_type(REF_NOT_OWNABLE) { } device(cl_device_id did, bool retain, reference_type_t ref_type=REF_NOT_OWNABLE) : m_device(did), m_ref_type(ref_type) { if (retain && ref_type != REF_NOT_OWNABLE) { if (false) { } #if (defined(cl_ext_device_fission) && defined(PYOPENCL_USE_DEVICE_FISSION)) else if (ref_type == REF_FISSION_EXT) { #if PYOPENCL_CL_VERSION >= 0x1020 cl_platform_id plat; PYOPENCL_CALL_GUARDED(clGetDeviceInfo, (m_device, CL_DEVICE_PLATFORM, sizeof(plat), &plat, NULL)); #endif PYOPENCL_GET_EXT_FUN(plat, clRetainDeviceEXT, retain_func); PYOPENCL_CALL_GUARDED(retain_func, (did)); } #endif #if PYOPENCL_CL_VERSION >= 0x1020 else if (ref_type == REF_CL_1_2) { PYOPENCL_CALL_GUARDED(clRetainDevice, (did)); } #endif else throw error("Device", CL_INVALID_VALUE, "cannot own references to devices when device fission or CL 1.2 is not available"); } } ~device() { if (false) { } #if defined(cl_ext_device_fission) && defined(PYOPENCL_USE_DEVICE_FISSION) else if (m_ref_type == REF_FISSION_EXT) { #if PYOPENCL_CL_VERSION >= 0x1020 cl_platform_id plat; PYOPENCL_CALL_GUARDED(clGetDeviceInfo, (m_device, CL_DEVICE_PLATFORM, sizeof(plat), &plat, NULL)); #endif PYOPENCL_GET_EXT_FUN(plat, clReleaseDeviceEXT, release_func); PYOPENCL_CALL_GUARDED_CLEANUP(release_func, (m_device)); } #endif #if PYOPENCL_CL_VERSION >= 0x1020 else if (m_ref_type == REF_CL_1_2) PYOPENCL_CALL_GUARDED(clReleaseDevice, (m_device)); #endif } cl_device_id data() const { return m_device; } PYOPENCL_EQUALITY_TESTS(device); py::object get_info(cl_device_info param_name) const { #define DEV_GET_INT_INF(TYPE) \ PYOPENCL_GET_INTEGRAL_INFO(Device, m_device, param_name, TYPE); switch (param_name) { case CL_DEVICE_TYPE: DEV_GET_INT_INF(cl_device_type); case CL_DEVICE_VENDOR_ID: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_MAX_COMPUTE_UNITS: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_MAX_WORK_GROUP_SIZE: DEV_GET_INT_INF(size_t); case CL_DEVICE_MAX_WORK_ITEM_SIZES: { std::vector result; PYOPENCL_GET_VEC_INFO(Device, m_device, param_name, result); PYOPENCL_RETURN_VECTOR(size_t, result); } case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_MAX_CLOCK_FREQUENCY: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_ADDRESS_BITS: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_MAX_READ_IMAGE_ARGS: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_MAX_WRITE_IMAGE_ARGS: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_MAX_MEM_ALLOC_SIZE: DEV_GET_INT_INF(cl_ulong); case CL_DEVICE_IMAGE2D_MAX_WIDTH: DEV_GET_INT_INF(size_t); case CL_DEVICE_IMAGE2D_MAX_HEIGHT: DEV_GET_INT_INF(size_t); case CL_DEVICE_IMAGE3D_MAX_WIDTH: DEV_GET_INT_INF(size_t); case CL_DEVICE_IMAGE3D_MAX_HEIGHT: 
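// [Editor's note -- illustrative comment, not in the original source.] Each
// case in this switch forwards one cl_device_info query to clGetDeviceInfo
// through the DEV_GET_INT_INF / PYOPENCL_GET_*_INFO macros and boxes the
// result as a Python object. From Python this surfaces as, for example,
//   dev.get_info(cl.device_info.IMAGE3D_MAX_HEIGHT)
// (assuming the usual `import pyopencl as cl` and a Device instance `dev`).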
DEV_GET_INT_INF(size_t); case CL_DEVICE_IMAGE3D_MAX_DEPTH: DEV_GET_INT_INF(size_t); case CL_DEVICE_IMAGE_SUPPORT: DEV_GET_INT_INF(cl_bool); case CL_DEVICE_MAX_PARAMETER_SIZE: DEV_GET_INT_INF(size_t); case CL_DEVICE_MAX_SAMPLERS: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_MEM_BASE_ADDR_ALIGN: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_SINGLE_FP_CONFIG: DEV_GET_INT_INF(cl_device_fp_config); #ifdef CL_DEVICE_DOUBLE_FP_CONFIG case CL_DEVICE_DOUBLE_FP_CONFIG: DEV_GET_INT_INF(cl_device_fp_config); #endif #ifdef CL_DEVICE_HALF_FP_CONFIG case CL_DEVICE_HALF_FP_CONFIG: DEV_GET_INT_INF(cl_device_fp_config); #endif case CL_DEVICE_GLOBAL_MEM_CACHE_TYPE: DEV_GET_INT_INF(cl_device_mem_cache_type); case CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_GLOBAL_MEM_CACHE_SIZE: DEV_GET_INT_INF(cl_ulong); case CL_DEVICE_GLOBAL_MEM_SIZE: DEV_GET_INT_INF(cl_ulong); case CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: DEV_GET_INT_INF(cl_ulong); case CL_DEVICE_MAX_CONSTANT_ARGS: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_LOCAL_MEM_TYPE: DEV_GET_INT_INF(cl_device_local_mem_type); case CL_DEVICE_LOCAL_MEM_SIZE: DEV_GET_INT_INF(cl_ulong); case CL_DEVICE_ERROR_CORRECTION_SUPPORT: DEV_GET_INT_INF(cl_bool); case CL_DEVICE_PROFILING_TIMER_RESOLUTION: DEV_GET_INT_INF(size_t); case CL_DEVICE_ENDIAN_LITTLE: DEV_GET_INT_INF(cl_bool); case CL_DEVICE_AVAILABLE: DEV_GET_INT_INF(cl_bool); case CL_DEVICE_COMPILER_AVAILABLE: DEV_GET_INT_INF(cl_bool); case CL_DEVICE_EXECUTION_CAPABILITIES: DEV_GET_INT_INF(cl_device_exec_capabilities); case CL_DEVICE_QUEUE_PROPERTIES: DEV_GET_INT_INF(cl_command_queue_properties); case CL_DEVICE_NAME: case CL_DEVICE_VENDOR: case CL_DRIVER_VERSION: case CL_DEVICE_PROFILE: case CL_DEVICE_VERSION: case CL_DEVICE_EXTENSIONS: PYOPENCL_GET_STR_INFO(Device, m_device, param_name); case CL_DEVICE_PLATFORM: PYOPENCL_GET_OPAQUE_INFO(Device, m_device, param_name, cl_platform_id, platform); #if PYOPENCL_CL_VERSION >= 0x1010 case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_NATIVE_VECTOR_WIDTH_INT: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_HOST_UNIFIED_MEMORY: DEV_GET_INT_INF(cl_bool); case CL_DEVICE_OPENCL_C_VERSION: PYOPENCL_GET_STR_INFO(Device, m_device, param_name); #endif #ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV case CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV: case CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV: case CL_DEVICE_REGISTERS_PER_BLOCK_NV: case CL_DEVICE_WARP_SIZE_NV: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_GPU_OVERLAP_NV: case CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV: case CL_DEVICE_INTEGRATED_MEMORY_NV: DEV_GET_INT_INF(cl_bool); #endif #if defined(cl_ext_device_fission) && defined(PYOPENCL_USE_DEVICE_FISSION) case CL_DEVICE_PARENT_DEVICE_EXT: PYOPENCL_GET_OPAQUE_INFO(Device, m_device, param_name, cl_device_id, device); case CL_DEVICE_PARTITION_TYPES_EXT: case CL_DEVICE_AFFINITY_DOMAINS_EXT: case CL_DEVICE_PARTITION_STYLE_EXT: { std::vector result; PYOPENCL_GET_VEC_INFO(Device, m_device, param_name, result); PYOPENCL_RETURN_VECTOR(cl_device_partition_property_ext, result); } case 
CL_DEVICE_REFERENCE_COUNT_EXT: DEV_GET_INT_INF(cl_uint); #endif #if PYOPENCL_CL_VERSION >= 0x1020 case CL_DEVICE_LINKER_AVAILABLE: DEV_GET_INT_INF(cl_bool); case CL_DEVICE_BUILT_IN_KERNELS: PYOPENCL_GET_STR_INFO(Device, m_device, param_name); case CL_DEVICE_IMAGE_MAX_BUFFER_SIZE: DEV_GET_INT_INF(size_t); case CL_DEVICE_IMAGE_MAX_ARRAY_SIZE: DEV_GET_INT_INF(size_t); case CL_DEVICE_PARENT_DEVICE: PYOPENCL_GET_OPAQUE_INFO(Device, m_device, param_name, cl_device_id, device); case CL_DEVICE_PARTITION_MAX_SUB_DEVICES: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_PARTITION_TYPE: case CL_DEVICE_PARTITION_PROPERTIES: { std::vector result; PYOPENCL_GET_VEC_INFO(Device, m_device, param_name, result); PYOPENCL_RETURN_VECTOR(cl_device_partition_property, result); } case CL_DEVICE_PARTITION_AFFINITY_DOMAIN: { std::vector result; PYOPENCL_GET_VEC_INFO(Device, m_device, param_name, result); PYOPENCL_RETURN_VECTOR(cl_device_affinity_domain, result); } case CL_DEVICE_REFERENCE_COUNT: DEV_GET_INT_INF(cl_uint); case CL_DEVICE_PREFERRED_INTEROP_USER_SYNC: DEV_GET_INT_INF(cl_bool); case CL_DEVICE_PRINTF_BUFFER_SIZE: DEV_GET_INT_INF(cl_bool); #endif // {{{ AMD dev attrs // // types of AMD dev attrs divined from // https://www.khronos.org/registry/cl/api/1.2/cl.hpp #ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD case CL_DEVICE_PROFILING_TIMER_OFFSET_AMD: DEV_GET_INT_INF(cl_ulong); #endif /* FIXME #ifdef CL_DEVICE_TOPOLOGY_AMD case CL_DEVICE_TOPOLOGY_AMD: #endif */ #ifdef CL_DEVICE_BOARD_NAME_AMD case CL_DEVICE_BOARD_NAME_AMD: ; PYOPENCL_GET_STR_INFO(Device, m_device, param_name); #endif #ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD case CL_DEVICE_GLOBAL_FREE_MEMORY_AMD: { std::vector result; PYOPENCL_GET_VEC_INFO(Device, m_device, param_name, result); PYOPENCL_RETURN_VECTOR(size_t, result); } #endif #ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD case CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD: DEV_GET_INT_INF(cl_uint); #endif #ifdef CL_DEVICE_SIMD_WIDTH_AMD case CL_DEVICE_SIMD_WIDTH_AMD: DEV_GET_INT_INF(cl_uint); #endif #ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD case CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD: DEV_GET_INT_INF(cl_uint); #endif #ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD case CL_DEVICE_WAVEFRONT_WIDTH_AMD: DEV_GET_INT_INF(cl_uint); #endif #ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD case CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD: DEV_GET_INT_INF(cl_uint); #endif #ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD case CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD: DEV_GET_INT_INF(cl_uint); #endif #ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD case CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD: DEV_GET_INT_INF(cl_uint); #endif #ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD case CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD: DEV_GET_INT_INF(cl_uint); #endif #ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD case CL_DEVICE_LOCAL_MEM_BANKS_AMD: DEV_GET_INT_INF(cl_uint); #endif // }}} #ifdef CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT case CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT: DEV_GET_INT_INF(cl_uint); #endif default: throw error("Device.get_info", CL_INVALID_VALUE); } } #if PYOPENCL_CL_VERSION >= 0x1020 py::list create_sub_devices(py::object py_properties) { std::vector properties; COPY_PY_LIST(cl_device_partition_property, properties); properties.push_back(0); cl_device_partition_property *props_ptr = properties.empty( ) ? 
NULL : &properties.front(); cl_uint num_entries; PYOPENCL_CALL_GUARDED(clCreateSubDevices, (m_device, props_ptr, 0, NULL, &num_entries)); std::vector result; result.resize(num_entries); PYOPENCL_CALL_GUARDED(clCreateSubDevices, (m_device, props_ptr, num_entries, &result.front(), NULL)); py::list py_result; BOOST_FOREACH(cl_device_id did, result) py_result.append(handle_from_new_ptr( new pyopencl::device(did, /*retain*/true, device::REF_CL_1_2))); return py_result; } #endif #if defined(cl_ext_device_fission) && defined(PYOPENCL_USE_DEVICE_FISSION) py::list create_sub_devices_ext(py::object py_properties) { std::vector properties; #if PYOPENCL_CL_VERSION >= 0x1020 cl_platform_id plat; PYOPENCL_CALL_GUARDED(clGetDeviceInfo, (m_device, CL_DEVICE_PLATFORM, sizeof(plat), &plat, NULL)); #endif PYOPENCL_GET_EXT_FUN(plat, clCreateSubDevicesEXT, create_sub_dev); COPY_PY_LIST(cl_device_partition_property_ext, properties); properties.push_back(CL_PROPERTIES_LIST_END_EXT); cl_device_partition_property_ext *props_ptr = properties.empty( ) ? NULL : &properties.front(); cl_uint num_entries; PYOPENCL_CALL_GUARDED(create_sub_dev, (m_device, props_ptr, 0, NULL, &num_entries)); std::vector result; result.resize(num_entries); PYOPENCL_CALL_GUARDED(create_sub_dev, (m_device, props_ptr, num_entries, &result.front(), NULL)); py::list py_result; BOOST_FOREACH(cl_device_id did, result) py_result.append(handle_from_new_ptr( new pyopencl::device(did, /*retain*/true, device::REF_FISSION_EXT))); return py_result; } #endif }; inline py::list platform::get_devices(cl_device_type devtype) { cl_uint num_devices = 0; PYOPENCL_CALL_GUARDED(clGetDeviceIDs, (m_platform, devtype, 0, 0, &num_devices)); if (num_devices == 0) return py::list(); std::vector devices(num_devices); PYOPENCL_CALL_GUARDED(clGetDeviceIDs, (m_platform, devtype, num_devices, devices.empty( ) ? 
NULL : &devices.front(), &num_devices)); py::list result; BOOST_FOREACH(cl_device_id did, devices) result.append(handle_from_new_ptr( new device(did))); return result; } // }}} // {{{ context class context : public boost::noncopyable { private: cl_context m_context; public: context(cl_context ctx, bool retain) : m_context(ctx) { if (retain) PYOPENCL_CALL_GUARDED(clRetainContext, (ctx)); } ~context() { PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseContext, (m_context)); } cl_context data() const { return m_context; } PYOPENCL_EQUALITY_TESTS(context); py::object get_info(cl_context_info param_name) const { switch (param_name) { case CL_CONTEXT_REFERENCE_COUNT: PYOPENCL_GET_INTEGRAL_INFO( Context, m_context, param_name, cl_uint); case CL_CONTEXT_DEVICES: { std::vector result; PYOPENCL_GET_VEC_INFO(Context, m_context, param_name, result); py::list py_result; BOOST_FOREACH(cl_device_id did, result) py_result.append(handle_from_new_ptr( new pyopencl::device(did))); return py_result; } case CL_CONTEXT_PROPERTIES: { std::vector result; PYOPENCL_GET_VEC_INFO(Context, m_context, param_name, result); py::list py_result; for (size_t i = 0; i < result.size(); i+=2) { cl_context_properties key = result[i]; py::object value; switch (key) { case CL_CONTEXT_PLATFORM: { value = py::object( handle_from_new_ptr(new platform( reinterpret_cast(result[i+1])))); break; } #if defined(PYOPENCL_GL_SHARING_VERSION) && (PYOPENCL_GL_SHARING_VERSION >= 1) #if defined(__APPLE__) && defined(HAVE_GL) case CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE: #else case CL_GL_CONTEXT_KHR: case CL_EGL_DISPLAY_KHR: case CL_GLX_DISPLAY_KHR: case CL_WGL_HDC_KHR: case CL_CGL_SHAREGROUP_KHR: #endif value = py::object(result[i+1]); break; #endif case 0: break; default: throw error("Context.get_info", CL_INVALID_VALUE, "unknown context_property key encountered"); } py_result.append(py::make_tuple(result[i], value)); } return py_result; } #if PYOPENCL_CL_VERSION >= 0x1010 case CL_CONTEXT_NUM_DEVICES: PYOPENCL_GET_INTEGRAL_INFO( Context, m_context, param_name, cl_uint); #endif default: throw error("Context.get_info", CL_INVALID_VALUE); } } }; inline std::vector parse_context_properties( py::object py_properties) { std::vector props; if (py_properties.ptr() != Py_None) { PYTHON_FOREACH(prop_tuple, py_properties) { if (len(prop_tuple) != 2) throw error("Context", CL_INVALID_VALUE, "property tuple must have length 2"); cl_context_properties prop = py::extract(prop_tuple[0]); props.push_back(prop); if (prop == CL_CONTEXT_PLATFORM) { py::extract value(prop_tuple[1]); props.push_back( reinterpret_cast(value().data())); } #if defined(PYOPENCL_GL_SHARING_VERSION) && (PYOPENCL_GL_SHARING_VERSION >= 1) #if defined(_WIN32) else if (prop == CL_WGL_HDC_KHR) { // size_t is a stand-in for HANDLE, hopefully has the same size. 
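// [Editor's note -- illustrative comment, not in the original source.]
// parse_context_properties() turns the Python-level `properties` list of
// (key, value) tuples into the flat, zero-terminated array expected by
// clCreateContext / clCreateContextFromType. A minimal sketch of the Python
// side, assuming `import pyopencl as cl` and an already-selected `plat`:
//   ctx = cl.Context(dev_type=cl.device_type.GPU,
//                    properties=[(cl.context_properties.PLATFORM, plat)])
// GL-sharing keys (CL_GL_CONTEXT_KHR, CL_WGL_HDC_KHR, ...) are passed the
// same way, with the handle supplied as an integer or ctypes pointer value.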
size_t hnd = py::extract(prop_tuple[1]); props.push_back(hnd); } #endif else if ( #if defined(__APPLE__) && defined(HAVE_GL) prop == CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE #else prop == CL_GL_CONTEXT_KHR || prop == CL_EGL_DISPLAY_KHR || prop == CL_GLX_DISPLAY_KHR || prop == CL_CGL_SHAREGROUP_KHR #endif ) { py::object ctypes = py::import("ctypes"); py::object prop = prop_tuple[1], c_void_p = ctypes.attr("c_void_p"); py::object ptr = ctypes.attr("cast")(prop, c_void_p); py::extract value(ptr.attr("value")); props.push_back(value); } #endif else throw error("Context", CL_INVALID_VALUE, "invalid context property"); } props.push_back(0); } return props; } inline context *create_context_inner(py::object py_devices, py::object py_properties, py::object py_dev_type) { std::vector props = parse_context_properties(py_properties); cl_context_properties *props_ptr = props.empty( ) ? NULL : &props.front(); cl_int status_code; cl_context ctx; // from device list if (py_devices.ptr() != Py_None) { if (py_dev_type.ptr() != Py_None) throw error("Context", CL_INVALID_VALUE, "one of 'devices' or 'dev_type' must be None"); std::vector devices; PYTHON_FOREACH(py_dev, py_devices) { py::extract dev(py_dev); devices.push_back(dev().data()); } PYOPENCL_PRINT_CALL_TRACE("clCreateContext"); ctx = clCreateContext( props_ptr, devices.size(), devices.empty( ) ? NULL : &devices.front(), 0, 0, &status_code); } // from dev_type else { cl_device_type dev_type = CL_DEVICE_TYPE_DEFAULT; if (py_dev_type.ptr() != Py_None) dev_type = py::extract(py_dev_type)(); PYOPENCL_PRINT_CALL_TRACE("clCreateContextFromType"); ctx = clCreateContextFromType(props_ptr, dev_type, 0, 0, &status_code); } if (status_code != CL_SUCCESS) throw pyopencl::error("Context", status_code); try { return new context(ctx, false); } catch (...) { PYOPENCL_CALL_GUARDED(clReleaseContext, (ctx)); throw; } } inline context *create_context(py::object py_devices, py::object py_properties, py::object py_dev_type) { PYOPENCL_RETRY_RETURN_IF_MEM_ERROR( return create_context_inner(py_devices, py_properties, py_dev_type); ) } // }}} // {{{ command_queue class command_queue { private: cl_command_queue m_queue; public: command_queue(cl_command_queue q, bool retain) : m_queue(q) { if (retain) PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (q)); } command_queue(command_queue const &src) : m_queue(src.m_queue) { PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (m_queue)); } command_queue( const context &ctx, const device *py_dev=0, cl_command_queue_properties props=0) { cl_device_id dev; if (py_dev) dev = py_dev->data(); else { std::vector devs; PYOPENCL_GET_VEC_INFO(Context, ctx.data(), CL_CONTEXT_DEVICES, devs); if (devs.size() == 0) throw pyopencl::error("CommandQueue", CL_INVALID_VALUE, "context doesn't have any devices? 
-- don't know which one to default to"); dev = devs[0]; } cl_int status_code; PYOPENCL_PRINT_CALL_TRACE("clCreateCommandQueue"); m_queue = clCreateCommandQueue( ctx.data(), dev, props, &status_code); if (status_code != CL_SUCCESS) throw pyopencl::error("CommandQueue", status_code); } ~command_queue() { PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseCommandQueue, (m_queue)); } const cl_command_queue data() const { return m_queue; } PYOPENCL_EQUALITY_TESTS(command_queue); py::object get_info(cl_command_queue_info param_name) const { switch (param_name) { case CL_QUEUE_CONTEXT: PYOPENCL_GET_OPAQUE_INFO(CommandQueue, m_queue, param_name, cl_context, context); case CL_QUEUE_DEVICE: PYOPENCL_GET_OPAQUE_INFO(CommandQueue, m_queue, param_name, cl_device_id, device); case CL_QUEUE_REFERENCE_COUNT: PYOPENCL_GET_INTEGRAL_INFO(CommandQueue, m_queue, param_name, cl_uint); case CL_QUEUE_PROPERTIES: PYOPENCL_GET_INTEGRAL_INFO(CommandQueue, m_queue, param_name, cl_command_queue_properties); default: throw error("CommandQueue.get_info", CL_INVALID_VALUE); } } std::auto_ptr get_context() const { cl_context param_value; PYOPENCL_CALL_GUARDED(clGetCommandQueueInfo, (m_queue, CL_QUEUE_CONTEXT, sizeof(param_value), ¶m_value, 0)); return std::auto_ptr( new context(param_value, /*retain*/ true)); } #if PYOPENCL_CL_VERSION < 0x1010 cl_command_queue_properties set_property( cl_command_queue_properties prop, bool enable) { cl_command_queue_properties old_prop; PYOPENCL_CALL_GUARDED(clSetCommandQueueProperty, (m_queue, prop, PYOPENCL_CAST_BOOL(enable), &old_prop)); return old_prop; } #endif void flush() { PYOPENCL_CALL_GUARDED(clFlush, (m_queue)); } void finish() { PYOPENCL_CALL_GUARDED_THREADED(clFinish, (m_queue)); } }; // }}} // {{{ event/synchronization class event : boost::noncopyable { private: cl_event m_event; public: event(cl_event event, bool retain) : m_event(event) { if (retain) PYOPENCL_CALL_GUARDED(clRetainEvent, (event)); } event(event const &src) : m_event(src.m_event) { PYOPENCL_CALL_GUARDED(clRetainEvent, (m_event)); } virtual ~event() { PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseEvent, (m_event)); } const cl_event data() const { return m_event; } PYOPENCL_EQUALITY_TESTS(event); py::object get_info(cl_event_info param_name) const { switch (param_name) { case CL_EVENT_COMMAND_QUEUE: PYOPENCL_GET_OPAQUE_INFO(Event, m_event, param_name, cl_command_queue, command_queue); case CL_EVENT_COMMAND_TYPE: PYOPENCL_GET_INTEGRAL_INFO(Event, m_event, param_name, cl_command_type); case CL_EVENT_COMMAND_EXECUTION_STATUS: PYOPENCL_GET_INTEGRAL_INFO(Event, m_event, param_name, cl_int); case CL_EVENT_REFERENCE_COUNT: PYOPENCL_GET_INTEGRAL_INFO(Event, m_event, param_name, cl_uint); #if PYOPENCL_CL_VERSION >= 0x1010 case CL_EVENT_CONTEXT: PYOPENCL_GET_OPAQUE_INFO(Event, m_event, param_name, cl_context, context); #endif default: throw error("Event.get_info", CL_INVALID_VALUE); } } py::object get_profiling_info(cl_profiling_info param_name) const { switch (param_name) { case CL_PROFILING_COMMAND_QUEUED: case CL_PROFILING_COMMAND_SUBMIT: case CL_PROFILING_COMMAND_START: case CL_PROFILING_COMMAND_END: PYOPENCL_GET_INTEGRAL_INFO(EventProfiling, m_event, param_name, cl_ulong); default: throw error("Event.get_profiling_info", CL_INVALID_VALUE); } } virtual void wait() { PYOPENCL_CALL_GUARDED_THREADED(clWaitForEvents, (1, &m_event)); } }; class nanny_event : public event { // In addition to everything an event does, the nanny event holds a reference // to a Python object and waits for its own completion upon destruction. 
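// Illustrative sketch (assumed call pattern, not taken from this header): the
// non-blocking transfer paths hand a nanny event back to Python, e.g.
//   dest = numpy.empty(1024, numpy.float32)
//   evt = cl.enqueue_read_buffer(queue, buf, dest, is_blocking=False)
//   evt.wait()  # 'dest' is guaranteed alive at least until the wait completes
// The m_ward member below is the reference that provides that guarantee.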
protected: py::object m_ward; public: nanny_event(cl_event evt, bool retain, py::object ward) : event(evt, retain), m_ward(ward) { } nanny_event(nanny_event const &src) : event(src), m_ward(src.m_ward) { } ~nanny_event() { wait(); } py::object get_ward() const { return m_ward; } virtual void wait() { event::wait(); m_ward = py::object(); } }; inline void wait_for_events(py::object events) { cl_uint num_events_in_wait_list = 0; std::vector event_wait_list(len(events)); PYTHON_FOREACH(evt, events) event_wait_list[num_events_in_wait_list++] = py::extract(evt)().data(); PYOPENCL_CALL_GUARDED_THREADED(clWaitForEvents, ( PYOPENCL_WAITLIST_ARGS)); } #if PYOPENCL_CL_VERSION >= 0x1020 inline event *enqueue_marker_with_wait_list(command_queue &cq, py::object py_wait_for) { PYOPENCL_PARSE_WAIT_FOR; cl_event evt; PYOPENCL_CALL_GUARDED(clEnqueueMarkerWithWaitList, ( cq.data(), PYOPENCL_WAITLIST_ARGS, &evt)); PYOPENCL_RETURN_NEW_EVENT(evt); } inline event *enqueue_barrier_with_wait_list(command_queue &cq, py::object py_wait_for) { PYOPENCL_PARSE_WAIT_FOR; cl_event evt; PYOPENCL_CALL_GUARDED(clEnqueueBarrierWithWaitList, (cq.data(), PYOPENCL_WAITLIST_ARGS, &evt)); PYOPENCL_RETURN_NEW_EVENT(evt); } #endif // {{{ used internally for pre-OpenCL-1.2 contexts inline event *enqueue_marker(command_queue &cq) { cl_event evt; PYOPENCL_CALL_GUARDED(clEnqueueMarker, ( cq.data(), &evt)); PYOPENCL_RETURN_NEW_EVENT(evt); } inline void enqueue_wait_for_events(command_queue &cq, py::object py_events) { cl_uint num_events = 0; std::vector event_list(len(py_events)); PYTHON_FOREACH(py_evt, py_events) event_list[num_events++] = py::extract(py_evt)().data(); PYOPENCL_CALL_GUARDED(clEnqueueWaitForEvents, ( cq.data(), num_events, event_list.empty( ) ? NULL : &event_list.front())); } inline void enqueue_barrier(command_queue &cq) { PYOPENCL_CALL_GUARDED(clEnqueueBarrier, (cq.data())); } // }}} #if PYOPENCL_CL_VERSION >= 0x1010 class user_event : public event { public: user_event(cl_event evt, bool retain) : event(evt, retain) { } void set_status(cl_int execution_status) { PYOPENCL_CALL_GUARDED(clSetUserEventStatus, (data(), execution_status)); } }; inline event *create_user_event(context &ctx) { cl_int status_code; PYOPENCL_PRINT_CALL_TRACE("clCreateUserEvent"); cl_event evt = clCreateUserEvent(ctx.data(), &status_code); if (status_code != CL_SUCCESS) throw pyopencl::error("UserEvent", status_code); try { return new user_event(evt, false); } catch (...) 
{ clReleaseEvent(evt); throw; } } #endif // }}} // {{{ memory_object py::object create_mem_object_wrapper(cl_mem mem); class memory_object_holder { public: virtual const cl_mem data() const = 0; PYOPENCL_EQUALITY_TESTS(memory_object_holder); size_t size() const { size_t param_value; PYOPENCL_CALL_GUARDED(clGetMemObjectInfo, (data(), CL_MEM_SIZE, sizeof(param_value), ¶m_value, 0)); return param_value; } py::object get_info(cl_mem_info param_name) const; }; class memory_object : boost::noncopyable, public memory_object_holder { private: bool m_valid; cl_mem m_mem; py::object m_hostbuf; public: memory_object(cl_mem mem, bool retain, py::object *hostbuf=0) : m_valid(true), m_mem(mem) { if (retain) PYOPENCL_CALL_GUARDED(clRetainMemObject, (mem)); if (hostbuf) m_hostbuf = *hostbuf; } memory_object(memory_object const &src) : m_valid(true), m_mem(src.m_mem), m_hostbuf(src.m_hostbuf) { PYOPENCL_CALL_GUARDED(clRetainMemObject, (m_mem)); } memory_object(memory_object_holder const &src) : m_valid(true), m_mem(src.data()) { PYOPENCL_CALL_GUARDED(clRetainMemObject, (m_mem)); } void release() { if (!m_valid) throw error("MemoryObject.free", CL_INVALID_VALUE, "trying to double-unref mem object"); PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseMemObject, (m_mem)); m_valid = false; } virtual ~memory_object() { if (m_valid) release(); } py::object hostbuf() { return m_hostbuf; } const cl_mem data() const { return m_mem; } }; #if PYOPENCL_CL_VERSION >= 0x1020 inline event *enqueue_migrate_mem_objects( command_queue &cq, py::object py_mem_objects, cl_mem_migration_flags flags, py::object py_wait_for) { PYOPENCL_PARSE_WAIT_FOR; std::vector mem_objects; PYTHON_FOREACH(mo, py_mem_objects) mem_objects.push_back(py::extract(mo)().data()); cl_event evt; PYOPENCL_RETRY_IF_MEM_ERROR( PYOPENCL_CALL_GUARDED(clEnqueueMigrateMemObjects, ( cq.data(), mem_objects.size(), mem_objects.empty( ) ? NULL : &mem_objects.front(), flags, PYOPENCL_WAITLIST_ARGS, &evt )); ); PYOPENCL_RETURN_NEW_EVENT(evt); } #endif #ifdef cl_ext_migrate_memobject inline event *enqueue_migrate_mem_object_ext( command_queue &cq, py::object py_mem_objects, cl_mem_migration_flags_ext flags, py::object py_wait_for) { PYOPENCL_PARSE_WAIT_FOR; #if PYOPENCL_CL_VERSION >= 0x1020 // {{{ get platform cl_device_id dev; PYOPENCL_CALL_GUARDED(clGetCommandQueueInfo, (cq.data(), CL_QUEUE_DEVICE, sizeof(dev), &dev, NULL)); cl_platform_id plat; PYOPENCL_CALL_GUARDED(clGetDeviceInfo, (cq.data(), CL_DEVICE_PLATFORM, sizeof(plat), &plat, NULL)); // }}} #endif PYOPENCL_GET_EXT_FUN(plat, clEnqueueMigrateMemObjectEXT, enqueue_migrate_fn); std::vector mem_objects; PYTHON_FOREACH(mo, py_mem_objects) mem_objects.push_back(py::extract(mo)().data()); cl_event evt; PYOPENCL_RETRY_IF_MEM_ERROR( PYOPENCL_CALL_GUARDED(enqueue_migrate_fn, ( cq.data(), mem_objects.size(), mem_objects.empty( ) ? 
NULL : &mem_objects.front(), flags, PYOPENCL_WAITLIST_ARGS, &evt )); ); PYOPENCL_RETURN_NEW_EVENT(evt); } #endif // }}} // {{{ buffer inline cl_mem create_buffer( cl_context ctx, cl_mem_flags flags, size_t size, void *host_ptr) { cl_int status_code; PYOPENCL_PRINT_CALL_TRACE("clCreateBuffer"); cl_mem mem = clCreateBuffer(ctx, flags, size, host_ptr, &status_code); if (status_code != CL_SUCCESS) throw pyopencl::error("create_buffer", status_code); return mem; } inline cl_mem create_buffer_gc( cl_context ctx, cl_mem_flags flags, size_t size, void *host_ptr) { PYOPENCL_RETRY_RETURN_IF_MEM_ERROR( return create_buffer(ctx, flags, size, host_ptr); ); } #if PYOPENCL_CL_VERSION >= 0x1010 inline cl_mem create_sub_buffer( cl_mem buffer, cl_mem_flags flags, cl_buffer_create_type bct, const void *buffer_create_info) { cl_int status_code; PYOPENCL_PRINT_CALL_TRACE("clCreateSubBuffer"); cl_mem mem = clCreateSubBuffer(buffer, flags, bct, buffer_create_info, &status_code); if (status_code != CL_SUCCESS) throw pyopencl::error("clCreateSubBuffer", status_code); return mem; } inline cl_mem create_sub_buffer_gc( cl_mem buffer, cl_mem_flags flags, cl_buffer_create_type bct, const void *buffer_create_info) { PYOPENCL_RETRY_RETURN_IF_MEM_ERROR( return create_sub_buffer(buffer, flags, bct, buffer_create_info); ); } #endif class buffer : public memory_object { public: buffer(cl_mem mem, bool retain, py::object *hostbuf=0) : memory_object(mem, retain, hostbuf) { } #if PYOPENCL_CL_VERSION >= 0x1010 buffer *get_sub_region( size_t origin, size_t size, cl_mem_flags flags) const { cl_buffer_region region = { origin, size}; cl_mem mem = create_sub_buffer_gc( data(), flags, CL_BUFFER_CREATE_TYPE_REGION, ®ion); try { return new buffer(mem, false); } catch (...) { PYOPENCL_CALL_GUARDED(clReleaseMemObject, (mem)); throw; } } buffer *getitem(py::slice slc) const { PYOPENCL_BUFFER_SIZE_T start, end, stride, length; size_t my_length; PYOPENCL_CALL_GUARDED(clGetMemObjectInfo, (data(), CL_MEM_SIZE, sizeof(my_length), &my_length, 0)); #if PY_VERSION_HEX >= 0x03020000 if (PySlice_GetIndicesEx(slc.ptr(), #else if (PySlice_GetIndicesEx(reinterpret_cast(slc.ptr()), #endif my_length, &start, &end, &stride, &length) != 0) throw py::error_already_set(); if (stride != 1) throw pyopencl::error("Buffer.__getitem__", CL_INVALID_VALUE, "Buffer slice must have stride 1"); cl_mem_flags my_flags; PYOPENCL_CALL_GUARDED(clGetMemObjectInfo, (data(), CL_MEM_FLAGS, sizeof(my_flags), &my_flags, 0)); return get_sub_region(start, end, my_flags); } #endif }; // {{{ buffer creation inline buffer *create_buffer_py( context &ctx, cl_mem_flags flags, size_t size, py::object py_hostbuf ) { if (py_hostbuf.ptr() != Py_None && !(flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR))) PyErr_Warn(PyExc_UserWarning, "'hostbuf' was passed, " "but no memory flags to make use of it."); void *buf = 0; py::object *retained_buf_obj = 0; if (py_hostbuf.ptr() != Py_None) { PYOPENCL_BUFFER_SIZE_T len; if (flags & CL_MEM_USE_HOST_PTR) { if (PyObject_AsWriteBuffer(py_hostbuf.ptr(), &buf, &len)) throw py::error_already_set(); } else { if (PyObject_AsReadBuffer( py_hostbuf.ptr(), const_cast(&buf), &len)) throw py::error_already_set(); } if (flags & CL_MEM_USE_HOST_PTR) retained_buf_obj = &py_hostbuf; if (size > size_t(len)) throw pyopencl::error("Buffer", CL_INVALID_VALUE, "specified size is greater than host buffer size"); if (size == 0) size = len; } cl_mem mem = create_buffer_gc(ctx.data(), flags, size, buf); try { return new buffer(mem, false, retained_buf_obj); } catch (...) 
{ PYOPENCL_CALL_GUARDED(clReleaseMemObject, (mem)); throw; } } // }}}
// {{{ buffer transfers
// {{{ byte-for-byte transfers
inline event *enqueue_read_buffer( command_queue &cq, memory_object_holder &mem, py::object buffer, size_t device_offset, py::object py_wait_for, bool is_blocking) { PYOPENCL_PARSE_WAIT_FOR; void *buf; PYOPENCL_BUFFER_SIZE_T len; if (PyObject_AsWriteBuffer(buffer.ptr(), &buf, &len)) throw py::error_already_set(); cl_event evt; PYOPENCL_RETRY_IF_MEM_ERROR( PYOPENCL_CALL_GUARDED_THREADED(clEnqueueReadBuffer, ( cq.data(), mem.data(), PYOPENCL_CAST_BOOL(is_blocking), device_offset, len, buf, PYOPENCL_WAITLIST_ARGS, &evt )) ); PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, buffer); }
inline event *enqueue_write_buffer( command_queue &cq, memory_object_holder &mem, py::object buffer, size_t device_offset, py::object py_wait_for, bool is_blocking) { PYOPENCL_PARSE_WAIT_FOR; const void *buf; PYOPENCL_BUFFER_SIZE_T len; if (PyObject_AsReadBuffer(buffer.ptr(), &buf, &len)) throw py::error_already_set(); cl_event evt; PYOPENCL_RETRY_IF_MEM_ERROR( PYOPENCL_CALL_GUARDED_THREADED(clEnqueueWriteBuffer, ( cq.data(), mem.data(), PYOPENCL_CAST_BOOL(is_blocking), device_offset, len, buf, PYOPENCL_WAITLIST_ARGS, &evt )) ); PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, buffer); }
inline event *enqueue_copy_buffer( command_queue &cq, memory_object_holder &src, memory_object_holder &dst, ptrdiff_t byte_count, size_t src_offset, size_t dst_offset, py::object py_wait_for) { PYOPENCL_PARSE_WAIT_FOR; if (byte_count < 0) { size_t byte_count_src = 0; size_t byte_count_dst = 0; PYOPENCL_CALL_GUARDED(clGetMemObjectInfo, (src.data(), CL_MEM_SIZE, sizeof(byte_count_src), &byte_count_src, 0)); PYOPENCL_CALL_GUARDED(clGetMemObjectInfo, (dst.data(), CL_MEM_SIZE, sizeof(byte_count_dst), &byte_count_dst, 0)); byte_count = std::min(byte_count_src, byte_count_dst); } cl_event evt; PYOPENCL_RETRY_IF_MEM_ERROR( PYOPENCL_CALL_GUARDED(clEnqueueCopyBuffer, ( cq.data(), src.data(), dst.data(), src_offset, dst_offset, byte_count, PYOPENCL_WAITLIST_ARGS, &evt )) ); PYOPENCL_RETURN_NEW_EVENT(evt); } // }}}
// {{{ rectangular transfers
#if PYOPENCL_CL_VERSION >= 0x1010
inline event *enqueue_read_buffer_rect( command_queue &cq, memory_object_holder &mem, py::object buffer, py::object py_buffer_origin, py::object py_host_origin, py::object py_region, py::object py_buffer_pitches, py::object py_host_pitches, py::object py_wait_for, bool is_blocking ) { PYOPENCL_PARSE_WAIT_FOR; COPY_PY_COORD_TRIPLE(buffer_origin); COPY_PY_COORD_TRIPLE(host_origin); COPY_PY_REGION_TRIPLE(region); COPY_PY_PITCH_TUPLE(buffer_pitches); COPY_PY_PITCH_TUPLE(host_pitches); void *buf; PYOPENCL_BUFFER_SIZE_T len; if (PyObject_AsWriteBuffer(buffer.ptr(), &buf, &len)) throw py::error_already_set(); cl_event evt; PYOPENCL_RETRY_IF_MEM_ERROR( PYOPENCL_CALL_GUARDED_THREADED(clEnqueueReadBufferRect, ( cq.data(), mem.data(), PYOPENCL_CAST_BOOL(is_blocking), buffer_origin, host_origin, region, buffer_pitches[0], buffer_pitches[1], host_pitches[0], host_pitches[1], buf, PYOPENCL_WAITLIST_ARGS, &evt )) ); PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, buffer); }
inline event *enqueue_write_buffer_rect( command_queue &cq, memory_object_holder &mem, py::object buffer, py::object py_buffer_origin, py::object py_host_origin, py::object py_region, py::object py_buffer_pitches, py::object py_host_pitches, py::object py_wait_for, bool is_blocking ) { PYOPENCL_PARSE_WAIT_FOR; COPY_PY_COORD_TRIPLE(buffer_origin); COPY_PY_COORD_TRIPLE(host_origin); COPY_PY_REGION_TRIPLE(region);
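// Note on the tuple conventions (per the OpenCL clEnqueue*BufferRect
// documentation): the origin triples are (x, y, z) with x in bytes and y/z in
// rows and slices, the region triple is (width in bytes, height in rows,
// depth in slices), and each pitch tuple is (row_pitch, slice_pitch) in bytes;
// the COPY_PY_* helper macros unpack the Python tuples into the size_t arrays
// used in the calls below.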
COPY_PY_PITCH_TUPLE(buffer_pitches); COPY_PY_PITCH_TUPLE(host_pitches); const void *buf; PYOPENCL_BUFFER_SIZE_T len; if (PyObject_AsReadBuffer(buffer.ptr(), &buf, &len)) throw py::error_already_set(); cl_event evt; PYOPENCL_RETRY_IF_MEM_ERROR( PYOPENCL_CALL_GUARDED_THREADED(clEnqueueWriteBufferRect, ( cq.data(), mem.data(), PYOPENCL_CAST_BOOL(is_blocking), buffer_origin, host_origin, region, buffer_pitches[0], buffer_pitches[1], host_pitches[0], host_pitches[1], buf, PYOPENCL_WAITLIST_ARGS, &evt )) ); PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, buffer); } inline event *enqueue_copy_buffer_rect( command_queue &cq, memory_object_holder &src, memory_object_holder &dst, py::object py_src_origin, py::object py_dst_origin, py::object py_region, py::object py_src_pitches, py::object py_dst_pitches, py::object py_wait_for) { PYOPENCL_PARSE_WAIT_FOR; COPY_PY_COORD_TRIPLE(src_origin); COPY_PY_COORD_TRIPLE(dst_origin); COPY_PY_REGION_TRIPLE(region); COPY_PY_PITCH_TUPLE(src_pitches); COPY_PY_PITCH_TUPLE(dst_pitches); cl_event evt; PYOPENCL_RETRY_IF_MEM_ERROR( PYOPENCL_CALL_GUARDED(clEnqueueCopyBufferRect, ( cq.data(), src.data(), dst.data(), src_origin, dst_origin, region, src_pitches[0], src_pitches[1], dst_pitches[0], dst_pitches[1], PYOPENCL_WAITLIST_ARGS, &evt )) ); PYOPENCL_RETURN_NEW_EVENT(evt); } #endif // }}} // }}} #if PYOPENCL_CL_VERSION >= 0x1020 inline event *enqueue_fill_buffer( command_queue &cq, memory_object_holder &mem, py::object pattern, size_t offset, size_t size, py::object py_wait_for ) { PYOPENCL_PARSE_WAIT_FOR; const void *pattern_buf; PYOPENCL_BUFFER_SIZE_T pattern_len; if (PyObject_AsReadBuffer(pattern.ptr(), &pattern_buf, &pattern_len)) throw py::error_already_set(); cl_event evt; PYOPENCL_RETRY_IF_MEM_ERROR( PYOPENCL_CALL_GUARDED(clEnqueueFillBuffer, ( cq.data(), mem.data(), pattern_buf, pattern_len, offset, size, PYOPENCL_WAITLIST_ARGS, &evt )) ); PYOPENCL_RETURN_NEW_EVENT(evt); } #endif // }}} // {{{ image class image : public memory_object { public: image(cl_mem mem, bool retain, py::object *hostbuf=0) : memory_object(mem, retain, hostbuf) { } py::object get_image_info(cl_image_info param_name) const { switch (param_name) { case CL_IMAGE_FORMAT: PYOPENCL_GET_INTEGRAL_INFO(Image, data(), param_name, cl_image_format); case CL_IMAGE_ELEMENT_SIZE: case CL_IMAGE_ROW_PITCH: case CL_IMAGE_SLICE_PITCH: case CL_IMAGE_WIDTH: case CL_IMAGE_HEIGHT: case CL_IMAGE_DEPTH: #if PYOPENCL_CL_VERSION >= 0x1020 case CL_IMAGE_ARRAY_SIZE: #endif PYOPENCL_GET_INTEGRAL_INFO(Image, data(), param_name, size_t); #if PYOPENCL_CL_VERSION >= 0x1020 case CL_IMAGE_BUFFER: { cl_mem param_value; PYOPENCL_CALL_GUARDED(clGetImageInfo, \ (data(), param_name, sizeof(param_value), ¶m_value, 0)); if (param_value == 0) { // no associated memory object? no problem. 
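// (a null cl_mem here is simply surfaced to Python as None)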
return py::object(); } return create_mem_object_wrapper(param_value); } case CL_IMAGE_NUM_MIP_LEVELS: case CL_IMAGE_NUM_SAMPLES: PYOPENCL_GET_INTEGRAL_INFO(Image, data(), param_name, cl_uint); #endif default: throw error("MemoryObject.get_image_info", CL_INVALID_VALUE); } } }; // {{{ image formats inline cl_image_format *make_image_format(cl_channel_order ord, cl_channel_type tp) { std::auto_ptr result(new cl_image_format); result->image_channel_order = ord; result->image_channel_data_type = tp; return result.release(); } inline py::list get_supported_image_formats( context const &ctx, cl_mem_flags flags, cl_mem_object_type image_type) { cl_uint num_image_formats; PYOPENCL_CALL_GUARDED(clGetSupportedImageFormats, ( ctx.data(), flags, image_type, 0, NULL, &num_image_formats)); std::vector formats(num_image_formats); PYOPENCL_CALL_GUARDED(clGetSupportedImageFormats, ( ctx.data(), flags, image_type, formats.size(), formats.empty( ) ? NULL : &formats.front(), NULL)); PYOPENCL_RETURN_VECTOR(cl_image_format, formats); } inline cl_uint get_image_format_channel_count(cl_image_format const &fmt) { switch (fmt.image_channel_order) { case CL_R: return 1; case CL_A: return 1; case CL_RG: return 2; case CL_RA: return 2; case CL_RGB: return 3; case CL_RGBA: return 4; case CL_BGRA: return 4; case CL_INTENSITY: return 1; case CL_LUMINANCE: return 1; default: throw pyopencl::error("ImageFormat.channel_dtype_size", CL_INVALID_VALUE, "unrecognized channel order"); } } inline cl_uint get_image_format_channel_dtype_size(cl_image_format const &fmt) { switch (fmt.image_channel_data_type) { case CL_SNORM_INT8: return 1; case CL_SNORM_INT16: return 2; case CL_UNORM_INT8: return 1; case CL_UNORM_INT16: return 2; case CL_UNORM_SHORT_565: return 2; case CL_UNORM_SHORT_555: return 2; case CL_UNORM_INT_101010: return 4; case CL_SIGNED_INT8: return 1; case CL_SIGNED_INT16: return 2; case CL_SIGNED_INT32: return 4; case CL_UNSIGNED_INT8: return 1; case CL_UNSIGNED_INT16: return 2; case CL_UNSIGNED_INT32: return 4; case CL_HALF_FLOAT: return 2; case CL_FLOAT: return 4; default: throw pyopencl::error("ImageFormat.channel_dtype_size", CL_INVALID_VALUE, "unrecognized channel data type"); } } inline cl_uint get_image_format_item_size(cl_image_format const &fmt) { return get_image_format_channel_count(fmt) * get_image_format_channel_dtype_size(fmt); } // }}} // {{{ image creation inline image *create_image( context const &ctx, cl_mem_flags flags, cl_image_format const &fmt, py::object shape, py::object pitches, py::object buffer) { if (shape.ptr() == Py_None) throw pyopencl::error("Image", CL_INVALID_VALUE, "'shape' must be given"); void *buf = 0; PYOPENCL_BUFFER_SIZE_T len; py::object *retained_buf_obj = 0; if (buffer.ptr() != Py_None) { if (flags & CL_MEM_USE_HOST_PTR) { if (PyObject_AsWriteBuffer(buffer.ptr(), &buf, &len)) throw py::error_already_set(); } else { if (PyObject_AsReadBuffer( buffer.ptr(), const_cast(&buf), &len)) throw py::error_already_set(); } if (flags & CL_MEM_USE_HOST_PTR) retained_buf_obj = &buffer; } unsigned dims = py::len(shape); cl_int status_code; cl_mem mem; if (dims == 2) { size_t width = py::extract(shape[0]); size_t height = py::extract(shape[1]); size_t pitch = 0; if (pitches.ptr() != Py_None) { if (py::len(pitches) != 1) throw pyopencl::error("Image", CL_INVALID_VALUE, "invalid length of pitch tuple"); pitch = py::extract(pitches[0]); } // check buffer size cl_int itemsize = get_image_format_item_size(fmt); if (buf && std::max(pitch, width*itemsize)*height > cl_uint(len)) throw 
pyopencl::error("Image", CL_INVALID_VALUE, "buffer too small"); PYOPENCL_PRINT_CALL_TRACE("clCreateImage2D"); PYOPENCL_RETRY_IF_MEM_ERROR( { mem = clCreateImage2D(ctx.data(), flags, &fmt, width, height, pitch, buf, &status_code); if (status_code != CL_SUCCESS) throw pyopencl::error("clCreateImage2D", status_code); } ); } else if (dims == 3) { size_t width = py::extract(shape[0]); size_t height = py::extract(shape[1]); size_t depth = py::extract(shape[2]); size_t pitch_x = 0; size_t pitch_y = 0; if (pitches.ptr() != Py_None) { if (py::len(pitches) != 2) throw pyopencl::error("Image", CL_INVALID_VALUE, "invalid length of pitch tuple"); pitch_x = py::extract(pitches[0]); pitch_y = py::extract(pitches[1]); } // check buffer size cl_int itemsize = get_image_format_item_size(fmt); if (buf && std::max(std::max(pitch_x, width*itemsize)*height, pitch_y) * depth > cl_uint(len)) throw pyopencl::error("Image", CL_INVALID_VALUE, "buffer too small"); PYOPENCL_PRINT_CALL_TRACE("clCreateImage3D"); PYOPENCL_RETRY_IF_MEM_ERROR( { mem = clCreateImage3D(ctx.data(), flags, &fmt, width, height, depth, pitch_x, pitch_y, buf, &status_code); if (status_code != CL_SUCCESS) throw pyopencl::error("clCreateImage3D", status_code); } ); } else throw pyopencl::error("Image", CL_INVALID_VALUE, "invalid dimension"); try { return new image(mem, false, retained_buf_obj); } catch (...) { PYOPENCL_CALL_GUARDED(clReleaseMemObject, (mem)); throw; } } #if PYOPENCL_CL_VERSION >= 0x1020 inline image *create_image_from_desc( context const &ctx, cl_mem_flags flags, cl_image_format const &fmt, cl_image_desc &desc, py::object buffer) { if (buffer.ptr() != Py_None && !(flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR))) PyErr_Warn(PyExc_UserWarning, "'hostbuf' was passed, " "but no memory flags to make use of it."); void *buf = 0; PYOPENCL_BUFFER_SIZE_T len; py::object *retained_buf_obj = 0; if (buffer.ptr() != Py_None) { if (flags & CL_MEM_USE_HOST_PTR) { if (PyObject_AsWriteBuffer(buffer.ptr(), &buf, &len)) throw py::error_already_set(); } else { if (PyObject_AsReadBuffer( buffer.ptr(), const_cast(&buf), &len)) throw py::error_already_set(); } if (flags & CL_MEM_USE_HOST_PTR) retained_buf_obj = &buffer; } PYOPENCL_PRINT_CALL_TRACE("clCreateImage"); cl_int status_code; cl_mem mem = clCreateImage(ctx.data(), flags, &fmt, &desc, buf, &status_code); if (status_code != CL_SUCCESS) throw pyopencl::error("clCreateImage", status_code); try { return new image(mem, false, retained_buf_obj); } catch (...) 
{ PYOPENCL_CALL_GUARDED(clReleaseMemObject, (mem)); throw; } } #endif // }}} // {{{ image transfers inline event *enqueue_read_image( command_queue &cq, image &img, py::object py_origin, py::object py_region, py::object buffer, size_t row_pitch, size_t slice_pitch, py::object py_wait_for, bool is_blocking) { PYOPENCL_PARSE_WAIT_FOR; COPY_PY_COORD_TRIPLE(origin); COPY_PY_REGION_TRIPLE(region); void *buf; PYOPENCL_BUFFER_SIZE_T len; if (PyObject_AsWriteBuffer(buffer.ptr(), &buf, &len)) throw py::error_already_set(); cl_event evt; PYOPENCL_RETRY_IF_MEM_ERROR( PYOPENCL_CALL_GUARDED(clEnqueueReadImage, ( cq.data(), img.data(), PYOPENCL_CAST_BOOL(is_blocking), origin, region, row_pitch, slice_pitch, buf, PYOPENCL_WAITLIST_ARGS, &evt )); ); PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, buffer); } inline event *enqueue_write_image( command_queue &cq, image &img, py::object py_origin, py::object py_region, py::object buffer, size_t row_pitch, size_t slice_pitch, py::object py_wait_for, bool is_blocking) { PYOPENCL_PARSE_WAIT_FOR; COPY_PY_COORD_TRIPLE(origin); COPY_PY_REGION_TRIPLE(region); const void *buf; PYOPENCL_BUFFER_SIZE_T len; if (PyObject_AsReadBuffer(buffer.ptr(), &buf, &len)) throw py::error_already_set(); cl_event evt; PYOPENCL_RETRY_IF_MEM_ERROR( PYOPENCL_CALL_GUARDED(clEnqueueWriteImage, ( cq.data(), img.data(), PYOPENCL_CAST_BOOL(is_blocking), origin, region, row_pitch, slice_pitch, buf, PYOPENCL_WAITLIST_ARGS, &evt )); ); PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, buffer); } inline event *enqueue_copy_image( command_queue &cq, memory_object_holder &src, memory_object_holder &dest, py::object py_src_origin, py::object py_dest_origin, py::object py_region, py::object py_wait_for ) { PYOPENCL_PARSE_WAIT_FOR; COPY_PY_COORD_TRIPLE(src_origin); COPY_PY_COORD_TRIPLE(dest_origin); COPY_PY_REGION_TRIPLE(region); cl_event evt; PYOPENCL_RETRY_IF_MEM_ERROR( PYOPENCL_CALL_GUARDED(clEnqueueCopyImage, ( cq.data(), src.data(), dest.data(), src_origin, dest_origin, region, PYOPENCL_WAITLIST_ARGS, &evt )); ); PYOPENCL_RETURN_NEW_EVENT(evt); } inline event *enqueue_copy_image_to_buffer( command_queue &cq, memory_object_holder &src, memory_object_holder &dest, py::object py_origin, py::object py_region, size_t offset, py::object py_wait_for ) { PYOPENCL_PARSE_WAIT_FOR; COPY_PY_COORD_TRIPLE(origin); COPY_PY_REGION_TRIPLE(region); cl_event evt; PYOPENCL_RETRY_IF_MEM_ERROR( PYOPENCL_CALL_GUARDED(clEnqueueCopyImageToBuffer, ( cq.data(), src.data(), dest.data(), origin, region, offset, PYOPENCL_WAITLIST_ARGS, &evt )); ); PYOPENCL_RETURN_NEW_EVENT(evt); } inline event *enqueue_copy_buffer_to_image( command_queue &cq, memory_object_holder &src, memory_object_holder &dest, size_t offset, py::object py_origin, py::object py_region, py::object py_wait_for ) { PYOPENCL_PARSE_WAIT_FOR; COPY_PY_COORD_TRIPLE(origin); COPY_PY_REGION_TRIPLE(region); cl_event evt; PYOPENCL_RETRY_IF_MEM_ERROR( PYOPENCL_CALL_GUARDED(clEnqueueCopyBufferToImage, ( cq.data(), src.data(), dest.data(), offset, origin, region, PYOPENCL_WAITLIST_ARGS, &evt )); ); PYOPENCL_RETURN_NEW_EVENT(evt); } // }}} #if PYOPENCL_CL_VERSION >= 0x1020 inline event *enqueue_fill_image( command_queue &cq, memory_object_holder &mem, py::object color, py::object py_origin, py::object py_region, py::object py_wait_for ) { PYOPENCL_PARSE_WAIT_FOR; COPY_PY_COORD_TRIPLE(origin); COPY_PY_REGION_TRIPLE(region); const void *color_buf; PYOPENCL_BUFFER_SIZE_T color_len; if (PyObject_AsReadBuffer(color.ptr(), &color_buf, &color_len)) throw py::error_already_set(); cl_event evt; 
PYOPENCL_RETRY_IF_MEM_ERROR( PYOPENCL_CALL_GUARDED(clEnqueueFillImage, ( cq.data(), mem.data(), color_buf, origin, region, PYOPENCL_WAITLIST_ARGS, &evt )); ); PYOPENCL_RETURN_NEW_EVENT(evt); } #endif // }}} // {{{ maps class memory_map { private: bool m_valid; command_queue m_queue; memory_object m_mem; void *m_ptr; public: memory_map(command_queue &cq, memory_object const &mem, void *ptr) : m_valid(true), m_queue(cq), m_mem(mem), m_ptr(ptr) { } ~memory_map() { if (m_valid) delete release(0, py::object()); } event *release(command_queue *cq, py::object py_wait_for) { PYOPENCL_PARSE_WAIT_FOR; if (cq == 0) cq = &m_queue; cl_event evt; PYOPENCL_CALL_GUARDED(clEnqueueUnmapMemObject, ( cq->data(), m_mem.data(), m_ptr, PYOPENCL_WAITLIST_ARGS, &evt )); m_valid = false; PYOPENCL_RETURN_NEW_EVENT(evt); } }; inline py::object enqueue_map_buffer( command_queue &cq, memory_object_holder &buf, cl_map_flags flags, size_t offset, py::object py_shape, py::object dtype, py::object py_order, py::object py_strides, py::object py_wait_for, bool is_blocking ) { PYOPENCL_PARSE_WAIT_FOR; PYOPENCL_PARSE_NUMPY_ARRAY_SPEC; npy_uintp size_in_bytes = tp_descr->elsize; BOOST_FOREACH(npy_intp sdim, shape) size_in_bytes *= sdim; py::handle<> result; cl_event evt; cl_int status_code; PYOPENCL_PRINT_CALL_TRACE("clEnqueueMapBuffer"); void *mapped; PYOPENCL_RETRY_IF_MEM_ERROR( { Py_BEGIN_ALLOW_THREADS mapped = clEnqueueMapBuffer( cq.data(), buf.data(), PYOPENCL_CAST_BOOL(is_blocking), flags, offset, size_in_bytes, PYOPENCL_WAITLIST_ARGS, &evt, &status_code); Py_END_ALLOW_THREADS if (status_code != CL_SUCCESS) throw pyopencl::error("clEnqueueMapBuffer", status_code); } ); event evt_handle(evt, false); std::auto_ptr map; try { result = py::handle<>(PyArray_NewFromDescr( &PyArray_Type, tp_descr, shape.size(), shape.empty() ? NULL : &shape.front(), strides.empty() ? NULL : &strides.front(), mapped, ary_flags, /*obj*/NULL)); if (size_in_bytes != (npy_uintp) PyArray_NBYTES(result.get())) throw pyopencl::error("enqueue_map_buffer", CL_INVALID_VALUE, "miscalculated numpy array size (not contiguous?)"); map = std::auto_ptr(new memory_map(cq, buf, mapped)); } catch (...) { PYOPENCL_CALL_GUARDED_CLEANUP(clEnqueueUnmapMemObject, ( cq.data(), buf.data(), mapped, 0, 0, 0)); throw; } py::handle<> map_py(handle_from_new_ptr(map.release())); PyArray_BASE(result.get()) = map_py.get(); Py_INCREF(map_py.get()); return py::make_tuple( result, handle_from_new_ptr(new event(evt_handle))); } inline py::object enqueue_map_image( command_queue &cq, memory_object_holder &img, cl_map_flags flags, py::object py_origin, py::object py_region, py::object py_shape, py::object dtype, py::object py_order, py::object py_strides, py::object py_wait_for, bool is_blocking ) { PYOPENCL_PARSE_WAIT_FOR; PYOPENCL_PARSE_NUMPY_ARRAY_SPEC; COPY_PY_COORD_TRIPLE(origin); COPY_PY_REGION_TRIPLE(region); cl_event evt; cl_int status_code; PYOPENCL_PRINT_CALL_TRACE("clEnqueueMapImage"); size_t row_pitch, slice_pitch; void *mapped; PYOPENCL_RETRY_IF_MEM_ERROR( { Py_BEGIN_ALLOW_THREADS mapped = clEnqueueMapImage( cq.data(), img.data(), PYOPENCL_CAST_BOOL(is_blocking), flags, origin, region, &row_pitch, &slice_pitch, PYOPENCL_WAITLIST_ARGS, &evt, &status_code); Py_END_ALLOW_THREADS if (status_code != CL_SUCCESS) throw pyopencl::error("clEnqueueMapImage", status_code); } ); event evt_handle(evt, false); std::auto_ptr map; try { map = std::auto_ptr(new memory_map(cq, img, mapped)); } catch (...) 
{ PYOPENCL_CALL_GUARDED_CLEANUP(clEnqueueUnmapMemObject, ( cq.data(), img.data(), mapped, 0, 0, 0)); throw; } py::handle<> result = py::handle<>(PyArray_NewFromDescr( &PyArray_Type, tp_descr, shape.size(), shape.empty() ? NULL : &shape.front(), strides.empty() ? NULL : &strides.front(), mapped, ary_flags, /*obj*/NULL)); py::handle<> map_py(handle_from_new_ptr(map.release())); PyArray_BASE(result.get()) = map_py.get(); Py_INCREF(map_py.get()); return py::make_tuple( result, handle_from_new_ptr(new event(evt_handle)), row_pitch, slice_pitch); } // }}} // {{{ sampler class sampler : boost::noncopyable { private: cl_sampler m_sampler; public: sampler(context const &ctx, bool normalized_coordinates, cl_addressing_mode am, cl_filter_mode fm) { cl_int status_code; PYOPENCL_PRINT_CALL_TRACE("clCreateSampler"); m_sampler = clCreateSampler( ctx.data(), normalized_coordinates, am, fm, &status_code); if (status_code != CL_SUCCESS) throw pyopencl::error("Sampler", status_code); } sampler(cl_sampler samp, bool retain) : m_sampler(samp) { if (retain) PYOPENCL_CALL_GUARDED(clRetainSampler, (samp)); } ~sampler() { PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseSampler, (m_sampler)); } cl_sampler data() const { return m_sampler; } PYOPENCL_EQUALITY_TESTS(sampler); py::object get_info(cl_sampler_info param_name) const { switch (param_name) { case CL_SAMPLER_REFERENCE_COUNT: PYOPENCL_GET_INTEGRAL_INFO(Sampler, m_sampler, param_name, cl_uint); case CL_SAMPLER_CONTEXT: PYOPENCL_GET_OPAQUE_INFO(Sampler, m_sampler, param_name, cl_context, context); case CL_SAMPLER_ADDRESSING_MODE: PYOPENCL_GET_INTEGRAL_INFO(Sampler, m_sampler, param_name, cl_addressing_mode); case CL_SAMPLER_FILTER_MODE: PYOPENCL_GET_INTEGRAL_INFO(Sampler, m_sampler, param_name, cl_filter_mode); case CL_SAMPLER_NORMALIZED_COORDS: PYOPENCL_GET_INTEGRAL_INFO(Sampler, m_sampler, param_name, cl_bool); default: throw error("Sampler.get_info", CL_INVALID_VALUE); } } }; // }}} // {{{ program class program : boost::noncopyable { public: enum program_kind_type { KND_UNKNOWN, KND_SOURCE, KND_BINARY }; private: cl_program m_program; program_kind_type m_program_kind; public: program(cl_program prog, bool retain, program_kind_type progkind=KND_UNKNOWN) : m_program(prog), m_program_kind(progkind) { if (retain) PYOPENCL_CALL_GUARDED(clRetainProgram, (prog)); } ~program() { PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseProgram, (m_program)); } cl_program data() const { return m_program; } program_kind_type kind() const { return m_program_kind; } PYOPENCL_EQUALITY_TESTS(program); py::object get_info(cl_program_info param_name) const { switch (param_name) { case CL_PROGRAM_REFERENCE_COUNT: PYOPENCL_GET_INTEGRAL_INFO(Program, m_program, param_name, cl_uint); case CL_PROGRAM_CONTEXT: PYOPENCL_GET_OPAQUE_INFO(Program, m_program, param_name, cl_context, context); case CL_PROGRAM_NUM_DEVICES: PYOPENCL_GET_INTEGRAL_INFO(Program, m_program, param_name, cl_uint); case CL_PROGRAM_DEVICES: { std::vector result; PYOPENCL_GET_VEC_INFO(Program, m_program, param_name, result); py::list py_result; BOOST_FOREACH(cl_device_id did, result) py_result.append(handle_from_new_ptr( new pyopencl::device(did))); return py_result; } case CL_PROGRAM_SOURCE: PYOPENCL_GET_STR_INFO(Program, m_program, param_name); case CL_PROGRAM_BINARY_SIZES: { std::vector result; PYOPENCL_GET_VEC_INFO(Program, m_program, param_name, result); PYOPENCL_RETURN_VECTOR(size_t, result); } case CL_PROGRAM_BINARIES: // {{{ { std::vector sizes; PYOPENCL_GET_VEC_INFO(Program, m_program, CL_PROGRAM_BINARY_SIZES, sizes); size_t 
total_size = std::accumulate(sizes.begin(), sizes.end(), 0); boost::scoped_array<unsigned char> result( new unsigned char[total_size]); std::vector<unsigned char *> result_ptrs; unsigned char *ptr = result.get(); for (unsigned i = 0; i < sizes.size(); ++i) { result_ptrs.push_back(ptr); ptr += sizes[i]; } PYOPENCL_CALL_GUARDED(clGetProgramInfo, (m_program, param_name, sizes.size()*sizeof(unsigned char *), result_ptrs.empty( ) ? NULL : &result_ptrs.front(), 0)); py::list py_result; ptr = result.get(); for (unsigned i = 0; i < sizes.size(); ++i) { py::handle<> binary_pyobj(
#if PY_VERSION_HEX >= 0x03000000
PyBytes_FromStringAndSize( reinterpret_cast<char *>(ptr), sizes[i])
#else
PyString_FromStringAndSize( reinterpret_cast<char *>(ptr), sizes[i])
#endif
); py_result.append(binary_pyobj); ptr += sizes[i]; } return py_result; } // }}}
#if PYOPENCL_CL_VERSION >= 0x1020
case CL_PROGRAM_NUM_KERNELS: PYOPENCL_GET_INTEGRAL_INFO(Program, m_program, param_name, size_t); case CL_PROGRAM_KERNEL_NAMES: PYOPENCL_GET_STR_INFO(Program, m_program, param_name);
#endif
default: throw error("Program.get_info", CL_INVALID_VALUE); } } py::object get_build_info( device const &dev, cl_program_build_info param_name) const { switch (param_name) {
#define PYOPENCL_FIRST_ARG m_program, dev.data() // hackety hack
case CL_PROGRAM_BUILD_STATUS: PYOPENCL_GET_INTEGRAL_INFO(ProgramBuild, PYOPENCL_FIRST_ARG, param_name, cl_build_status); case CL_PROGRAM_BUILD_OPTIONS: case CL_PROGRAM_BUILD_LOG: PYOPENCL_GET_STR_INFO(ProgramBuild, PYOPENCL_FIRST_ARG, param_name);
#if PYOPENCL_CL_VERSION >= 0x1020
case CL_PROGRAM_BINARY_TYPE: PYOPENCL_GET_INTEGRAL_INFO(ProgramBuild, PYOPENCL_FIRST_ARG, param_name, cl_program_binary_type);
#endif
#undef PYOPENCL_FIRST_ARG
default: throw error("Program.get_build_info", CL_INVALID_VALUE); } } void build(std::string options, py::object py_devices) { PYOPENCL_PARSE_PY_DEVICES; PYOPENCL_CALL_GUARDED_THREADED(clBuildProgram, (m_program, num_devices, devices, options.c_str(), 0, 0)); }
#if PYOPENCL_CL_VERSION >= 0x1020
void compile(std::string options, py::object py_devices, py::object py_headers) { PYOPENCL_PARSE_PY_DEVICES;
// {{{ pick apart py_headers
// py_headers is a list of tuples *(name, program)*
std::vector<std::string> header_names; std::vector<cl_program> programs; PYTHON_FOREACH(name_hdr_tup, py_headers) { if (py::len(name_hdr_tup) != 2) throw error("Program.compile", CL_INVALID_VALUE, "expected (name, header) tuple in headers list"); std::string name = py::extract<std::string>(name_hdr_tup[0]); program &prg = py::extract<program &>(name_hdr_tup[1]); header_names.push_back(name); programs.push_back(prg.data()); } std::vector<const char *> header_name_ptrs; BOOST_FOREACH(std::string const &name, header_names) header_name_ptrs.push_back(name.c_str());
// }}}
PYOPENCL_CALL_GUARDED_THREADED(clCompileProgram, (m_program, num_devices, devices, options.c_str(), header_names.size(), programs.empty() ? NULL : &programs.front(), header_name_ptrs.empty() ? NULL : &header_name_ptrs.front(), 0, 0)); }
#endif
}; inline program *create_program_with_source( context &ctx, std::string const &src) { const char *string = src.c_str(); size_t length = src.size(); cl_int status_code; PYOPENCL_PRINT_CALL_TRACE("clCreateProgramWithSource"); cl_program result = clCreateProgramWithSource( ctx.data(), 1, &string, &length, &status_code); if (status_code != CL_SUCCESS) throw pyopencl::error("clCreateProgramWithSource", status_code); try { return new program(result, false, program::KND_SOURCE); } catch (...)
{ clReleaseProgram(result); throw; } } inline program *create_program_with_binary( context &ctx, py::object py_devices, py::object py_binaries) { std::vector<cl_device_id> devices; std::vector<const unsigned char *> binaries; std::vector<size_t> sizes; std::vector<cl_int> binary_statuses; int num_devices = len(py_devices); if (len(py_binaries) != num_devices) throw error("create_program_with_binary", CL_INVALID_VALUE, "device and binary counts don't match"); for (int i = 0; i < num_devices; ++i) { devices.push_back( py::extract<device const &>(py_devices[i])().data()); const void *buf; PYOPENCL_BUFFER_SIZE_T len; if (PyObject_AsReadBuffer( py::object(py_binaries[i]).ptr(), &buf, &len)) throw py::error_already_set(); binaries.push_back(reinterpret_cast<const unsigned char *>(buf)); sizes.push_back(len); } binary_statuses.resize(num_devices); cl_int status_code; PYOPENCL_PRINT_CALL_TRACE("clCreateProgramWithBinary"); cl_program result = clCreateProgramWithBinary( ctx.data(), num_devices, devices.empty( ) ? NULL : &devices.front(), sizes.empty( ) ? NULL : &sizes.front(), binaries.empty( ) ? NULL : &binaries.front(), binary_statuses.empty( ) ? NULL : &binary_statuses.front(), &status_code); if (status_code != CL_SUCCESS) throw pyopencl::error("clCreateProgramWithBinary", status_code); /* for (int i = 0; i < num_devices; ++i) printf("%d:%d\n", i, binary_statuses[i]); */ try { return new program(result, false, program::KND_BINARY); } catch (...) { clReleaseProgram(result); throw; } }
#if (PYOPENCL_CL_VERSION >= 0x1020) && \
((PYOPENCL_CL_VERSION >= 0x1030) && defined(__APPLE__))
inline program *create_program_with_built_in_kernels( context &ctx, py::object py_devices, std::string const &kernel_names) { PYOPENCL_PARSE_PY_DEVICES; cl_int status_code; PYOPENCL_PRINT_CALL_TRACE("clCreateProgramWithBuiltInKernels"); cl_program result = clCreateProgramWithBuiltInKernels( ctx.data(), num_devices, devices, kernel_names.c_str(), &status_code); if (status_code != CL_SUCCESS) throw pyopencl::error("clCreateProgramWithBuiltInKernels", status_code); try { return new program(result, false); } catch (...) { clReleaseProgram(result); throw; } }
#endif
#if PYOPENCL_CL_VERSION >= 0x1020
inline program *link_program( context &ctx, py::object py_programs, std::string const &options, py::object py_devices ) { PYOPENCL_PARSE_PY_DEVICES; std::vector<cl_program> programs; PYTHON_FOREACH(py_prg, py_programs) { program &prg = py::extract<program &>(py_prg); programs.push_back(prg.data()); } cl_int status_code; PYOPENCL_PRINT_CALL_TRACE("clLinkProgram"); cl_program result = clLinkProgram( ctx.data(), num_devices, devices, options.c_str(), programs.size(), programs.empty() ? NULL : &programs.front(), 0, 0, &status_code); if (status_code != CL_SUCCESS) throw pyopencl::error("clLinkProgram", status_code); try { return new program(result, false); } catch (...)
{ clReleaseProgram(result); throw; } } #endif #if PYOPENCL_CL_VERSION >= 0x1020 inline void unload_platform_compiler(platform &plat) { PYOPENCL_CALL_GUARDED(clUnloadPlatformCompiler, (plat.data())); } #endif // }}} // {{{ kernel class local_memory { private: size_t m_size; public: local_memory(size_t size) : m_size(size) { } size_t size() const { return m_size; } }; class kernel : boost::noncopyable { private: cl_kernel m_kernel; public: kernel(cl_kernel knl, bool retain) : m_kernel(knl) { if (retain) PYOPENCL_CALL_GUARDED(clRetainKernel, (knl)); } kernel(program const &prg, std::string const &kernel_name) { cl_int status_code; PYOPENCL_PRINT_CALL_TRACE("clCreateKernel"); m_kernel = clCreateKernel(prg.data(), kernel_name.c_str(), &status_code); if (status_code != CL_SUCCESS) throw pyopencl::error("clCreateKernel", status_code); } ~kernel() { PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseKernel, (m_kernel)); } cl_kernel data() const { return m_kernel; } PYOPENCL_EQUALITY_TESTS(kernel); void set_arg_null(cl_uint arg_index) { cl_mem m = 0; PYOPENCL_CALL_GUARDED(clSetKernelArg, (m_kernel, arg_index, sizeof(cl_mem), &m)); } void set_arg_mem(cl_uint arg_index, memory_object_holder &moh) { cl_mem m = moh.data(); PYOPENCL_CALL_GUARDED(clSetKernelArg, (m_kernel, arg_index, sizeof(cl_mem), &m)); } void set_arg_local(cl_uint arg_index, local_memory const &loc) { PYOPENCL_CALL_GUARDED(clSetKernelArg, (m_kernel, arg_index, loc.size(), 0)); } void set_arg_sampler(cl_uint arg_index, sampler const &smp) { cl_sampler s = smp.data(); PYOPENCL_CALL_GUARDED(clSetKernelArg, (m_kernel, arg_index, sizeof(cl_sampler), &s)); } void set_arg_buf(cl_uint arg_index, py::object py_buffer) { const void *buf; PYOPENCL_BUFFER_SIZE_T len; if (PyObject_AsReadBuffer(py_buffer.ptr(), &buf, &len)) { PyErr_Clear(); throw error("Kernel.set_arg", CL_INVALID_VALUE, "invalid kernel argument"); } PYOPENCL_CALL_GUARDED(clSetKernelArg, (m_kernel, arg_index, len, buf)); } void set_arg(cl_uint arg_index, py::object arg) { if (arg.ptr() == Py_None) { set_arg_null(arg_index); return; } py::extract ex_mo(arg); if (ex_mo.check()) { set_arg_mem(arg_index, ex_mo()); return; } py::extract ex_loc(arg); if (ex_loc.check()) { set_arg_local(arg_index, ex_loc()); return; } py::extract ex_smp(arg); if (ex_smp.check()) { set_arg_sampler(arg_index, ex_smp()); return; } set_arg_buf(arg_index, arg); } py::object get_info(cl_kernel_info param_name) const { switch (param_name) { case CL_KERNEL_FUNCTION_NAME: PYOPENCL_GET_STR_INFO(Kernel, m_kernel, param_name); case CL_KERNEL_NUM_ARGS: case CL_KERNEL_REFERENCE_COUNT: PYOPENCL_GET_INTEGRAL_INFO(Kernel, m_kernel, param_name, cl_uint); case CL_KERNEL_CONTEXT: PYOPENCL_GET_OPAQUE_INFO(Kernel, m_kernel, param_name, cl_context, context); case CL_KERNEL_PROGRAM: PYOPENCL_GET_OPAQUE_INFO(Kernel, m_kernel, param_name, cl_program, program); #if PYOPENCL_CL_VERSION >= 0x1020 case CL_KERNEL_ATTRIBUTES: PYOPENCL_GET_STR_INFO(Kernel, m_kernel, param_name); #endif default: throw error("Kernel.get_info", CL_INVALID_VALUE); } } py::object get_work_group_info( cl_kernel_work_group_info param_name, device const &dev ) const { switch (param_name) { #define PYOPENCL_FIRST_ARG m_kernel, dev.data() // hackety hack case CL_KERNEL_WORK_GROUP_SIZE: PYOPENCL_GET_INTEGRAL_INFO(KernelWorkGroup, PYOPENCL_FIRST_ARG, param_name, size_t); case CL_KERNEL_COMPILE_WORK_GROUP_SIZE: { std::vector result; PYOPENCL_GET_VEC_INFO(KernelWorkGroup, PYOPENCL_FIRST_ARG, param_name, result); PYOPENCL_RETURN_VECTOR(size_t, result); } case 
CL_KERNEL_LOCAL_MEM_SIZE: #if PYOPENCL_CL_VERSION >= 0x1010 case CL_KERNEL_PRIVATE_MEM_SIZE: #endif PYOPENCL_GET_INTEGRAL_INFO(KernelWorkGroup, PYOPENCL_FIRST_ARG, param_name, cl_ulong); #if PYOPENCL_CL_VERSION >= 0x1010 case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: PYOPENCL_GET_INTEGRAL_INFO(KernelWorkGroup, PYOPENCL_FIRST_ARG, param_name, size_t); #endif default: throw error("Kernel.get_work_group_info", CL_INVALID_VALUE); #undef PYOPENCL_FIRST_ARG } } #if PYOPENCL_CL_VERSION >= 0x1020 py::object get_arg_info( cl_uint arg_index, cl_kernel_arg_info param_name ) const { switch (param_name) { #define PYOPENCL_FIRST_ARG m_kernel, arg_index // hackety hack case CL_KERNEL_ARG_ADDRESS_QUALIFIER: PYOPENCL_GET_INTEGRAL_INFO(KernelArg, PYOPENCL_FIRST_ARG, param_name, cl_kernel_arg_address_qualifier); case CL_KERNEL_ARG_ACCESS_QUALIFIER: PYOPENCL_GET_INTEGRAL_INFO(KernelArg, PYOPENCL_FIRST_ARG, param_name, cl_kernel_arg_access_qualifier); case CL_KERNEL_ARG_TYPE_NAME: case CL_KERNEL_ARG_NAME: PYOPENCL_GET_STR_INFO(KernelArg, PYOPENCL_FIRST_ARG, param_name); #undef PYOPENCL_FIRST_ARG default: throw error("Kernel.get_arg_info", CL_INVALID_VALUE); } } #endif }; inline py::list create_kernels_in_program(program &pgm) { cl_uint num_kernels; PYOPENCL_CALL_GUARDED(clCreateKernelsInProgram, ( pgm.data(), 0, 0, &num_kernels)); std::vector kernels(num_kernels); PYOPENCL_CALL_GUARDED(clCreateKernelsInProgram, ( pgm.data(), num_kernels, kernels.empty( ) ? NULL : &kernels.front(), &num_kernels)); py::list result; BOOST_FOREACH(cl_kernel knl, kernels) result.append(handle_from_new_ptr(new kernel(knl, true))); return result; } inline event *enqueue_nd_range_kernel( command_queue &cq, kernel &knl, py::object py_global_work_size, py::object py_local_work_size, py::object py_global_work_offset, py::object py_wait_for, bool g_times_l) { PYOPENCL_PARSE_WAIT_FOR; cl_uint work_dim = len(py_global_work_size); std::vector global_work_size; COPY_PY_LIST(size_t, global_work_size); size_t *local_work_size_ptr = 0; std::vector local_work_size; if (py_local_work_size.ptr() != Py_None) { if (g_times_l) work_dim = std::max(work_dim, unsigned(len(py_local_work_size))); else if (work_dim != unsigned(len(py_local_work_size))) throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE, "global/local work sizes have differing dimensions"); COPY_PY_LIST(size_t, local_work_size); while (local_work_size.size() < work_dim) local_work_size.push_back(1); while (global_work_size.size() < work_dim) global_work_size.push_back(1); local_work_size_ptr = local_work_size.empty( ) ? NULL : &local_work_size.front(); } if (g_times_l && local_work_size_ptr) { for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis) global_work_size[work_axis] *= local_work_size[work_axis]; } size_t *global_work_offset_ptr = 0; std::vector global_work_offset; if (py_global_work_offset.ptr() != Py_None) { if (work_dim != unsigned(len(py_global_work_offset))) throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE, "global work size and offset have differing dimensions"); COPY_PY_LIST(size_t, global_work_offset); if (g_times_l && local_work_size_ptr) { for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis) global_work_offset[work_axis] *= local_work_size[work_axis]; } global_work_offset_ptr = global_work_offset.empty( ) ? NULL : &global_work_offset.front(); } PYOPENCL_RETRY_RETURN_IF_MEM_ERROR( { cl_event evt; PYOPENCL_CALL_GUARDED(clEnqueueNDRangeKernel, ( cq.data(), knl.data(), work_dim, global_work_offset_ptr, global_work_size.empty( ) ? 
NULL : &global_work_size.front(), local_work_size_ptr, PYOPENCL_WAITLIST_ARGS, &evt )); PYOPENCL_RETURN_NEW_EVENT(evt); } ); } inline event *enqueue_task( command_queue &cq, kernel &knl, py::object py_wait_for) { PYOPENCL_PARSE_WAIT_FOR; PYOPENCL_RETRY_RETURN_IF_MEM_ERROR( { cl_event evt; PYOPENCL_CALL_GUARDED(clEnqueueTask, ( cq.data(), knl.data(), PYOPENCL_WAITLIST_ARGS, &evt )); PYOPENCL_RETURN_NEW_EVENT(evt); } ); } // }}} // {{{ gl interop inline bool have_gl() { #ifdef HAVE_GL return true; #else return false; #endif } #ifdef HAVE_GL #ifdef __APPLE__ inline cl_context_properties get_apple_cgl_share_group() { CGLContextObj kCGLContext = CGLGetCurrentContext(); CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext); return (cl_context_properties) kCGLShareGroup; } #endif /* __APPLE__ */ class gl_buffer : public memory_object { public: gl_buffer(cl_mem mem, bool retain, py::object *hostbuf=0) : memory_object(mem, retain, hostbuf) { } }; class gl_renderbuffer : public memory_object { public: gl_renderbuffer(cl_mem mem, bool retain, py::object *hostbuf=0) : memory_object(mem, retain, hostbuf) { } }; class gl_texture : public image { public: gl_texture(cl_mem mem, bool retain, py::object *hostbuf=0) : image(mem, retain, hostbuf) { } py::object get_gl_texture_info(cl_gl_texture_info param_name) { switch (param_name) { case CL_GL_TEXTURE_TARGET: PYOPENCL_GET_INTEGRAL_INFO(GLTexture, data(), param_name, GLenum); case CL_GL_MIPMAP_LEVEL: PYOPENCL_GET_INTEGRAL_INFO(GLTexture, data(), param_name, GLint); default: throw error("MemoryObject.get_gl_texture_info", CL_INVALID_VALUE); } } }; #define PYOPENCL_WRAP_BUFFER_CREATOR(TYPE, NAME, CL_NAME, ARGS, CL_ARGS) \ inline \ TYPE *NAME ARGS \ { \ cl_int status_code; \ PYOPENCL_PRINT_CALL_TRACE(#CL_NAME); \ cl_mem mem = CL_NAME CL_ARGS; \ \ if (status_code != CL_SUCCESS) \ throw pyopencl::error(#CL_NAME, status_code); \ \ try \ { \ return new TYPE(mem, false); \ } \ catch (...) 
\ { \ PYOPENCL_CALL_GUARDED(clReleaseMemObject, (mem)); \ throw; \ } \ } PYOPENCL_WRAP_BUFFER_CREATOR(gl_buffer, create_from_gl_buffer, clCreateFromGLBuffer, (context &ctx, cl_mem_flags flags, GLuint bufobj), (ctx.data(), flags, bufobj, &status_code)); PYOPENCL_WRAP_BUFFER_CREATOR(gl_texture, create_from_gl_texture_2d, clCreateFromGLTexture2D, (context &ctx, cl_mem_flags flags, GLenum texture_target, GLint miplevel, GLuint texture), (ctx.data(), flags, texture_target, miplevel, texture, &status_code)); PYOPENCL_WRAP_BUFFER_CREATOR(gl_texture, create_from_gl_texture_3d, clCreateFromGLTexture3D, (context &ctx, cl_mem_flags flags, GLenum texture_target, GLint miplevel, GLuint texture), (ctx.data(), flags, texture_target, miplevel, texture, &status_code)); PYOPENCL_WRAP_BUFFER_CREATOR(gl_renderbuffer, create_from_gl_renderbuffer, clCreateFromGLRenderbuffer, (context &ctx, cl_mem_flags flags, GLuint renderbuffer), (ctx.data(), flags, renderbuffer, &status_code)); inline gl_texture *create_from_gl_texture( context &ctx, cl_mem_flags flags, GLenum texture_target, GLint miplevel, GLuint texture, unsigned dims) { if (dims == 2) return create_from_gl_texture_2d(ctx, flags, texture_target, miplevel, texture); else if (dims == 3) return create_from_gl_texture_3d(ctx, flags, texture_target, miplevel, texture); else throw pyopencl::error("Image", CL_INVALID_VALUE, "invalid dimension"); } inline py::tuple get_gl_object_info(memory_object_holder const &mem) { cl_gl_object_type otype; GLuint gl_name; PYOPENCL_CALL_GUARDED(clGetGLObjectInfo, (mem.data(), &otype, &gl_name)); return py::make_tuple(otype, gl_name); } #define WRAP_GL_ENQUEUE(what, What) \ inline \ event *enqueue_##what##_gl_objects( \ command_queue &cq, \ py::object py_mem_objects, \ py::object py_wait_for) \ { \ PYOPENCL_PARSE_WAIT_FOR; \ \ std::vector mem_objects; \ PYTHON_FOREACH(mo, py_mem_objects) \ mem_objects.push_back(py::extract(mo)().data()); \ \ cl_event evt; \ PYOPENCL_CALL_GUARDED(clEnqueue##What##GLObjects, ( \ cq.data(), \ mem_objects.size(), mem_objects.empty( ) ? NULL : &mem_objects.front(), \ PYOPENCL_WAITLIST_ARGS, &evt \ )); \ \ PYOPENCL_RETURN_NEW_EVENT(evt); \ } WRAP_GL_ENQUEUE(acquire, Acquire); WRAP_GL_ENQUEUE(release, Release); #endif #if defined(cl_khr_gl_sharing) && (cl_khr_gl_sharing >= 1) inline py::object get_gl_context_info_khr( py::object py_properties, cl_gl_context_info param_name, py::object py_platform ) { std::vector props = parse_context_properties(py_properties); typedef CL_API_ENTRY cl_int (CL_API_CALL *func_ptr_type)(const cl_context_properties * /* properties */, cl_gl_context_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; func_ptr_type func_ptr; #if PYOPENCL_CL_VERSION >= 0x1020 if (py_platform.ptr() != Py_None) { platform &plat = py::extract(py_platform); func_ptr = (func_ptr_type) clGetExtensionFunctionAddressForPlatform( plat.data(), "clGetGLContextInfoKHR"); } else { PYOPENCL_DEPRECATED("get_gl_context_info_khr with platform=None", "2013.1", ); func_ptr = (func_ptr_type) clGetExtensionFunctionAddress( "clGetGLContextInfoKHR"); } #else func_ptr = (func_ptr_type) clGetExtensionFunctionAddress( "clGetGLContextInfoKHR"); #endif if (!func_ptr) throw error("Context.get_info", CL_INVALID_PLATFORM, "clGetGLContextInfoKHR extension function not present"); cl_context_properties *props_ptr = props.empty( ) ? 
NULL : &props.front(); switch (param_name) { case CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR: { cl_device_id param_value; PYOPENCL_CALL_GUARDED(func_ptr, (props_ptr, param_name, sizeof(param_value), ¶m_value, 0)); return py::object(handle_from_new_ptr( \ new device(param_value, /*retain*/ true))); } case CL_DEVICES_FOR_GL_CONTEXT_KHR: { size_t size; PYOPENCL_CALL_GUARDED(func_ptr, (props_ptr, param_name, 0, 0, &size)); std::vector devices; devices.resize(size / sizeof(devices.front())); PYOPENCL_CALL_GUARDED(func_ptr, (props_ptr, param_name, size, devices.empty( ) ? NULL : &devices.front(), &size)); py::list result; BOOST_FOREACH(cl_device_id did, devices) result.append(handle_from_new_ptr( new device(did))); return result; } default: throw error("get_gl_context_info_khr", CL_INVALID_VALUE); } } #endif // }}} // {{{ deferred implementation bits inline py::object create_mem_object_wrapper(cl_mem mem) { cl_mem_object_type mem_obj_type; PYOPENCL_CALL_GUARDED(clGetMemObjectInfo, \ (mem, CL_MEM_TYPE, sizeof(mem_obj_type), &mem_obj_type, 0)); switch (mem_obj_type) { case CL_MEM_OBJECT_BUFFER: return py::object(handle_from_new_ptr( new buffer(mem, /*retain*/ true))); case CL_MEM_OBJECT_IMAGE2D: case CL_MEM_OBJECT_IMAGE3D: #if PYOPENCL_CL_VERSION >= 0x1020 case CL_MEM_OBJECT_IMAGE2D_ARRAY: case CL_MEM_OBJECT_IMAGE1D: case CL_MEM_OBJECT_IMAGE1D_ARRAY: case CL_MEM_OBJECT_IMAGE1D_BUFFER: #endif return py::object(handle_from_new_ptr( new image(mem, /*retain*/ true))); default: return py::object(handle_from_new_ptr( new memory_object(mem, /*retain*/ true))); } } inline py::object memory_object_from_int(intptr_t cl_mem_as_int) { return create_mem_object_wrapper((cl_mem) cl_mem_as_int); } inline py::object memory_object_holder::get_info(cl_mem_info param_name) const { switch (param_name) { case CL_MEM_TYPE: PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name, cl_mem_object_type); case CL_MEM_FLAGS: PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name, cl_mem_flags); case CL_MEM_SIZE: PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name, size_t); case CL_MEM_HOST_PTR: throw pyopencl::error("MemoryObject.get_info", CL_INVALID_VALUE, "Use MemoryObject.get_host_array to get host pointer."); case CL_MEM_MAP_COUNT: PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name, cl_uint); case CL_MEM_REFERENCE_COUNT: PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name, cl_uint); case CL_MEM_CONTEXT: PYOPENCL_GET_OPAQUE_INFO(MemObject, data(), param_name, cl_context, context); #if PYOPENCL_CL_VERSION >= 0x1010 case CL_MEM_ASSOCIATED_MEMOBJECT: { cl_mem param_value; PYOPENCL_CALL_GUARDED(clGetMemObjectInfo, \ (data(), param_name, sizeof(param_value), ¶m_value, 0)); if (param_value == 0) { // no associated memory object? no problem. 
return py::object(); } return create_mem_object_wrapper(param_value); } case CL_MEM_OFFSET: PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name, size_t); #endif default: throw error("MemoryObjectHolder.get_info", CL_INVALID_VALUE); } } inline py::handle<> get_mem_obj_host_array( py::object mem_obj_py, py::object shape, py::object dtype, py::object order_py) { memory_object_holder const &mem_obj = py::extract(mem_obj_py); PyArray_Descr *tp_descr; if (PyArray_DescrConverter(dtype.ptr(), &tp_descr) != NPY_SUCCEED) throw py::error_already_set(); py::extract shape_as_int(shape); std::vector dims; if (shape_as_int.check()) dims.push_back(shape_as_int()); else std::copy( py::stl_input_iterator(shape), py::stl_input_iterator(), back_inserter(dims)); NPY_ORDER order = PyArray_CORDER; PyArray_OrderConverter(order_py.ptr(), &order); int ary_flags = 0; if (order == PyArray_FORTRANORDER) ary_flags |= NPY_FARRAY; else if (order == PyArray_CORDER) ary_flags |= NPY_CARRAY; else throw std::runtime_error("unrecognized order specifier"); void *host_ptr; size_t mem_obj_size; PYOPENCL_CALL_GUARDED(clGetMemObjectInfo, (mem_obj.data(), CL_MEM_HOST_PTR, sizeof(host_ptr), &host_ptr, 0)); PYOPENCL_CALL_GUARDED(clGetMemObjectInfo, (mem_obj.data(), CL_MEM_SIZE, sizeof(mem_obj_size), &mem_obj_size, 0)); py::handle<> result = py::handle<>(PyArray_NewFromDescr( &PyArray_Type, tp_descr, dims.size(), &dims.front(), /*strides*/ NULL, host_ptr, ary_flags, /*obj*/NULL)); if ((size_t) PyArray_NBYTES(result.get()) > mem_obj_size) throw pyopencl::error("MemoryObject.get_host_array", CL_INVALID_VALUE, "Resulting array is larger than memory object."); PyArray_BASE(result.get()) = mem_obj_py.ptr(); Py_INCREF(mem_obj_py.ptr()); return result; } // }}} } #endif // vim: foldmethod=marker pyopencl-2013.2/src/wrapper/_pvt_struct_v3.cpp0000644000175000000500000013433412245716340020115 0ustar tomussrc/* struct module -- pack values into and (out of) bytes objects */ /* New version supporting byte order, alignment and size options, character strings, and unsigned numbers */ #define PY_SSIZE_T_CLEAN #include "Python.h" #include "structmember.h" #include #include "numpy_init.hpp" namespace { extern PyTypeObject PyStructType; } /* The translation function for each format character is table driven */ typedef struct _formatdef { char format; Py_ssize_t size; Py_ssize_t alignment; PyObject* (*unpack)(const char *, const struct _formatdef *); int (*pack)(char *, PyObject *, const struct _formatdef *); } formatdef; typedef struct _formatcode { const struct _formatdef *fmtdef; Py_ssize_t offset; Py_ssize_t size; } formatcode; /* Struct object interface */ typedef struct { PyObject_HEAD Py_ssize_t s_size; Py_ssize_t s_len; formatcode *s_codes; PyObject *s_format; PyObject *weakreflist; /* List of weak references */ } PyStructObject; #define PyStruct_Check(op) PyObject_TypeCheck(op, &PyStructType) #define PyStruct_CheckExact(op) (Py_TYPE(op) == &PyStructType) /* Exception */ static PyObject *StructError; /* Define various structs to figure out the alignments of types */ typedef struct { char c; short x; } st_short; typedef struct { char c; int x; } st_int; typedef struct { char c; long x; } st_long; typedef struct { char c; float x; } st_float; typedef struct { char c; double x; } st_double; typedef struct { char c; void *x; } st_void_p; typedef struct { char c; size_t x; } st_size_t; #define SHORT_ALIGN (sizeof(st_short) - sizeof(short)) #define INT_ALIGN (sizeof(st_int) - sizeof(int)) #define LONG_ALIGN (sizeof(st_long) - sizeof(long)) 
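/* Note on the probe structs above: pairing a char with the probed type means
   sizeof(st_T) - sizeof(T) equals the padding inserted after the char, i.e.
   the alignment requirement of T; e.g. on a typical LP64 platform LONG_ALIGN
   works out to 8, the same value that alignof(long) reports. */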
#define FLOAT_ALIGN (sizeof(st_float) - sizeof(float)) #define DOUBLE_ALIGN (sizeof(st_double) - sizeof(double)) #define VOID_P_ALIGN (sizeof(st_void_p) - sizeof(void *)) #define SIZE_T_ALIGN (sizeof(st_size_t) - sizeof(size_t)) /* We can't support q and Q in native mode unless the compiler does; in std mode, they're 8 bytes on all platforms. */ #ifdef HAVE_LONG_LONG typedef struct { char c; PY_LONG_LONG x; } s_long_long; #define LONG_LONG_ALIGN (sizeof(s_long_long) - sizeof(PY_LONG_LONG)) #endif #if !defined(__cplusplus) && defined(HAVE_C99_BOOL) #define BOOL_TYPE _Bool typedef struct { char c; _Bool x; } s_bool; #define BOOL_ALIGN (sizeof(s_bool) - sizeof(BOOL_TYPE)) #else #define BOOL_TYPE char #define BOOL_ALIGN 0 #endif #define STRINGIFY(x) #x #ifdef __powerc #pragma options align=reset #endif /* Helper for integer format codes: converts an arbitrary Python object to a PyLongObject if possible, otherwise fails. Caller should decref. */ static PyObject * get_pylong(PyObject *v) { assert(v != NULL); if (!PyLong_Check(v)) { /* Not an integer; try to use __index__ to convert. */ if (PyIndex_Check(v)) { v = PyNumber_Index(v); if (v == NULL) return NULL; } else { PyErr_SetString(StructError, "required argument is not an integer"); return NULL; } } else Py_INCREF(v); assert(PyLong_Check(v)); return v; } /* Helper routine to get a C long and raise the appropriate error if it isn't one */ static int get_long(PyObject *v, long *p) { long x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsLong(v); Py_DECREF(v); if (x == (long)-1 && PyErr_Occurred()) { if (PyErr_ExceptionMatches(PyExc_OverflowError)) PyErr_SetString(StructError, "argument out of range"); return -1; } *p = x; return 0; } /* Same, but handling unsigned long */ static int get_ulong(PyObject *v, unsigned long *p) { unsigned long x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsUnsignedLong(v); Py_DECREF(v); if (x == (unsigned long)-1 && PyErr_Occurred()) { if (PyErr_ExceptionMatches(PyExc_OverflowError)) PyErr_SetString(StructError, "argument out of range"); return -1; } *p = x; return 0; } #ifdef HAVE_LONG_LONG /* Same, but handling native long long. */ static int get_longlong(PyObject *v, PY_LONG_LONG *p) { PY_LONG_LONG x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsLongLong(v); Py_DECREF(v); if (x == (PY_LONG_LONG)-1 && PyErr_Occurred()) { if (PyErr_ExceptionMatches(PyExc_OverflowError)) PyErr_SetString(StructError, "argument out of range"); return -1; } *p = x; return 0; } /* Same, but handling native unsigned long long. 
*/ static int get_ulonglong(PyObject *v, unsigned PY_LONG_LONG *p) { unsigned PY_LONG_LONG x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsUnsignedLongLong(v); Py_DECREF(v); if (x == (unsigned PY_LONG_LONG)-1 && PyErr_Occurred()) { if (PyErr_ExceptionMatches(PyExc_OverflowError)) PyErr_SetString(StructError, "argument out of range"); return -1; } *p = x; return 0; } #endif /* Same, but handling Py_ssize_t */ static int get_ssize_t(PyObject *v, Py_ssize_t *p) { Py_ssize_t x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsSsize_t(v); Py_DECREF(v); if (x == (Py_ssize_t)-1 && PyErr_Occurred()) { if (PyErr_ExceptionMatches(PyExc_OverflowError)) PyErr_SetString(StructError, "argument out of range"); return -1; } *p = x; return 0; } /* Same, but handling size_t */ static int get_size_t(PyObject *v, size_t *p) { size_t x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsSize_t(v); Py_DECREF(v); if (x == (size_t)-1 && PyErr_Occurred()) { if (PyErr_ExceptionMatches(PyExc_OverflowError)) PyErr_SetString(StructError, "argument out of range"); return -1; } *p = x; return 0; } #define RANGE_ERROR(x, f, flag, mask) return _range_error(f, flag) /* Floating point helpers */ #if 0 static PyObject * unpack_float(const char *p, /* start of 4-byte string */ int le) /* true for little-endian, false for big-endian */ { double x; x = _PyFloat_Unpack4((unsigned char *)p, le); if (x == -1.0 && PyErr_Occurred()) return NULL; return PyFloat_FromDouble(x); } static PyObject * unpack_double(const char *p, /* start of 8-byte string */ int le) /* true for little-endian, false for big-endian */ { double x; x = _PyFloat_Unpack8((unsigned char *)p, le); if (x == -1.0 && PyErr_Occurred()) return NULL; return PyFloat_FromDouble(x); } #endif /* Helper to format the range error exceptions */ static int _range_error(const formatdef *f, int is_unsigned) { /* ulargest is the largest unsigned value with f->size bytes. * Note that the simpler: * ((size_t)1 << (f->size * 8)) - 1 * doesn't work when f->size == sizeof(size_t) because C doesn't * define what happens when a left shift count is >= the number of * bits in the integer being shifted; e.g., on some boxes it doesn't * shift at all when they're equal. */ const size_t ulargest = (size_t)-1 >> ((SIZEOF_SIZE_T - f->size)*8); assert(f->size >= 1 && f->size <= SIZEOF_SIZE_T); if (is_unsigned) PyErr_Format(StructError, "'%c' format requires 0 <= number <= %zu", f->format, ulargest); else { const Py_ssize_t largest = (Py_ssize_t)(ulargest >> 1); PyErr_Format(StructError, "'%c' format requires %zd <= number <= %zd", f->format, ~ largest, largest); } return -1; } /* A large number of small routines follow, with names of the form [bln][up]_TYPE [bln] distiguishes among big-endian, little-endian and native. [pu] distiguishes between pack (to struct) and unpack (from struct). TYPE is one of char, byte, ubyte, etc. */ // {{{ /* Native mode routines. ****************************************************/ /* NOTE: In all n[up]_ routines handling types larger than 1 byte, there is *no* guarantee that the p pointer is properly aligned for each type, therefore memcpy is called. An intermediate variable is used to compensate for big-endian architectures. Normally both the intermediate variable and the memcpy call will be skipped by C optimisation in little-endian architectures (gcc >= 2.91 does this). 
*/ static PyObject * nu_char(const char *p, const formatdef *f) { return PyBytes_FromStringAndSize(p, 1); } static PyObject * nu_byte(const char *p, const formatdef *f) { return PyLong_FromLong((long) *(signed char *)p); } static PyObject * nu_ubyte(const char *p, const formatdef *f) { return PyLong_FromLong((long) *(unsigned char *)p); } static PyObject * nu_short(const char *p, const formatdef *f) { short x; memcpy((char *)&x, p, sizeof x); return PyLong_FromLong((long)x); } static PyObject * nu_ushort(const char *p, const formatdef *f) { unsigned short x; memcpy((char *)&x, p, sizeof x); return PyLong_FromLong((long)x); } static PyObject * nu_int(const char *p, const formatdef *f) { int x; memcpy((char *)&x, p, sizeof x); return PyLong_FromLong((long)x); } static PyObject * nu_uint(const char *p, const formatdef *f) { unsigned int x; memcpy((char *)&x, p, sizeof x); #if (SIZEOF_LONG > SIZEOF_INT) return PyLong_FromLong((long)x); #else if (x <= ((unsigned int)LONG_MAX)) return PyLong_FromLong((long)x); return PyLong_FromUnsignedLong((unsigned long)x); #endif } static PyObject * nu_long(const char *p, const formatdef *f) { long x; memcpy((char *)&x, p, sizeof x); return PyLong_FromLong(x); } static PyObject * nu_ulong(const char *p, const formatdef *f) { unsigned long x; memcpy((char *)&x, p, sizeof x); if (x <= LONG_MAX) return PyLong_FromLong((long)x); return PyLong_FromUnsignedLong(x); } static PyObject * nu_ssize_t(const char *p, const formatdef *f) { Py_ssize_t x; memcpy((char *)&x, p, sizeof x); return PyLong_FromSsize_t(x); } static PyObject * nu_size_t(const char *p, const formatdef *f) { size_t x; memcpy((char *)&x, p, sizeof x); return PyLong_FromSize_t(x); } /* Native mode doesn't support q or Q unless the platform C supports long long (or, on Windows, __int64). 
*/ #ifdef HAVE_LONG_LONG static PyObject * nu_longlong(const char *p, const formatdef *f) { PY_LONG_LONG x; memcpy((char *)&x, p, sizeof x); if (x >= LONG_MIN && x <= LONG_MAX) return PyLong_FromLong(Py_SAFE_DOWNCAST(x, PY_LONG_LONG, long)); return PyLong_FromLongLong(x); } static PyObject * nu_ulonglong(const char *p, const formatdef *f) { unsigned PY_LONG_LONG x; memcpy((char *)&x, p, sizeof x); if (x <= LONG_MAX) return PyLong_FromLong(Py_SAFE_DOWNCAST(x, unsigned PY_LONG_LONG, long)); return PyLong_FromUnsignedLongLong(x); } #endif static PyObject * nu_bool(const char *p, const formatdef *f) { BOOL_TYPE x; memcpy((char *)&x, p, sizeof x); return PyBool_FromLong(x != 0); } static PyObject * nu_float(const char *p, const formatdef *f) { float x; memcpy((char *)&x, p, sizeof x); return PyFloat_FromDouble((double)x); } static PyObject * nu_double(const char *p, const formatdef *f) { double x; memcpy((char *)&x, p, sizeof x); return PyFloat_FromDouble(x); } static PyObject * nu_complex_float(const char *p, const formatdef *f) { float re, im; memcpy((char *)&re, p, sizeof re); memcpy((char *)&im, p+sizeof re, sizeof im); return PyComplex_FromDoubles((double)re, (double) im); } static PyObject * nu_complex_double(const char *p, const formatdef *f) { double re, im; memcpy((char *)&re, p, sizeof re); memcpy((char *)&im, p+sizeof re, sizeof im); return PyComplex_FromDoubles(re, im); } static PyObject * nu_void_p(const char *p, const formatdef *f) { void *x; memcpy((char *)&x, p, sizeof x); return PyLong_FromVoidPtr(x); } static int np_byte(char *p, PyObject *v, const formatdef *f) { long x; if (get_long(v, &x) < 0) return -1; if (x < -128 || x > 127){ PyErr_SetString(StructError, "byte format requires -128 <= number <= 127"); return -1; } *p = (char)x; return 0; } static int np_ubyte(char *p, PyObject *v, const formatdef *f) { long x; if (get_long(v, &x) < 0) return -1; if (x < 0 || x > 255){ PyErr_SetString(StructError, "ubyte format requires 0 <= number <= 255"); return -1; } *p = (char)x; return 0; } static int np_char(char *p, PyObject *v, const formatdef *f) { if (!PyBytes_Check(v) || PyBytes_Size(v) != 1) { PyErr_SetString(StructError, "char format requires a bytes object of length 1"); return -1; } *p = *PyBytes_AsString(v); return 0; } static int np_short(char *p, PyObject *v, const formatdef *f) { long x; short y; if (get_long(v, &x) < 0) return -1; if (x < SHRT_MIN || x > SHRT_MAX){ PyErr_SetString(StructError, "short format requires " STRINGIFY(SHRT_MIN) " <= number <= " STRINGIFY(SHRT_MAX)); return -1; } y = (short)x; memcpy(p, (char *)&y, sizeof y); return 0; } static int np_ushort(char *p, PyObject *v, const formatdef *f) { long x; unsigned short y; if (get_long(v, &x) < 0) return -1; if (x < 0 || x > USHRT_MAX){ PyErr_SetString(StructError, "ushort format requires 0 <= number <= " STRINGIFY(USHRT_MAX)); return -1; } y = (unsigned short)x; memcpy(p, (char *)&y, sizeof y); return 0; } static int np_int(char *p, PyObject *v, const formatdef *f) { long x; int y; if (get_long(v, &x) < 0) return -1; #if (SIZEOF_LONG > SIZEOF_INT) if ((x < ((long)INT_MIN)) || (x > ((long)INT_MAX))) RANGE_ERROR(x, f, 0, -1); #endif y = (int)x; memcpy(p, (char *)&y, sizeof y); return 0; } static int np_uint(char *p, PyObject *v, const formatdef *f) { unsigned long x; unsigned int y; if (get_ulong(v, &x) < 0) return -1; y = (unsigned int)x; #if (SIZEOF_LONG > SIZEOF_INT) if (x > ((unsigned long)UINT_MAX)) RANGE_ERROR(y, f, 1, -1); #endif memcpy(p, (char *)&y, sizeof y); return 0; } static int np_long(char 
*p, PyObject *v, const formatdef *f) { long x; if (get_long(v, &x) < 0) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } static int np_ulong(char *p, PyObject *v, const formatdef *f) { unsigned long x; if (get_ulong(v, &x) < 0) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } static int np_ssize_t(char *p, PyObject *v, const formatdef *f) { Py_ssize_t x; if (get_ssize_t(v, &x) < 0) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } static int np_size_t(char *p, PyObject *v, const formatdef *f) { size_t x; if (get_size_t(v, &x) < 0) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } #ifdef HAVE_LONG_LONG static int np_longlong(char *p, PyObject *v, const formatdef *f) { PY_LONG_LONG x; if (get_longlong(v, &x) < 0) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } static int np_ulonglong(char *p, PyObject *v, const formatdef *f) { unsigned PY_LONG_LONG x; if (get_ulonglong(v, &x) < 0) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } #endif static int np_bool(char *p, PyObject *v, const formatdef *f) { int y; BOOL_TYPE x; y = PyObject_IsTrue(v); if (y < 0) return -1; x = y; memcpy(p, (char *)&x, sizeof x); return 0; } static int np_float(char *p, PyObject *v, const formatdef *f) { float x = (float)PyFloat_AsDouble(v); if (x == -1 && PyErr_Occurred()) { PyErr_SetString(StructError, "required argument is not a float"); return -1; } memcpy(p, (char *)&x, sizeof x); return 0; } static int np_double(char *p, PyObject *v, const formatdef *f) { double x = PyFloat_AsDouble(v); if (x == -1 && PyErr_Occurred()) { PyErr_SetString(StructError, "required argument is not a float"); return -1; } memcpy(p, (char *)&x, sizeof(double)); return 0; } static int np_complex_float(char *p, PyObject *v, const formatdef *f) { if (PyArray_IsZeroDim(v)) { PyObject *v_cast = PyArray_Cast( reinterpret_cast(v), NPY_CFLOAT); if (!v_cast) return -1; memcpy(p, PyArray_DATA(v_cast), PyArray_NBYTES(v_cast)); Py_DECREF(v_cast); } else { float re = 0.0f; float im = 0.0f; Py_complex cplx = PyComplex_AsCComplex(v); if (PyErr_Occurred()) { PyErr_SetString(StructError, "required argument is not a complex"); return -1; } re = (float)cplx.real; im = (float)cplx.imag; memcpy(p, (char *)&re, sizeof re); memcpy(p+sizeof re, (char *)&im, sizeof im); } return 0; } static int np_complex_double(char *p, PyObject *v, const formatdef *f) { if (PyArray_IsZeroDim(v)) { PyObject *v_cast = PyArray_Cast( reinterpret_cast(v), NPY_CDOUBLE); if (!v_cast) return -1; memcpy(p, PyArray_DATA(v_cast), PyArray_NBYTES(v_cast)); Py_DECREF(v_cast); } else { double re = 0.0; double im = 0.0; Py_complex cplx = PyComplex_AsCComplex(v); if (PyErr_Occurred()) { PyErr_SetString(StructError, "required argument is not a complex"); return -1; } re = cplx.real; im = cplx.imag; memcpy(p, (char *)&re, sizeof re); memcpy(p+sizeof re, (char *)&im, sizeof im); } return 0; } static int np_void_p(char *p, PyObject *v, const formatdef *f) { void *x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsVoidPtr(v); Py_DECREF(v); if (x == NULL && PyErr_Occurred()) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } static formatdef native_table[] = { {'x', sizeof(char), 0, NULL}, {'b', sizeof(char), 0, nu_byte, np_byte}, {'B', sizeof(char), 0, nu_ubyte, np_ubyte}, {'c', sizeof(char), 0, nu_char, np_char}, {'s', sizeof(char), 0, NULL}, {'p', sizeof(char), 0, NULL}, {'h', sizeof(short), SHORT_ALIGN, nu_short, np_short}, {'H', sizeof(short), SHORT_ALIGN, nu_ushort, np_ushort}, {'i', sizeof(int), INT_ALIGN, nu_int, 
np_int}, {'I', sizeof(int), INT_ALIGN, nu_uint, np_uint}, {'l', sizeof(long), LONG_ALIGN, nu_long, np_long}, {'L', sizeof(long), LONG_ALIGN, nu_ulong, np_ulong}, {'n', sizeof(size_t), SIZE_T_ALIGN, nu_ssize_t, np_ssize_t}, {'N', sizeof(size_t), SIZE_T_ALIGN, nu_size_t, np_size_t}, #ifdef HAVE_LONG_LONG {'q', sizeof(PY_LONG_LONG), LONG_LONG_ALIGN, nu_longlong, np_longlong}, {'Q', sizeof(PY_LONG_LONG), LONG_LONG_ALIGN, nu_ulonglong,np_ulonglong}, #endif {'?', sizeof(BOOL_TYPE), BOOL_ALIGN, nu_bool, np_bool}, {'f', sizeof(float), FLOAT_ALIGN, nu_float, np_float}, {'d', sizeof(double), DOUBLE_ALIGN, nu_double, np_double}, {'F', 2*sizeof(float), FLOAT_ALIGN, nu_complex_float, np_complex_float}, {'D', 2*sizeof(double), DOUBLE_ALIGN, nu_complex_double, np_complex_double}, {'P', sizeof(void *), VOID_P_ALIGN, nu_void_p, np_void_p}, {0} }; // }}} static const formatdef * whichtable(char **pfmt) { const char *fmt = (*pfmt)++; /* May be backed out of later */ switch (*fmt) { default: --*pfmt; /* Back out of pointer increment */ /* Fall through */ case '@': return native_table; } } /* Get the table entry for a format code */ static const formatdef * getentry(int c, const formatdef *f) { for (; f->format != '\0'; f++) { if (f->format == c) { return f; } } PyErr_SetString(StructError, "bad char in struct format"); return NULL; } /* Align a size according to a format code. Return -1 on overflow. */ static Py_ssize_t align(Py_ssize_t size, char c, const formatdef *e) { Py_ssize_t extra; if (e->format == c) { if (e->alignment && size > 0) { extra = (e->alignment - 1) - (size - 1) % (e->alignment); if (extra > PY_SSIZE_T_MAX - size) return -1; size += extra; } } return size; } /* calculate the size of a format string */ static int prepare_s(PyStructObject *self) { const formatdef *f; const formatdef *e; formatcode *codes; const char *s; const char *fmt; char c; Py_ssize_t size, len, num, itemsize; fmt = PyBytes_AS_STRING(self->s_format); f = whichtable((char **)&fmt); s = fmt; size = 0; len = 0; while ((c = *s++) != '\0') { if (isspace(Py_CHARMASK(c))) continue; if ('0' <= c && c <= '9') { num = c - '0'; while ('0' <= (c = *s++) && c <= '9') { /* overflow-safe version of if (num*10 + (c - '0') > PY_SSIZE_T_MAX) { ... } */ if (num >= PY_SSIZE_T_MAX / 10 && ( num > PY_SSIZE_T_MAX / 10 || (c - '0') > PY_SSIZE_T_MAX % 10)) goto overflow; num = num*10 + (c - '0'); } if (c == '\0') { PyErr_SetString(StructError, "repeat count given without format specifier"); return -1; } } else num = 1; e = getentry(c, f); if (e == NULL) return -1; switch (c) { case 's': /* fall through */ case 'p': len++; break; case 'x': break; default: len += num; break; } itemsize = e->size; size = align(size, c, e); if (size == -1) goto overflow; /* if (size + num * itemsize > PY_SSIZE_T_MAX) { ... } */ if (num > (PY_SSIZE_T_MAX - size) / itemsize) goto overflow; size += num * itemsize; } /* check for overflow */ if ((len + 1) > (PY_SSIZE_T_MAX / sizeof(formatcode))) { PyErr_NoMemory(); return -1; } self->s_size = size; self->s_len = len; codes = (formatcode *) PyMem_MALLOC((len + 1) * sizeof(formatcode)); if (codes == NULL) { PyErr_NoMemory(); return -1; } /* Free any s_codes value left over from a previous initialization. 
*/ if (self->s_codes != NULL) PyMem_FREE(self->s_codes); self->s_codes = codes; s = fmt; size = 0; while ((c = *s++) != '\0') { if (isspace(Py_CHARMASK(c))) continue; if ('0' <= c && c <= '9') { num = c - '0'; while ('0' <= (c = *s++) && c <= '9') num = num*10 + (c - '0'); if (c == '\0') break; } else num = 1; e = getentry(c, f); size = align(size, c, e); if (c == 's' || c == 'p') { codes->offset = size; codes->size = num; codes->fmtdef = e; codes++; size += num; } else if (c == 'x') { size += num; } else { while (--num >= 0) { codes->offset = size; codes->size = e->size; codes->fmtdef = e; codes++; size += e->size; } } } codes->fmtdef = NULL; codes->offset = size; codes->size = 0; return 0; overflow: PyErr_SetString(StructError, "total struct size too long"); return -1; } static PyObject * s_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { PyObject *self; assert(type != NULL && type->tp_alloc != NULL); self = type->tp_alloc(type, 0); if (self != NULL) { PyStructObject *s = (PyStructObject*)self; Py_INCREF(Py_None); s->s_format = Py_None; s->s_codes = NULL; s->s_size = -1; s->s_len = -1; } return self; } static int s_init(PyObject *self, PyObject *args, PyObject *kwds) { PyStructObject *soself = (PyStructObject *)self; PyObject *o_format = NULL; int ret = 0; static char *kwlist[] = {"format", 0}; assert(PyStruct_Check(self)); if (!PyArg_ParseTupleAndKeywords(args, kwds, "O:Struct", kwlist, &o_format)) return -1; if (PyUnicode_Check(o_format)) { o_format = PyUnicode_AsASCIIString(o_format); if (o_format == NULL) return -1; } /* XXX support buffer interface, too */ else { Py_INCREF(o_format); } if (!PyBytes_Check(o_format)) { Py_DECREF(o_format); PyErr_Format(PyExc_TypeError, "Struct() argument 1 must be a bytes object, not %.200s", Py_TYPE(o_format)->tp_name); return -1; } Py_CLEAR(soself->s_format); soself->s_format = o_format; ret = prepare_s(soself); return ret; } static void s_dealloc(PyStructObject *s) { if (s->weakreflist != NULL) PyObject_ClearWeakRefs((PyObject *)s); if (s->s_codes != NULL) { PyMem_FREE(s->s_codes); } Py_XDECREF(s->s_format); Py_TYPE(s)->tp_free((PyObject *)s); } static PyObject * s_unpack_internal(PyStructObject *soself, char *startfrom) { formatcode *code; Py_ssize_t i = 0; PyObject *result = PyTuple_New(soself->s_len); if (result == NULL) return NULL; for (code = soself->s_codes; code->fmtdef != NULL; code++) { PyObject *v; const formatdef *e = code->fmtdef; const char *res = startfrom + code->offset; if (e->format == 's') { v = PyBytes_FromStringAndSize(res, code->size); } else if (e->format == 'p') { Py_ssize_t n = *(unsigned char*)res; if (n >= code->size) n = code->size - 1; v = PyBytes_FromStringAndSize(res + 1, n); } else { v = e->unpack(res, e); } if (v == NULL) goto fail; PyTuple_SET_ITEM(result, i++, v); } return result; fail: Py_DECREF(result); return NULL; } PyDoc_STRVAR(s_unpack__doc__, "S.unpack(buffer) -> (v1, v2, ...)\n\ \n\ Return a tuple containing values unpacked according to the format\n\ string S.format. Requires len(buffer) == S.size. 
See help(struct)\n\ for more on format strings."); static PyObject * s_unpack(PyObject *self, PyObject *input) { Py_buffer vbuf; PyObject *result; PyStructObject *soself = (PyStructObject *)self; assert(PyStruct_Check(self)); assert(soself->s_codes != NULL); if (PyObject_GetBuffer(input, &vbuf, PyBUF_SIMPLE) < 0) return NULL; if (vbuf.len != soself->s_size) { PyErr_Format(StructError, "unpack requires a bytes object of length %zd", soself->s_size); PyBuffer_Release(&vbuf); return NULL; } result = s_unpack_internal(soself, (char *) vbuf.buf); PyBuffer_Release(&vbuf); return result; } PyDoc_STRVAR(s_unpack_from__doc__, "S.unpack_from(buffer, offset=0) -> (v1, v2, ...)\n\ \n\ Return a tuple containing values unpacked according to the format\n\ string S.format. Requires len(buffer[offset:]) >= S.size. See\n\ help(struct) for more on format strings."); static PyObject * s_unpack_from(PyObject *self, PyObject *args, PyObject *kwds) { static char *kwlist[] = {"buffer", "offset", 0}; PyObject *input; Py_ssize_t offset = 0; Py_buffer vbuf; PyObject *result; PyStructObject *soself = (PyStructObject *)self; assert(PyStruct_Check(self)); assert(soself->s_codes != NULL); if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|n:unpack_from", kwlist, &input, &offset)) return NULL; if (PyObject_GetBuffer(input, &vbuf, PyBUF_SIMPLE) < 0) return NULL; if (offset < 0) offset += vbuf.len; if (offset < 0 || vbuf.len - offset < soself->s_size) { PyErr_Format(StructError, "unpack_from requires a buffer of at least %zd bytes", soself->s_size); PyBuffer_Release(&vbuf); return NULL; } result = s_unpack_internal(soself, (char*)vbuf.buf + offset); PyBuffer_Release(&vbuf); return result; } /* * Guts of the pack function. * * Takes a struct object, a tuple of arguments, and offset in that tuple of * argument for where to start processing the arguments for packing, and a * character buffer for writing the packed string. The caller must insure * that the buffer may contain the required length for packing the arguments. * 0 is returned on success, 1 is returned if there is an error. * */ static int s_pack_internal(PyStructObject *soself, PyObject *args, int offset, char* buf) { formatcode *code; /* XXX(nnorwitz): why does i need to be a local? can we use the offset parameter or do we need the wider width? 
*/ Py_ssize_t i; memset(buf, '\0', soself->s_size); i = offset; for (code = soself->s_codes; code->fmtdef != NULL; code++) { Py_ssize_t n; PyObject *v = PyTuple_GET_ITEM(args, i++); const formatdef *e = code->fmtdef; char *res = buf + code->offset; if (e->format == 's') { int isstring; void *p; if (PyBytes_Check(v)) { n = PyBytes_GET_SIZE(v); p = PyBytes_AS_STRING(v); if (n > code->size) n = code->size; if (n > 0) memcpy(res, p, n); } else if (PyByteArray_Check(v)) { n = PyByteArray_GET_SIZE(v); p = PyByteArray_AS_STRING(v); if (n > code->size) n = code->size; if (n > 0) memcpy(res, p, n); } else if (PyObject_CheckBuffer(v)) { Py_buffer view; int gb_result = PyObject_GetBuffer(v, &view, PyBUF_SIMPLE); if (gb_result == -1) return gb_result; n = view.len; if (n > code->size) n = code->size; if (n > 0) memcpy(res, view.buf, n); PyBuffer_Release(&view); } else { PyErr_SetString(StructError, "argument for 's' must be a bytes object"); return -1; } } else if (e->format == 'p') { int isstring; void *p; isstring = PyBytes_Check(v); if (!isstring && !PyByteArray_Check(v)) { PyErr_SetString(StructError, "argument for 'p' must be a bytes object"); return -1; } if (isstring) { n = PyBytes_GET_SIZE(v); p = PyBytes_AS_STRING(v); } else { n = PyByteArray_GET_SIZE(v); p = PyByteArray_AS_STRING(v); } if (n > (code->size - 1)) n = code->size - 1; if (n > 0) memcpy(res + 1, p, n); if (n > 255) n = 255; *res = Py_SAFE_DOWNCAST(n, Py_ssize_t, unsigned char); } else { if (e->pack(res, v, e) < 0) { if (PyLong_Check(v) && PyErr_ExceptionMatches(PyExc_OverflowError)) PyErr_SetString(StructError, "long too large to convert to int"); return -1; } } } /* Success */ return 0; } PyDoc_STRVAR(s_pack__doc__, "S.pack(v1, v2, ...) -> bytes\n\ \n\ Return a bytes object containing values v1, v2, ... packed according\n\ to the format string S.format. See help(struct) for more on format\n\ strings."); static PyObject * s_pack(PyObject *self, PyObject *args) { PyStructObject *soself; PyObject *result; /* Validate arguments. */ soself = (PyStructObject *)self; assert(PyStruct_Check(self)); assert(soself->s_codes != NULL); if (PyTuple_GET_SIZE(args) != soself->s_len) { PyErr_Format(StructError, "pack requires exactly %zd arguments", soself->s_len); return NULL; } /* Allocate a new string */ result = PyBytes_FromStringAndSize((char *)NULL, soself->s_size); if (result == NULL) return NULL; /* Call the guts */ if ( s_pack_internal(soself, args, 0, PyBytes_AS_STRING(result)) != 0 ) { Py_DECREF(result); return NULL; } return result; } PyDoc_STRVAR(s_pack_into__doc__, "S.pack_into(buffer, offset, v1, v2, ...)\n\ \n\ Pack the values v1, v2, ... according to the format string S.format\n\ and write the packed bytes into the writable buffer buf starting at\n\ offset. Note that the offset is a required argument. See\n\ help(struct) for more on format strings."); static PyObject * s_pack_into(PyObject *self, PyObject *args) { PyStructObject *soself; char *buffer; Py_ssize_t buffer_len, offset; /* Validate arguments. +1 is for the first arg as buffer. 
*/ soself = (PyStructObject *)self; assert(PyStruct_Check(self)); assert(soself->s_codes != NULL); if (PyTuple_GET_SIZE(args) != (soself->s_len + 2)) { PyErr_Format(StructError, "pack_into requires exactly %zd arguments", (soself->s_len + 2)); return NULL; } /* Extract a writable memory buffer from the first argument */ if ( PyObject_AsWriteBuffer(PyTuple_GET_ITEM(args, 0), (void**)&buffer, &buffer_len) == -1 ) { return NULL; } assert( buffer_len >= 0 ); /* Extract the offset from the first argument */ offset = PyNumber_AsSsize_t(PyTuple_GET_ITEM(args, 1), PyExc_IndexError); if (offset == -1 && PyErr_Occurred()) return NULL; /* Support negative offsets. */ if (offset < 0) offset += buffer_len; /* Check boundaries */ if (offset < 0 || (buffer_len - offset) < soself->s_size) { PyErr_Format(StructError, "pack_into requires a buffer of at least %zd bytes", soself->s_size); return NULL; } /* Call the guts */ if ( s_pack_internal(soself, args, 2, buffer + offset) != 0 ) { return NULL; } Py_RETURN_NONE; } static PyObject * s_get_format(PyStructObject *self, void *unused) { Py_INCREF(self->s_format); return self->s_format; } static PyObject * s_get_size(PyStructObject *self, void *unused) { return PyLong_FromSsize_t(self->s_size); } /* List of functions */ static struct PyMethodDef s_methods[] = { {"pack", s_pack, METH_VARARGS, s_pack__doc__}, {"pack_into", s_pack_into, METH_VARARGS, s_pack_into__doc__}, {"unpack", s_unpack, METH_O, s_unpack__doc__}, {"unpack_from", (PyCFunction)s_unpack_from, METH_VARARGS|METH_KEYWORDS, s_unpack_from__doc__}, {NULL, NULL} /* sentinel */ }; PyDoc_STRVAR(s__doc__, "Struct(fmt) --> compiled struct object\n" "\n" "Return a new Struct object which writes and reads binary data according to\n" "the format string fmt. See help(struct) for more on format strings."); #define OFF(x) offsetof(PyStructObject, x) static PyGetSetDef s_getsetlist[] = { {"format", (getter)s_get_format, (setter)NULL, "struct format string", NULL}, {"size", (getter)s_get_size, (setter)NULL, "struct size in bytes", NULL}, {NULL} /* sentinel */ }; namespace { PyTypeObject PyStructType = { PyVarObject_HEAD_INIT(NULL, 0) "Struct", sizeof(PyStructObject), 0, (destructor)s_dealloc, /* tp_dealloc */ 0, /* tp_print */ 0, /* tp_getattr */ 0, /* tp_setattr */ 0, /* tp_reserved */ 0, /* tp_repr */ 0, /* tp_as_number */ 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ 0, /* tp_hash */ 0, /* tp_call */ 0, /* tp_str */ PyObject_GenericGetAttr, /* tp_getattro */ PyObject_GenericSetAttr, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ s__doc__, /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ offsetof(PyStructObject, weakreflist), /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ s_methods, /* tp_methods */ NULL, /* tp_members */ s_getsetlist, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ s_init, /* tp_init */ PyType_GenericAlloc,/* tp_alloc */ s_new, /* tp_new */ PyObject_Del, /* tp_free */ }; } /* ---- Standalone functions ---- */ #define MAXCACHE 100 static PyObject *cache = NULL; static PyObject * cache_struct(PyObject *fmt) { PyObject * s_object; if (cache == NULL) { cache = PyDict_New(); if (cache == NULL) return NULL; } s_object = PyDict_GetItem(cache, fmt); if (s_object != NULL) { Py_INCREF(s_object); return s_object; } s_object = PyObject_CallFunctionObjArgs((PyObject *)(&PyStructType), fmt, NULL); if (s_object != NULL) { if (PyDict_Size(cache) >= 
MAXCACHE) PyDict_Clear(cache); /* Attempt to cache the result */ if (PyDict_SetItem(cache, fmt, s_object) == -1) PyErr_Clear(); } return s_object; } PyDoc_STRVAR(clearcache_doc, "Clear the internal cache."); static PyObject * clearcache(PyObject *self) { Py_CLEAR(cache); Py_RETURN_NONE; } PyDoc_STRVAR(calcsize_doc, "calcsize(fmt) -> integer\n\ \n\ Return size in bytes of the struct described by the format string fmt."); static PyObject * calcsize(PyObject *self, PyObject *fmt) { Py_ssize_t n; PyObject *s_object = cache_struct(fmt); if (s_object == NULL) return NULL; n = ((PyStructObject *)s_object)->s_size; Py_DECREF(s_object); return PyLong_FromSsize_t(n); } PyDoc_STRVAR(pack_doc, "pack(fmt, v1, v2, ...) -> bytes\n\ \n\ Return a bytes object containing the values v1, v2, ... packed according\n\ to the format string fmt. See help(struct) for more on format strings."); static PyObject * pack(PyObject *self, PyObject *args) { PyObject *s_object, *fmt, *newargs, *result; Py_ssize_t n = PyTuple_GET_SIZE(args); if (n == 0) { PyErr_SetString(PyExc_TypeError, "missing format argument"); return NULL; } fmt = PyTuple_GET_ITEM(args, 0); newargs = PyTuple_GetSlice(args, 1, n); if (newargs == NULL) return NULL; s_object = cache_struct(fmt); if (s_object == NULL) { Py_DECREF(newargs); return NULL; } result = s_pack(s_object, newargs); Py_DECREF(newargs); Py_DECREF(s_object); return result; } PyDoc_STRVAR(pack_into_doc, "pack_into(fmt, buffer, offset, v1, v2, ...)\n\ \n\ Pack the values v1, v2, ... according to the format string fmt and write\n\ the packed bytes into the writable buffer buf starting at offset. Note\n\ that the offset is a required argument. See help(struct) for more\n\ on format strings."); static PyObject * pack_into(PyObject *self, PyObject *args) { PyObject *s_object, *fmt, *newargs, *result; Py_ssize_t n = PyTuple_GET_SIZE(args); if (n == 0) { PyErr_SetString(PyExc_TypeError, "missing format argument"); return NULL; } fmt = PyTuple_GET_ITEM(args, 0); newargs = PyTuple_GetSlice(args, 1, n); if (newargs == NULL) return NULL; s_object = cache_struct(fmt); if (s_object == NULL) { Py_DECREF(newargs); return NULL; } result = s_pack_into(s_object, newargs); Py_DECREF(newargs); Py_DECREF(s_object); return result; } PyDoc_STRVAR(unpack_doc, "unpack(fmt, buffer) -> (v1, v2, ...)\n\ \n\ Return a tuple containing values unpacked according to the format string\n\ fmt. Requires len(buffer) == calcsize(fmt). See help(struct) for more\n\ on format strings."); static PyObject * unpack(PyObject *self, PyObject *args) { PyObject *s_object, *fmt, *inputstr, *result; if (!PyArg_UnpackTuple(args, "unpack", 2, 2, &fmt, &inputstr)) return NULL; s_object = cache_struct(fmt); if (s_object == NULL) return NULL; result = s_unpack(s_object, inputstr); Py_DECREF(s_object); return result; } PyDoc_STRVAR(unpack_from_doc, "unpack_from(fmt, buffer, offset=0) -> (v1, v2, ...)\n\ \n\ Return a tuple containing values unpacked according to the format string\n\ fmt. Requires len(buffer[offset:]) >= calcsize(fmt). 
See help(struct)\n\ for more on format strings."); static PyObject * unpack_from(PyObject *self, PyObject *args, PyObject *kwds) { PyObject *s_object, *fmt, *newargs, *result; Py_ssize_t n = PyTuple_GET_SIZE(args); if (n == 0) { PyErr_SetString(PyExc_TypeError, "missing format argument"); return NULL; } fmt = PyTuple_GET_ITEM(args, 0); newargs = PyTuple_GetSlice(args, 1, n); if (newargs == NULL) return NULL; s_object = cache_struct(fmt); if (s_object == NULL) { Py_DECREF(newargs); return NULL; } result = s_unpack_from(s_object, newargs, kwds); Py_DECREF(newargs); Py_DECREF(s_object); return result; } static struct PyMethodDef module_functions[] = { {"_clearcache", (PyCFunction)clearcache, METH_NOARGS, clearcache_doc}, {"calcsize", calcsize, METH_O, calcsize_doc}, {"pack", pack, METH_VARARGS, pack_doc}, {"pack_into", pack_into, METH_VARARGS, pack_into_doc}, {"unpack", unpack, METH_VARARGS, unpack_doc}, {"unpack_from", (PyCFunction)unpack_from, METH_VARARGS|METH_KEYWORDS, unpack_from_doc}, {NULL, NULL} /* sentinel */ }; /* Module initialization */ PyDoc_STRVAR(module_doc, "Functions to convert between Python values and C structs.\n\ Python bytes objects are used to hold the data representing the C struct\n\ and also as format strings (explained below) to describe the layout of data\n\ in the C struct.\n\ \n\ The optional first format char indicates byte order, size and alignment:\n\ @: native order, size & alignment (default)\n\ =: native order, std. size & alignment\n\ <: little-endian, std. size & alignment\n\ >: big-endian, std. size & alignment\n\ !: same as >\n\ \n\ The remaining chars indicate types of args and must match exactly;\n\ these can be preceded by a decimal repeat count:\n\ x: pad byte (no data); c:char; b:signed byte; B:unsigned byte;\n\ ?: _Bool (requires C99; if not available, char is used instead)\n\ h:short; H:unsigned short; i:int; I:unsigned int;\n\ l:long; L:unsigned long; f:float; d:double.\n\ Special cases (preceding decimal count indicates length):\n\ s:string (array of char); p: pascal string (with count byte).\n\ Special cases (only available in native format):\n\ n:ssize_t; N:size_t;\n\ P:an integer type that is wide enough to hold a pointer.\n\ Special case (not in native mode unless 'long long' in platform C):\n\ q:long long; Q:unsigned long long\n\ Whitespace between formats is ignored.\n\ \n\ The variable struct.error is an exception raised on errors.\n"); static struct PyModuleDef _structmodule = { PyModuleDef_HEAD_INIT, "_struct", module_doc, -1, module_functions, NULL, NULL, NULL, NULL }; extern "C" PyMODINIT_FUNC PyInit__pvt_struct(void) { PyObject *m; m = PyModule_Create(&_structmodule); if (m == NULL) return NULL; Py_TYPE(&PyStructType) = &PyType_Type; if (PyType_Ready(&PyStructType) < 0) return NULL; /* Add some symbolic constants to the module */ if (StructError == NULL) { StructError = PyErr_NewException("struct.error", NULL, NULL); if (StructError == NULL) return NULL; } Py_INCREF(StructError); PyModule_AddObject(m, "error", StructError); Py_INCREF((PyObject*)&PyStructType); PyModule_AddObject(m, "Struct", (PyObject*)&PyStructType); return m; } // vim: fdm=marker pyopencl-2013.2/src/wrapper/wrap_cl_part_1.cpp0000644000175000000500000002260512245716340020023 0ustar tomussrc#include "wrap_cl.hpp" using namespace pyopencl; void pyopencl_expose_part_1() { py::docstring_options doc_op; doc_op.disable_cpp_signatures(); py::def("get_cl_header_version", get_cl_header_version); // {{{ platform DEF_SIMPLE_FUNCTION(get_platforms); { typedef platform cls; 
py::class_("Platform", py::no_init) .DEF_SIMPLE_METHOD(get_info) .def("get_devices", &cls::get_devices, py::arg("device_type")=CL_DEVICE_TYPE_ALL) .def(py::self == py::self) .def(py::self != py::self) .def("__hash__", &cls::hash) PYOPENCL_EXPOSE_TO_FROM_INT_PTR(cl_platform_id) ; } // }}} // {{{ device { typedef device cls; py::class_("Device", py::no_init) .DEF_SIMPLE_METHOD(get_info) .def(py::self == py::self) .def(py::self != py::self) .def("__hash__", &cls::hash) #if defined(cl_ext_device_fission) && defined(PYOPENCL_USE_DEVICE_FISSION) .DEF_SIMPLE_METHOD(create_sub_devices_ext) #endif #if PYOPENCL_CL_VERSION >= 0x1020 .DEF_SIMPLE_METHOD(create_sub_devices) #endif PYOPENCL_EXPOSE_TO_FROM_INT_PTR(cl_device_id) ; } // }}} // {{{ context { typedef context cls; py::class_ >("Context", py::no_init) .def("__init__", make_constructor(create_context, py::default_call_policies(), (py::arg("devices")=py::object(), py::arg("properties")=py::object(), py::arg("dev_type")=py::object() ))) .DEF_SIMPLE_METHOD(get_info) .def(py::self == py::self) .def(py::self != py::self) .def("__hash__", &cls::hash) PYOPENCL_EXPOSE_TO_FROM_INT_PTR(cl_context) ; } // }}} // {{{ command queue { typedef command_queue cls; py::class_("CommandQueue", py::init ((py::arg("context"), py::arg("device")=py::object(), py::arg("properties")=0))) .DEF_SIMPLE_METHOD(get_info) #if PYOPENCL_CL_VERSION < 0x1010 .DEF_SIMPLE_METHOD(set_property) #endif .DEF_SIMPLE_METHOD(flush) .DEF_SIMPLE_METHOD(finish) .def(py::self == py::self) .def(py::self != py::self) .def("__hash__", &cls::hash) PYOPENCL_EXPOSE_TO_FROM_INT_PTR(cl_command_queue) ; } // }}} // {{{ events/synchronization { typedef event cls; py::class_("Event", py::no_init) .DEF_SIMPLE_METHOD(get_info) .DEF_SIMPLE_METHOD(get_profiling_info) .DEF_SIMPLE_METHOD(wait) .def(py::self == py::self) .def(py::self != py::self) .def("__hash__", &cls::hash) PYOPENCL_EXPOSE_TO_FROM_INT_PTR(cl_event) // deprecated, remove in 2015.x. 
.def("from_cl_event_as_int", from_int_ptr, py::return_value_policy()) .staticmethod("from_cl_event_as_int") ; } { typedef nanny_event cls; py::class_ >("NannyEvent", py::no_init) .DEF_SIMPLE_METHOD(get_ward) ; } DEF_SIMPLE_FUNCTION(wait_for_events); #if PYOPENCL_CL_VERSION >= 0x1020 py::def("_enqueue_marker_with_wait_list", enqueue_marker_with_wait_list, (py::arg("queue"), py::arg("wait_for")=py::object()), py::return_value_policy()); #endif py::def("_enqueue_marker", enqueue_marker, (py::arg("queue")), py::return_value_policy()); py::def("_enqueue_wait_for_events", enqueue_wait_for_events, (py::arg("queue"), py::arg("wait_for")=py::object())); #if PYOPENCL_CL_VERSION >= 0x1020 py::def("_enqueue_barrier_with_wait_list", enqueue_barrier_with_wait_list, (py::arg("queue"), py::arg("wait_for")=py::object()), py::return_value_policy()); #endif py::def("_enqueue_barrier", enqueue_barrier, py::arg("queue")); #if PYOPENCL_CL_VERSION >= 0x1010 { typedef user_event cls; py::class_, boost::noncopyable>("UserEvent", py::no_init) .def("__init__", make_constructor( create_user_event, py::default_call_policies(), py::args("context"))) .DEF_SIMPLE_METHOD(set_status) ; } #endif // }}} // {{{ memory_object { typedef memory_object_holder cls; py::class_( "MemoryObjectHolder", py::no_init) .DEF_SIMPLE_METHOD(get_info) .def("get_host_array", get_mem_obj_host_array, (py::arg("shape"), py::arg("dtype"), py::arg("order")="C")) .def(py::self == py::self) .def(py::self != py::self) .def("__hash__", &cls::hash) .add_property("int_ptr", to_int_ptr, "Return an integer corresponding to the pointer value " "of the underlying :c:type:`cl_mem`. " "Use :meth:`from_int_ptr` to turn back into a Python object." "\n\n.. versionadded:: 2013.2\n") ; } { typedef memory_object cls; py::class_ >( "MemoryObject", py::no_init) .DEF_SIMPLE_METHOD(release) .add_property("hostbuf", &cls::hostbuf) .def("from_int_ptr", memory_object_from_int, "(static method) Return a new Python object referencing the C-level " \ ":c:type:`cl_mem` object at the location pointed to " \ "by *int_ptr_value*. The relevant :c:func:`clRetain*` function " \ "will be called." \ "\n\n.. 
versionadded:: 2013.2\n") \ .staticmethod("from_int_ptr") // deprecated, remove in 2015.x .def("from_cl_mem_as_int", memory_object_from_int) .staticmethod("from_cl_mem_as_int") ; } #if PYOPENCL_CL_VERSION >= 0x1020 py::def("enqueue_migrate_mem_objects", enqueue_migrate_mem_objects, (py::args("queue", "mem_objects"), py::arg("flags")=0, py::arg("wait_for")=py::object() ), py::return_value_policy()); #endif #ifdef cl_ext_migrate_memobject py::def("enqueue_migrate_mem_object_ext", enqueue_migrate_mem_object_ext, (py::args("queue", "mem_objects"), py::arg("flags")=0, py::arg("wait_for")=py::object() ), py::return_value_policy()); #endif // }}} // {{{ buffer { typedef buffer cls; py::class_, boost::noncopyable>( "Buffer", py::no_init) .def("__init__", make_constructor(create_buffer_py, py::default_call_policies(), (py::args("context", "flags"), py::arg("size")=0, py::arg("hostbuf")=py::object() ))) #if PYOPENCL_CL_VERSION >= 0x1010 .def("get_sub_region", &cls::get_sub_region, (py::args("origin", "size"), py::arg("flags")=0), py::return_value_policy()) .def("__getitem__", &cls::getitem, py::return_value_policy()) #endif ; } // }}} // {{{ transfers // {{{ byte-for-byte py::def("_enqueue_read_buffer", enqueue_read_buffer, (py::args("queue", "mem", "hostbuf"), py::arg("device_offset")=0, py::arg("wait_for")=py::object(), py::arg("is_blocking")=true ), py::return_value_policy()); py::def("_enqueue_write_buffer", enqueue_write_buffer, (py::args("queue", "mem", "hostbuf"), py::arg("device_offset")=0, py::arg("wait_for")=py::object(), py::arg("is_blocking")=true ), py::return_value_policy()); py::def("_enqueue_copy_buffer", enqueue_copy_buffer, (py::args("queue", "src", "dst"), py::arg("byte_count")=-1, py::arg("src_offset")=0, py::arg("dst_offset")=0, py::arg("wait_for")=py::object() ), py::return_value_policy()); // }}} // {{{ rectangular #if PYOPENCL_CL_VERSION >= 0x1010 py::def("_enqueue_read_buffer_rect", enqueue_read_buffer_rect, (py::args("queue", "mem", "hostbuf", "buffer_origin", "host_origin", "region"), py::arg("buffer_pitches")=py::object(), py::arg("host_pitches")=py::object(), py::arg("wait_for")=py::object(), py::arg("is_blocking")=true ), py::return_value_policy()); py::def("_enqueue_write_buffer_rect", enqueue_write_buffer_rect, (py::args("queue", "mem", "hostbuf", "buffer_origin", "host_origin", "region"), py::arg("buffer_pitches")=py::object(), py::arg("host_pitches")=py::object(), py::arg("wait_for")=py::object(), py::arg("is_blocking")=true ), py::return_value_policy()); py::def("_enqueue_copy_buffer_rect", enqueue_copy_buffer_rect, (py::args("queue", "src", "dst", "src_origin", "dst_origin", "region"), py::arg("src_pitches")=py::object(), py::arg("dst_pitches")=py::object(), py::arg("wait_for")=py::object() ), py::return_value_policy()); #endif // }}} // }}} #if PYOPENCL_CL_VERSION >= 0x1020 py::def("_enqueue_fill_buffer", enqueue_fill_buffer, (py::args("queue", "mem", "pattern", "offset", "size"), py::arg("wait_for")=py::object()), py::return_value_policy()); #endif } // vim: foldmethod=marker pyopencl-2013.2/src/wrapper/wrap_mempool.cpp0000644000175000000500000001572412245716340017633 0ustar tomussrc// Gregor Thalhammer (on Apr 13, 2011) said it's necessary to import Python.h // first to prevent OS X from overriding a bunch of macros. (e.g. 
isspace) #include #include #include "wrap_helpers.hpp" #include "wrap_cl.hpp" #include "mempool.hpp" #include "tools.hpp" #include namespace py = boost::python; namespace { class cl_allocator_base { protected: boost::shared_ptr m_context; cl_mem_flags m_flags; public: cl_allocator_base(boost::shared_ptr const &ctx, cl_mem_flags flags=CL_MEM_READ_WRITE) : m_context(ctx), m_flags(flags) { if (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)) throw pyopencl::error("Allocator", CL_INVALID_VALUE, "cannot specify USE_HOST_PTR or COPY_HOST_PTR flags"); } cl_allocator_base(cl_allocator_base const &src) : m_context(src.m_context), m_flags(src.m_flags) { } virtual ~cl_allocator_base() { } typedef cl_mem pointer_type; typedef size_t size_type; virtual cl_allocator_base *copy() const = 0; virtual bool is_deferred() const = 0; virtual pointer_type allocate(size_type s) = 0; void free(pointer_type p) { PYOPENCL_CALL_GUARDED(clReleaseMemObject, (p)); } void try_release_blocks() { pyopencl::run_python_gc(); } }; class cl_deferred_allocator : public cl_allocator_base { private: typedef cl_allocator_base super; public: cl_deferred_allocator(boost::shared_ptr const &ctx, cl_mem_flags flags=CL_MEM_READ_WRITE) : super(ctx, flags) { } cl_allocator_base *copy() const { return new cl_deferred_allocator(*this); } bool is_deferred() const { return true; } pointer_type allocate(size_type s) { return pyopencl::create_buffer(m_context->data(), m_flags, s, 0); } }; const unsigned zero = 0; class cl_immediate_allocator : public cl_allocator_base { private: typedef cl_allocator_base super; pyopencl::command_queue m_queue; public: cl_immediate_allocator(pyopencl::command_queue &queue, cl_mem_flags flags=CL_MEM_READ_WRITE) : super(boost::shared_ptr(queue.get_context()), flags), m_queue(queue.data(), /*retain*/ true) { } cl_immediate_allocator(cl_immediate_allocator const &src) : super(src), m_queue(src.m_queue) { } cl_allocator_base *copy() const { return new cl_immediate_allocator(*this); } bool is_deferred() const { return false; } pointer_type allocate(size_type s) { pointer_type ptr = pyopencl::create_buffer( m_context->data(), m_flags, s, 0); // Make sure the buffer gets allocated right here and right now. // This looks (and is) expensive. But immediate allocators // have their main use in memory pools, whose basic assumption // is that allocation is too expensive anyway--but they rely // on exact 'out-of-memory' information. unsigned zero = 0; PYOPENCL_CALL_GUARDED(clEnqueueWriteBuffer, ( m_queue.data(), ptr, /* is blocking */ CL_FALSE, 0, std::min(s, sizeof(zero)), &zero, 0, NULL, NULL )); // No need to wait for completion here. clWaitForEvents (e.g.) // cannot return mem object allocation failures. This implies that // the buffer is faulted onto the device on enqueue. return ptr; } }; inline pyopencl::buffer *allocator_call(cl_allocator_base &alloc, size_t size) { cl_mem mem; int try_count = 0; while (try_count < 2) { try { mem = alloc.allocate(size); break; } catch (pyopencl::error &e) { if (!e.is_out_of_memory()) throw; if (++try_count == 2) throw; } alloc.try_release_blocks(); } try { return new pyopencl::buffer(mem, false); } catch (...) 
{ PYOPENCL_CALL_GUARDED(clReleaseMemObject, (mem)); throw; } } class pooled_buffer : public pyopencl::pooled_allocation >, public pyopencl::memory_object_holder { private: typedef pyopencl::pooled_allocation > super; public: pooled_buffer( boost::shared_ptr p, super::size_type s) : super(p, s) { } const super::pointer_type data() const { return ptr(); } }; pooled_buffer *device_pool_allocate( boost::shared_ptr > pool, pyopencl::memory_pool::size_type sz) { return new pooled_buffer(pool, sz); } template void expose_memory_pool(Wrapper &wrapper) { typedef typename Wrapper::wrapped_type cls; wrapper .add_property("held_blocks", &cls::held_blocks) .add_property("active_blocks", &cls::active_blocks) .DEF_SIMPLE_METHOD(bin_number) .DEF_SIMPLE_METHOD(alloc_size) .DEF_SIMPLE_METHOD(free_held) .DEF_SIMPLE_METHOD(stop_holding) .staticmethod("bin_number") .staticmethod("alloc_size") ; } } void pyopencl_expose_mempool() { py::def("bitlog2", pyopencl::bitlog2); { typedef cl_allocator_base cls; py::class_ wrapper("_tools_AllocatorBase", py::no_init); wrapper .def("__call__", allocator_call, py::return_value_policy()) ; } { typedef cl_deferred_allocator cls; py::class_ > wrapper("_tools_DeferredAllocator", py::init< boost::shared_ptr const &, py::optional >()); } { typedef cl_immediate_allocator cls; py::class_ > wrapper("_tools_ImmediateAllocator", py::init >()); } { typedef pyopencl::memory_pool cls; py::class_< cls, boost::noncopyable, boost::shared_ptr > wrapper("MemoryPool", py::init() ); wrapper .def("allocate", device_pool_allocate, py::return_value_policy()) .def("__call__", device_pool_allocate, py::return_value_policy()) // undoc for now .DEF_SIMPLE_METHOD(set_trace) ; expose_memory_pool(wrapper); } { typedef pooled_buffer cls; py::class_ >( "PooledBuffer", py::no_init) .def("release", &cls::free) ; } } pyopencl-2013.2/src/wrapper/numpy_init.hpp0000644000175000000500000000075012245716340017323 0ustar tomussrc#ifndef _FAYHVVAAA_PYOPENCL_HEADER_SEEN_NUMPY_INIT_HPP #define _FAYHVVAAA_PYOPENCL_HEADER_SEEN_NUMPY_INIT_HPP #include #include namespace { static struct pyublas_array_importer { static bool do_import_array() { import_array1(false); return true; } pyublas_array_importer() { if (!do_import_array()) throw std::runtime_error("numpy failed to initialize"); } } _array_importer; } #endif pyopencl-2013.2/src/wrapper/wrap_constants.cpp0000644000175000000500000006264012245716340020176 0ustar tomussrc#include "wrap_cl.hpp" using namespace pyopencl; namespace { py::handle<> CLError, CLMemoryError, CLLogicError, CLRuntimeError; void translate_cl_error(const error &err) { if (err.code() == CL_MEM_OBJECT_ALLOCATION_FAILURE) PyErr_SetObject(CLMemoryError.get(), py::object(err).ptr()); else if (err.code() <= CL_INVALID_VALUE) PyErr_SetObject(CLLogicError.get(), py::object(err).ptr()); else if (err.code() > CL_INVALID_VALUE && err.code() < CL_SUCCESS) PyErr_SetObject(CLRuntimeError.get(), py::object(err).ptr()); else PyErr_SetObject(CLError.get(), py::object(err).ptr()); } // {{{ 'fake' constant scopes class status_code { }; class platform_info { }; class device_type { }; class device_info { }; class device_fp_config { }; class device_mem_cache_type { }; class device_local_mem_type { }; class device_exec_capabilities { }; class command_queue_properties { }; class context_info { }; class gl_context_info { }; class context_properties { }; class command_queue_info { }; class mem_flags { }; class channel_order { }; class channel_type { }; class mem_object_type { }; class mem_info { }; class image_info { }; 
class addressing_mode { }; class filter_mode { }; class sampler_info { }; class map_flags { }; class program_info { }; class program_build_info { }; class program_binary_type { }; class build_status { }; class kernel_info { }; class kernel_arg_info { }; class kernel_arg_address_qualifier { }; class kernel_arg_access_qualifier { }; class kernel_work_group_info { }; class event_info { }; class command_type { }; class command_execution_status { }; class profiling_info { }; class buffer_create_type { }; class mem_migration_flags { }; class device_partition_property { }; class device_affinity_domain { }; class device_partition_property_ext { }; class affinity_domain_ext { }; class gl_object_type { }; class gl_texture_info { }; class migrate_mem_object_flags_ext {}; // }}} } void pyopencl_expose_constants() { // {{{ exceptions #define DECLARE_EXC(NAME, BASE) \ CL##NAME = py::handle<>(PyErr_NewException("pyopencl." #NAME, BASE, NULL)); \ py::scope().attr(#NAME) = CL##NAME; { DECLARE_EXC(Error, NULL); DECLARE_EXC(MemoryError, CLError.get()); DECLARE_EXC(LogicError, CLError.get()); DECLARE_EXC(RuntimeError, CLError.get()); py::register_exception_translator(translate_cl_error); } // }}} // {{{ constants #define ADD_ATTR(PREFIX, NAME) \ cls.attr(#NAME) = CL_##PREFIX##NAME #define ADD_ATTR_SUFFIX(PREFIX, NAME, SUFFIX) \ cls.attr(#NAME) = CL_##PREFIX##NAME##SUFFIX { typedef error cls; py::class_ ("_error", py::no_init) .DEF_SIMPLE_METHOD(routine) .DEF_SIMPLE_METHOD(code) .DEF_SIMPLE_METHOD(what) ; } { py::class_ cls("status_code", py::no_init); ADD_ATTR(, SUCCESS); ADD_ATTR(, DEVICE_NOT_FOUND); ADD_ATTR(, DEVICE_NOT_AVAILABLE); #if !(defined(CL_PLATFORM_NVIDIA) && CL_PLATFORM_NVIDIA == 0x3001) ADD_ATTR(, COMPILER_NOT_AVAILABLE); #endif ADD_ATTR(, MEM_OBJECT_ALLOCATION_FAILURE); ADD_ATTR(, OUT_OF_RESOURCES); ADD_ATTR(, OUT_OF_HOST_MEMORY); ADD_ATTR(, PROFILING_INFO_NOT_AVAILABLE); ADD_ATTR(, MEM_COPY_OVERLAP); ADD_ATTR(, IMAGE_FORMAT_MISMATCH); ADD_ATTR(, IMAGE_FORMAT_NOT_SUPPORTED); ADD_ATTR(, BUILD_PROGRAM_FAILURE); ADD_ATTR(, MAP_FAILURE); ADD_ATTR(, INVALID_VALUE); ADD_ATTR(, INVALID_DEVICE_TYPE); ADD_ATTR(, INVALID_PLATFORM); ADD_ATTR(, INVALID_DEVICE); ADD_ATTR(, INVALID_CONTEXT); ADD_ATTR(, INVALID_QUEUE_PROPERTIES); ADD_ATTR(, INVALID_COMMAND_QUEUE); ADD_ATTR(, INVALID_HOST_PTR); ADD_ATTR(, INVALID_MEM_OBJECT); ADD_ATTR(, INVALID_IMAGE_FORMAT_DESCRIPTOR); ADD_ATTR(, INVALID_IMAGE_SIZE); ADD_ATTR(, INVALID_SAMPLER); ADD_ATTR(, INVALID_BINARY); ADD_ATTR(, INVALID_BUILD_OPTIONS); ADD_ATTR(, INVALID_PROGRAM); ADD_ATTR(, INVALID_PROGRAM_EXECUTABLE); ADD_ATTR(, INVALID_KERNEL_NAME); ADD_ATTR(, INVALID_KERNEL_DEFINITION); ADD_ATTR(, INVALID_KERNEL); ADD_ATTR(, INVALID_ARG_INDEX); ADD_ATTR(, INVALID_ARG_VALUE); ADD_ATTR(, INVALID_ARG_SIZE); ADD_ATTR(, INVALID_KERNEL_ARGS); ADD_ATTR(, INVALID_WORK_DIMENSION); ADD_ATTR(, INVALID_WORK_GROUP_SIZE); ADD_ATTR(, INVALID_WORK_ITEM_SIZE); ADD_ATTR(, INVALID_GLOBAL_OFFSET); ADD_ATTR(, INVALID_EVENT_WAIT_LIST); ADD_ATTR(, INVALID_EVENT); ADD_ATTR(, INVALID_OPERATION); ADD_ATTR(, INVALID_GL_OBJECT); ADD_ATTR(, INVALID_BUFFER_SIZE); ADD_ATTR(, INVALID_MIP_LEVEL); #if defined(cl_khr_icd) && (cl_khr_icd >= 1) ADD_ATTR(, PLATFORM_NOT_FOUND_KHR); #endif #if defined(cl_khr_gl_sharing) && (cl_khr_gl_sharing >= 1) ADD_ATTR(, INVALID_GL_SHAREGROUP_REFERENCE_KHR); #endif #if PYOPENCL_CL_VERSION >= 0x1010 ADD_ATTR(, MISALIGNED_SUB_BUFFER_OFFSET); ADD_ATTR(, EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); ADD_ATTR(, INVALID_GLOBAL_WORK_SIZE); #endif #if PYOPENCL_CL_VERSION >= 
0x1020 ADD_ATTR(, COMPILE_PROGRAM_FAILURE); ADD_ATTR(, LINKER_NOT_AVAILABLE); ADD_ATTR(, LINK_PROGRAM_FAILURE); ADD_ATTR(, DEVICE_PARTITION_FAILED); ADD_ATTR(, KERNEL_ARG_INFO_NOT_AVAILABLE); ADD_ATTR(, INVALID_IMAGE_DESCRIPTOR); ADD_ATTR(, INVALID_COMPILER_OPTIONS); ADD_ATTR(, INVALID_LINKER_OPTIONS); ADD_ATTR(, INVALID_DEVICE_PARTITION_COUNT); #endif #if defined(cl_ext_device_fission) && defined(PYOPENCL_USE_DEVICE_FISSION) ADD_ATTR(, DEVICE_PARTITION_FAILED_EXT); ADD_ATTR(, INVALID_PARTITION_COUNT_EXT); ADD_ATTR(, INVALID_PARTITION_NAME_EXT); #endif } { py::class_ cls("platform_info", py::no_init); ADD_ATTR(PLATFORM_, PROFILE); ADD_ATTR(PLATFORM_, VERSION); ADD_ATTR(PLATFORM_, NAME); ADD_ATTR(PLATFORM_, VENDOR); #if !(defined(CL_PLATFORM_NVIDIA) && CL_PLATFORM_NVIDIA == 0x3001) ADD_ATTR(PLATFORM_, EXTENSIONS); #endif } { py::class_ cls("device_type", py::no_init); ADD_ATTR(DEVICE_TYPE_, DEFAULT); ADD_ATTR(DEVICE_TYPE_, CPU); ADD_ATTR(DEVICE_TYPE_, GPU); ADD_ATTR(DEVICE_TYPE_, ACCELERATOR); #if PYOPENCL_CL_VERSION >= 0x1020 ADD_ATTR(DEVICE_TYPE_, CUSTOM); #endif ADD_ATTR(DEVICE_TYPE_, ALL); } { py::class_ cls("device_info", py::no_init); ADD_ATTR(DEVICE_, TYPE); ADD_ATTR(DEVICE_, VENDOR_ID); ADD_ATTR(DEVICE_, MAX_COMPUTE_UNITS); ADD_ATTR(DEVICE_, MAX_WORK_ITEM_DIMENSIONS); ADD_ATTR(DEVICE_, MAX_WORK_GROUP_SIZE); ADD_ATTR(DEVICE_, MAX_WORK_ITEM_SIZES); ADD_ATTR(DEVICE_, PREFERRED_VECTOR_WIDTH_CHAR); ADD_ATTR(DEVICE_, PREFERRED_VECTOR_WIDTH_SHORT); ADD_ATTR(DEVICE_, PREFERRED_VECTOR_WIDTH_INT); ADD_ATTR(DEVICE_, PREFERRED_VECTOR_WIDTH_LONG); ADD_ATTR(DEVICE_, PREFERRED_VECTOR_WIDTH_FLOAT); ADD_ATTR(DEVICE_, PREFERRED_VECTOR_WIDTH_DOUBLE); ADD_ATTR(DEVICE_, MAX_CLOCK_FREQUENCY); ADD_ATTR(DEVICE_, ADDRESS_BITS); ADD_ATTR(DEVICE_, MAX_READ_IMAGE_ARGS); ADD_ATTR(DEVICE_, MAX_WRITE_IMAGE_ARGS); ADD_ATTR(DEVICE_, MAX_MEM_ALLOC_SIZE); ADD_ATTR(DEVICE_, IMAGE2D_MAX_WIDTH); ADD_ATTR(DEVICE_, IMAGE2D_MAX_HEIGHT); ADD_ATTR(DEVICE_, IMAGE3D_MAX_WIDTH); ADD_ATTR(DEVICE_, IMAGE3D_MAX_HEIGHT); ADD_ATTR(DEVICE_, IMAGE3D_MAX_DEPTH); ADD_ATTR(DEVICE_, IMAGE_SUPPORT); ADD_ATTR(DEVICE_, MAX_PARAMETER_SIZE); ADD_ATTR(DEVICE_, MAX_SAMPLERS); ADD_ATTR(DEVICE_, MEM_BASE_ADDR_ALIGN); ADD_ATTR(DEVICE_, MIN_DATA_TYPE_ALIGN_SIZE); ADD_ATTR(DEVICE_, SINGLE_FP_CONFIG); #ifdef CL_DEVICE_DOUBLE_FP_CONFIG ADD_ATTR(DEVICE_, DOUBLE_FP_CONFIG); #endif #ifdef CL_DEVICE_HALF_FP_CONFIG ADD_ATTR(DEVICE_, HALF_FP_CONFIG); #endif ADD_ATTR(DEVICE_, GLOBAL_MEM_CACHE_TYPE); ADD_ATTR(DEVICE_, GLOBAL_MEM_CACHELINE_SIZE); ADD_ATTR(DEVICE_, GLOBAL_MEM_CACHE_SIZE); ADD_ATTR(DEVICE_, GLOBAL_MEM_SIZE); ADD_ATTR(DEVICE_, MAX_CONSTANT_BUFFER_SIZE); ADD_ATTR(DEVICE_, MAX_CONSTANT_ARGS); ADD_ATTR(DEVICE_, LOCAL_MEM_TYPE); ADD_ATTR(DEVICE_, LOCAL_MEM_SIZE); ADD_ATTR(DEVICE_, ERROR_CORRECTION_SUPPORT); ADD_ATTR(DEVICE_, PROFILING_TIMER_RESOLUTION); ADD_ATTR(DEVICE_, ENDIAN_LITTLE); ADD_ATTR(DEVICE_, AVAILABLE); ADD_ATTR(DEVICE_, COMPILER_AVAILABLE); ADD_ATTR(DEVICE_, EXECUTION_CAPABILITIES); ADD_ATTR(DEVICE_, QUEUE_PROPERTIES); ADD_ATTR(DEVICE_, NAME); ADD_ATTR(DEVICE_, VENDOR); ADD_ATTR(, DRIVER_VERSION); ADD_ATTR(DEVICE_, VERSION); ADD_ATTR(DEVICE_, PROFILE); ADD_ATTR(DEVICE_, VERSION); ADD_ATTR(DEVICE_, EXTENSIONS); ADD_ATTR(DEVICE_, PLATFORM); #if PYOPENCL_CL_VERSION >= 0x1010 ADD_ATTR(DEVICE_, PREFERRED_VECTOR_WIDTH_HALF); ADD_ATTR(DEVICE_, HOST_UNIFIED_MEMORY); ADD_ATTR(DEVICE_, NATIVE_VECTOR_WIDTH_CHAR); ADD_ATTR(DEVICE_, NATIVE_VECTOR_WIDTH_SHORT); ADD_ATTR(DEVICE_, NATIVE_VECTOR_WIDTH_INT); ADD_ATTR(DEVICE_, NATIVE_VECTOR_WIDTH_LONG); 
ADD_ATTR(DEVICE_, NATIVE_VECTOR_WIDTH_FLOAT); ADD_ATTR(DEVICE_, NATIVE_VECTOR_WIDTH_DOUBLE); ADD_ATTR(DEVICE_, NATIVE_VECTOR_WIDTH_HALF); ADD_ATTR(DEVICE_, OPENCL_C_VERSION); #endif // support for cl_nv_device_attribute_query #ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV ADD_ATTR(DEVICE_, COMPUTE_CAPABILITY_MAJOR_NV); ADD_ATTR(DEVICE_, COMPUTE_CAPABILITY_MINOR_NV); ADD_ATTR(DEVICE_, REGISTERS_PER_BLOCK_NV); ADD_ATTR(DEVICE_, WARP_SIZE_NV); ADD_ATTR(DEVICE_, GPU_OVERLAP_NV); ADD_ATTR(DEVICE_, KERNEL_EXEC_TIMEOUT_NV); ADD_ATTR(DEVICE_, INTEGRATED_MEMORY_NV); #endif // {{{ cl_amd_device_attribute_query #ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD ADD_ATTR(DEVICE_, PROFILING_TIMER_OFFSET_AMD); #endif #ifdef CL_DEVICE_TOPOLOGY_AMD ADD_ATTR(DEVICE_, TOPOLOGY_AMD); #endif #ifdef CL_DEVICE_BOARD_NAME_AMD ADD_ATTR(DEVICE_, BOARD_NAME_AMD); #endif #ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD ADD_ATTR(DEVICE_, GLOBAL_FREE_MEMORY_AMD); #endif #ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD ADD_ATTR(DEVICE_, SIMD_PER_COMPUTE_UNIT_AMD); #endif #ifdef CL_DEVICE_SIMD_WIDTH_AMD ADD_ATTR(DEVICE_, SIMD_WIDTH_AMD); #endif #ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD ADD_ATTR(DEVICE_, SIMD_INSTRUCTION_WIDTH_AMD); #endif #ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD ADD_ATTR(DEVICE_, WAVEFRONT_WIDTH_AMD); #endif #ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD ADD_ATTR(DEVICE_, GLOBAL_MEM_CHANNELS_AMD); #endif #ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD ADD_ATTR(DEVICE_, GLOBAL_MEM_CHANNEL_BANKS_AMD); #endif #ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD ADD_ATTR(DEVICE_, GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD); #endif #ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD ADD_ATTR(DEVICE_, LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD); #endif #ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD ADD_ATTR(DEVICE_, LOCAL_MEM_BANKS_AMD); #endif // }}} #ifdef CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT ADD_ATTR(DEVICE_, MAX_ATOMIC_COUNTERS_EXT); #endif #if defined(cl_ext_device_fission) && defined(PYOPENCL_USE_DEVICE_FISSION) ADD_ATTR(DEVICE_, PARENT_DEVICE_EXT); ADD_ATTR(DEVICE_, PARTITION_TYPES_EXT); ADD_ATTR(DEVICE_, AFFINITY_DOMAINS_EXT); ADD_ATTR(DEVICE_, REFERENCE_COUNT_EXT); ADD_ATTR(DEVICE_, PARTITION_STYLE_EXT); #endif #if PYOPENCL_CL_VERSION >= 0x1020 ADD_ATTR(DEVICE_, LINKER_AVAILABLE); ADD_ATTR(DEVICE_, BUILT_IN_KERNELS); ADD_ATTR(DEVICE_, IMAGE_MAX_BUFFER_SIZE); ADD_ATTR(DEVICE_, IMAGE_MAX_ARRAY_SIZE); ADD_ATTR(DEVICE_, PARENT_DEVICE); ADD_ATTR(DEVICE_, PARTITION_MAX_SUB_DEVICES); ADD_ATTR(DEVICE_, PARTITION_PROPERTIES); ADD_ATTR(DEVICE_, PARTITION_AFFINITY_DOMAIN); ADD_ATTR(DEVICE_, PARTITION_TYPE); ADD_ATTR(DEVICE_, REFERENCE_COUNT); ADD_ATTR(DEVICE_, PREFERRED_INTEROP_USER_SYNC); ADD_ATTR(DEVICE_, PRINTF_BUFFER_SIZE); #endif #ifdef cl_khr_image2d_from_buffer ADD_ATTR(DEVICE_, IMAGE_PITCH_ALIGNMENT); ADD_ATTR(DEVICE_, IMAGE_BASE_ADDRESS_ALIGNMENT); #endif } { py::class_ cls("device_fp_config", py::no_init); ADD_ATTR(FP_, DENORM); ADD_ATTR(FP_, INF_NAN); ADD_ATTR(FP_, ROUND_TO_NEAREST); ADD_ATTR(FP_, ROUND_TO_ZERO); ADD_ATTR(FP_, ROUND_TO_INF); ADD_ATTR(FP_, FMA); #if PYOPENCL_CL_VERSION >= 0x1010 ADD_ATTR(FP_, SOFT_FLOAT); #endif #if PYOPENCL_CL_VERSION >= 0x1020 ADD_ATTR(FP_, CORRECTLY_ROUNDED_DIVIDE_SQRT); #endif } { py::class_ cls("device_mem_cache_type", py::no_init); ADD_ATTR( , NONE); ADD_ATTR( , READ_ONLY_CACHE); ADD_ATTR( , READ_WRITE_CACHE); } { py::class_ cls("device_local_mem_type", py::no_init); ADD_ATTR( , LOCAL); ADD_ATTR( , GLOBAL); } { py::class_ cls("device_exec_capabilities", py::no_init); ADD_ATTR(EXEC_, KERNEL); ADD_ATTR(EXEC_, NATIVE_KERNEL); 
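      // CL_DEVICE_EXECUTION_CAPABILITIES is a bitfield. As a rough sketch
      // (not part of this wrapper), a Python caller would test a capability
      // through the generated Device property, e.g.:
      //
      //   if dev.execution_capabilities & cl.device_exec_capabilities.NATIVE_KERNEL:
      //       ...  # device can run native (host-compiled) kernels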
#ifdef CL_EXEC_IMMEDIATE_EXECUTION_INTEL ADD_ATTR(EXEC_, IMMEDIATE_EXECUTION_INTEL); #endif } { py::class_ cls("command_queue_properties", py::no_init); ADD_ATTR(QUEUE_, OUT_OF_ORDER_EXEC_MODE_ENABLE); ADD_ATTR(QUEUE_, PROFILING_ENABLE); #ifdef CL_QUEUE_IMMEDIATE_EXECUTION_ENABLE_INTEL ADD_ATTR(QUEUE_, IMMEDIATE_EXECUTION_ENABLE_INTEL); #endif } { py::class_ cls("context_info", py::no_init); ADD_ATTR(CONTEXT_, REFERENCE_COUNT); ADD_ATTR(CONTEXT_, DEVICES); ADD_ATTR(CONTEXT_, PROPERTIES); #if PYOPENCL_CL_VERSION >= 0x1010 ADD_ATTR(CONTEXT_, NUM_DEVICES); #endif #if PYOPENCL_CL_VERSION >= 0x1020 ADD_ATTR(CONTEXT_, INTEROP_USER_SYNC); #endif } { py::class_ cls("gl_context_info", py::no_init); #if defined(cl_khr_gl_sharing) && (cl_khr_gl_sharing >= 1) ADD_ATTR(, CURRENT_DEVICE_FOR_GL_CONTEXT_KHR); ADD_ATTR(, DEVICES_FOR_GL_CONTEXT_KHR); #endif } { py::class_ cls("context_properties", py::no_init); ADD_ATTR(CONTEXT_, PLATFORM); #if defined(cl_khr_gl_sharing) && (cl_khr_gl_sharing >= 1) ADD_ATTR( ,GL_CONTEXT_KHR); ADD_ATTR( ,EGL_DISPLAY_KHR); ADD_ATTR( ,GLX_DISPLAY_KHR); ADD_ATTR( ,WGL_HDC_KHR); ADD_ATTR( ,CGL_SHAREGROUP_KHR); #endif #if defined(__APPLE__) && defined(HAVE_GL) ADD_ATTR( ,CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE); #endif /* __APPLE__ */ // cl_amd_offline_devices #ifdef CL_CONTEXT_OFFLINE_DEVICES_AMD ADD_ATTR(CONTEXT_, OFFLINE_DEVICES_AMD); #endif } { py::class_ cls("command_queue_info", py::no_init); ADD_ATTR(QUEUE_, CONTEXT); ADD_ATTR(QUEUE_, DEVICE); ADD_ATTR(QUEUE_, REFERENCE_COUNT); ADD_ATTR(QUEUE_, PROPERTIES); } { py::class_ cls("mem_flags", py::no_init); ADD_ATTR(MEM_, READ_WRITE); ADD_ATTR(MEM_, WRITE_ONLY); ADD_ATTR(MEM_, READ_ONLY); ADD_ATTR(MEM_, USE_HOST_PTR); ADD_ATTR(MEM_, ALLOC_HOST_PTR); ADD_ATTR(MEM_, COPY_HOST_PTR); #ifdef cl_amd_device_memory_flags ADD_ATTR(MEM_, USE_PERSISTENT_MEM_AMD); #endif #if PYOPENCL_CL_VERSION >= 0x1020 ADD_ATTR(MEM_, HOST_WRITE_ONLY); ADD_ATTR(MEM_, HOST_READ_ONLY); ADD_ATTR(MEM_, HOST_NO_ACCESS); #endif } { py::class_ cls("channel_order", py::no_init); ADD_ATTR( , R); ADD_ATTR( , A); ADD_ATTR( , RG); ADD_ATTR( , RA); ADD_ATTR( , RGB); ADD_ATTR( , RGBA); ADD_ATTR( , BGRA); ADD_ATTR( , INTENSITY); ADD_ATTR( , LUMINANCE); #if PYOPENCL_CL_VERSION >= 0x1010 ADD_ATTR( , Rx); ADD_ATTR( , RGx); ADD_ATTR( , RGBx); #endif } { py::class_ cls("channel_type", py::no_init); ADD_ATTR( , SNORM_INT8); ADD_ATTR( , SNORM_INT16); ADD_ATTR( , UNORM_INT8); ADD_ATTR( , UNORM_INT16); ADD_ATTR( , UNORM_SHORT_565); ADD_ATTR( , UNORM_SHORT_555); ADD_ATTR( , UNORM_INT_101010); ADD_ATTR( , SIGNED_INT8); ADD_ATTR( , SIGNED_INT16); ADD_ATTR( , SIGNED_INT32); ADD_ATTR( , UNSIGNED_INT8); ADD_ATTR( , UNSIGNED_INT16); ADD_ATTR( , UNSIGNED_INT32); ADD_ATTR( , HALF_FLOAT); ADD_ATTR( , FLOAT); } { py::class_ cls("mem_object_type", py::no_init); ADD_ATTR(MEM_OBJECT_, BUFFER); ADD_ATTR(MEM_OBJECT_, IMAGE2D); ADD_ATTR(MEM_OBJECT_, IMAGE3D); #if PYOPENCL_CL_VERSION >= 0x1020 ADD_ATTR(MEM_OBJECT_, IMAGE2D_ARRAY); ADD_ATTR(MEM_OBJECT_, IMAGE1D); ADD_ATTR(MEM_OBJECT_, IMAGE1D_ARRAY); ADD_ATTR(MEM_OBJECT_, IMAGE1D_BUFFER); #endif } { py::class_ cls("mem_info", py::no_init); ADD_ATTR(MEM_, TYPE); ADD_ATTR(MEM_, FLAGS); ADD_ATTR(MEM_, SIZE); ADD_ATTR(MEM_, HOST_PTR); ADD_ATTR(MEM_, MAP_COUNT); ADD_ATTR(MEM_, REFERENCE_COUNT); ADD_ATTR(MEM_, CONTEXT); #if PYOPENCL_CL_VERSION >= 0x1010 ADD_ATTR(MEM_, ASSOCIATED_MEMOBJECT); ADD_ATTR(MEM_, OFFSET); #endif } { py::class_ cls("image_info", py::no_init); ADD_ATTR(IMAGE_, FORMAT); ADD_ATTR(IMAGE_, ELEMENT_SIZE); ADD_ATTR(IMAGE_, 
ROW_PITCH); ADD_ATTR(IMAGE_, SLICE_PITCH); ADD_ATTR(IMAGE_, WIDTH); ADD_ATTR(IMAGE_, HEIGHT); ADD_ATTR(IMAGE_, DEPTH); #if PYOPENCL_CL_VERSION >= 0x1020 ADD_ATTR(IMAGE_, ARRAY_SIZE); ADD_ATTR(IMAGE_, BUFFER); ADD_ATTR(IMAGE_, NUM_MIP_LEVELS); ADD_ATTR(IMAGE_, NUM_SAMPLES); #endif } { py::class_ cls("addressing_mode", py::no_init); ADD_ATTR(ADDRESS_, NONE); ADD_ATTR(ADDRESS_, CLAMP_TO_EDGE); ADD_ATTR(ADDRESS_, CLAMP); ADD_ATTR(ADDRESS_, REPEAT); #if PYOPENCL_CL_VERSION >= 0x1010 ADD_ATTR(ADDRESS_, MIRRORED_REPEAT); #endif } { py::class_ cls("filter_mode", py::no_init); ADD_ATTR(FILTER_, NEAREST); ADD_ATTR(FILTER_, LINEAR); } { py::class_ cls("sampler_info", py::no_init); ADD_ATTR(SAMPLER_, REFERENCE_COUNT); ADD_ATTR(SAMPLER_, CONTEXT); ADD_ATTR(SAMPLER_, NORMALIZED_COORDS); ADD_ATTR(SAMPLER_, ADDRESSING_MODE); ADD_ATTR(SAMPLER_, FILTER_MODE); } { py::class_ cls("map_flags", py::no_init); ADD_ATTR(MAP_, READ); ADD_ATTR(MAP_, WRITE); #if PYOPENCL_CL_VERSION >= 0x1020 ADD_ATTR(MAP_, WRITE_INVALIDATE_REGION); #endif } { py::class_ cls("program_info", py::no_init); ADD_ATTR(PROGRAM_, REFERENCE_COUNT); ADD_ATTR(PROGRAM_, CONTEXT); ADD_ATTR(PROGRAM_, NUM_DEVICES); ADD_ATTR(PROGRAM_, DEVICES); ADD_ATTR(PROGRAM_, SOURCE); ADD_ATTR(PROGRAM_, BINARY_SIZES); ADD_ATTR(PROGRAM_, BINARIES); #if PYOPENCL_CL_VERSION >= 0x1020 ADD_ATTR(PROGRAM_, NUM_KERNELS); ADD_ATTR(PROGRAM_, KERNEL_NAMES); #endif } { py::class_ cls("program_build_info", py::no_init); ADD_ATTR(PROGRAM_BUILD_, STATUS); ADD_ATTR(PROGRAM_BUILD_, OPTIONS); ADD_ATTR(PROGRAM_BUILD_, LOG); #if PYOPENCL_CL_VERSION >= 0x1020 ADD_ATTR(PROGRAM_, BINARY_TYPE); #endif } { py::class_ cls("program_binary_type", py::no_init); #if PYOPENCL_CL_VERSION >= 0x1020 ADD_ATTR(PROGRAM_BINARY_TYPE_, NONE); ADD_ATTR(PROGRAM_BINARY_TYPE_, COMPILED_OBJECT); ADD_ATTR(PROGRAM_BINARY_TYPE_, LIBRARY); ADD_ATTR(PROGRAM_BINARY_TYPE_, EXECUTABLE); #endif } { py::class_ cls("kernel_info", py::no_init); ADD_ATTR(KERNEL_, FUNCTION_NAME); ADD_ATTR(KERNEL_, NUM_ARGS); ADD_ATTR(KERNEL_, REFERENCE_COUNT); ADD_ATTR(KERNEL_, CONTEXT); ADD_ATTR(KERNEL_, PROGRAM); #if PYOPENCL_CL_VERSION >= 0x1020 ADD_ATTR(KERNEL_, ATTRIBUTES); #endif } { py::class_ cls("kernel_arg_info", py::no_init); #if PYOPENCL_CL_VERSION >= 0x1020 ADD_ATTR(KERNEL_ARG_, ADDRESS_QUALIFIER); ADD_ATTR(KERNEL_ARG_, ACCESS_QUALIFIER); ADD_ATTR(KERNEL_ARG_, TYPE_NAME); ADD_ATTR(KERNEL_ARG_, NAME); #endif } { py::class_ cls( "kernel_arg_address_qualifier", py::no_init); #if PYOPENCL_CL_VERSION >= 0x1020 ADD_ATTR(KERNEL_ARG_ADDRESS_, GLOBAL); ADD_ATTR(KERNEL_ARG_ADDRESS_, LOCAL); ADD_ATTR(KERNEL_ARG_ADDRESS_, CONSTANT); ADD_ATTR(KERNEL_ARG_ADDRESS_, PRIVATE); #endif } { py::class_ cls( "kernel_arg_access_qualifier", py::no_init); #if PYOPENCL_CL_VERSION >= 0x1020 ADD_ATTR(KERNEL_ARG_ACCESS_, READ_ONLY); ADD_ATTR(KERNEL_ARG_ACCESS_, WRITE_ONLY); ADD_ATTR(KERNEL_ARG_ACCESS_, READ_WRITE); ADD_ATTR(KERNEL_ARG_ACCESS_, NONE); #endif } { py::class_ cls("kernel_work_group_info", py::no_init); ADD_ATTR(KERNEL_, WORK_GROUP_SIZE); ADD_ATTR(KERNEL_, COMPILE_WORK_GROUP_SIZE); ADD_ATTR(KERNEL_, LOCAL_MEM_SIZE); #if PYOPENCL_CL_VERSION >= 0x1010 ADD_ATTR(KERNEL_, PREFERRED_WORK_GROUP_SIZE_MULTIPLE); ADD_ATTR(KERNEL_, PRIVATE_MEM_SIZE); #endif #if PYOPENCL_CL_VERSION >= 0x1020 ADD_ATTR(KERNEL_, GLOBAL_WORK_SIZE); #endif } { py::class_ cls("event_info", py::no_init); ADD_ATTR(EVENT_, COMMAND_QUEUE); ADD_ATTR(EVENT_, COMMAND_TYPE); ADD_ATTR(EVENT_, REFERENCE_COUNT); ADD_ATTR(EVENT_, COMMAND_EXECUTION_STATUS); #if PYOPENCL_CL_VERSION >= 
0x1010 ADD_ATTR(EVENT_, CONTEXT); #endif } { py::class_ cls("command_type", py::no_init); ADD_ATTR(COMMAND_, NDRANGE_KERNEL); ADD_ATTR(COMMAND_, TASK); ADD_ATTR(COMMAND_, NATIVE_KERNEL); ADD_ATTR(COMMAND_, READ_BUFFER); ADD_ATTR(COMMAND_, WRITE_BUFFER); ADD_ATTR(COMMAND_, COPY_BUFFER); ADD_ATTR(COMMAND_, READ_IMAGE); ADD_ATTR(COMMAND_, WRITE_IMAGE); ADD_ATTR(COMMAND_, COPY_IMAGE); ADD_ATTR(COMMAND_, COPY_IMAGE_TO_BUFFER); ADD_ATTR(COMMAND_, COPY_BUFFER_TO_IMAGE); ADD_ATTR(COMMAND_, MAP_BUFFER); ADD_ATTR(COMMAND_, MAP_IMAGE); ADD_ATTR(COMMAND_, UNMAP_MEM_OBJECT); ADD_ATTR(COMMAND_, MARKER); ADD_ATTR(COMMAND_, ACQUIRE_GL_OBJECTS); ADD_ATTR(COMMAND_, RELEASE_GL_OBJECTS); #if PYOPENCL_CL_VERSION >= 0x1010 ADD_ATTR(COMMAND_, READ_BUFFER_RECT); ADD_ATTR(COMMAND_, WRITE_BUFFER_RECT); ADD_ATTR(COMMAND_, COPY_BUFFER_RECT); ADD_ATTR(COMMAND_, USER); #endif #ifdef cl_ext_migrate_memobject ADD_ATTR(COMMAND_, MIGRATE_MEM_OBJECT_EXT); #endif #if PYOPENCL_CL_VERSION >= 0x1020 ADD_ATTR(COMMAND_, BARRIER); ADD_ATTR(COMMAND_, MIGRATE_MEM_OBJECTS); ADD_ATTR(COMMAND_, FILL_BUFFER); ADD_ATTR(COMMAND_, FILL_IMAGE); #endif } { py::class_ cls("command_execution_status", py::no_init); ADD_ATTR(, COMPLETE); ADD_ATTR(, RUNNING); ADD_ATTR(, SUBMITTED); ADD_ATTR(, QUEUED); } { py::class_ cls("profiling_info", py::no_init); ADD_ATTR(PROFILING_COMMAND_, QUEUED); ADD_ATTR(PROFILING_COMMAND_, SUBMIT); ADD_ATTR(PROFILING_COMMAND_, START); ADD_ATTR(PROFILING_COMMAND_, END); } /* not needed--filled in automatically by implementation. #if PYOPENCL_CL_VERSION >= 0x1010 { py::class_ cls("buffer_create_type", py::no_init); ADD_ATTR(BUFFER_CREATE_TYPE_, REGION); } #endif */ { py::class_ cls( "mem_migration_flags", py::no_init); #if PYOPENCL_CL_VERSION >= 0x1020 ADD_ATTR(MIGRATE_MEM_OBJECT_, HOST); ADD_ATTR(MIGRATE_MEM_OBJECT_, CONTENT_UNDEFINED); #endif } { py::class_ cls( "device_partition_property_ext", py::no_init); #if defined(cl_ext_device_fission) && defined(PYOPENCL_USE_DEVICE_FISSION) ADD_ATTR_SUFFIX(DEVICE_PARTITION_, EQUALLY, _EXT); ADD_ATTR_SUFFIX(DEVICE_PARTITION_, BY_COUNTS, _EXT); ADD_ATTR_SUFFIX(DEVICE_PARTITION_, BY_NAMES, _EXT); ADD_ATTR_SUFFIX(DEVICE_PARTITION_, BY_AFFINITY_DOMAIN, _EXT); ADD_ATTR_SUFFIX(, PROPERTIES_LIST_END, _EXT); ADD_ATTR_SUFFIX(, PARTITION_BY_COUNTS_LIST_END, _EXT); ADD_ATTR_SUFFIX(, PARTITION_BY_NAMES_LIST_END, _EXT); #endif } { py::class_ cls("affinity_domain_ext", py::no_init); #if defined(cl_ext_device_fission) && defined(PYOPENCL_USE_DEVICE_FISSION) ADD_ATTR_SUFFIX(AFFINITY_DOMAIN_, L1_CACHE, _EXT); ADD_ATTR_SUFFIX(AFFINITY_DOMAIN_, L2_CACHE, _EXT); ADD_ATTR_SUFFIX(AFFINITY_DOMAIN_, L3_CACHE, _EXT); ADD_ATTR_SUFFIX(AFFINITY_DOMAIN_, L4_CACHE, _EXT); ADD_ATTR_SUFFIX(AFFINITY_DOMAIN_, NUMA, _EXT); ADD_ATTR_SUFFIX(AFFINITY_DOMAIN_, NEXT_FISSIONABLE, _EXT); #endif } { py::class_ cls( "device_partition_property", py::no_init); #if PYOPENCL_CL_VERSION >= 0x1020 ADD_ATTR(DEVICE_PARTITION_, EQUALLY); ADD_ATTR(DEVICE_PARTITION_, BY_COUNTS); ADD_ATTR(DEVICE_PARTITION_, BY_COUNTS_LIST_END); ADD_ATTR(DEVICE_PARTITION_, BY_AFFINITY_DOMAIN); #endif } { py::class_ cls("device_affinity_domain", py::no_init); #if PYOPENCL_CL_VERSION >= 0x1020 ADD_ATTR(DEVICE_AFFINITY_DOMAIN_, NUMA); ADD_ATTR(DEVICE_AFFINITY_DOMAIN_, L4_CACHE); ADD_ATTR(DEVICE_AFFINITY_DOMAIN_, L3_CACHE); ADD_ATTR(DEVICE_AFFINITY_DOMAIN_, L2_CACHE); ADD_ATTR(DEVICE_AFFINITY_DOMAIN_, L1_CACHE); ADD_ATTR(DEVICE_AFFINITY_DOMAIN_, NEXT_PARTITIONABLE); #endif } #ifdef HAVE_GL { py::class_ cls("gl_object_type", py::no_init); 
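      // cl_gl_object_type values: the kind of GL object a CL memory object
      // was created from, as reported by clGetGLObjectInfo for GL-shared
      // buffers, textures and renderbuffers.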
ADD_ATTR(GL_OBJECT_, BUFFER); ADD_ATTR(GL_OBJECT_, TEXTURE2D); ADD_ATTR(GL_OBJECT_, TEXTURE3D); ADD_ATTR(GL_OBJECT_, RENDERBUFFER); } { py::class_ cls("gl_texture_info", py::no_init); ADD_ATTR(GL_, TEXTURE_TARGET); ADD_ATTR(GL_, MIPMAP_LEVEL); } #endif { py::class_ cls("migrate_mem_object_flags_ext", py::no_init); #ifdef cl_ext_migrate_memobject ADD_ATTR_SUFFIX(MIGRATE_MEM_OBJECT_, HOST, _EXT); #endif } // }}} } // vim: foldmethod=marker pyopencl-2013.2/src/wrapper/_pvt_struct_v2.cpp0000644000175000000500000011240712245716340020111 0ustar tomussrc/* struct module -- pack values into and (out of) strings */ /* New version supporting byte order, alignment and size options, character strings, and unsigned numbers */ /* Compared with vanilla Python's struct module, this adds support * for packing complex values and only supports native packing. * (the minimum that's needed for PyOpenCL.) */ #define PY_SSIZE_T_CLEAN #include "Python.h" #include "structseq.h" #include "structmember.h" #include #include "numpy_init.hpp" // static PyTypeObject PyStructType; /* compatibility macros */ #if (PY_VERSION_HEX < 0x02050000) #ifndef PY_SSIZE_T_MIN typedef long int Py_ssize_t; #endif #define PyInt_FromSsize_t(x) PyInt_FromLong(x) #define PyInt_AsSsize_t(x) PyInt_AsLong(x) #endif /* If PY_STRUCT_FLOAT_COERCE is defined, the struct module will allow float arguments for integer formats with a warning for backwards compatibility. */ #define PY_STRUCT_FLOAT_COERCE 1 #ifdef PY_STRUCT_FLOAT_COERCE #define FLOAT_COERCE "integer argument expected, got float" #endif /* Compatibility with Py2.5 and older */ #ifndef Py_TYPE # define Py_TYPE(o) ((o)->ob_type) #endif #ifndef PyVarObject_HEAD_INIT #define PyVarObject_HEAD_INIT(type, size) \ PyObject_HEAD_INIT(type) size, #endif #ifndef SIZEOF_SIZE_T #define SIZEOF_SIZE_T sizeof(size_t) #endif #ifndef PY_SSIZE_T_MAX #define PY_SSIZE_T_MAX LONG_MAX #endif /* The translation function for each format character is table driven */ typedef struct _formatdef { char format; Py_ssize_t size; Py_ssize_t alignment; PyObject* (*unpack)(const char *, const struct _formatdef *); int (*pack)(char *, PyObject *, const struct _formatdef *); } formatdef; typedef struct _formatcode { const struct _formatdef *fmtdef; Py_ssize_t offset; Py_ssize_t size; } formatcode; /* Struct object interface */ typedef struct { PyObject_HEAD Py_ssize_t s_size; Py_ssize_t s_len; formatcode *s_codes; PyObject *s_format; PyObject *weakreflist; /* List of weak references */ } PyStructObject; #define PyStruct_Check(op) PyObject_TypeCheck(op, &PyStructType) #define PyStruct_CheckExact(op) (Py_TYPE(op) == &PyStructType) /* Exception */ static PyObject *StructError; /* Define various structs to figure out the alignments of types */ typedef struct { char c; short x; } st_short; typedef struct { char c; int x; } st_int; typedef struct { char c; long x; } st_long; typedef struct { char c; float x; } st_float; typedef struct { char c; double x; } st_double; typedef struct { char c; void *x; } st_void_p; #define SHORT_ALIGN (sizeof(st_short) - sizeof(short)) #define INT_ALIGN (sizeof(st_int) - sizeof(int)) #define LONG_ALIGN (sizeof(st_long) - sizeof(long)) #define FLOAT_ALIGN (sizeof(st_float) - sizeof(float)) #define DOUBLE_ALIGN (sizeof(st_double) - sizeof(double)) #define VOID_P_ALIGN (sizeof(st_void_p) - sizeof(void *)) /* We can't support q and Q in native mode unless the compiler does; in std mode, they're 8 bytes on all platforms. 
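   (Since this module supports only native packing, q and Q end up available
   exactly when HAVE_LONG_LONG is defined; see the #ifdef guards and the
   corresponding native_table entries below.)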
*/ #ifdef HAVE_LONG_LONG typedef struct { char c; PY_LONG_LONG x; } s_long_long; #define LONG_LONG_ALIGN (sizeof(s_long_long) - sizeof(PY_LONG_LONG)) #endif #define BOOL_TYPE bool typedef struct { char c; bool x; } s_bool; #define BOOL_ALIGN (sizeof(s_bool) - sizeof(BOOL_TYPE)) #define STRINGIFY(x) #x #ifdef __powerc #pragma options align=reset #endif static char *integer_codes = "bBhHiIlLqQ"; static void s_dealloc(PyStructObject *s); static int s_init(PyObject *self, PyObject *args, PyObject *kwds); static PyObject *s_new(PyTypeObject *type, PyObject *args, PyObject *kwds); static PyObject *s_pack(PyObject *self, PyObject *args); static PyObject *s_pack_into(PyObject *self, PyObject *args); static PyObject *s_unpack(PyObject *self, PyObject *inputstr); static PyObject *s_unpack_from(PyObject *self, PyObject *args, PyObject *kwds); static PyObject *s_get_format(PyStructObject *self, void *unused); static PyObject *s_get_size(PyStructObject *self, void *unused); PyDoc_STRVAR(s__doc__, "Compiled struct object"); /* List of functions */ PyDoc_STRVAR(s_pack__doc__, "S.pack(v1, v2, ...) -> string\n\ \n\ Return a string containing values v1, v2, ... packed according to this\n\ Struct's format. See struct.__doc__ for more on format strings."); PyDoc_STRVAR(s_pack_into__doc__, "S.pack_into(buffer, offset, v1, v2, ...)\n\ \n\ Pack the values v1, v2, ... according to this Struct's format, write \n\ the packed bytes into the writable buffer buf starting at offset. Note\n\ that the offset is not an optional argument. See struct.__doc__ for \n\ more on format strings."); PyDoc_STRVAR(s_unpack__doc__, "S.unpack(str) -> (v1, v2, ...)\n\ \n\ Return tuple containing values unpacked according to this Struct's format.\n\ Requires len(str) == self.size. See struct.__doc__ for more on format\n\ strings."); PyDoc_STRVAR(s_unpack_from__doc__, "S.unpack_from(buffer[, offset]) -> (v1, v2, ...)\n\ \n\ Return tuple containing values unpacked according to this Struct's format.\n\ Unlike unpack, unpack_from can unpack values from any object supporting\n\ the buffer API, not just str. 
Requires len(buffer[offset:]) >= self.size.\n\ See struct.__doc__ for more on format strings."); static struct PyMethodDef s_methods[] = { {"pack", s_pack, METH_VARARGS, s_pack__doc__}, {"pack_into", s_pack_into, METH_VARARGS, s_pack_into__doc__}, {"unpack", s_unpack, METH_O, s_unpack__doc__}, {"unpack_from", (PyCFunction)s_unpack_from, METH_VARARGS|METH_KEYWORDS, s_unpack_from__doc__}, {NULL, NULL} /* sentinel */ }; #define OFF(x) offsetof(PyStructObject, x) static PyGetSetDef s_getsetlist[] = { {"format", (getter)s_get_format, (setter)NULL, "struct format string", NULL}, {"size", (getter)s_get_size, (setter)NULL, "struct size in bytes", NULL}, {NULL} /* sentinel */ }; static PyTypeObject PyStructType = { PyVarObject_HEAD_INIT(NULL, 0) "Struct", sizeof(PyStructObject), 0, (destructor)s_dealloc, /* tp_dealloc */ 0, /* tp_print */ 0, /* tp_getattr */ 0, /* tp_setattr */ 0, /* tp_compare */ 0, /* tp_repr */ 0, /* tp_as_number */ 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ 0, /* tp_hash */ 0, /* tp_call */ 0, /* tp_str */ PyObject_GenericGetAttr, /* tp_getattro */ PyObject_GenericSetAttr, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_WEAKREFS,/* tp_flags */ s__doc__, /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ offsetof(PyStructObject, weakreflist), /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ s_methods, /* tp_methods */ NULL, /* tp_members */ s_getsetlist, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ s_init, /* tp_init */ PyType_GenericAlloc,/* tp_alloc */ s_new, /* tp_new */ PyObject_Del, /* tp_free */ }; /* Helper to get a PyLongObject by hook or by crook. Caller should decref. */ static PyObject * get_pylong(PyObject *v) { PyNumberMethods *m; assert(v != NULL); if (PyInt_Check(v)) return PyLong_FromLong(PyInt_AS_LONG(v)); if (PyLong_Check(v)) { Py_INCREF(v); return v; } m = Py_TYPE(v)->tp_as_number; if (m != NULL && m->nb_long != NULL) { v = m->nb_long(v); if (v == NULL) return NULL; if (PyLong_Check(v)) return v; Py_DECREF(v); } PyErr_SetString(StructError, "cannot convert argument to long"); return NULL; } /* Helper to convert a Python object to a C long. Sets an exception (struct.error for an inconvertible type, OverflowError for out-of-range values) and returns -1 on error. */ static int get_long(PyObject *v, long *p) { long x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsLong(v); Py_DECREF(v); if (x == (long)-1 && PyErr_Occurred()) return -1; *p = x; return 0; } /* Same, but handling unsigned long */ static int get_ulong(PyObject *v, unsigned long *p) { unsigned long x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsUnsignedLong(v); Py_DECREF(v); if (x == (unsigned long)-1 && PyErr_Occurred()) return -1; *p = x; return 0; } #ifdef HAVE_LONG_LONG /* Same, but handling native long long. */ static int get_longlong(PyObject *v, PY_LONG_LONG *p) { PY_LONG_LONG x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsLongLong(v); Py_DECREF(v); if (x == (PY_LONG_LONG)-1 && PyErr_Occurred()) return -1; *p = x; return 0; } /* Same, but handling native unsigned long long. 
*/ static int get_ulonglong(PyObject *v, unsigned PY_LONG_LONG *p) { unsigned PY_LONG_LONG x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsUnsignedLongLong(v); Py_DECREF(v); if (x == (unsigned PY_LONG_LONG)-1 && PyErr_Occurred()) return -1; *p = x; return 0; } #endif #if (SIZEOF_LONG > SIZEOF_INT) /* Helper to format the range error exceptions */ static int _range_error(const formatdef *f, int is_unsigned) { /* ulargest is the largest unsigned value with f->size bytes. * Note that the simpler: * ((size_t)1 << (f->size * 8)) - 1 * doesn't work when f->size == sizeof(size_t) because C doesn't * define what happens when a left shift count is >= the number of * bits in the integer being shifted; e.g., on some boxes it doesn't * shift at all when they're equal. */ const size_t ulargest = (size_t)-1 >> ((SIZEOF_SIZE_T - f->size)*8); assert(f->size >= 1 && f->size <= SIZEOF_SIZE_T); if (is_unsigned) PyErr_Format(StructError, "'%c' format requires 0 <= number <= %zu", f->format, ulargest); else { const Py_ssize_t largest = (Py_ssize_t)(ulargest >> 1); PyErr_Format(StructError, "'%c' format requires %zd <= number <= %zd", f->format, ~ largest, largest); } return -1; } #endif /* A large number of small routines follow, with names of the form [bln][up]_TYPE [bln] distiguishes among big-endian, little-endian and native. [pu] distiguishes between pack (to struct) and unpack (from struct). TYPE is one of char, byte, ubyte, etc. */ /* Native mode routines. ****************************************************/ /* NOTE: In all n[up]_ routines handling types larger than 1 byte, there is *no* guarantee that the p pointer is properly aligned for each type, therefore memcpy is called. An intermediate variable is used to compensate for big-endian architectures. Normally both the intermediate variable and the memcpy call will be skipped by C optimisation in little-endian architectures (gcc >= 2.91 does this). */ static PyObject * nu_char(const char *p, const formatdef *f) { return PyString_FromStringAndSize(p, 1); } static PyObject * nu_byte(const char *p, const formatdef *f) { return PyInt_FromLong((long) *(signed char *)p); } static PyObject * nu_ubyte(const char *p, const formatdef *f) { return PyInt_FromLong((long) *(unsigned char *)p); } static PyObject * nu_short(const char *p, const formatdef *f) { short x; memcpy((char *)&x, p, sizeof x); return PyInt_FromLong((long)x); } static PyObject * nu_ushort(const char *p, const formatdef *f) { unsigned short x; memcpy((char *)&x, p, sizeof x); return PyInt_FromLong((long)x); } static PyObject * nu_int(const char *p, const formatdef *f) { int x; memcpy((char *)&x, p, sizeof x); return PyInt_FromLong((long)x); } static PyObject * nu_uint(const char *p, const formatdef *f) { unsigned int x; memcpy((char *)&x, p, sizeof x); #if (SIZEOF_LONG > SIZEOF_INT) return PyInt_FromLong((long)x); #else if (x <= ((unsigned int)LONG_MAX)) return PyInt_FromLong((long)x); return PyLong_FromUnsignedLong((unsigned long)x); #endif } static PyObject * nu_long(const char *p, const formatdef *f) { long x; memcpy((char *)&x, p, sizeof x); return PyInt_FromLong(x); } static PyObject * nu_ulong(const char *p, const formatdef *f) { unsigned long x; memcpy((char *)&x, p, sizeof x); if (x <= LONG_MAX) return PyInt_FromLong((long)x); return PyLong_FromUnsignedLong(x); } /* Native mode doesn't support q or Q unless the platform C supports long long (or, on Windows, __int64). 
*/ #ifdef HAVE_LONG_LONG static PyObject * nu_longlong(const char *p, const formatdef *f) { PY_LONG_LONG x; memcpy((char *)&x, p, sizeof x); if (x >= LONG_MIN && x <= LONG_MAX) return PyInt_FromLong(Py_SAFE_DOWNCAST(x, PY_LONG_LONG, long)); return PyLong_FromLongLong(x); } static PyObject * nu_ulonglong(const char *p, const formatdef *f) { unsigned PY_LONG_LONG x; memcpy((char *)&x, p, sizeof x); if (x <= LONG_MAX) return PyInt_FromLong(Py_SAFE_DOWNCAST(x, unsigned PY_LONG_LONG, long)); return PyLong_FromUnsignedLongLong(x); } #endif static PyObject * nu_bool(const char *p, const formatdef *f) { BOOL_TYPE x; memcpy((char *)&x, p, sizeof x); return PyBool_FromLong(x != 0); } static PyObject * nu_float(const char *p, const formatdef *f) { float x; memcpy((char *)&x, p, sizeof x); return PyFloat_FromDouble((double)x); } static PyObject * nu_double(const char *p, const formatdef *f) { double x; memcpy((char *)&x, p, sizeof x); return PyFloat_FromDouble(x); } static PyObject * nu_complex_float(const char *p, const formatdef *f) { float re, im; memcpy((char *)&re, p, sizeof re); memcpy((char *)&im, p+sizeof re, sizeof im); return PyComplex_FromDoubles((double)re, (double) im); } static PyObject * nu_complex_double(const char *p, const formatdef *f) { double re, im; memcpy((char *)&re, p, sizeof re); memcpy((char *)&im, p+sizeof re, sizeof im); return PyComplex_FromDoubles(re, im); } static PyObject * nu_void_p(const char *p, const formatdef *f) { void *x; memcpy((char *)&x, p, sizeof x); return PyLong_FromVoidPtr(x); } static int np_byte(char *p, PyObject *v, const formatdef *f) { long x; if (get_long(v, &x) < 0) return -1; if (x < -128 || x > 127){ PyErr_SetString(StructError, "byte format requires -128 <= number <= 127"); return -1; } *p = (char)x; return 0; } static int np_ubyte(char *p, PyObject *v, const formatdef *f) { long x; if (get_long(v, &x) < 0) return -1; if (x < 0 || x > 255){ PyErr_SetString(StructError, "ubyte format requires 0 <= number <= 255"); return -1; } *p = (char)x; return 0; } static int np_char(char *p, PyObject *v, const formatdef *f) { if (!PyString_Check(v) || PyString_Size(v) != 1) { PyErr_SetString(StructError, "char format require string of length 1"); return -1; } *p = *PyString_AsString(v); return 0; } static int np_short(char *p, PyObject *v, const formatdef *f) { long x; short y; if (get_long(v, &x) < 0) return -1; if (x < SHRT_MIN || x > SHRT_MAX){ PyErr_SetString(StructError, "short format requires " STRINGIFY(SHRT_MIN) " <= number <= " STRINGIFY(SHRT_MAX)); return -1; } y = (short)x; memcpy(p, (char *)&y, sizeof y); return 0; } static int np_ushort(char *p, PyObject *v, const formatdef *f) { long x; unsigned short y; if (get_long(v, &x) < 0) return -1; if (x < 0 || x > USHRT_MAX){ PyErr_SetString(StructError, "ushort format requires 0 <= number <= " STRINGIFY(USHRT_MAX)); return -1; } y = (unsigned short)x; memcpy(p, (char *)&y, sizeof y); return 0; } static int np_int(char *p, PyObject *v, const formatdef *f) { long x; int y; if (get_long(v, &x) < 0) return -1; #if (SIZEOF_LONG > SIZEOF_INT) if ((x < ((long)INT_MIN)) || (x > ((long)INT_MAX))) return _range_error(f, 0); #endif y = (int)x; memcpy(p, (char *)&y, sizeof y); return 0; } static int np_uint(char *p, PyObject *v, const formatdef *f) { unsigned long x; unsigned int y; if (get_ulong(v, &x) < 0) return -1; y = (unsigned int)x; #if (SIZEOF_LONG > SIZEOF_INT) if (x > ((unsigned long)UINT_MAX)) return _range_error(f, 1); #endif memcpy(p, (char *)&y, sizeof y); return 0; } static int np_long(char *p, 
PyObject *v, const formatdef *f) { long x; if (get_long(v, &x) < 0) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } static int np_ulong(char *p, PyObject *v, const formatdef *f) { unsigned long x; if (get_ulong(v, &x) < 0) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } #ifdef HAVE_LONG_LONG static int np_longlong(char *p, PyObject *v, const formatdef *f) { PY_LONG_LONG x; if (get_longlong(v, &x) < 0) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } static int np_ulonglong(char *p, PyObject *v, const formatdef *f) { unsigned PY_LONG_LONG x; if (get_ulonglong(v, &x) < 0) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } #endif static int np_bool(char *p, PyObject *v, const formatdef *f) { BOOL_TYPE y; y = PyObject_IsTrue(v) != 0; memcpy(p, (char *)&y, sizeof y); return 0; } static int np_float(char *p, PyObject *v, const formatdef *f) { float x = (float)PyFloat_AsDouble(v); if (x == -1 && PyErr_Occurred()) { PyErr_SetString(StructError, "required argument is not a float"); return -1; } memcpy(p, (char *)&x, sizeof x); return 0; } static int np_double(char *p, PyObject *v, const formatdef *f) { double x = PyFloat_AsDouble(v); if (x == -1 && PyErr_Occurred()) { PyErr_SetString(StructError, "required argument is not a float"); return -1; } memcpy(p, (char *)&x, sizeof(double)); return 0; } static int np_complex_float(char *p, PyObject *v, const formatdef *f) { if (PyArray_IsZeroDim(v)) { PyObject *v_cast = PyArray_Cast( reinterpret_cast(v), NPY_CFLOAT); if (!v_cast) return -1; memcpy(p, PyArray_DATA(v_cast), PyArray_NBYTES(v_cast)); Py_DECREF(v_cast); } else { float re = 0.0f; float im = 0.0f; Py_complex cplx; #if (PY_VERSION_HEX < 0x02060000) if (PyComplex_Check(v)) cplx = PyComplex_AsCComplex(v); else if (PyObject_HasAttrString(v, "__complex__")) { PyObject *v2 = PyObject_CallMethod(v, "__complex__", ""); cplx = PyComplex_AsCComplex(v2); Py_DECREF(v2); } else cplx = PyComplex_AsCComplex(v); #else cplx = PyComplex_AsCComplex(v); #endif if (PyErr_Occurred()) { PyErr_SetString(StructError, "required argument is not a complex"); return -1; } re = (float)cplx.real; im = (float)cplx.imag; memcpy(p, (char *)&re, sizeof re); memcpy(p+sizeof re, (char *)&im, sizeof im); } return 0; } static int np_complex_double(char *p, PyObject *v, const formatdef *f) { if (PyArray_IsZeroDim(v)) { PyObject *v_cast = PyArray_Cast( reinterpret_cast(v), NPY_CDOUBLE); if (!v_cast) return -1; memcpy(p, PyArray_DATA(v_cast), PyArray_NBYTES(v_cast)); Py_DECREF(v_cast); } else { double re = 0.0; double im = 0.0; Py_complex cplx; #if (PY_VERSION_HEX < 0x02060000) if (PyComplex_Check(v)) cplx = PyComplex_AsCComplex(v); else if (PyObject_HasAttrString(v, "__complex__")) { PyObject *v2 = PyObject_CallMethod(v, "__complex__", ""); cplx = PyComplex_AsCComplex(v2); Py_DECREF(v2); } else cplx = PyComplex_AsCComplex(v); #else cplx = PyComplex_AsCComplex(v); #endif if (PyErr_Occurred()) { PyErr_SetString(StructError, "required argument is not a complex"); return -1; } re = cplx.real; im = cplx.imag; memcpy(p, (char *)&re, sizeof re); memcpy(p+sizeof re, (char *)&im, sizeof im); } return 0; } static int np_void_p(char *p, PyObject *v, const formatdef *f) { void *x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsVoidPtr(v); Py_DECREF(v); if (x == NULL && PyErr_Occurred()) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } static formatdef native_table[] = { {'x', sizeof(char), 0, NULL}, {'b', sizeof(char), 0, nu_byte, np_byte}, {'B', sizeof(char), 0, nu_ubyte, 
np_ubyte}, {'c', sizeof(char), 0, nu_char, np_char}, {'s', sizeof(char), 0, NULL}, {'p', sizeof(char), 0, NULL}, {'h', sizeof(short), SHORT_ALIGN, nu_short, np_short}, {'H', sizeof(short), SHORT_ALIGN, nu_ushort, np_ushort}, {'i', sizeof(int), INT_ALIGN, nu_int, np_int}, {'I', sizeof(int), INT_ALIGN, nu_uint, np_uint}, {'l', sizeof(long), LONG_ALIGN, nu_long, np_long}, {'L', sizeof(long), LONG_ALIGN, nu_ulong, np_ulong}, #ifdef HAVE_LONG_LONG {'q', sizeof(PY_LONG_LONG), LONG_LONG_ALIGN, nu_longlong, np_longlong}, {'Q', sizeof(PY_LONG_LONG), LONG_LONG_ALIGN, nu_ulonglong,np_ulonglong}, #endif {'?', sizeof(BOOL_TYPE), BOOL_ALIGN, nu_bool, np_bool}, {'f', sizeof(float), FLOAT_ALIGN, nu_float, np_float}, {'d', sizeof(double), DOUBLE_ALIGN, nu_double, np_double}, {'F', 2*sizeof(float), FLOAT_ALIGN, nu_complex_float, np_complex_float}, {'D', 2*sizeof(double), DOUBLE_ALIGN, nu_complex_double, np_complex_double}, {'P', sizeof(void *), VOID_P_ALIGN, nu_void_p, np_void_p}, {0} }; /* Get the table entry for a format code */ static const formatdef * getentry(int c, const formatdef *f) { for (; f->format != '\0'; f++) { if (f->format == c) { return f; } } PyErr_SetString(StructError, "bad char in struct format"); return NULL; } /* Align a size according to a format code */ static Py_ssize_t align(Py_ssize_t size, char c, const formatdef *e) { if (e->format == c) { if (e->alignment) { size = ((size + e->alignment - 1) / e->alignment) * e->alignment; } } return size; } /* calculate the size of a format string */ static int prepare_s(PyStructObject *self) { const formatdef *f; const formatdef *e; formatcode *codes; const char *s; const char *fmt; char c; Py_ssize_t size, len, num, itemsize, x; fmt = PyString_AS_STRING(self->s_format); f = native_table; s = fmt; size = 0; len = 0; while ((c = *s++) != '\0') { if (isspace(Py_CHARMASK(c))) continue; if ('0' <= c && c <= '9') { num = c - '0'; while ('0' <= (c = *s++) && c <= '9') { x = num*10 + (c - '0'); if (x/10 != num) { PyErr_SetString( StructError, "overflow in item count"); return -1; } num = x; } if (c == '\0') break; } else num = 1; e = getentry(c, f); if (e == NULL) return -1; switch (c) { case 's': /* fall through */ case 'p': len++; break; case 'x': break; default: len += num; break; } itemsize = e->size; size = align(size, c, e); x = num * itemsize; size += x; if (x/itemsize != num || size < 0) { PyErr_SetString(StructError, "total struct size too long"); return -1; } } /* check for overflow */ if ((len + 1) > (PY_SSIZE_T_MAX / sizeof(formatcode))) { PyErr_NoMemory(); return -1; } self->s_size = size; self->s_len = len; codes = (formatcode *) PyMem_MALLOC((len + 1) * sizeof(formatcode)); if (codes == NULL) { PyErr_NoMemory(); return -1; } self->s_codes = codes; s = fmt; size = 0; while ((c = *s++) != '\0') { if (isspace(Py_CHARMASK(c))) continue; if ('0' <= c && c <= '9') { num = c - '0'; while ('0' <= (c = *s++) && c <= '9') num = num*10 + (c - '0'); if (c == '\0') break; } else num = 1; e = getentry(c, f); size = align(size, c, e); if (c == 's' || c == 'p') { codes->offset = size; codes->size = num; codes->fmtdef = e; codes++; size += num; } else if (c == 'x') { size += num; } else { while (--num >= 0) { codes->offset = size; codes->size = e->size; codes->fmtdef = e; codes++; size += e->size; } } } codes->fmtdef = NULL; codes->offset = size; codes->size = 0; return 0; } static PyObject * s_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { PyObject *self; assert(type != NULL && type->tp_alloc != NULL); self = type->tp_alloc(type, 0); if 
(self != NULL) { PyStructObject *s = (PyStructObject*)self; Py_INCREF(Py_None); s->s_format = Py_None; s->s_codes = NULL; s->s_size = -1; s->s_len = -1; } return self; } static int s_init(PyObject *self, PyObject *args, PyObject *kwds) { PyStructObject *soself = (PyStructObject *)self; PyObject *o_format = NULL; int ret = 0; static char *kwlist[] = {"format", 0}; assert(PyStruct_Check(self)); if (!PyArg_ParseTupleAndKeywords(args, kwds, "S:Struct", kwlist, &o_format)) return -1; Py_INCREF(o_format); Py_CLEAR(soself->s_format); soself->s_format = o_format; ret = prepare_s(soself); return ret; } static void s_dealloc(PyStructObject *s) { if (s->weakreflist != NULL) PyObject_ClearWeakRefs((PyObject *)s); if (s->s_codes != NULL) { PyMem_FREE(s->s_codes); } Py_XDECREF(s->s_format); Py_TYPE(s)->tp_free((PyObject *)s); } static PyObject * s_unpack_internal(PyStructObject *soself, char *startfrom) { formatcode *code; Py_ssize_t i = 0; PyObject *result = PyTuple_New(soself->s_len); if (result == NULL) return NULL; for (code = soself->s_codes; code->fmtdef != NULL; code++) { PyObject *v; const formatdef *e = code->fmtdef; const char *res = startfrom + code->offset; if (e->format == 's') { v = PyString_FromStringAndSize(res, code->size); } else if (e->format == 'p') { Py_ssize_t n = *(unsigned char*)res; if (n >= code->size) n = code->size - 1; v = PyString_FromStringAndSize(res + 1, n); } else { v = e->unpack(res, e); } if (v == NULL) goto fail; PyTuple_SET_ITEM(result, i++, v); } return result; fail: Py_DECREF(result); return NULL; } static PyObject * s_unpack(PyObject *self, PyObject *inputstr) { char *start; Py_ssize_t len; PyObject *args=NULL, *result; PyStructObject *soself = (PyStructObject *)self; assert(PyStruct_Check(self)); assert(soself->s_codes != NULL); if (inputstr == NULL) goto fail; if (PyString_Check(inputstr) && PyString_GET_SIZE(inputstr) == soself->s_size) { return s_unpack_internal(soself, PyString_AS_STRING(inputstr)); } args = PyTuple_Pack(1, inputstr); if (args == NULL) return NULL; if (!PyArg_ParseTuple(args, "s#:unpack", &start, &len)) goto fail; if (soself->s_size != len) goto fail; result = s_unpack_internal(soself, start); Py_DECREF(args); return result; fail: Py_XDECREF(args); PyErr_Format(StructError, "unpack requires a string argument of length %zd", soself->s_size); return NULL; } static PyObject * s_unpack_from(PyObject *self, PyObject *args, PyObject *kwds) { static char *kwlist[] = {"buffer", "offset", 0}; #if (PY_VERSION_HEX < 0x02050000) static char *fmt = "z#|i:unpack_from"; #else static char *fmt = "z#|n:unpack_from"; #endif Py_ssize_t buffer_len = 0, offset = 0; char *buffer = NULL; PyStructObject *soself = (PyStructObject *)self; assert(PyStruct_Check(self)); assert(soself->s_codes != NULL); if (!PyArg_ParseTupleAndKeywords(args, kwds, fmt, kwlist, &buffer, &buffer_len, &offset)) return NULL; if (buffer == NULL) { PyErr_Format(StructError, "unpack_from requires a buffer argument"); return NULL; } if (offset < 0) offset += buffer_len; if (offset < 0 || (buffer_len - offset) < soself->s_size) { PyErr_Format(StructError, "unpack_from requires a buffer of at least %zd bytes", soself->s_size); return NULL; } return s_unpack_internal(soself, buffer + offset); } /* * Guts of the pack function. * * Takes a struct object, a tuple of arguments, and offset in that tuple of * argument for where to start processing the arguments for packing, and a * character buffer for writing the packed string. 
The caller must insure * that the buffer may contain the required length for packing the arguments. * 0 is returned on success, 1 is returned if there is an error. * */ static int s_pack_internal(PyStructObject *soself, PyObject *args, int offset, char* buf) { formatcode *code; /* XXX(nnorwitz): why does i need to be a local? can we use the offset parameter or do we need the wider width? */ Py_ssize_t i; memset(buf, '\0', soself->s_size); i = offset; for (code = soself->s_codes; code->fmtdef != NULL; code++) { Py_ssize_t n; PyObject *v = PyTuple_GET_ITEM(args, i++); const formatdef *e = code->fmtdef; char *res = buf + code->offset; if (e->format == 's') { if (!PyString_Check(v)) { if (!PyObject_CheckReadBuffer(v)) { PyErr_SetString(StructError, "argument for 's' must " "be a string or a buffer"); return -1; } else { const void *buf; Py_ssize_t len; if (PyObject_AsReadBuffer(v, &buf, &len)) return -1; if (len > code->size) len = code->size; if (len > 0) memcpy(res, buf, len); } } else { n = PyString_GET_SIZE(v); if (n > code->size) n = code->size; if (n > 0) memcpy(res, PyString_AS_STRING(v), n); } } else if (e->format == 'p') { if (!PyString_Check(v)) { PyErr_SetString(StructError, "argument for 'p' must " "be a string"); return -1; } n = PyString_GET_SIZE(v); if (n > (code->size - 1)) n = code->size - 1; if (n > 0) memcpy(res + 1, PyString_AS_STRING(v), n); if (n > 255) n = 255; *res = Py_SAFE_DOWNCAST(n, Py_ssize_t, unsigned char); } else if (e->pack(res, v, e) < 0) { if (strchr(integer_codes, e->format) != NULL && PyErr_ExceptionMatches(PyExc_OverflowError)) PyErr_Format(StructError, "integer out of range for " "'%c' format code", e->format); return -1; } } /* Success */ return 0; } static PyObject * s_pack(PyObject *self, PyObject *args) { PyStructObject *soself; PyObject *result; /* Validate arguments. */ soself = (PyStructObject *)self; assert(PyStruct_Check(self)); assert(soself->s_codes != NULL); if (PyTuple_GET_SIZE(args) != soself->s_len) { PyErr_Format(StructError, "pack requires exactly %zd arguments", soself->s_len); return NULL; } /* Allocate a new string */ result = PyString_FromStringAndSize((char *)NULL, soself->s_size); if (result == NULL) return NULL; /* Call the guts */ if ( s_pack_internal(soself, args, 0, PyString_AS_STRING(result)) != 0 ) { Py_DECREF(result); return NULL; } return result; } static PyObject * s_pack_into(PyObject *self, PyObject *args) { PyStructObject *soself; char *buffer; Py_ssize_t buffer_len, offset; /* Validate arguments. +1 is for the first arg as buffer. */ soself = (PyStructObject *)self; assert(PyStruct_Check(self)); assert(soself->s_codes != NULL); if (PyTuple_GET_SIZE(args) != (soself->s_len + 2)) { PyErr_Format(StructError, "pack_into requires exactly %zd arguments", (soself->s_len + 2)); return NULL; } /* Extract a writable memory buffer from the first argument */ if ( PyObject_AsWriteBuffer(PyTuple_GET_ITEM(args, 0), (void**)&buffer, &buffer_len) == -1 ) { return NULL; } assert( buffer_len >= 0 ); /* Extract the offset from the first argument */ offset = PyInt_AsSsize_t(PyTuple_GET_ITEM(args, 1)); if (offset == -1 && PyErr_Occurred()) return NULL; /* Support negative offsets. 
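   A negative offset is taken to be relative to the end of the buffer,
   matching Python's usual sequence-indexing convention.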
*/ if (offset < 0) offset += buffer_len; /* Check boundaries */ if (offset < 0 || (buffer_len - offset) < soself->s_size) { PyErr_Format(StructError, "pack_into requires a buffer of at least %zd bytes", soself->s_size); return NULL; } /* Call the guts */ if ( s_pack_internal(soself, args, 2, buffer + offset) != 0 ) { return NULL; } Py_RETURN_NONE; } static PyObject * s_get_format(PyStructObject *self, void *unused) { Py_INCREF(self->s_format); return self->s_format; } static PyObject * s_get_size(PyStructObject *self, void *unused) { return PyInt_FromSsize_t(self->s_size); } /* ---- Standalone functions ---- */ #define MAXCACHE 100 static PyObject *cache = NULL; static PyObject * cache_struct(PyObject *fmt) { PyObject * s_object; if (cache == NULL) { cache = PyDict_New(); if (cache == NULL) return NULL; } s_object = PyDict_GetItem(cache, fmt); if (s_object != NULL) { Py_INCREF(s_object); return s_object; } s_object = PyObject_CallFunctionObjArgs((PyObject *)(&PyStructType), fmt, NULL); if (s_object != NULL) { if (PyDict_Size(cache) >= MAXCACHE) PyDict_Clear(cache); /* Attempt to cache the result */ if (PyDict_SetItem(cache, fmt, s_object) == -1) PyErr_Clear(); } return s_object; } PyDoc_STRVAR(clearcache_doc, "Clear the internal cache."); static PyObject * clearcache(PyObject *self) { Py_CLEAR(cache); Py_RETURN_NONE; } PyDoc_STRVAR(calcsize_doc, "Return size of C struct described by format string fmt."); static PyObject * calcsize(PyObject *self, PyObject *fmt) { Py_ssize_t n; PyObject *s_object = cache_struct(fmt); if (s_object == NULL) return NULL; n = ((PyStructObject *)s_object)->s_size; Py_DECREF(s_object); return PyInt_FromSsize_t(n); } PyDoc_STRVAR(pack_doc, "Return string containing values v1, v2, ... packed according to fmt."); static PyObject * pack(PyObject *self, PyObject *args) { PyObject *s_object, *fmt, *newargs, *result; Py_ssize_t n = PyTuple_GET_SIZE(args); if (n == 0) { PyErr_SetString(PyExc_TypeError, "missing format argument"); return NULL; } fmt = PyTuple_GET_ITEM(args, 0); newargs = PyTuple_GetSlice(args, 1, n); if (newargs == NULL) return NULL; s_object = cache_struct(fmt); if (s_object == NULL) { Py_DECREF(newargs); return NULL; } result = s_pack(s_object, newargs); Py_DECREF(newargs); Py_DECREF(s_object); return result; } PyDoc_STRVAR(pack_into_doc, "Pack the values v1, v2, ... 
according to fmt.\n\ Write the packed bytes into the writable buffer buf starting at offset."); static PyObject * pack_into(PyObject *self, PyObject *args) { PyObject *s_object, *fmt, *newargs, *result; Py_ssize_t n = PyTuple_GET_SIZE(args); if (n == 0) { PyErr_SetString(PyExc_TypeError, "missing format argument"); return NULL; } fmt = PyTuple_GET_ITEM(args, 0); newargs = PyTuple_GetSlice(args, 1, n); if (newargs == NULL) return NULL; s_object = cache_struct(fmt); if (s_object == NULL) { Py_DECREF(newargs); return NULL; } result = s_pack_into(s_object, newargs); Py_DECREF(newargs); Py_DECREF(s_object); return result; } PyDoc_STRVAR(unpack_doc, "Unpack the string containing packed C structure data, according to fmt.\n\ Requires len(string) == calcsize(fmt)."); static PyObject * unpack(PyObject *self, PyObject *args) { PyObject *s_object, *fmt, *inputstr, *result; if (!PyArg_UnpackTuple(args, "unpack", 2, 2, &fmt, &inputstr)) return NULL; s_object = cache_struct(fmt); if (s_object == NULL) return NULL; result = s_unpack(s_object, inputstr); Py_DECREF(s_object); return result; } PyDoc_STRVAR(unpack_from_doc, "Unpack the buffer, containing packed C structure data, according to\n\ fmt, starting at offset. Requires len(buffer[offset:]) >= calcsize(fmt)."); static PyObject * unpack_from(PyObject *self, PyObject *args, PyObject *kwds) { PyObject *s_object, *fmt, *newargs, *result; Py_ssize_t n = PyTuple_GET_SIZE(args); if (n == 0) { PyErr_SetString(PyExc_TypeError, "missing format argument"); return NULL; } fmt = PyTuple_GET_ITEM(args, 0); newargs = PyTuple_GetSlice(args, 1, n); if (newargs == NULL) return NULL; s_object = cache_struct(fmt); if (s_object == NULL) { Py_DECREF(newargs); return NULL; } result = s_unpack_from(s_object, newargs, kwds); Py_DECREF(newargs); Py_DECREF(s_object); return result; } static struct PyMethodDef module_functions[] = { {"_clearcache", (PyCFunction)clearcache, METH_NOARGS, clearcache_doc}, {"calcsize", calcsize, METH_O, calcsize_doc}, {"pack", pack, METH_VARARGS, pack_doc}, {"pack_into", pack_into, METH_VARARGS, pack_into_doc}, {"unpack", unpack, METH_VARARGS, unpack_doc}, {"unpack_from", (PyCFunction)unpack_from, METH_VARARGS|METH_KEYWORDS, unpack_from_doc}, {NULL, NULL} /* sentinel */ }; /* Module initialization */ PyDoc_STRVAR(module_doc, "Functions to convert between Python values and C structs represented\n\ as Python strings. 
It uses format strings (explained below) as compact\n\ descriptions of the lay-out of the C structs and the intended conversion\n\ to/from Python values.\n\ \n\ The remaining chars indicate types of args and must match exactly;\n\ these can be preceded by a decimal repeat count:\n\ x: pad byte (no data); c:char; b:signed byte; B:unsigned byte;\n\ ?: _Bool (requires C99; if not available, char is used instead)\n\ h:short; H:unsigned short; i:int; I:unsigned int;\n\ l:long; L:unsigned long; f:float; d:double.\n\ Special cases (preceding decimal count indicates length):\n\ s:string (array of char); p: pascal string (with count byte).\n\ Special case (only available in native format):\n\ P:an integer type that is wide enough to hold a pointer.\n\ Special case (not in native mode unless 'long long' in platform C):\n\ q:long long; Q:unsigned long long\n\ Whitespace between formats is ignored.\n\ \n\ The variable struct.error is an exception raised on errors.\n"); PyMODINIT_FUNC init_pvt_struct(void) { PyObject *ver, *m; ver = PyString_FromString("0.2"); if (ver == NULL) return; m = Py_InitModule3("_pvt_struct", module_functions, module_doc); if (m == NULL) return; Py_TYPE(&PyStructType) = &PyType_Type; if (PyType_Ready(&PyStructType) < 0) return; /* This speed trick can't be used until overflow masking goes away, because native endian always raises exceptions instead of overflow masking. */ /* Add some symbolic constants to the module */ if (StructError == NULL) { StructError = PyErr_NewException("pyopencl._pvt_struct.error", NULL, NULL); if (StructError == NULL) return; } Py_INCREF(StructError); PyModule_AddObject(m, "error", StructError); Py_INCREF((PyObject*)&PyStructType); PyModule_AddObject(m, "Struct", (PyObject*)&PyStructType); PyModule_AddObject(m, "__version__", ver); PyModule_AddIntConstant(m, "_PY_STRUCT_RANGE_CHECKING", 1); #ifdef PY_STRUCT_FLOAT_COERCE PyModule_AddIntConstant(m, "_PY_STRUCT_FLOAT_COERCE", 1); #endif } // vim: noexpandtab:sw=8 pyopencl-2013.2/src/wrapper/bitlog.cpp0000644000175000000500000000164412245716340016406 0ustar tomussrc#include "bitlog.hpp" /* from http://graphics.stanford.edu/~seander/bithacks.html */ const char pyopencl::log_table_8[] = { 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 }; pyopencl-2013.2/Makefile.in0000644000175000000500000000050312245716340014211 0ustar tomussrc.PHONY : all install clean tags dist userdoc devdoc all: tags ${PYTHON_EXE} setup.py build dist: ${PYTHON_EXE} setup.py sdist install: tags ${PYTHON_EXE} setup.py install clean: rm -Rf build rm -f tags tags: ctags -R src || true tests: echo "running tests" find ./test -type f -name "*.py" -exec python {} \; pyopencl-2013.2/pyopencl/0002755000175000000500000000000012245716342014003 5ustar 
tomussrcpyopencl-2013.2/pyopencl/clrandom.py0000644000175000000500000003270612245716342016162 0ustar tomussrc# encoding: utf8 from __future__ import division __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ # {{{ documentation __doc__ = u""" PyOpenCL now includes and uses the `RANLUXCL random number generator `_ by Ivar Ursin Nikolaisen. In addition to being usable through the convenience functions above, it is available in any piece of code compiled through PyOpenCL by:: #include See the `source `_ for some documentation if you're planning on using RANLUXCL directly. The RANLUX generator is described in the following two articles. If you use the generator for scientific purposes, please consider citing them: * Martin Lüscher, A portable high-quality random number generator for lattice field theory simulations, `Computer Physics Communications 79 (1994) 100-110 `_ * F. James, RANLUX: A Fortran implementation of the high-quality pseudorandom number generator of Lüscher, `Computer Physics Communications 79 (1994) 111-114 `_ """ # }}} import pyopencl as cl import pyopencl.array as cl_array from pyopencl.tools import first_arg_dependent_memoize from pytools import memoize_method import numpy as np class RanluxGenerator(object): """ .. versionadded:: 2011.2 .. attribute:: state A :class:`pyopencl.array.Array` containing the state of the generator. .. attribute:: nskip nskip is an integer which can (optionally) be defined in the kernel code as RANLUXCL_NSKIP. If this is done the generator will be faster for luxury setting 0 and 1, or when the p-value is manually set to a multiple of 24. """ def __init__(self, queue, num_work_items=None, luxury=None, seed=None, no_warmup=False, use_legacy_init=False, max_work_items=None): """ :param queue: :class:`pyopencl.CommandQueue`, only used for initialization :param luxury: the "luxury value" of the generator, and should be 0-4, where 0 is fastest and 4 produces the best numbers. It can also be >=24, in which case it directly sets the p-value of RANLUXCL. :param num_work_items: is the number of generators to initialize, usually corresponding to the number of work-items in the NDRange RANLUXCL will be used with. May be `None`, in which case a default value is used. :param max_work_items: should reflect the maximum number of work-items that will be used on any parallel instance of RANLUXCL. 
So for instance if we are launching 5120 work-items on GPU1 and 10240 work-items on GPU2, GPU1's RANLUXCLTab would be generated by calling ranluxcl_intialization with numWorkitems = 5120 while GPU2's RANLUXCLTab would use numWorkitems = 10240. However maxWorkitems must be at least 10240 for both GPU1 and GPU2, and it must be set to the same value for both. (may be `None`) .. versionchanged:: 2013.1 Added default value for `num_work_items`. """ if luxury is None: luxury = 4 if num_work_items is None: if queue.device.type & cl.device_type.CPU: num_work_items = 8 * queue.device.max_compute_units else: num_work_items = 64 * queue.device.max_compute_units if seed is None: from time import time seed = int(time()*1e6) % 2 << 30 self.context = queue.context self.luxury = luxury self.num_work_items = num_work_items from pyopencl.characterize import has_double_support self.support_double = has_double_support(queue.device) self.no_warmup = no_warmup self.use_legacy_init = use_legacy_init self.max_work_items = max_work_items src = """ %(defines)s #include kernel void init_ranlux(unsigned seeds, global ranluxcl_state_t *ranluxcltab) { if (get_global_id(0) < %(num_work_items)d) ranluxcl_initialization(seeds, ranluxcltab); } """ % { "defines": self.generate_settings_defines(), "num_work_items": num_work_items } prg = cl.Program(queue.context, src).build() # {{{ compute work group size wg_size = None import sys import platform if ("darwin" in sys.platform and "Apple" in queue.device.platform.vendor and platform.mac_ver()[0].startswith("10.7") and queue.device.type & cl.device_type.CPU): wg_size = (1,) self.wg_size = wg_size # }}} self.state = cl_array.empty(queue, (num_work_items, 112), dtype=np.uint8) self.state.fill(17) prg.init_ranlux(queue, (num_work_items,), self.wg_size, np.uint32(seed), self.state.data) def generate_settings_defines(self, include_double_pragma=True): lines = [] if include_double_pragma and self.support_double: lines.append("#pragma OPENCL EXTENSION cl_khr_fp64 : enable") lines.append("#define RANLUXCL_LUX %d" % self.luxury) if self.no_warmup: lines.append("#define RANLUXCL_NO_WARMUP") if self.support_double: lines.append("#define RANLUXCL_SUPPORT_DOUBLE") if self.use_legacy_init: lines.append("#define RANLUXCL_USE_LEGACY_INITIALIZATION") if self.max_work_items: lines.append( "#define RANLUXCL_MAXWORKITEMS %d" % self.max_work_items) return "\n".join(lines) @memoize_method def get_gen_kernel(self, dtype, distribution="uniform"): size_multiplier = 1 arg_dtype = dtype if dtype == np.float64: bits = 64 c_type = "double" rng_expr = "(shift + scale * gen)" elif dtype == np.float32: bits = 32 c_type = "float" rng_expr = "(shift + scale * gen)" elif dtype == cl_array.vec.float2: bits = 32 c_type = "float" rng_expr = "(shift + scale * gen)" size_multiplier = 2 arg_dtype = np.float32 elif dtype in [cl_array.vec.float3, cl_array.vec.float4]: bits = 32 c_type = "float" rng_expr = "(shift + scale * gen)" size_multiplier = 4 arg_dtype = np.float32 elif dtype == np.int32: assert distribution == "uniform" bits = 32 c_type = "int" rng_expr = ("(shift " "+ convert_int4((float) scale * gen) " "+ convert_int4((float) (scale / (1<<24)) * gen))") else: raise TypeError("unsupported RNG data type '%s'" % dtype) rl_flavor = "%d%s" % (bits, { "uniform": "", "normal": "norm" }[distribution]) src = """//CL// %(defines)s #include typedef %(output_t)s output_t; typedef %(output_t)s4 output_vec_t; #define NUM_WORKITEMS %(num_work_items)d #define RANLUX_FUNC ranluxcl%(rlflavor)s #define GET_RANDOM_NUM(gen) 
%(rng_expr)s kernel void generate( global ranluxcl_state_t *ranluxcltab, global output_t *output, unsigned long out_size, output_t scale, output_t shift) { ranluxcl_state_t ranluxclstate; ranluxcl_download_seed(&ranluxclstate, ranluxcltab); // output bulk unsigned long idx = get_global_id(0)*4; while (idx + 4 < out_size) { vstore4( GET_RANDOM_NUM(RANLUX_FUNC(&ranluxclstate)), idx >> 2, output); idx += 4*NUM_WORKITEMS; } // output tail output_vec_t tail_ran = GET_RANDOM_NUM(RANLUX_FUNC(&ranluxclstate)); if (idx < out_size) output[idx] = tail_ran.x; if (idx+1 < out_size) output[idx+1] = tail_ran.y; if (idx+2 < out_size) output[idx+2] = tail_ran.z; if (idx+3 < out_size) output[idx+3] = tail_ran.w; ranluxcl_upload_seed(&ranluxclstate, ranluxcltab); } """ % { "defines": self.generate_settings_defines(), "rlflavor": rl_flavor, "output_t": c_type, "num_work_items": self.num_work_items, "rng_expr": rng_expr } prg = cl.Program(self.context, src).build() knl = prg.generate knl.set_scalar_arg_dtypes([None, None, np.uint64, arg_dtype, arg_dtype]) return knl, size_multiplier def fill_uniform(self, ary, a=0, b=1, queue=None): """Fill *ary* with uniformly distributed random numbers in the interval *(a, b)*, endpoints excluded. """ if queue is None: queue = ary.queue knl, size_multiplier = self.get_gen_kernel(ary.dtype, "uniform") knl(queue, (self.num_work_items,), None, self.state.data, ary.data, ary.size*size_multiplier, b-a, a) def uniform(self, *args, **kwargs): """Make a new empty array, apply :meth:`fill_uniform` to it. """ a = kwargs.pop("a", 0) b = kwargs.pop("b", 1) result = cl_array.empty(*args, **kwargs) self.fill_uniform(result, queue=result.queue, a=a, b=b) return result def fill_normal(self, ary, mu=0, sigma=1, queue=None): """Fill *ary* with normally distributed numbers with mean *mu* and standard deviation *sigma*. """ if queue is None: queue = ary.queue knl, size_multiplier = self.get_gen_kernel(ary.dtype, "normal") knl(queue, (self.num_work_items,), self.wg_size, self.state.data, ary.data, ary.size*size_multiplier, sigma, mu) def normal(self, *args, **kwargs): """Make a new empty array, apply :meth:`fill_normal` to it. """ mu = kwargs.pop("mu", 0) sigma = kwargs.pop("sigma", 1) result = cl_array.empty(*args, **kwargs) self.fill_normal(result, queue=result.queue, mu=mu, sigma=sigma) return result @memoize_method def get_sync_kernel(self): src = """//CL// %(defines)s #include kernel void sync( global ranluxcl_state_t *ranluxcltab) { ranluxcl_state_t ranluxclstate; ranluxcl_download_seed(&ranluxclstate, ranluxcltab); ranluxcl_synchronize(&ranluxclstate); ranluxcl_upload_seed(&ranluxclstate, ranluxcltab); } """ % { "defines": self.generate_settings_defines(), } prg = cl.Program(self.context, src).build() return prg.sync def synchronize(self, queue): """The generator gets inefficient when different work items invoke the generator a differing number of times. This function ensures efficiency. """ self.get_sync_kernel()(queue, (self.num_work_items,), self.wg_size, self.state.data) @first_arg_dependent_memoize def _get_generator(queue, luxury=None): gen = RanluxGenerator(queue, luxury=luxury) queue.finish() return gen def fill_rand(result, queue=None, luxury=4, a=0, b=1): """Fill *result* with random values of `dtype` in the range [0,1). """ if queue is None: queue = result.queue gen = _get_generator(queue, luxury=luxury) gen.fill_uniform(result, a=a, b=b) def rand(queue, shape, dtype, luxury=None, a=0, b=1): """Return an array of `shape` filled with random values of `dtype` in the range [a,b). 
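
A minimal usage sketch, assuming a context can be obtained via
:func:`pyopencl.create_some_context` (the names ``ctx``, ``queue``, ``a`` and
``b`` are illustrative)::

    import numpy as np
    import pyopencl as cl
    import pyopencl.clrandom as clrandom

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # 10000 uniformly distributed float32 samples in [0, 1)
    a = clrandom.rand(queue, (10000,), np.float32)

    # samples in [-1, 1) instead
    b = clrandom.rand(queue, (10000,), np.float32, a=-1, b=1)
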
""" from pyopencl.array import Array gen = _get_generator(queue, luxury) result = Array(queue, shape, dtype) gen.fill_uniform(result, a=a, b=b) return result # vim: filetype=pyopencl:foldmethod=marker pyopencl-2013.2/pyopencl/capture_call.py0000644000175000000500000001206712245716342017017 0ustar tomussrcfrom __future__ import with_statement, division __copyright__ = "Copyright (C) 2013 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import pyopencl as cl import numpy as np from pytools.py_codegen import PythonCodeGenerator, Indentation def capture_kernel_call(kernel, filename, queue, g_size, l_size, *args, **kwargs): try: source = kernel._source except AttributeError: raise RuntimeError("cannot capture call, kernel source not available") if source is None: raise RuntimeError("cannot capture call, kernel source not available") cg = PythonCodeGenerator() cg("# generated by pyopencl.capture_call") cg("") cg("import numpy as np") cg("import pyopencl as cl") cg("from base64 import b64decode") cg("from zlib import decompress") cg("mf = cl.mem_flags") cg("") cg('CODE = r"""//CL//') for l in source.split("\n"): cg(l) cg('"""') # {{{ invocation arg_data = [] cg("") cg("") cg("def main():") with Indentation(cg): cg("ctx = cl.create_some_context()") cg("queue = cl.CommandQueue(ctx)") cg("") kernel_args = [] for i, arg in enumerate(args): if isinstance(arg, cl.Buffer): buf = bytearray(arg.size) cl.enqueue_copy(queue, buf, arg) arg_data.append(("arg%d_data" % i, buf)) cg("arg%d = cl.Buffer(ctx, " "mf.READ_WRITE | cl.mem_flags.COPY_HOST_PTR," % i) cg(" hostbuf=decompress(b64decode(arg%d_data)))" % i) kernel_args.append("arg%d" % i) elif isinstance(arg, (int, float)): kernel_args.append(repr(arg)) elif isinstance(arg, np.integer): kernel_args.append("np.%s(%s)" % ( arg.dtype.type.__name__, repr(int(arg)))) elif isinstance(arg, np.floating): kernel_args.append("np.%s(%s)" % ( arg.dtype.type.__name__, repr(float(arg)))) elif isinstance(arg, np.complexfloating): kernel_args.append("np.%s(%s)" % ( arg.dtype.type.__name__, repr(complex(arg)))) else: try: arg_buf = buffer(arg) except: raise RuntimeError("cannot capture: " "unsupported arg nr %d (0-based)" % i) arg_data.append(("arg%d_data" % i, arg_buf)) kernel_args.append("decompress(b64decode(arg%d_data))" % i) cg("") g_times_l = kwargs.get("g_times_l", False) if g_times_l: dim = max(len(g_size), len(l_size)) l_size = l_size + (1,) * (dim-len(l_size)) g_size = g_size + (1,) * (dim-len(g_size)) g_size = tuple( gs*ls for gs, ls in zip(g_size, l_size)) global_offset 
= kwargs.get("global_offset", None) if global_offset is not None: kernel_args.append("global_offset=%s" % repr(global_offset)) cg("prg = cl.Program(ctx, CODE).build()") cg("knl = prg.%s" % kernel.function_name) if hasattr(kernel, "_arg_type_chars"): cg("knl._arg_type_chars = %s" % repr(kernel._arg_type_chars)) cg("knl(queue, %s, %s," % (repr(g_size), repr(l_size))) cg(" %s)" % ", ".join(kernel_args)) # }}} # {{{ data from zlib import compress from base64 import b64encode cg("") line_len = 70 for name, val in arg_data: cg("%s = (" % name) with Indentation(cg): val = str(b64encode(compress(buffer(val)))) i = 0 while i < len(val): cg(repr(val[i:i+line_len])) i += line_len cg(")") # }}} # {{{ file trailer cg("") cg("if __name__ == \"__main__\":") with Indentation(cg): cg("main()") cg("") cg("# vim: filetype=pyopencl") # }}} with open(filename, "w") as outf: outf.write(cg.get()) pyopencl-2013.2/pyopencl/array.py0000644000175000000500000017471212245716342015505 0ustar tomussrc"""CL device arrays.""" from __future__ import division __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" import numpy as np import pyopencl.elementwise as elementwise import pyopencl as cl from pytools import memoize_method from pyopencl.compyte.array import ( as_strided as _as_strided, f_contiguous_strides as _f_contiguous_strides, c_contiguous_strides as _c_contiguous_strides, ArrayFlags as _ArrayFlags, get_common_dtype as _get_common_dtype_base) from pyopencl.characterize import has_double_support def _get_common_dtype(obj1, obj2, queue): return _get_common_dtype_base(obj1, obj2, has_double_support(queue.device)) # {{{ vector types class vec: pass def _create_vector_types(): field_names = ["x", "y", "z", "w"] from pyopencl.tools import get_or_register_dtype vec.types = {} vec.type_to_scalar_and_count = {} counts = [2, 3, 4, 8, 16] for base_name, base_type in [ ('char', np.int8), ('uchar', np.uint8), ('short', np.int16), ('ushort', np.uint16), ('int', np.int32), ('uint', np.uint32), ('long', np.int64), ('ulong', np.uint64), ('float', np.float32), ('double', np.float64), ]: for count in counts: name = "%s%d" % (base_name, count) titles = field_names[:count] padded_count = count if count == 3: padded_count = 4 names = ["s%d" % i for i in range(count)] while len(names) < padded_count: names.append("padding%d" % (len(names)-count)) if len(titles) < len(names): titles.extend((len(names)-len(titles))*[None]) dtype = np.dtype(dict( names=names, formats=[base_type]*padded_count, titles=titles)) get_or_register_dtype(name, dtype) setattr(vec, name, dtype) my_field_names = ",".join(field_names[:count]) my_field_names_defaulted = ",".join( "%s=0" % fn for fn in field_names[:count]) setattr(vec, "make_"+name, staticmethod(eval( "lambda %s: array((%s), dtype=my_dtype)" % (my_field_names_defaulted, my_field_names), dict(array=np.array, my_dtype=dtype)))) vec.types[np.dtype(base_type), count] = dtype vec.type_to_scalar_and_count[dtype] = np.dtype(base_type), count _create_vector_types() # }}} # {{{ helper functionality def splay(queue, n, kernel_specific_max_wg_size=None): dev = queue.device max_work_items = _builtin_min(128, dev.max_work_group_size) if kernel_specific_max_wg_size is not None: from __builtin__ import min max_work_items = min(max_work_items, kernel_specific_max_wg_size) min_work_items = _builtin_min(32, max_work_items) max_groups = dev.max_compute_units * 4 * 8 # 4 to overfill the device # 8 is an Nvidia constant--that's how many # groups fit onto one compute device if n < min_work_items: group_count = 1 work_items_per_group = min_work_items elif n < (max_groups * min_work_items): group_count = (n + min_work_items - 1) // min_work_items work_items_per_group = min_work_items elif n < (max_groups * max_work_items): group_count = max_groups grp = (n + min_work_items - 1) // min_work_items work_items_per_group = ( (grp + max_groups - 1) // max_groups) * min_work_items else: group_count = max_groups work_items_per_group = max_work_items #print "n:%d gc:%d wipg:%d" % (n, group_count, work_items_per_group) return (group_count*work_items_per_group,), (work_items_per_group,) def elwise_kernel_runner(kernel_getter): """Take a kernel getter of the same signature as the kernel and return a function that invokes that kernel. Assumes that the zeroth entry in *args* is an :class:`Array`. 
""" def kernel_runner(*args, **kwargs): repr_ary = args[0] queue = kwargs.pop("queue", None) or repr_ary.queue wait_for = kwargs.pop("wait_for", None) # wait_for must be a copy, because we modify it in-place below if wait_for is None: wait_for = [] else: wait_for = list(wait_for) knl = kernel_getter(*args, **kwargs) gs, ls = repr_ary.get_sizes(queue, knl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) assert isinstance(repr_ary, Array) actual_args = [] for arg in args: if isinstance(arg, Array): if not arg.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") actual_args.append(arg.base_data) actual_args.append(arg.offset) wait_for.extend(arg.events) else: actual_args.append(arg) actual_args.append(repr_ary.size) return knl(queue, gs, ls, *actual_args, **dict(wait_for=wait_for)) try: from functools import update_wrapper except ImportError: return kernel_runner else: return update_wrapper(kernel_runner, kernel_getter) class DefaultAllocator(cl.tools.DeferredAllocator): def __init__(self, *args, **kwargs): from warnings import warn warn("pyopencl.array.DefaultAllocator is deprecated. " "It will be continue to exist throughout the 2013.x " "versions of PyOpenCL.", DeprecationWarning, 2) cl.tools.DeferredAllocator.__init__(self, *args, **kwargs) def _make_strides(itemsize, shape, order): if order in "fF": return _f_contiguous_strides(itemsize, shape) elif order in "cC": return _c_contiguous_strides(itemsize, shape) else: raise ValueError("invalid order: %s" % order) # }}} # {{{ array class class ArrayHasOffsetError(ValueError): """ .. versionadded:: 2013.1 """ def __init__(self, val="The operation you are attempting does not yet " "support arrays that start at an offset from the beginning " "of their buffer."): ValueError.__init__(self, val) class _copy_queue: pass class Array(object): """A :class:`numpy.ndarray` work-alike that stores its data and performs its computations on the compute device. *shape* and *dtype* work exactly as in :mod:`numpy`. Arithmetic methods in :class:`Array` support the broadcasting of scalars. (e.g. `array+5`) *cqa* must be a :class:`pyopencl.CommandQueue` or a :class:`pyopencl.Context`. If it is a queue, *cqa* specifies the queue in which the array carries out its computations by default. If a default queue (and thereby overloaded operators and many other niceties) are not desired, pass a :class:`Context`. *cqa* will at some point be renamed *cq*, so it should be considered 'positional-only'. Arguments starting from 'order' should be considered keyword-only. *allocator* may be `None` or a callable that, upon being called with an argument of the number of bytes to be allocated, returns an :class:`pyopencl.Buffer` object. (A :class:`pyopencl.tools.MemoryPool` instance is one useful example of an object to pass here.) .. versionchanged:: 2011.1 Renamed *context* to *cqa*, made it general-purpose. All arguments beyond *order* should be considered keyword-only. .. attribute :: data The :class:`pyopencl.MemoryObject` instance created for the memory that backs this :class:`Array`. .. versionchanged:: 2013.1 If a non-zero :attr:`offset` has been specified for this array, this will fail with :exc:`ArrayHasOffsetError`. .. attribute :: base_data The :class:`pyopencl.MemoryObject` instance created for the memory that backs this :class:`Array`. Unlike :attr:`data`, the base address of *base_data* is allowed to be different from the beginning of the array. 
The actual beginning is the base address of *base_data* plus :attr:`offset` in units of :attr:`dtype`. Unlike :attr:`data`, retrieving :attr:`base_data` always succeeds. .. versionadded:: 2013.1 .. attribute :: offset See :attr:`base_data`. .. versionadded:: 2013.1 .. attribute :: shape The tuple of lengths of each dimension in the array. .. attribute :: dtype The :class:`numpy.dtype` of the items in the GPU array. .. attribute :: size The number of meaningful entries in the array. Can also be computed by multiplying up the numbers in :attr:`shape`. .. attribute :: nbytes The size of the entire array in bytes. Computed as :attr:`size` times ``dtype.itemsize``. .. attribute :: strides Tuple of bytes to step in each dimension when traversing an array. .. attribute :: flags Return an object with attributes `c_contiguous`, `f_contiguous` and `forc`, which may be used to query contiguity properties in analogy to :attr:`numpy.ndarray.flags`. .. rubric:: Methods .. automethod :: with_queue .. automethod :: __len__ .. automethod :: reshape .. automethod :: ravel .. automethod :: view .. automethod :: set .. automethod :: get .. automethod :: copy .. automethod :: __str__ .. automethod :: __repr__ .. automethod :: mul_add .. automethod :: __add__ .. automethod :: __sub__ .. automethod :: __iadd__ .. automethod :: __isub__ .. automethod :: __neg__ .. automethod :: __mul__ .. automethod :: __div__ .. automethod :: __rdiv__ .. automethod :: __pow__ .. automethod :: __abs__ .. UNDOC reverse() .. automethod :: fill .. automethod :: astype .. autoattribute :: real .. autoattribute :: imag .. automethod :: conj .. automethod :: __getitem__ .. automethod :: __setitem__ .. automethod :: setitem .. automethod :: map_to_host .. rubric:: Comparisons, conditionals, any, all .. versionadded:: 2013.2 Boolean arrays are stored as :class:`numpy.int8` because ``bool`` has an unspecified size in the OpenCL spec. .. automethod :: __nonzero__ Only works for device scalars. (i.e. "arrays" with ``shape == ()``.) .. automethod :: any .. automethod :: all .. automethod :: __eq__ .. automethod :: __ne__ .. automethod :: __lt__ .. automethod :: __le__ .. automethod :: __gt__ .. automethod :: __ge__ """ __array_priority__ = 100 def __init__(self, cqa, shape, dtype, order="C", allocator=None, data=None, offset=0, queue=None, strides=None, events=None): # {{{ backward compatibility from warnings import warn if queue is not None: warn("Passing the queue to the array through anything but the " "first argument of the Array constructor is deprecated. " "This will be continue to be accepted throughout the " "2013.[0-6] versions of PyOpenCL.", DeprecationWarning, 2) if isinstance(cqa, cl.CommandQueue): if queue is not None: raise TypeError("can't specify queue in 'cqa' and " "'queue' arguments") queue = cqa elif isinstance(cqa, cl.Context): context = cqa if queue is not None: raise TypeError("may not pass a context and a queue " "(just pass the queue)") if allocator is not None: raise TypeError("may not pass a context and an allocator " "(just pass the queue)") else: # cqa is assumed to be an allocator warn("Passing an allocator for the 'cqa' parameter is deprecated. " "This usage will be continue to be accepted throughout " "the 2013.[0-6] versions of PyOpenCL.", DeprecationWarning, 2) if allocator is not None: raise TypeError("can't specify allocator in 'cqa' and " "'allocator' arguments") allocator = cqa # Queue-less arrays do have a purpose in life. 
# They don't do very much, but at least they don't run kernels # in random queues. # # See also :meth:`with_queue`. # }}} # invariant here: allocator, queue set # {{{ determine shape and strides dtype = np.dtype(dtype) try: s = 1 for dim in shape: s *= dim except TypeError: import sys if sys.version_info >= (3,): admissible_types = (int, np.integer) else: admissible_types = (int, long, np.integer) if not isinstance(shape, admissible_types): raise TypeError("shape must either be iterable or " "castable to an integer") s = shape shape = (shape,) if isinstance(s, np.integer): # bombs if s is a Python integer s = np.asscalar(s) if strides is None: strides = _make_strides(dtype.itemsize, shape, order) else: # FIXME: We should possibly perform some plausibility # checking on 'strides' here. strides = tuple(strides) # }}} if dtype == object: raise TypeError("object arrays on the compute device are not allowed") self.queue = queue self.shape = shape self.dtype = dtype self.strides = strides if events is None: self.events = [] else: self.events = events self.size = s alloc_nbytes = self.nbytes = self.dtype.itemsize * self.size self.allocator = allocator if data is None: if not alloc_nbytes: # Work around CL not allowing zero-sized buffers. alloc_nbytes = 1 if allocator is None: # FIXME remove me when queues become required if queue is not None: context = queue.context self.base_data = cl.Buffer( context, cl.mem_flags.READ_WRITE, alloc_nbytes) else: self.base_data = self.allocator(alloc_nbytes) else: self.base_data = data self.offset = offset @property def context(self): return self.base_data.context @property def data(self): if self.offset: raise ArrayHasOffsetError() else: return self.base_data @property @memoize_method def flags(self): return _ArrayFlags(self) def _new_with_changes(self, data, offset, shape=None, dtype=None, strides=None, queue=_copy_queue): """ :arg data: *None* means alocate a new array. """ if shape is None: shape = self.shape if dtype is None: dtype = self.dtype if strides is None: strides = self.strides if queue is _copy_queue: queue = self.queue if queue is not None: return Array(queue, shape, dtype, allocator=self.allocator, strides=strides, data=data, offset=offset, events=self.events) elif self.allocator is not None: return Array(self.allocator, shape, dtype, queue=queue, strides=strides, data=data, offset=offset, events=self.events) else: return Array(self.context, shape, dtype, strides=strides, data=data, offset=offset, events=self.events) def with_queue(self, queue): """Return a copy of *self* with the default queue set to *queue*. *None* is allowed as a value for *queue*. .. versionadded:: 2013.1 """ if queue is not None: assert queue.context == self.context return self._new_with_changes(self.base_data, self.offset, queue=queue) #@memoize_method FIXME: reenable def get_sizes(self, queue, kernel_specific_max_wg_size=None): if not self.flags.forc: raise NotImplementedError("cannot operate on non-contiguous array") return splay(queue, self.size, kernel_specific_max_wg_size=kernel_specific_max_wg_size) def set(self, ary, queue=None, async=False): """Transfer the contents the :class:`numpy.ndarray` object *ary* onto the device. *ary* must have the same dtype and size (not necessarily shape) as *self*. 
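
A short usage sketch, assuming ``queue`` is an existing
:class:`pyopencl.CommandQueue`::

    import numpy as np
    import pyopencl.array as cl_array

    host = np.arange(16, dtype=np.float32)
    dev = cl_array.empty(queue, host.shape, host.dtype)
    dev.set(host)                      # blocking host-to-device copy
    assert (dev.get() == host).all()   # round-trip back to the host
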
""" assert ary.size == self.size assert ary.dtype == self.dtype if not ary.flags.forc: raise RuntimeError("cannot set from non-contiguous array") ary = ary.copy() if ary.strides != self.strides: from warnings import warn warn("Setting array from one with different " "strides/storage order. This will cease to work " "in 2013.x.", stacklevel=2) if self.size: cl.enqueue_copy(queue or self.queue, self.base_data, ary, device_offset=self.offset, is_blocking=not async) def get(self, queue=None, ary=None, async=False): """Transfer the contents of *self* into *ary* or a newly allocated :mod:`numpy.ndarray`. If *ary* is given, it must have the right size (not necessarily shape) and dtype. """ if ary is None: ary = np.empty(self.shape, self.dtype) ary = _as_strided(ary, strides=self.strides) else: if ary.size != self.size: raise TypeError("'ary' has non-matching size") if ary.dtype != self.dtype: raise TypeError("'ary' has non-matching type") assert self.flags.forc, "Array in get() must be contiguous" if self.size: cl.enqueue_copy(queue or self.queue, ary, self.base_data, device_offset=self.offset, is_blocking=not async) return ary def copy(self, queue=None): """.. versionadded:: 2013.1""" queue = queue or self.queue result = self._new_like_me() cl.enqueue_copy(queue, result.base_data, self.base_data, src_offset=self.offset, byte_count=self.nbytes) return result def __str__(self): return str(self.get()) def __repr__(self): return repr(self.get()) def __hash__(self): raise TypeError("pyopencl arrays are not hashable.") # {{{ kernel invocation wrappers @staticmethod @elwise_kernel_runner def _axpbyz(out, afac, a, bfac, b, queue=None): """Compute ``out = selffac * self + otherfac*other``, where *other* is an array.""" assert out.shape == a.shape assert out.shape == b.shape return elementwise.get_axpbyz_kernel( out.context, a.dtype, b.dtype, out.dtype) @staticmethod @elwise_kernel_runner def _axpbz(out, a, x, b, queue=None): """Compute ``z = a * x + b``, where *b* is a scalar.""" a = np.array(a) b = np.array(b) assert out.shape == x.shape return elementwise.get_axpbz_kernel(out.context, a.dtype, x.dtype, b.dtype, out.dtype) @staticmethod @elwise_kernel_runner def _elwise_multiply(out, a, b, queue=None): assert out.shape == a.shape assert out.shape == b.shape return elementwise.get_multiply_kernel( a.context, a.dtype, b.dtype, out.dtype) @staticmethod @elwise_kernel_runner def _rdiv_scalar(out, ary, other, queue=None): other = np.array(other) assert out.shape == ary.shape return elementwise.get_rdivide_elwise_kernel( out.context, ary.dtype, other.dtype, out.dtype) @staticmethod @elwise_kernel_runner def _div(out, self, other, queue=None): """Divides an array by another array.""" assert self.shape == other.shape return elementwise.get_divide_kernel(self.context, self.dtype, other.dtype, out.dtype) @staticmethod @elwise_kernel_runner def _fill(result, scalar): return elementwise.get_fill_kernel(result.context, result.dtype) @staticmethod @elwise_kernel_runner def _abs(result, arg): if arg.dtype.kind == "c": from pyopencl.elementwise import complex_dtype_to_name fname = "%s_abs" % complex_dtype_to_name(arg.dtype) elif arg.dtype.kind == "f": fname = "fabs" elif arg.dtype.kind in ["u", "i"]: fname = "abs" else: raise TypeError("unsupported dtype in _abs()") return elementwise.get_unary_func_kernel( arg.context, fname, arg.dtype, out_dtype=result.dtype) @staticmethod @elwise_kernel_runner def _real(result, arg): from pyopencl.elementwise import complex_dtype_to_name fname = "%s_real" % 
complex_dtype_to_name(arg.dtype) return elementwise.get_unary_func_kernel( arg.context, fname, arg.dtype, out_dtype=result.dtype) @staticmethod @elwise_kernel_runner def _imag(result, arg): from pyopencl.elementwise import complex_dtype_to_name fname = "%s_imag" % complex_dtype_to_name(arg.dtype) return elementwise.get_unary_func_kernel( arg.context, fname, arg.dtype, out_dtype=result.dtype) @staticmethod @elwise_kernel_runner def _conj(result, arg): from pyopencl.elementwise import complex_dtype_to_name fname = "%s_conj" % complex_dtype_to_name(arg.dtype) return elementwise.get_unary_func_kernel( arg.context, fname, arg.dtype, out_dtype=result.dtype) @staticmethod @elwise_kernel_runner def _pow_scalar(result, ary, exponent): exponent = np.array(exponent) return elementwise.get_pow_kernel(result.context, ary.dtype, exponent.dtype, result.dtype, is_base_array=True, is_exp_array=False) @staticmethod @elwise_kernel_runner def _rpow_scalar(result, base, exponent): base = np.array(base) return elementwise.get_pow_kernel(result.context, base.dtype, exponent.dtype, result.dtype, is_base_array=False, is_exp_array=True) @staticmethod @elwise_kernel_runner def _pow_array(result, base, exponent): return elementwise.get_pow_kernel( result.context, base.dtype, exponent.dtype, result.dtype, is_base_array=True, is_exp_array=True) @staticmethod @elwise_kernel_runner def _reverse(result, ary): return elementwise.get_reverse_kernel(result.context, ary.dtype) @staticmethod @elwise_kernel_runner def _copy(dest, src): return elementwise.get_copy_kernel( dest.context, dest.dtype, src.dtype) def _new_like_me(self, dtype=None, queue=None): strides = None if dtype is None: dtype = self.dtype else: if dtype == self.dtype: strides = self.strides queue = queue or self.queue if queue is not None: return self.__class__(queue, self.shape, dtype, allocator=self.allocator, strides=strides) elif self.allocator is not None: return self.__class__(self.allocator, self.shape, dtype, strides=strides) else: return self.__class__(self.context, self.shape, dtype, strides=strides) # }}} # {{{ operators def mul_add(self, selffac, other, otherfac, queue=None): """Return `selffac * self + otherfac*other`. 
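
For example, assuming ``x`` and ``y`` are :class:`Array` instances of
matching shape::

    z = x.mul_add(2, y, 3)   # z = 2*x + 3*y
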
""" result = self._new_like_me( _get_common_dtype(self, other, queue or self.queue)) self._axpbyz(result, selffac, self, otherfac, other) return result def __add__(self, other): """Add an array with an array or an array with a scalar.""" if isinstance(other, Array): # add another vector result = self._new_like_me( _get_common_dtype(self, other, self.queue)) self._axpbyz(result, self.dtype.type(1), self, other.dtype.type(1), other) return result else: # add a scalar if other == 0: return self else: common_dtype = _get_common_dtype(self, other, self.queue) result = self._new_like_me(common_dtype) self._axpbz(result, self.dtype.type(1), self, common_dtype.type(other)) return result __radd__ = __add__ def __sub__(self, other): """Substract an array from an array or a scalar from an array.""" if isinstance(other, Array): result = self._new_like_me( _get_common_dtype(self, other, self.queue)) self._axpbyz(result, self.dtype.type(1), self, other.dtype.type(-1), other) return result else: # subtract a scalar if other == 0: return self else: result = self._new_like_me( _get_common_dtype(self, other, self.queue)) self._axpbz(result, self.dtype.type(1), self, -other) return result def __rsub__(self, other): """Substracts an array by a scalar or an array:: x = n - self """ common_dtype = _get_common_dtype(self, other, self.queue) # other must be a scalar result = self._new_like_me(common_dtype) self._axpbz(result, self.dtype.type(-1), self, common_dtype.type(other)) return result def __iadd__(self, other): if isinstance(other, Array): self._axpbyz(self, self.dtype.type(1), self, other.dtype.type(1), other) return self else: self._axpbz(self, self.dtype.type(1), self, other) return self def __isub__(self, other): if isinstance(other, Array): self._axpbyz(self, self.dtype.type(1), self, other.dtype.type(-1), other) return self else: self._axpbz(self, self.dtype.type(1), self, -other) return self def __neg__(self): result = self._new_like_me() self._axpbz(result, -1, self, 0) return result def __mul__(self, other): if isinstance(other, Array): result = self._new_like_me( _get_common_dtype(self, other, self.queue)) self._elwise_multiply(result, self, other) return result else: common_dtype = _get_common_dtype(self, other, self.queue) result = self._new_like_me(common_dtype) self._axpbz(result, common_dtype.type(other), self, self.dtype.type(0)) return result def __rmul__(self, scalar): common_dtype = _get_common_dtype(self, scalar, self.queue) result = self._new_like_me(common_dtype) self._axpbz(result, common_dtype.type(scalar), self, self.dtype.type(0)) return result def __imul__(self, other): if isinstance(other, Array): self._elwise_multiply(self, self, other) else: # scalar self._axpbz(self, other, self, self.dtype.type(0)) return self def __div__(self, other): """Divides an array by an array or a scalar, i.e. ``self / other``. """ if isinstance(other, Array): result = self._new_like_me( _get_common_dtype(self, other, self.queue)) self._div(result, self, other) else: if other == 1: return self else: # create a new array for the result common_dtype = _get_common_dtype(self, other, self.queue) result = self._new_like_me(common_dtype) self._axpbz(result, common_dtype.type(1/other), self, self.dtype.type(0)) return result __truediv__ = __div__ def __rdiv__(self, other): """Divides an array by a scalar or an array, i.e. ``other / self``. 
""" if isinstance(other, Array): result = self._new_like_me( _get_common_dtype(self, other, self.queue)) other._div(result, self) else: # create a new array for the result common_dtype = _get_common_dtype(self, other, self.queue) result = self._new_like_me(common_dtype) self._rdiv_scalar(result, self, common_dtype.type(other)) return result __rtruediv__ = __rdiv__ def fill(self, value, queue=None, wait_for=None): """Fill the array with *scalar*. :returns: *self*. """ self.events.append( self._fill(self, value, queue=queue, wait_for=wait_for)) return self def __len__(self): """Returns the size of the leading dimension of *self*.""" if len(self.shape): return self.shape[0] else: return TypeError("scalar has no len()") def __abs__(self): """Return a `Array` of the absolute values of the elements of *self*. """ result = self._new_like_me(self.dtype.type(0).real.dtype) self._abs(result, self) return result def __pow__(self, other): """Exponentiation by a scalar or elementwise by another :class:`Array`. """ if isinstance(other, Array): assert self.shape == other.shape result = self._new_like_me( _get_common_dtype(self, other, self.queue)) self._pow_array(result, self, other) else: result = self._new_like_me( _get_common_dtype(self, other, self.queue)) self._pow_scalar(result, self, other) return result def __rpow__(self, other): # other must be a scalar common_dtype = _get_common_dtype(self, other, self.queue) result = self._new_like_me(common_dtype) self._rpow_scalar(result, common_dtype.type(other), self) return result # }}} def reverse(self, queue=None): """Return this array in reversed order. The array is treated as one-dimensional. """ result = self._new_like_me() self._reverse(result, self) return result def astype(self, dtype, queue=None): """Return *self*, cast to *dtype*.""" if dtype == self.dtype: return self result = self._new_like_me(dtype=dtype) self._copy(result, self, queue=queue) return result # {{{ rich comparisons, any, all def __nonzero__(self): if self.shape == (): return bool(self.get()) else: raise ValueError("The truth value of an array with " "more than one element is ambiguous. 
Use a.any() or a.all()") def any(self, queue=None, wait_for=None): from pyopencl.reduction import get_any_kernel krnl = get_any_kernel(self.context, self.dtype) return krnl(self, queue=queue, wait_for=wait_for) def all(self, queue=None, wait_for=None): from pyopencl.reduction import get_all_kernel krnl = get_all_kernel(self.context, self.dtype) return krnl(self, queue=queue, wait_for=wait_for) @staticmethod @elwise_kernel_runner def _scalar_comparison(out, a, b, queue=None, op=None): return elementwise.get_array_scalar_comparison_kernel( out.context, op, a.dtype) @staticmethod @elwise_kernel_runner def _array_comparison(out, a, b, queue=None, op=None): if a.shape != b.shape: raise ValueError("shapes of comparison arguments do not match") return elementwise.get_array_comparison_kernel( out.context, op, a.dtype, b.dtype) def __eq__(self, other): if isinstance(other, Array): result = self._new_like_me(np.int8) self._array_comparison(result, self, other, op="==") return result else: result = self._new_like_me(np.int8) self._scalar_comparison(result, self, other, op="==") return result def __ne__(self, other): if isinstance(other, Array): result = self._new_like_me(np.int8) self._array_comparison(result, self, other, op="!=") return result else: result = self._new_like_me(np.int8) self._scalar_comparison(result, self, other, op="!=") return result def __le__(self, other): if isinstance(other, Array): result = self._new_like_me(np.int8) self._array_comparison(result, self, other, op="<=") return result else: result = self._new_like_me(np.int8) self._scalar_comparison(result, self, other, op="<=") return result def __ge__(self, other): if isinstance(other, Array): result = self._new_like_me(np.int8) self._array_comparison(result, self, other, op=">=") return result else: result = self._new_like_me(np.int8) self._scalar_comparison(result, self, other, op=">=") return result def __lt__(self, other): if isinstance(other, Array): result = self._new_like_me(np.int8) self._array_comparison(result, self, other, op="<") return result else: result = self._new_like_me(np.int8) self._scalar_comparison(result, self, other, op="<") return result def __gt__(self, other): if isinstance(other, Array): result = self._new_like_me(np.int8) self._array_comparison(result, self, other, op=">") return result else: result = self._new_like_me(np.int8) self._scalar_comparison(result, self, other, op=">") return result # }}} # {{{ complex-valued business def real(self): if self.dtype.kind == "c": result = self._new_like_me(self.dtype.type(0).real.dtype) self._real(result, self) return result else: return self real = property(real, doc=".. versionadded:: 2012.1") def imag(self): if self.dtype.kind == "c": result = self._new_like_me(self.dtype.type(0).real.dtype) self._imag(result, self) return result else: return zeros_like(self) imag = property(imag, doc=".. versionadded:: 2012.1") def conj(self): """.. 
versionadded:: 2012.1""" if self.dtype.kind == "c": result = self._new_like_me() self._conj(result, self) return result else: return self # }}} # {{{ views def reshape(self, *shape, **kwargs): """Returns an array containing the same data with a new shape.""" order = kwargs.pop("order", "C") if kwargs: raise TypeError("unexpected keyword arguments: %s" % kwargs.keys()) # TODO: add more error-checking, perhaps if isinstance(shape[0], tuple) or isinstance(shape[0], list): shape = tuple(shape[0]) size = reduce(lambda x, y: x * y, shape, 1) if size != self.size: raise ValueError("total size of new array must be unchanged") return self._new_with_changes( data=self.base_data, offset=self.offset, shape=shape, strides=_make_strides(self.dtype.itemsize, shape, order)) def ravel(self): """Returns flattened array containing the same data.""" return self.reshape(self.size) def view(self, dtype=None): """Returns view of array with the same data. If *dtype* is different from current dtype, the actual bytes of memory will be reinterpreted. """ if dtype is None: dtype = self.dtype old_itemsize = self.dtype.itemsize itemsize = np.dtype(dtype).itemsize from pytools import argmin2 min_stride_axis = argmin2( (axis, abs(stride)) for axis, stride in enumerate(self.strides)) if self.shape[min_stride_axis] * old_itemsize % itemsize != 0: raise ValueError("new type not compatible with array") new_shape = ( self.shape[:min_stride_axis] + (self.shape[min_stride_axis] * old_itemsize // itemsize,) + self.shape[min_stride_axis+1:]) new_strides = ( self.strides[:min_stride_axis] + (self.strides[min_stride_axis] * itemsize // old_itemsize,) + self.strides[min_stride_axis+1:]) return self._new_with_changes( self.base_data, self.offset, shape=new_shape, dtype=dtype, strides=new_strides) # }}} def finish(self): # undoc if self.events: cl.wait_for_events(self.events) del self.events[:] def map_to_host(self, queue=None, flags=None, is_blocking=True, wait_for=None): """If *is_blocking*, return a :class:`numpy.ndarray` corresponding to the same memory as *self*. If *is_blocking* is not true, return a tuple ``(ary, evt)``, where *ary* is the above-mentioned array. The host array is obtained using :func:`pyopencl.enqueue_map_buffer`. See there for further details. :arg flags: A combination of :class:`pyopencl.map_flags`. Defaults to read-write. .. versionadded :: 2013.2 """ if flags is None: flags = cl.map_flags.READ | cl.map_flags.WRITE ary, evt = cl.enqueue_map_buffer( queue or self.queue, self.base_data, flags, self.offset, self.shape, self.dtype, strides=self.strides, wait_for=wait_for, is_blocking=is_blocking) if is_blocking: return ary else: return ary, evt # {{{ getitem/setitem def __getitem__(self, index): """ .. 
versionadded:: 2013.1 """ if isinstance(index, Array): if index.dtype.kind != "i": raise TypeError( "fancy indexing is only allowed with integers") if len(index.shape) != 1: raise NotImplementedError( "multidimensional fancy indexing is not supported") if len(self.shape) != 1: raise NotImplementedError( "fancy indexing into a multi-d array is not supported") return take(self, index) if not isinstance(index, tuple): index = (index,) new_shape = [] new_offset = self.offset new_strides = [] seen_ellipsis = False index_axis = 0 array_axis = 0 while index_axis < len(index): index_entry = index[index_axis] if array_axis > len(self.shape): raise IndexError("too many axes in index") if isinstance(index_entry, slice): start, stop, idx_stride = index_entry.indices( self.shape[array_axis]) array_stride = self.strides[array_axis] new_shape.append((stop-start)//idx_stride) new_strides.append(idx_stride*array_stride) new_offset += array_stride*start index_axis += 1 array_axis += 1 elif isinstance(index_entry, (int, np.integer)): array_shape = self.shape[array_axis] if index_entry < 0: index_entry += array_shape if not (0 <= index_entry < array_shape): raise IndexError( "subindex in axis %d out of range" % index_axis) new_offset += self.strides[array_axis]*index_entry index_axis += 1 array_axis += 1 elif index_entry is Ellipsis: index_axis += 1 remaining_index_count = len(index) - index_axis new_array_axis = len(self.shape) - remaining_index_count if new_array_axis < array_axis: raise IndexError("invalid use of ellipsis in index") while array_axis < new_array_axis: new_shape.append(self.shape[array_axis]) new_strides.append(self.strides[array_axis]) array_axis += 1 if seen_ellipsis: raise IndexError( "more than one ellipsis not allowed in index") seen_ellipsis = True else: raise IndexError("invalid subindex in axis %d" % index_axis) while array_axis < len(self.shape): new_shape.append(self.shape[array_axis]) new_strides.append(self.strides[array_axis]) array_axis += 1 return self._new_with_changes( self.base_data, offset=new_offset, shape=tuple(new_shape), strides=tuple(new_strides)) def setitem(self, subscript, value, queue=None): """Like :meth:`__setitem__`, but with the ability to specify a *queue* for execution. .. 
versionadded:: 2013.1 """ if isinstance(subscript, Array): if subscript.dtype.kind != "i": raise TypeError( "fancy indexing is only allowed with integers") if len(subscript.shape) != 1: raise NotImplementedError( "multidimensional fancy indexing is not supported") if len(self.shape) != 1: raise NotImplementedError( "fancy indexing into a multi-d array is supported") multi_put([value], subscript, out=[self], queue=self.queue) return queue = queue or self.queue or value.queue subarray = self[subscript] if isinstance(value, np.ndarray): if subarray.shape == value.shape and subarray.strides == value.strides: self.events.append( cl.enqueue_copy(queue, subarray.base_data, value, device_offset=subarray.offset)) return else: value = to_device(queue, value, self.allocator) if isinstance(value, Array): if len(subarray.shape) != len(value.shape): raise NotImplementedError("broadcasting is not " "supported in __setitem__") if subarray.shape != value.shape: raise ValueError("cannot assign between arrays of " "differing shapes") if subarray.strides != value.strides: raise ValueError("cannot assign between arrays of " "differing strides") self._copy(subarray, value, queue=queue) else: # Let's assume it's a scalar subarray.fill(value, queue=queue) def __setitem__(self, subscript, value): """Set the slice of *self* identified *subscript* to *value*. *value* is allowed to be: * A :class:`Array` of the same :attr:`shape` and (for now) :attr:`strides`, but with potentially different :attr:`dtype`. * A :class:`numpy.ndarray` of the same :attr:`shape` and (for now) :attr:`strides`, but with potentially different :attr:`dtype`. * A scalar. Non-scalar broadcasting is not currently supported. .. versionadded:: 2013.1 """ self.setitem(subscript, value) # }}} # }}} def as_strided(ary, shape=None, strides=None): """Make an :class:`Array` from the given array with the given shape and strides. """ # undocumented for the moment shape = shape or ary.shape strides = strides or ary.strides return Array(ary.queue, shape, ary.dtype, allocator=ary.allocator, data=ary.data, strides=strides) # }}} # {{{ creation helpers def to_device(queue, ary, allocator=None, async=False): """Return a :class:`Array` that is an exact copy of the :class:`numpy.ndarray` instance *ary*. See :class:`Array` for the meaning of *allocator*. .. versionchanged:: 2011.1 *context* argument was deprecated. """ if ary.dtype == object: raise RuntimeError("to_device does not work on object arrays.") result = Array(queue, ary.shape, ary.dtype, allocator=allocator, strides=ary.strides) result.set(ary, async=async) return result empty = Array def zeros(queue, shape, dtype, order="C", allocator=None): """Same as :func:`empty`, but the :class:`Array` is zero-initialized before being returned. .. versionchanged:: 2011.1 *context* argument was deprecated. """ result = Array(queue, shape, dtype, order=order, allocator=allocator) zero = np.zeros((), dtype) result.fill(zero) return result def empty_like(ary): """Make a new, uninitialized :class:`Array` having the same properties as *other_ary*. """ return ary._new_with_changes(data=None, offset=0) def zeros_like(ary): """Make a new, zero-initialized :class:`Array` having the same properties as *other_ary*. 
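
For example, assuming ``ary`` is an existing :class:`Array`::

    z = zeros_like(ary)   # same shape, dtype and queue as ary, zero-filled
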
""" result = empty_like(ary) zero = np.zeros((), ary.dtype) result.fill(zero) return result @elwise_kernel_runner def _arange_knl(result, start, step): return elementwise.get_arange_kernel( result.context, result.dtype) def arange(queue, *args, **kwargs): """Create a :class:`Array` filled with numbers spaced `step` apart, starting from `start` and ending at `stop`. For floating point arguments, the length of the result is `ceil((stop - start)/step)`. This rule may result in the last element of the result being greater than `stop`. *dtype*, if not specified, is taken as the largest common type of *start*, *stop* and *step*. .. versionchanged:: 2011.1 *context* argument was deprecated. .. versionchanged:: 2011.2 *allocator* keyword argument was added. """ # argument processing ----------------------------------------------------- # Yuck. Thanks, numpy developers. ;) from pytools import Record class Info(Record): pass explicit_dtype = False inf = Info() inf.start = None inf.stop = None inf.step = None inf.dtype = None inf.allocator = None inf.wait_for = [] if isinstance(args[-1], np.dtype): inf.dtype = args[-1] args = args[:-1] explicit_dtype = True argc = len(args) if argc == 0: raise ValueError("stop argument required") elif argc == 1: inf.stop = args[0] elif argc == 2: inf.start = args[0] inf.stop = args[1] elif argc == 3: inf.start = args[0] inf.stop = args[1] inf.step = args[2] else: raise ValueError("too many arguments") admissible_names = ["start", "stop", "step", "dtype", "allocator"] for k, v in kwargs.iteritems(): if k in admissible_names: if getattr(inf, k) is None: setattr(inf, k, v) if k == "dtype": explicit_dtype = True else: raise ValueError( "may not specify '%s' by position and keyword" % k) else: raise ValueError("unexpected keyword argument '%s'" % k) if inf.start is None: inf.start = 0 if inf.step is None: inf.step = 1 if inf.dtype is None: inf.dtype = np.array([inf.start, inf.stop, inf.step]).dtype # actual functionality ---------------------------------------------------- dtype = np.dtype(inf.dtype) start = dtype.type(inf.start) step = dtype.type(inf.step) stop = dtype.type(inf.stop) wait_for = inf.wait_for if not explicit_dtype: raise TypeError("arange requires a dtype argument") from math import ceil size = int(ceil((stop-start)/step)) result = Array(queue, (size,), dtype, allocator=inf.allocator) result.events.append( _arange_knl(result, start, step, queue=queue, wait_for=wait_for)) return result # }}} # {{{ take/put/concatenate/diff @elwise_kernel_runner def _take(result, ary, indices): return elementwise.get_take_kernel( result.context, result.dtype, indices.dtype) def take(a, indices, out=None, queue=None, wait_for=None): """Return the :class:`Array` ``[a[indices[0]], ..., a[indices[n]]]``. For the moment, *a* must be a type that can be bound to a texture. 
""" queue = queue or a.queue if out is None: out = Array(queue, indices.shape, a.dtype, allocator=a.allocator) assert len(indices.shape) == 1 out.events.append( _take(out, a, indices, queue=queue, wait_for=wait_for)) return out def multi_take(arrays, indices, out=None, queue=None): if not len(arrays): return [] assert len(indices.shape) == 1 from pytools import single_valued a_dtype = single_valued(a.dtype for a in arrays) a_allocator = arrays[0].dtype context = indices.context queue = queue or indices.queue vec_count = len(arrays) if out is None: out = [Array(context, queue, indices.shape, a_dtype, allocator=a_allocator) for i in range(vec_count)] else: if len(out) != len(arrays): raise ValueError("out and arrays must have the same length") chunk_size = _builtin_min(vec_count, 10) def make_func_for_chunk_size(chunk_size): knl = elementwise.get_take_kernel( indices.context, a_dtype, indices.dtype, vec_count=chunk_size) knl.set_block_shape(*indices._block) return knl knl = make_func_for_chunk_size(chunk_size) for start_i in range(0, len(arrays), chunk_size): chunk_slice = slice(start_i, start_i+chunk_size) if start_i + chunk_size > vec_count: knl = make_func_for_chunk_size(vec_count-start_i) gs, ls = indices.get_sizes(queue, knl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) knl(queue, gs, ls, indices.data, *([o.data for o in out[chunk_slice]] + [i.data for i in arrays[chunk_slice]] + [indices.size])) return out def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None, out=None, queue=None, src_offsets=None): if not len(arrays): return [] from pytools import single_valued a_dtype = single_valued(a.dtype for a in arrays) a_allocator = arrays[0].allocator context = src_indices.context queue = queue or src_indices.queue vec_count = len(arrays) if out is None: out = [Array(queue, dest_shape, a_dtype, allocator=a_allocator) for i in range(vec_count)] else: if a_dtype != single_valued(o.dtype for o in out): raise TypeError("arrays and out must have the same dtype") if len(out) != vec_count: raise ValueError("out and arrays must have the same length") if src_indices.dtype != dest_indices.dtype: raise TypeError( "src_indices and dest_indices must have the same dtype") if len(src_indices.shape) != 1: raise ValueError("src_indices must be 1D") if src_indices.shape != dest_indices.shape: raise ValueError( "src_indices and dest_indices must have the same shape") if src_offsets is None: src_offsets_list = [] else: src_offsets_list = src_offsets if len(src_offsets) != vec_count: raise ValueError( "src_indices and src_offsets must have the same length") max_chunk_size = 10 chunk_size = _builtin_min(vec_count, max_chunk_size) def make_func_for_chunk_size(chunk_size): return elementwise.get_take_put_kernel(context, a_dtype, src_indices.dtype, with_offsets=src_offsets is not None, vec_count=chunk_size) knl = make_func_for_chunk_size(chunk_size) for start_i in range(0, len(arrays), chunk_size): chunk_slice = slice(start_i, start_i+chunk_size) if start_i + chunk_size > vec_count: knl = make_func_for_chunk_size(vec_count-start_i) gs, ls = src_indices.get_sizes(queue, knl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) from pytools import flatten knl(queue, gs, ls, *([o.data for o in out[chunk_slice]] + [dest_indices.base_data, dest_indices.offset, src_indices.base_data, src_indices.offset] + list(flatten( (i.base_data, i.offset) for i in arrays[chunk_slice])) + src_offsets_list[chunk_slice] + [src_indices.size])) return out def 
multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None): if not len(arrays): return [] from pytools import single_valued a_dtype = single_valued(a.dtype for a in arrays) a_allocator = arrays[0].allocator context = dest_indices.context queue = queue or dest_indices.queue vec_count = len(arrays) if out is None: out = [Array(queue, dest_shape, a_dtype, allocator=a_allocator, queue=queue) for i in range(vec_count)] else: if a_dtype != single_valued(o.dtype for o in out): raise TypeError("arrays and out must have the same dtype") if len(out) != vec_count: raise ValueError("out and arrays must have the same length") if len(dest_indices.shape) != 1: raise ValueError("dest_indices must be 1D") chunk_size = _builtin_min(vec_count, 10) def make_func_for_chunk_size(chunk_size): knl = elementwise.get_put_kernel( context, a_dtype, dest_indices.dtype, vec_count=chunk_size) return knl knl = make_func_for_chunk_size(chunk_size) for start_i in range(0, len(arrays), chunk_size): chunk_slice = slice(start_i, start_i+chunk_size) if start_i + chunk_size > vec_count: knl = make_func_for_chunk_size(vec_count-start_i) gs, ls = dest_indices.get_sizes(queue, knl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) from pytools import flatten knl(queue, gs, ls, *( list(flatten( (o.base_data, o.offset) for o in out[chunk_slice])) + [dest_indices.base_data, dest_indices.offset] + list(flatten( (i.base_data, i.offset) for i in arrays[chunk_slice])) + [dest_indices.size])) return out def concatenate(arrays, axis=0, queue=None, allocator=None): """ .. versionadded:: 2013.1 """ # {{{ find properties of result array shape = None for i_ary, ary in enumerate(arrays): queue = queue or ary.queue allocator = allocator or ary.allocator if shape is None: # first array shape = list(ary.shape) else: if len(ary.shape) != len(shape): raise ValueError("%d'th array has different number of axes " "(shold have %d, has %d)" % (i_ary, len(ary.shape), len(shape))) ary_shape_list = list(ary.shape) if (ary_shape_list[:axis] != shape[:axis] or ary_shape_list[axis+1:] != shape[axis+1:]): raise ValueError("%d'th array has residual not matching " "other arrays" % i_ary) shape[axis] += ary.shape[axis] # }}} shape = tuple(shape) dtype = np.find_common_type([ary.dtype for ary in arrays], []) result = empty(queue, shape, dtype, allocator=allocator) full_slice = (slice(None),) * len(shape) base_idx = 0 for ary in arrays: my_len = ary.shape[axis] result.setitem( full_slice[:axis] + (slice(base_idx, base_idx+my_len),) + full_slice[axis+1:], ary) base_idx += my_len return result @elwise_kernel_runner def _diff(result, array): return elementwise.get_diff_kernel(array.context, array.dtype) def diff(array, queue=None, allocator=None): """ .. versionadded:: 2013.2 """ if len(array.shape) != 1: raise ValueError("multi-D arrays are not supported") n, = array.shape queue = queue or array.queue allocator = allocator or array.allocator result = empty(queue, (n-1,), array.dtype, allocator=allocator) _diff(result, array, queue=queue) return result # }}} # {{{ conditionals @elwise_kernel_runner def _if_positive(result, criterion, then_, else_): return elementwise.get_if_positive_kernel( result.context, criterion.dtype, then_.dtype) def if_positive(criterion, then_, else_, out=None, queue=None): """Return an array like *then_*, which, for the element at index *i*, contains *then_[i]* if *criterion[i]>0*, else *else_[i]*. 
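
For example, assuming ``cond``, ``a`` and ``b`` are :class:`Array` objects of
identical shape with matching dtypes::

    out = if_positive(cond, a, b)   # out[i] = a[i] if cond[i] > 0 else b[i]
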
""" if not (criterion.shape == then_.shape == else_.shape): raise ValueError("shapes do not match") if not (then_.dtype == else_.dtype): raise ValueError("dtypes do not match") if out is None: out = empty_like(then_) _if_positive(out, criterion, then_, else_, queue=queue) return out def maximum(a, b, out=None, queue=None): """Return the elementwise maximum of *a* and *b*.""" # silly, but functional return if_positive(a.mul_add(1, b, -1, queue=queue), a, b, queue=queue, out=out) def minimum(a, b, out=None, queue=None): """Return the elementwise minimum of *a* and *b*.""" # silly, but functional return if_positive(a.mul_add(1, b, -1, queue=queue), b, a, queue=queue, out=out) # }}} # {{{ reductions _builtin_sum = sum _builtin_min = min _builtin_max = max def sum(a, dtype=None, queue=None): """ .. versionadded:: 2011.1 """ from pyopencl.reduction import get_sum_kernel krnl = get_sum_kernel(a.context, dtype, a.dtype) return krnl(a, queue=queue) def dot(a, b, dtype=None, queue=None): """ .. versionadded:: 2011.1 """ from pyopencl.reduction import get_dot_kernel krnl = get_dot_kernel(a.context, dtype, a.dtype, b.dtype) return krnl(a, b, queue=queue) def vdot(a, b, dtype=None, queue=None): """Like :func:`numpy.vdot`. .. versionadded:: 2013.1 """ from pyopencl.reduction import get_dot_kernel krnl = get_dot_kernel(a.context, dtype, a.dtype, b.dtype, conjugate_first=True) return krnl(a, b, queue=queue) def subset_dot(subset, a, b, dtype=None, queue=None): """ .. versionadded:: 2011.1 """ from pyopencl.reduction import get_subset_dot_kernel krnl = get_subset_dot_kernel( a.context, dtype, subset.dtype, a.dtype, b.dtype) return krnl(subset, a, b, queue=queue) def _make_minmax_kernel(what): def f(a, queue=None): from pyopencl.reduction import get_minmax_kernel krnl = get_minmax_kernel(a.context, what, a.dtype) return krnl(a, queue=queue) return f min = _make_minmax_kernel("min") min.__doc__ = """ .. versionadded:: 2011.1 """ max = _make_minmax_kernel("max") max.__doc__ = """ .. versionadded:: 2011.1 """ def _make_subset_minmax_kernel(what): def f(subset, a, queue=None): from pyopencl.reduction import get_subset_minmax_kernel krnl = get_subset_minmax_kernel(a.context, what, a.dtype, subset.dtype) return krnl(subset, a, queue=queue) return f subset_min = _make_subset_minmax_kernel("min") subset_min.__doc__ = """.. versionadded:: 2011.1""" subset_max = _make_subset_minmax_kernel("max") subset_max.__doc__ = """.. versionadded:: 2011.1""" # }}} # {{{ scans def cumsum(a, output_dtype=None, queue=None, wait_for=None, return_event=False): # undocumented for now """ .. 
versionadded:: 2013.1 """ if output_dtype is None: output_dtype = a.dtype result = a._new_like_me(output_dtype) from pyopencl.scan import get_cumsum_kernel krnl = get_cumsum_kernel(a.context, a.dtype, output_dtype) evt = krnl(a, result, queue=queue, wait_for=wait_for) if return_event: return evt, result else: return result # }}} # vim: foldmethod=marker pyopencl-2013.2/pyopencl/__init__.py0000644000175000000500000011564312245716342016124 0ustar tomussrc# -*- coding: utf-8 -*- __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ from pyopencl.version import VERSION, VERSION_STATUS, VERSION_TEXT # noqa try: import pyopencl._cl as _cl except ImportError: import os from os.path import dirname, join, realpath if realpath(join(os.getcwd(), "pyopencl")) == realpath(dirname(__file__)): from warnings import warn warn("It looks like you are importing PyOpenCL from " "its source directory. This likely won't work.") raise import numpy as np from pyopencl._cl import * # noqa import inspect as _inspect CONSTANT_CLASSES = [ getattr(_cl, name) for name in dir(_cl) if _inspect.isclass(getattr(_cl, name)) and name[0].islower()] class CompilerWarning(UserWarning): pass def compiler_output(text): import os from warnings import warn if int(os.environ.get("PYOPENCL_COMPILER_OUTPUT", "0")): warn(text, CompilerWarning) else: warn("Non-empty compiler output encountered. Set the " "environment variable PYOPENCL_COMPILER_OUTPUT=1 " "to see more.", CompilerWarning) # {{{ Program (including caching support) class Program(object): def __init__(self, arg1, arg2=None, arg3=None): if arg2 is None: # 1-argument form: program self._prg = arg1 elif arg3 is None: # 2-argument form: context, source context, source = arg1, arg2 import sys if isinstance(source, unicode) and sys.version_info < (3,): from warnings import warn warn("Received OpenCL source code in Unicode, " "should be ASCII string. Attempting conversion.", stacklevel=2) source = str(source) self._context = context self._source = source self._prg = None else: # 3-argument form: context, devices, binaries self._prg = _cl._Program(arg1, arg2, arg3) def _get_prg(self): if self._prg is not None: return self._prg else: # "no program" can only happen in from-source case. 
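            # In that case the underlying _cl._Program is created lazily so
            # that build() can first consult the source-level kernel cache.
            # Materializing it here (because some attribute was read before
            # build()) forces the uncached path, hence the warning below.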
from warnings import warn warn("Pre-build attribute access defeats compiler caching.", stacklevel=3) self._prg = _cl._Program(self._context, self._source) del self._context return self._prg def get_info(self, arg): return self._get_prg().get_info(arg) def get_build_info(self, *args, **kwargs): return self._get_prg().get_build_info(*args, **kwargs) def all_kernels(self): return self._get_prg().all_kernels() def int_ptr(self): return self._get_prg().int_ptr int_ptr = property(int_ptr, doc=_cl._Program.int_ptr.__doc__) def from_int_ptr(int_ptr_value): return Program(_cl._Program.from_int_ptr(int_ptr_value)) from_int_ptr.__doc__ = _cl._Program.from_int_ptr.__doc__ from_int_ptr = staticmethod(from_int_ptr) def __getattr__(self, attr): try: knl = Kernel(self, attr) # Nvidia does not raise errors even for invalid names, # but this will give an error if the kernel is invalid. knl.num_args knl._source = getattr(self, "_source", None) return knl except LogicError: raise AttributeError("'%s' was not found as a program " "info attribute or as a kernel name" % attr) # {{{ build def build(self, options=[], devices=None, cache_dir=None): if isinstance(options, str): options = [options] options = options + ["-I", _find_pyopencl_include_path()] import os forced_options = os.environ.get("PYOPENCL_BUILD_OPTIONS") if forced_options: options = options + forced_options.split() if os.environ.get("PYOPENCL_NO_CACHE") and self._prg is None: self._prg = _cl._Program(self._context, self._source) if self._prg is not None: # uncached self._build_and_catch_errors( lambda: self._prg.build(" ".join(options), devices), options=options) else: # cached from pyopencl.cache import create_built_program_from_source_cached self._prg = self._build_and_catch_errors( lambda: create_built_program_from_source_cached( self._context, self._source, options, devices, cache_dir=cache_dir), options=options, source=self._source) del self._context return self def _build_and_catch_errors(self, build_func, options, source=None): try: return build_func() except _cl.RuntimeError, e: from pytools import Record class ErrorRecord(Record): pass what = e.what if options: what = what + "\n(options: %s)" % " ".join(options) if source is not None: from tempfile import NamedTemporaryFile srcfile = NamedTemporaryFile(mode="wt", delete=False, suffix=".cl") try: srcfile.write(source) finally: srcfile.close() what = what + "\n(source saved as %s)" % srcfile.name code = e.code routine = e.routine err = _cl.RuntimeError( ErrorRecord( what=lambda: what, code=lambda: code, routine=lambda: routine)) # Python 3.2 outputs the whole list of currently active exceptions # This serves to remove one (redundant) level from that nesting. 
raise err # }}} def compile(self, options=[], devices=None, headers=[]): options = " ".join(options) return self._get_prg().compile(options, devices, headers)
def __eq__(self, other): return self._get_prg() == other._get_prg() def __ne__(self, other): return self._get_prg() != other._get_prg() def __hash__(self): return hash(self._get_prg())
def create_program_with_built_in_kernels(context, devices, kernel_names): if not isinstance(kernel_names, str): kernel_names = ":".join(kernel_names) return Program(_Program.create_with_built_in_kernels( context, devices, kernel_names)) def link_program(context, programs, options=[], devices=None): options = " ".join(options) return Program(_Program.link(context, programs, options, devices)) # }}}
def _add_functionality(): cls_to_info_cls = { _cl.Platform: (_cl.Platform.get_info, _cl.platform_info), _cl.Device: (_cl.Device.get_info, _cl.device_info), _cl.Context: (_cl.Context.get_info, _cl.context_info), _cl.CommandQueue: (_cl.CommandQueue.get_info, _cl.command_queue_info), _cl.Event: (_cl.Event.get_info, _cl.event_info), _cl.MemoryObjectHolder: (MemoryObjectHolder.get_info, _cl.mem_info), Image: (_cl.Image.get_image_info, _cl.image_info), Program: (Program.get_info, _cl.program_info), Kernel: (Kernel.get_info, _cl.kernel_info), _cl.Sampler: (Sampler.get_info, _cl.sampler_info), }
def to_string(cls, value, default_format=None): for name in dir(cls): if (not name.startswith("_") and getattr(cls, name) == value): return name if default_format is None: raise ValueError("a name for value %d was not found in %s" % (value, cls.__name__)) else: return default_format % value for cls in CONSTANT_CLASSES: cls.to_string = classmethod(to_string)
# {{{ get_info attributes ------------------------------------------------- def make_getinfo(info_method, info_attr): def result(self): return info_method(self, info_attr) return property(result) for cls, (info_method, info_class) in cls_to_info_cls.iteritems(): for info_name, info_value in info_class.__dict__.iteritems(): if info_name == "to_string" or info_name.startswith("_"): continue setattr(cls, info_name.lower(), make_getinfo( info_method, getattr(info_class, info_name))) # }}}
# {{{ Platform def platform_repr(self): return "<pyopencl.Platform '%s' at 0x%x>" % (self.name, self.int_ptr) Platform.__repr__ = platform_repr # }}}
# {{{ Device def device_repr(self): return "<pyopencl.Device '%s' on '%s' at 0x%x>" % ( self.name.strip(), self.platform.name.strip(), self.int_ptr) Device.__repr__ = device_repr # }}}
# {{{ Context def context_repr(self): return "<pyopencl.Context at 0x%x on %s>" % (self.int_ptr, ", ".join(repr(dev) for dev in self.devices)) def context_get_cl_version(self): import re platform = self.devices[0].platform plat_version_string = platform.version match = re.match(r"^OpenCL ([0-9]+)\.([0-9]+) .*$", plat_version_string) if match is None: raise RuntimeError("platform %s returned non-conformant " "platform version string '%s'" % (platform, plat_version_string)) return int(match.group(1)), int(match.group(2)) Context.__repr__ = context_repr from pytools import memoize_method Context._get_cl_version = memoize_method(context_get_cl_version) # }}}
# {{{ CommandQueue def command_queue_enter(self): return self def command_queue_exit(self, exc_type, exc_val, exc_tb): self.finish() def command_queue_get_cl_version(self): return self.context._get_cl_version() CommandQueue.__enter__ = command_queue_enter CommandQueue.__exit__ = command_queue_exit CommandQueue._get_cl_version = memoize_method(command_queue_get_cl_version) # }}}
# {{{ _Program (the internal, non-caching version) def program_get_build_logs(self):
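        # Gather one (device, build log) pair for every device this program
        # knows about; program_build below uses these to assemble readable
        # error messages and compiler-output warnings.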
build_logs = [] for dev in self.get_info(_cl.program_info.DEVICES): try: log = self.get_build_info(dev, program_build_info.LOG) except: log = "" build_logs.append((dev, log)) return build_logs def program_build(self, options=[], devices=None): if isinstance(options, list): options = " ".join(options) err = None try: self._build(options=options, devices=devices) except Exception, e: from pytools import Record class ErrorRecord(Record): pass what = e.what + "\n\n" + (75*"="+"\n").join( "Build on %s:\n\n%s" % (dev, log) for dev, log in self._get_build_logs()) code = e.code routine = e.routine err = _cl.RuntimeError( ErrorRecord( what=lambda: what, code=lambda: code, routine=lambda: routine)) if err is not None: # Python 3.2 outputs the whole list of currently active exceptions # This serves to remove one (redundant) level from that nesting. raise err message = (75*"="+"\n").join( "Build on %s succeeded, but said:\n\n%s" % (dev, log) for dev, log in self._get_build_logs() if log is not None and log.strip()) if message: if self.kind() == program_kind.SOURCE: build_type = "From-source build" elif self.kind() == program_kind.BINARY: build_type = "From-binary build" else: build_type = "Build" compiler_output("%s succeeded, but resulted in non-empty logs:\n%s" % (build_type, message)) return self _cl._Program._get_build_logs = program_get_build_logs _cl._Program.build = program_build # }}} # {{{ Event class ProfilingInfoGetter: def __init__(self, event): self.event = event def __getattr__(self, name): info_cls = _cl.profiling_info try: inf_attr = getattr(info_cls, name.upper()) except AttributeError: raise AttributeError("%s has no attribute '%s'" % (type(self), name)) else: return self.event.get_profiling_info(inf_attr) _cl.Event.profile = property(ProfilingInfoGetter) # }}} # {{{ Kernel kernel_old_init = Kernel.__init__ def kernel_init(self, prg, name): if not isinstance(prg, _cl._Program): prg = prg._get_prg() kernel_old_init(self, prg, name) self._source = getattr(prg, "_source", None) def kernel_call(self, queue, global_size, local_size, *args, **kwargs): global_offset = kwargs.pop("global_offset", None) g_times_l = kwargs.pop("g_times_l", False) wait_for = kwargs.pop("wait_for", None) if kwargs: raise TypeError( "Kernel.__call__ recived unexpected keyword arguments: %s" % ", ".join(kwargs.keys())) self.set_args(*args) return enqueue_nd_range_kernel(queue, self, global_size, local_size, global_offset, wait_for, g_times_l=g_times_l) def kernel_set_scalar_arg_dtypes(self, arg_dtypes): assert len(arg_dtypes) == self.num_args, ( "length of argument type array (%d) and " "CL-generated number of arguments (%d) do not agree" % (len(arg_dtypes), self.num_args)) arg_type_chars = [] for arg_dtype in arg_dtypes: if arg_dtype is None: arg_type_chars.append(None) else: arg_type_chars.append(np.dtype(arg_dtype).char) self._arg_type_chars = arg_type_chars def kernel_set_args(self, *args): assert len(args) == self.num_args, ( "length of argument list (%d) and " "CL-generated number of arguments (%d) do not agree" % (len(args), self.num_args)) i = None try: try: arg_type_chars = self.__dict__["_arg_type_chars"] except KeyError: for i, arg in enumerate(args): self.set_arg(i, arg) else: from pyopencl._pvt_struct import pack for i, (arg, arg_type_char) in enumerate( zip(args, arg_type_chars)): if arg_type_char and arg_type_char != "V": self.set_arg(i, pack(arg_type_char, arg)) else: self.set_arg(i, arg) except LogicError, e: if i is not None: advice = "" from pyopencl.array import Array if isinstance(args[i], 
Array): advice = " (perhaps you meant to pass 'array.data' " \ "instead of the array itself?)" raise LogicError( "when processing argument #%d (1-based): %s%s" % (i+1, str(e), advice)) else: raise def kernel_capture_call(self, filename, queue, global_size, local_size, *args, **kwargs): from pyopencl.capture_call import capture_kernel_call capture_kernel_call(self, filename, queue, global_size, local_size, *args, **kwargs) Kernel.__init__ = kernel_init Kernel.__call__ = kernel_call Kernel.set_scalar_arg_dtypes = kernel_set_scalar_arg_dtypes Kernel.set_args = kernel_set_args Kernel.capture_call = kernel_capture_call # }}} # {{{ ImageFormat def image_format_repr(self): return "ImageFormat(%s, %s)" % ( channel_order.to_string(self.channel_order, ""), channel_type.to_string(self.channel_data_type, "")) def image_format_eq(self, other): return (self.channel_order == other.channel_order and self.channel_data_type == other.channel_data_type) def image_format_ne(self, other): return not image_format_eq(self, other) def image_format_hash(self): return hash((type(self), self.channel_order, self.channel_data_type)) ImageFormat.__repr__ = image_format_repr ImageFormat.__eq__ = image_format_eq ImageFormat.__ne__ = image_format_ne ImageFormat.__hash__ = image_format_hash # }}} # {{{ Image image_old_init = Image.__init__ def image_init(self, context, flags, format, shape=None, pitches=None, hostbuf=None, is_array=False, buffer=None): if shape is None and hostbuf is None: raise Error("'shape' must be passed if 'hostbuf' is not given") if shape is None and hostbuf is not None: shape = hostbuf.shape if hostbuf is not None and not \ (flags & (mem_flags.USE_HOST_PTR | mem_flags.COPY_HOST_PTR)): from warnings import warn warn("'hostbuf' was passed, but no memory flags to make use of it.") if hostbuf is None and pitches is not None: raise Error("'pitches' may only be given if 'hostbuf' is given") if context._get_cl_version() >= (1, 2) and get_cl_header_version() >= (1, 2): if buffer is not None and is_array: raise ValueError( "'buffer' and 'is_array' are mutually exclusive") if len(shape) == 3: if buffer is not None: raise TypeError( "'buffer' argument is not supported for 3D arrays") elif is_array: image_type = mem_object_type.IMAGE2D_ARRAY else: image_type = mem_object_type.IMAGE3D elif len(shape) == 2: if buffer is not None: raise TypeError( "'buffer' argument is not supported for 2D arrays") elif is_array: image_type = mem_object_type.IMAGE1D_ARRAY else: image_type = mem_object_type.IMAGE2D elif len(shape) == 1: if buffer is not None: image_type = mem_object_type.IMAGE1D_BUFFER elif is_array: raise TypeError("array of zero-dimensional images not supported") else: image_type = mem_object_type.IMAGE1D else: raise ValueError("images cannot have more than three dimensions") desc = ImageDescriptor() desc.image_type = image_type desc.shape = shape # also sets desc.array_size if pitches is None: desc.pitches = (0, 0) else: desc.pitches = pitches desc.num_mip_levels = 0 # per CL 1.2 spec desc.num_samples = 0 # per CL 1.2 spec desc.buffer = buffer image_old_init(self, context, flags, format, desc, hostbuf) else: # legacy init for CL 1.1 and older if is_array: raise TypeError("'is_array=True' is not supported for CL < 1.2") #if num_mip_levels is not None: #raise TypeError( # "'num_mip_levels' argument is not supported for CL < 1.2") #if num_samples is not None: #raise TypeError( # "'num_samples' argument is not supported for CL < 1.2") if buffer is not None: raise TypeError("'buffer' argument is not supported for CL < 
1.2") image_old_init(self, context, flags, format, shape, pitches, hostbuf) class _ImageInfoGetter: def __init__(self, event): from warnings import warn warn("Image.image.attr is deprecated. " "Use Image.attr directly, instead.") self.event = event def __getattr__(self, name): try: inf_attr = getattr(_cl.image_info, name.upper()) except AttributeError: raise AttributeError("%s has no attribute '%s'" % (type(self), name)) else: return self.event.get_image_info(inf_attr) def image_shape(self): if self.type == mem_object_type.IMAGE2D: return (self.width, self.height) elif self.type == mem_object_type.IMAGE3D: return (self.width, self.height, self.depth) else: raise LogicError("only images have shapes") Image.__init__ = image_init Image.image = property(_ImageInfoGetter) Image.shape = property(image_shape) # }}} # {{{ Error def error_str(self): val = self.args[0] try: val.routine except AttributeError: return str(val) else: result = "%s failed: %s" % (val.routine(), status_code.to_string(val.code(), "") .lower().replace("_", " ")) if val.what(): result += " - " + val.what() return result def error_code(self): return self.args[0].code() def error_routine(self): return self.args[0].routine() def error_what(self): return self.args[0].what() Error.__str__ = error_str Error.code = property(error_code) Error.routine = property(error_routine) Error.what = property(error_what) # }}} if _cl.have_gl(): def gl_object_get_gl_object(self): return self.get_gl_object_info()[1] GLBuffer.gl_object = property(gl_object_get_gl_object) GLTexture.gl_object = property(gl_object_get_gl_object) _add_functionality() # {{{ find pyopencl shipped source code def _find_pyopencl_include_path(): from pkg_resources import Requirement, resource_filename return resource_filename(Requirement.parse("pyopencl"), "pyopencl/cl") # }}} # {{{ convenience def create_some_context(interactive=True, answers=None): import os if answers is None and "PYOPENCL_CTX" in os.environ: ctx_spec = os.environ["PYOPENCL_CTX"] answers = ctx_spec.split(":") if answers is not None: pre_provided_answers = answers answers = answers[:] else: pre_provided_answers = None user_inputs = [] try: import sys if not sys.stdin.isatty(): interactive = False except: interactive = False def cc_print(s): if interactive: print s def get_input(prompt): if answers: return str(answers.pop(0)) elif not interactive: return '' else: user_input = raw_input(prompt) user_inputs.append(user_input) return user_input # {{{ pick a platform platforms = get_platforms() if not platforms: raise Error("no platforms found") elif len(platforms) == 1: platform, = platforms else: if not answers: cc_print("Choose platform:") for i, pf in enumerate(platforms): cc_print("[%d] %s" % (i, pf)) answer = get_input("Choice [0]:") if not answer: platform = platforms[0] else: platform = None try: int_choice = int(answer) except ValueError: pass else: if 0 <= int_choice < len(platforms): platform = platforms[int_choice] if platform is None: answer = answer.lower() for i, pf in enumerate(platforms): if answer in pf.name.lower(): platform = pf if platform is None: raise RuntimeError("input did not match any platform") # }}} # {{{ pick a device devices = platform.get_devices() def parse_device(choice): try: int_choice = int(choice) except ValueError: pass else: if 0 <= int_choice < len(devices): return devices[int_choice] choice = choice.lower() for i, dev in enumerate(devices): if choice in dev.name.lower(): return dev raise RuntimeError("input did not match any device") if not devices: raise Error("no 
devices found") elif len(devices) == 1: pass else: if not answers: cc_print("Choose device(s):") for i, dev in enumerate(devices): cc_print("[%d] %s" % (i, dev)) answer = get_input("Choice, comma-separated [0]:") if not answer: devices = [devices[0]] else: devices = [parse_device(i) for i in answer.split(",")] # }}} if user_inputs: if pre_provided_answers is not None: user_inputs = pre_provided_answers + user_inputs cc_print("Set the environment variable PYOPENCL_CTX='%s' to " "avoid being asked again." % ":".join(user_inputs)) if answers: raise RuntimeError("not all provided choices were used by " "create_some_context. (left over: '%s')" % ":".join(answers)) return Context(devices) _csc = create_some_context def _mark_copy_deprecated(func): def new_func(*args, **kwargs): from warnings import warn warn("'%s' has been deprecated in version 2011.1. Please use " "enqueue_copy() instead." % func.__name__[1:], DeprecationWarning, stacklevel=2) return func(*args, **kwargs) try: from functools import update_wrapper except ImportError: pass else: try: update_wrapper(new_func, func) except AttributeError: pass return new_func enqueue_read_image = _mark_copy_deprecated(_cl._enqueue_read_image) enqueue_write_image = _mark_copy_deprecated(_cl._enqueue_write_image) enqueue_copy_image = _mark_copy_deprecated(_cl._enqueue_copy_image) enqueue_copy_image_to_buffer = _mark_copy_deprecated( _cl._enqueue_copy_image_to_buffer) enqueue_copy_buffer_to_image = _mark_copy_deprecated( _cl._enqueue_copy_buffer_to_image) enqueue_read_buffer = _mark_copy_deprecated(_cl._enqueue_read_buffer) enqueue_write_buffer = _mark_copy_deprecated(_cl._enqueue_write_buffer) enqueue_copy_buffer = _mark_copy_deprecated(_cl._enqueue_copy_buffer) if _cl.get_cl_header_version() >= (1, 1): enqueue_read_buffer_rect = _mark_copy_deprecated(_cl._enqueue_read_buffer_rect) enqueue_write_buffer_rect = _mark_copy_deprecated(_cl._enqueue_write_buffer_rect) enqueue_copy_buffer_rect = _mark_copy_deprecated(_cl._enqueue_copy_buffer_rect) def enqueue_copy(queue, dest, src, **kwargs): """Copy from :class:`Image`, :class:`Buffer` or the host to :class:`Image`, :class:`Buffer` or the host. (Note: host-to-host copies are unsupported.) The following keyword arguments are available: :arg wait_for: (optional, default empty) :arg is_blocking: Wait for completion. Defaults to *True*. (Available on any copy involving host memory) :return: A :class:`NannyEvent` if the transfer involved a host-side buffer, otherwise an :class:`Event`. .. ------------------------------------------------------------------------ .. rubric :: Transfer :class:`Buffer` ↔ host .. ------------------------------------------------------------------------ :arg device_offset: offset in bytes (optional) .. note:: The size of the transfer is controlled by the size of the of the host-side buffer. If the host-side buffer is a :class:`numpy.ndarray`, you can control the transfer size by transfering into a smaller 'view' of the target array, like this:: cl.enqueue_copy(queue, large_dest_numpy_array[:15], src_buffer) .. ------------------------------------------------------------------------ .. rubric :: Transfer :class:`Buffer` ↔ :class:`Buffer` .. ------------------------------------------------------------------------ :arg byte_count: (optional) If not specified, defaults to the size of the source in versions 2012.x and earlier, and to the minimum of the size of the source and target from 2013.1 on. :arg src_offset: (optional) :arg dest_offset: (optional) .. 
------------------------------------------------------------------------ .. rubric :: Rectangular :class:`Buffer` ↔ host transfers (CL 1.1 and newer) .. ------------------------------------------------------------------------ :arg buffer_origin: :class:`tuple` of :class:`int` of length three or shorter. (mandatory) :arg host_origin: :class:`tuple` of :class:`int` of length three or shorter. (mandatory) :arg region: :class:`tuple` of :class:`int` of length three or shorter. (mandatory) :arg buffer_pitches: :class:`tuple` of :class:`int` of length two or shorter. (optional, "tightly-packed" if unspecified) :arg host_pitches: :class:`tuple` of :class:`int` of length two or shorter. (optional, "tightly-packed" if unspecified) .. ------------------------------------------------------------------------ .. rubric :: Transfer :class:`Image` ↔ host .. ------------------------------------------------------------------------ :arg origin: :class:`tuple` of :class:`int` of length three or shorter. (mandatory) :arg region: :class:`tuple` of :class:`int` of length three or shorter. (mandatory) :arg pitches: :class:`tuple` of :class:`int` of length two or shorter. (optional) .. ------------------------------------------------------------------------ .. rubric :: Transfer :class:`Buffer` ↔ :class:`Image` .. ------------------------------------------------------------------------ :arg offset: offset in buffer (mandatory) :arg origin: :class:`tuple` of :class:`int` of length three or shorter. (mandatory) :arg region: :class:`tuple` of :class:`int` of length three or shorter. (mandatory) .. ------------------------------------------------------------------------ .. rubric :: Transfer :class:`Image` ↔ :class:`Image` .. ------------------------------------------------------------------------ :arg src_origin: :class:`tuple` of :class:`int` of length three or shorter. (mandatory) :arg dest_origin: :class:`tuple` of :class:`int` of length three or shorter. (mandatory) :arg region: :class:`tuple` of :class:`int` of length three or shorter. (mandatory) |std-enqueue-blurb| .. 
versionadded:: 2011.1 """ if isinstance(dest, MemoryObjectHolder): if dest.type == mem_object_type.BUFFER: if isinstance(src, MemoryObjectHolder): if src.type == mem_object_type.BUFFER: if "src_origin" in kwargs: return _cl._enqueue_copy_buffer_rect( queue, src, dest, **kwargs) else: kwargs["dst_offset"] = kwargs.pop("dest_offset", 0) return _cl._enqueue_copy_buffer(queue, src, dest, **kwargs) elif src.type in [mem_object_type.IMAGE2D, mem_object_type.IMAGE3D]: return _cl._enqueue_copy_image_to_buffer( queue, src, dest, **kwargs) else: raise ValueError("invalid src mem object type") else: # assume from-host if "buffer_origin" in kwargs: return _cl._enqueue_write_buffer_rect(queue, dest, src, **kwargs) else: return _cl._enqueue_write_buffer(queue, dest, src, **kwargs) elif dest.type in [mem_object_type.IMAGE2D, mem_object_type.IMAGE3D]: if isinstance(src, MemoryObjectHolder): if src.type == mem_object_type.BUFFER: return _cl._enqueue_copy_buffer_to_image( queue, src, dest, **kwargs) elif src.type in [mem_object_type.IMAGE2D, mem_object_type.IMAGE3D]: return _cl._enqueue_copy_image(queue, src, dest, **kwargs) else: raise ValueError("invalid src mem object type") else: # assume from-host origin = kwargs.pop("origin") region = kwargs.pop("region") pitches = kwargs.pop("pitches", (0, 0)) if len(pitches) == 1: kwargs["row_pitch"], = pitches else: kwargs["row_pitch"], kwargs["slice_pitch"] = pitches return _cl._enqueue_write_image( queue, dest, origin, region, src, **kwargs) else: raise ValueError("invalid dest mem object type") else: # assume to-host if isinstance(src, MemoryObjectHolder): if src.type == mem_object_type.BUFFER: if "buffer_origin" in kwargs: return _cl._enqueue_read_buffer_rect(queue, src, dest, **kwargs) else: return _cl._enqueue_read_buffer(queue, src, dest, **kwargs) elif src.type in [mem_object_type.IMAGE2D, mem_object_type.IMAGE3D]: origin = kwargs.pop("origin") region = kwargs.pop("region") pitches = kwargs.pop("pitches", (0, 0)) if len(pitches) == 1: kwargs["row_pitch"], = pitches else: kwargs["row_pitch"], kwargs["slice_pitch"] = pitches return _cl._enqueue_read_image( queue, src, origin, region, dest, **kwargs) else: raise ValueError("invalid src mem object type") else: # assume from-host raise TypeError("enqueue_copy cannot perform host-to-host transfers") # }}} # {{{ image creation DTYPE_TO_CHANNEL_TYPE = { np.dtype(np.float32): channel_type.FLOAT, np.dtype(np.int16): channel_type.SIGNED_INT16, np.dtype(np.int32): channel_type.SIGNED_INT32, np.dtype(np.int8): channel_type.SIGNED_INT8, np.dtype(np.uint16): channel_type.UNSIGNED_INT16, np.dtype(np.uint32): channel_type.UNSIGNED_INT32, np.dtype(np.uint8): channel_type.UNSIGNED_INT8, } try: np.float16 except: pass else: DTYPE_TO_CHANNEL_TYPE[np.dtype(np.float16)] = channel_type.HALF_FLOAT, DTYPE_TO_CHANNEL_TYPE_NORM = { np.dtype(np.int16): channel_type.SNORM_INT16, np.dtype(np.int8): channel_type.SNORM_INT8, np.dtype(np.uint16): channel_type.UNORM_INT16, np.dtype(np.uint8): channel_type.UNORM_INT8, } def image_from_array(ctx, ary, num_channels=None, mode="r", norm_int=False): if not ary.flags.c_contiguous: raise ValueError("array must be C-contiguous") dtype = ary.dtype if num_channels is None: from pyopencl.array import vec try: dtype, num_channels = vec.type_to_scalar_and_count[dtype] except KeyError: # It must be a scalar type then. 
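            # (e.g. a vector dtype such as pyopencl.array.vec.float4 would
            # have been unpacked to (float32, 4) above; plain scalar dtypes
            # like np.float32 land here and are treated as one channel)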
num_channels = 1 shape = ary.shape strides = ary.strides elif num_channels == 1: shape = ary.shape strides = ary.strides else: if ary.shape[-1] != num_channels: raise RuntimeError("last dimension must be equal to number of channels") shape = ary.shape[:-1] strides = ary.strides[:-1] if mode == "r": mode_flags = mem_flags.READ_ONLY elif mode == "w": mode_flags = mem_flags.WRITE_ONLY else: raise ValueError("invalid value '%s' for 'mode'" % mode) img_format = { 1: channel_order.R, 2: channel_order.RG, 3: channel_order.RGB, 4: channel_order.RGBA, }[num_channels] assert ary.strides[-1] == ary.dtype.itemsize if norm_int: channel_type = DTYPE_TO_CHANNEL_TYPE_NORM[dtype] else: channel_type = DTYPE_TO_CHANNEL_TYPE[dtype] return Image(ctx, mode_flags | mem_flags.COPY_HOST_PTR, ImageFormat(img_format, channel_type), shape=shape[::-1], pitches=strides[::-1][1:], hostbuf=ary) # }}} # {{{ enqueue_* compatibility shims def enqueue_marker(queue, wait_for=None): if queue._get_cl_version() >= (1, 2) and get_cl_header_version() >= (1, 2): return _cl._enqueue_marker_with_wait_list(queue, wait_for) else: if wait_for: _cl._enqueue_wait_for_events(queue, wait_for) return _cl._enqueue_marker(queue) def enqueue_barrier(queue, wait_for=None): if queue._get_cl_version() >= (1, 2) and get_cl_header_version() >= (1, 2): return _cl._enqueue_barrier_with_wait_list(queue, wait_for) else: _cl._enqueue_barrier(queue) if wait_for: _cl._enqueue_wait_for_events(queue, wait_for) return _cl._enqueue_marker(queue) def enqueue_fill_buffer(queue, mem, pattern, offset, size, wait_for=None): if not (queue._get_cl_version() >= (1, 2) and get_cl_header_version() >= (1, 2)): from warnings import warn warn("The context for this queue does not declare OpenCL 1.2 support, so " "the next thing you might see is a crash") return _cl.enqueue_fill_buffer(queue, mem, pattern, offset, size, wait_for=None) # }}} # vim: foldmethod=marker pyopencl-2013.2/pyopencl/scan.py0000644000175000000500000015562712245716342015317 0ustar tomussrc"""Scan primitive.""" from __future__ import division __copyright__ = """ Copyright 2011-2012 Andreas Kloeckner Copyright 2008-2011 NVIDIA Corporation """ __license__ = """ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
Derived from thrust/detail/backend/cuda/detail/fast_scan.inl within the Thrust project, https://code.google.com/p/thrust/ """ # Direct link to thrust source: # https://code.google.com/p/thrust/source/browse/thrust/detail/backend/cuda/detail/fast_scan.inl # noqa import numpy as np import pyopencl as cl import pyopencl.array # noqa from pyopencl.tools import (dtype_to_ctype, bitlog2, KernelTemplateBase, _process_code_for_macro, get_arg_list_scalar_arg_dtypes, context_dependent_memoize) import pyopencl._mymako as mako from pyopencl._cluda import CLUDA_PREAMBLE # {{{ preamble SHARED_PREAMBLE = CLUDA_PREAMBLE + """//CL// #define WG_SIZE ${wg_size} #define SCAN_EXPR(a, b, across_seg_boundary) ${scan_expr} #define INPUT_EXPR(i) (${input_expr}) %if is_segmented: #define IS_SEG_START(i, a) (${is_segment_start_expr}) %endif ${preamble} typedef ${dtype_to_ctype(scan_dtype)} scan_type; typedef ${dtype_to_ctype(index_dtype)} index_type; // NO_SEG_BOUNDARY is the largest representable integer in index_type. // This assumption is used in code below. #define NO_SEG_BOUNDARY ${str(np.iinfo(index_dtype).max)} """ # }}} # {{{ main scan code # Algorithm: Each work group is responsible for one contiguous # 'interval'. There are just enough intervals to fill all compute # units. Intervals are split into 'units'. A unit is what gets # worked on in parallel by one work group. # # in index space: # interval > unit > local-parallel > k-group # # (Note that there is also a transpose in here: The data is read # with local ids along linear index order.) # # Each unit has two axes--the local-id axis and the k axis. # # unit 0: # | | | | | | | | | | ----> lid # | | | | | | | | | | # | | | | | | | | | | # | | | | | | | | | | # | | | | | | | | | | # # | # v k (fastest-moving in linear index) # # unit 1: # | | | | | | | | | | ----> lid # | | | | | | | | | | # | | | | | | | | | | # | | | | | | | | | | # | | | | | | | | | | # # | # v k (fastest-moving in linear index) # # ... # # At a device-global level, this is a three-phase algorithm, in # which first each interval does its local scan, then a scan # across intervals exchanges data globally, and the final update # adds the exchanged sums to each interval. # # Exclusive scan is realized by allowing look-behind (access to the # preceding item) in the final update, by means of a local shift. # # NOTE: All segment_start_in_X indices are relative to the start # of the array. SCAN_INTERVALS_SOURCE = SHARED_PREAMBLE + r"""//CL// #define K ${k_group_size} // #define DEBUG #ifdef DEBUG #define pycl_printf(ARGS) printf ARGS #else #define pycl_printf(ARGS) /* */ #endif KERNEL REQD_WG_SIZE(WG_SIZE, 1, 1) void ${name_prefix}_scan_intervals( ${argument_signature}, GLOBAL_MEM scan_type *restrict partial_scan_buffer, const index_type N, const index_type interval_size %if is_first_level: , GLOBAL_MEM scan_type *restrict interval_results %endif %if is_segmented and is_first_level: // NO_SEG_BOUNDARY if no segment boundary in interval. , GLOBAL_MEM index_type *restrict g_first_segment_start_in_interval %endif %if store_segment_start_flags: , GLOBAL_MEM char *restrict g_segment_start_flags %endif ) { // index K in first dimension used for carry storage %if use_bank_conflict_avoidance: // Avoid bank conflicts by adding a single 32-bit value to the size of // the scan type. 
struct __attribute__ ((__packed__)) wrapped_scan_type { scan_type value; int dummy; }; LOCAL_MEM struct wrapped_scan_type ldata[K + 1][WG_SIZE + 1]; %else: struct wrapped_scan_type { scan_type value; }; // padded in WG_SIZE to avoid bank conflicts LOCAL_MEM struct wrapped_scan_type ldata[K + 1][WG_SIZE]; %endif %if is_segmented: LOCAL_MEM char l_segment_start_flags[K][WG_SIZE]; LOCAL_MEM index_type l_first_segment_start_in_subtree[WG_SIZE]; // only relevant/populated for local id 0 index_type first_segment_start_in_interval = NO_SEG_BOUNDARY; index_type first_segment_start_in_k_group, first_segment_start_in_subtree; %endif // {{{ declare local data for input_fetch_exprs if any of them are stenciled <% fetch_expr_offsets = {} for name, arg_name, ife_offset in input_fetch_exprs: fetch_expr_offsets.setdefault(arg_name, set()).add(ife_offset) local_fetch_expr_args = set( arg_name for arg_name, ife_offsets in fetch_expr_offsets.items() if -1 in ife_offsets or len(ife_offsets) > 1) %> %for arg_name in local_fetch_expr_args: LOCAL_MEM ${arg_ctypes[arg_name]} l_${arg_name}[WG_SIZE*K]; %endfor // }}} const index_type interval_begin = interval_size * GID_0; const index_type interval_end = min(interval_begin + interval_size, N); const index_type unit_size = K * WG_SIZE; index_type unit_base = interval_begin; %for is_tail in [False, True]: %if not is_tail: for(; unit_base + unit_size <= interval_end; unit_base += unit_size) %else: if (unit_base < interval_end) %endif { // {{{ carry out input_fetch_exprs // (if there are ones that need to be fetched into local) %if local_fetch_expr_args: for(index_type k = 0; k < K; k++) { const index_type offset = k*WG_SIZE + LID_0; const index_type read_i = unit_base + offset; %for arg_name in local_fetch_expr_args: %if is_tail: if (read_i < interval_end) %endif { l_${arg_name}[offset] = ${arg_name}[read_i]; } %endfor } local_barrier(); %endif pycl_printf(("after input_fetch_exprs\n")); // }}} // {{{ read a unit's worth of data from global for(index_type k = 0; k < K; k++) { const index_type offset = k*WG_SIZE + LID_0; const index_type read_i = unit_base + offset; %if is_tail: if (read_i < interval_end) %endif { %for name, arg_name, ife_offset in input_fetch_exprs: ${arg_ctypes[arg_name]} ${name}; %if arg_name in local_fetch_expr_args: if (offset + ${ife_offset} >= 0) ${name} = l_${arg_name}[offset + ${ife_offset}]; else if (read_i + ${ife_offset} >= 0) ${name} = ${arg_name}[read_i + ${ife_offset}]; /* else if out of bounds, name is left undefined */ %else: // ${arg_name} gets fetched directly from global ${name} = ${arg_name}[read_i]; %endif %endfor scan_type scan_value = INPUT_EXPR(read_i); const index_type o_mod_k = offset % K; const index_type o_div_k = offset / K; ldata[o_mod_k][o_div_k].value = scan_value; %if is_segmented: bool is_seg_start = IS_SEG_START(read_i, scan_value); l_segment_start_flags[o_mod_k][o_div_k] = is_seg_start; %endif %if store_segment_start_flags: g_segment_start_flags[read_i] = is_seg_start; %endif } } pycl_printf(("after read from global\n")); // }}} // {{{ carry in from previous unit, if applicable %if is_segmented: local_barrier(); first_segment_start_in_k_group = NO_SEG_BOUNDARY; if (l_segment_start_flags[0][LID_0]) first_segment_start_in_k_group = unit_base + K*LID_0; %endif if (LID_0 == 0 && unit_base != interval_begin) { ldata[0][0].value = SCAN_EXPR( ldata[K][WG_SIZE - 1].value, ldata[0][0].value, %if is_segmented: (l_segment_start_flags[0][0]) %else: false %endif ); } pycl_printf(("after carry-in\n")); // }}} local_barrier(); // 
{{{ scan along k (sequentially in each work item) scan_type sum = ldata[0][LID_0].value; %if is_tail: const index_type offset_end = interval_end - unit_base; %endif for(index_type k = 1; k < K; k++) { %if is_tail: if (K * LID_0 + k < offset_end) %endif { scan_type tmp = ldata[k][LID_0].value; index_type seq_i = unit_base + K*LID_0 + k; %if is_segmented: if (l_segment_start_flags[k][LID_0]) { first_segment_start_in_k_group = min( first_segment_start_in_k_group, seq_i); } %endif sum = SCAN_EXPR(sum, tmp, %if is_segmented: (l_segment_start_flags[k][LID_0]) %else: false %endif ); ldata[k][LID_0].value = sum; } } pycl_printf(("after scan along k\n")); // }}} // store carry in out-of-bounds (padding) array entry (index K) in // the K direction ldata[K][LID_0].value = sum; %if is_segmented: l_first_segment_start_in_subtree[LID_0] = first_segment_start_in_k_group; %endif local_barrier(); // {{{ tree-based local parallel scan // This tree-based scan works as follows: // - Each work item adds the previous item to its current state // - barrier // - Each work item adds in the item from two positions to the left // - barrier // - Each work item adds in the item from four positions to the left // ... // At the end, each item has summed all prior items. // across k groups, along local id // (uses out-of-bounds k=K array entry for storage) scan_type val = ldata[K][LID_0].value; <% scan_offset = 1 %> % while scan_offset <= wg_size: // {{{ reads from local allowed, writes to local not allowed if (LID_0 >= ${scan_offset}) { scan_type tmp = ldata[K][LID_0 - ${scan_offset}].value; % if is_tail: if (K*LID_0 < offset_end) % endif { val = SCAN_EXPR(tmp, val, %if is_segmented: (l_first_segment_start_in_subtree[LID_0] != NO_SEG_BOUNDARY) %else: false %endif ); } %if is_segmented: // Prepare for l_first_segment_start_in_subtree, below. // Note that this update must take place *even* if we're // out of bounds. 
first_segment_start_in_subtree = min( l_first_segment_start_in_subtree[LID_0], l_first_segment_start_in_subtree [LID_0 - ${scan_offset}]); %endif } %if is_segmented: else { first_segment_start_in_subtree = l_first_segment_start_in_subtree[LID_0]; } %endif // }}} local_barrier(); // {{{ writes to local allowed, reads from local not allowed ldata[K][LID_0].value = val; %if is_segmented: l_first_segment_start_in_subtree[LID_0] = first_segment_start_in_subtree; %endif // }}} local_barrier(); %if 0: if (LID_0 == 0) { printf("${scan_offset}: "); for (int i = 0; i < WG_SIZE; ++i) { if (l_first_segment_start_in_subtree[i] == NO_SEG_BOUNDARY) printf("- "); else printf("%d ", l_first_segment_start_in_subtree[i]); } printf("\n"); } %endif <% scan_offset *= 2 %> % endwhile pycl_printf(("after tree scan\n")); // }}} // {{{ update local values if (LID_0 > 0) { sum = ldata[K][LID_0 - 1].value; for(index_type k = 0; k < K; k++) { %if is_tail: if (K * LID_0 + k < offset_end) %endif { scan_type tmp = ldata[k][LID_0].value; ldata[k][LID_0].value = SCAN_EXPR(sum, tmp, %if is_segmented: (unit_base + K * LID_0 + k >= first_segment_start_in_k_group) %else: false %endif ); } } } %if is_segmented: if (LID_0 == 0) { // update interval-wide first-seg variable from current unit first_segment_start_in_interval = min( first_segment_start_in_interval, l_first_segment_start_in_subtree[WG_SIZE-1]); } %endif pycl_printf(("after local update\n")); // }}} local_barrier(); // {{{ write data %if is_gpu: { // work hard with index math to achieve contiguous 32-bit stores __global int *dest = (__global int *) (partial_scan_buffer + unit_base); <% assert scan_dtype.itemsize % 4 == 0 ints_per_wg = wg_size ints_to_store = scan_dtype.itemsize*wg_size*k_group_size // 4 %> const index_type scan_types_per_int = ${scan_dtype.itemsize//4}; %for store_base in range(0, ints_to_store, ints_per_wg): <% # Observe that ints_to_store is divisible by the work group # size already, so we won't go out of bounds that way. 
assert store_base + ints_per_wg <= ints_to_store %> %if is_tail: if (${store_base} + LID_0 < scan_types_per_int*(interval_end - unit_base)) %endif { index_type linear_index = ${store_base} + LID_0; index_type linear_scan_data_idx = linear_index / scan_types_per_int; index_type remainder = linear_index - linear_scan_data_idx * scan_types_per_int; __local int *src = (__local int *) &( ldata [linear_scan_data_idx % K] [linear_scan_data_idx / K].value); dest[linear_index] = src[remainder]; } %endfor } %else: for (index_type k = 0; k < K; k++) { const index_type offset = k*WG_SIZE + LID_0; %if is_tail: if (unit_base + offset < interval_end) %endif { pycl_printf(("write: %d\n", unit_base + offset)); partial_scan_buffer[unit_base + offset] = ldata[offset % K][offset / K].value; } } %endif pycl_printf(("after write\n")); // }}} local_barrier(); } % endfor // write interval sum %if is_first_level: if (LID_0 == 0) { interval_results[GID_0] = partial_scan_buffer[interval_end - 1]; %if is_segmented: g_first_segment_start_in_interval[GID_0] = first_segment_start_in_interval; %endif } %endif } """ # }}} # {{{ update UPDATE_SOURCE = SHARED_PREAMBLE + r"""//CL// KERNEL REQD_WG_SIZE(WG_SIZE, 1, 1) void ${name_prefix}_final_update( ${argument_signature}, const index_type N, const index_type interval_size, GLOBAL_MEM scan_type *restrict interval_results, GLOBAL_MEM scan_type *restrict partial_scan_buffer %if is_segmented: , GLOBAL_MEM index_type *restrict g_first_segment_start_in_interval %endif %if is_segmented and use_lookbehind_update: , GLOBAL_MEM char *restrict g_segment_start_flags %endif ) { %if use_lookbehind_update: LOCAL_MEM scan_type ldata[WG_SIZE]; %endif %if is_segmented and use_lookbehind_update: LOCAL_MEM char l_segment_start_flags[WG_SIZE]; %endif const index_type interval_begin = interval_size * GID_0; const index_type interval_end = min(interval_begin + interval_size, N); // carry from last interval scan_type carry = ${neutral}; if (GID_0 != 0) carry = interval_results[GID_0 - 1]; %if is_segmented: const index_type first_seg_start_in_interval = g_first_segment_start_in_interval[GID_0]; %endif %if not is_segmented and 'last_item' in output_statement: scan_type last_item = interval_results[GDIM_0-1]; %endif %if not use_lookbehind_update: // {{{ no look-behind ('prev_item' not in output_statement -> simpler) index_type update_i = interval_begin+LID_0; %if is_segmented: index_type seg_end = min(first_seg_start_in_interval, interval_end); %endif for(; update_i < interval_end; update_i += WG_SIZE) { scan_type partial_val = partial_scan_buffer[update_i]; scan_type item = SCAN_EXPR(carry, partial_val, %if is_segmented: (update_i >= seg_end) %else: false %endif ); index_type i = update_i; { ${output_statement}; } } // }}} %else: // {{{ allow look-behind ('prev_item' in output_statement -> complicated) // We are not allowed to branch across barriers at a granularity smaller // than the whole workgroup. Therefore, the for loop is group-global, // and there are lots of local ifs. 
index_type group_base = interval_begin; scan_type prev_item = carry; // (A) for(; group_base < interval_end; group_base += WG_SIZE) { index_type update_i = group_base+LID_0; // load a work group's worth of data if (update_i < interval_end) { scan_type tmp = partial_scan_buffer[update_i]; tmp = SCAN_EXPR(carry, tmp, %if is_segmented: (update_i >= first_seg_start_in_interval) %else: false %endif ); ldata[LID_0] = tmp; %if is_segmented: l_segment_start_flags[LID_0] = g_segment_start_flags[update_i]; %endif } local_barrier(); // find prev_item if (LID_0 != 0) prev_item = ldata[LID_0 - 1]; /* else prev_item = carry (see (A)) OR last tail (see (B)); */ if (update_i < interval_end) { %if is_segmented: if (l_segment_start_flags[LID_0]) prev_item = ${neutral}; %endif scan_type item = ldata[LID_0]; index_type i = update_i; { ${output_statement}; } } if (LID_0 == 0) prev_item = ldata[WG_SIZE - 1]; // (B) local_barrier(); } // }}} %endif } """ # }}} # {{{ driver # {{{ helpers def _round_down_to_power_of_2(val): result = 2**bitlog2(val) if result > val: result >>= 1 assert result <= val return result _PREFIX_WORDS = set(""" ldata partial_scan_buffer global scan_offset segment_start_in_k_group carry g_first_segment_start_in_interval IS_SEG_START tmp Z val l_first_segment_start_in_subtree unit_size index_type interval_begin interval_size offset_end K SCAN_EXPR do_update WG_SIZE first_segment_start_in_k_group scan_type segment_start_in_subtree offset interval_results interval_end first_segment_start_in_subtree unit_base first_segment_start_in_interval k INPUT_EXPR prev_group_sum prev pv value partial_val pgs is_seg_start update_i scan_item_at_i seq_i read_i l_ o_mod_k o_div_k l_segment_start_flags scan_value sum first_seg_start_in_interval g_segment_start_flags group_base seg_end my_val DEBUG ARGS ints_to_store ints_per_wg scan_types_per_int linear_index linear_scan_data_idx dest src store_base wrapped_scan_type dummy LID_2 LID_1 LID_0 LDIM_0 LDIM_1 LDIM_2 GDIM_0 GDIM_1 GDIM_2 GID_0 GID_1 GID_2 """.split()) _IGNORED_WORDS = set(""" 4 8 32 typedef for endfor if void while endwhile endfor endif else const printf None return bool n char true false ifdef pycl_printf str range assert np iinfo max itemsize __packed__ struct restrict set iteritems len setdefault GLOBAL_MEM LOCAL_MEM_ARG WITHIN_KERNEL LOCAL_MEM KERNEL REQD_WG_SIZE local_barrier CLK_LOCAL_MEM_FENCE OPENCL EXTENSION pragma __attribute__ __global __kernel __local get_local_size get_local_id cl_khr_fp64 reqd_work_group_size get_num_groups barrier get_group_id _final_update _scan_intervals _debug_scan positions all padded integer its previous write based writes 0 has local worth scan_expr to read cannot not X items False bank four beginning follows applicable item min each indices works side scanning right summed relative used id out index avoid current state boundary True across be This reads groups along Otherwise undetermined store of times prior s update first regardless Each number because array unit from segment conflicts two parallel 2 empty define direction CL padding work tree bounds values and adds scan is allowed thus it an as enable at in occur sequentially end no storage data 1 largest may representable uses entry Y meaningful computations interval At the left dimension know d A load B group perform shift tail see last OR this add fetched into are directly need gets them stenciled that undefined there up any ones or name only relevant populated even wide we Prepare int seg Note re below place take variable must intra Therefore find code 
assumption branch workgroup complicated granularity phase remainder than simpler We smaller look ifs lots self behind allow barriers whole loop after already Observe achieve contiguous stores hard go with by math size won t way divisible bit so Avoid declare adding single type is_tail is_first_level input_expr argument_signature preamble double_support neutral output_statement k_group_size name_prefix is_segmented index_dtype scan_dtype wg_size is_segment_start_expr fetch_expr_offsets arg_ctypes ife_offsets input_fetch_exprs def ife_offset arg_name local_fetch_expr_args update_body update_loop_lookbehind update_loop_plain update_loop use_lookbehind_update store_segment_start_flags update_loop first_seg scan_dtype dtype_to_ctype is_gpu use_bank_conflict_avoidance a b prev_item i last_item prev_value N NO_SEG_BOUNDARY across_seg_boundary """.split()) def _make_template(s): leftovers = set() def replace_id(match): # avoid name clashes with user code by adding 'psc_' prefix to # identifiers. word = match.group(1) if word in _IGNORED_WORDS: return word elif word in _PREFIX_WORDS: return "psc_"+word else: leftovers.add(word) return word import re s = re.sub(r"\b([a-zA-Z0-9_]+)\b", replace_id, s) if leftovers: from warnings import warn warn("leftover words in identifier prefixing: " + " ".join(leftovers)) return mako.template.Template(s, strict_undefined=True) from pytools import Record class _ScanKernelInfo(Record): pass # }}} class ScanPerformanceWarning(UserWarning): pass class _GenericScanKernelBase(object): # {{{ constructor, argument processing def __init__(self, ctx, dtype, arguments, input_expr, scan_expr, neutral, output_statement, is_segment_start_expr=None, input_fetch_exprs=[], index_dtype=np.int32, name_prefix="scan", options=[], preamble="", devices=None): """ :arg ctx: a :class:`pyopencl.Context` within which the code for this scan kernel will be generated. :arg dtype: the :class:`numpy.dtype` with which the scan will be performed. May be a structured type if that type was registered through :func:`pyopencl.tools.get_or_register_dtype`. :arg arguments: A string of comma-separated C argument declarations. If *arguments* is specified, then *input_expr* must also be specified. All types used here must be known to PyOpenCL. (see :func:`pyopencl.tools.get_or_register_dtype`). :arg scan_expr: The associative, binary operation carrying out the scan, represented as a C string. Its two arguments are available as `a` and `b` when it is evaluated. `b` is guaranteed to be the 'element being updated', and `a` is the increment. Thus, if some data is supposed to just propagate along without being modified by the scan, it should live in `b`. This expression may call functions given in the *preamble*. Another value available to this expression is `across_seg_boundary`, a C `bool` indicating whether this scan update is crossing a segment boundary, as defined by `is_segment_start_expr`. The scan routine does not implement segmentation semantics on its own. It relies on `scan_expr` to do this. This value is available (but always `false`) even for a non-segmented scan. .. note:: In early pre-releases of the segmented scan, segmentation semantics were implemented *without* relying on `scan_expr`. :arg input_expr: A C expression, encoded as a string, resulting in the values to which the scan is applied. This may be used to apply a mapping to values stored in *arguments* before being scanned. The result of this expression must match *dtype*. 
The index intended to be mapped is available as `i` in this expression. This expression may also use the variables defined by *input_fetch_expr*. This expression may also call functions given in the *preamble*. :arg output_statement: a C statement that writes the output of the scan. It has access to the scan result as `item`, the preceding scan result item as `prev_item`, and the current index as `i`. `prev_item` in a segmented scan will be the neutral element at a segment boundary, not the immediately preceding item. Using *prev_item* in output statement has a small run-time cost. `prev_item` enables the construction of an exclusive scan. For non-segmented scans, *output_statement* may also reference `last_item`, which evaluates to the scan result of the last array entry. :arg is_segment_start_expr: A C expression, encoded as a string, resulting in a C `bool` value that determines whether a new scan segments starts at index *i*. If given, makes the scan a segmented scan. Has access to the current index `i`, the result of *input_expr* as a, and in addition may use *arguments* and *input_fetch_expr* variables just like *input_expr*. If it returns true, then previous sums will not spill over into the item with index *i* or subsequent items. :arg input_fetch_exprs: a list of tuples *(NAME, ARG_NAME, OFFSET)*. An entry here has the effect of doing the equivalent of the following before input_expr:: ARG_NAME_TYPE NAME = ARG_NAME[i+OFFSET]; `OFFSET` is allowed to be 0 or -1, and `ARG_NAME_TYPE` is the type of `ARG_NAME`. :arg preamble: |preamble| The first array in the argument list determines the size of the index space over which the scan is carried out, and thus the values over which the index *i* occurring in a number of code fragments in arguments above will vary. All code fragments further have access to N, the number of elements being processed in the scan. 
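        A small illustrative sketch (not part of the original documentation)
        of how *output_statement* and `prev_item` combine to form an
        exclusive scan, assuming *ctx* is an existing
        :class:`pyopencl.Context`::

            knl = GenericScanKernel(
                    ctx, np.int32,
                    arguments="__global int *ary, __global int *out",
                    input_expr="ary[i]",
                    scan_expr="a+b", neutral="0",
                    output_statement="out[i] = prev_item;")

        Since `prev_item` is the scan result up to, but excluding, index *i*
        (the neutral element at *i* = 0), *out* receives the exclusive prefix
        sum of *ary*.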
""" self.context = ctx dtype = self.dtype = np.dtype(dtype) if neutral is None: from warnings import warn warn("not specifying 'neutral' is deprecated and will lead to " "wrong results if your scan is not in-place or your " "'output_statement' does something otherwise non-trivial", stacklevel=2) if dtype.itemsize % 4 != 0: raise TypeError("scan value type must have size divisible by 4 bytes") self.index_dtype = np.dtype(index_dtype) if np.iinfo(self.index_dtype).min >= 0: raise TypeError("index_dtype must be signed") if devices is None: devices = ctx.devices self.devices = devices self.options = options from pyopencl.tools import parse_arg_list self.parsed_args = parse_arg_list(arguments) from pyopencl.tools import VectorArg self.first_array_idx = [ i for i, arg in enumerate(self.parsed_args) if isinstance(arg, VectorArg)][0] self.input_expr = input_expr self.is_segment_start_expr = is_segment_start_expr self.is_segmented = is_segment_start_expr is not None if self.is_segmented: is_segment_start_expr = _process_code_for_macro(is_segment_start_expr) self.output_statement = output_statement for name, arg_name, ife_offset in input_fetch_exprs: if ife_offset not in [0, -1]: raise RuntimeError("input_fetch_expr offsets must either be 0 or -1") self.input_fetch_exprs = input_fetch_exprs arg_dtypes = {} arg_ctypes = {} for arg in self.parsed_args: arg_dtypes[arg.name] = arg.dtype arg_ctypes[arg.name] = dtype_to_ctype(arg.dtype) self.options = options self.name_prefix = name_prefix # {{{ set up shared code dict from pytools import all from pyopencl.characterize import has_double_support self.code_variables = dict( np=np, dtype_to_ctype=dtype_to_ctype, preamble=preamble, name_prefix=name_prefix, index_dtype=self.index_dtype, scan_dtype=dtype, is_segmented=self.is_segmented, arg_dtypes=arg_dtypes, arg_ctypes=arg_ctypes, scan_expr=_process_code_for_macro(scan_expr), neutral=_process_code_for_macro(neutral), is_gpu=bool(self.devices[0].type & cl.device_type.GPU), double_support=all( has_double_support(dev) for dev in devices), ) # }}} self.finish_setup() # }}} class GenericScanKernel(_GenericScanKernelBase): """Generates and executes code that performs prefix sums ("scans") on arbitrary types, with many possible tweaks. Usage example:: from pyopencl.scan import GenericScanKernel knl = GenericScanKernel( context, np.int32, arguments="__global int *ary", input_expr="ary[i]", scan_expr="a+b", neutral="0", output_statement="ary[i+1] = item;") a = cl.array.arange(queue, 10000, dtype=np.int32) scan_kernel(a, queue=queue) """ def finish_setup(self): use_lookbehind_update = "prev_item" in self.output_statement self.store_segment_start_flags = self.is_segmented and use_lookbehind_update # {{{ find usable workgroup/k-group size, build first-level scan trip_count = 0 avail_local_mem = min( dev.local_mem_size for dev in self.devices) is_cpu = self.devices[0].type & cl.device_type.CPU is_gpu = self.devices[0].type & cl.device_type.GPU if is_cpu: # (about the widest vector a CPU can support, also taking # into account that CPUs don't hide latency by large work groups max_scan_wg_size = 16 wg_size_multiples = 4 else: max_scan_wg_size = min(dev.max_work_group_size for dev in self.devices) wg_size_multiples = 64 use_bank_conflict_avoidance = ( self.dtype.itemsize > 4 and self.dtype.itemsize % 8 == 0 and is_gpu) # k_group_size should be a power of two because of in-kernel # division by that number. 
solutions = [] for k_exp in range(0, 9): for wg_size in range(wg_size_multiples, max_scan_wg_size+1, wg_size_multiples): k_group_size = 2**k_exp lmem_use = self.get_local_mem_use(wg_size, k_group_size, use_bank_conflict_avoidance) if lmem_use + 256 <= avail_local_mem: solutions.append((wg_size*k_group_size, k_group_size, wg_size)) if is_gpu: from pytools import any for wg_size_floor in [256, 192, 128]: have_sol_above_floor = any(wg_size >= wg_size_floor for _, _, wg_size in solutions) if have_sol_above_floor: # delete all solutions not meeting the wg size floor solutions = [(total, k_group_size, wg_size) for total, k_group_size, wg_size in solutions if wg_size >= wg_size_floor] break _, k_group_size, max_scan_wg_size = max(solutions) while True: candidate_scan_info = self.build_scan_kernel( max_scan_wg_size, self.parsed_args, _process_code_for_macro(self.input_expr), self.is_segment_start_expr, input_fetch_exprs=self.input_fetch_exprs, is_first_level=True, store_segment_start_flags=self.store_segment_start_flags, k_group_size=k_group_size, use_bank_conflict_avoidance=use_bank_conflict_avoidance) # Will this device actually let us execute this kernel # at the desired work group size? Building it is the # only way to find out. kernel_max_wg_size = min( candidate_scan_info.kernel.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, dev) for dev in self.devices) if candidate_scan_info.wg_size <= kernel_max_wg_size: break else: max_scan_wg_size = min(kernel_max_wg_size, max_scan_wg_size) trip_count += 1 assert trip_count <= 20 self.first_level_scan_info = candidate_scan_info assert (_round_down_to_power_of_2(candidate_scan_info.wg_size) == candidate_scan_info.wg_size) # }}} # {{{ build second-level scan from pyopencl.tools import VectorArg second_level_arguments = self.parsed_args + [ VectorArg(self.dtype, "interval_sums")] second_level_build_kwargs = {} if self.is_segmented: second_level_arguments.append( VectorArg(self.index_dtype, "g_first_segment_start_in_interval_input")) # is_segment_start_expr answers the question "should previous sums # spill over into this item". And since # g_first_segment_start_in_interval_input answers the question if a # segment boundary was found in an interval of data, then if not, # it's ok to spill over. 
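            # (NO_SEG_BOUNDARY is the sentinel the first-level kernel
            # stores when no segment boundary occurred in an interval;
            # comparing against it below is what makes the second-level
            # scan segmented at exactly the right interval boundaries.)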
second_level_build_kwargs["is_segment_start_expr"] = \ "g_first_segment_start_in_interval_input[i] != NO_SEG_BOUNDARY" else: second_level_build_kwargs["is_segment_start_expr"] = None self.second_level_scan_info = self.build_scan_kernel( max_scan_wg_size, arguments=second_level_arguments, input_expr="interval_sums[i]", input_fetch_exprs=[], is_first_level=False, store_segment_start_flags=False, k_group_size=k_group_size, use_bank_conflict_avoidance=use_bank_conflict_avoidance, **second_level_build_kwargs) # }}} # {{{ build final update kernel self.update_wg_size = min(max_scan_wg_size, 256) final_update_tpl = _make_template(UPDATE_SOURCE) final_update_src = str(final_update_tpl.render( wg_size=self.update_wg_size, output_statement=self.output_statement, argument_signature=", ".join( arg.declarator() for arg in self.parsed_args), is_segment_start_expr=self.is_segment_start_expr, input_expr=_process_code_for_macro(self.input_expr), use_lookbehind_update=use_lookbehind_update, **self.code_variables)) final_update_prg = cl.Program( self.context, final_update_src).build(self.options) self.final_update_knl = getattr( final_update_prg, self.name_prefix+"_final_update") update_scalar_arg_dtypes = ( get_arg_list_scalar_arg_dtypes(self.parsed_args) + [self.index_dtype, self.index_dtype, None, None]) if self.is_segmented: # g_first_segment_start_in_interval update_scalar_arg_dtypes.append(None) if self.store_segment_start_flags: update_scalar_arg_dtypes.append(None) # g_segment_start_flags self.final_update_knl.set_scalar_arg_dtypes(update_scalar_arg_dtypes) # }}} # {{{ scan kernel build/properties def get_local_mem_use(self, k_group_size, wg_size, use_bank_conflict_avoidance): arg_dtypes = {} for arg in self.parsed_args: arg_dtypes[arg.name] = arg.dtype fetch_expr_offsets = {} for name, arg_name, ife_offset in self.input_fetch_exprs: fetch_expr_offsets.setdefault(arg_name, set()).add(ife_offset) itemsize = self.dtype.itemsize if use_bank_conflict_avoidance: itemsize += 4 return ( # ldata itemsize*(k_group_size+1)*(wg_size+1) # l_segment_start_flags + k_group_size*wg_size # l_first_segment_start_in_subtree + self.index_dtype.itemsize*wg_size + k_group_size*wg_size*sum( arg_dtypes[arg_name].itemsize for arg_name, ife_offsets in fetch_expr_offsets.items() if -1 in ife_offsets or len(ife_offsets) > 1)) def build_scan_kernel(self, max_wg_size, arguments, input_expr, is_segment_start_expr, input_fetch_exprs, is_first_level, store_segment_start_flags, k_group_size, use_bank_conflict_avoidance): scalar_arg_dtypes = get_arg_list_scalar_arg_dtypes(arguments) # Empirically found on Nv hardware: no need to be bigger than this size wg_size = _round_down_to_power_of_2( min(max_wg_size, 256)) scan_tpl = _make_template(SCAN_INTERVALS_SOURCE) scan_src = str(scan_tpl.render( wg_size=wg_size, input_expr=input_expr, k_group_size=k_group_size, argument_signature=", ".join(arg.declarator() for arg in arguments), is_segment_start_expr=is_segment_start_expr, input_fetch_exprs=input_fetch_exprs, is_first_level=is_first_level, store_segment_start_flags=store_segment_start_flags, use_bank_conflict_avoidance=use_bank_conflict_avoidance, **self.code_variables)) prg = cl.Program(self.context, scan_src).build(self.options) knl = getattr( prg, self.code_variables["name_prefix"]+"_scan_intervals") scalar_arg_dtypes.extend( (None, self.index_dtype, self. 
index_dtype)) if is_first_level: scalar_arg_dtypes.append(None) # interval_results if self.is_segmented and is_first_level: scalar_arg_dtypes.append(None) # g_first_segment_start_in_interval if store_segment_start_flags: scalar_arg_dtypes.append(None) # g_segment_start_flags knl.set_scalar_arg_dtypes(scalar_arg_dtypes) return _ScanKernelInfo( kernel=knl, wg_size=wg_size, knl=knl, k_group_size=k_group_size) # }}} def __call__(self, *args, **kwargs): # {{{ argument processing allocator = kwargs.get("allocator") queue = kwargs.get("queue") n = kwargs.get("size") wait_for = kwargs.get("wait_for") if len(args) != len(self.parsed_args): raise TypeError("expected %d arguments, got %d" % (len(self.parsed_args), len(args))) first_array = args[self.first_array_idx] allocator = allocator or first_array.allocator queue = queue or first_array.queue if n is None: n, = first_array.shape if n == 0: # We're done here. (But pretend to return an event.) return cl.enqueue_marker(queue, wait_for=wait_for) data_args = [] from pyopencl.tools import VectorArg for arg_descr, arg_val in zip(self.parsed_args, args): if isinstance(arg_descr, VectorArg): data_args.append(arg_val.data) else: data_args.append(arg_val) # }}} l1_info = self.first_level_scan_info l2_info = self.second_level_scan_info # see CL source above for terminology unit_size = l1_info.wg_size * l1_info.k_group_size max_intervals = 3*max(dev.max_compute_units for dev in self.devices) from pytools import uniform_interval_splitting interval_size, num_intervals = uniform_interval_splitting( n, unit_size, max_intervals) # {{{ allocate some buffers interval_results = cl.array.empty(queue, num_intervals, dtype=self.dtype, allocator=allocator) partial_scan_buffer = cl.array.empty( queue, n, dtype=self.dtype, allocator=allocator) if self.store_segment_start_flags: segment_start_flags = cl.array.empty( queue, n, dtype=np.bool, allocator=allocator) # }}} # {{{ first level scan of interval (one interval per block) scan1_args = data_args + [ partial_scan_buffer.data, n, interval_size, interval_results.data, ] if self.is_segmented: first_segment_start_in_interval = cl.array.empty(queue, num_intervals, dtype=self.index_dtype, allocator=allocator) scan1_args.append(first_segment_start_in_interval.data) if self.store_segment_start_flags: scan1_args.append(segment_start_flags.data) l1_evt = l1_info.kernel( queue, (num_intervals,), (l1_info.wg_size,), *scan1_args, **dict(g_times_l=True, wait_for=wait_for)) # }}} # {{{ second level scan of per-interval results # can scan at most one interval assert interval_size >= num_intervals scan2_args = data_args + [ interval_results.data, # interval_sums ] if self.is_segmented: scan2_args.append(first_segment_start_in_interval.data) scan2_args = scan2_args + [ interval_results.data, # partial_scan_buffer num_intervals, interval_size] l2_evt = l2_info.kernel( queue, (1,), (l1_info.wg_size,), *scan2_args, **dict(g_times_l=True, wait_for=[l1_evt])) # }}} # {{{ update intervals with result of interval scan upd_args = data_args + [ n, interval_size, interval_results.data, partial_scan_buffer.data] if self.is_segmented: upd_args.append(first_segment_start_in_interval.data) if self.store_segment_start_flags: upd_args.append(segment_start_flags.data) return self.final_update_knl( queue, (num_intervals,), (self.update_wg_size,), *upd_args, **dict(g_times_l=True, wait_for=[l2_evt])) # }}} # }}} # {{{ debug kernel DEBUG_SCAN_TEMPLATE = SHARED_PREAMBLE + r"""//CL// KERNEL REQD_WG_SIZE(1, 1, 1) void ${name_prefix}_debug_scan( 
${argument_signature}, const index_type N) { scan_type item = ${neutral}; scan_type prev_item; for (index_type i = 0; i < N; ++i) { %for name, arg_name, ife_offset in input_fetch_exprs: ${arg_ctypes[arg_name]} ${name}; %if ife_offset < 0: if (i+${ife_offset} >= 0) ${name} = ${arg_name}[i+offset]; %else: ${name} = ${arg_name}[i]; %endif %endfor scan_type my_val = INPUT_EXPR(i); prev_item = item; %if is_segmented: bool is_seg_start = IS_SEG_START(i, my_val); %endif item = SCAN_EXPR(prev_item, my_val, %if is_segmented: is_seg_start %else: false %endif ); { ${output_statement}; } } } """ class GenericDebugScanKernel(_GenericScanKernelBase): def finish_setup(self): scan_tpl = _make_template(DEBUG_SCAN_TEMPLATE) scan_src = str(scan_tpl.render( output_statement=self.output_statement, argument_signature=", ".join( arg.declarator() for arg in self.parsed_args), is_segment_start_expr=self.is_segment_start_expr, input_expr=_process_code_for_macro(self.input_expr), input_fetch_exprs=self.input_fetch_exprs, wg_size=1, **self.code_variables)) scan_prg = cl.Program(self.context, scan_src).build(self.options) self.kernel = getattr( scan_prg, self.name_prefix+"_debug_scan") scalar_arg_dtypes = ( get_arg_list_scalar_arg_dtypes(self.parsed_args) + [self.index_dtype]) self.kernel.set_scalar_arg_dtypes(scalar_arg_dtypes) def __call__(self, *args, **kwargs): # {{{ argument processing allocator = kwargs.get("allocator") queue = kwargs.get("queue") n = kwargs.get("size") wait_for = kwargs.get("wait_for") if len(args) != len(self.parsed_args): raise TypeError("expected %d arguments, got %d" % (len(self.parsed_args), len(args))) first_array = args[self.first_array_idx] allocator = allocator or first_array.allocator queue = queue or first_array.queue if n is None: n, = first_array.shape data_args = [] from pyopencl.tools import VectorArg for arg_descr, arg_val in zip(self.parsed_args, args): if isinstance(arg_descr, VectorArg): data_args.append(arg_val.data) else: data_args.append(arg_val) # }}} return self.kernel(queue, (1,), (1,), *(data_args + [n]), **dict(wait_for=wait_for)) # }}} # {{{ compatibility interface class _LegacyScanKernelBase(GenericScanKernel): def __init__(self, ctx, dtype, scan_expr, neutral=None, name_prefix="scan", options=[], preamble="", devices=None): scan_ctype = dtype_to_ctype(dtype) GenericScanKernel.__init__(self, ctx, dtype, arguments="__global %s *input_ary, __global %s *output_ary" % ( scan_ctype, scan_ctype), input_expr="input_ary[i]", scan_expr=scan_expr, neutral=neutral, output_statement=self.ary_output_statement, options=options, preamble=preamble, devices=devices) def __call__(self, input_ary, output_ary=None, allocator=None, queue=None): allocator = allocator or input_ary.allocator queue = queue or input_ary.queue or output_ary.queue if output_ary is None: output_ary = input_ary if isinstance(output_ary, (str, unicode)) and output_ary == "new": output_ary = cl.array.empty_like(input_ary, allocator=allocator) if input_ary.shape != output_ary.shape: raise ValueError("input and output must have the same shape") if not input_ary.flags.forc: raise RuntimeError("ScanKernel cannot " "deal with non-contiguous arrays") n, = input_ary.shape if not n: return output_ary GenericScanKernel.__call__(self, input_ary, output_ary, allocator=allocator, queue=queue) return output_ary class InclusiveScanKernel(_LegacyScanKernelBase): ary_output_statement = "output_ary[i] = item;" class ExclusiveScanKernel(_LegacyScanKernelBase): ary_output_statement = "output_ary[i] = prev_item;" # }}} # {{{ template 
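# A minimal usage sketch for the compatibility classes above (assuming an
# existing context `ctx` and queue `queue`; not executed here):
#
#     knl = InclusiveScanKernel(ctx, np.int32, "a+b", neutral="0")
#     a = cl.array.arange(queue, 10000, dtype=np.int32)
#     knl(a)  # scans in place and returns `a`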
class ScanTemplate(KernelTemplateBase): def __init__(self, arguments, input_expr, scan_expr, neutral, output_statement, is_segment_start_expr=None, input_fetch_exprs=[], name_prefix="scan", preamble="", template_processor=None): KernelTemplateBase.__init__(self, template_processor=template_processor) self.arguments = arguments self.input_expr = input_expr self.scan_expr = scan_expr self.neutral = neutral self.output_statement = output_statement self.is_segment_start_expr = is_segment_start_expr self.input_fetch_exprs = input_fetch_exprs self.name_prefix = name_prefix self.preamble = preamble def build_inner(self, context, type_aliases=(), var_values=(), more_preamble="", more_arguments=(), declare_types=(), options=(), devices=None, scan_cls=GenericScanKernel): renderer = self.get_renderer(type_aliases, var_values, context, options) arg_list = renderer.render_argument_list(self.arguments, more_arguments) type_decl_preamble = renderer.get_type_decl_preamble( context.devices[0], declare_types, arg_list) return scan_cls(context, renderer.type_aliases["scan_t"], renderer.render_argument_list(self.arguments, more_arguments), renderer(self.input_expr), renderer(self.scan_expr), renderer(self.neutral), renderer(self.output_statement), is_segment_start_expr=renderer(self.is_segment_start_expr), input_fetch_exprs=self.input_fetch_exprs, index_dtype=renderer.type_aliases.get("index_t", np.int32), name_prefix=renderer(self.name_prefix), options=list(options), preamble=( type_decl_preamble + "\n" + renderer(self.preamble + "\n" + more_preamble)), devices=devices) # }}} # {{{ 'canned' scan kernels @context_dependent_memoize def get_cumsum_kernel(context, input_dtype, output_dtype): from pyopencl.tools import VectorArg return GenericScanKernel( context, output_dtype, arguments=[ VectorArg(input_dtype, "input"), VectorArg(output_dtype, "output"), ], input_expr="input[i]", scan_expr="a+b", neutral="0", output_statement=""" output[i] = item; """) # }}} # vim: filetype=pyopencl:fdm=marker pyopencl-2013.2/pyopencl/compyte/0002755000175000000500000000000012245716343015464 5ustar tomussrcpyopencl-2013.2/pyopencl/compyte/array.py0000644000175000000500000001244612245716342017160 0ustar tomussrcfrom __future__ import division __copyright__ = "Copyright (C) 2011 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" from pytools import memoize_method import numpy as np def f_contiguous_strides(itemsize, shape): if shape: strides = [itemsize] for s in shape[:-1]: strides.append(strides[-1]*s) return tuple(strides) else: return () def c_contiguous_strides(itemsize, shape): if shape: strides = [itemsize] for s in shape[:0:-1]: strides.append(strides[-1]*s) return tuple(strides[::-1]) else: return () class ArrayFlags: def __init__(self, ary): self.f_contiguous = ( ary.strides == f_contiguous_strides( ary.dtype.itemsize, ary.shape)) self.c_contiguous = ( ary.strides == c_contiguous_strides( ary.dtype.itemsize, ary.shape)) self.forc = self.f_contiguous or self.c_contiguous def get_common_dtype(obj1, obj2, allow_double): # Yes, numpy behaves differently depending on whether # we're dealing with arrays or scalars. zero1 = np.zeros(1, dtype=obj1.dtype) try: zero2 = np.zeros(1, dtype=obj2.dtype) except AttributeError: zero2 = obj2 result = (zero1 + zero2).dtype if not allow_double: if result == np.float64: result = np.dtype(np.float32) elif result == np.complex128: result = np.dtype(np.complex64) return result def bound(a): high = a.bytes low = a.bytes for stri, shp in zip(a.strides, a.shape): if stri<0: low += (stri)*(shp-1) else: high += (stri)*(shp-1) return low, high def may_share_memory(a, b): # When this is called with a an ndarray and b # a sparse matrix, numpy.may_share_memory fails. if a is b: return True if a.__class__ is b.__class__: a_l, a_h = bound(a) b_l, b_h = bound(b) if b_l >= a_h or a_l >= b_h: return False return True else: return False # {{{ as_strided implementation # stolen from numpy to be compatible with older versions of numpy class _DummyArray(object): """ Dummy object that just exists to hang __array_interface__ dictionaries and possibly keep alive a reference to a base array. """ def __init__(self, interface, base=None): self.__array_interface__ = interface self.base = base def as_strided(x, shape=None, strides=None): """ Make an ndarray from the given array with the given shape and strides. """ # work around Numpy bug 1873 (reported by Irwin Zaid) # Since this is stolen from numpy, this implementation has the same bug. # http://projects.scipy.org/numpy/ticket/1873 if not x.dtype.isbuiltin: if (shape is None or x.shape == shape) and \ (strides is None or x.strides == strides): return x if shape is None: shape = x.shape strides = tuple(strides) from pytools import product if strides is not None and shape is not None \ and product(shape) == product(x.shape) \ and x.flags.forc: # Workaround: If we're being asked to do what amounts to a # contiguous reshape, at least do that. if strides == f_contiguous_strides(x.dtype.itemsize, shape): # **dict is a workaround for Python 2.5 syntax. result = x.reshape(-1).reshape(*shape, **dict(order="F")) assert result.strides == strides return result elif strides == c_contiguous_strides(x.dtype.itemsize, shape): # **dict is a workaround for Python 2.5 syntax. result = x.reshape(-1).reshape(*shape, **dict(order="C")) assert result.strides == strides return result raise NotImplementedError( "as_strided won't work on non-builtin arrays for now. 
" "See http://projects.scipy.org/numpy/ticket/1873") interface = dict(x.__array_interface__) if shape is not None: interface['shape'] = tuple(shape) if strides is not None: interface['strides'] = tuple(strides) return np.asarray(_DummyArray(interface, base=x)) # }}} pyopencl-2013.2/pyopencl/compyte/__init__.py0000644000175000000500000000000012245716342017560 0ustar tomussrcpyopencl-2013.2/pyopencl/compyte/ndarray/0002755000175000000500000000000012245716342017123 5ustar tomussrcpyopencl-2013.2/pyopencl/compyte/ndarray/pygpu_ndarray.h0000644000175000000500000000320112245716342022152 0ustar tomussrc#ifndef _PYGPU_NDARRAY_H #define _PYGPU_NDARRAY_H #ifndef OFFSET #define OFFSET 0 #endif //#include //#include #include #include #include "pygpu_ndarray_object.h" #include "gpu_ndarray.h" #include "pygpu_language.h" /* * Return a PyGpuNdArray whose 'nd' dimensions are all 0. * if nd==-1, it is not initialized. */ PyObject * PyGpuNdArray_New(int nd=-1); /** * Return 1 for a PyGpuNdArrayObject otw 0 */ int PyGpuNdArray_Check(const PyObject * ob); /** * Return 1 for a PyGpuNdArrayObject otw 0 */ int PyGpuNdArray_CheckExact(const PyObject * ob); /** * Transfer the contents of numpy array `obj` to `self`. * * self is reallocated to have the correct dimensions if necessary. */ int PyGpuNdArray_CopyFromArray(PyGpuNdArrayObject * self, PyArrayObject*obj); static int PyGpuNdArray_add_offset(PyGpuNdArrayObject * self, int offset); static int PyGpuNdArray_set_data(PyGpuNdArrayObject * self, char * data, PyObject * base, int offset=0); static PyObject * PyGpuNdArray_Subscript(PyObject * py_self, PyObject * key); static PyObject * PyGpuNdArray_Copy(PyGpuNdArrayObject * self, NPY_ORDER order=NPY_CORDER); static PyObject * PyGpuNdArray_Zeros(int nd, npy_intp* dims, PyArray_Descr* dtype, int fortran); static PyObject * PyGpuNdArray_Empty(int nd, npy_intp* dims, PyArray_Descr* dtype, int fortran); #endif /* Local Variables: mode:c++ c-basic-offset:4 c-file-style:"stroustrup" c-file-offsets:((innamespace . 0)(inline-open . 0)) indent-tabs-mode:nil fill-column:79 End: */ // vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 : pyopencl-2013.2/pyopencl/compyte/ndarray/__init__.py0000644000175000000500000000000012245716342021220 0ustar tomussrcpyopencl-2013.2/pyopencl/compyte/ndarray/pygpu_language_cuda.cu0000644000175000000500000007121312245716342023461 0ustar tomussrc#include #include #include #ifdef __DEVICE_EMULATION__ #define NUM_VECTOR_OP_BLOCKS 4096 #define NUM_VECTOR_OP_THREADS_PER_BLOCK 1 //This prevents printf from getting tangled up #else #define NUM_VECTOR_OP_BLOCKS 4096 //Max number of blocks to launch. Should be read from device properties. (#10) #define NUM_VECTOR_OP_THREADS_PER_BLOCK 256 //Should be read from device properties. (#10) #endif #if 0 // Do not wait after every kernel & transfer. 
#define CNDA_THREAD_SYNC #else // This is useful for using normal profiling tools #define CNDA_THREAD_SYNC cudaThreadSynchronize(); #endif #ifndef SHARED_SIZE #define SHARED_SIZE (16*1024) #endif char * cublasGetErrorString(cublasStatus err) { if (err == CUBLAS_STATUS_NOT_INITIALIZED) { return "CUBLAS_STATUS_NOT_INITIALIZED"; } else if (err == CUBLAS_STATUS_ALLOC_FAILED){ return "CUBLAS_STATUS_ALLOC_FAILED"; } else if (err == CUBLAS_STATUS_INVALID_VALUE){ return "CUBLAS_STATUS_INVALID_VALUE"; } else if (err == CUBLAS_STATUS_MAPPING_ERROR){ return "CUBLAS_STATUS_MAPPING_ERROR"; } else if (err == CUBLAS_STATUS_EXECUTION_FAILED){ return "CUBLAS_STATUS_EXECUTION_FAILED"; } else if (err == CUBLAS_STATUS_INTERNAL_ERROR){ return "CUBLAS_STATUS_INTERNAL_ERROR"; } else { return "UNKNOW ERROR"; } } ///////////////////////// // Alloc and Free ///////////////////////// void * device_malloc(size_t size) { void * rval=NULL; cudaError_t err = cudaMalloc(&rval, size); if (cudaSuccess != err){ #if COMPUTE_GPU_MEM_USED fprintf(stderr, "Error allocating %li bytes of device memory (%s). %d already allocated\n", (long)size, cudaGetErrorString(err),_allocated_size); #else fprintf(stderr, "Error allocating %li bytes of device memory (%s).\n", (long)size, cudaGetErrorString(err)); #endif PyErr_Format(PyExc_MemoryError, "Error allocating %li bytes of device memory (%s).", (long)size, cudaGetErrorString(err)); return NULL; } _outstanding_mallocs[0] += (rval != NULL); #if COMPUTE_GPU_MEM_USED for(int i=0;i __device__ T unary_copy(T a) { return a; } decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_float, unary_copy, float) decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_double, unary_copy, double) decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_uint8, unary_copy, uint8_t) decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_int8, unary_copy, int8_t) decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_uint16, unary_copy, uint16_t) decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_int16, unary_copy, int16_t) decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_uint32, unary_copy, uint32_t) decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_int32, unary_copy, int32_t) decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_uint64, unary_copy, uint64_t) decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_int64, unary_copy, int64_t) decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_complex64, unary_copy, npy_complex64) decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_complex128, unary_copy, npy_complex128) //template __device__ T unary_exp(T a) { return exp(a); } //decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_exp, unary_exp) template static __global__ void k_copy_1d(const int N, const T * x, const ssize_t sx, T * y, const ssize_t sy) { for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += gridDim.x*blockDim.x) { y[i*sy] = x[i*sx]; } } //copy from other into self //don't allocated memory int PyGpuNdArray_CopyFromPyGpuNdArray(PyGpuNdArrayObject * self, PyGpuNdArrayObject * other, bool unbroadcast) { DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray start nd=%d\n", PyGpuNdArray_NDIM(self)); assert(PyGpuNdArray_TYPE(self) == PyGpuNdArray_TYPE(other)); assert(PyGpuNdArray_ISWRITEABLE(self)); //standard elemwise size checks if (PyGpuNdArray_NDIM(self) == -1) { PyErr_SetString(PyExc_TypeError, "can't copy into un-initialized PyGpuNdArrayObject"); return -1; } if 
(PyGpuNdArray_NDIM(self) != PyGpuNdArray_NDIM(other)) { PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: need same number of dims. destination nd=%d, source nd=%d. No broadcasting implemented.", PyGpuNdArray_NDIM(self), PyGpuNdArray_NDIM(other)); return -1; } //standard elemwise dim checks (also compute total size) unsigned int size = 1; unsigned int size_source = 1; for (int i = 0; i< PyGpuNdArray_NDIM(self); ++i) { if ((PyGpuNdArray_DIMS(self)[i] != PyGpuNdArray_DIMS(other)[i]) && (1!=PyGpuNdArray_DIMS(other)[i] || !unbroadcast) ) { PyErr_Format(PyExc_ValueError, "need same dimensions for dim %d, destination=%ld, source=%ld", i, PyGpuNdArray_DIMS(self)[i], PyGpuNdArray_DIMS(other)[i]); return -1; } size *= (unsigned int) PyGpuNdArray_DIMS(self)[i]; size_source *= (unsigned int) PyGpuNdArray_DIMS(other)[i]; } if (0 == size) { return 0; //nothing to copy, we're done. } //cublas don't support negative stride bool pos_stride = true; for (int i = 0; i < PyGpuNdArray_NDIM(other); ++i) if (PyGpuNdArray_STRIDE(other,i)<0) pos_stride = false; void * other_data = PyGpuNdArray_DATA(other) + PyGpuNdArray_OFFSET(other); void * self_data = PyGpuNdArray_DATA(self) + PyGpuNdArray_OFFSET(self); //Try to transfer with cublas(we suppose it is faster) if (PyGpuNdArray_ISCONTIGUOUS(self) && PyGpuNdArray_ISCONTIGUOUS(other) && size == size_source && PyGpuNdArray_TYPE(self) == NPY_FLOAT32 && pos_stride ) { cublasScopy(size, (float*) other_data, 1, (float*) self_data, 1); CNDA_THREAD_SYNC; if (CUBLAS_STATUS_SUCCESS != cublasGetError()) { PyErr_SetString(PyExc_RuntimeError, "Error copying memory"); return -1; } DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray: cublasScopy end\n"); return 0; } if (PyGpuNdArray_ISCONTIGUOUS(self) && PyGpuNdArray_ISCONTIGUOUS(other) && size == size_source && PyGpuNdArray_TYPE(self) == NPY_FLOAT64 && pos_stride) { cublasDcopy(size, (double*) other_data, 1, (double*) self_data, 1); CNDA_THREAD_SYNC; if (CUBLAS_STATUS_SUCCESS != cublasGetError()) { PyErr_SetString(PyExc_RuntimeError, "Error copying memory"); return -1; } DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray cublasDcopy end\n"); return 0; } //TODO: rewrite these copy operations to be more efficient // See, for example the transpose example in the cuda_sdk. 
switch (PyGpuNdArray_NDIM(self)) { case 0: // scalar { // THIS CASE SHOULD NEVER HAPPEN BECAUSE SCALARS ARE ALWAYS C CONTIGUOUS assert(0); }; break; case 1: // vector { assert(PyGpuNdArray_ISALIGNED(self)); assert(PyGpuNdArray_ISALIGNED(other)); DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray: Copying non-contiguous vector\n"); unsigned int n_blocks = min(size, (unsigned int)NUM_VECTOR_OP_BLOCKS); unsigned int n_threads = min(ceil_intdiv(size, n_blocks), (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); if (PyGpuNdArray_TYPE(self) == NPY_FLOAT32) { const int elsize = sizeof(float); k_copy_1d<<>>(size, (float*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (float*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else if (PyGpuNdArray_TYPE(self) == NPY_FLOAT64) { const int elsize = sizeof(double); k_copy_1d<<>>(size, (double*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (double*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else if (PyGpuNdArray_TYPE(self) == NPY_INT8) { const int elsize = sizeof(int8_t); k_copy_1d<<>>(size, (int8_t*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (int8_t*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else if (PyGpuNdArray_TYPE(self) == NPY_INT16) { const int elsize = sizeof(int16_t); k_copy_1d<<>>(size, (int16_t*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (int16_t*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else if (PyGpuNdArray_TYPE(self) == NPY_INT32) { const int elsize = sizeof(int32_t); k_copy_1d<<>>(size, (int32_t*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (int32_t*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else if (PyGpuNdArray_TYPE(self) == NPY_INT64) { const int elsize = sizeof(int64_t); k_copy_1d<<>>(size, (int64_t*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (int64_t*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else if (PyGpuNdArray_TYPE(self) == NPY_UINT8) { const int elsize = sizeof(uint8_t); k_copy_1d<<>>(size, (uint8_t*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (uint8_t*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else if (PyGpuNdArray_TYPE(self) == NPY_UINT16) { const int elsize = sizeof(uint16_t); k_copy_1d<<>>(size, (uint16_t*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (uint16_t*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else if (PyGpuNdArray_TYPE(self) == NPY_UINT32) { const int elsize = sizeof(uint32_t); k_copy_1d<<>>(size, (uint32_t*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (uint32_t*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else if (PyGpuNdArray_TYPE(self) == NPY_UINT64) { const int elsize = sizeof(uint64_t); k_copy_1d<<>>(size, (uint64_t*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (uint64_t*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else if (PyGpuNdArray_TYPE(self) == NPY_COMPLEX64) { const int elsize = sizeof(npy_complex64); k_copy_1d<<>>(size, (npy_complex64*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (npy_complex64*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else if (PyGpuNdArray_TYPE(self) == NPY_COMPLEX128) { const int elsize = sizeof(npy_complex128); k_copy_1d<<>>(size, (npy_complex128*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (npy_complex128*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else { PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: Don't implement copy for this dtype\n"); return -1; } CNDA_THREAD_SYNC; cudaError_t err = cudaGetLastError(); if( cudaSuccess != err) { PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: 
%s. (n_blocks=%i, n_threads_per_block=%i)\n", "k_copy_1d", cudaGetErrorString(err), n_blocks, n_threads); return -1; } }; break; default: { assert (cudaSuccess == cudaGetLastError()); assert(PyGpuNdArray_ISALIGNED(self)); assert(PyGpuNdArray_ISALIGNED(other)); DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray: Copying with default version unbroadcast=%d\n", unbroadcast); // Identigy the dim of the output memory. PyGpuNdArrayObject * cuda_dims = other; if(unbroadcast) cuda_dims = self; // Move the dim and strides information on the gpu memory int ndim = PyGpuNdArray_NDIM(other); void * strides_dev = device_malloc(sizeof(ssize_t)*ndim*3); ssize_t * strides_dev_p = (ssize_t *) strides_dev; cudaError_t err = cudaMemcpy(strides_dev, PyGpuNdArray_DIMS(cuda_dims), ndim*sizeof(ssize_t),cudaMemcpyHostToDevice); if (err != cudaSuccess){ PyErr_Format(PyExc_RuntimeError, "Cuda error when copying memory1: %s", cudaGetErrorString(err)); return -1; } err = cudaMemcpy((void*)(strides_dev_p+ndim), PyGpuNdArray_STRIDES(other), ndim*sizeof(ssize_t),cudaMemcpyHostToDevice); if (err != cudaSuccess){ PyErr_Format(PyExc_RuntimeError, "Cuda error when copying memory2: %s", cudaGetErrorString(err)); return -1; } err = cudaMemcpy((void*)(strides_dev_p+(ndim*2)), PyGpuNdArray_STRIDES(self), ndim*sizeof(ssize_t), cudaMemcpyHostToDevice); if (err != cudaSuccess){ PyErr_Format(PyExc_RuntimeError, "Cuda error when copying memory3: %s", cudaGetErrorString(err)); return -1; } void * strides_host = malloc(sizeof(ssize_t)*ndim*3); err = cudaMemcpy(strides_host, strides_dev, ndim*3*sizeof(ssize_t),cudaMemcpyDeviceToHost); if (err != cudaSuccess){ PyErr_Format(PyExc_RuntimeError, "Cuda error when copying memory4: %s", cudaGetErrorString(err)); return -1; } #ifdef DEBUG for(int i=0;i<3*ndim;i++) DPRINTF(" %ld", ((ssize_t *)strides_host)[i]); DPRINTF("\n"); #endif CNDA_THREAD_SYNC; if(cudaSuccess != cudaGetLastError()){ PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: error before copy\n"); return -1; } // call worker routine unsigned int n_blocks = min(size, (unsigned int)NUM_VECTOR_OP_BLOCKS); unsigned int threads_per_block = min(ceil_intdiv(size, n_blocks), (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); if ( PyGpuNdArray_TYPE(self) == NPY_FLOAT32) { k_elemwise_unary_rowmajor_copy_float<<>>( size, (unsigned int)ndim, strides_dev_p, (const float*)other_data, strides_dev_p+ndim, (float*) self_data, strides_dev_p+(ndim*2)); } else if ( PyGpuNdArray_TYPE(self) == NPY_FLOAT64) { k_elemwise_unary_rowmajor_copy_double<<>>( size, (unsigned int)ndim, strides_dev_p, (const double*)other_data, strides_dev_p+ndim, (double*) self_data, strides_dev_p+(ndim*2)); } else if ( PyGpuNdArray_TYPE(self) == NPY_INT8) { k_elemwise_unary_rowmajor_copy_int8<<>>( size, (unsigned int)ndim, strides_dev_p, (const int8_t*)other_data, strides_dev_p+ndim, (int8_t*) self_data, strides_dev_p+(ndim*2)); } else if ( PyGpuNdArray_TYPE(self) == NPY_INT16) { k_elemwise_unary_rowmajor_copy_int16<<>>( size, (unsigned int)ndim, strides_dev_p, (const int16_t*)other_data, strides_dev_p+ndim, (int16_t*) self_data, strides_dev_p+(ndim*2)); } else if ( PyGpuNdArray_TYPE(self) == NPY_INT32) { k_elemwise_unary_rowmajor_copy_int32<<>>( size, (unsigned int)ndim, strides_dev_p, (const int32_t*)other_data, strides_dev_p+ndim, (int32_t*) self_data, strides_dev_p+(ndim*2)); } else if ( PyGpuNdArray_TYPE(self) == NPY_INT64) { k_elemwise_unary_rowmajor_copy_int64<<>>( size, (unsigned int)ndim, strides_dev_p, (const int64_t*)other_data, strides_dev_p+ndim, 
(int64_t*) self_data, strides_dev_p+(ndim*2)); } else if ( PyGpuNdArray_TYPE(self) == NPY_UINT8) { k_elemwise_unary_rowmajor_copy_uint8<<>>( size, (unsigned int)ndim, strides_dev_p, (const uint8_t*)other_data, strides_dev_p+ndim, (uint8_t*) self_data, strides_dev_p+(ndim*2)); } else if ( PyGpuNdArray_TYPE(self) == NPY_UINT16) { k_elemwise_unary_rowmajor_copy_uint16<<>>( size, (unsigned int)ndim, strides_dev_p, (const uint16_t*)other_data, strides_dev_p+ndim, (uint16_t*) self_data, strides_dev_p+(ndim*2)); } else if ( PyGpuNdArray_TYPE(self) == NPY_UINT32) { k_elemwise_unary_rowmajor_copy_uint32<<>>( size, (unsigned int)ndim, strides_dev_p, (const uint32_t*)other_data, strides_dev_p+ndim, (uint32_t*) self_data, strides_dev_p+(ndim*2)); } else if ( PyGpuNdArray_TYPE(self) == NPY_UINT64) { k_elemwise_unary_rowmajor_copy_uint64<<>>( size, (unsigned int)ndim, strides_dev_p, (const uint64_t*)other_data, strides_dev_p+ndim, (uint64_t*) self_data, strides_dev_p+(ndim*2)); } else if ( PyGpuNdArray_TYPE(self) == NPY_COMPLEX64) { k_elemwise_unary_rowmajor_copy_complex64<<>>( size, (unsigned int)ndim, strides_dev_p, (const npy_complex64*)other_data, strides_dev_p+ndim, (npy_complex64*) self_data, strides_dev_p+(ndim*2)); } else if ( PyGpuNdArray_TYPE(self) == NPY_COMPLEX128) { k_elemwise_unary_rowmajor_copy_complex128<<>>( size, (unsigned int)ndim, strides_dev_p, (const npy_complex128*)other_data, strides_dev_p+ndim, (npy_complex128*) self_data, strides_dev_p+(ndim*2)); } else { PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: Don't implement copy for this dtype\n"); return -1; } CNDA_THREAD_SYNC; err = cudaGetLastError(); if( cudaSuccess != err) { PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s. (n_blocks=%i, n_threads_per_block=%i)\n", "k_elemwise_unary_rowmajor_copy", cudaGetErrorString(err), n_blocks, threads_per_block); return -1; } device_free(strides_dev); free(strides_host); } }; // Set flags if (false && PyGpuNdArray_NDIM(self) == 0) { //Numpy 1.4.1 is not consistent here //When we create a new numpy ndarray of 0 dim, it is not f contiguous //But when we take a subtensor that is of 0 dim, it is f contiguous! //We make as them for now... 
PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS; PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; } else { if (PyGpuNdArray_is_c_contiguous(self)) { PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; } else { PyGpuNdArray_FLAGS(self) &= ~NPY_C_CONTIGUOUS; } if (PyGpuNdArray_is_f_contiguous(self)) { PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; } else { PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS; } } DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray end\n"); return 0; } int PyGpuMemcpy(void * dst, const void * src, int dev_offset, size_t bytes, PyGpuTransfert direction){ DPRINTF("PyGpuMemcpy: start\n"); cudaMemcpyKind dir; const char * ssrc; const char * ddst; if (direction == PyGpuDeviceToHost){ dir = cudaMemcpyDeviceToHost; ssrc = (char*)src+dev_offset; ddst = (char*)dst; } else if (direction == PyGpuHostToDevice) { dir = cudaMemcpyHostToDevice; ssrc = (char*)src; ddst = (char*)dst + dev_offset; } else { PyErr_Format(PyExc_ValueError, "PyGpuMemcpy: Received wrong direction %d!\n", direction); return -1; } cudaError_t err = cudaMemcpy((void*)ddst, (void*)ssrc, bytes, dir); CNDA_THREAD_SYNC; if (cudaSuccess != err) { PyErr_Format(PyExc_RuntimeError, "PyGpuMemcpy: cudaMemcpy: error copying data to host (%s)", cudaGetErrorString(err)); return -1; } DPRINTF("PyGpuMemcpy: end\n"); return 0; } int PyGpuMemset(void * dst, int data, size_t bytes){ DPRINTF("PyGpuMemset: start\n"); cudaError_t err = cudaMemset(dst, data, bytes); CNDA_THREAD_SYNC; if (cudaSuccess != err) { PyErr_Format(PyExc_MemoryError, "PyGpuMemset: Error memsetting %ld bytes of device memory(%s). %p", bytes, cudaGetErrorString(err), PyGpuNdArray_DATA(dst)); DPRINTF("PyGpuMemset: end error\n"); return -1; } DPRINTF("PyGpuMemset: end\n"); return 0; } /* Local Variables: mode:c++ c-basic-offset:4 c-file-style:"stroustrup" c-file-offsets:((innamespace . 0)(inline-open . 
0)) indent-tabs-mode:nil fill-column:79 End: */ // vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 : pyopencl-2013.2/pyopencl/compyte/ndarray/test_gpu_ndarray.py0000644000175000000500000004255312245716342023056 0ustar tomussrcimport copy import numpy import pygpu_ndarray as gpu_ndarray enable_double = True enable_double = False dtypes_all = ["float32", "int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64", "complex64", ] dtypes_no_complex = ["float32", "int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64", ] if enable_double: dtypes_all += ["float64", "complex128"] dtypes_no_complex += ["float64"] def check_flags(x, y): assert x.flags["C_CONTIGUOUS"] == y.flags["C_CONTIGUOUS"] assert x.flags["F_CONTIGUOUS"] == y.flags["F_CONTIGUOUS"] assert x.flags["WRITEABLE"] == y.flags["WRITEABLE"] assert x.flags["OWNDATA"] == y.flags["OWNDATA"] assert x.flags["ALIGNED"] == y.flags["ALIGNED"] assert x.flags["UPDATEIFCOPY"] == y.flags["UPDATEIFCOPY"] def check_meta(x, y): assert x.shape == y.shape assert x.dtype == y.dtype assert x.strides == y.strides check_flags(x, y) def check_all(x, y): check_meta(x, y) assert numpy.allclose(numpy.asarray(x), numpy.asarray(y)) def gen_gpu_nd_array(shape_orig, dtype='float32', offseted_outer=False, offseted_inner=False, sliced=1, order='c'): if sliced is True: sliced = 2 elif sliced is False: sliced = 1 shape = numpy.asarray(shape_orig).copy() if sliced != 1 and len(shape) > 0: shape[0] *= numpy.absolute(sliced) if offseted_outer and len(shape) > 0: shape[0] += 1 if offseted_inner and len(shape) > 0: shape[-1] += 1 a = numpy.random.rand(*shape) * 10 if dtype.startswith("u"): a = numpy.absolute(a) a = numpy.asarray(a, dtype=dtype) assert order in ['c', 'f'] if order == 'f' and len(shape) > 0: a = numpy.asfortranarray(a) b = gpu_ndarray.GpuNdArrayObject(a) if order == 'f' and len(shape) > 0 and b.size > 1: assert b.flags['F_CONTIGUOUS'] if offseted_outer and len(shape) > 0: b = b[1:] a = a[1:] assert b.offset != 0 if offseted_inner and len(shape) > 0: # The b[..., 1:] act as the test for this subtensor case. 
b = b[..., 1:] a = a[..., 1:] assert b.offset != 0 if sliced != 1 and len(shape) > 0: a = a[::sliced] b = b[::sliced] if False and shape_orig == (): assert a.shape == (1,) assert b.shape == (1,) else: assert a.shape == shape_orig, (a.shape, shape_orig) assert b.shape == shape_orig, (b.shape, shape_orig) assert numpy.allclose(a, numpy.asarray(b)) return a, b def product(*args, **kwds): # product('ABCD', 'xy') --> Ax Ay Bx By Cx Cy Dx Dy # product(range(2), repeat=3) --> 000 001 010 011 100 101 110 111 pools = map(tuple, args) * kwds.get('repeat', 1) result = [[]] for pool in pools: result = [x + [y] for x in result for y in pool] for prod in result: yield tuple(prod) def test_transfer(): for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: for offseted in [True, False]: a, b = gen_gpu_nd_array(shp, dtype, offseted) c = numpy.asarray(b) assert numpy.allclose(c, a) assert a.shape == b.shape == c.shape assert a.strides == b.strides == c.strides assert a.dtype == b.dtype == c.dtype == dtype assert c.flags.c_contiguous def test_transfer_not_contiguous(): """ Test transfer when the input on the CPU is not contiguous TODO: test when the input on the gpu is not contiguous """ for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: a = numpy.random.rand(*shp) * 10 a = a[::-1] b = gpu_ndarray.GpuNdArrayObject(a) c = numpy.asarray(b) assert numpy.allclose(c, a) assert a.shape == b.shape == c.shape # We copy a to a c contiguous array before the transfer assert (-a.strides[0],) + a.strides[1:] == b.strides == c.strides assert a.dtype == b.dtype == c.dtype assert c.flags.c_contiguous def test_transfer_fortran(): for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: a = numpy.random.rand(*shp) * 10 a_ = numpy.asfortranarray(a) if len(shp) > 1: assert a_.strides != a.strides a = a_ b = gpu_ndarray.GpuNdArrayObject(a) c = numpy.asarray(b) assert a.shape == b.shape == c.shape assert a.dtype == b.dtype == c.dtype assert a.flags.f_contiguous assert c.flags.f_contiguous assert a.strides == b.strides == c.strides assert numpy.allclose(c, a) def test_ascontiguousarray(): for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: for offseted_o in [True, False]: for offseted_i in [True, True]: for sliced in [1, 2, -1, -2]: for order in ['f', 'c']: #print shp, dtype, offseted_o, offseted_i, #print sliced, order cpu, gpu = gen_gpu_nd_array(shp, dtype, offseted_o, offseted_i, sliced, order) a = numpy.ascontiguousarray(cpu) b = gpu_ndarray.ascontiguousarray(gpu) # numpy upcast with a view to 1d scalar. if (sliced != 1 or shp == () or (offseted_i and len(shp) > 1)): assert b is not gpu if sliced == 1 and not offseted_i: assert ((a.data is cpu.data) == (b.bytes is gpu.bytes)) else: assert b is gpu assert a.shape == b.shape assert a.dtype == b.dtype assert a.flags.c_contiguous assert b.flags['C_CONTIGUOUS'] assert a.strides == b.strides assert numpy.allclose(cpu, a) assert numpy.allclose(cpu, b) def test_asfortranarray(): for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: for offseted_outer in [True, False]: for offseted_inner in [True, False]: for sliced in [1, 2, -1, -2]: for order in ['f', 'c']: #print shp, dtype, offseted_outer, offseted_inner, sliced, order cpu, gpu = gen_gpu_nd_array(shp, dtype, offseted_outer, offseted_inner, sliced, order) a = numpy.asfortranarray(cpu) b = gpu_ndarray.asfortranarray(gpu) # numpy upcast with a view to 1d scalar. 
if (sliced != 1 or shp == () or (offseted_outer and len(shp) > 1) or (order != 'f' and len(shp) > 1)): assert b is not gpu if (sliced == 1 and not offseted_outer and order != 'c'): assert ((a.data is cpu.data) == (b.bytes is gpu.bytes)) else: assert b is gpu pass assert a.shape == b.shape assert a.dtype == b.dtype assert a.flags.f_contiguous if shp != (): assert b.flags['F_CONTIGUOUS'] assert a.strides == b.strides assert numpy.allclose(cpu, a) assert numpy.allclose(cpu, b) def test_zeros(): for shp in [(), (0,), (5,), (0, 0), (1, 0), (0, 1), (6, 7), (0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1), (4, 8, 9), (1, 8, 9)]: for order in ["C", "F"]: for dtype in dtypes_all: x = numpy.zeros(shp, dtype, order) y = gpu_ndarray.zeros(shp, dtype, order) check_all(x, y) x = gpu_ndarray.zeros(()) # no dtype and order param y = numpy.zeros(()) check_meta(x, y) try: gpu_ndarray.zeros() assert False except TypeError: pass def test_empty(): for shp in [(), (0,), (5,), (0, 0), (1, 0), (0, 1), (6, 7), (0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1), (4, 8, 9), (1, 8, 9)]: for order in ["C", "F"]: for dtype in dtypes_all: x = numpy.empty(shp, dtype, order) y = gpu_ndarray.empty(shp, dtype, order) check_meta(x, y) x = gpu_ndarray.empty(()) # no dtype and order param y = numpy.empty(()) check_meta(x, y) try: gpu_ndarray.empty() assert False except TypeError: pass def test_mapping_getitem_ellipsis(): for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: for offseted in [True, False]: a, a_gpu = gen_gpu_nd_array(shp, dtype, offseted) b = a_gpu[...] assert b.bytes == a_gpu.bytes assert b.strides == a.strides assert b.shape == a.shape b_cpu = numpy.asarray(b) assert numpy.allclose(a, b_cpu) def test_copy_view(): from ..array import may_share_memory def check_memory_region(a, a_op, b, b_op): assert numpy.may_share_memory(a, a_op) == may_share_memory(b, b_op) if a_op.base is None: assert b_op.base is None else: assert a_op.base is a if b.base: # We avoid having a series of object connected by base. # This is to don't bloc the garbage collection. assert b_op.base is b.base else: assert b_op.base is b for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: for offseted in [False, True]: # order1 is the order of the original data for order1 in ['c', 'f']: # order2 is the order wanted after copy for order2 in ['c', 'f']: print shp, dtype, offseted, order1, order2 #TODO test copy unbroadcast! 
a, b = gen_gpu_nd_array(shp, dtype, offseted, order=order1) assert numpy.allclose(a, numpy.asarray(b)) check_flags(a, b) c = b.copy(order2) assert numpy.allclose(a, numpy.asarray(c)) check_flags(c, a.copy(order2)) check_memory_region(a, a.copy(order2), b, c) d = copy.copy(b) assert numpy.allclose(a, numpy.asarray(d)) check_flags(d, copy.copy(a)) check_memory_region(a, copy.copy(a), b, d) e = b.view() assert numpy.allclose(a, numpy.asarray(e)) check_flags(e, a.view()) check_memory_region(a, a.view(), b, e) f = copy.deepcopy(b) assert numpy.allclose(a, numpy.asarray(f)) check_flags(f, copy.deepcopy(a)) check_memory_region(a, copy.deepcopy(a), b, f) g = copy.copy(b.view()) assert numpy.allclose(a, numpy.asarray(g)) check_memory_region(a, copy.copy(a.view()), b, g) check_flags(g, copy.copy(a.view())) def test_len(): for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: for offseted in [True, False]: a, a_gpu = gen_gpu_nd_array(shp, dtype, offseted) assert len(a_gpu) == shp[0] def test_mapping_getitem_w_int(): def _cmp(x, y): assert x.shape == y.shape assert x.dtype == y.dtype assert x.strides == y.strides assert x.flags["C_CONTIGUOUS"] == y.flags["C_CONTIGUOUS"] assert x.flags["F_CONTIGUOUS"] == y.flags["F_CONTIGUOUS"] if x.flags["WRITEABLE"] != y.flags["WRITEABLE"]: assert x.ndim == 0 assert not x.flags["OWNDATA"] assert y.flags["OWNDATA"] else: assert x.flags["WRITEABLE"] == y.flags["WRITEABLE"] assert x.flags["OWNDATA"] == y.flags["OWNDATA"] assert x.flags["ALIGNED"] == y.flags["ALIGNED"] assert x.flags["UPDATEIFCOPY"] == y.flags["UPDATEIFCOPY"] x = numpy.asarray(x) assert x.shape == y.shape assert x.dtype == y.dtype assert x.strides == y.strides if not numpy.all(x == y): print x print y assert numpy.all(x == y), (x, y) def _cmpNs(x, y): """ Don't compare the stride after the transfer There is a copy that have been made on the gpu before the transfer """ assert x.shape == y.shape assert x.dtype == y.dtype assert x.strides == y.strides assert x.flags["C_CONTIGUOUS"] == y.flags["C_CONTIGUOUS"] assert x.flags["F_CONTIGUOUS"] == y.flags["F_CONTIGUOUS"] assert x.flags["WRITEABLE"] == y.flags["WRITEABLE"] assert x.flags["ALIGNED"] == y.flags["ALIGNED"] assert x.flags["OWNDATA"] == y.flags["OWNDATA"] assert x.flags["UPDATEIFCOPY"] == y.flags["UPDATEIFCOPY"] x_ = numpy.asarray(x) assert x_.shape == y.shape assert x_.dtype == y.dtype if not numpy.all(x_ == y): print x_ print y assert numpy.all(x_ == y), (x_, y) pass def _cmpf(x, *y): try: x.__getitem__(y) except IndexError: pass else: raise Exception("Did not generate out or bound error") def _cmpfV(x, *y): try: if len(y) == 1: x.__getitem__(*y) else: x.__getitem__(y) except ValueError: pass else: raise Exception("Did not generate value error") for dtype in dtypes_all: for offseted in [True, False]: # test vector dim = (2,) a, _a = gen_gpu_nd_array(dim, dtype, offseted) import sys init_ref_count = sys.getrefcount(_a) _cmp(_a[...], a[...]) _cmp(_a[...], a[...]) _cmp(_a[...], a[...]) _cmp(_a[...], a[...]) _cmp(_a[...], a[...]) _cmp(_a[-1], a[-1]) _cmp(_a[1], a[1]) _cmp(_a[0], a[0]) _cmp(_a[::1], a[::1]) _cmpNs(_a[::-1], a[::-1]) _cmp(_a[...], a[...]) _cmpf(_a, 2) # test scalar dim = () a, _a = gen_gpu_nd_array(dim, dtype, offseted) _cmp(_a[...], a[...]) _cmpf(_a, 0) _cmpfV(_a, slice(1)) # test 4d-tensor dim = (5, 4, 3, 2) a, _a = gen_gpu_nd_array(dim, dtype, offseted) _cmpf(_a, slice(-1), slice(-1), 10, -10) _cmpf(_a, slice(-1), slice(-1), -10, slice(-1)) _cmpf(_a, 0, slice(0, -1, -20), -10) _cmpf(_a, 10) _cmpf(_a, (10, 0, 0, 0)) 
_cmpf(_a, -10) #test with integer _cmp(_a[1], a[1]) _cmp(_a[-1], a[-1]) _cmp(_a[numpy.int64(1)], a[numpy.int64(1)]) _cmp(_a[numpy.int64(-1)], a[numpy.int64(-1)]) #test with slice _cmp(_a[1:], a[1:]) _cmp(_a[1:2], a[1:2]) _cmp(_a[-1:1], a[-1:1]) #test with tuple (mix slice, integer, numpy.int64) _cmpNs(_a[0, 0, ::numpy.int64(-1), ::-1], a[0, 0, ::-1, ::-1]) _cmpNs(_a[:, :, ::numpy.int64(-1), ::-1], a[:, :, ::-1, ::-1]) _cmpNs(_a[:, :, numpy.int64(1), -1], a[:, :, 1, -1]) _cmpNs(_a[:, :, ::-1, ::-1], a[:, :, ::-1, ::-1]) _cmpNs(_a[:, :, ::-10, ::-10], a[:, :, ::-10, ::-10]) _cmpNs(_a[:, :, 1, -1], a[:, :, 1, -1]) _cmpNs(_a[:, :, -1, :], a[:, :, -1, :]) _cmpNs(_a[:, ::-2, -1, :], a[:, ::-2, -1, :]) _cmpNs(_a[:, ::-20, -1, :], a[:, ::-20, -1, :]) _cmpNs(_a[:, ::-2, -1], a[:, ::-2, -1]) _cmpNs(_a[0, ::-2, -1], a[0, ::-2, -1]) _cmp(_a[-1, -1, -1, -2], a[-1, -1, -1, -2]) #test ellipse _cmp(_a[...], a[...]) pyopencl-2013.2/pyopencl/compyte/ndarray/gpu_ndarray.h0000644000175000000500000000175512245716342021615 0ustar tomussrc#ifndef _GPU_NDARRAY_H #define _GPU_NDARRAY_H typedef struct GpuNdArray{ char* data; //pointer to data element [0,..,0]. int offset; int nd; //the number of dimensions of the tensor /** * base: * either NULL or a pointer to a fellow CudaNdarray into which this one is viewing. * This pointer is never followed, except during Py_DECREF when we do not need it any longer. */ void * base; ssize_t * dimensions; //dim0, dim1, ... dim nd ssize_t * strides; //stride0, stride1, ... stride nd int flags; // Flags, see numpy flags //DTYPE dtype; // fine for numeric types //DtypeMeta * dtype_meta; // reserved for future use. //PyArray_Descr *descr; /* Pointer to type structure */ } GpuNdArray; #endif /* Local Variables: mode:c++ c-basic-offset:4 c-file-style:"stroustrup" c-file-offsets:((innamespace . 0)(inline-open . 0)) indent-tabs-mode:nil fill-column:79 End: */ // vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 : pyopencl-2013.2/pyopencl/compyte/ndarray/pygpu_ndarray.cpp0000644000175000000500000014450512245716342022522 0ustar tomussrc#include #include #include #include #include "pygpu_ndarray.h" #include "pygpu_language.h" ///////////////////////// // Static helper methods ///////////////////////// static void PyGpuNdArray_null_init(PyGpuNdArrayObject *self) { DPRINTF("PyGpuNdArrayObject_null_init\n"); PyGpuNdArray_DATA(self) = NULL; PyGpuNdArray_OFFSET(self) = 0; PyGpuNdArray_NDIM(self) = -1; self->base = NULL; PyGpuNdArray_DIMS(self) = NULL; PyGpuNdArray_STRIDES(self) = NULL; PyGpuNdArray_FLAGS(self) = NPY_DEFAULT; self->descr = NULL; self->data_allocated = 0; } ///////////////////////////// // Satisfying reqs to be Type ///////////////////////////// //DON'T use directly(if their is other PyGpuNdArrayObject that point to it, it will cause problem)! use Py_DECREF() instead static void PyGpuNdArrayObject_dealloc(PyGpuNdArrayObject* self) { DPRINTF("PyGpuNdArrayObject_dealloc\n"); DPRINTF("PyGpuNdArrayObject dealloc %p %d %p\n", self, self->data_allocated, PyGpuNdArray_DATA(self)); if(self->ob_refcnt>1) printf("WARNING:PyGpuNdArrayObject_dealloc called when their is still active reference to it.\n"); if (self->data_allocated){ assert(PyGpuNdArray_DATA(self)); if (PyGpuNdArray_DATA(self)){ if (device_free(PyGpuNdArray_DATA(self))){ fprintf(stderr, "!!!! 
error freeing device memory %p (self=%p)\n", PyGpuNdArray_DATA(self), self); } PyGpuNdArray_DATA(self) = NULL; } } PyGpuNdArray_OFFSET(self) = 0; PyGpuNdArray_NDIM(self) = -1; Py_XDECREF(self->base); self->base = NULL; if (PyGpuNdArray_DIMS(self)){ free(PyGpuNdArray_DIMS(self)); PyGpuNdArray_DIMS(self) = NULL; } if (PyGpuNdArray_STRIDES(self)){ free(PyGpuNdArray_STRIDES(self)); PyGpuNdArray_STRIDES(self) = NULL; } PyGpuNdArray_FLAGS(self) = NPY_DEFAULT; //Py_XDECREF(self->descr);//TODO: How to handle the refcont on this object? self->descr = NULL; self->data_allocated = 0; self->ob_type->tp_free((PyObject*)self); --_outstanding_mallocs[1]; DPRINTF("device_malloc_counts: (device) %i (obj) %i\n", _outstanding_mallocs[0], _outstanding_mallocs[1]); DPRINTF("PyGpuNdArrayObject_dealloc end\n"); } static PyObject * PyGpuNdArray_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { DPRINTF("PyGpuNdArray_new\n"); PyGpuNdArrayObject *self; self = (PyGpuNdArrayObject *)type->tp_alloc(type, 0); if (self != NULL){ PyGpuNdArray_null_init(self); ++_outstanding_mallocs[1]; } DPRINTF("PyGpuNdArray_new end %p\n", self); return (PyObject *)self; } static int PyGpuNdArray_init(PyGpuNdArrayObject *self, PyObject *args, PyObject *kwds) { DPRINTF("PyGpuNdArray_init\n"); PyObject *arr=NULL; if (! PyArg_ParseTuple(args, "O", &arr)) return -1; if (! PyArray_Check(arr)){ PyErr_SetString(PyExc_TypeError, "PyGpuNdArrayObject_init: PyArray or PyGpuNdArrayObject arg required"); return -1; } // TODO: We must create a new copy of the PyArray_Descr(or this only increment the refcount?) or still the reference? PyArray_Descr * type = PyArray_DescrFromType(PyArray_TYPE(arr)); self->descr = type; Py_XINCREF(self->descr);//TODO: How to handle the refcont on this object? int rval = PyGpuNdArray_CopyFromArray(self, (PyArrayObject*)arr); DPRINTF("PyGpuNdArray_init: end %p type=%p\n", self, self->descr); return rval; } int PyGpuNdArray_CopyFromArray(PyGpuNdArrayObject * self, PyArrayObject*obj) { DPRINTF("PyGpuNdArray_CopyFromArray: start descr=%p\n", self->descr); //modif done to the new array won't be updated! assert(!PyGpuNdArray_CHKFLAGS(self, NPY_UPDATEIFCOPY)); //Aligned are not tested, so don't allow it for now assert(PyGpuNdArray_CHKFLAGS(self, NPY_ALIGNED)); int typenum = PyArray_TYPE(obj); PyObject * py_src = NULL; if (PyArray_ISONESEGMENT(obj)) { Py_INCREF(obj); py_src = (PyObject *) obj; }else{ py_src = PyArray_ContiguousFromAny((PyObject*)obj, typenum, PyArray_NDIM(obj), PyArray_NDIM(obj)); } DPRINTF("PyGpuNdArray_CopyFromArray: contiguous!\n"); if (!py_src) { return -1; } int err; if(PyArray_ISFORTRAN(obj) && ! PyArray_ISCONTIGUOUS(obj)){ DPRINTF("PyGpuNdArray_CopyFromArray: fortran!\n"); err = PyGpuNdArray_alloc_contiguous(self, obj->nd, obj->dimensions, NPY_FORTRANORDER); }else{ err = PyGpuNdArray_alloc_contiguous(self, obj->nd, obj->dimensions); } if (err) { return err; } //check that the flag are the same if (PyArray_ISCONTIGUOUS(py_src) != PyGpuNdArray_ISCONTIGUOUS(self) && PyArray_ISFORTRAN(obj) && 0) { PyErr_Format(PyExc_RuntimeError, "ISCONTIGUOUS %d %d\n", PyArray_ISCONTIGUOUS(py_src), PyGpuNdArray_ISCONTIGUOUS(self)); return -1; } assert(PyArray_ISCONTIGUOUS(py_src) == PyGpuNdArray_ISCONTIGUOUS(self) || PyArray_ISFORTRAN(obj)); assert(PyArray_ISFORTRAN(py_src) == PyGpuNdArray_ISFORTRAN(self)); assert(PyArray_ISALIGNED(py_src) == PyGpuNdArray_ISALIGNED(self)); // New memory, so we should own it. 
assert(PyGpuNdArray_CHKFLAGS(self, NPY_OWNDATA)); // New memory, so it should be writable assert(PyGpuNdArray_ISWRITEABLE(self)); err = PyGpuMemcpy(PyGpuNdArray_DATA(self), PyArray_DATA(py_src), PyGpuNdArray_OFFSET(self), PyArray_SIZE(py_src) * PyArray_ITEMSIZE(py_src), PyGpuHostToDevice); if (err) { Py_DECREF(py_src); return -1; } Py_DECREF(py_src); DPRINTF("PyGpuNdArray_CopyFromArray: end\n"); return 0; } static PyObject * PyGpuNdArray_copy(PyObject * self, PyObject *args, PyObject *kargs) { DPRINTF("PyGpuNdArray_copy start\n"); static const char *kwlist[] = {"order", NULL}; NPY_ORDER order = PyArray_CORDER; if(!PyGpuNdArray_Check(self)){ PyErr_SetString(PyExc_ValueError, "PyGpuNdArray_copy: expected a PyGpuNdArrayObject."); return NULL; } DPRINTF("PyGpuNdArray_copy before parse inputs\n"); if (!PyArg_ParseTupleAndKeywords(args, kargs, "|O&", (char**)kwlist, PyArray_OrderConverter, &order)) { DPRINTF("PyGpuNdArray_copy start1.2\n"); return NULL; } DPRINTF("PyGpuNdArray_copy after parse inputs\n"); DPRINTF("PyGpuNdArray_copy before copy\n"); PyObject *ret = PyGpuNdArray_Copy((PyGpuNdArrayObject*)self, order); DPRINTF("PyGpuNdArray_copy end\n"); return ret; } static PyObject * PyGpuNdArray_Copy(PyGpuNdArrayObject * self, NPY_ORDER order) { DPRINTF("PyGpuNdArray_Copy start\n"); PyObject * rval = PyGpuNdArray_New(); //TODO find how to refcount descr. PyGpuNdArray_DESCR(rval) = PyGpuNdArray_DESCR(self); if ((!rval) || (-1 == PyGpuNdArray_NDIM(self))) { return rval; } if (PyGpuNdArray_alloc_contiguous((PyGpuNdArrayObject*)rval, PyGpuNdArray_NDIM(self), PyGpuNdArray_DIMS(self), order)) { Py_DECREF(rval); return NULL; } if (PyGpuNdArray_CopyFromPyGpuNdArray((PyGpuNdArrayObject*)rval, self)) { Py_DECREF(rval); return NULL; } if (order == NPY_F_CONTIGUOUS) PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; #ifdef DEBUG PyGpuNdArray_fprint(stderr, self); PyGpuNdArray_fprint(stderr, (PyGpuNdArrayObject *)rval); #endif DPRINTF("PyGpuNdArray_Copy end\n"); return rval; } PyObject * PyGpuNdArray_DeepCopy(PyGpuNdArrayObject * self, PyObject * memo) { assert(PyDict_Check(memo)); PyObject * selfkey = PyInt_FromLong((long)self); assert(selfkey); if (PyDict_Contains(memo, selfkey)) { PyObject * rval = PyDict_GetItem(memo, selfkey); Py_DECREF(selfkey); Py_XINCREF(rval); return rval; } else { DPRINTF("PyGpuNdArray_DeepCopy: startd deepcopy\n"); PyObject * rval = PyGpuNdArray_Copy(self); if (NULL == rval) { Py_DECREF(selfkey); return NULL; } DPRINTF("DeepCopy created %p\n", rval); DPRINTF("DeepCopy created %p %p\n", PyGpuNdArray_DESCR(rval), PyGpuNdArray_DATA(rval)); if (PyDict_SetItem(memo, selfkey, rval)) { Py_DECREF(rval); Py_DECREF(selfkey); return NULL; } Py_DECREF(selfkey); DPRINTF("PyGpuNdArray_DeepCopy: startd end\n"); return rval; } } PyObject * PyGpuNdArray_View(PyGpuNdArrayObject * self) { PyGpuNdArrayObject * rval = (PyGpuNdArrayObject*)PyGpuNdArray_New(PyGpuNdArray_NDIM(self)); if (!rval || PyGpuNdArray_set_data(rval, PyGpuNdArray_DATA(self), (PyObject *)self, PyGpuNdArray_OFFSET(self))) { Py_XDECREF(rval); DPRINTF("PyGpuNdArray_View: no rval or PyGpuNdArray_set_data " "failed: self=%p, rval=%p rval_base=%p\n", self, rval, rval->base); return NULL; } else { for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) { PyGpuNdArray_DIM(rval, i) = PyGpuNdArray_DIMS(self)[i]; PyGpuNdArray_STRIDE(rval, i) = PyGpuNdArray_STRIDES(self)[i]; } } DPRINTF("PyGpuNdArray_View: self=%p, self->base=%p" " rval=%p rval->base=%p\n", self, self->base, rval, rval->base); //TODO: find how to refcount on the descr! 
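// The view shares both the device pointer and the dtype descriptor with its base;
// NPY_OWNDATA is cleared below so deallocating the view never frees the base's data.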
//Py_INCREF(PyGpuNdArray_DESCR(self)); PyGpuNdArray_DESCR(rval) = PyGpuNdArray_DESCR(self); PyGpuNdArray_FLAGS(rval) = PyGpuNdArray_FLAGS(self); PyGpuNdArray_FLAGS(rval) &= ~NPY_OWNDATA; return (PyObject*)rval; } //updated for offset PyObject * PyGpuNdArray_CreateArrayObj(PyGpuNdArrayObject * self) { DPRINTF("PyGpuNdArray_CreateArrayObj\n"); if(PyGpuNdArray_NDIM(self)>=0 && PyGpuNdArray_SIZE(self)==0){ npy_intp * npydims = (npy_intp*)malloc(PyGpuNdArray_NDIM(self) * sizeof(npy_intp)); assert (npydims); for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) npydims[i] = (npy_intp)(PyGpuNdArray_DIMS(self)[i]); // Numpy will do a decref on the description. Py_INCREF(PyGpuNdArray_DESCR(self)); // We can't use PyArray_{Empty,EMPTY} as they segfault when size == 0 PyObject * rval = PyArray_NewFromDescr(&PyArray_Type, PyGpuNdArray_DESCR(self), PyGpuNdArray_NDIM(self), npydims, NULL, NULL, 0, NULL); free(npydims); if (!rval){ return NULL; } assert (PyArray_ITEMSIZE(rval) == PyGpuNdArray_ITEMSIZE(self)); return rval; } if ((PyGpuNdArray_NDIM(self) < 0) || (PyGpuNdArray_DATA(self) == 0)) { PyErr_SetString(PyExc_ValueError, "can't copy from un-initialized PyGpuNdArray"); return NULL; } PyGpuNdArrayObject * contiguous_self = NULL; bool pos_stride = true; for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) if (PyGpuNdArray_STRIDE(self,i)<0) pos_stride = false; if (PyGpuNdArray_ISONESEGMENT(self) && pos_stride) { contiguous_self = self; Py_INCREF(contiguous_self); DPRINTF("PyGpuNdArray_CreateArrayObj: gpu array already contiguous %p\n", contiguous_self); //}else if(PyGpuNdArray_ISONESEGMENT(self)){ //TODO implement special object handling to speed up transfer // DPRINTF("CreateArrayObj one segment, with special handling %p\n", contiguous_self); //PyErr_SetString(PyExc_ValueError, "PyGpuNdArray_CreateArrayObj: Need PyGpuNdArray_Copy or some other nd array mandling to transfer contiguous bloc with negative stride."); //return NULL; } else { contiguous_self = (PyGpuNdArrayObject*)PyGpuNdArray_Copy(self); DPRINTF("CreateArrayObj created contiguous %p\n", contiguous_self); } if (!contiguous_self) { return NULL; } npy_intp * npydims = (npy_intp*)malloc(PyGpuNdArray_NDIM(self) * sizeof(npy_intp)); assert (npydims); for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) npydims[i] = (npy_intp)(PyGpuNdArray_DIMS(self)[i]); Py_INCREF(PyGpuNdArray_DESCR(self)); PyObject * rval = PyArray_Empty(PyGpuNdArray_NDIM(self), npydims, PyGpuNdArray_DESCR(self), PyGpuNdArray_ISFORTRAN(self)); free(npydims); if (!rval) { Py_DECREF(contiguous_self); return NULL; } int err = PyGpuMemcpy(PyArray_DATA(rval), PyGpuNdArray_DATA(contiguous_self), PyGpuNdArray_OFFSET(contiguous_self), PyArray_SIZE(rval) * PyArray_ITEMSIZE(rval), PyGpuDeviceToHost); if (err) { Py_DECREF(contiguous_self); Py_DECREF(rval); rval = NULL; } Py_DECREF(contiguous_self); return rval; } static PyObject * PyGpuNdArray_Empty(int nd, npy_intp* dims, PyArray_Descr* dtype, int fortran) { DPRINTF("PyGpuNdArray_Empty: start!\n"); PyGpuNdArrayObject* rval = (PyGpuNdArrayObject*)PyGpuNdArray_New(); PyGpuNdArray_DESCR(rval) = dtype; if (!rval) { DPRINTF("PyGpuNdArray_Empty: fail!\n"); return NULL; } NPY_ORDER order = NPY_CORDER; if (fortran!=0) order = NPY_FORTRANORDER; if (PyGpuNdArray_alloc_contiguous(rval, nd, dims, order)) { Py_DECREF(rval); return NULL; } DPRINTF("PyGpuNdArray_Empty: end!\n"); return (PyObject*) rval; } //DONE: dtype, offset not needed, flags static PyObject * PyGpuNdArray_Zeros(int nd, npy_intp* dims, PyArray_Descr* dtype, int fortran) { 
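// Zeros is Empty plus a device-side PyGpuMemset of the whole allocation
// (number of elements times dtype->elsize bytes) to zero.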
DPRINTF("PyGpuNdArray_Zeros: start!\n"); PyObject * rval = PyGpuNdArray_Empty(nd, dims, dtype, fortran); if (!rval) { return rval; } int total_elements = 1; for(int i=0;ielsize; // Fill with zeros int err = PyGpuMemset(PyGpuNdArray_DATA(rval), 0, total_size); if (err) { Py_DECREF(rval); return NULL; } DPRINTF("PyGpuNdArray_Zeros: end!\n"); return (PyObject*) rval; } // declared as a static method (hence "dummy" is not used) // numpy.zeros(shape, dtype=float, order='C') static PyObject * PyGpuNdArray_zeros(PyObject* dummy, PyObject* args, PyObject *kargs) { static const char *kwlist[] = {"shape","dtype","order",NULL}; /* XXX ? */ PyArray_Descr *typecode = NULL; PyObject * shape = NULL; NPY_ORDER order = PyArray_CORDER; bool fortran = false; PyObject *ret = NULL; if (!PyArg_ParseTupleAndKeywords(args, kargs, "O|O&O&", (char**)kwlist, &shape, PyArray_DescrConverter, &typecode, PyArray_OrderConverter, &order)) { Py_XDECREF(typecode); Py_XDECREF(shape); return ret; } if (order == PyArray_FORTRANORDER) { fortran = true; } else { fortran = false; } if(!PySequence_Check(shape)) { PyErr_SetString(PyExc_TypeError, "shape argument must be a sequence"); return NULL; } if (!typecode) typecode = PyArray_DescrFromType(NPY_FLOAT64); int shplen = PySequence_Length(shape); if (shplen == 0) { return PyGpuNdArray_Zeros(0, NULL, typecode, fortran); } npy_intp* newdims = (npy_intp *)malloc(sizeof(npy_intp) * shplen); if (!newdims) { PyErr_SetString(PyExc_MemoryError, "PyGpuNdArray_Zeros: Failed to allocate temporary space"); return NULL; } // start from the end to compute strides for (int i = shplen-1; i >= 0; --i) { PyObject* shp_el_obj = PySequence_GetItem(shape, i); if(shp_el_obj == NULL) { // shouldn't happen since we checked length before... PyErr_SetString(PyExc_RuntimeError, "PyGpuNdArray_Zeros: Index out of bound in sequence"); free(newdims); return NULL; } int shp_el = PyInt_AsLong(shp_el_obj); Py_DECREF(shp_el_obj); newdims[i] = shp_el; } PyObject* rval = PyGpuNdArray_Zeros(shplen, newdims, typecode, fortran); free(newdims); return (PyObject*)rval; } // declared as a static method (hence "dummy" is not used) // numpy.empty(shape, dtype=float, order='C') static PyObject * PyGpuNdArray_empty(PyObject* dummy, PyObject* args, PyObject *kargs) { static const char *kwlist[] = {"shape","dtype","order",NULL}; /* XXX ? */ PyArray_Descr *typecode = NULL; PyObject * shape = NULL; NPY_ORDER order = PyArray_CORDER; bool fortran = false; PyObject *ret = NULL; if (!PyArg_ParseTupleAndKeywords(args, kargs, "O|O&O&", (char **)kwlist, &shape, PyArray_DescrConverter, &typecode, PyArray_OrderConverter, &order)) { Py_XDECREF(typecode); Py_XDECREF(shape); return ret; } if (order == PyArray_FORTRANORDER) { fortran = true; } else { fortran = false; } if(!PySequence_Check(shape)) { PyErr_SetString(PyExc_TypeError, "shape argument must be a sequence"); return NULL; } if (!typecode) typecode = PyArray_DescrFromType(NPY_FLOAT64); int shplen = PySequence_Length(shape); if (shplen == 0) { return PyGpuNdArray_Empty(0, NULL, typecode, fortran); } npy_intp* newdims = (npy_intp *)malloc(sizeof(npy_intp) * shplen); if (!newdims) { PyErr_SetString(PyExc_MemoryError, "PyGpuNdArray_empty: Failed to allocate temporary space"); return NULL; } // start from the end to compute strides for (int i = shplen-1; i >= 0; --i) { PyObject* shp_el_obj = PySequence_GetItem(shape, i); if(shp_el_obj == NULL) { // shouldn't happen since we checked length before... 
PyErr_SetString(PyExc_RuntimeError, "PyGpuNdArray_empty: Index out of bound in sequence"); free(newdims); return NULL; } int shp_el = PyInt_AsLong(shp_el_obj); Py_DECREF(shp_el_obj); newdims[i] = shp_el; } PyObject* rval = PyGpuNdArray_Empty(shplen, newdims, typecode, fortran); free(newdims); return (PyObject*)rval; } static PyMethodDef PyGpuNdArray_methods[] = { {"__array__", (PyCFunction)PyGpuNdArray_CreateArrayObj, METH_NOARGS, "Copy from the device to a numpy ndarray"}, {"copy", (PyCFunction)PyGpuNdArray_copy, METH_VARARGS|METH_KEYWORDS, "Create a deep copy of this object."}, {"view", (PyCFunction)PyGpuNdArray_View, METH_NOARGS, "Create a view of this object."}, {"__copy__", (PyCFunction)PyGpuNdArray_Copy, METH_NOARGS, "Create a copy of this object as numpy does. Why numpy do a copy of the data when the object is a view?"}, {"__deepcopy__", (PyCFunction)PyGpuNdArray_DeepCopy, METH_O, "Create a copy of this object"}, /* {"reduce_sum", (PyCFunction)PyGpuNdArray_ReduceSum, METH_O, "Reduce over the given dimensions by summation"}, {"exp", (PyCFunction)PyGpuNdArray_Exp, METH_NOARGS, "Return the exponential of all elements"}, {"reshape", (PyCFunction)PyGpuNdArray_Reshape, METH_O, "Return a reshaped view (or copy) of this ndarray\n\ The required argument is a tuple of integers specifying the shape of the new ndarray."}, {"_set_stride", (PyCFunction)PyGpuNdArray_SetStride, METH_VARARGS, "For integer arguments (i, s), set the 'i'th stride to 's'"}, {"_set_shape_i", (PyCFunction)PyGpuNdArray_SetShapeI, METH_VARARGS, "For integer arguments (i, s), set the 'i'th shape to 's'"}, */ {NULL, NULL, NULL, NULL} /* Sentinel */ }; //PyArray_CopyInto(PyArrayObject* dest, PyArrayObject* src)¶ //PyObject* PyArray_NewCopy(PyArrayObject* old, NPY_ORDER order)¶ static PyObject * PyGpuNdArray_get_shape(PyGpuNdArrayObject *self, void *closure) { DPRINTF("PyGpuNdArray_get_shape\n"); if (PyGpuNdArray_NDIM(self) < 0) { PyErr_SetString(PyExc_ValueError, "PyGpuNdArray not initialized"); return NULL; } PyObject * rval = PyTuple_New(PyGpuNdArray_NDIM(self)); for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) { if (!rval || PyTuple_SetItem(rval, i, PyInt_FromLong(PyGpuNdArray_DIMS(self)[i]))) { Py_XDECREF(rval); return NULL; } } return rval; } static int PyGpuNdArray_set_shape(PyGpuNdArrayObject *self, PyObject *value, void *closure) { PyErr_SetString(PyExc_NotImplementedError, "TODO: call reshape"); return -1; } static PyObject * PyGpuNdArray_get_strides(PyGpuNdArrayObject *self, void *closure) { if ( PyGpuNdArray_NDIM(self) < 0){ PyErr_SetString(PyExc_ValueError, "PyGpuNdArrayObject not initialized"); return NULL; } PyObject * rval = PyTuple_New( PyGpuNdArray_NDIM(self)); for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i){ if (!rval || PyTuple_SetItem(rval, i, PyInt_FromLong(PyGpuNdArray_STRIDES(self)[i]))){ Py_XDECREF(rval); return NULL; } } return rval; } static PyObject * PyGpuNdArray_get_data(PyGpuNdArrayObject *self, void *closure) { return PyInt_FromLong((long int) PyGpuNdArray_DATA(self)); } static PyObject * PyGpuNdArray_get_flags(PyGpuNdArrayObject *self, void *closure) { PyObject * dict = PyDict_New(); PyObject * str= PyString_FromString("C_CONTIGUOUS"); PyObject * i = PyBool_FromLong(PyGpuNdArray_ISCONTIGUOUS(self)); PyDict_SetItem(dict, str, i); Py_DECREF(str); Py_DECREF(i); str= PyString_FromString("F_CONTIGUOUS"); i = PyBool_FromLong(PyGpuNdArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)); PyDict_SetItem(dict, str, i); Py_DECREF(str); Py_DECREF(i); str= PyString_FromString("WRITEABLE"); i = 
PyBool_FromLong(PyGpuNdArray_ISWRITEABLE(self)); PyDict_SetItem(dict, str, i); Py_DECREF(str); Py_DECREF(i); str= PyString_FromString("ALIGNED"); i = PyBool_FromLong(PyGpuNdArray_ISALIGNED(self)); PyDict_SetItem(dict, str, i); Py_DECREF(str); Py_DECREF(i); str= PyString_FromString("UPDATEIFCOPY"); i = PyBool_FromLong(PyGpuNdArray_CHKFLAGS(self, NPY_UPDATEIFCOPY)); PyDict_SetItem(dict, str, i); Py_DECREF(str); Py_DECREF(i); str= PyString_FromString("OWNDATA"); i = PyBool_FromLong(PyGpuNdArray_CHKFLAGS(self, NPY_OWNDATA)); PyDict_SetItem(dict, str, i); Py_DECREF(str); Py_DECREF(i); return dict; } static PyObject * PyGpuNdArray_get_ndim(PyGpuNdArrayObject *self, void *closure) { return PyInt_FromLong((long int) PyGpuNdArray_NDIM(self)); } static PyObject * PyGpuNdArray_get_offset(PyGpuNdArrayObject *self, void *closure) { return PyInt_FromLong((long int) PyGpuNdArray_OFFSET(self)); } static PyObject * PyGpuNdArray_get_data_allocated(PyGpuNdArrayObject *self, void *closure) { return PyInt_FromLong((long int) self->data_allocated); } static PyObject * PyGpuNdArray_get_size(PyGpuNdArrayObject *self, void *closure) { return PyInt_FromLong((long int) PyGpuNdArray_SIZE(self)); } static PyObject * PyGpuNdArray_get_base(PyGpuNdArrayObject *self, void *closure) { if (!PyGpuNdArray_BASE(self)){ Py_INCREF(Py_None); return Py_None; } PyObject * ret = PyGpuNdArray_BASE(self); Py_INCREF(ret); return ret; } static PyObject * PyGpuNdArray_get_dtype(PyArrayObject *self) { Py_INCREF(PyGpuNdArray_DESCR(self)); PyObject * ret = (PyObject *)PyGpuNdArray_DESCR(self); return ret; } static PyObject * PyGpuNdArray_get_itemsize(PyArrayObject *self) { return (PyObject *)PyInt_FromLong(PyGpuNdArray_ITEMSIZE(self)); } static PyGetSetDef PyGpuNdArray_getset[] = { {(char*)"base", (getter)PyGpuNdArray_get_base, NULL, (char*)"Return the object stored in the base attribute", NULL}, {(char*)"bytes", (getter)PyGpuNdArray_get_data, NULL, (char*)"device data pointer", NULL}, {(char*)"shape", (getter)PyGpuNdArray_get_shape, (setter)PyGpuNdArray_set_shape, (char*)"shape of this ndarray (tuple)", NULL}, {(char*)"strides", (getter)PyGpuNdArray_get_strides, NULL,//(setter)PyGpuNdArray_set_strides, (char*)"data pointer strides (in elements)", NULL}, {(char*)"ndim", (getter)PyGpuNdArray_get_ndim, NULL, (char*)"The number of dimensions in this object", NULL}, {(char*)"offset", (getter)PyGpuNdArray_get_offset, NULL, (char*)"Return the offset value", NULL}, {(char*)"size", (getter)PyGpuNdArray_get_size, NULL, (char*)"The number of elements in this object.", NULL}, {(char*)"data_allocated", (getter)PyGpuNdArray_get_data_allocated, NULL, (char*)"The size of the allocated memory on the device.", NULL}, {(char*)"itemsize", (getter)PyGpuNdArray_get_itemsize, NULL, (char*)"The size of the base element.", NULL}, {(char*)"dtype", (getter)PyGpuNdArray_get_dtype, NULL, (char*)"The dtype of the element", NULL}, {(char*)"flags", (getter)PyGpuNdArray_get_flags, NULL, (char*)"Return the flags as a dictionary", NULL}, {NULL, NULL, NULL, NULL} /* Sentinel */ }; // Will by called by __len__ in Python static Py_ssize_t PyGpuNdArray_len(PyObject * py_self) { PyGpuNdArrayObject * self = (PyGpuNdArrayObject*) py_self; if (PyGpuNdArray_NDIM(self) <= 0) { return (Py_ssize_t) 0; } else { return (Py_ssize_t) PyGpuNdArray_DIMS(self)[0]; } } static int PyGpuNdArray_add_offset(PyGpuNdArrayObject * self, int offset) { DPRINTF("PyGpuNdArray_add_offset: %p %d\n", self, offset); #if OFFSET PyGpuNdArray_OFFSET(self) += offset; #else PyGpuNdArray_DATA(self) += offset; 
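// Without OFFSET support the byte offset is folded directly into the data
// pointer instead of being tracked in the separate offset field.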
#endif return 0; } static int PyGpuNdArray_set_data(PyGpuNdArrayObject * self, char * data, PyObject * base, int offset) { DPRINTF("PyGpuNdArray_set_data: %p %p %p %d\n", self, data, base, offset); if (self->data_allocated) { assert(PyGpuNdArray_DATA(self)); if (device_free(PyGpuNdArray_DATA(self))) { PyGpuNdArray_DATA(self) = NULL; self->data_allocated = 0; DPRINTF("PyGpuNdArray_set_data: device_free failed!\n"); PyErr_SetString(PyExc_ValueError, "PyGpuNdArray_set_data: device_free failed"); return -1; } } // Get the original base object (base.base.base...) // TODO: check that base is indeed a CudaNdarray? PyObject * orig_base = base; // base is not always a PyGpuNdArrayObject. It can be a GpuArray from pycuda, ... while (orig_base && PyGpuNdArray_Check(orig_base) && ((PyGpuNdArrayObject*) orig_base)->base) { // base_base is itself a view orig_base = ((PyGpuNdArrayObject*) orig_base)->base; } //N.B. XDECREF and XINCREF are no-ops for NULL pointers if (PyGpuNdArray_BASE(self) != orig_base) { Py_XDECREF(PyGpuNdArray_BASE(self)); PyGpuNdArray_BASE(self) = orig_base; Py_XINCREF(PyGpuNdArray_BASE(self)); } self->data_allocated = 0; #if OFFSET PyGpuNdArray_DATA(self) = data; PyGpuNdArray_OFFSET(self) = offset; #else PyGpuNdArray_DATA(self) = data + offset; #endif return 0; } // Will by called by __getitem__ in Python static PyObject * PyGpuNdArray_Subscript(PyObject * py_self, PyObject * key) { DPRINTF("Subscript start\n"); PyGpuNdArrayObject * self = (PyGpuNdArrayObject*) py_self; PyObject * py_rval = NULL; PyGpuNdArrayObject * rval = NULL; PyObject * intobj = NULL; //PyObject_Print(key, stderr, 0); if (key == Py_Ellipsis) { DPRINTF("Subscript with ellipse \n"); Py_INCREF(py_self); DPRINTF("Subscript with ellipse end\n"); return py_self; } if ((intobj=PyNumber_Int(key))) //INDEXING BY INTEGER { #ifdef DEBUG PyGpuNdArray_fprint(stderr, self); #endif DPRINTF("Subscript with int \n"); int d_idx = PyInt_AsLong(intobj); Py_DECREF(intobj); intobj=NULL; DPRINTF("Subscript with int 1\n"); if (PyGpuNdArray_NDIM(self) == 0) { PyErr_SetString(PyExc_IndexError, "0-d arrays can't be indexed"); return NULL; }else if (PyGpuNdArray_NDIM(self)< 0){ PyErr_SetString(PyExc_IndexError, "nd arrays must have a number of dim > 0!"); return NULL; } int d_dim = PyGpuNdArray_DIMS(self)[0]; int offset = 0; DPRINTF("Subscript with int 2\n"); if ((d_idx >= 0) && (d_idx < d_dim)) { //normal indexing offset += d_idx * PyGpuNdArray_STRIDES(self)[0]; } else if ((d_idx < 0) && (d_idx >= -d_dim)) { //end-based indexing // d_idx is negative offset += (d_dim + d_idx) * PyGpuNdArray_STRIDES(self)[0]; } else { PyErr_SetString(PyExc_IndexError, "index out of bounds"); return NULL; } DPRINTF("Subscript with int 3\n"); //Add the original offset offset += PyGpuNdArray_OFFSET(self); //allocate our subtensor view py_rval = PyGpuNdArray_New(PyGpuNdArray_NDIM(self) - 1); rval = (PyGpuNdArrayObject*) py_rval; if (!rval) return NULL; //TODO: find how to refcount on the descr! PyGpuNdArray_DESCR(py_rval) = PyGpuNdArray_DESCR(self); DPRINTF("Subscript with int 4\n"); //initialize the view's data pointer to our own. 
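// PyGpuNdArray_set_data records self (or its ultimate base) as the view's base,
// fixes the refcounts, and applies the integer-index offset computed above.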
assert (0 == rval->data_allocated); if (PyGpuNdArray_set_data(rval, PyGpuNdArray_DATA(self), (PyObject *) self, offset)){ Py_DECREF(rval); return NULL; } DPRINTF("Subscript with int 5\n"); for (int d = 1; d < PyGpuNdArray_NDIM(self); ++d) { PyGpuNdArray_STRIDE(rval, d-1) = PyGpuNdArray_STRIDES(self)[d]; PyGpuNdArray_DIM(rval, d-1) = PyGpuNdArray_DIMS(self)[d]; } } else { PyErr_Clear(); } if (PySlice_Check(key)) //INDEXING BY SLICE { DPRINTF("Subscript with slice \n"); if (PyGpuNdArray_NDIM(self) == 0) { PyErr_SetString(PyExc_ValueError, "cannot slice a 0-d array"); return NULL; } int d_dim = PyGpuNdArray_DIMS(self)[0]; Py_ssize_t start, stop, step, slen; if (PySlice_GetIndicesEx((PySliceObject*)key, d_dim, &start, &stop, &step, &slen)) { return NULL; } DPRINTF("start %zd\nstop %zd\n step %zd\n slen %zd\n", start, stop, step, slen); //allocate our subtensor view py_rval = PyGpuNdArray_New(PyGpuNdArray_NDIM(self)); rval = (PyGpuNdArrayObject*) py_rval; if (!rval) return NULL; //TODO: find how to refcount on the descr! PyGpuNdArray_DESCR(py_rval) = PyGpuNdArray_DESCR(self); assert (0 == rval->data_allocated); if (PyGpuNdArray_set_data(rval, PyGpuNdArray_DATA(self), py_self, start * PyGpuNdArray_STRIDE(self, 0) + PyGpuNdArray_OFFSET(self))) { Py_DECREF(rval); return NULL; } //initialize dimension 0 of rval PyGpuNdArray_STRIDE(rval, 0) = step * PyGpuNdArray_STRIDES(self)[0]; PyGpuNdArray_DIM(rval, 0) = slen; DPRINTF("rval stride %zd\n", PyGpuNdArray_STRIDES(rval)[0]); // initialize dimensions > 0 of rval for (int d = 1; d < PyGpuNdArray_NDIM(self); ++d) { PyGpuNdArray_STRIDE(rval, d) = PyGpuNdArray_STRIDES(self)[d]; PyGpuNdArray_DIM(rval, d) = PyGpuNdArray_DIMS(self)[d]; } } if (PyTuple_Check(key)) //INDEXING BY TUPLE { DPRINTF("Subscript with tuple \n"); //elements of the tuple can be either integers or slices //the dimensionality of the view we will return is diminished for each slice in the tuple int tuple_start_index = 0; if (PyTuple_Size(key) > PyGpuNdArray_NDIM(self)) { if (PyTuple_GetItem(key, 0) == Py_Ellipsis && PyTuple_Size(key) == PyGpuNdArray_NDIM(self) + 1) { tuple_start_index = 1; DPRINTF("Subscript with tuple staring with an extra ellipse" " at the start.\n"); } else{ PyErr_SetString(PyExc_IndexError, "index error, specified more dimensions then" " the number of existing dimensions"); return NULL; } } //calculate the number of dimensions in the return value int rval_nd = PyGpuNdArray_NDIM(self); for (int tuple_d = tuple_start_index; tuple_d < PyTuple_Size(key); ++tuple_d) { //On some paltform PyInt_Check() return true, other it return false. //So we use PyArray_IsAnyScalar that should covert everything. rval_nd -= PyArray_IsAnyScalar(PyTuple_GetItem(key, tuple_d)); } //allocate our subtensor view py_rval = PyGpuNdArray_New(rval_nd); rval = (PyGpuNdArrayObject*) py_rval; if (!rval) return NULL; assert (0 == rval->data_allocated); //TODO: find how to refcount on the descr! PyGpuNdArray_DESCR(py_rval) = PyGpuNdArray_DESCR(self); //initialize the view's data pointer to our own. if (PyGpuNdArray_set_data(rval, PyGpuNdArray_DATA(self), py_self, PyGpuNdArray_OFFSET(self))) { Py_DECREF(rval); return NULL; } // rval_d will refer to the current dimension in the rval. // It will not be incremented for integer keys, but will be incremented for slice // keys int rval_d = 0; for (int self_d = 0, tuple_d = tuple_start_index; self_d < PyGpuNdArray_NDIM(self); ++self_d, ++tuple_d) { // keys can be shorter than PyGpuNdArray_NDIM(self). 
// when that happens, it means that the remaining dimensions are "full slices" if (tuple_d >= PyTuple_Size(key)) { PyGpuNdArray_STRIDE(rval, rval_d) = PyGpuNdArray_STRIDES(self)[tuple_d]; PyGpuNdArray_DIM(rval, rval_d) = PyGpuNdArray_DIMS(self)[tuple_d]; ++rval_d; DPRINTF("Subscript extra dims to append %zd %zd\n", PyGpuNdArray_STRIDE(rval, rval_d), PyGpuNdArray_DIM(rval, rval_d)); } else { PyObject * key_d = PyTuple_GetItem(key, tuple_d); if (PySlice_Check(key_d)) { Py_ssize_t start, stop, step, slen; if (PySlice_GetIndicesEx((PySliceObject*)key_d, PyGpuNdArray_DIMS(self)[self_d], &start, &stop, &step, &slen)) { Py_DECREF(rval); return NULL; } PyGpuNdArray_add_offset(rval, start * PyGpuNdArray_STRIDES(self)[self_d]); PyGpuNdArray_STRIDE(rval, rval_d) = step * PyGpuNdArray_STRIDES(self)[self_d]; PyGpuNdArray_DIM(rval, rval_d) = slen; DPRINTF("rval_d %d self_d %d\n start %zd\nstop %zd\n step %zd\n slen %zd\n", rval_d, self_d, start, stop, step, slen); ++rval_d; } else if ((intobj=PyNumber_Int(key_d))) { assert(PyArray_IsAnyScalar(key_d)); int d_idx = PyInt_AsLong(intobj); Py_DECREF(intobj); intobj = NULL; int d_dim = PyGpuNdArray_DIMS(self)[self_d]; if ((d_idx >= 0) && (d_idx < d_dim)) { //normal indexing PyGpuNdArray_add_offset(rval, d_idx * PyGpuNdArray_STRIDES(self)[self_d]); } else if ((d_idx < 0) && (d_idx >= -d_dim)) { //end-based indexing PyGpuNdArray_add_offset(rval, (d_dim + d_idx) * PyGpuNdArray_STRIDES(self)[self_d]); } else { PyErr_SetString(PyExc_IndexError, "index out of bounds"); Py_DECREF(rval); return NULL; } } else if (key_d == Py_Ellipsis) { if (self_d != 0){ PyErr_Format(PyExc_IndexError, "Ellipsis supported only at the start of" " the tuple"); Py_DECREF(rval); return NULL; } DPRINTF("Substript with tuple with the first element an ellipse\n"); for( ; self_d < (rval_nd - PyTuple_Size(key) + 1); self_d++) { PyGpuNdArray_STRIDE(rval, rval_d) = PyGpuNdArray_STRIDES(self)[self_d]; PyGpuNdArray_DIM(rval, rval_d) = PyGpuNdArray_DIMS(self)[self_d]; DPRINTF("Ellipse append dimensions self_%d with %zd %zd\n", self_d, PyGpuNdArray_STRIDE(rval, rval_d), PyGpuNdArray_DIM(rval, rval_d)); ++rval_d; } tuple_start_index = 1; self_d--; } else { PyErr_Clear(); // clear the error set by PyNumber_Int PyErr_Format(PyExc_IndexError, "index must be either int or slice. Got %s", PyString_AsString(PyObject_Str(key_d))); Py_DECREF(rval); return NULL; } } } } if (py_rval) { #ifdef DEBUG PyGpuNdArray_fprint(stderr, self); PyGpuNdArray_fprint(stderr, rval); #endif } else { PyErr_SetString(PyExc_NotImplementedError, "Unknown key type"); return NULL; } // Set flags if (PyGpuNdArray_ISWRITEABLE(self)) { PyGpuNdArray_FLAGS(rval) |= NPY_WRITEABLE; } else { PyGpuNdArray_FLAGS(rval) &= ~NPY_WRITEABLE; } PyGpuNdArray_FLAGS(rval) &= ~NPY_OWNDATA; if (PyGpuNdArray_ISALIGNED(self)) { PyGpuNdArray_FLAGS(rval) |= NPY_ALIGNED; } else { PyGpuNdArray_FLAGS(rval) &= ~NPY_ALIGNED; } PyGpuNdArray_FLAGS(rval) &= ~NPY_UPDATEIFCOPY; if (false && PyGpuNdArray_NDIM(rval) == 0) { //Numpy is not consistent here //When we create a new numpy ndarray of 0 dim, it is not f contiguous //But when we take a subtensor that is of 0 dim, it is f contiguous! //We make as them for now... 
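// (This branch is disabled by the `false &&` above; the else branch below
// recomputes the C/F contiguity flags from the view's actual strides.)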
PyGpuNdArray_FLAGS(rval) &= ~NPY_F_CONTIGUOUS; PyGpuNdArray_FLAGS(rval) |= NPY_C_CONTIGUOUS; } else { if (PyGpuNdArray_is_c_contiguous(rval)) { PyGpuNdArray_FLAGS(rval) |= NPY_C_CONTIGUOUS; } else { PyGpuNdArray_FLAGS(rval) &= ~NPY_C_CONTIGUOUS; } if (PyGpuNdArray_is_f_contiguous(rval)) { PyGpuNdArray_FLAGS(rval) |= NPY_F_CONTIGUOUS; } else { PyGpuNdArray_FLAGS(rval) &= ~NPY_F_CONTIGUOUS; } } DPRINTF("Subscript end\n"); return py_rval; } PyMappingMethods PyGpuNdArrayMappingMethods = { PyGpuNdArray_len, //lenfunc mp_length; __len__ PyGpuNdArray_Subscript, //binaryfunc mp_subscript; __getitem__ 0 //PyGpuNdArray_setitem //objobjargproc mp_ass_subscript; __setitem__ }; static PyTypeObject PyGpuNdArrayType = { PyObject_HEAD_INIT(NULL) 0, /*ob_size*/ "GpuNdArray", /*tp_name*/ sizeof(PyGpuNdArrayObject), /*tp_basicsize*/ 0, /*tp_itemsize*/ (destructor)PyGpuNdArrayObject_dealloc, /*tp_dealloc*/ 0, /*tp_print*/ 0, /*tp_getattr*/ 0, /*tp_setattr*/ 0, /*tp_compare*/ 0, /*tp_repr*/ 0, //&PyGpuNdArrayObjectNumberMethods, /*tp_as_number*/ 0, /*tp_as_sequence*/ &PyGpuNdArrayMappingMethods,/*tp_as_mapping*/ 0, /*tp_hash */ 0, /*tp_call*/ 0, /*tp_str*/ 0, /*tp_getattro*/ 0, /*tp_setattro*/ 0, /*tp_as_buffer*/ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_CHECKTYPES, /*tp_flags*/ "PyGpuNdArrayObject objects", /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ PyGpuNdArray_methods, /* tp_methods */ 0, //PyGpuNdArray_members, /* tp_members */ //TODO PyGpuNdArray_getset, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ (initproc)PyGpuNdArray_init,/* tp_init */ 0, /* tp_alloc */ PyGpuNdArray_new, /* tp_new */ }; ////////////////////////////////////// // // C API FOR PyGpuNdArrayObject // ////////////////////////////////////// PyObject * PyGpuNdArray_New(int nd) { DPRINTF("PyGpuNdArray_New start\n"); PyGpuNdArrayObject *self = (PyGpuNdArrayObject *)PyGpuNdArrayType.tp_alloc(&PyGpuNdArrayType, 0); if (self == NULL) { PyErr_SetString(PyExc_RuntimeError, "PyGpuNdArray_New failed to allocate self"); return NULL; } PyGpuNdArray_null_init(self); if (nd == 0) { PyGpuNdArray_NDIM(self) = 0; } else if (nd > 0) { if (PyGpuNdArray_set_nd(self, nd)) { Py_DECREF(self); return NULL; } } ++_outstanding_mallocs[1]; DPRINTF("PyGpuNdArray_New end\n"); return (PyObject *)self; } int PyGpuNdArray_Check(const PyObject * ob) { DPRINTF("PyGpuNdArray_Check\n"); //TODO: doesn't work with inheritance return PyGpuNdArray_CheckExact(ob); } int PyGpuNdArray_CheckExact(const PyObject * ob) { DPRINTF("PyGpuNdArray_CheckExact\n"); return ((ob->ob_type == &PyGpuNdArrayType) ? 1 : 0); } static PyObject * PyGpuNdArray_as_c_contiguous(PyObject* dummy, PyObject* args, PyObject *kargs) { DPRINTF("PyGpuNdArray_as_c_contiguous:start\n"); static const char *kwlist[] = {"a", "dtype", NULL}; PyArray_Descr *typecode = NULL; PyObject *self_ = NULL; if (!PyArg_ParseTupleAndKeywords(args, kargs, "O|O&", (char **)kwlist, &self_, PyArray_DescrConverter, &typecode)) { Py_XDECREF(typecode); Py_XDECREF(self_); return NULL; } assert(typecode == NULL); if (!PyGpuNdArray_Check(self_)){ PyErr_SetString(PyExc_TypeError, "PyGpuNdArray_as_c_contiguous:" " PyGpuNdArrayObject required"); return NULL; } PyGpuNdArrayObject *self = (PyGpuNdArrayObject*)self_; if (PyGpuNdArray_is_c_contiguous(self)){ Py_INCREF(self); if (PyGpuNdArray_NDIM(self) == 0){ //numpy.ascontiguous() always return object with 1d. 
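// A 0-d array is returned as a 1-d view of length 1 whose stride equals the
// itemsize, mirroring numpy.ascontiguousarray on scalars.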
DPRINTF("PyGpuNdArray_as_c_contiguous: upcast to 1d tensor end\n"); PyObject * rval = PyGpuNdArray_View(self); if (!rval) return NULL; PyGpuNdArray_set_nd((PyGpuNdArrayObject*)rval, 1); PyGpuNdArray_DIM(rval, 0) = 1; PyGpuNdArray_STRIDE(rval, 0) = PyGpuNdArray_ITEMSIZE(rval); return rval; } DPRINTF("PyGpuNdArray_as_c_contiguous: no copy end\n"); return (PyObject*)self; } PyObject * ret = PyGpuNdArray_Copy(self); DPRINTF("PyGpuNdArray_as_c_contiguous: copy end\n"); return ret; } static PyObject * PyGpuNdArray_as_f_contiguous(PyObject* dummy, PyObject* args, PyObject *kargs) { DPRINTF("PyGpuNdArray_as_f_contiguous:start\n"); static const char *kwlist[] = {"a", "dtype", NULL}; PyArray_Descr *typecode = NULL; PyObject *self_ = NULL; if (!PyArg_ParseTupleAndKeywords(args, kargs, "O|O&", (char **)kwlist, &self_, PyArray_DescrConverter, &typecode)) { Py_XDECREF(typecode); Py_XDECREF(self_); return NULL; } assert(typecode == NULL); if (!PyGpuNdArray_Check(self_)){ PyErr_SetString(PyExc_TypeError, "PyGpuNdArray_as_f_contiguous:" " PyGpuNdArrayObject required"); return NULL; } PyGpuNdArrayObject *self = (PyGpuNdArrayObject*)self_; if (PyGpuNdArray_is_f_contiguous(self)){ Py_INCREF(self); if (PyGpuNdArray_NDIM(self) == 0){ //numpy.ascontiguous() always return object with 1d. PyObject * rval = PyGpuNdArray_View(self); if (!rval) return NULL; PyGpuNdArray_set_nd((PyGpuNdArrayObject*)rval, 1); PyGpuNdArray_DIM(rval, 0) = 1; PyGpuNdArray_STRIDE(rval, 0) = PyGpuNdArray_ITEMSIZE(rval); DPRINTF("PyGpuNdArray_as_f_contiguous: upcast to 1d tensor end\n"); return rval; } DPRINTF("PyGpuNdArray_as_f_contiguous: no copy end\n"); return (PyObject*)self; } PyObject * ret = PyGpuNdArray_Copy(self, NPY_FORTRANORDER); DPRINTF("PyGpuNdArray_as_f_contiguous: copy end\n"); return ret; } #ifdef WITH_OPENCL #ifdef __APPLE__ #include #else #include #endif extern void setup_context(cl_context c); PyObject * PyGpuNdArray_set_opencl_context(PyObject *mod, PyObject *ctx) { Py_ssize_t v; v = PyInt_AsSsize_t(ctx); if (v == -1 && PyErr_Occurred()) return NULL; setup_context((cl_context)v); Py_INCREF(Py_None); return Py_None; } #endif static PyMethodDef module_methods[] = { //{"dimshuffle", PyGpuNdArray_Dimshuffle, METH_VARARGS, "Returns the dimshuffle of a PyGpuNdArray."}, {"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"}, {"zeros", (PyCFunction)PyGpuNdArray_zeros, METH_VARARGS|METH_KEYWORDS, "Create a new PyGpuNdArray with specified shape, filled with zeros."}, {"empty", (PyCFunction)PyGpuNdArray_empty, METH_VARARGS|METH_KEYWORDS, "Create a new PyGpuNdArray with specified shape, filled with zeros."}, {"ascontiguousarray", (PyCFunction)PyGpuNdArray_as_c_contiguous, METH_VARARGS|METH_KEYWORDS, "If the array is not c contiguous, copy it to a new c contiguous region."}, {"asfortranarray", (PyCFunction)PyGpuNdArray_as_f_contiguous, METH_VARARGS|METH_KEYWORDS, "If the array is not f contiguous, copy it to a new c contiguous region."}, #ifdef WITH_OPENCL {"set_opencl_context", PyGpuNdArray_set_opencl_context, METH_O, "Set the OpenCL context to use for allocations and work."}, #endif {NULL, NULL, NULL, NULL} /* Sentinel */ }; #ifndef PyMODINIT_FUNC /* declarations for DLL import/export */ #define PyMODINIT_FUNC void #endif PyMODINIT_FUNC initpygpu_ndarray(void) { import_array(); PyObject* m; if (PyType_Ready(&PyGpuNdArrayType) < 0) return; m = Py_InitModule3("pygpu_ndarray", module_methods, "Example module that creates an extension type."); if (m == NULL) return; 
Py_INCREF(&PyGpuNdArrayType); PyModule_AddObject(m, "GpuNdArrayObject", (PyObject *)&PyGpuNdArrayType); #if COMPUTE_GPU_MEM_USED for(int i=0;i> sio, """ if (%(x)s->nd != %(nd_in)s) { PyErr_Format(PyExc_TypeError, "required nd=%(nd_in)s, got nd=%%i", %(x)s->nd); %(fail)s; } """ % locals() # # alloc an output if we need one # # check the basics of out output print >> sio, """ if ( !%(z)s || (%(z)s->nd != %(nd_out)s) """ % locals() #ensure that the output has the right non-reduced dimensions j = 0 for i in xrange(nd_in): if not self.reduce_mask[i]: print >> sio, (" || (CudaNdarray_HOST_DIMS(%(z)s)[%(j)s] !=" "CudaNdarray_HOST_DIMS(%(x)s)[%(i)s]) " % locals()) j += 1 print >> sio, """ ) { """ % locals() print >> sio, "int new_dims[%(nd_out)s]; " % locals() j = 0 for i in xrange(nd_in): if not self.reduce_mask[i]: print >> sio, ('new_dims[%(j)s] = CudaNdarray_HOST_DIMS' '(%(x)s)[%(i)s];' % locals()) j += 1 print >> sio, """ Py_XDECREF(%(z)s); %(z)s = (CudaNdarray*) CudaNdarray_NewDims(%(nd_out)s, new_dims); if (NULL == %(z)s) { PyErr_Format(PyExc_RuntimeError, "Failed to allocate output"); %(fail)s; } } """ % locals() # \begin bracket the reduction in a check that there is # actually work to do print >> sio, """ if (CudaNdarray_SIZE(%(z)s)) { """ % locals() # # Now perform the reduction # if all(i == 1 for i in self.reduce_mask): #check if the tensor is ccontiguous, if true, use the #c_c0de_reduce_ccontig code. #TODO: check if we are ccontiguous when we un-dimshuffle #TODO: if only some dims are ccontiguous, call version # with less dims. print >> sio, 'if(CudaNdarray_is_c_contiguous(%(x)s)){' % locals() self.c_code_reduce_ccontig(sio, node, name, x, z, fail) print >> sio, "}else{" getattr(self, 'c_code_reduce_%s' % (''.join( str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail) print >> sio, "}" else: getattr(self, 'c_code_reduce_%s' % (''.join( str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail) # \end bracket the reduction ... print >> sio, """ } """ % locals() return sio.getvalue() def _makecall(self, node, name, x, z, fail, pattern=None): """Return a string for making a kernel call. The return value looks something like: .. code-block:: c if (verbose) printf("running kernel_reduce_sum_10_%(name)s\\n"); int n_shared = sizeof(%(dtype)s) * n_threads.x; kernel_reduce_sum_10_%(name)s<<>>( CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[1], CudaNdarray_DEV_DATA(%(z)s), CudaNdarray_HOST_STRIDES(%(z)s)[0] ); CNDA_THREAD_SYNC; if (cudaSuccess != cudaGetLastError()) { PyErr_Format(PyExc_RuntimeError, "Cuda error: ... 
); %(fail)s; } """ sio = StringIO.StringIO() if pattern is None: pattern = ''.join(str(c) for c in self.reduce_mask) ndim = len(self.reduce_mask) nd_out = ndim - sum(self.reduce_mask) print >> sio, """ if (verbose) printf("running kernel_reduce_sum_%(pattern)s_%(name)s\\n"); int n_shared = sizeof(%(dtype)s) * n_threads.x * n_threads.y * n_threads.z; if (verbose>1) printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d," " nb_threads=%%d, n_blocks.x=%%d, n_blocks.y=%%d," " nb_block=%%d, n_shared=%%d\\n", n_threads.x,n_threads.y,n_threads.z, n_threads.x*n_threads.y*n_threads.z, n_blocks.x,n_blocks.y, n_blocks.x*n_blocks.y, n_shared); kernel_reduce_sum_%(pattern)s_%(name)s<<>>( """ % locals() for i in xrange(ndim): print >> sio, """ CudaNdarray_HOST_DIMS(%(x)s)[%(i)s], """ % locals() print >> sio, """ CudaNdarray_DEV_DATA(%(x)s) """ % locals() for i in xrange(ndim): print >> sio, """ ,CudaNdarray_HOST_STRIDES(%(x)s)[%(i)s] """ % locals() print >> sio, """ ,CudaNdarray_DEV_DATA(%(z)s) """ % locals() for i in xrange(nd_out): print >> sio, """ ,CudaNdarray_HOST_STRIDES(%(z)s)[%(i)s] """ % locals() print >> sio, """ ); CNDA_THREAD_SYNC; cudaError_t sts = cudaGetLastError(); if (cudaSuccess != sts) { PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n", "kernel_reduce_sum_%(pattern)s_%(name)s", cudaGetErrorString(sts), n_blocks.x, n_blocks.y, n_threads.x, n_threads.y, n_threads.z); %(fail)s; } """ % locals() return sio.getvalue() def _k_decl(self, nodename, pattern=None, ndim=None, reduce_mask=None): """Return a string to declare a kernel function .. code-block:: c __global__ void kernel_reduce_sum_110_%(nodename)s( const int d0, const int d1, const int d2, const %(dtype)s *A, const int sA0, const int sA1, const int sA2, %(dtype)s * Z, const int sZ0) """ dtype = self.dtype if reduce_mask is None: reduce_mask = self.reduce_mask if ndim is None: ndim = len(reduce_mask) if pattern is None: pattern = ''.join(str(i) for i in reduce_mask) sio = StringIO.StringIO() print >> sio, """ __global__ void kernel_reduce_sum_%(pattern)s_%(nodename)s( """ % locals() for i in xrange(ndim): print >> sio, """const int d%(i)s,""" % locals() print >> sio, """const %(dtype)s *A,""" % locals() for i in xrange(ndim): print >> sio, """const int sA%(i)s,""" % locals() print >> sio, """%(dtype)s * Z""" % locals() for i in xrange(ndim - sum(reduce_mask)): print >> sio, """, const int sZ%(i)s""" % locals() print >> sio, ")" return sio.getvalue() def _k_init(self, *args): dtype = self.dtype return """ const int threadCount = blockDim.x * blockDim.y * blockDim.z; const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; extern __shared__ %(dtype)s buf[]; %(dtype)s mysum = 0.0f; if (warpSize != 32){ //TODO: set error code Z[0] = 666; return; } """ % locals() def _k_reduce_buf(self, z_pos): return """ __syncthreads(); // some kernel do multiple reduction. 
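// Each thread publishes its partial sum to shared memory; after the barrier a
// single warp folds all the partials and thread 0 writes the final result out.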
buf[threadNum] = mysum; __syncthreads(); // rest of function is handled by one warp if (threadNum < warpSize) { //round up all the partial sums into the first `warpSize` elements for (int i = threadNum + warpSize; i < threadCount; i += warpSize) { mysum += buf[i]; } buf[threadNum] = mysum; if (threadNum < 16) { //reduce so that threadNum 0 has the sum of everything if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16]; if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8]; if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4]; if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2]; if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1]; if (threadNum == 0) { %(z_pos)s = buf[0]; } } } """ % locals() return """ __syncthreads(); // some kernel do multiple reduction. buf[threadNum] = mysum; __syncthreads(); // rest of function is handled by one warp if (threadNum < warpSize) { //round up all the partial sums into the first `warpSize` elements for (int i = threadNum + warpSize; i < threadCount; i += warpSize) { mysum += buf[i]; } buf[threadNum] = mysum; /*Comment this optimization as it don't work on Fermi GPU. TODO: find why it don't work or put the GPU compute capability into the version // no sync because only one warp is running if(threadCount >32) { buf[threadNum] += buf[threadNum+16]; buf[threadNum] += buf[threadNum+8]; buf[threadNum] += buf[threadNum+4]; buf[threadNum] += buf[threadNum+2]; buf[threadNum] += buf[threadNum+1]; if (threadNum == 0) { %(z_pos)s = buf[0]; } } else */ if (threadNum < 16) { //reduce so that threadNum 0 has the sum of everything if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16]; if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8]; if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4]; if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2]; if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1]; if (threadNum == 0) { %(z_pos)s = buf[0]; } } } """ % locals() # Threads must be organized as: threadNum%nb_reduce correspond to # the same sum # nb_reduce<=warpSize def _k_reduce_buf_multiple(self, z_pos, nb_reduce): return """ __syncthreads(); // some kernel do multiple reduction. buf[threadNum] = mysum; __syncthreads(); // rest of function is handled by one warp if (threadNum < %(nb_reduce)s) { //round up all the partial sums into the first `nb_reduce` elements for (int i = threadNum + %(nb_reduce)s; i < threadCount; i += %(nb_reduce)s) { mysum += buf[i]; } %(z_pos)s = mysum; } """ % locals() def c_code_reduce_ccontig(self, sio, node, name, x, z, fail): print >> sio, """ { if(CudaNdarray_SIZE(%(x)s)==0){ cudaMemset(CudaNdarray_DEV_DATA(%(z)s),0,sizeof(%(dtype)s)); }else{ int verbose = 0; dim3 n_threads( std::min(CudaNdarray_SIZE(%(x)s), NUM_VECTOR_OP_THREADS_PER_BLOCK)); dim3 n_blocks(1); if (verbose) printf("running kernel_reduce_sum_ccontig_%(name)s" " n_threads.x=%%d, size=%%d, ndim=%%d\\n", n_threads.x,CudaNdarray_SIZE(%(x)s),%(x)s->nd); int n_shared = sizeof(%(dtype)s) * n_threads.x; kernel_reduce_sum_ccontig_%(name)s<<>>( CudaNdarray_SIZE(%(x)s), CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_DEV_DATA(%(z)s)); CNDA_THREAD_SYNC; cudaError_t sts = cudaGetLastError(); if (cudaSuccess != sts) { PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. 
(grid: %%i x %%i;" " block: %%i x %%i x %%i)\\n", "kernel_reduce_sum_ccontig_%(name)s", cudaGetErrorString(sts), n_blocks.x, n_blocks.y, n_threads.x, n_threads.y, n_threads.z); %(fail)s; } } } """ % locals() def c_code_reduce_1(self, sio, node, name, x, z, fail): makecall = self._makecall(node, name, x, z, fail) print >> sio, """ { int verbose = 0; dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], NUM_VECTOR_OP_THREADS_PER_BLOCK)); dim3 n_blocks(1); %(makecall)s } """ % locals() def c_code_reduce_11(self, sio, node, name, x, z, fail): makecall = self._makecall(node, name, x, z, fail) print >> sio, """ { int verbose = 0; dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], NUM_VECTOR_OP_THREADS_PER_BLOCK)); while (n_threads.y * n_threads.x <= NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.y; n_threads.y -= 1; if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[0]) n_threads.y = CudaNdarray_HOST_DIMS(%(x)s)[0]; dim3 n_blocks(1); %(makecall)s } """ % locals() def c_code_reduce_01X(self, sio, node, name, x, z, fail, N): """ :param N: the number of 1 in the pattern N=1 -> 01, N=2 -> 011, N=3 ->0111 Work for N=1,2,3 """ assert N in [1, 2, 3] makecall = self._makecall(node, name, x, z, fail) N_pattern = ''.join(['1'] * N) param_dim = ",".join(["CudaNdarray_HOST_DIMS(%(x)s)[%(i)s]" % locals() for i in xrange(N + 1)]) strides_dim = ",".join( ["CudaNdarray_HOST_STRIDES(%(x)s)[%(i)s]" % locals() for i in xrange(N + 1)]) threads_y = """ //get as many y threads as we can fit while (n_threads.x * (n_threads.y+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK) { if (n_threads.y < CudaNdarray_HOST_DIMS(%(x)s)[%(N)s-1]) n_threads.y += 1; else break; } """ % locals() threads_z = """ //get as many z threads as we can fit while (n_threads.x * n_threads.y * (n_threads.z+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK) { if (n_threads.z < CudaNdarray_HOST_DIMS(%(x)s)[%(N)s-2]) n_threads.z += 1; else break; } """ % locals() if len(self.reduce_mask) == 2: threads_y = '' threads_z = '' if len(self.reduce_mask) == 3: threads_z = '' print >> sio, """ { int verbose = 0; dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[%(N)s], NUM_VECTOR_OP_THREADS_PER_BLOCK)); %(threads_y)s %(threads_z)s dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], NUM_VECTOR_OP_BLOCKS)); %(makecall)s } """ % locals() def c_code_reduce_01(self, sio, node, name, x, z, fail): self.c_code_reduce_01X(sio, node, name, x, z, fail, 1) def c_code_reduce_011(self, sio, node, name, x, z, fail): self.c_code_reduce_01X(sio, node, name, x, z, fail, 2) def c_code_reduce_0111(self, sio, node, name, x, z, fail): self.c_code_reduce_01X(sio, node, name, x, z, fail, 3) def c_code_reduce_10(self, sio, node, name, x, z, fail): print >> sio, """ { int verbose = 0; dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], NUM_VECTOR_OP_THREADS_PER_BLOCK)); dim3 n_blocks(1, std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], NUM_VECTOR_OP_BLOCKS)); if (verbose) { fprintf(stderr, "running kernel_reduce_sum_10_%(name)s n_blocks=(%%i,%%i)\\n", n_blocks.x, n_blocks.y); } assert(CudaNdarray_HOST_DIMS(%(x)s)[1] == CudaNdarray_HOST_DIMS(%(z)s)[0]); int n_shared = sizeof(%(dtype)s) * n_threads.x; kernel_reduce_sum_010_%(name)s<<>>( 1, CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_DEV_DATA(%(x)s), 1, CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[1], CudaNdarray_DEV_DATA(%(z)s), 1, CudaNdarray_HOST_STRIDES(%(z)s)[0] ); CNDA_THREAD_SYNC; cudaError_t sts = cudaGetLastError(); if (cudaSuccess != sts) { PyErr_Format(PyExc_RuntimeError, "Cuda 
error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n", "kernel_reduce_sum_010_%(name)s", cudaGetErrorString(sts), n_blocks.x, n_blocks.y, n_threads.x, n_threads.y, n_threads.z); %(fail)s; } } """ % locals() def c_code_reduce_010(self, sio, node, name, x, z, fail): makecall = self._makecall(node, name, x, z, fail) makecall_inner = self._makecall(node, name, x, z, fail, pattern="010_inner") pattern = ''.join(str(i) for i in self.reduce_mask) print >> sio, """ { // if the alternative is less buggy, consider not using this branch if (1) { // If there are a lot of summations to do, then we can use // simple parallelization - use each thread to do one sum. // we might as well launch blocks of 32 threads because that's // the warp size. we could schedule more threads if we were // maxing out the gridsize below, but the gridsize is way more // than the physical hardware and I think 32 threads // on a huge grid is enough to fully use the hardware. dim3 n_threads(32,1,1); // We kindof reshape the input implicitly to something 4D: // the shape A,B,C -> A, B, D, E // where C <= D*E < C+32 // where E==32 int A = CudaNdarray_HOST_DIMS(%(x)s)[0]; int B = CudaNdarray_HOST_DIMS(%(x)s)[1]; int C = CudaNdarray_HOST_DIMS(%(x)s)[2]; int D = C/32; if (32*D < C) D+= 1; assert ((C <= 32*D) && (32*D < C+32)); // The gridsize would ideally be (A, D). But we do the // following logic to make sure we don't ask for a grid that // is too big. dim3 n_blocks(A,D); if (n_blocks.x > NUM_VECTOR_OP_BLOCKS) n_blocks.x = NUM_VECTOR_OP_BLOCKS; if (n_blocks.x*n_blocks.y > NUM_VECTOR_OP_BLOCKS) n_blocks.y = NUM_VECTOR_OP_BLOCKS/n_blocks.x; int n_shared = 0; kernel_reduce_sum_010_AD_%(name)s<<>>( A,B,C,D, CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[1], CudaNdarray_HOST_STRIDES(%(x)s)[2], CudaNdarray_DEV_DATA(%(z)s), CudaNdarray_HOST_STRIDES(%(z)s)[0], CudaNdarray_HOST_STRIDES(%(z)s)[1] ); CNDA_THREAD_SYNC; cudaError_t sts = cudaGetLastError(); if (cudaSuccess != sts) { PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n", "kernel_reduce_sum_010_%(name)s", cudaGetErrorString(sts), n_blocks.x, n_blocks.y, n_threads.x, n_threads.y, n_threads.z); %(fail)s; } } else { int verbose = 2; dim3 n_threads(std::min(32,CudaNdarray_HOST_DIMS(%(x)s)[2])); while((n_threads.x*(n_threads.y+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK) && (n_threads.y1) printf("n_block.x.1=%%d, n_block.x.2=%%d," " n_block.y.1=%%d, n_block.y.2=%%d,\\n", CudaNdarray_HOST_DIMS(%(x)s)[0], NUM_VECTOR_OP_BLOCKS, ceil_intdiv(CudaNdarray_HOST_DIMS(%(x)s)[2], (int)n_threads.x), (int)(NUM_VECTOR_OP_BLOCKS / n_blocks.x)); assert(n_threads.x<=32); %(makecall_inner)s }else{ n_threads.x = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], (int)NUM_VECTOR_OP_THREADS_PER_BLOCK); n_blocks.x = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], (int)NUM_VECTOR_OP_BLOCKS); n_blocks.y = std::min( CudaNdarray_HOST_DIMS(%(x)s)[2], (int)(NUM_VECTOR_OP_BLOCKS / n_blocks.x) ); %(makecall)s } CNDA_THREAD_SYNC; cudaError_t sts = cudaGetLastError(); if (cudaSuccess != sts) { PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. 
(grid: %%i x %%i; block: %%i x %%i x %%i)\\n", "kernel_reduce_sum_%(pattern)s_%(name)s", cudaGetErrorString(sts), n_blocks.x, n_blocks.y, n_threads.x, n_threads.y, n_threads.z); %(fail)s; } } } """ % locals() def c_code_reduce_0101(self, sio, node, name, x, z, fail): makecall = self._makecall(node, name, x, z, fail) print >> sio, """ { int verbose = 0; dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[3], NUM_VECTOR_OP_THREADS_PER_BLOCK)); while (n_threads.x * n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK) { if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[1]) break; n_threads.y += 1; } n_threads.y -= 1; dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[2]); %(makecall)s } """ % locals() def c_code_reduce_100(self, sio, node, name, x, z, fail): makecall = self._makecall(node, name, x, z, fail) # use threadIdx.x for i0 # use blockIdx.x for i1 # use blockIdx.y for i2 print >> sio, """ { int verbose = 0; dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], NUM_VECTOR_OP_THREADS_PER_BLOCK)); dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]); while (n_blocks.x * (n_blocks.y+1) <= NUM_VECTOR_OP_BLOCKS && n_blocks.y <= CudaNdarray_HOST_DIMS(%(x)s)[2]) { n_blocks.y += 1; } %(makecall)s } """ % locals() def c_code_reduce_110(self, sio, node, name, x, z, fail): makecall = self._makecall(node, name, x, z, fail) print >> sio, """ { int verbose = 0; dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], NUM_VECTOR_OP_THREADS_PER_BLOCK)); while (n_threads.x*n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK) { if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[0]) break; n_threads.y += 1; } n_threads.y -= 1; dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[2]); %(makecall)s } """ % locals() def c_code_reduce_001(self, sio, node, name, x, z, fail): makecall = self._makecall(node, name, x, z, fail) print >> sio, """ { int verbose = 0; dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[2], NUM_VECTOR_OP_THREADS_PER_BLOCK)); dim3 n_blocks( std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], NUM_VECTOR_OP_BLOCKS)); while (n_blocks.x * n_blocks.y <= NUM_VECTOR_OP_BLOCKS) { if (n_blocks.y > CudaNdarray_HOST_DIMS(%(x)s)[1]) break; n_blocks.y += 1; } n_blocks.y -= 1; %(makecall)s } """ % locals() def c_code_reduce_111(self, sio, node, name, x, z, fail): makecall = self._makecall(node, name, x, z, fail) print >> sio, """ { int verbose = 0; dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[2], NUM_VECTOR_OP_THREADS_PER_BLOCK)); //get as many y threads as we can fit while (n_threads.x * n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK) { if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[1]) break; n_threads.y += 1; } n_threads.y -= 1; //get as many z threads as we can fit while (n_threads.x * n_threads.y * n_threads.z <= NUM_VECTOR_OP_THREADS_PER_BLOCK) { if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0]) break; n_threads.z += 1; } n_threads.z -= 1; dim3 n_blocks(1,1,1); %(makecall)s } """ % locals() def c_code_reduce_0011(self, sio, node, name, x, z, fail): makecall = self._makecall(node, name, x, z, fail) print >> sio, """ { int verbose = 0; dim3 n_blocks( std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], NUM_VECTOR_OP_BLOCKS)); while (n_blocks.x * n_blocks.y <= NUM_VECTOR_OP_BLOCKS && n_blocks.y < CudaNdarray_HOST_DIMS(%(x)s)[1]) { n_blocks.y += 1; } dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[3], NUM_VECTOR_OP_THREADS_PER_BLOCK)); while (n_threads.x * n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK && n_threads.y < CudaNdarray_HOST_DIMS(%(x)s)[2] && n_threads.x * n_threads.y * sizeof(%(dtype)s) <= 
(15 * 1024 - 200)) { n_threads.y += 1; } %(makecall)s } """ % locals() def c_code_reduce_1111(self, sio, node, name, x, z, fail): makecall = self._makecall(node, name, x, z, fail) print >> sio, """ { int verbose = 0; dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[2], NUM_VECTOR_OP_THREADS_PER_BLOCK)); //get as many y threads as we can fit while (n_threads.x * n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK) { if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[1]) break; n_threads.y += 1; } n_threads.y -= 1; //get as many z threads as we can fit while (n_threads.x * n_threads.y * n_threads.z <= NUM_VECTOR_OP_THREADS_PER_BLOCK) { if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0]) break; n_threads.z += 1; } n_threads.z -= 1; dim3 n_blocks(1,1,1); %(makecall)s } """ % locals() def c_code_reduce_1011(self, sio, node, name, x, z, fail): makecall = self._makecall(node, name, x, z, fail) print >> sio, """ { int verbose = 0; dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[3], NUM_VECTOR_OP_THREADS_PER_BLOCK)); while (n_threads.x * (n_threads.y+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.y; if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[2]) n_threads.y = CudaNdarray_HOST_DIMS(%(x)s)[2]; while (n_threads.x * n_threads.y * (n_threads.z+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.z; if (n_threads.z > 64) n_threads.z = 64; if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0]) n_threads.z = CudaNdarray_HOST_DIMS(%(x)s)[0]; dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]); %(makecall)s } """ % locals() def c_code_cache_version(self): return (21,) def c_support_code_apply(self, nodename, contig=False): sio = StringIO.StringIO() nd_in = len(self.reduce_mask) dtype = self.dtype if contig: # all(i == 1 for i in self.reduce_mask): #this kernel is ok for up to a few thousand elements, but # it only runs on ONE multiprocessor reducebuf = self._k_reduce_buf('Z[0]') print >> sio, """ __global__ void kernel_reduce_sum_ccontig_%(nodename)s( const int d0, const %(dtype)s *A, %(dtype)s * Z) { const int threadCount = blockDim.x; const int threadNum = threadIdx.x; extern __shared__ %(dtype)s buf[]; %(dtype)s mysum = 0.0f; if (warpSize != 32) { return; //TODO: set error code } for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x) { mysum += A[i0]; } %(reducebuf)s } """ % locals() if self.reduce_mask == (1,): #this kernel is ok for up to a few thousand elements, but # it only runs on ONE multiprocessor reducebuf = self._k_reduce_buf('Z[0]') decl = self._k_decl(nodename) print >> sio, """ %(decl)s { const int threadCount = blockDim.x; const int threadNum = threadIdx.x; extern __shared__ %(dtype)s buf[]; %(dtype)s mysum = 0.0f; if (warpSize != 32) { return; //TODO: set error code } for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x) { %(dtype)s Ai = A[i0 * sA0]; mysum += Ai; } %(reducebuf)s } """ % locals() if self.reduce_mask == (1, 1): #this kernel is ok for up to a few thousand elements, but # it only runs on ONE multiprocessor reducebuf = self._k_reduce_buf('Z[0]') decl = self._k_decl(nodename) init = self._k_init(nodename) print >> sio, decl print >> sio, " { " print >> sio, init print >> sio, """ for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y) { for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x) { %(dtype)s Ai = A[i0 * sA0 + i1 * sA1]; mysum += Ai; } } """ % locals() print >> sio, reducebuf print >> sio, " } " #01, 011, 0111 if (0 == self.reduce_mask[0] and all(self.reduce_mask[1:]) and nd_in in[2, 3, 4]): # this kernel uses one block for each row. 
# threads per block for each element per row. N_pattern = ''.join(['1'] * (nd_in - 1)) if nd_in == 2: for_i1 = "for(int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)" for_i2 = "int i2=0, sA2=0;" for_i3 = "int i3=0, sA3=0;" if nd_in == 3: for_i1 = "for(int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)" for_i2 = "for(int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)" for_i3 = "int i3=0, sA3=0;" if nd_in == 4: for_i1 = "for(int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z)" for_i2 = "for(int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)" for_i3 = "for(int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)" reducebuf = self._k_reduce_buf('Z[i0 * sZ0]') param_dim = ",".join(["const int d%(i)s" % locals() for i in xrange(nd_in)]) param_strides = ",".join(["const int sA%(i)s" % locals() for i in xrange(nd_in)]) decl = self._k_decl(nodename) init = self._k_init(nodename) print >> sio, """ %(decl)s{ %(init)s for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){ mysum = 0; %(for_i1)s{ %(for_i2)s{ %(for_i3)s{ %(dtype)s Ai = A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0]; mysum += Ai; } } } %(reducebuf)s } } """ % locals() if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0): # this kernel uses one block for each column, # threads per block for each element per column. #TODO: This kernel is pretty inefficient in terms of # reading, because if A is c_contiguous (typical # case) then each warp is accessing non-contigous # memory (a segment of a column). reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2*sZ1]') print >> sio, """ __global__ void kernel_reduce_sum_010_%(nodename)s( const int d0, const int d1, const int d2, const %(dtype)s *A, const int sA0, const int sA1, const int sA2, %(dtype)s * Z, const int sZ0, const int sZ1) { const int threadCount = blockDim.x; const int threadNum = threadIdx.x; extern __shared__ %(dtype)s buf[]; if (warpSize != 32) { return; //TODO: set error code } for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x) { for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y) { %(dtype)s mysum = 0.0f; for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x) { mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2]; } %(reducebuf)s } } } """ % locals() if self.reduce_mask == (0, 1, 0): print >> sio, """ __global__ void kernel_reduce_sum_010_AD_%(nodename)s( const int A, const int B, const int C, const int D, //const int E, // THIS is 32 const %(dtype)s *X, const int sX0, const int sX1, const int sX2, %(dtype)s * Z, const int sZ0, const int sZ1) { const int threadCount = blockDim.x; const int threadNum = threadIdx.x; %(dtype)s mysum = 0.0f; if (warpSize != 32) { return; //TODO: set error code } for (int a = blockIdx.x; a < A; a += gridDim.x) { for (int i2_D = blockIdx.y; i2_D < D; i2_D += gridDim.y) { int c = i2_D * 32 + threadIdx.x; if (c < C) { mysum = 0; for (int b = 0; b < B; ++b) { mysum += X[a * sX0 + b * sX1 + c * sX2]; } Z[a * sZ0 + c * sZ1] = mysum; } } } } """ % locals() if self.reduce_mask == (0, 1, 0): # # This kernel is optimized when the inner most dimensions # have the smallest stride. # this kernel uses one block for multiple column(up to 32TODO), # threads per block for each element per column. 
#thread.x = dim 2 contiguous #thread.y = dim 1 #block.x = dim 0 #block.y = dim 1 rest init = self._k_init(nodename) decl = self._k_decl(nodename, pattern="010_inner") reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]', 'blockDim.x') reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]', 'blockDim.x') print >> sio, """ %(decl)s { if(warpSize> sio, """ __global__ void kernel_reduce_sum_110_%(nodename)s( const int d0, const int d1, const int d2, const %(dtype)s *A, const int sA0, const int sA1, const int sA2, %(dtype)s * Z, const int sZ0) { const int threadCount = blockDim.x * blockDim.y; const int threadNum = threadIdx.y * blockDim.x + threadIdx.x; extern __shared__ %(dtype)s buf[]; %(dtype)s mysum = 0.0f; if (warpSize != 32) { //TODO: set error code Z[blockIdx.x * sZ0] = 666; return; } for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y) { for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x) { %(dtype)s Ai = A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2]; mysum += Ai; } } %(reducebuf)s } """ % locals() if self.reduce_mask == (1, 0, 0): reducebuf = self._k_reduce_buf('Z[i1 * sZ0 + i2 * sZ1]') decl = self._k_decl(nodename) init = self._k_init(nodename) print >> sio, """ %(decl)s { %(init)s for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y) { for (int i1 = blockIdx.x; i1 < d1; i1 += gridDim.x) { mysum = 0; for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x) { mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2]; } %(reducebuf)s } } } """ % locals() if self.reduce_mask == (1, 1, 1): reducebuf = self._k_reduce_buf('Z[0]') decl = self._k_decl(nodename) init = self._k_init(nodename) print >> sio, """ %(decl)s { %(init)s for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z) { for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y) { for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x) { mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2]; } } } """ % locals() print >> sio, reducebuf, "}" if self.reduce_mask == (0, 0, 1): # this kernel uses one block for each row, # threads per block for each element per row. reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]') print >> sio, """ __global__ void kernel_reduce_sum_001_%(nodename)s( const int d0, const int d1, const int d2, const %(dtype)s *A, const int sA0, const int sA1, const int sA2, %(dtype)s * Z, const int sZ0, const int sZ1) { const int threadCount = blockDim.x; const int threadNum = threadIdx.x; extern __shared__ %(dtype)s buf[]; if (warpSize != 32) { return; //TODO: set error code } for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x) { for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y) { %(dtype)s mysum = 0.0f; for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x) { mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2]; } %(reducebuf)s } } } """ % locals() if self.reduce_mask == (0, 0, 1, 1): # this kernel uses one block for each row, # threads per block for each element per row. reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]') decl = self._k_decl(nodename) init = self._k_init(nodename) print >> sio, """ %(decl)s { %(init)s for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x) { for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y) { %(dtype)s mysum = 0.0f; for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y) { for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x) { mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]; } } %(reducebuf)s } } } """ % locals() if self.reduce_mask == (0, 1, 0, 1): # this kernel uses one block for each row, # threads per block for each element per row. 
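# For this (0, 1, 0, 1) pattern the kept axes (0 and 2) are mapped onto the
# grid (blockIdx.x / blockIdx.y) and the reduced axes (1 and 3) onto the
# threads of each block (threadIdx.y / threadIdx.x), so every block
# accumulates one output element Z[i0 * sZ0 + i2 * sZ1] before the
# shared-memory reduction emitted by _k_reduce_buf finishes it off.
# Rough illustration (assumed numbers, taking
# NUM_VECTOR_OP_THREADS_PER_BLOCK == 256): for an input of shape
# (8, 100, 6, 32) the matching host launcher, c_code_reduce_0101, picks
#     n_blocks  = (8, 6)    # one block per (i0, i2) output element
#     n_threads = (32, 8)   # x covers i3, y then grows to cover part of i1
# so 256 threads cooperate on each of the 48 output sums.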
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2 * sZ1]') decl = self._k_decl(nodename) init = self._k_init(nodename) print >> sio, """ %(decl)s { %(init)s for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x) { for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y) { %(dtype)s mysum = 0.0f; for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y) { for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x) { mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]; } } %(reducebuf)s } } } """ % locals() if self.reduce_mask == (1, 1, 1, 1): reducebuf = self._k_reduce_buf('Z[0]') decl = self._k_decl(nodename) init = self._k_init(nodename) print >> sio, """ %(decl)s { %(init)s mysum = 0; for (int i0 = 0; i0 < d0; i0++) for (int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z) { for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y) { for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x) { mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]; } } } %(reducebuf)s } """ % locals() if self.reduce_mask == (1, 0, 1, 1): reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]') print >> sio, """ __global__ void kernel_reduce_sum_1011_%(nodename)s( const int d0, const int d1, const int d2, const int d3, const %(dtype)s *A, const int sA0, const int sA1, const int sA2, const int sA3, %(dtype)s * Z, const int sZ0) { const int threadCount = blockDim.x * blockDim.y * blockDim.z; const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; extern __shared__ %(dtype)s buf[]; %(dtype)s mysum = 0.0f; if (warpSize != 32) { return; //TODO: set error code } for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z) { for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y) { for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x) { %(dtype)sy Ai = A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3]; mysum += Ai; } } } %(reducebuf)s } """ % locals() return sio.getvalue() pyopencl-2013.2/pyopencl/compyte/ndarray/test_gpu_elemwise.py0000644000175000000500000004374512245716342023234 0ustar tomussrc# TODO: test other dtype import numpy import theano import pygpu_ndarray as gpu_ndarray from gen_elemwise import MyGpuNdArray, elemwise_collapses from test_gpu_ndarray import (dtypes_all, enable_double, gen_gpu_nd_array, product) def rand(shape, dtype): r = numpy.random.randn(*shape) * 10 if dtype.startswith("u"): r = numpy.absolute(r) return r.astype(dtype) # numpy.allclose seam to have problem with int8... 
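# The helper below therefore also accepts results whose maximum absolute
# difference from the reference is exactly zero, which is what integer
# elemwise results are expected to give.  Tiny illustration (assumed values):
#
#     >>> x = numpy.array([100, -100], dtype='int8')
#     >>> numpy.absolute(x - x).max() == 0
#     True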
def all_close(x, y): return (numpy.allclose(x, y) or numpy.absolute(x - y).max() == 0) def test_elemwise_collapse(): """ Test collapsing under many broadcast and strided pattern """ for dtype1 in ["int16", "float32", "int8"]: for dtype2 in ["int16", "float32", "int8"]: for shape1_, shape2_, expected in [ # 1d to test this special case ((40,), (40,), 0), ((40,), (1,), 1), # No broadcastable dimensions ((4, 5, 6, 9), (4, 5, 6, 9), 0), # All inputs have one(and the same) broadcastable dimension ((1, 4, 5, 9), (1, 4, 5, 9), 0), ((4, 1, 5, 9), (4, 1, 5, 9), 0), ((4, 5, 1, 9), (4, 5, 1, 9), 0), ((4, 5, 9, 1), (4, 5, 9, 1), 0), # One inputs have one broadcastable dimension ((1, 5, 6, 9), (4, 5, 6, 9), 2), ((4, 1, 6, 9), (4, 5, 6, 9), 3), ((4, 5, 1, 9), (4, 5, 6, 9), 3), ((4, 5, 6, 1), (4, 5, 6, 9), 2), # One inputs have two broadcastable dimension ((1, 1, 6, 9), (4, 5, 6, 9), 2), ((1, 5, 1, 9), (4, 5, 6, 9), 4), ((1, 5, 6, 1), (4, 5, 6, 9), 3), ((4, 1, 1, 9), (4, 5, 6, 9), 3), ((4, 1, 6, 1), (4, 5, 6, 9), 4), ((4, 5, 1, 1), (4, 5, 6, 9), 2), # One inputs have tree broadcastable dimension ((1, 1, 1, 9), (4, 5, 6, 9), 2), ((1, 1, 6, 1), (4, 5, 6, 9), 3), ((1, 5, 1, 1), (4, 5, 6, 9), 3), ((4, 1, 1, 1), (4, 5, 6, 9), 2), # One scalar ((1, 1, 1, 1), (4, 5, 6, 9), 1), # One scalar, the other 1 broadcast dims ((1, 1, 1, 1), (4, 5, 6, 1), 1), ]: scalar_cpu = rand((1,) * len(shape1_), dtype=dtype1) scalar_gpu = gpu_ndarray.GpuNdArrayObject(scalar_cpu) scalar_gpu1 = MyGpuNdArray(scalar_gpu) for shape1, shape2 in [(shape1_, shape2_), (shape2_, shape1_)]: a_cpu = rand(shape1, dtype=dtype1) a = gpu_ndarray.GpuNdArrayObject(a_cpu) a1 = MyGpuNdArray(a) b_cpu = rand(shape2, dtype=dtype2) b = gpu_ndarray.GpuNdArrayObject(b_cpu) b1 = MyGpuNdArray(b) assert len(shape1) == len(shape2) o_shape = [] for i in range(len(shape1)): o_shape.append(max(shape1[i], shape2[i])) o = gpu_ndarray.empty(o_shape, dtype=(a_cpu + b_cpu).dtype) # 1.1 Check direct collapse nd_collaps, info = elemwise_collapses([a, b], [o]) assert nd_collaps == expected, (shape1, shape2, nd_collaps, expected, info) # 1.2 Check computation are still valid f = MyGpuNdArray.gen_fct(theano.tensor.add, [a1, b1], len(shape1)) out = f([a1, b1]) out2 = f([a1, b1], out=out) assert out is out2 assert numpy.allclose(numpy.asarray(f([a1, b1])), a_cpu + b_cpu) assert numpy.allclose(numpy.asarray( MyGpuNdArray.adds(a1, b1)), a_cpu + b_cpu) assert numpy.allclose(numpy.asarray( MyGpuNdArray.add(a1, b1)), a_cpu + b_cpu) assert MyGpuNdArray.add(a1, b1, out=out2) is out2 # 1.3 Check work without collaping f = MyGpuNdArray.gen_fct(theano.tensor.add, [a1, b1], len(shape1), collapse=False) out = f([a1, b1]) out2 = f([a1, b1], out=out) assert out is out2 assert numpy.allclose(numpy.asarray(f([a1, b1])), a_cpu + b_cpu) assert numpy.allclose(numpy.asarray(MyGpuNdArray.adds( a1, b1)), a_cpu + b_cpu) assert numpy.allclose(numpy.asarray(MyGpuNdArray.add( a1, b1)), a_cpu + b_cpu) assert MyGpuNdArray.add(a1, b1, out=out2) is out2 # 2.1 What if we add a scalar? nd_collaps, info = elemwise_collapses( [a, b, scalar_gpu], [o]) if expected == 0: expected2 = 1 else: expected2 = expected assert nd_collaps == expected2, (shape1, shape2, nd_collaps, expected, info) # 2.2 Check computation assert numpy.allclose(numpy.asarray(MyGpuNdArray.adds( a1, b1, scalar_gpu1)), a_cpu + b_cpu + scalar_cpu) # 3.1 What if one of the dimensions is strided? 
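# Illustration of the slicing used below (assumed shapes): when
# shape1 == (4, 5, 6, 9), the axis-0 case builds an array of shape
# (8, 5, 6, 9) and views it as c = arr[::2], which restores the original
# shape but doubles the stride on axis 0.  That stride break stops axis 0
# from being fused with axis 1, so where the dense case collapsed completely
# (expected == 0) the strided case is expected to keep two dimensions
# (expected2 == 2), unless broadcasting was already limiting the collapse.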
broadcast = any([True for i in a.shape + b.shape if i == 1]) if expected == 0: expected2 = 2 else: expected2 = expected if len(shape1_) != 4: continue if a.shape[0] != 1: shape = list(shape1) shape[0] *= 2 c_cpu = rand(shape, dtype='float32') c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::2] c1 = MyGpuNdArray(c) err = ("strided", c.shape, shape2, nd_collaps, expected, info) nd_collaps, info = elemwise_collapses([c, b], [o]) if broadcast: assert nd_collaps >= expected, err else: assert nd_collaps == expected2, err assert numpy.allclose(numpy.asarray( MyGpuNdArray.adds(c1, b1)), numpy.asarray(c) + b_cpu) if a.shape[1] != 1: shape = list(shape1) shape[1] *= 2 c_cpu = rand(shape, dtype='float32') c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::, ::2] c1 = MyGpuNdArray(c) err = ("strided", c.shape, shape2, nd_collaps, expected, info) nd_collaps, info = elemwise_collapses([c, b], [o]) if broadcast: assert nd_collaps >= expected, err else: assert nd_collaps == expected2, err pass assert numpy.allclose(numpy.asarray( MyGpuNdArray.adds(c1, b1)), numpy.asarray(c) + b_cpu) if a.shape[2] != 1: shape = list(shape1) shape[2] *= 2 c_cpu = rand(shape, dtype='float32') c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::, ::, ::2] c1 = MyGpuNdArray(c) err = ("strided", c.shape, shape2, nd_collaps, expected, info) nd_collaps, info = elemwise_collapses([c, b], [o]) if broadcast: assert nd_collaps >= expected, err else: assert nd_collaps == expected2, err pass assert numpy.allclose(numpy.asarray( MyGpuNdArray.adds(c1, b1)), numpy.asarray(c) + b_cpu) if a.shape[3] != 1: shape = list(shape1) shape[3] *= 2 c_cpu = rand(shape, dtype='float32') c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::, ::, ::, ::2] c1 = MyGpuNdArray(c) err = ("strided", c.shape, shape2, nd_collaps, expected, info) nd_collaps, info = elemwise_collapses([c, b], [o]) if broadcast: assert nd_collaps >= expected, err else: assert nd_collaps == 1, err pass assert numpy.allclose(numpy.asarray( MyGpuNdArray.adds(c1, b1)), numpy.asarray(c) + b_cpu) def test_elemwise_mixed_dtype(): to_cpu = numpy.asarray for dtype1 in ["int16", "float32", "int8"]: for dtype2 in ["int16", "float32", "int8"]: dtypeo = str((numpy.zeros(1, dtype=dtype1) + numpy.zeros(1, dtype=dtype2)).dtype) #print "dtypes", dtype1, dtype2, "o dtype", dtypeo #print " Test inside a wrapping python object 2 inputs" for shape in [(500,), (50, 5), (5, 6, 7)]: input_vals = [rand(shape, dtype) for dtype in [dtype1, dtype2]] del dtype gpu_vals = [gpu_ndarray.GpuNdArrayObject(i) for i in input_vals] assert all([numpy.allclose(to_cpu(ig), i) for ig, i in zip(gpu_vals, input_vals)]) gpu_vals = [MyGpuNdArray(x) for x in gpu_vals] out = gpu_vals[0] + gpu_vals[1] assert numpy.allclose(to_cpu(out), input_vals[0] + input_vals[1]) out = gpu_vals[0] - gpu_vals[1] assert numpy.allclose(to_cpu(out), input_vals[0] - input_vals[1]) out = gpu_vals[0] * gpu_vals[1] assert all_close(to_cpu(out), input_vals[0] * input_vals[1]) if dtypeo.startswith("float"): # TODO: execute for all dtype out = gpu_vals[0] / gpu_vals[1] assert numpy.allclose(to_cpu(out), input_vals[0] / input_vals[1]) nb_in = 4 #print " Test inside a wrapping python object %d inputs"%nb_in for shape in [(500,), (50, 5), (5, 6, 7)]: input_vals = [rand(shape, dtype) for dtype in [dtype1, dtype2, dtype1, dtype2]] gpu_vals = [gpu_ndarray.GpuNdArrayObject(i) for i in input_vals] assert all([numpy.allclose(to_cpu(ig), i) for ig, i in zip(gpu_vals, input_vals)]) gpu_vals = [MyGpuNdArray(x) for x in gpu_vals] out = MyGpuNdArray.adds(*gpu_vals) assert numpy.allclose(to_cpu(out), 
reduce(numpy.add, input_vals)) out = MyGpuNdArray.multiplys(*gpu_vals) assert all_close(to_cpu(out), reduce(numpy.multiply, input_vals)) #print " Test broadcasting" for shapes in [((1, 5), (4, 5)), ((33, 10), (33, 1)), ((33, 1, 5), (33, 10, 1)), ((33, 1, 5), (33, 10, 1), ((1, 10, 5))), ]: input_vals = [rand(shape, dtype) for shape, dtype in zip(shapes, [dtype1, dtype2])] gpu_vals = [gpu_ndarray.GpuNdArrayObject(i) for i in input_vals] assert all([numpy.allclose(to_cpu(ig), i) for ig, i in zip(gpu_vals, input_vals)]) gpu_vals = [MyGpuNdArray(x) for x in gpu_vals] out = MyGpuNdArray.adds(*gpu_vals) assert numpy.allclose(to_cpu(out), reduce(numpy.add, input_vals)) out = MyGpuNdArray.multiplys(*gpu_vals) assert all_close(to_cpu(out), reduce(numpy.multiply, input_vals)) def test_sum(): to_cpu = numpy.asarray dtypes = list(dtypes_all) # I remove *int8 as currently the output have the same dtype # And this cause overflow dtypes.remove("int8") dtypes.remove("uint8") # I need to find how pycuda handle complexe in c. # I probably just need to add an header. dtypes.remove("complex64") if enable_double: dtypes.remove("complex128") for shape in [ # need something bigger then 32, 1024 or 4096. # Those are corner case. # 1d, take only a few seconds on a GTX470 (0,), (5,), (31,), (32,), (33,), (1023,), (1024,), (1025,), (4095,), (4096,), (4097,), (32 * 1024 - 1,), (32 * 1024,), (32 * 1024 + 1,), # 2d, take 2 minutes on a GTX 470 (0, 0), (1, 0), (0, 1,), (5, 4), (31, 31), (31, 32), (31, 33), (32, 31), (32, 32), (32, 33), (33, 31), (33, 32), (33, 33), (1024, 32), (1025, 32), (1024, 33), (1025, 33), (4096, 32), (32, 4096), (4096, 33), (33, 4096), (4097, 32), (32, 4097), (4097, 33), (33, 4097), # 3d, take 2 minutes on a GTX 470 (0, 0, 0), (0, 1, 0), (0, 0, 1), (5, 4, 3), (5, 4, 3), (5, 4, 3), (4096, 2, 33), (2, 4096, 33), (33, 2, 4096), (4097, 2, 33), (2, 4097, 33), (33, 2, 4097), (4096, 33, 2), (33, 4096, 2), (2, 33, 4096), (4097, 33, 2), (33, 4097, 2), (2, 33, 4097), # 4d, take 1 minutes on a GTX 470 (0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1), (5, 4, 3, 2), (1024, 32, 2, 3), (3, 1024, 32, 2), (2, 3, 1024, 32), (1024, 2, 32, 3), (3, 1024, 2, 32), (1024, 3, 2, 32), (1025, 33, 2, 3), (3, 1025, 33, 2), (2, 3, 1025, 33), (1025, 2, 33, 3), (3, 1025, 2, 33), (1025, 3, 2, 33), (4100, 4, 3, 2), (4, 4100, 3, 2), (4, 3, 4100, 2), (4, 3, 2, 4100), # 5d, work only if c contiguous (5, 4, 3, 10, 11), ]: for dtype, off_o, off_i, sliced, order in product( *([dtypes] + [[False, True]] + [[False, True]] + [[-1, 2, -2, 1]] + [['f', 'c']])): cpu_val, gpu_val = gen_gpu_nd_array(shape, dtype, off_o, off_i, sliced, order) if len(shape) > 4 and not (gpu_val.flags["C_CONTIGUOUS"] or gpu_val.flags["F_CONTIGUOUS"]): continue gpu_val = MyGpuNdArray(gpu_val) cpu_sum = cpu_val.sum() # print dtype, shape, off_o, off_i, sliced, order # print (cpu_val.strides, # cpu_val.flags["C_CONTIGUOUS"], # cpu_val.flags["F_CONTIGUOUS"]) # print (gpu_val.strides, # gpu_val.flags["C_CONTIGUOUS"], # gpu_val.flags["F_CONTIGUOUS"]) gpu_sum = to_cpu(gpu_val.sum()) def get_rtol(orig, after_reduction): if after_reduction.size == 0: return 0 if orig.size // after_reduction.size > 500000: rtols = {"float32": 4.3e-5} elif orig.size // after_reduction.size > 100000: rtols = {"float32": 3e-5} elif orig.size // after_reduction.size > 50000: rtols = {"float32": 2e-5} else: rtols = {"float32": 1e-5} if dtype in rtols: rtol = rtols[dtype] else: rtol = 1e-8 return rtol rtol = get_rtol(gpu_val, gpu_sum) cpu_sum = cpu_sum.astype(dtype) if not 
(dtype.endswith("int16") and numpy.prod(shape) > 20000): assert (numpy.allclose(cpu_sum, gpu_sum, rtol=rtol) or cpu_sum == gpu_sum), ( dtype, shape, cpu_sum, gpu_sum, (cpu_sum - gpu_sum) / cpu_sum) # Test pattern 10 and 01 # Test pattern 100, 010 and 001 if len(shape) in [2, 3]: for axis in range(len(shape)): gpu_sum = to_cpu(gpu_val.sum(axis=[axis])) cpu_sum = cpu_val.sum(axis=axis) rtol = get_rtol(gpu_val, gpu_sum) if cpu_sum.size > 0: argmax = numpy.absolute(cpu_sum - gpu_sum).argmax() cpu_max = cpu_sum.flatten()[argmax] gpu_max = gpu_sum.flatten()[argmax] assert numpy.allclose(cpu_sum, gpu_sum), ( "axis=%d" % axis, dtype, shape, cpu_sum.shape, cpu_sum, gpu_sum, cpu_max, gpu_max, (cpu_max - gpu_max) / cpu_max) pyopencl-2013.2/pyopencl/compyte/ndarray/pygpu_language.h0000644000175000000500000001561112245716342022305 0ustar tomussrc/** * This file contain the header for ALL code that depend on cuda or opencl. */ #ifndef _PYGPU_LANGUAGE_H #define _PYGPU_LANGUAGE_H #include //#include #include "pygpu_ndarray_object.h" ///////////////////////// // Alloc and Free ///////////////////////// //If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device. #define COMPUTE_GPU_MEM_USED 0 #define VERBOSE_ALLOC_FREE 0 //If true, we fill with NAN allocated device memory. #define ALLOC_MEMSET 0 static int _outstanding_mallocs[] = {0,0}; #ifdef DEBUG #define DPRINTF(args...) fprintf(stderr, args) #else #define DPRINTF(...) #endif #if COMPUTE_GPU_MEM_USED int _allocated_size = 0; const int TABLE_SIZE = 10000; struct table_struct{ void* ptr; int size; }; table_struct _alloc_size_table[TABLE_SIZE]; #endif /** * Allocation and freeing of device memory should go through these functions so that the lib can track memory usage. * * device_malloc will set the Python error message before returning None. * device_free will return nonzero on failure (after setting the python error message) */ void * device_malloc(size_t size); int device_free(void * ptr); static PyObject * outstanding_mallocs(PyObject* self, PyObject * args) { return PyInt_FromLong(_outstanding_mallocs[0]); } int PyGpuNdArray_CopyFromPyGpuNdArray(PyGpuNdArrayObject * self, PyGpuNdArrayObject * other, bool unbroadcast = false); /** * PyGpuNdArray_alloc_contiguous * * Allocate storage space for a tensor of rank 'nd' and given dimensions. * * Note: PyGpuNdArray_alloc_contiguous is templated to work for both int dimensions and npy_intp dimensions */ template int PyGpuNdArray_alloc_contiguous(PyGpuNdArrayObject *self, const int nd, const inttype * dim, NPY_ORDER order=NPY_CORDER) { DPRINTF("PyGpuNdArray_alloc_contiguous: start nd=%i descr=%p\n", nd, self); if (!PyGpuNdArray_DESCR(self)){ PyErr_SetString(PyExc_ValueError, "PyGpuNdArray_alloc_contiguous: The array don't have a type! We can't allocate it!\n"); return -1; } // allocate an empty ndarray with c_contiguous access // return 0 on success int size = 1; //set up the strides for contiguous tensor assert (nd >= 0); if (PyGpuNdArray_set_nd(self, nd)) { return -1; } //TODO: check if by any chance our current dims are correct, // and strides already contiguous // in that case we can return right here. DPRINTF("PyGpuNdArray_alloc_contiguous: before itemsize descr=%p elsize=%i\n", self->descr, self->descr->elsize); int elsize = PyGpuNdArray_ITEMSIZE((PyObject*)self); DPRINTF("PyGpuNdArray_alloc_contiguous: set_nd %d! 
elsize=%i\n", nd, elsize); if(order != NPY_FORTRANORDER){ DPRINTF("PyGpuNdArray_alloc_contiguous: NPY_CORDER\n"); for (int i = nd-1; i >= 0; --i){ if (size == 0) PyGpuNdArray_STRIDE(self, i) = elsize; else PyGpuNdArray_STRIDE(self,i) = size * elsize; PyGpuNdArray_DIM(self,i) = dim[i]; size = size * dim[i]; } }else if (nd>0){ DPRINTF("PyGpuNdArray_alloc_contiguous: NPY_FORTRANORDER\n"); size = dim[0]; PyGpuNdArray_STRIDE(self, 0) = elsize; PyGpuNdArray_DIM(self, nd-1) = dim[nd-1]; for (int i = 1; i < nd; ++i){ if (size == 0) PyGpuNdArray_STRIDE(self, i) = elsize; else PyGpuNdArray_STRIDE(self, i) = PyGpuNdArray_STRIDE(self, i-1) * dim[i-1]; PyGpuNdArray_DIM(self, nd-i-1) = dim[nd-i-1]; size = size * dim[i]; } } if (self->data_allocated != size) { // If self is a view, do not try to free its memory if (self->data_allocated && device_free(PyGpuNdArray_DATA(self))) { // Does this ever happen?? Do we need to set data_allocated or devdata to 0? PyGpuNdArray_DATA(self) = NULL; self->data_allocated = 0; return -1; } assert(size>0); DPRINTF("PyGpuNdArray_alloc_contiguous: will allocate for size=%d elements\n", size); PyGpuNdArray_DATA(self) = (char*)device_malloc(size * PyGpuNdArray_ITEMSIZE((PyObject *)self)); if (!PyGpuNdArray_DATA(self)) { PyGpuNdArray_set_nd(self,-1); self->data_allocated = 0; PyGpuNdArray_DATA(self) = 0; return -1; } // The structure of self will be reused with newly allocated memory. // If self was a view, we should remove the reference to its base. // (If base was already NULL, the following has no effect.) Py_XDECREF(self->base); self->base = NULL; self->data_allocated = size; self->gpu_ndarray.flags = NPY_DEFAULT; PyGpuNdArray_FLAGS(self) |= NPY_WRITEABLE; PyGpuNdArray_FLAGS(self) |= NPY_OWNDATA; if (nd == 0) { PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; if (order != NPY_FORTRANORDER) { PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS; } else { PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; } }else if(nd == 1){//set c and f contiguous PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; }else if(order != NPY_FORTRANORDER){//set c contiguous PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS; PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; }else{//set f contiguous PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; PyGpuNdArray_FLAGS(self) &= ~NPY_C_CONTIGUOUS; } PyGpuNdArray_FLAGS(self) &= ~NPY_UPDATEIFCOPY; }else if(size == 0){ PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; PyGpuNdArray_FLAGS(self) |= NPY_OWNDATA; if (nd == 0) { PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; if (order != NPY_FORTRANORDER) { PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS; } else { PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; } }else if(nd == 1){//set c and f contiguous PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; }else if(order != NPY_FORTRANORDER){//set c contiguous PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS; PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; }else{//set f contiguous PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; PyGpuNdArray_FLAGS(self) &= ~NPY_C_CONTIGUOUS; } PyGpuNdArray_FLAGS(self) &= ~NPY_UPDATEIFCOPY; return 0; }else{ // How to check for the flags? Need to check if already contiguous. 
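/* For reference, a worked example of the contiguous stride setup computed
   earlier in this function (assumed values): with nd == 3, dim == {4, 5, 6}
   and an element size of 4 bytes, the NPY_CORDER branch yields strides
   {120, 24, 4} and size == 120 elements, while the NPY_FORTRANORDER branch
   yields strides {4, 16, 80} for the same total size. */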
PyErr_Format(PyExc_RuntimeError, "PyGpuNdArray_alloc_contiguous: self->data_allocated=%d, size=%d, cmp=%d", self->data_allocated, size, self->data_allocated != size ); return -1; } if (order != NPY_FORTRANORDER) { assert(PyGpuNdArray_is_c_contiguous(self)); } else { assert(PyGpuNdArray_is_f_contiguous(self)); } DPRINTF("PyGpuNdArray_alloc_contiguous: end\n"); return 0; } enum PyGpuTransfert { PyGpuHostToDevice, PyGpuDeviceToHost }; int PyGpuMemcpy(void * dst, const void * src, int dev_offset, size_t bytes, PyGpuTransfert direction); int PyGpuMemset(void * dst, int data, size_t bytes); #endif pyopencl-2013.2/pyopencl/compyte/ndarray/pygpu_ndarray_object.h0000644000175000000500000002154312245716342023511 0ustar tomussrc/** * struct PyGPUArrayObject * * This is a Python type. * */ #ifndef _PYGPU_NDARRAY_OBJECT_H #define _PYGPU_NDARRAY_OBJECT_H #include #include #include "gpu_ndarray.h" typedef struct PyGpuNdArrayObject{ PyObject_HEAD GpuNdArray gpu_ndarray; //no pointer, just inlined. PyObject * base; PyArray_Descr * descr; // for numpy-like desc int data_allocated; //the number of bytes allocated for devdata } PyGpuNdArrayObject; #define PyGpuNdArray_NDIM(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.nd) #define PyGpuNdArray_DATA(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.data) #define PyGpuNdArray_BYTES(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.data) #define PyGpuNdArray_OFFSET(obj) (((PyGpuNdArrayObject *)(obj))->gpu_ndarray.offset) #define PyGpuNdArray_DIMS(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.dimensions) #define PyGpuNdArray_STRIDES(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.strides) #define PyGpuNdArray_DIM(obj,n) (PyGpuNdArray_DIMS(obj)[n]) #define PyGpuNdArray_STRIDE(obj,n) (PyGpuNdArray_STRIDES(obj)[n]) #define PyGpuNdArray_BASE(obj) (((PyGpuNdArrayObject *)obj)->base) #define PyGpuNdArray_DESCR(obj) (((PyGpuNdArrayObject *)obj)->descr) #define PyGpuNdArray_FLAGS(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.flags) #define PyGpuNdArray_ITEMSIZE(obj) (((PyGpuNdArrayObject *)obj)->descr->elsize) #define PyGpuNdArray_TYPE(obj) (((PyGpuNdArrayObject *)(obj))->descr->type_num) #define PyGpuNdArray_SIZE(obj) PyArray_MultiplyList(PyGpuNdArray_DIMS(obj),PyGpuNdArray_NDIM(obj)) //npy_intp PyGpuNdArray_Size(PyObject* obj); //npy_intp PyGpuNdArray_NBYTES(PyObject* arr); /* Flags accessor */ #define PyGpuNdArray_CHKFLAGS(m, FLAGS) \ ((((PyGpuNdArrayObject *)(m))->gpu_ndarray.flags & (FLAGS)) == (FLAGS)) #define PyGpuNdArray_ISCONTIGUOUS(m) PyGpuNdArray_CHKFLAGS(m, NPY_CONTIGUOUS) #define PyGpuNdArray_ISFORTRAN(m) (PyGpuNdArray_CHKFLAGS(m, NPY_F_CONTIGUOUS) && \ PyGpuNdArray_NDIM(m) > 1) #define PyGpuNdArray_FORTRAN_IF(m) (PyGpuNdArray_CHKFLAGS(m, NPY_F_CONTIGUOUS)? \ NPY_F_CONTIGUOUS : 0) #define PyGpuNdArray_ISONESEGMENT(m) (PyGpuNdArray_NDIM(m) == 0 || \ PyGpuNdArray_ISCONTIGUOUS(m) || \ PyGpuNdArray_ISFORTRAN(m)) #define PyGpuNdArray_ISWRITEABLE(m) PyGpuNdArray_CHKFLAGS(m, NPY_WRITEABLE) #define PyGpuNdArray_ISALIGNED(m) PyGpuNdArray_CHKFLAGS(m, NPY_ALIGNED) #define PyGpuNdArray_ISNBO(arg) ((arg) != NPY_OPPBYTE) // THE NEXT ONE SEEM BAD... 
#define PyGpuNdArray_IsNativeByteOrder PyArray_ISNBO #define PyGpuNdArray_ISNOTSWAPPED(m) PyArray_ISNBO(PyArray_DESCR(m)->byteorder) #define PyGpuNdArray_FLAGSWAP(m, flags) (PyGpuNdArray_CHKFLAGS(m, flags) && PyGpuNdArray_ISNOTSWAPPED(m)) #define PyGpuNdArray_ISCARRAY(m) PyGpuNdArray_FLAGSWAP(m, NPY_CARRAY) #define PyGpuNdArray_ISCARRAY_RO(m) PyGpuNdArray_FLAGSWAP(m, NPY_CARRAY_RO) #define PyGpuNdArray_ISFARRAY(m) PyGpuNdArray_FLAGSWAP(m, NPY_FARRAY) #define PyGpuNdArray_ISFARRAY_RO(m) PyGpuNdArray_FLAGSWAP(m, NPY_FARRAY_RO) #define PyGpuNdArray_ISBEHAVED(m) PyGpuNdArray_FLAGSWAP(m, NPY_BEHAVED) #define PyGpuNdArray_ISBEHAVED_RO(m) PyGpuNdArray_FLAGSWAP(m, NPY_ALIGNED) static void PyGpuNdArray_fprint(FILE * fd, const PyGpuNdArrayObject *self) { fprintf(fd, "PyGpuNdArrayObject <%p, %p> nd=%i data_allocated=%d\n", self, PyGpuNdArray_DATA(self), PyGpuNdArray_NDIM(self), self->data_allocated); fprintf(fd, "\tITEMSIZE: %d\n", PyGpuNdArray_ITEMSIZE(self)); fprintf(fd, "\tTYPENUM: %d\n", PyGpuNdArray_TYPE(self)); fprintf(fd, "\tRefcount: %ld\n", (long int)self->ob_refcnt); fprintf(fd, "\tBASE: %p\n", PyGpuNdArray_BASE(self)); fprintf(fd, "\tHOST_DIMS: "); for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) { fprintf(fd, "%ld\t", PyGpuNdArray_DIMS(self)[i]); } fprintf(fd, "\n\tHOST_STRIDES: "); for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) { fprintf(fd, "%ld\t", PyGpuNdArray_STRIDES(self)[i]); } fprintf(fd, "\n\tFLAGS: "); fprintf(fd, "\n\t\tC_CONTIGUOUS: %d", PyGpuNdArray_ISCONTIGUOUS(self)); fprintf(fd, "\n\t\tPyGpuNdArray_ISFORTRAN: %d PyGpuNdArray_FORTRAN_IF:%d F_CONTIGUOUS: %d", PyGpuNdArray_ISFORTRAN(self), PyGpuNdArray_FORTRAN_IF(self), PyGpuNdArray_CHKFLAGS(self, NPY_FORTRAN)); fprintf(fd, "\n\t\tOWNDATA: %d", PyGpuNdArray_CHKFLAGS(self, NPY_OWNDATA)); fprintf(fd, "\n\t\tWRITEABLE: %d", PyGpuNdArray_ISWRITEABLE(self)); fprintf(fd, "\n\t\tALIGNED: %d", PyGpuNdArray_ISALIGNED(self)); fprintf(fd, "\n\t\tUPDATEIFCOPY: %d", PyGpuNdArray_CHKFLAGS(self, NPY_UPDATEIFCOPY)); fprintf(fd, "\n"); } static void PyArray_fprint(FILE * fd, const PyArrayObject *self) { fprintf(fd, "PyArrayObject <%p, %p> nd=%i\n", self, PyArray_DATA(self), PyArray_NDIM(self)); fprintf(fd, "\tITEMSIZE: %d\n", PyArray_ITEMSIZE(self)); fprintf(fd, "\tTYPENUM: %d\n", PyArray_TYPE(self)); fprintf(fd, "\tHOST_DIMS: "); for (int i = 0; i < PyArray_NDIM(self); ++i) { fprintf(fd, "%ld\t", PyArray_DIMS(self)[i]); } fprintf(fd, "\n\tHOST_STRIDES: "); for (int i = 0; i < PyArray_NDIM(self); ++i) { fprintf(fd, "%ld\t", PyArray_STRIDES(self)[i]); } fprintf(fd, "\n\tFLAGS: "); fprintf(fd, "\n\t\tC_CONTIGUOUS: %d", PyArray_ISCONTIGUOUS(self)); fprintf(fd, "\n\t\tPyArray_ISFORTRAN: %d PyArray_FORTRAN_IF:%d F_CONTIGUOUS: %d", PyArray_ISFORTRAN(self), PyArray_FORTRAN_IF(self), PyArray_CHKFLAGS(self, NPY_FORTRAN)); fprintf(fd, "\n\t\tOWNDATA: %d", PyArray_CHKFLAGS(self, NPY_OWNDATA)); fprintf(fd, "\n\t\tWRITEABLE: %d", PyArray_ISWRITEABLE(self)); fprintf(fd, "\n\t\tALIGNED: %d", PyArray_ISALIGNED(self)); fprintf(fd, "\n\t\tUPDATEIFCOPY: %d", PyArray_CHKFLAGS(self, NPY_UPDATEIFCOPY)); fprintf(fd, "\n"); } template static T ceil_intdiv(T a, T b) { return (a/b) + ((a % b) ? 
1: 0); } //Compute if the resulting array is c contiguous static bool PyGpuNdArray_is_c_contiguous(const PyGpuNdArrayObject * self) { bool c_contiguous = true; int size = PyGpuNdArray_ITEMSIZE(self); for (int i = PyGpuNdArray_NDIM(self)-1; (i >= 0) && c_contiguous; --i) { if (PyGpuNdArray_STRIDE(self, i) != size) { c_contiguous = false; } size = size * PyGpuNdArray_DIM(self, i); } return c_contiguous; } //Compute if the resulting array is f contiguous static bool PyGpuNdArray_is_f_contiguous(const PyGpuNdArrayObject * self) { bool f_contiguous = true; int size = PyGpuNdArray_ITEMSIZE(self); for (int i = 0; i < PyGpuNdArray_NDIM(self) && f_contiguous; ++i) { if (PyGpuNdArray_STRIDE(self, i) != size) { f_contiguous = false; } size = size * PyGpuNdArray_DIM(self, i); } return f_contiguous; } static PyObject * PyGpuNdArray_as_c_contiguous(PyObject* dummy, PyObject* args, PyObject *kargs); static PyObject * PyGpuNdArray_as_f_contiguous(PyObject* dummy, PyObject* args, PyObject *kargs); /** * [Re]allocate a PyGpuNdArrayObject with access to 'nd' dimensions. * * Note: This does not allocate storage for data. */ static int PyGpuNdArray_set_nd(PyGpuNdArrayObject * self, const int nd) { if (nd != PyGpuNdArray_NDIM(self)) { if(0) fprintf(stderr, "PyGpuNdArray_set_nd: modif nd=%i to nd=%i\n", PyGpuNdArray_NDIM(self), nd); if (PyGpuNdArray_DIMS(self)){ free(PyGpuNdArray_DIMS(self)); PyGpuNdArray_DIMS(self) = NULL; PyGpuNdArray_NDIM(self) = -1; } if (PyGpuNdArray_STRIDES(self)){ free(PyGpuNdArray_STRIDES(self)); PyGpuNdArray_STRIDES(self) = NULL; PyGpuNdArray_NDIM(self) = -1; } if (nd == -1) return 0; PyGpuNdArray_DIMS(self) = (npy_intp*)malloc(nd*sizeof(npy_intp)); if (NULL == PyGpuNdArray_DIMS(self)) { PyErr_SetString(PyExc_MemoryError, "PyGpuNdArray_set_nd: Failed to allocate dimensions"); return -1; } PyGpuNdArray_STRIDES(self) = (npy_intp*)malloc(nd*sizeof(npy_intp)); if (NULL == PyGpuNdArray_STRIDES(self)) { PyErr_SetString(PyExc_MemoryError, "PyGpuNdArray_set_nd: Failed to allocate str"); return -1; } //initialize all dimensions and strides to 0 for (int i = 0; i < nd; ++i) { PyGpuNdArray_DIM(self, i) = 0; PyGpuNdArray_STRIDES(self)[i] = 0; } PyGpuNdArray_NDIM(self) = nd; if(0) fprintf(stderr, "PyGpuNdArray_set_nd: end\n"); } return 0; } #endif /* Local Variables: mode:c++ c-basic-offset:4 c-file-style:"stroustrup" c-file-offsets:((innamespace . 0)(inline-open . 
0)) indent-tabs-mode:nil fill-column:79 End: */ // vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 : pyopencl-2013.2/pyopencl/compyte/ndarray/setup_opencl.py0000644000175000000500000000734012245716342022177 0ustar tomussrcimport os from distutils.core import setup, Extension from distutils.command.build_ext import build_ext from distutils.dep_util import newer import numpy as np class build_ext_nvcc(build_ext): user_options = build_ext.user_options user_options.extend([ ('cuda-root=', None, "The cuda root directory")]) def initialize_options(self): build_ext.initialize_options(self) self.cuda_root = None def finalize_options(self): build_ext.finalize_options(self) if self.cuda_root is None: self.cuda_root = os.getenv('CUDA_ROOT', None) if self.cuda_root is not None: self._nvcc_bin = os.path.join(self.cuda_root, 'bin', 'nvcc') else: self._nvcc_bin = 'nvcc' def cuda_process(self, source, include_args): target = source + '.cpp' if newer(source, target): self.spawn([self._nvcc_bin, '--cuda', source, '-o', target] + \ include_args) return target def cuda_extension(self, ext): includes = self.distribution.include_dirs + ext.include_dirs include_args = ['-I' + i for i in includes] new_sources = [] anycuda = False for src in ext.sources: if src.endswith('.cu'): new_sources.append(self.cuda_process(src, include_args)) anycuda = True else: new_sources.append(src) if anycuda: ext.sources = new_sources if self.cuda_root is not None: lib = os.path.join(self.cuda_root, 'lib') lib64 = os.path.join(self.cuda_root, 'lib64') if os.path.isdir(lib): ext.library_dirs.append(lib) ext.extra_link_args.append('-Xlinker') ext.extra_link_args.append('-rpath') ext.extra_link_args.append('-Xlinker') ext.extra_link_args.append(lib) if os.path.isdir(lib64): ext.library_dirs.append(lib64) # ext.extra_link_args.append('-rpath') # ext.extra_link_args.append(lib64) if 'cudart' not in ext.libraries: ext.libraries.append('cudart') if self.cuda_root: include = os.path.join(self.cuda_root, 'include') if os.path.isdir(include): ext.extra_compile_args.append('-I' + include) if os.path.isfile('/usr/lib/nvidia-current/libOpenCL.so'): ext.extra_link_args.append('-L/usr/lib/nvidia-current') ext.extra_link_args.append('-Xlinker') ext.extra_link_args.append('-rpath') ext.extra_link_args.append('-Xlinker') ext.extra_link_args.append('/usr/lib/nvidia-current') def build_extensions(self): self.check_extensions_list(self.extensions) for ext in self.extensions: self.cuda_extension(ext) # uncomment this + inherit from the cython version of build_ext # work with cuda and cython sources #ext.sources = self.cython_sources(ext.sources, ext) self.build_extension(ext) import sys if sys.platform == 'darwin': libcl_args = {'extra_link_args': ['-framework', 'OpenCL']} else: libcl_args = {'libraries': ['OpenCL']} setup(name='compyte', cmdclass={'build_ext': build_ext_nvcc}, include_dirs=[np.get_include(), '.'], ext_modules=[Extension('pygpu_ndarray', define_macros=[('OFFSET', '1'), ('WITH_OPENCL', '')], sources=['pygpu_language_opencl.cpp', 'pygpu_ndarray.cpp'], **libcl_args) ] ) pyopencl-2013.2/pyopencl/compyte/ndarray/Makefile0000644000175000000500000000232412245716342020562 0ustar tomussrcall: pygpu_ndarray.so PYTHONVERSION ?= $(shell python -c "import sys; print '%d.%d'%(sys.version_info[0], sys.version_info[1]")) CUDA_ROOT ?= /opt/lisa/os/cuda THEANO_ROOT ?= /u/bastienf/repos/Theano CFLAGS=-g -DDEBUG -DOFFSET # By default enable the OFFSET usage. Otherwise some test fail. 
CFLAGS=-g -DOFFSET #BINDIR=--compiler-bindir ${HOME}/.theano.nvcc-bindir #NPY_PATH!=python -c "import numpy;print numpy.__path__" #NPY_INCLUDE=-I${NPY_PATH}/core/include CUDA_INCLUDE=-I${CUDA_ROOT}/include PYTHON_INCLUDE=-I$(shell python -c "import distutils.sysconfig;print distutils.sysconfig.get_python_inc()") INCLUDES=${CUDA_INCLUDE} ${PYTHON_INCLUDE} CUDA_FLAGS=-Xlinker -rpath,${CUDA_ROOT}/lib64 -Xlinker -rpath,${CUDA_ROOT}/lib pygpu_language_cuda.o: pygpu_language_cuda.cu pygpu_language.h nvcc -c ${CFLAGS} -m64 -Xcompiler -fPIC,-m64 ${CUDA_FLAGS} ${INCLUDES} ${BINDIR} -o $@ $< pygpu_ndarray.so: pygpu_ndarray.cpp pygpu_ndarray.h pygpu_language_cuda.o pygpu_ndarray_object.h nvcc -shared ${CFLAGS} -m64 -Xcompiler -fPIC,-m64 ${CUDA_FLAGS} ${INCLUDES} ${BINDIR} -o $@ pygpu_language_cuda.o $< -lpython${PYTHONVERSION} -lcublas -lcudart clean: rm -f pygpu_ndarray.so core.* *.o *~ rm -rf build cleantmp: rm -f core.* *.o *~pyopencl-2013.2/pyopencl/compyte/ndarray/gen_elemwise.py0000644000175000000500000022514612245716342022150 0ustar tomussrc""" This file implement 1 version of the elemwise op on the gpu. The elemwise fct are also used with scalar operation! So it can happen that ndim is 0 as with all scalar type. """ import numpy import StringIO import pygpu_ndarray as gpu_ndarray _CL_MODE = hasattr(gpu_ndarray, "set_opencl_context") if _CL_MODE: # THIS IS NOT FINISHED import pyopencl as cl import pyopencl.array as cl_array from pyopencl.tools import dtype_to_ctype # import pyopencl._mymako as mako from pyopencl._cluda import CLUDA_PREAMBLE # TODO: use mako to get rid of the %if CLUDA_PREAMBLE = CLUDA_PREAMBLE[:455] CLUDA_PREAMBLE += """ #define LDIM_0 get_local_size(0) #define LDIM_1 get_local_size(1) #define LDIM_2 get_local_size(2) #define GDIM_0 get_num_groups(0) #define GDIM_1 get_num_groups(1) #define GDIM_2 get_num_groups(2) """ # TODO, reuse the same context as the use used to create the memory. 
ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) else: import pycuda.autoinit import pycuda.driver as driver from pycuda.compiler import SourceModule from pycuda.tools import dtype_to_ctype # import pycuda._mymako as mako from pycuda._cluda import CLUDA_PREAMBLE CLUDA_PREAMBLE += """ #define LDIM_0 blockDim.x #define LDIM_1 blockDim.y #define LDIM_2 blockDim.z #define GDIM_0 gridDim.x #define GDIM_1 gridDim.y #define GDIM_2 gridDim.z """ from theano import Apply from theano import scalar from theano.tensor import TensorType import theano import logging _logger_name = 'compyte.gen_elemwise' _logger = logging.getLogger(_logger_name) _logger.setLevel(logging.INFO) _logger.addHandler(logging.StreamHandler()) # TO REMOVE def warning(*msg): _logger.warning(_logger_name + 'WARNING: ' + ' '.join(str(m) for m in msg)) def info(*msg): _logger.info(_logger_name + 'INFO: ' + ' '.join(str(m) for m in msg)) def debug(*msg): _logger.debug(_logger_name + 'DEBUG: ' + ' '.join(str(m) for m in msg)) if _CL_MODE: gpu_ndarray.set_opencl_context(ctx.obj_ptr) cast_int = numpy.intc cast_uint = numpy.uintc def _logical_scalar(x): return numpy.all(x.type.broadcastable) def get_str_list_logical_scalar(inputs, value_str='ii_i%i_value', data_str='ii_i%i_data[0]'): l = [] for ipos, i in enumerate(inputs): if _logical_scalar(i): l += [value_str % ipos] else: l += [data_str % ipos] return l class WrapOpenCLFunction(object): def __init__(self, fct): self.fct = fct def _param_wrap(self, p): if isinstance(p, MyGpuNdArray): p = p.gpu_nd_array if isinstance(p, gpu_ndarray.GpuNdArrayObject): p = cl.MemoryObject.from_cl_mem_as_int(p.bytes) return p def set_block_shape(self, *shape): self.local_size = shape def param_set(self, *param): self.param = [self._param_wrap(p) for p in param] def launch_grid(self, *global_shape): global_size = global_shape + (1,) d = {"g_times_l": True} return self.fct(queue, global_size, self.local_size, *self.param, **d) def compile_gpu_code(code, fct_name): if _CL_MODE: # Compile the gpu function with pyopencl prg = cl.Program(ctx, code).build() fct2 = getattr(prg, fct_name) fct = WrapOpenCLFunction(fct2) else: # Compile the gpu function with pycuda mod = SourceModule(code) fct = mod.get_function(fct_name) return fct class ElemwiseAlgo(object): verbose = 0 # 1, 2 or 3 for more verbose output. cache_version = () cache_version = ('debug', 14, verbose) def __init__(self, scalar_op, inplace_pattern={}): """ :param scalar_op: the scalar operation to execute on each element. """ self.scalar_op = scalar_op self.inplace_pattern = inplace_pattern def task_code(self, inputs, outputs, sio, nodename, iname=None, oname=None): if iname == None: iname = get_str_list_logical_scalar(inputs) if oname == None: oname = ['ii_o%i_data[0]' % ipos for ipos, i in enumerate(outputs)] print >> sio, self.scalar_op.c_code( Apply(self.scalar_op, [scalar.Scalar(dtype=input.type.dtype)() for input in inputs], [scalar.Scalar(dtype=output.type.dtype)() for output in outputs]), nodename + '_scalar_', iname, oname, sub=dict(fail='return;')) # TODO: set a failure code somehow!!! 
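# The next method emits one kernel per number of dimensions.  A rough sketch
# of what it generates for nd == 2 and float32 data (names abbreviated,
# scalar_op body elided):
#
#   KERNEL void kernel_<node>_2(unsigned int numEls,
#           const int dim0, const int dim1,
#           GLOBAL_MEM const float * i0_data, int i0_str_0, int i0_str_1,
#           GLOBAL_MEM float * o0_data, int o0_str_0, int o0_str_1)
#   {
#       const int idx = GID_0 * LDIM_0 + LID_0;
#       const int numThreads = LDIM_0 * GDIM_0;
#       for (int i = idx; i < numEls; i += numThreads) {
#           int ii = i;
#           int pos1 = ii % dim1;  ii = ii / dim1;
#           int pos0 = ii;
#           // advance per-argument pointers by pos * stride, then apply the
#           // scalar_op body to ii_i0_data[0] / ii_o0_data[0]
#       }
#   }
#
# i.e. a grid-stride loop over the flattened element index, with the flat
# index unravelled into per-dimension offsets using the strides passed in.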
def c_src_kernel(self, inputs, outputs, nodename, nd, static="static"): sio = StringIO.StringIO() #print 'C_SRC_KERNEL', sio.getvalue() for ipos, i in enumerate(inputs): print >> sio, "// Input ", ipos, str(i.type) for ipos, i in enumerate(outputs): print >> sio, "// Output ", ipos, str(i.type) print >> sio, static, ( "KERNEL void kernel_%s_%s(unsigned int numEls" % (nodename, nd)) if (nd): print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd)) #declare inputs for ipos, i in enumerate(inputs): s = ", ".join(["GLOBAL_MEM const %s * i%i_data" % ( dtype_to_ctype(i.dtype), ipos)] + list("int i%i_str_%i" % (ipos, d) for d in xrange(nd))) print >> sio, "\t,", s #declare outputs for ipos, i in enumerate(outputs): s = ", ".join(["GLOBAL_MEM %s * o%i_data" % ( dtype_to_ctype(i.dtype), ipos)] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd))) print >> sio, "\t,", s #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) # for d in xrange(nd)) #print >> sio, "\t,", "float * o%i_data" % ipos print >> sio, "\t)\n{" print >> sio, " const int idx = GID_0 * LDIM_0 + LID_0;" print >> sio, " const int numThreads = LDIM_0 * GDIM_0;" # For each input that is a scalar which has been broadcasted # to a tensor, load it into a local variable for ipos, i in enumerate(inputs): if _logical_scalar(i): print >> sio, " const %s ii_i%i_value = i%i_data[0];" % ( dtype_to_ctype(i.dtype), ipos, ipos) #loop over the elements to be treated by this kernel call print >> sio, " for (int i = idx; i < numEls; i += numThreads) {" # calculate the data pointers for all arguments print >> sio, " int ii = i;" for ipos, i in enumerate(inputs): if not _logical_scalar(i): print >> sio, (" GLOBAL_MEM const " "%s * ii_i%i_data = i%i_data;" % ( dtype_to_ctype(i.dtype), ipos, ipos)) for ipos, i in enumerate(outputs): print >> sio, " GLOBAL_MEM %s * ii_o%i_data = o%i_data;" % ( dtype_to_ctype(i.dtype), ipos, ipos) for d in xrange(nd - 1, -1, -1): if d > 0: print >> sio, " int pos%i = ii %% dim%i;" % (d, d) print >> sio, " ii = ii / dim%i;" % d else: print >> sio, " int pos%i = ii;" % d for ipos, i in enumerate(inputs): if not _logical_scalar(i): print >> sio, (" ii_i" "%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d)) for ipos, i in enumerate(outputs): print >> sio, " ii_o%i_data += pos%i * o%i_str_%i;" % ( ipos, d, ipos, d) # perform the scalar operation on the input and output references #TODO: What if the scalar_op needs support_code?? 
self.task_code(inputs, outputs, sio, nodename) print >> sio, " }" #indent = " "*(4*d+7) #for ipos, i in enumerate(inputs): #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', '' print >> sio, "}" #print sio.getvalue() return sio.getvalue() def c_src_kernel_Ccontiguous(self, inputs, outputs, nodename, static="static"): nd = outputs[0].type.ndim sio = StringIO.StringIO() #print 'C_SRC_KERNEL', sio.getvalue() for ipos, i in enumerate(inputs): print >> sio, "// Input ", ipos, str(i.type) for ipos, i in enumerate(outputs): print >> sio, "// Output ", ipos, str(i.type) print >> sio, static, ("KERNEL void kernel_%s_Ccontiguous" " (unsigned int numEls" % (nodename)) #declare inputs for ipos, i in enumerate(inputs): print >> sio, "\t,", "GLOBAL_MEM const %s * i%i_data" % ( dtype_to_ctype(i.dtype), ipos) #declare outputs for ipos, i in enumerate(outputs): print >> sio, "\t,", "GLOBAL_MEM %s * o%i_data" % ( dtype_to_ctype(i.dtype), ipos) print >> sio, "\t)\n{" print >> sio, " const int idx = GID_0 * LDIM_0 + LID_0;" print >> sio, " const int numThreads = LDIM_0 * GDIM_0;" # For each input that is a scalar which has been broadcasted # to a tensor, load it into a local variable for ipos, i in enumerate(inputs): if _logical_scalar(i): print >> sio, " const %s ii_i%i_value = i%i_data[0];" % ( dtype_to_ctype(i.dtype), ipos, ipos) #loop over the elements to be treated by this kernel call print >> sio, " for (int i = idx; i < numEls; i += numThreads) {" # perform the scalar operation on the input and output references #TODO: What if the scalar_op needs support_code?? self.task_code(inputs, outputs, sio, nodename, iname=get_str_list_logical_scalar( inputs, data_str='i%i_data[i]'), oname=['o%i_data[i]' % ipos for ipos, i in enumerate(outputs)]) print >> sio, " }" print >> sio, "}" #print sio.getvalue() return sio.getvalue() def c_src_callkernel(self, inputs, outputs, nodename): # # This function serves three main goals: # # The first is stride unpacking: # it accepts input and output arguments as # float * , int* # pairs, and it constructs a kernel function call where inputs # and arguments are named like # float *, int, int, int ... # # The second is to recognize when any dimensions can be collapsed as # being contiguous. That mean that we can merge that dimensions with # another one for all inputs/outputs and have the same retusuls # (confusing... read code) # # The thrid is to make a special case for scalar element. We allow # the collapsing of them. In the ccontiguous and not contiguous case, # we use registers to lower the number of memory access. # TODO: make a special case for broadcasting, to store the # data in shared memory. nd = outputs[0].type.ndim nb_inputs = len(inputs) nb_outputs = len(outputs) d = dict() # input_params and output_params go into the function # declaration/definition input_params = ", ".join("const %s * i%i_data, const int * i%i_str" % ( dtype_to_ctype(inputs[i].dtype), ipos, ipos) for ipos in xrange(len(inputs))) output_params = ", ".join("%s * o%i_data, const int * o%i_str" % ( dtype_to_ctype(outputs[i].dtype), ipos, ipos) for ipos in xrange(len(outputs))) #input_args and output_args go into the recursive call. 
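# Worked example of the collapse test generated further below (assumed,
# element-counted strides): a dense (4, 5) input has strides (5, 1), and
# strides[1] * dims[1] == 5 == strides[0], so the two axes can be fused into
# one axis of length 20; a sliced view such as a[:, ::2] has shape (4, 3)
# and strides (5, 2), and 2 * 3 != 5, so those axes stay separate.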
input_args = ", ".join("i%i_data, i%i_str" % (ipos, ipos) for ipos in xrange(len(inputs))) output_args = ", ".join("o%i_data, o%i_str" % (ipos, ipos) for ipos in xrange(len(outputs))) prod_dims = '*'.join(["dims[%i]" % di for di in xrange(nd)] + ['1']) sio = StringIO.StringIO() print >> sio, """ static void can_collapse_%(nodename)s(int nd, const int * dims, const int * strides, int collapse[]) { //can we collapse dims[i] and dims[i-1] for(int i=nd-1;i>0;i--){ if(strides[i]*dims[i]==strides[i-1]){ //the dims nd-1 are not strided again dimension nd collapse[i]=1; }else collapse[i]=0; } } """ % locals() print >> sio, """ static int callkernel_%(nodename)s(unsigned int numEls, const int d, const int * dims, %(input_params)s, %(output_params)s) { numEls = %(prod_dims)s; """ % locals() if self.verbose: print >> sio, """ std::cerr << "calling kernel_%(nodename)s w numEls" << numEls << " dims"<< d << "\\n"; """ % locals() print >> sio, 'std::cerr << ' + " << ' ' << ".join(['" "']+list("dims[%i]"%di for di in xrange(nd)) + ["'\\n';"]) if self.verbose > 1: for ipos in xrange(len(inputs)): print >> sio, """ std::cerr << " %(ipos)s data strides" << """ % locals() + " << ' ' << ".join(["i%s_data" % ipos] + list("i%s_str[%i]" % (ipos, di) for di in xrange(nd))) + ''' << "\\n"; ''' for ipos in xrange(len(outputs)): print >> sio, """ std::cerr << " %(ipos)s data strides" << """ % locals() + " << ' ' << ".join(["o%s_data" % ipos] + list("o%s_str[%i]" % (ipos, di) for di in xrange(nd))) + ''' << "\\n"; ''' # collapse dimension that are broadcast in all inputs. # need to be done before contiguous collapse as it will break it. # do the dimensions and the strides print >> sio, """ int local_dims[%(nd)s]; int local_str[%(nb_inputs)s][%(nd)s]; int local_ostr[%(nb_inputs)s][%(nd)s]; int nd_collapse = %(nd)s; for(int i=0;i<%(nd)s;i++){//init new dim local_dims[i]=dims[i]; } """ % locals() for ipos in xrange(len(inputs)): print >> sio, """ for(int i=0;i<%(nd)s;i++){//init new strides local_str[%(ipos)s][i]=i%(ipos)s_str[i]; } """ % locals() for ipos in xrange(len(outputs)): print >> sio, """ for(int i=0;i<%(nd)s;i++){//init new strides local_ostr[%(ipos)s][i]=o%(ipos)s_str[i]; } """ % locals() if self.verbose > 2: print >>sio, 'std::cerr <<"before broadcast collapse\\n";' print >>sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; ' print >> sio, 'std::cerr << "local_dims";' for d in xrange(nd): print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals() print >> sio, 'std::cerr << "\\n";' for ipos in xrange(len(inputs)): print >> sio, 'std::cerr << " local_str inputs %(ipos)s: " <<' % locals()+' << " " << '.join(["local_str[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";' for ipos in xrange(len(outputs)): print >> sio, 'std::cerr << " local_ostr inputs %(ipos)s: " <<' % locals()+' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";' print >> sio, """ for(int id=0;id 2: print >>sio, 'std::cerr <<"after broadcast collapse\\n";' print >>sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; ' print >> sio, 'std::cerr << "local_dims";' for d in xrange(nd): print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals() print >> sio, 'std::cerr << "\\n";' for ipos in xrange(len(inputs)): print >> sio, 'std::cerr << " local_str %(ipos)s: " <<' % locals()+' << " " << '.join(["local_str[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";' for ipos in xrange(len(outputs)): print >> sio, 'std::cerr << " local_ostr %(ipos)s: " <<' % locals()+' << " " << 
'.join(["local_ostr[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";' # collapse contiguous dimensions (ignoring scalars, generic version(collapse any dimensions, right, left, middle)) # this is a good idea because we make less index calculation in the gpu. print >> sio, "int nd_collapse_[%(nd)s] = {" % locals() +','.join(['1' for x in range(nd)]) +"};" for ipos in xrange(len(inputs)): if not _logical_scalar(inputs[ipos]): print >> sio, """ int nd_collapse_%(ipos)s[%(nd)s] = {""" % locals() +','.join(['1' for x in range(nd)]) +"};" print >> sio, """ can_collapse_%(nodename)s(nd_collapse, local_dims, local_str[%(ipos)s], nd_collapse_%(ipos)s); for(int i=0;i 1: print >>sio, """ std::cerr<< "nd_collapse_%(ipos)s "<< """ % locals() print >>sio, ' << " " << '.join( ["nd_collapse_%(ipos)s[" % locals() + str(i) + "]" for i in range(nd)]) print >>sio, '<< "\\n";' print >>sio, """ std::cerr<< "nd_collapse_ "<< """ % locals() print >>sio, ' << " " << '.join( ["nd_collapse_[" % locals() + str(i) + "]" for i in range(nd)]) print >>sio, '<< "\\n";' # update the local stride. for ipos in xrange(len(inputs)): print >> sio, """ for(int i=nd_collapse-1;i>0;i--){ if(nd_collapse_[i]==1){ local_str[%(ipos)s][i-1]=local_str[%(ipos)s][i];//set new strides for(int j=i+1;j> sio, """ for(int i=nd_collapse-1;i>0;i--){ if(nd_collapse_[i]==1){ local_ostr[%(ipos)s][i-1]=local_ostr[%(ipos)s][i];//set new strides for(int j=i+1;j> sio, """ for(int i=nd_collapse-1;i>0;i--){ if(nd_collapse_[i]==1){ local_dims[i-1]*=local_dims[i];//set new dims for(int j=i+1;j> sio, """ for(int i=1, end=nd_collapse;i 0: print >> sio, " && ", " && ".join(l) print >> sio, """){nd_collapse=0;} """ if self.verbose: print >> sio, 'std::cerr <<"after can_collapse\\n";' print >> sio, """std::cerr << "nd_collapse " << nd_collapse << "\\n"; """ % locals() if self.verbose > 1: for d in xrange(nd): print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals() print >> sio, 'std::cerr << "\\n";' for ipos in xrange(len(inputs)): print >> sio, ('std::cerr << " local_str %(ipos)s: " <<' % locals() + ' << " " << '.join( ["local_str[%(ipos)s][%(x)s]" % locals() for x in range(nd)]) + '<<"\\n";') for ipos in xrange(len(outputs)): print >> sio, ('std::cerr << " local_ostr %(ipos)s: " <<' % locals() + ' << " " << '.join( ["local_ostr[%(ipos)s][%(x)s]" % locals() for x in range(nd)]) + '<<"\\n";') def launch_Ccontiguous(nodename, scalar_op): kernel_call_args = ["numEls"] for ipos in xrange(len(inputs)): kernel_call_args.append("i%i_data" % ipos) for ipos in xrange(len(outputs)): kernel_call_args.append("o%i_data" % ipos) kernel_call_args = ", ".join(kernel_call_args) verb = "" if self.verbose: verb = 'std::cerr << " Running ccontiguous version\\n";' print >> sio, """ //first use at least a full warp int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE //next start adding multiprocessors int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS // next start adding more warps per multiprocessor if (threads_per_block * n_blocks < numEls) threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); kernel_%(nodename)s_Ccontiguous<<>>(%(kernel_call_args)s); //std::cerr << "calling callkernel returned\\n"; """ % locals() print >> sio, """ CNDA_THREAD_SYNC; cudaError_t err = cudaGetLastError(); if( cudaSuccess != err) { PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: 
%%s\\n", "GpuElemwise %(nodename)s", cudaGetErrorString(err), n_blocks, threads_per_block, "kernel_%(nodename)s_Ccontiguous<<>>(%(kernel_call_args)s)"); return -1; } %(verb)s return 0; """ % locals() def launch_General(nodename, scalar_op, force_nd): # kernel_call_args are used to invoke the cuda kernel local = "local_" kernel_call_args = ["numEls"] kernel_call_args.extend(local + "dims[%i]" % di for di in xrange(force_nd)) for ipos in xrange(len(inputs)): kernel_call_args += ["i%i_data" % ipos] + list( local + "str[%i][%i]" % (ipos, di) for di in xrange(force_nd)) #strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(force_nd)) #kernel_call_args.append( "%s, i%i_data" % (strides, ipos)) for ipos in xrange(len(outputs)): kernel_call_args += ["o%i_data" % ipos] + list( local + "ostr[%i][%i]" % (ipos, di) for di in xrange(force_nd)) #strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(force_nd)) #kernel_call_args.append( "%s, o%i_data" % (strides, ipos)) if self.verbose: print >> sio, """ std::cerr << " Running general version with %(force_nd)s dims\\n"; """ % locals() print >> sio, "std::cerr << "+ ' << " " << '.join( kernel_call_args)+' << "\\n";' #std::cerr << numEls << dims[0] << i0_data, i0_str[0] << o0_data, o0_str[0]\n; kernel_call_args = ", ".join(kernel_call_args) print >> sio, """ //first use at least a full warp int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE //next start adding multiprocessors int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS // next start adding more warps per multiprocessor if (threads_per_block * n_blocks < numEls) threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); kernel_%(nodename)s_%(force_nd)s<<>>(%(kernel_call_args)s); """ % locals() print >> sio, """ CNDA_THREAD_SYNC; cudaError_t err = cudaGetLastError(); if( cudaSuccess != err) { PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: %%s\\n", "GpuElemwise %(nodename)s", cudaGetErrorString(err), n_blocks, threads_per_block, "kernel_%(nodename)s_Ccontiguous<<>>(%(kernel_call_args)s)"); return -1; } return 0; """ % locals() print >> sio, "if(numEls==0) return 0;" print >> sio, "switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {"%locals() print >> sio, "case 0: {" launch_Ccontiguous(nodename, scalar_op) print >> sio, " } break;" for i in range(1, nd + 1): print >> sio, "case " + str(i) + ": {" launch_General(nodename, scalar_op, i) print >> sio, " } break;" print >> sio, "}" # end case print >> sio, "return -2;" # should not get to this point print >> sio, "}" # end fct #N.B. 
cudaGetLastError is called by c_code return sio.getvalue() def c_support_code_apply(self, inputs, outputs, nodename): nd = outputs[0].type.ndim return "".join( CLUDA_PREAMBLE, [self.c_src_kernel(inputs, outputs, nodename, x) for x in range(1, nd + 1)] + [self.c_src_kernel_Ccontiguous(inputs, outputs, nodename), self.c_src_callkernel(inputs, outputs, nodename), ]) def c_code(self, ninputs, noutputs, nodename, inputs, outputs, sub): d = dict(sub) nd = noutputs[0].type.ndim d.update(locals()) sio = StringIO.StringIO() nin = len(inputs) nout = len(outputs) fail = sub['fail'] opname = str(self.scalar_op) initial_dims = ','.join('1' for i in xrange(nd)) if 1 or self.scalar_op == scalar.pow: print >> sio, """ //std::cerr << "C_CODE %(opname)s START\\n"; //standard elemwise size checks """ % locals() print >> sio, """ int dims[%(nd)s] = {%(initial_dims)s}; """ % locals() #check that all inputs have valid dimensions emitted_inames = {} for id, iname in enumerate(inputs): if iname in emitted_inames: assert emitted_inames[iname] is ninputs[id] continue broadcasts = ', '.join(map(str, map(int, ninputs[id].broadcastable))) nd = ninputs[id].ndim print >> sio, """ int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s}; """ % locals() emitted_inames[iname] = ninputs[id] #check that all inputs have valid dimensions emitted_inames = {} for id, iname in enumerate(inputs): if iname in emitted_inames: continue print >> sio, """ //std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n"; if (%(nd)s != %(iname)s->nd) { PyErr_Format(PyExc_TypeError, "need %(nd)s dims, not %%i", %(iname)s->nd); %(fail)s; } for (int i = 0; i< %(nd)s; ++i) { dims[i] = (dims[i] == 1) ? CudaNdarray_HOST_DIMS(%(iname)s)[i] : dims[i]; if ((!(broadcasts_%(iname)s[i] && CudaNdarray_HOST_DIMS(%(iname)s)[i] == 1))&& (dims[i] != CudaNdarray_HOST_DIMS(%(iname)s)[i])) { //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n"; PyErr_Format(PyExc_ValueError, "GpuElemwise. Input dimension mis-match. 
One of your inputs has shape[%%i] == %%i, but the output's size on that axis is %%i.", i, CudaNdarray_HOST_DIMS(%(iname)s)[i], dims[i] ); %(fail)s; } } """ % locals() emitted_inames[iname] = True #check that all outputs have valid dimensions for idx, oname in enumerate(outputs): if idx not in self.inplace_pattern.keys(): print >> sio, """ for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) { if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i]) { Py_DECREF(%(oname)s); %(oname)s = NULL; } } if (NULL == %(oname)s) { %(oname)s = (CudaNdarray*)CudaNdarray_New(); if (!%(oname)s) { //error string already set %(fail)s; } if (CudaNdarray_alloc_contiguous(%(oname)s, %(nd)s, dims)) { //error string already set Py_DECREF(%(oname)s); %(oname)s = NULL; %(fail)s; } } //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n"; //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n"; """ % locals() else: input_idx = self.inplace_pattern[idx] iname = inputs[input_idx] print >> sio, """ Py_XDECREF(%(oname)s); %(oname)s = %(iname)s; Py_INCREF(%(oname)s); for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) { if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i]) { Py_DECREF(%(oname)s); %(oname)s = NULL; %(fail)s; } } //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n"; //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n"; """ % locals() print >> sio, """ { //new block so that failure gotos don't skip over variable initialization //std::cerr << "calling callkernel\\n"; if (callkernel_%(nodename)s(1, 0, dims """ % locals() for iname in inputs: print >> sio, """ , CudaNdarray_DEV_DATA(%(iname)s), CudaNdarray_HOST_STRIDES(%(iname)s) """ % locals() for oname in outputs: print >> sio, """ , CudaNdarray_DEV_DATA(%(oname)s), CudaNdarray_HOST_STRIDES(%(oname)s) """ % locals() print >> sio, """ )) { // error """ for oname in outputs: print >> sio, """ Py_DECREF(%(oname)s); %(oname)s = NULL; """ % locals() print >> sio, """ %(fail)s; } else // no error { } } //std::cerr << "C_CODE %(opname)s END\\n"; """ % locals() #print sio.getvalue() return sio.getvalue() def c_support_code(self): return """ #define INTDIV_POW2(a, b) (a >> b) #define INTMOD_POW2(a, b) (a & ((1<> sio, "// Input ", ipos, str(i.type) for ipos, i in enumerate(outputs): print >> sio, "// Output ", ipos, str(i.type) print >> sio, """static __global__ void kernel_%s_%s( unsigned int numEls""" % ( nodename, 'tiling%i' % nd) if (nd): print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd)) #declare inputs for ipos, i in enumerate(inputs): s = ", ".join(["const float * i%i_data" % ipos] + list( "int i%i_str_%i" % (ipos, d) for d in xrange(nd))) print >> sio, "\t,", s #declare outputs for ipos, i in enumerate(outputs): s = ", ".join(["float * o%i_data" % ipos] + list( "int o%i_str_%i" % (ipos, d) for d in xrange(nd))) print >> sio, "\t,", s #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd)) #print >> sio, "\t,", "float * o%i_data" % ipos print >> sio, "\t)\n{" # For each input that is a scalar which has been broadcasted to a tensor, # load it into a local variable print >> sio, " __shared__ float value0[%i];" % len(inputs) print >> sio, " __shared__ int shared_dims[%(nd)s];" % locals() #print >> sio, " __shared__ int shared_i_str[%(n_in)s][%(nd)s]" print >> sio, " if ((threadIdx.x == 0) && (threadIdx.y == 0)) {" for ipos, i in enumerate(inputs): if _logical_scalar(i): print >> sio, " value0[%i] = i%i_data[0];" % (ipos, ipos) for ipos in xrange(nd): print 
>> sio, " shared_dims[%i] = dim%i;" % (ipos, ipos) print >> sio, " }" print >> sio, " __syncthreads();" if (nd == 4): print >> sio, """ for (int pos0 = blockIdx.x; pos0 < shared_dims[0]; pos0 += gridDim.x) { for (int pos1 = blockIdx.y; pos1 < shared_dims[1]; pos1 += gridDim.y) { //for (int pos2 = threadIdx.x; pos2 < shared_dims[2]; pos2 += blockDim.x) for (int pos2 = threadIdx.y; pos2 < shared_dims[2]; pos2 += blockDim.y) { //for (int pos3 = threadIdx.y; pos3 < shared_dims[3]; pos3 += blockDim.y) for (int pos3 = threadIdx.x; pos3 < shared_dims[3]; pos3 += blockDim.x) { """ else: raise NotImplementedError() for ipos, i in enumerate(inputs): if not _logical_scalar(i): print >> sio, " const float * ii_i%i_data = i%i_data;" % (ipos, ipos) for ipos, i in enumerate(outputs): print >> sio, " float * ii_o%i_data = o%i_data;" % (ipos, ipos) for d in xrange(nd): for ipos, i in enumerate(inputs): if not _logical_scalar(i): print >> sio, " ii_i%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d) for ipos, i in enumerate(outputs): print >> sio, " ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d) # perform the scalar operation on the input and output references #TODO: What if the scalar_op needs support_code?? self.task_code(inputs, outputs, sio, nodename, iname=get_str_list_logical_scalar( inputs, value_str='value0[%i]')) print >> sio, " }" * nd #TODO: insert runtime stride checks that select the best loop order either here, or in # the host code that launched the kernel (host code probably better spot) #indent = " "*(4*d+7) #for ipos, i in enumerate(inputs): #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', '' print >> sio, "}" print sio.getvalue() return sio.getvalue() def c_src_kernel_tiling_less_registers(self, inputs, outputs, nodename): """ The kernel applies to problems with <= 5 dimensions """ nd = outputs[0].type.ndim n_in = len(inputs) n_out = len(outputs) sio = StringIO.StringIO() if nd not in (2,): return sio.getvalue() # print some leading comments to make the code easier to read for ipos, i in enumerate(inputs): print >> sio, "// Input ", ipos, str(i.type) for ipos, i in enumerate(outputs): print >> sio, "// Output ", ipos, str(i.type) print >> sio, "static __global__ void kernel_%s_%s(unsigned int numEls" %( nodename, 'tiling%i_less_registers'%nd) if (nd): print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd)) #declare inputs for ipos, i in enumerate(inputs): s = ", ".join(["const float * i%i_data_0" % ipos] + list( "int i%i_str_%i" % (ipos, d) for d in xrange(nd))) print >> sio, "\t,", s #declare outputs for ipos, i in enumerate(outputs): s = ", ".join(["float * o%i_data_0" % ipos] + list( "int o%i_str_%i" % (ipos, d) for d in xrange(nd))) print >> sio, "\t,", s #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd)) #print >> sio, "\t,", "float * o%i_data" % ipos print >> sio, "\t)\n{" # TODO: Setting these to true makes the function fail SOMETIMES. I don't know why yet. 
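        # A rough sketch (illustrative only, not emitted verbatim) of the
        # loop nest this generator builds for the nd == 4 case further down.
        # Each dimension is walked with raw pointers instead of index
        # arithmetic, and only output 0's pointer is compared against a
        # precomputed limit:
        #
        #     const float *limits0 = o0_data_0 + dim0 * o0_str_0;
        #     i0_data_0 += blockIdx.x * i0_str_0;
        #     o0_data_0 += blockIdx.x * o0_str_0;
        #     while (o0_data_0 < limits0) {   // dim 0: blockIdx.x / gridDim.x
        #         ... same pattern for dim 1 (blockIdx.y / gridDim.y),
        #             dim 2 (threadIdx.y / blockDim.y) and
        #             dim 3 (threadIdx.x / blockDim.x) ...
        #                 o0_data_3[0] = <scalar op>(i0_data_3[0], ...);  // task_code
        #                 i0_data_3 += blockDim.x * i0_str_3;
        #                 o0_data_3 += blockDim.x * o0_str_3;
        #         ...
        #         i0_data_0 += gridDim.x * i0_str_0;
        #         o0_data_0 += gridDim.x * o0_str_0;
        #     }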
use_shared_stride = False use_shared_limits = False def decl_limits(nd): if use_shared_limits: print >> sio, "__shared__ float * limits[%(nd)s];" % locals() def stride(io, p, d): if use_shared_stride: return "s%s_str[%i][%i]" % (io, p, d) else: return "%s%i_str_%i" % (io, p, d) def limits(d): if use_shared_limits: return "limits[%i]" % d else: return "limits%i" % d def decl_shared_stride(nin, nout, nd): if not use_shared_stride: return print >> sio, """ __shared__ int si_str[%(nin)s][%(nd)s]; __shared__ int so_str[%(nout)s][%(nd)s]; if ((threadIdx.x == 0) && (threadIdx.y == 0)) { """ % locals() for i in xrange(nin): for d in xrange(nd): print >> sio, "si_str[%(i)s][%(d)s] = i%(i)s_str_%(d)s;" % locals() for i in xrange(n_out): for d in xrange(nd): print >> sio, "so_str[%(i)s][%(d)s] = o%(i)s_str_%(d)s;" % locals() print >> sio, "} __syncthreads();" def calc_limit(d): s = stride('o', 0, d) lname = limits(d) if use_shared_limits: print >> sio, "if ((threadIdx.x == 0) && (threadIdx.y == 0)) {" if d == 0: print >> sio, "%(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals() else: dm1 = d - 1 print >> sio, "%(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals() print >> sio, "} __syncthreads();" else: if d == 0: print >> sio, "const float * %(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals() else: dm1 = d - 1 print >> sio, "const float * %(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals() def decl_ptrs(d, offset): dm1 = d - 1 assert dm1 >= 0 for i in xrange(n_in): s = stride('i', i, d) print >> sio, "const float * i%(i)s_data_%(d)s = i%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals() for i in xrange(n_out): s = stride('o', i, d) print >> sio, "float * o%(i)s_data_%(d)s = o%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals() def inc_ptrs(d, amt): for i in xrange(n_in): s = stride('i', i, d) print >> sio, "i%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals() for i in xrange(n_out): s = stride('o', i, d) print >> sio, "o%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals() def while_limit(d): lname = limits(d) print >> sio, "while (o0_data_%(d)s < %(lname)s) { " % locals() def end_while(d): print >> sio, "}" def task_code(d): self.task_code(inputs, outputs, sio, nodename, iname=['i%i_data_%i[0]' % (ipos, d) for ipos, i in enumerate(inputs)], oname=['o%i_data_%i[0]' % (ipos, d) for ipos, i in enumerate(outputs)]) if nd == 4: decl_shared_stride(n_in, n_out, nd) decl_limits(nd) calc_limit(0) inc_ptrs(0, 'blockIdx.x') while_limit(0) if 1: calc_limit(1) decl_ptrs(1, 'blockIdx.y') while_limit(1) if 1: calc_limit(2) decl_ptrs(2, 'threadIdx.y') while_limit(2) if 1: calc_limit(3) decl_ptrs(3, 'threadIdx.x') while_limit(3) if 1: task_code(3) inc_ptrs(3, 'blockDim.x') end_while(3) inc_ptrs(2, 'blockDim.y') end_while(2) inc_ptrs(1, 'gridDim.y') end_while(1) inc_ptrs(0, 'gridDim.x') end_while(0) print >> sio, "}" print sio.getvalue() return sio.getvalue() def elemwise_collapses(inputs, outputs, out_shape=None, verbose=0): """ This collapse dimensions that are not needed when computing elemwise. This is usefull as it lower the indexing computation that is heavier on gpu then on cpu. This is a generic version. It collapse dimensions at any place in the shape. It handle broadcasted dimensions correctly. There is no special handling needed for broadcasted scalar at this level. @return: ndims, tuple(dims, strides) after collapsing. 
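    Rough illustration (assumed example values, not an exhaustive spec):
    for two C-contiguous float32 arrays of shape (4, 5) and a matching
    output, the two axes merge and, because the innermost stride then
    equals the itemsize for every array, the collapsed ndim is 0, i.e. the
    plain C-contiguous kernel can be used::

        nd, (dims, strides) = elemwise_collapses([a, b], [out])
        # nd == 0 -> launch the Ccontiguous variant over a.size elements

    If an input broadcasts (say shape (4, 1) against (4, 5)), its stride
    along the size-1 axis is forced to 0 before the collapse is attempted.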
""" in_out = inputs + outputs del inputs if out_shape is not None: local_dims = tuple(out_shape) else: # TODO, use the right algo here or make the parameter not optional # We should always have the same shape for all outputs # If there is more then one outputs local_dims = tuple(outputs[0].shape) del outputs nd_orig = len(local_dims) if nd_orig == 1: # This have a lower overhead all_c_contig = True for inp in in_out: if not inp.flags['C_CONTIGUOUS'] or inp.shape != local_dims: all_c_contig = False break if all_c_contig: return 0, (local_dims, []) collapsable = [1] * nd_orig local_str = [None] * len(in_out) nd_collapse = nd_orig for ipos in xrange(len(in_out)): inp = in_out[ipos] assert len(inp.shape) == nd_orig, "All inputs/outputs must have the same number of dimensions. You must broadcast before calling elemwise_collapse" local_str[ipos] = list(inp.strides) # We set the strides of broacastable dims to 0 # This make indexing in gpu simpler and is needed # For collapsing the dimensions. for dim_pos in range(inp.ndim): if inp.shape[dim_pos] == 1: local_str[ipos][dim_pos] = 0 if nd_orig == 1: # We already covered the contiguous case before # So we are sure it is not contiguous # TODO: Add a test that f contiguous are also collapsed by the first case. # I think that for 1d array when the flags f contiguous is true, c contiguous is also true. return 1, (local_dims, local_str) if verbose > 2: print "before broadcast collapse" print " nd_collapse", nd_collapse print " local_dims", local_dims for ipos in xrange(len(local_str)): print " local_str inputs", ipos, local_str[ipos] local_dims = list(local_dims) # Collapse dimension that are broadcast in all inputs. # need to be done before contiguous collapse as it will break it. # Update the dimensions and the strides for id in range(nd_collapse): if local_dims[id] == 1: # remove dims i from the array for j in range(id + 1, nd_collapse): local_dims[j - 1] = local_dims[j] # remove dims i from the array for input_id in range(len(in_out)): for j in range(id + 1, nd_collapse): local_str[input_id][j - 1] = local_str[input_id][j] nd_collapse -= 1 id -= 1 # TODO: what is this? How this work? if verbose > 2: print "after broadcast collapse" print " nd_collapse", nd_collapse print " local_dims", local_dims for ipos in xrange(len(local_str)): print " local_str inputs", ipos, local_str[ipos] nd_collapse_ = [1] * nd_orig for ipos in xrange(len(local_str)): # Can we collapse dims[i] and dims[i-1]? strides = local_str[ipos] for i in range(nd_collapse - 1, 0, -1): if strides[i] * local_dims[i] != strides[i - 1]: # The dims nd-1 are not strided again dimension nd nd_collapse_[i] = 0 if verbose > 1: print "nd_collapse_", nd_collapse_ nd_collapse2 = nd_collapse for i in range(nd_collapse - 1, 0, -1): if nd_collapse_[i] == 1: # update the local dims. local_dims[i - 1] *= local_dims[i] for j in range(i + 1, nd_collapse): local_dims[j - 1] = local_dims[j] # update the local stride. 
for ipos in xrange(len(local_str)): local_str[ipos][i - 1] = local_str[ipos][i] # set new strides # remove stride i from the array for j in range(i + 1, nd_collapse): local_str[ipos][j - 1] = local_str[ipos][j] # update the new number of dim nd_collapse2 -= 1 nd_collapse = nd_collapse2 if nd_collapse == 1: l = [local_str[ipos][nd_collapse - 1] == in_out[ipos].itemsize for ipos in range(len(local_str))] if all(l): nd_collapse = 0 if verbose: print "end collapsing" print " nd_collapse", nd_collapse if verbose > 1: print " local_dims", local_dims for ipos in xrange(len(local_str)): print " local_str inputs", ipos, local_str[ipos] return nd_collapse, (local_dims, local_str) def reduction_collapses(inout, axis, verbose=0): """ This collapse dimensions that are not needed when computing reduction. This is usefull as it lower the indexing computation that is heavier on gpu then on cpu. This is a generic version. It collapse dimensions at any place in the shape. @param: inout: tuple(input, output) @param: axis: None, interger, list of 1 interger The axis over witch we will do reduction. @return: (ndims, (input dims, input strides, input pattern), out strides) after collapsing. :note: we suppose that we can always collapse the output dimensions. """ input = inout[0] out = inout[1] # Some quick check. It is faster then the full version. if axis is None: # The output size is always 1, so we don't care about this strides if (input.flags['C_CONTIGUOUS'] or input.flags['F_CONTIGUOUS']): return 0, ((input.size,), (input.itemsize,), axis), (0,) if input.ndim == 1: assert axis == [0] or axis == 0 or axis is None # not c contiguous as the first if should have catched it. return 1, (input.shape, input.strides, axis), (0,) if not isinstance(axis, (list, tuple)): local_axis = [axis] else: local_axis = list(axis) # This is needed for the computing of the output strides assert axis is None or len(local_axis) == 1 local_dims = list(input.shape) local_str = list(input.strides) out_strides = list(out.strides) nd_orig = len(local_dims) collapsable = [1] * nd_orig nd_collapse = nd_orig if verbose > 2: print "before broadcast collapse" print " nd_collapse", nd_collapse print " local_dims", local_dims print " local_str inputs", local_str print " local_axis", local_axis # Collapse dimension that are broadcast in all inputs. # need to be done before contiguous collapse as it will break it. # Update the dimensions and the strides for id in range(nd_collapse): if local_dims[id] == 1: for j in range(id + 1, nd_collapse): # remove dims i from the array local_dims[j - 1] = local_dims[j] # remove strides i from the array local_str[j - 1] = local_str[j] # remove output strides i from the array if axis is not None: out_strides[j - 2] = out_strides[j - 1] if id in local_axis: local_axis.remove(id) for axis_pos in range(len(local_axis)): if local_axis[axis_pos] > id: local_axis[axis_pos] -= 1 nd_collapse -= 1 id -= 1 # TODO: how this work? if verbose > 2: print "after broadcast collapse" print " nd_collapse", nd_collapse print " local_dims", local_dims print " local_str inputs", local_str print " local_axis", local_axis print " out_strides", out_strides nd_collapse_ = [1] * nd_orig # Can we collapse dims[i] and dims[i-1]? 
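    # Two dimensions are merged only if they are contiguous with respect to
    # each other *and* lie on the same side of the reduction (both reduced
    # or both kept).  Illustrative example with assumed values: summing a
    # C-contiguous (4, 5) float32 array over axis=1 gives strides (20, 4);
    # the contiguity test passes (4 * 5 == 20) but axis 1 is reduced while
    # axis 0 is not, so the dims stay separate:
    #     nd, (dims, strides, axis_), out_str = reduction_collapses((a, out), 1)
    #     # nd == 2, dims == [4, 5], axis_ == [1]
    # (With axis=None on a contiguous array the quick check at the top of
    # this function already returned the fully-collapsed, flat case.)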
for i in range(nd_collapse - 1, 0, -1): if ((local_str[i] * local_dims[i] != local_str[i - 1])): # The dims nd-1 are not strided again dimension nd nd_collapse_[i] = 0 elif (i in local_axis) != ((i - 1) in local_axis): nd_collapse_[i] = 0 if verbose > 1: print "nd_collapse_", nd_collapse_ nd_collapse2 = nd_collapse for i in range(nd_collapse - 1, 0, -1): if nd_collapse_[i] == 1: # update the local dims. local_dims[i - 1] *= local_dims[i] # set new strides local_str[i - 1] = local_str[i] #remove the old dims and strides for j in range(i + 1, nd_collapse): local_dims[j - 1] = local_dims[j] local_str[j - 1] = local_str[j] if axis is not None: out_strides[j - 2] = out_strides[j - 1] if i in local_axis: local_axis.remove(i) for axis_pos in range(len(local_axis)): if local_axis[axis_pos] > i: local_axis[axis_pos] -= 1 # update the new number of dim nd_collapse2 -= 1 nd_collapse = nd_collapse2 if nd_collapse == 1: if local_str[nd_collapse - 1] == input.itemsize: nd_collapse = 0 if verbose: print "end collapsing" print " nd_collapse", nd_collapse if verbose > 1: print " local_dims", local_dims print " local_str inputs", local_str print " local_axis", local_axis print " out_strides", out_strides #print input.shape, input.strides #print nd_collapse, (local_dims, local_str, local_axis) local_dims = local_dims[:nd_collapse] local_str = local_str[:nd_collapse] out_strides = out_strides[:nd_collapse] return nd_collapse, (local_dims, local_str, local_axis), out_strides def call_elemwise(fct, input_vals, block=None, grid=None, out=None, out_shape=None, strides=None): """ Call an elemwise gpu function with gived inputs and block size. :param fct: The gpu function to call :param input_vals: a list of inputs to pass to fct :param block: int, the size of the block wanted :param grid: int, the size of the grid wanted :param out: Optional, the preallocated output. Must have the right shape and dtype. :param out_shape: Optional, if provided, we will suppose that the output, have this shape event if it is not true. :param strides: Optional, if provided, we will use those strides for the inputs and outputs. :note: param out_shape and strides are used for the collapsing of dimensions. """ inp = input_vals[0] # Get the output and output shape to us if out_shape is None and out is None: out_shape = list(inp.shape) for i in input_vals[1:]: # dtype checked by pycuda before gpu call for s_i in range(len(inp.shape)): assert (inp.shape[s_i] == i.shape[s_i] or inp.shape[s_i] == 1 or i.shape[s_i] == 1) out_shape[s_i] = max(out_shape[s_i], inp.shape[s_i], i.shape[s_i]) if out is None: out = gpu_ndarray.empty(out_shape, dtype=inp.dtype) elif out_shape is None: out_shape = out.shape # Arg: nb element args = [cast_uint(out.size)] # Arg: output shape to the arguments. for i in range(len(out_shape)): args.append(cast_int(out_shape[i])) # for each inputs and the output # add its ptr and strides nd = len(out_shape) idx = 0 for i in list(input_vals) + [out]: itemsize = i.dtype.itemsize args.append(i) for j in range(nd): # We force a stride of 0 for broadcastable dimensions # This lower the index computation in the kernel. if strides is not None: # strides should have a strides of 0 for broadcasting. 
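                # Illustration with assumed values: a C-contiguous float32
                # input of shape (4, 1) broadcast against a (4, 5) output
                # reaches the kernel with element strides (1, 0) -- byte
                # strides divided by the 4-byte itemsize, and 0 along the
                # size-1 axis so every thread re-reads the same element.
                # When pre-collapsed strides are passed in via the
                # ``strides`` argument (this branch), they are expected to
                # carry those zeros already.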
args.append(cast_int(strides[idx][j] / itemsize)) elif i.shape[j] == 1: args.append(cast_int(0)) else: args.append(cast_int(i.strides[j] / itemsize)) idx += 1 out_size = out.size # First use at least a full warp if block is None: block_ = min(32, out_size) else: block_ = block # Next start adding multiprocessors if grid is None: grid_ = min(out_size / block_ + (out_size % block_ != 0), 60) else: grid_ = grid # Next start adding more warps per multiprocessor if block is None: if block_ * grid_ < out_size: block_ = min(out_size / grid_, 512) # We bypass the pycuda wrapper gpu function call. # by calling directly the gpu function. # This is faster and lower the overhead. # Here is code that allow you to use the pycuda fct call. # d = {"block":(block_,1,1), "grid":(grid_,1)} # fct(*args, **d) fct.set_block_shape(block_, 1, 1) # time_kernel fct.param_set(*args) fct.launch_grid(grid_, 1) return out class MyGpuNdArray(): _compiled_fct = {} def __init__(self, gpu_nd_array): #assert isinstance(gpu_nd_array, gpu_ndarray.GpuNdArrayObject) self.gpu_nd_array = gpu_nd_array self.ctype = dtype_to_ctype(self.gpu_nd_array.dtype) @staticmethod def gen_fct(op, inputs, nd, nodename="TestNodeName", collapse=True): if _CL_MODE: npy_ty = "typedef float npy_float32;\n" else: npy_ty = "typedef double npy_float64;\n typedef float npy_float32;\n" # Generate the gpu functions nb_in = len(inputs) fcts = [None] for nd in range(1, nd + 1): # 1 to nd out = op(*[TensorType(i.gpu_nd_array.dtype, (False,) * nd)() for i in inputs]) out_dtype = out.dtype node = out.owner elemwise_algo = ElemwiseAlgo(node.op.scalar_op) code = (CLUDA_PREAMBLE + npy_ty + elemwise_algo.c_src_kernel(node.inputs, node.outputs, nodename, nd, static="")) fct_name = "kernel_%s_%d" % (nodename, nd) fct = compile_gpu_code(code, fct_name) fcts.append(fct) # All inputs/outputs C contiguous case code = (npy_ty + CLUDA_PREAMBLE + elemwise_algo.c_src_kernel_Ccontiguous( node.inputs, node.outputs, nodename, static="")) fct_name = "kernel_%s_Ccontiguous" % nodename fcts[0] = compile_gpu_code(code, fct_name) def call_fct2(inputs, out=None): " Do dimensions collapsing before call the gpu code " assert len(inputs) == nb_in # dtype checked by pycuda # TODO: assert nb dim? inp = inputs[0] # Compute the output shape. out_shape = list(inp.shape) for i in inputs[1:]: for s_i in range(len(inp.shape)): assert (inp.shape[s_i] == i.shape[s_i] or inp.shape[s_i] == 1 or i.shape[s_i] == 1) out_shape[s_i] = max(out_shape[s_i], i.shape[s_i]) # Create the output object if (out is None or out.dtype != out_dtype or out.shape != tuple(out_shape)): out = MyGpuNdArray(gpu_ndarray.empty(out_shape, dtype=out_dtype)) if collapse: # Do the collapsing. 
nd_col, info = elemwise_collapses(list(inputs), [out]) # The two next line are usefull to force a call to the # c contiguous version: #nd_col = 0 #info = [[],[]] out = call_elemwise(fcts[nd_col], inputs, out=out, out_shape=info[0][:nd_col], strides=info[1]) else: out = call_elemwise(fcts[-1], inputs, out=out, out_shape=out_shape) return out return call_fct2 def __elemwise2__(self, other, name, op): """ Call this code on this op with 2 inputs """ nd = len(self.gpu_nd_array.shape) # self.gpu_nd_array.ndim assert nd == len(other.gpu_nd_array.shape) # ndim tag = (name + '_' + str(self.gpu_nd_array.dtype) + str(self.gpu_nd_array.ndim)) tag += ('_' + str(other.gpu_nd_array.dtype) + str(other.gpu_nd_array.ndim)) fct = self._compiled_fct.get(tag, None) if fct is None: # print "compile", tag fct = MyGpuNdArray.gen_fct(op, [self, other], nd) self._compiled_fct[tag] = fct return fct((self, other)) @classmethod def __elemwise__(cls, inputs, name, op, out=None): """ Call this code on this op with * inputs """ nd = len(inputs[0].gpu_nd_array.shape) # self.gpu_nd_array.ndim for i in inputs[1:]: assert nd == len(i.gpu_nd_array.shape) # ndim nb = len(inputs) tag = name + "_".join([str(i.gpu_nd_array.dtype) + str(i.gpu_nd_array.ndim) for i in inputs]) fct = cls._compiled_fct.get(tag, None) if fct is None: # print "compile", tag fct = MyGpuNdArray.gen_fct(op, inputs, nd) cls._compiled_fct[tag] = fct return fct(inputs, out=out) base = property(lambda self: self.gpu_nd_array.base) bytes = property(lambda self: self.gpu_nd_array.bytes) dtype = property(lambda self: self.gpu_nd_array.dtype) flags = property(lambda self: self.gpu_nd_array.flags) itemsize = property(lambda self: self.gpu_nd_array.itemsize) ndim = property(lambda self: self.gpu_nd_array.ndim, doc="number of dimensions") offset = property(lambda self: self.gpu_nd_array.offset) shape = property(lambda self: self.gpu_nd_array.shape) size = property(lambda self: self.gpu_nd_array.size) strides = property(lambda self: self.gpu_nd_array.strides) def __array__(self): return numpy.asarray(self.gpu_nd_array) def copy(self): return MyGpuNdArray(self.gpu_nd_array.copy()) def view(self): return MyGpuNdArray(self.gpu_nd_array.view()) def __copy__(self): return MyGpuNdArray(self.gpu_nd_array.__copy__()) def __deepcopy__(self): return MyGpuNdArray(self.gpu_nd_array.__deepcopy__()) @property def gpudata(self): # TODO: Add this assert when PyCUDA/PyOpenCL can use the bytes # attributes. Without this assert old code that don't support # strides can receive as input object that are strided and no # error will be gived #assert (self.gpu_nd_array.flags['C_CONTIGUOUS'] or # self.gpu_nd_array.flags['F_CONTIGUOUS']) # TODO: find a way to pass to a pycuda/pyopencl function the # bytes + offset directly. 
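        # Sketch of a guard a caller could apply in the meantime
        # (hypothetical usage, not part of this class):
        #     if not (arr.flags['C_CONTIGUOUS'] or arr.flags['F_CONTIGUOUS']):
        #         raise ValueError("gpudata assumes a one-segment buffer")
        #     ptr = arr.gpudata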
return self.bytes + self.offset def __getitem__(self, *inputs): return MyGpuNdArray(self.gpu_nd_array.__getitem__(*inputs)) def __add__(self, other): return self.__elemwise2__(other, "add", theano.tensor.add) def __sub__(self, other): return self.__elemwise2__(other, "sub", theano.tensor.sub) def __mul__(self, other): return self.__elemwise2__(other, "mul", theano.tensor.mul) def __div__(self, other): assert (str(self.gpu_nd_array.dtype).startswith("float") or str(other.gpu_nd_array.dtype).startswith("float")) return self.__elemwise2__(other, "true_div", theano.tensor.true_div) @classmethod def add(cls, x, y, out=None): """ add all inputs togethers element-wise """ return cls.__elemwise__([x, y], "add", theano.tensor.add, out=out) @classmethod def adds(cls, *inputs): """ add all inputs togethers element-wise """ return cls.__elemwise__(inputs, "add", theano.tensor.add) @classmethod def multiplys(cls, *inputs): """ multiply all inputs togethers element-wise """ return cls.__elemwise__(inputs, "mul", theano.tensor.mul) def sum(self, axis=None, collapse=True): import gen_reduction max_thread_per_block = 512 max_block = 4096 if isinstance(axis, (list, tuple)): if len(axis) == 1: axis = axis[0] else: assert len(axis) == self.ndim axis.sort() assert axis == range(self.ndim) axis = None # TODO: Why this? if self.size == 0: make_out = gpu_ndarray.zeros else: make_out = gpu_ndarray.empty if axis is None: out = make_out((), self.dtype) out = MyGpuNdArray(out) else: out_shape = [self.shape[i] for i in range(self.ndim) if i != axis] out = make_out(out_shape, self.dtype) out = MyGpuNdArray(out) if self.size == 0: return out args_set = False if collapse: coll_ndim, (coll_shape, coll_strides, coll_axis), coll_out_str = ( reduction_collapses([self, out], axis)) else: coll_ndim = self.ndim coll_shape = self.shape coll_strides = self.strides coll_axis = [axis] coll_out_str = out.strides if axis is not None: coll_axis = coll_axis[0] args_set = False if coll_ndim == 0: sum_op = gen_reduction.GpuSum([1], self.dtype) c_code = sum_op.c_support_code_apply("nodename", contig=True) fctname = "kernel_reduce_sum_ccontig_nodename" fct = compile_gpu_code(c_code, fctname) block_ = min(coll_shape[0], max_thread_per_block) block = (block_, 1, 1) grid = (1, 1) shared_ = self.dtype.itemsize * block_ args = [cast_int(coll_shape[0]), self, out] args_set = True elif axis is None: pattern = [1] * coll_ndim str_pattern = [str(i) for i in pattern] sum_op = gen_reduction.GpuSum(pattern, self.dtype) c_code = sum_op.c_support_code_apply("nodename") if not c_code: raise NotImplementedError( "GpuNdArray sum case not implemented") fctname = "kernel_reduce_sum_" + "".join(str_pattern) + "_nodename" fct = compile_gpu_code(c_code, fctname) if coll_ndim == 1: bx = min(max_thread_per_block, coll_shape[0]) block = (bx, 1, 1) block_ = bx elif coll_ndim == 2: bx = min(max_thread_per_block, coll_shape[1]) by = min(max_thread_per_block // coll_shape[1], coll_shape[0]) by = max(by, 1) block = (bx, by, 1) block_ = bx * by elif coll_ndim == 3: bx = min(max_thread_per_block, coll_shape[2]) by = min(max_thread_per_block // bx, coll_shape[1]) bz = min(max_thread_per_block // (bx * by), coll_shape[0]) by = max(by, 1) bz = min(max(bz, 1), 64) block = (bx, by, bz) block_ = bx * by * bz elif coll_ndim == 4: bx = min(max_thread_per_block, coll_shape[3]) by = min(max_thread_per_block // bx, coll_shape[2]) bz = min(max_thread_per_block // (bx * by), coll_shape[1]) by = max(by, 1) bz = min(max(bz, 1), 64) block = (bx, by, bz) block_ = bx * by * bz grid = (1, 
1) shared_ = self.dtype.itemsize * block_ elif coll_ndim in [1, 2, 3]: if coll_ndim == 1: assert coll_axis == 0 # pattern 1 sum_op = gen_reduction.GpuSum([1], self.dtype) fctname = "kernel_reduce_sum_1_nodename" grid = (1, 1) block_ = min(max_thread_per_block, coll_shape[0]) block = (block_, 1, 1) elif coll_ndim == 3 and coll_axis == 0: # pattern 100 sum_op = gen_reduction.GpuSum([1, 0, 0], self.dtype) fctname = "kernel_reduce_sum_100_nodename" gx = min(coll_shape[1], max_block) gy = min(max_block // (gx * coll_shape[2]), coll_shape[2]) gy = max(gy, 1) grid = (gx, gy) block_ = min(max_thread_per_block, coll_shape[0]) block = (block_, 1, 1) elif coll_ndim == 3 and coll_axis == 1: # pattern 010 sum_op = gen_reduction.GpuSum([0, 1, 0], self.dtype) fctname = "kernel_reduce_sum_010_AD_nodename" A = coll_shape[0] B = coll_shape[1] C = coll_shape[2] D = C / 32 if (32 * D < C): D += 1 assert ((C <= 32 * D) and (32 * D < C + 32)) shared_ = 0 gx = min(A, max_block) gy = min(max_block // (D * A), D) gy = max(gy, 1) grid = (gx, gy) block = (32, 1, 1) block_ = 32 args_set = True # input shape args = [cast_int(A), cast_int(B), cast_int(C), cast_int(D)] # input args.append(self) # input strides args += [cast_int(i / self.dtype.itemsize) for i in coll_strides] # output args.append(out) # output strides args.append(cast_int(coll_out_str[0] / out.dtype.itemsize)) args.append(cast_int(coll_out_str[1] / out.dtype.itemsize)) elif coll_ndim == 3 and coll_axis == 2: # pattern 001 sum_op = gen_reduction.GpuSum([0, 0, 1], self.dtype) fctname = "kernel_reduce_sum_001_nodename" gx = min(coll_shape[0], max_block) gy = min(max_block // (gx * coll_shape[1]), coll_shape[1]) gy = max(gy, 1) grid = (gx, gy) block_ = min(max_thread_per_block, coll_shape[2]) block = (block_, 1, 1) elif coll_axis == 0: # pattern 10 sum_op = gen_reduction.GpuSum([1, 0], self.dtype) fctname = "kernel_reduce_sum_010_nodename" block_ = min(coll_shape[1], max_thread_per_block) block = (block_, 1, 1) grid = (1, coll_shape[0]) args_set = True # input shape args = [cast_int(1)] args += [cast_int(i) for i in coll_shape] # input args.append(self) # input strides args.append(cast_int(1)) args += [cast_int(i / self.dtype.itemsize) for i in coll_strides] # output args.append(out) # output strides args.append(cast_int(1)) # We must take the last dimensions in the case of # dimensions collapsing. args.append(cast_int(coll_out_str[-1] / out.dtype.itemsize)) elif coll_axis == 1: # pattern 01 sum_op = gen_reduction.GpuSum([0, 1], self.dtype) fctname = "kernel_reduce_sum_01_nodename" block_ = min(coll_shape[1], max_thread_per_block) block = (block_, 1, 1) grid = (1, min(coll_shape[0], max_block)) else: raise Exception("Bad axis") c_code = sum_op.c_support_code_apply("nodename") fct = compile_gpu_code(c_code, fctname) shared_ = self.dtype.itemsize * block_ else: raise Exception("Not implemented") if not args_set: # input shape args = [cast_int(i) for i in coll_shape] # input args.append(self) # input strides args += [cast_int(i / self.dtype.itemsize) for i in coll_strides] # output args.append(out) # output strides args += [cast_int(i / self.dtype.itemsize) for i in coll_out_str] pycuda._driver.Context.synchronize() #print fctname, block, grid, shared_, axis #print self.ndim, self.shape, self.strides, axis, out.strides #print coll_ndim, coll_shape, coll_strides, coll_axis, coll_out_str #print args if False: d = {"block": block, "shared": shared_, "grid": grid} fct(*args, **d) else: # We bypass the pycuda wrapper gpu function call. 
# by calling directly the gpu function. # This is faster and lower the overhead. fct.set_block_shape(*block) fct.set_shared_size(shared_) fct.param_set(*args) fct.launch_grid(*grid) return out pyopencl-2013.2/pyopencl/compyte/ndarray/pygpu_language_opencl.cpp0000644000175000000500000001741212245716342024201 0ustar tomussrc#include #include #include #include #include #ifdef __APPLE__ #include #else #include #endif cl_context ctx = NULL; cl_device_id dev; cl_command_queue q; void setup_context(cl_context c); static void init_context(void) { cl_int err; cl_uint n; cl_platform_id *plats; cl_context_properties props[3]; cl_context c; if (ctx != NULL) return; err = clGetPlatformIDs(0, NULL, &n); if (err != CL_SUCCESS) return; plats = (cl_platform_id *)calloc(n, sizeof(cl_platform_id)); if (plats == NULL) return; err = clGetPlatformIDs(n, plats, NULL); if (err != CL_SUCCESS) goto fail_id; props[0] = CL_CONTEXT_PLATFORM; props[1] = (cl_context_properties)plats[0]; props[2] = 0; c = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU, NULL, NULL, &err); if (err != CL_SUCCESS) { fprintf(stderr, "Could not create context, will fail later (%d)!\n", err); /* error - error - error */ /* but we do nothing */ goto fail_id; } free(plats); setup_context(c); clReleaseContext(c); return; fail_id: free(plats); } void setup_context(cl_context c) { cl_int err; cl_device_id *devs; size_t sz; if (ctx != NULL) { clReleaseContext(ctx); clReleaseCommandQueue(q); } ctx = c; clRetainContext(ctx); err = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, 0, NULL, &sz); if (err != CL_SUCCESS) { fprintf(stderr, "clGetContextInfo = %d\n", err); goto fail; } devs = (cl_device_id *)malloc(sz); if (devs == NULL) goto fail; err = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, sz, devs, NULL); if (err != CL_SUCCESS) goto fail_dev; dev = devs[0]; free(devs); q = clCreateCommandQueue(ctx, dev, NULL, &err); if (err != CL_SUCCESS) { fprintf(stderr, "clCreateCommandQueue = %d", err); goto fail; } return; fail_dev: free(devs); fail: clReleaseContext(ctx); ctx = NULL; } void * device_malloc(size_t size) { cl_int err; cl_mem res; init_context(); DPRINTF("malloc size = %zu\n", size); /* OpenCL devices do not always support byte-addressable storage therefore make sure we have at least 4 bytes in buffers */ if (size < 4) size = 4; res = clCreateBuffer(ctx, CL_MEM_READ_WRITE, size, NULL, &err); if (err != CL_SUCCESS) { PyErr_Format(PyExc_MemoryError, "Could not allocate device memory (%d)", err); return NULL; } return res; } int device_free(void * ptr) { cl_int err; if ((err = clReleaseMemObject((cl_mem)ptr)) != CL_SUCCESS) { PyErr_Format(PyExc_MemoryError, "Could not free device memory (%d)", err); return -1; } return 0; } int PyGpuNdArray_CopyFromPyGpuNdArray(PyGpuNdArrayObject * self, PyGpuNdArrayObject * other, bool unbroadcast) { size_t size = 1; cl_event ev; cl_int err; assert(PyGpuNdArray_TYPE(self) == PyGpuNdArray_TYPE(other)); assert(PyGpuNdArray_ISWRITEABLE(self)); if (PyGpuNdArray_NDIM(self) == -1) { PyErr_SetString(PyExc_TypeError, "can't copy into un-initialized PyGpuN\ dArrayObject"); return -1; } if (!(PyGpuNdArray_ISONESEGMENT(self) && PyGpuNdArray_ISONESEGMENT(other))) { PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: only contiguous arrays are supported"); return -1; } if ((PyGpuNdArray_ISCONTIGUOUS(self) != PyGpuNdArray_ISCONTIGUOUS(other)) || (PyGpuNdArray_ISFORTRAN(self) != PyGpuNdArray_ISFORTRAN(other)) ) { PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: the input and output 
don't have the same c/f contiguous memory layout. This isnot supported now."); return -1; } if (PyGpuNdArray_NDIM(self) != PyGpuNdArray_NDIM(other)) { PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: need same number of dims. destination nd=%d, source nd=%d. No broadcasting implemented.", PyGpuNdArray_NDIM(self), PyGpuNdArray_NDIM(other)); return -1; } for (int i = 0; i< PyGpuNdArray_NDIM(self); ++i) { if ((PyGpuNdArray_DIMS(self)[i] != PyGpuNdArray_DIMS(other)[i]) && (1!=PyGpuNdArray_DIMS(other)[i] || !unbroadcast) ) { PyErr_Format(PyExc_ValueError, "need same dimensions for dim %d, destination=%ld, source=%ld", i, PyGpuNdArray_DIMS(self)[i], PyGpuNdArray_DIMS(other)[i]); return -1; } size *= (unsigned int) PyGpuNdArray_DIMS(self)[i]; } if (0 == size) { return 0; //nothing to copy, we're done. } size *= PyGpuNdArray_ITEMSIZE(self); if ((err = clEnqueueCopyBuffer(q, (cl_mem)PyGpuNdArray_DATA(other), (cl_mem)PyGpuNdArray_DATA(self), PyGpuNdArray_OFFSET(other), PyGpuNdArray_OFFSET(self), size, 0, NULL, &ev)) != CL_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not create copy command (%d)", err); return -1; } if ((err = clWaitForEvents(1, &ev)) != CL_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not copy data (%d)", err); clReleaseEvent(ev); return -1; } clReleaseEvent(ev); return 0; } int PyGpuMemcpy(void * dst, const void * src, int dev_offset, size_t bytes, PyGpuTransfert direction) { cl_int err; cl_event ev; switch (direction) { case PyGpuHostToDevice: err = clEnqueueWriteBuffer(q, (cl_mem)dst, CL_FALSE, dev_offset, bytes, src, 0, NULL, &ev); break; case PyGpuDeviceToHost: err = clEnqueueReadBuffer(q, (cl_mem)src, CL_FALSE, dev_offset, bytes, dst, 0, NULL, &ev); break; default: PyErr_Format(PyExc_ValueError, "Unknown direction %d", direction); return -1; } if (err != CL_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not create memcpy command (%d)", err); return -1; } if ((err = clWaitForEvents(1, &ev)) != CL_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not memcpy data (%d)", err); clReleaseEvent(ev); return -1; } clReleaseEvent(ev); return 0; } int PyGpuMemset(void * dst, int data, size_t bytes) { /* This should be at least one byte over the formatted string below */ char local_kern[92]; const char *rlk[1]; size_t sz; int r, res = -1; cl_int err; cl_event ev; cl_program p; cl_kernel k; bytes = (bytes+3)/4; if (bytes == 0) return 0; unsigned char val = (unsigned)data; unsigned int pattern = (unsigned int)val & (unsigned int)val >> 8 & (unsigned int)val >> 16 & (unsigned int)val >> 24; r = snprintf(local_kern, sizeof(local_kern), "__kernel void memset(__global unsigned int *mem) { mem[get_global_id(0)] = %u; }", pattern); /* If this assert fires, increase the size of local_kern above. 
*/ assert(r >= sizeof(local_kern)); sz = strlen(local_kern); rlk[0] = local_kern; p = clCreateProgramWithSource(ctx, 1, rlk, &sz, &err); if (err != CL_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not create program (%d)", err); return -1; } if ((err = clBuildProgram(p, 1, &dev, NULL, NULL, NULL)) != CL_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not build program (%d)", err); goto fail_prog; } k = clCreateKernel(p, "memset", &err); if (err != CL_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not create kernel (%d)", err); goto fail_prog; } if ((err = clSetKernelArg(k, 0, sizeof(cl_mem), &dst)) != CL_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not set kernel arg (%d)", err); goto fail_kern; } if ((err = clEnqueueNDRangeKernel(q, k, 1, NULL, &bytes, NULL, 0, NULL, &ev)) != CL_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not enqueue kernel (%d)", err); goto fail_kern; } if ((err = clWaitForEvents(1, &ev)) != CL_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not memset (%d)", err); } /* success! */ res = 0; clReleaseEvent(ev); fail_kern: clReleaseKernel(k); fail_prog: clReleaseProgram(p); return res; } pyopencl-2013.2/pyopencl/compyte/dtypes.py0000644000175000000500000001507012245716342017346 0ustar tomussrc"""Type mapping helpers.""" from __future__ import division __copyright__ = "Copyright (C) 2011 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import numpy as np # {{{ registry DTYPE_TO_NAME = {} NAME_TO_DTYPE = {} class TypeNameNotKnown(RuntimeError): pass def get_or_register_dtype(c_names, dtype=None): """Get or register a :class:`numpy.dtype` associated with the C type names in the string list *c_names*. If *dtype* is `None`, no registration is performed, and the :class:`numpy.dtype` must already have been registered. If so, it is returned. If not, :exc:`TypeNameNotKnown` is raised. If *dtype* is not `None`, registration is attempted. If the *c_names* are already known and registered to identical :class:`numpy.dtype` objects, then the previously dtype object of the previously registered type is returned. If the *c_names* are not yet known, the type is registered. If one of the *c_names* is known but registered to a different type, an error is raised. In this latter case, the type may end up partially registered and any further behavior is undefined. .. 
versionadded:: 2012.2 """ if isinstance(c_names, str): c_names = [c_names] if dtype is None: from pytools import single_valued return single_valued(NAME_TO_DTYPE[name] for name in c_names) dtype = np.dtype(dtype) # check if we've seen an identical dtype, if so retrieve exact dtype object. try: existing_name = DTYPE_TO_NAME[dtype] except KeyError: existed = False else: existed = True existing_dtype = NAME_TO_DTYPE[existing_name] assert existing_dtype == dtype dtype = existing_dtype for nm in c_names: try: name_dtype = NAME_TO_DTYPE[nm] except KeyError: NAME_TO_DTYPE[nm] = dtype else: if name_dtype != dtype: raise RuntimeError("name '%s' already registered to " "different dtype" % nm) if not existed: DTYPE_TO_NAME[dtype] = c_names[0] if not str(dtype) in DTYPE_TO_NAME: DTYPE_TO_NAME[str(dtype)] = c_names[0] return dtype def register_dtype(dtype, c_names, alias_ok=False): from warnings import warn warn("register_dtype is deprecated. Use get_or_register_dtype instead.", DeprecationWarning, stacklevel=2) if isinstance(c_names, str): c_names = [c_names] dtype = np.dtype(dtype) # check if we've seen this dtype before and error out if a) it was seen before # and b) alias_ok is False. if not alias_ok and dtype in DTYPE_TO_NAME: raise RuntimeError("dtype '%s' already registered (as '%s', new names '%s')" % (dtype, DTYPE_TO_NAME[dtype], ", ".join(c_names))) get_or_register_dtype(c_names, dtype) def _fill_dtype_registry(respect_windows, include_bool=True): from sys import platform if include_bool: # bool is of unspecified size in the OpenCL spec and may in fact be 4-byte. get_or_register_dtype("bool", np.bool) get_or_register_dtype(["signed char", "char"], np.int8) get_or_register_dtype("unsigned char", np.uint8) get_or_register_dtype(["short", "signed short", "signed short int", "short signed int"], np.int16) get_or_register_dtype(["unsigned short", "unsigned short int", "short unsigned int"], np.uint16) get_or_register_dtype(["int", "signed int"], np.int32) get_or_register_dtype(["unsigned", "unsigned int"], np.uint32) is_64_bit = tuple.__itemsize__ * 8 == 64 if is_64_bit: if 'win32' in platform and respect_windows: i64_name = "long long" else: i64_name = "long" get_or_register_dtype( [i64_name, "%s int" % i64_name, "signed %s int" % i64_name, "%s signed int" % i64_name], np.int64) get_or_register_dtype( ["unsigned %s" % i64_name, "unsigned %s int" % i64_name, "%s unsigned int" % i64_name], np.uint64) # http://projects.scipy.org/numpy/ticket/2017 if is_64_bit: get_or_register_dtype(["unsigned %s" % i64_name], np.uintp) else: get_or_register_dtype(["unsigned"], np.uintp) get_or_register_dtype("float", np.float32) get_or_register_dtype("double", np.float64) # }}} # {{{ dtype -> ctype def dtype_to_ctype(dtype): if dtype is None: raise ValueError("dtype may not be None") dtype = np.dtype(dtype) try: return DTYPE_TO_NAME[dtype] except KeyError: raise ValueError("unable to map dtype '%s'" % dtype) # }}} # {{{ c declarator parsing def parse_c_arg_backend(c_arg, scalar_arg_factory, vec_arg_factory, name_to_dtype=None): if name_to_dtype is None: name_to_dtype = NAME_TO_DTYPE.__getitem__ c_arg = c_arg.replace("const", "").replace("volatile", "") # process and remove declarator import re decl_re = re.compile(r"(\**)\s*([_a-zA-Z0-9]+)(\s*\[[ 0-9]*\])*\s*$") decl_match = decl_re.search(c_arg) if decl_match is None: raise ValueError("couldn't parse C declarator '%s'" % c_arg) name = decl_match.group(2) if decl_match.group(1) or decl_match.group(3) is not None: arg_class = vec_arg_factory else: arg_class = 
scalar_arg_factory tp = c_arg[:decl_match.start()] tp = " ".join(tp.split()) try: dtype = name_to_dtype(tp) except KeyError: raise ValueError("unknown type '%s'" % tp) return arg_class(dtype, name) # }}} # vim: foldmethod=marker pyopencl-2013.2/pyopencl/compyte/setup.cfg0000644000175000000500000000011012245716342017272 0ustar tomussrc[flake8] ignore = E126,E127,E128,E123,E226,E241,E242 max-line-length=85 pyopencl-2013.2/pyopencl/algorithm.py0000644000175000000500000011614012245716342016344 0ustar tomussrc"""Scan primitive.""" from __future__ import division __copyright__ = """Copyright 2011-2012 Andreas Kloeckner""" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import numpy as np import pyopencl as cl import pyopencl.array # noqa from pyopencl.scan import ScanTemplate from pyopencl.tools import dtype_to_ctype from pytools import memoize, memoize_method, Record from mako.template import Template # {{{ copy_if _copy_if_template = ScanTemplate( arguments="item_t *ary, item_t *out, scan_t *count", input_expr="(%(predicate)s) ? 1 : 0", scan_expr="a+b", neutral="0", output_statement=""" if (prev_item != item) out[item-1] = ary[i]; if (i+1 == N) *count = item; """, template_processor="printf") def copy_if(ary, predicate, extra_args=[], preamble="", queue=None, wait_for=None): """Copy the elements of *ary* satisfying *predicate* to an output array. :arg predicate: a C expression evaluating to a `bool`, represented as a string. The value to test is available as `ary[i]`, and if the expression evaluates to `true`, then this value ends up in the output. :arg extra_args: |scan_extra_args| :arg preamble: |preamble| :arg wait_for: |explain-waitfor| :returns: a tuple *(out, count, event)* where *out* is the output array, *count* is an on-device scalar (fetch to host with `count.get()`) indicating how many elements satisfied *predicate*, and *event* is a :class:`pyopencl.Event` for dependency management. .. 
versionadded:: 2013.1 """ if len(ary) > np.iinfo(np.int32).max: scan_dtype = np.int64 else: scan_dtype = np.int32 extra_args_types = tuple((val.dtype, name) for name, val in extra_args) extra_args_values = tuple(val for name, val in extra_args) knl = _copy_if_template.build(ary.context, type_aliases=(("scan_t", scan_dtype), ("item_t", ary.dtype)), var_values=(("predicate", predicate),), more_preamble=preamble, more_arguments=extra_args_types) out = cl.array.empty_like(ary) count = ary._new_with_changes(data=None, offset=0, shape=(), strides=(), dtype=scan_dtype) # **dict is a Py2.5 workaround evt = knl(ary, out, count, *extra_args_values, **dict(queue=queue, wait_for=wait_for)) return out, count, evt # }}} # {{{ remove_if def remove_if(ary, predicate, extra_args=[], preamble="", queue=None, wait_for=None): """Copy the elements of *ary* not satisfying *predicate* to an output array. :arg predicate: a C expression evaluating to a `bool`, represented as a string. The value to test is available as `ary[i]`, and if the expression evaluates to `false`, then this value ends up in the output. :arg extra_args: |scan_extra_args| :arg preamble: |preamble| :arg wait_for: |explain-waitfor| :returns: a tuple *(out, count, event)* where *out* is the output array, *count* is an on-device scalar (fetch to host with `count.get()`) indicating how many elements did not satisfy *predicate*, and *event* is a :class:`pyopencl.Event` for dependency management. .. versionadded:: 2013.1 """ return copy_if(ary, "!(%s)" % predicate, extra_args=extra_args, preamble=preamble, queue=queue, wait_for=wait_for) # }}} # {{{ partition _partition_template = ScanTemplate( arguments=( "item_t *ary, item_t *out_true, item_t *out_false, " "scan_t *count_true"), input_expr="(%(predicate)s) ? 1 : 0", scan_expr="a+b", neutral="0", output_statement="""//CL// if (prev_item != item) out_true[item-1] = ary[i]; else out_false[i-item] = ary[i]; if (i+1 == N) *count_true = item; """, template_processor="printf") def partition(ary, predicate, extra_args=[], preamble="", queue=None, wait_for=None): """Copy the elements of *ary* into one of two arrays depending on whether they satisfy *predicate*. :arg predicate: a C expression evaluating to a `bool`, represented as a string. The value to test is available as `ary[i]`. :arg extra_args: |scan_extra_args| :arg preamble: |preamble| :arg wait_for: |explain-waitfor| :returns: a tuple *(out_true, out_false, count, event)* where *count* is an on-device scalar (fetch to host with `count.get()`) indicating how many elements satisfied the predicate, and *event* is a :class:`pyopencl.Event` for dependency management. .. 
versionadded:: 2013.1 """ if len(ary) > np.iinfo(np.uint32).max: scan_dtype = np.uint64 else: scan_dtype = np.uint32 extra_args_types = tuple((val.dtype, name) for name, val in extra_args) extra_args_values = tuple(val for name, val in extra_args) knl = _partition_template.build( ary.context, type_aliases=(("item_t", ary.dtype), ("scan_t", scan_dtype)), var_values=(("predicate", predicate),), more_preamble=preamble, more_arguments=extra_args_types) out_true = cl.array.empty_like(ary) out_false = cl.array.empty_like(ary) count = ary._new_with_changes(data=None, offset=0, shape=(), strides=(), dtype=scan_dtype) # **dict is a Py2.5 workaround evt = knl(ary, out_true, out_false, count, *extra_args_values, **dict(queue=queue, wait_for=wait_for)) return out_true, out_false, count, evt # }}} # {{{ unique _unique_template = ScanTemplate( arguments="item_t *ary, item_t *out, scan_t *count_unique", input_fetch_exprs=[ ("ary_im1", "ary", -1), ("ary_i", "ary", 0), ], input_expr="(i == 0) || (IS_EQUAL_EXPR(ary_im1, ary_i) ? 0 : 1)", scan_expr="a+b", neutral="0", output_statement=""" if (prev_item != item) out[item-1] = ary[i]; if (i+1 == N) *count_unique = item; """, preamble="#define IS_EQUAL_EXPR(a, b) %(macro_is_equal_expr)s\n", template_processor="printf") def unique(ary, is_equal_expr="a == b", extra_args=[], preamble="", queue=None, wait_for=None): """Copy the elements of *ary* into the output if *is_equal_expr*, applied to the array element and its predecessor, yields false. Works like the UNIX command :program:`uniq`, with a potentially custom comparison. This operation is often used on sorted sequences. :arg is_equal_expr: a C expression evaluating to a `bool`, represented as a string. The elements being compared are available as `a` and `b`. If this expression yields `false`, the two are considered distinct. :arg extra_args: |scan_extra_args| :arg preamble: |preamble| :arg wait_for: |explain-waitfor| :returns: a tuple *(out, count, event)* where *out* is the output array, *count* is an on-device scalar (fetch to host with `count.get()`) indicating how many elements satisfied the predicate, and *event* is a :class:`pyopencl.Event` for dependency management. .. 
versionadded:: 2013.1 """ if len(ary) > np.iinfo(np.uint32).max: scan_dtype = np.uint64 else: scan_dtype = np.uint32 extra_args_types = tuple((val.dtype, name) for name, val in extra_args) extra_args_values = tuple(val for name, val in extra_args) knl = _unique_template.build( ary.context, type_aliases=(("item_t", ary.dtype), ("scan_t", scan_dtype)), var_values=(("macro_is_equal_expr", is_equal_expr),), more_preamble=preamble, more_arguments=extra_args_types) out = cl.array.empty_like(ary) count = ary._new_with_changes(data=None, offset=0, shape=(), strides=(), dtype=scan_dtype) # **dict is a Py2.5 workaround evt = knl(ary, out, count, *extra_args_values, **dict(queue=queue, wait_for=wait_for)) return out, count, evt # }}} # {{{ radix_sort def to_bin(n): # Py 2.5 has no built-in bin() digs = [] while n: digs.append(str(n % 2)) n >>= 1 return ''.join(digs[::-1]) def _padded_bin(i, l): s = to_bin(i) while len(s) < l: s = '0' + s return s @memoize def _make_sort_scan_type(device, bits, index_dtype): name = "pyopencl_sort_scan_%s_%dbits_t" % ( index_dtype.type.__name__, bits) fields = [] for mnr in range(2**bits): fields.append(('c%s' % _padded_bin(mnr, bits), index_dtype)) dtype = np.dtype(fields) from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct dtype, c_decl = match_dtype_to_c_struct(device, name, dtype) dtype = get_or_register_dtype(name, dtype) return name, dtype, c_decl # {{{ types, helpers preamble RADIX_SORT_PREAMBLE_TPL = Template(r"""//CL// typedef ${scan_ctype} scan_t; typedef ${key_ctype} key_t; typedef ${index_ctype} index_t; // #define DEBUG #ifdef DEBUG #define dbg_printf(ARGS) printf ARGS #else #define dbg_printf(ARGS) /* */ #endif index_t get_count(scan_t s, int mnr) { return ${get_count_branch("")}; } #define BIN_NR(key_arg) ((key_arg >> base_bit) & ${2**bits - 1}) """, strict_undefined=True) # }}} # {{{ scan helpers RADIX_SORT_SCAN_PREAMBLE_TPL = Template(r"""//CL// scan_t scan_t_neutral() { scan_t result; %for mnr in range(2**bits): result.c${padded_bin(mnr, bits)} = 0; %endfor return result; } // considers bits (base_bit+bits-1, ..., base_bit) scan_t scan_t_from_value( key_t key, int base_bit, int i ) { // extract relevant bit range key_t bin_nr = BIN_NR(key); dbg_printf(("i: %d key:%d bin_nr:%d\n", i, key, bin_nr)); scan_t result; %for mnr in range(2**bits): result.c${padded_bin(mnr, bits)} = (bin_nr == ${mnr}); %endfor return result; } scan_t scan_t_add(scan_t a, scan_t b, bool across_seg_boundary) { %for mnr in range(2**bits): <% field = "c"+padded_bin(mnr, bits) %> b.${field} = a.${field} + b.${field}; %endfor return b; } """, strict_undefined=True) RADIX_SORT_OUTPUT_STMT_TPL = Template(r"""//CL// { key_t key = ${key_expr}; key_t my_bin_nr = BIN_NR(key); index_t previous_bins_size = 0; %for mnr in range(2**bits): previous_bins_size += (my_bin_nr > ${mnr}) ? last_item.c${padded_bin(mnr, bits)} : 0; %endfor index_t tgt_idx = previous_bins_size + get_count(item, my_bin_nr) - 1; %for arg_name in sort_arg_names: sorted_${arg_name}[tgt_idx] = ${arg_name}[i]; %endfor } """, strict_undefined=True) # }}} # {{{ driver class RadixSort(object): """Provides a general `radix sort `_ on the compute device. .. versionadded:: 2013.1 """ def __init__(self, context, arguments, key_expr, sort_arg_names, bits_at_a_time=2, index_dtype=np.int32, key_dtype=np.uint32, options=[]): """ :arg arguments: A string of comma-separated C argument declarations. If *arguments* is specified, then *input_expr* must also be specified. All types used here must be known to PyOpenCL. 
(see :func:`pyopencl.tools.get_or_register_dtype`). :arg key_expr: An integer-valued C expression returning the key based on which the sort is performed. The array index for which the key is to be computed is available as `i`. The expression may refer to any of the *arguments*. :arg sort_arg_names: A list of argument names whose corresponding array arguments will be sorted according to *key_expr*. """ # {{{ arg processing from pyopencl.tools import parse_arg_list self.arguments = parse_arg_list(arguments) del arguments self.sort_arg_names = sort_arg_names self.bits = int(bits_at_a_time) self.index_dtype = np.dtype(index_dtype) self.key_dtype = np.dtype(key_dtype) self.options = options # }}} # {{{ kernel creation scan_ctype, scan_dtype, scan_t_cdecl = \ _make_sort_scan_type(context.devices[0], self.bits, self.index_dtype) from pyopencl.tools import VectorArg, ScalarArg scan_arguments = ( list(self.arguments) + [VectorArg(arg.dtype, "sorted_"+arg.name) for arg in self.arguments if arg.name in sort_arg_names] + [ScalarArg(np.int32, "base_bit")]) def get_count_branch(known_bits): if len(known_bits) == self.bits: return "s.c%s" % known_bits boundary_mnr = known_bits + "1" + (self.bits-len(known_bits)-1)*"0" return ("((mnr < %s) ? %s : %s)" % ( int(boundary_mnr, 2), get_count_branch(known_bits+"0"), get_count_branch(known_bits+"1"))) codegen_args = dict( bits=self.bits, key_ctype=dtype_to_ctype(self.key_dtype), key_expr=key_expr, index_ctype=dtype_to_ctype(self.index_dtype), index_type_max=np.iinfo(self.index_dtype).max, padded_bin=_padded_bin, scan_ctype=scan_ctype, sort_arg_names=sort_arg_names, get_count_branch=get_count_branch, ) preamble = scan_t_cdecl+RADIX_SORT_PREAMBLE_TPL.render(**codegen_args) scan_preamble = preamble \ + RADIX_SORT_SCAN_PREAMBLE_TPL.render(**codegen_args) from pyopencl.scan import GenericScanKernel self.scan_kernel = GenericScanKernel( context, scan_dtype, arguments=scan_arguments, input_expr="scan_t_from_value(%s, base_bit, i)" % key_expr, scan_expr="scan_t_add(a, b, across_seg_boundary)", neutral="scan_t_neutral()", output_statement=RADIX_SORT_OUTPUT_STMT_TPL.render(**codegen_args), preamble=scan_preamble, options=self.options) for i, arg in enumerate(self.arguments): if isinstance(arg, VectorArg): self.first_array_arg_idx = i # }}} def __call__(self, *args, **kwargs): """Run the radix sort. In addition to *args* which must match the *arguments* specification on the constructor, the following keyword arguments are supported: :arg key_bits: specify how many bits (starting from least-significant) there are in the key. :arg allocator: See the *allocator* argument of :func:`pyopencl.array.empty`. :arg queue: A :class:`pyopencl.CommandQueue`, defaulting to the one from the first argument array. :arg wait_for: |explain-waitfor| :returns: A tuple ``(sorted, event)``. *sorted* consists of sorted copies of the arrays named in *sorted_args*, in the order of that list. *event* is a :class:`pyopencl.Event` for dependency management. 
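        A minimal usage sketch, assuming ``ctx`` is a :class:`pyopencl.Context`
        and ``keys`` is a :class:`pyopencl.array.Array` of `numpy.uint32`
        (these names are illustrative, not part of the interface)::

            from pyopencl.tools import VectorArg
            from pyopencl.algorithm import RadixSort

            sorter = RadixSort(ctx, [VectorArg(np.uint32, "keys")],
                    key_expr="keys[i]", sort_arg_names=["keys"])

            (sorted_keys,), evt = sorter(keys)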
""" wait_for = kwargs.pop("wait_for", None) # {{{ run control key_bits = kwargs.pop("key_bits", None) if key_bits is None: key_bits = int(np.iinfo(self.key_dtype).bits) n = len(args[self.first_array_arg_idx]) allocator = kwargs.pop("allocator", None) if allocator is None: allocator = args[self.first_array_arg_idx].allocator queue = kwargs.pop("allocator", None) if queue is None: queue = args[self.first_array_arg_idx].queue args = list(args) base_bit = 0 while base_bit < key_bits: sorted_args = [ cl.array.empty(queue, n, arg_descr.dtype, allocator=allocator) for arg_descr in self.arguments if arg_descr.name in self.sort_arg_names] scan_args = args + sorted_args + [base_bit] last_evt = self.scan_kernel(*scan_args, **dict(queue=queue, wait_for=wait_for)) wait_for = [last_evt] # substitute sorted for i, arg_descr in enumerate(self.arguments): if arg_descr.name in self.sort_arg_names: args[i] = sorted_args[self.sort_arg_names.index(arg_descr.name)] base_bit += self.bits return [arg_val for arg_descr, arg_val in zip(self.arguments, args) if arg_descr.name in self.sort_arg_names], last_evt # }}} # }}} # }}} # {{{ generic parallel list builder # {{{ kernel template _LIST_BUILDER_TEMPLATE = Template("""//CL// % if double_support: #pragma OPENCL EXTENSION cl_khr_fp64: enable #define PYOPENCL_DEFINE_CDOUBLE % endif #include ${preamble} // {{{ declare helper macros for user interface typedef ${index_type} index_type; %if is_count_stage: #define PLB_COUNT_STAGE %for name, dtype in list_names_and_dtypes: %if name in count_sharing: #define APPEND_${name}(value) { /* nothing */ } %else: #define APPEND_${name}(value) { ++(*plb_loc_${name}_count); } %endif %endfor %else: #define PLB_WRITE_STAGE %for name, dtype in list_names_and_dtypes: %if name in count_sharing: #define APPEND_${name}(value) \ { plb_${name}_list[(*plb_${count_sharing[name]}_index) - 1] \ = value; } %else: #define APPEND_${name}(value) \ { plb_${name}_list[(*plb_${name}_index)++] = value; } %endif %endfor %endif #define LIST_ARG_DECL ${user_list_arg_decl} #define LIST_ARGS ${user_list_args} #define USER_ARG_DECL ${user_arg_decl} #define USER_ARGS ${user_args} // }}} ${generate_template} // {{{ kernel entry point __kernel %if do_not_vectorize: __attribute__((reqd_work_group_size(1, 1, 1))) %endif void ${kernel_name}(${kernel_list_arg_decl} USER_ARG_DECL index_type n) { %if not do_not_vectorize: int lid = get_local_id(0); index_type gsize = get_global_size(0); index_type work_group_start = get_local_size(0)*get_group_id(0); for (index_type i = work_group_start + lid; i < n; i += gsize) %else: const int chunk_size = 128; index_type chunk_base = get_global_id(0)*chunk_size; index_type gsize = get_global_size(0); for (; chunk_base < n; chunk_base += gsize*chunk_size) for (index_type i = chunk_base; i < min(n, chunk_base+chunk_size); ++i) %endif { %if is_count_stage: %for name, dtype in list_names_and_dtypes: %if name not in count_sharing: index_type plb_loc_${name}_count = 0; %endif %endfor %else: %for name, dtype in list_names_and_dtypes: %if name not in count_sharing: index_type plb_${name}_index = plb_${name}_start_index[i]; %endif %endfor %endif generate(${kernel_list_arg_values} USER_ARGS i); %if is_count_stage: %for name, dtype in list_names_and_dtypes: %if name not in count_sharing: plb_${name}_count[i] = plb_loc_${name}_count; %endif %endfor %endif } } // }}} """, strict_undefined=True) # }}} def _get_arg_decl(arg_list): result = "" for arg in arg_list: result += arg.declarator() + ", " return result def _get_arg_list(arg_list, 
prefix=""): result = "" for arg in arg_list: result += prefix + arg.name + ", " return result class BuiltList(Record): pass class ListOfListsBuilder: """Generates and executes code to produce a large number of variable-size lists, simply. .. note:: This functionality is provided as a preview. Its interface is subject to change until this notice is removed. .. versionadded:: 2013.1 Here's a usage example:: from pyopencl.algorithm import ListOfListsBuilder builder = ListOfListsBuilder(context, [("mylist", np.int32)], \"\"\" void generate(LIST_ARG_DECL USER_ARG_DECL index_type i) { int count = i % 4; for (int j = 0; j < count; ++j) { APPEND_mylist(count); } } \"\"\", arg_decls=[]) result, event = builder(queue, 2000) inf = result["mylist"] assert inf.count == 3000 assert (inf.list.get()[-6:] == [1, 2, 2, 3, 3, 3]).all() The function `generate` above is called once for each "input object". Each input object can then generate zero or more list entries. The number of these input objects is given to :meth:`__call__` as *n_objects*. List entries are generated by calls to `APPEND_(value)`. Multiple lists may be generated at once. """ def __init__(self, context, list_names_and_dtypes, generate_template, arg_decls, count_sharing=None, devices=None, name_prefix="plb_build_list", options=[], preamble="", debug=False, complex_kernel=False): """ :arg context: A :class:`pyopencl.Context`. :arg list_names_and_dtypes: a list of `(name, dtype)` tuples indicating the lists to be built. :arg generate_template: a snippet of C as described below :arg arg_decls: A string of comma-separated C argument declarations. :arg count_sharing: A mapping consisting of `(child, mother)` indicating that `mother` and `child` will always have the same number of indices, and the `APPEND` to `mother` will always happen *before* the `APPEND` to the child. :arg name_prefix: the name prefix to use for the compiled kernels :arg options: OpenCL compilation options for kernels using *generate_template*. :arg complex_kernel: If `True`, prevents vectorization on CPUs. *generate_template* may use the following C macros/identifiers: * `index_type`: expands to C identifier for the index type used for the calculation * `USER_ARG_DECL`: expands to the C declarator for `arg_decls` * `USER_ARGS`: a list of C argument values corresponding to `user_arg_decl` * `LIST_ARG_DECL`: expands to a C argument list representing the data for the output lists. These are escaped prefixed with `"plg_"` so as to not interfere with user-provided names. * `LIST_ARGS`: a list of C argument values corresponding to `LIST_ARG_DECL` * `APPEND_name(entry)`: inserts `entry` into the list `name`. *entry* must be a valid C expression of the correct type. All argument-list related macros have a trailing comma included if they are non-empty. *generate_template* must supply a function: .. code-block:: c void generate(USER_ARG_DECL LIST_ARG_DECL index_type i) { APPEND_mylist(5); } Internally, the `kernel_template` is expanded (at least) twice. Once, for a 'counting' stage where the size of all the lists is determined, and a second time, for a 'generation' stage where the lists are actually filled. A `generate` function that has side effects beyond calling `append` is therefore ill-formed. 
""" if devices is None: devices = context.devices if count_sharing is None: count_sharing = {} self.context = context self.devices = devices self.list_names_and_dtypes = list_names_and_dtypes self.generate_template = generate_template from pyopencl.tools import parse_arg_list self.arg_decls = parse_arg_list(arg_decls) self.count_sharing = count_sharing self.name_prefix = name_prefix self.preamble = preamble self.options = options self.debug = debug self.complex_kernel = complex_kernel # {{{ kernel generators @memoize_method def get_scan_kernel(self, index_dtype): from pyopencl.scan import GenericScanKernel return GenericScanKernel( self.context, index_dtype, arguments="__global %s *ary" % dtype_to_ctype(index_dtype), input_expr="ary[i]", scan_expr="a+b", neutral="0", output_statement="ary[i+1] = item;", devices=self.devices) def do_not_vectorize(self): from pytools import any return (self.complex_kernel and any(dev.type == cl.device_type.CPU for dev in self.context.devices)) @memoize_method def get_count_kernel(self, index_dtype): index_ctype = dtype_to_ctype(index_dtype) from pyopencl.tools import VectorArg, OtherArg kernel_list_args = [ VectorArg(index_dtype, "plb_%s_count" % name) for name, dtype in self.list_names_and_dtypes if name not in self.count_sharing] user_list_args = [] for name, dtype in self.list_names_and_dtypes: if name in self.count_sharing: continue name = "plb_loc_%s_count" % name user_list_args.append(OtherArg("%s *%s" % ( index_ctype, name), name)) kernel_name = self.name_prefix+"_count" from pyopencl.characterize import has_double_support src = _LIST_BUILDER_TEMPLATE.render( is_count_stage=True, kernel_name=kernel_name, double_support=all(has_double_support(dev) for dev in self.context.devices), debug=self.debug, do_not_vectorize=self.do_not_vectorize(), kernel_list_arg_decl=_get_arg_decl(kernel_list_args), kernel_list_arg_values=_get_arg_list(user_list_args, prefix="&"), user_list_arg_decl=_get_arg_decl(user_list_args), user_list_args=_get_arg_list(user_list_args), user_arg_decl=_get_arg_decl(self.arg_decls), user_args=_get_arg_list(self.arg_decls), list_names_and_dtypes=self.list_names_and_dtypes, count_sharing=self.count_sharing, name_prefix=self.name_prefix, generate_template=self.generate_template, preamble=self.preamble, index_type=index_ctype, ) src = str(src) prg = cl.Program(self.context, src).build(self.options) knl = getattr(prg, kernel_name) from pyopencl.tools import get_arg_list_scalar_arg_dtypes knl.set_scalar_arg_dtypes(get_arg_list_scalar_arg_dtypes( kernel_list_args+self.arg_decls) + [index_dtype]) return knl @memoize_method def get_write_kernel(self, index_dtype): index_ctype = dtype_to_ctype(index_dtype) from pyopencl.tools import VectorArg, OtherArg kernel_list_args = [] kernel_list_arg_values = "" user_list_args = [] for name, dtype in self.list_names_and_dtypes: list_name = "plb_%s_list" % name list_arg = VectorArg(dtype, list_name) kernel_list_args.append(list_arg) user_list_args.append(list_arg) if name in self.count_sharing: kernel_list_arg_values += "%s, " % list_name continue kernel_list_args.append( VectorArg(index_dtype, "plb_%s_start_index" % name)) index_name = "plb_%s_index" % name user_list_args.append(OtherArg("%s *%s" % ( index_ctype, index_name), index_name)) kernel_list_arg_values += "%s, &%s, " % (list_name, index_name) kernel_name = self.name_prefix+"_write" from pyopencl.characterize import has_double_support src = _LIST_BUILDER_TEMPLATE.render( is_count_stage=False, kernel_name=kernel_name, 
double_support=all(has_double_support(dev) for dev in self.context.devices), debug=self.debug, do_not_vectorize=self.do_not_vectorize(), kernel_list_arg_decl=_get_arg_decl(kernel_list_args), kernel_list_arg_values=kernel_list_arg_values, user_list_arg_decl=_get_arg_decl(user_list_args), user_list_args=_get_arg_list(user_list_args), user_arg_decl=_get_arg_decl(self.arg_decls), user_args=_get_arg_list(self.arg_decls), list_names_and_dtypes=self.list_names_and_dtypes, count_sharing=self.count_sharing, name_prefix=self.name_prefix, generate_template=self.generate_template, preamble=self.preamble, index_type=index_ctype, ) src = str(src) prg = cl.Program(self.context, src).build(self.options) knl = getattr(prg, kernel_name) from pyopencl.tools import get_arg_list_scalar_arg_dtypes knl.set_scalar_arg_dtypes(get_arg_list_scalar_arg_dtypes( kernel_list_args+self.arg_decls) + [index_dtype]) return knl # }}} # {{{ driver def __call__(self, queue, n_objects, *args, **kwargs): """ :arg args: arguments corresponding to arg_decls in the constructor. :class:`pyopencl.array.Array` are not allowed directly and should be passed as their :attr:`pyopencl.array.Array.data` attribute instead. :arg allocator: optionally, the allocator to use to allocate new arrays. :arg wait_for: |explain-waitfor| :returns: a tuple ``(lists, event)``, where *lists* a mapping from (built) list names to objects which have attributes * ``count`` for the total number of entries in all lists combined * ``lists`` for the array containing all lists. * ``starts`` for the array of starting indices in `lists`. `starts` is built so that it has n+1 entries, so that the *i*'th entry is the start of the *i*'th list, and the *i*'th entry is the index one past the *i*'th list's end, even for the last list. This implies that all lists are contiguous. *event* is a :class:`pyopencl.Event` for dependency management. """ if n_objects >= int(np.iinfo(np.int32).max): index_dtype = np.int64 else: index_dtype = np.int32 index_dtype = np.dtype(index_dtype) allocator = kwargs.pop("allocator", None) wait_for = kwargs.pop("wait_for", None) if kwargs: raise TypeError("invalid keyword arguments: '%s'" % ", ".join(kwargs)) result = {} count_list_args = [] count_kernel = self.get_count_kernel(index_dtype) write_kernel = self.get_write_kernel(index_dtype) scan_kernel = self.get_scan_kernel(index_dtype) # {{{ allocate memory for counts for name, dtype in self.list_names_and_dtypes: if name in self.count_sharing: continue counts = cl.array.empty(queue, (n_objects + 1), index_dtype, allocator=allocator) # The scan will turn the "counts" array into the "starts" array # in-place. 
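            # (The count kernel fills entries 0..n_objects-1 of this buffer
            # with per-object counts; the scan kernel below writes each
            # running sum to entry i+1 and the first entry is zeroed
            # afterwards, so the same buffer ends up holding the
            # n_objects+1 start indices.)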
result[name] = BuiltList(starts=counts) count_list_args.append(counts.data) # }}} if self.debug: gsize = (1,) lsize = (1,) elif self.complex_kernel and queue.device.type == cl.device_type.CPU: gsize = (4*queue.device.max_compute_units,) lsize = (1,) else: from pyopencl.array import splay gsize, lsize = splay(queue, n_objects) count_event = count_kernel(queue, gsize, lsize, *(tuple(count_list_args) + args + (n_objects,)), **dict(wait_for=wait_for)) # {{{ run scans scan_events = [] for name, dtype in self.list_names_and_dtypes: if name in self.count_sharing: continue info_record = result[name] starts_ary = info_record.starts evt = scan_kernel(starts_ary, wait_for=[count_event]) # set first entry to zero evt = cl.enqueue_copy(queue, starts_ary.data, index_dtype.type(0), wait_for=[evt]) scan_events.append(evt) # retrieve count count = np.array(1, index_dtype) cl.enqueue_copy(queue, count, starts_ary.data, device_offset=index_dtype.itemsize*n_objects) info_record.count = int(count) # }}} # {{{ deal with count-sharing lists, allocate memory for lists write_list_args = [] for name, dtype in self.list_names_and_dtypes: if name in self.count_sharing: sharing_from = self.count_sharing[name] info_record = result[name] = BuiltList( count=result[sharing_from].count, starts=result[sharing_from].starts, ) else: info_record = result[name] info_record.lists = cl.array.empty(queue, info_record.count, dtype, allocator=allocator) write_list_args.append(info_record.lists.data) if name not in self.count_sharing: write_list_args.append(info_record.starts.data) # }}} evt = write_kernel(queue, gsize, lsize, *(tuple(write_list_args) + args + (n_objects,)), **dict(wait_for=scan_events)) return result, evt # }}} # }}} # {{{ key-value sorting class _KernelInfo(Record): pass def _make_cl_int_literal(value, dtype): iinfo = np.iinfo(dtype) result = str(int(value)) if dtype.itemsize == 8: result += "l" if int(iinfo.min) < 0: result += "u" return result class KeyValueSorter(object): """Given arrays *values* and *keys* of equal length and a number *nkeys* of keys, returns a tuple `(starts, lists)`, as follows: *values* and *keys* are sorted by *keys*, and the sorted *values* is returned as *lists*. Then for each index *i* in `range(nkeys)`, *starts[i]* is written to indicating where the group of *values* belonging to the key with index *i* begins. It implicitly ends at *starts[i+1]*. `starts` is built so that it has `nkeys+1` entries, so that the *i*'th entry is the start of the *i*'th list, and the *i*'th entry is the index one past the *i*'th list's end, even for the last list. This implies that all lists are contiguous. .. note:: This functionality is provided as a preview. Its interface is subject to change until this notice is removed. .. 
versionadded:: 2013.1 """ def __init__(self, context): self.context = context @memoize_method def get_kernels(self, key_dtype, value_dtype, starts_dtype): from pyopencl.algorithm import RadixSort from pyopencl.tools import VectorArg, ScalarArg by_target_sorter = RadixSort( self.context, [ VectorArg(value_dtype, "values"), VectorArg(key_dtype, "keys"), ], key_expr="keys[i]", sort_arg_names=["values", "keys"]) from pyopencl.elementwise import ElementwiseTemplate start_finder = ElementwiseTemplate( arguments="""//CL// starts_t *key_group_starts, key_t *keys_sorted_by_key, """, operation=r"""//CL// key_t my_key = keys_sorted_by_key[i]; if (i == 0 || my_key != keys_sorted_by_key[i-1]) key_group_starts[my_key] = i; """, name="find_starts").build(self.context, type_aliases=( ("key_t", starts_dtype), ("starts_t", starts_dtype), ), var_values=()) from pyopencl.scan import GenericScanKernel bound_propagation_scan = GenericScanKernel( self.context, starts_dtype, arguments=[ VectorArg(starts_dtype, "starts"), # starts has length n+1 ScalarArg(key_dtype, "nkeys"), ], input_expr="starts[nkeys-i]", scan_expr="min(a, b)", neutral=_make_cl_int_literal( np.iinfo(starts_dtype).max, starts_dtype), output_statement="starts[nkeys-i] = item;") return _KernelInfo( by_target_sorter=by_target_sorter, start_finder=start_finder, bound_propagation_scan=bound_propagation_scan) def __call__(self, queue, keys, values, nkeys, starts_dtype, allocator=None, wait_for=None): if allocator is None: allocator = values.allocator knl_info = self.get_kernels(keys.dtype, values.dtype, starts_dtype) (values_sorted_by_key, keys_sorted_by_key), evt = knl_info.by_target_sorter( values, keys, queue=queue, wait_for=wait_for) starts = (cl.array.empty(queue, (nkeys+1), starts_dtype, allocator=allocator) .fill(len(values_sorted_by_key), wait_for=[evt])) evt, = starts.events evt = knl_info.start_finder(starts, keys_sorted_by_key, range=slice(len(keys_sorted_by_key)), wait_for=[evt]) evt = knl_info.bound_propagation_scan(starts, nkeys, queue=queue, wait_for=[evt]) return starts, values_sorted_by_key, evt # }}} # vim: filetype=pyopencl:fdm=marker pyopencl-2013.2/pyopencl/elementwise.py0000644000175000000500000007241312245716340016701 0ustar tomussrc"""Elementwise functionality.""" from __future__ import division __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" from pyopencl.tools import context_dependent_memoize import numpy as np import pyopencl as cl from pytools import memoize_method from pyopencl.tools import (dtype_to_ctype, VectorArg, ScalarArg, KernelTemplateBase) # {{{ elementwise kernel code generator def get_elwise_program(context, arguments, operation, name="elwise_kernel", options=[], preamble="", loop_prep="", after_loop="", use_range=False): if use_range: body = r"""//CL// if (step < 0) { for (i = start + (work_group_start + lid)*step; i > stop; i += gsize*step) { %(operation)s; } } else { for (i = start + (work_group_start + lid)*step; i < stop; i += gsize*step) { %(operation)s; } } """ else: body = """//CL// for (i = work_group_start + lid; i < n; i += gsize) { %(operation)s; } """ import re return_match = re.search(r"\breturn\b", operation) if return_match is not None: from warnings import warn warn("Using a 'return' statement in an element-wise operation will " "likely lead to incorrect results. Use " "PYOPENCL_ELWISE_CONTINUE instead.", stacklevel=3) source = ("""//CL// %(preamble)s #define PYOPENCL_ELWISE_CONTINUE continue __kernel void %(name)s(%(arguments)s) { int lid = get_local_id(0); int gsize = get_global_size(0); int work_group_start = get_local_size(0)*get_group_id(0); long i; %(loop_prep)s; %(body)s %(after_loop)s; } """ % { "arguments": ", ".join(arg.declarator() for arg in arguments), "name": name, "preamble": preamble, "loop_prep": loop_prep, "after_loop": after_loop, "body": body % dict(operation=operation), }) from pyopencl import Program return Program(context, source).build(options) def get_elwise_kernel_and_types(context, arguments, operation, name="elwise_kernel", options=[], preamble="", use_range=False, **kwargs): from pyopencl.tools import parse_arg_list, get_arg_offset_adjuster_code parsed_args = parse_arg_list(arguments, with_offset=True) auto_preamble = kwargs.pop("auto_preamble", True) pragmas = [] includes = [] have_double_pragma = False have_complex_include = False if auto_preamble: for arg in parsed_args: if arg.dtype in [np.float64, np.complex128]: if not have_double_pragma: pragmas.append( "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n" "#define PYOPENCL_DEFINE_CDOUBLE\n") have_double_pragma = True if arg.dtype.kind == 'c': if not have_complex_include: includes.append("#include \n") have_complex_include = True if pragmas or includes: preamble = "\n".join(pragmas+includes) + "\n" + preamble if use_range: parsed_args.extend([ ScalarArg(np.intp, "start"), ScalarArg(np.intp, "stop"), ScalarArg(np.intp, "step"), ]) else: parsed_args.append(ScalarArg(np.intp, "n")) loop_prep = kwargs.pop("loop_prep", "") loop_prep = get_arg_offset_adjuster_code(parsed_args) + loop_prep prg = get_elwise_program( context, parsed_args, operation, name=name, options=options, preamble=preamble, use_range=use_range, loop_prep=loop_prep, **kwargs) from pyopencl.tools import get_arg_list_scalar_arg_dtypes kernel = getattr(prg, name) kernel.set_scalar_arg_dtypes(get_arg_list_scalar_arg_dtypes(parsed_args)) return kernel, parsed_args def get_elwise_kernel(context, arguments, operation, name="elwise_kernel", options=[], **kwargs): """Return a L{pyopencl.Kernel} that performs the same scalar operation on one or several vectors. 
""" func, arguments = get_elwise_kernel_and_types( context, arguments, operation, name=name, options=options, **kwargs) return func # }}} # {{{ ElementwiseKernel driver class ElementwiseKernel: """ A kernel that takes a number of scalar or vector *arguments* and performs an *operation* specified as a snippet of C on these arguments. :arg arguments: a string formatted as a C argument list. :arg operation: a snippet of C that carries out the desired 'map' operation. The current index is available as the variable *i*. *operation* may contain the statement ``PYOPENCL_ELWISE_CONTINUE``, which will terminate processing for the current element. :arg name: the function name as which the kernel is compiled :arg options: passed unmodified to :meth:`pyopencl.Program.build`. :arg preamble: a piece of C source code that gets inserted outside of the function context in the elementwise operation's kernel source code. .. warning :: Using a `return` statement in *operation* will lead to incorrect results, as some elements may never get processed. Use ``PYOPENCL_ELWISE_CONTINUE`` instead. .. versionchanged:: 2013.1 Added ``PYOPENCL_ELWISE_CONTINUE``. """ def __init__(self, context, arguments, operation, name="elwise_kernel", options=[], **kwargs): self.context = context self.arguments = arguments self.operation = operation self.name = name self.options = options self.kwargs = kwargs @memoize_method def get_kernel(self, use_range): knl, arg_descrs = get_elwise_kernel_and_types( self.context, self.arguments, self.operation, name=self.name, options=self.options, use_range=use_range, **self.kwargs) for arg in arg_descrs: if isinstance(arg, VectorArg) and not arg.with_offset: from warnings import warn warn("ElementwiseKernel '%s' used with VectorArgs that do not " "have offset support enabled. This usage is deprecated. " "Just pass with_offset=True to VectorArg, everything should " "sort itself out automatically." 
% self.name, DeprecationWarning) if not [i for i, arg in enumerate(arg_descrs) if isinstance(arg, VectorArg)]: raise RuntimeError( "ElementwiseKernel can only be used with " "functions that have at least one " "vector argument") return knl, arg_descrs def __call__(self, *args, **kwargs): repr_vec = None range_ = kwargs.pop("range", None) slice_ = kwargs.pop("slice", None) use_range = range_ is not None or slice_ is not None kernel, arg_descrs = self.get_kernel(use_range) # {{{ assemble arg array invocation_args = [] for arg, arg_descr in zip(args, arg_descrs): if isinstance(arg_descr, VectorArg): if not arg.flags.forc: raise RuntimeError("ElementwiseKernel cannot " "deal with non-contiguous arrays") if repr_vec is None: repr_vec = arg invocation_args.append(arg.base_data) if arg_descr.with_offset: invocation_args.append(arg.offset) else: invocation_args.append(arg) # }}} queue = kwargs.pop("queue", None) wait_for = kwargs.pop("wait_for", None) if kwargs: raise TypeError("unknown keyword arguments: '%s'" % ", ".join(kwargs)) if queue is None: queue = repr_vec.queue if slice_ is not None: if range_ is not None: raise TypeError("may not specify both range and slice " "keyword arguments") range_ = slice(*slice_.indices(repr_vec.size)) max_wg_size = kernel.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device) if range_ is not None: start = range_.start if start is None: start = 0 invocation_args.append(start) invocation_args.append(range_.stop) if range_.step is None: step = 1 else: step = range_.step invocation_args.append(step) from pyopencl.array import splay gs, ls = splay(queue, abs(range_.stop - start)//step, max_wg_size) else: invocation_args.append(repr_vec.size) gs, ls = repr_vec.get_sizes(queue, max_wg_size) kernel.set_args(*invocation_args) return cl.enqueue_nd_range_kernel(queue, kernel, gs, ls, wait_for=wait_for) # }}} # {{{ template class ElementwiseTemplate(KernelTemplateBase): def __init__(self, arguments, operation, name="elwise", preamble="", template_processor=None): KernelTemplateBase.__init__(self, template_processor=template_processor) self.arguments = arguments self.operation = operation self.name = name self.preamble = preamble def build_inner(self, context, type_aliases=(), var_values=(), more_preamble="", more_arguments=(), declare_types=(), options=()): renderer = self.get_renderer( type_aliases, var_values, context, options) arg_list = renderer.render_argument_list( self.arguments, more_arguments, with_offset=True) type_decl_preamble = renderer.get_type_decl_preamble( context.devices[0], declare_types, arg_list) return ElementwiseKernel(context, arg_list, renderer(self.operation), name=renderer(self.name), options=list(options), preamble=( type_decl_preamble + "\n" + renderer(self.preamble + "\n" + more_preamble)), auto_preamble=False) # }}} # {{{ kernels supporting array functionality @context_dependent_memoize def get_take_kernel(context, dtype, idx_dtype, vec_count=1): ctx = { "idx_tp": dtype_to_ctype(idx_dtype), "tp": dtype_to_ctype(dtype), } args = ([VectorArg(dtype, "dest" + str(i), with_offset=True) for i in range(vec_count)] + [VectorArg(dtype, "src" + str(i), with_offset=True) for i in range(vec_count)] + [VectorArg(idx_dtype, "idx", with_offset=True)]) body = ( ("%(idx_tp)s src_idx = idx[i];\n" % ctx) + "\n".join( "dest%d[i] = src%d[src_idx];" % (i, i) for i in range(vec_count))) return get_elwise_kernel(context, args, body, name="take") @context_dependent_memoize def get_take_put_kernel(context, dtype, idx_dtype, with_offsets, 
vec_count=1): ctx = { "idx_tp": dtype_to_ctype(idx_dtype), "tp": dtype_to_ctype(dtype), } args = [ VectorArg(dtype, "dest%d" % i) for i in range(vec_count) ] + [ VectorArg(idx_dtype, "gmem_dest_idx", with_offset=True), VectorArg(idx_dtype, "gmem_src_idx", with_offset=True), ] + [ VectorArg(dtype, "src%d" % i, with_offset=True) for i in range(vec_count) ] + [ ScalarArg(idx_dtype, "offset%d" % i) for i in range(vec_count) if with_offsets ] if with_offsets: def get_copy_insn(i): return ("dest%d[dest_idx] = " "src%d[src_idx+offset%d];" % (i, i, i)) else: def get_copy_insn(i): return ("dest%d[dest_idx] = " "src%d[src_idx];" % (i, i)) body = (("%(idx_tp)s src_idx = gmem_src_idx[i];\n" "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx) + "\n".join(get_copy_insn(i) for i in range(vec_count))) return get_elwise_kernel(context, args, body, name="take_put") @context_dependent_memoize def get_put_kernel(context, dtype, idx_dtype, vec_count=1): ctx = { "idx_tp": dtype_to_ctype(idx_dtype), "tp": dtype_to_ctype(dtype), } args = [ VectorArg(dtype, "dest%d" % i, with_offset=True) for i in range(vec_count) ] + [ VectorArg(idx_dtype, "gmem_dest_idx", with_offset=True), ] + [ VectorArg(dtype, "src%d" % i, with_offset=True) for i in range(vec_count) ] body = ( "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx + "\n".join("dest%d[dest_idx] = src%d[i];" % (i, i) for i in range(vec_count))) return get_elwise_kernel(context, args, body, name="put") @context_dependent_memoize def get_copy_kernel(context, dtype_dest, dtype_src): src = "src[i]" if dtype_dest.kind == "c" != dtype_src.kind: src = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_dest), src) if dtype_dest.kind == "c" and dtype_src != dtype_dest: src = "%s_cast(%s)" % (complex_dtype_to_name(dtype_dest), src), return get_elwise_kernel(context, "%(tp_dest)s *dest, %(tp_src)s *src" % { "tp_dest": dtype_to_ctype(dtype_dest), "tp_src": dtype_to_ctype(dtype_src), }, "dest[i] = %s" % src, name="copy") @context_dependent_memoize def get_linear_combination_kernel(summand_descriptors, dtype_z): # TODO: Port this! 
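    # (The body below is unported PyCUDA-era code; it references CUDA
    # textures and module APIs that are not available in PyOpenCL, so the
    # function unconditionally raises before reaching it.)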
raise NotImplementedError from pyopencl.tools import dtype_to_ctype from pyopencl.elementwise import \ VectorArg, ScalarArg, get_elwise_module args = [] preamble = [] loop_prep = [] summands = [] tex_names = [] for i, (is_gpu_scalar, scalar_dtype, vector_dtype) in \ enumerate(summand_descriptors): if is_gpu_scalar: preamble.append( "texture <%s, 1, cudaReadModeElementType> tex_a%d;" % (dtype_to_ctype(scalar_dtype, with_fp_tex_hack=True), i)) args.append(VectorArg(vector_dtype, "x%d" % i, with_offset=True)) tex_names.append("tex_a%d" % i) loop_prep.append( "%s a%d = fp_tex1Dfetch(tex_a%d, 0)" % (dtype_to_ctype(scalar_dtype), i, i)) else: args.append(ScalarArg(scalar_dtype, "a%d" % i)) args.append(VectorArg(vector_dtype, "x%d" % i, with_offset=True)) summands.append("a%d*x%d[i]" % (i, i)) args.append(VectorArg(dtype_z, "z", with_offset=True)) args.append(ScalarArg(np.uintp, "n")) mod = get_elwise_module(args, "z[i] = " + " + ".join(summands), "linear_combination", preamble="\n".join(preamble), loop_prep=";\n".join(loop_prep)) func = mod.get_function("linear_combination") tex_src = [mod.get_texref(tn) for tn in tex_names] func.prepare("".join(arg.struct_char for arg in args), (1, 1, 1), texrefs=tex_src) return func, tex_src def complex_dtype_to_name(dtype): if dtype == np.complex128: return "cdouble" elif dtype == np.complex64: return "cfloat" else: raise RuntimeError("invalid complex type") def real_dtype(dtype): return dtype.type(0).real.dtype @context_dependent_memoize def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z): ax = "a*x[i]" by = "b*y[i]" x_is_complex = dtype_x.kind == "c" y_is_complex = dtype_y.kind == "c" z_is_complex = dtype_z.kind == "c" if x_is_complex: ax = "%s_mul(a, x[i])" % complex_dtype_to_name(dtype_x) if y_is_complex: by = "%s_mul(b, y[i])" % complex_dtype_to_name(dtype_y) if x_is_complex and not y_is_complex: by = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_x), by) if not x_is_complex and y_is_complex: ax = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_y), ax) result = "%s + %s" % (ax, by) if z_is_complex: result = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), result) return get_elwise_kernel(context, "%(tp_z)s *z, %(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y" % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s" % result, name="axpbyz") @context_dependent_memoize def get_axpbz_kernel(context, dtype_a, dtype_x, dtype_b, dtype_z): a_is_complex = dtype_a.kind == "c" x_is_complex = dtype_x.kind == "c" b_is_complex = dtype_b.kind == "c" z_is_complex = dtype_z.kind == "c" ax = "a*x[i]" if a_is_complex and x_is_complex: a = "a" x = "x[i]" if dtype_a != dtype_z: a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), a) if dtype_x != dtype_z: x = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), x) ax = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_z), a, x) # The following two are workarounds for Apple on OS X 10.8. # They're not really necessary. 
elif a_is_complex and not x_is_complex: ax = "a*((%s) x[i])" % dtype_to_ctype(real_dtype(dtype_a)) elif not a_is_complex and x_is_complex: ax = "((%s) a)*x[i]" % dtype_to_ctype(real_dtype(dtype_x)) b = "b" if z_is_complex and not b_is_complex: b = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_z), b) if z_is_complex and not (a_is_complex or x_is_complex): ax = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_z), ax) if z_is_complex: ax = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), ax) b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), b) return get_elwise_kernel(context, "%(tp_z)s *z, %(tp_a)s a, %(tp_x)s *x,%(tp_b)s b" % { "tp_a": dtype_to_ctype(dtype_a), "tp_x": dtype_to_ctype(dtype_x), "tp_b": dtype_to_ctype(dtype_b), "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s + %s" % (ax, b), name="axpb") @context_dependent_memoize def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z): x_is_complex = dtype_x.kind == "c" y_is_complex = dtype_y.kind == "c" z_is_complex = dtype_z.kind == "c" x = "x[i]" y = "y[i]" if x_is_complex and dtype_x != dtype_z: x = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), x) if y_is_complex and dtype_y != dtype_z: y = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), y) if x_is_complex and y_is_complex: xy = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) else: xy = "%s * %s" % (x, y) if z_is_complex: xy = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), xy) return get_elwise_kernel(context, "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s *y" % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s" % xy, name="multiply") @context_dependent_memoize def get_divide_kernel(context, dtype_x, dtype_y, dtype_z): x_is_complex = dtype_x.kind == "c" y_is_complex = dtype_y.kind == "c" z_is_complex = dtype_z.kind == "c" x = "x[i]" y = "y[i]" if z_is_complex and dtype_x != dtype_y: if x_is_complex and dtype_x != dtype_z: x = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), x) if y_is_complex and dtype_y != dtype_z: y = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), y) if x_is_complex and y_is_complex: xoy = "%s_divide(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) elif not x_is_complex and y_is_complex: xoy = "%s_rdivide(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) else: xoy = "%s / %s" % (x, y) if z_is_complex: xoy = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), xoy) return get_elwise_kernel(context, "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s *y" % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s" % xoy, name="divide") @context_dependent_memoize def get_rdivide_elwise_kernel(context, dtype_x, dtype_y, dtype_z): # implements y / x! 
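    # (Here x is a vector argument and y a scalar, so this computes
    # z[i] = y / x[i], the reflected division used when a scalar is
    # divided by an array.)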
x_is_complex = dtype_x.kind == "c" y_is_complex = dtype_y.kind == "c" z_is_complex = dtype_z.kind == "c" x = "x[i]" y = "y" if z_is_complex and dtype_x != dtype_y: if x_is_complex and dtype_x != dtype_z: x = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), x) if y_is_complex and dtype_y != dtype_z: y = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), y) if x_is_complex and y_is_complex: yox = "%s_divide(%s, %s)" % (complex_dtype_to_name(dtype_z), y, x) elif not y_is_complex and x_is_complex: yox = "%s_rdivide(%s, %s)" % (complex_dtype_to_name(dtype_x), y, x) else: yox = "%s / %s" % (y, x) return get_elwise_kernel(context, "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s y" % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s" % yox, name="divide_r") @context_dependent_memoize def get_fill_kernel(context, dtype): return get_elwise_kernel(context, "%(tp)s *z, %(tp)s a" % { "tp": dtype_to_ctype(dtype), }, "z[i] = a", name="fill") @context_dependent_memoize def get_reverse_kernel(context, dtype): return get_elwise_kernel(context, "%(tp)s *z, %(tp)s *y" % { "tp": dtype_to_ctype(dtype), }, "z[i] = y[n-1-i]", name="reverse") @context_dependent_memoize def get_arange_kernel(context, dtype): if dtype.kind == "c": i = "%s_fromreal(i)" % complex_dtype_to_name(dtype) else: i = "(%s) i" % dtype_to_ctype(dtype) return get_elwise_kernel(context, [ VectorArg(dtype, "z", with_offset=True), ScalarArg(dtype, "start"), ScalarArg(dtype, "step"), ], "z[i] = start + %s*step" % i, name="arange") @context_dependent_memoize def get_pow_kernel(context, dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array): if is_base_array: x = "x[i]" x_ctype = "%(tp_x)s *x" else: x = "x" x_ctype = "%(tp_x)s x" if is_exp_array: y = "y[i]" y_ctype = "%(tp_y)s *y" else: y = "y" y_ctype = "%(tp_y)s y" x_is_complex = dtype_x.kind == "c" y_is_complex = dtype_y.kind == "c" z_is_complex = dtype_z.kind == "c" if z_is_complex and dtype_x != dtype_y: if x_is_complex and dtype_x != dtype_z: x = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), x) if y_is_complex and dtype_y != dtype_z: y = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), y) elif dtype_x != dtype_y: if dtype_x != dtype_z: x = "(%s) (%s)" % (dtype_to_ctype(dtype_z), x) if dtype_y != dtype_z: y = "(%s) (%s)" % (dtype_to_ctype(dtype_z), y) if x_is_complex and y_is_complex: result = "%s_pow(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) elif x_is_complex and not y_is_complex: result = "%s_powr(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) elif not x_is_complex and y_is_complex: result = "%s_rpow(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) else: result = "pow(%s, %s)" % (x, y) return get_elwise_kernel(context, ("%(tp_z)s *z, " + x_ctype + ", "+y_ctype) % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s" % result, name="pow_method") @context_dependent_memoize def get_array_scalar_comparison_kernel(context, operator, dtype_a): return get_elwise_kernel(context, [ VectorArg(np.int8, "out", with_offset=True), VectorArg(dtype_a, "a", with_offset=True), ScalarArg(dtype_a, "b"), ], "out[i] = a[i] %s b" % operator, name="scalar_comparison_kernel") @context_dependent_memoize def get_array_comparison_kernel(context, operator, dtype_a, dtype_b): return get_elwise_kernel(context, [ VectorArg(np.int8, "out", with_offset=True), VectorArg(dtype_a, "a", with_offset=True), VectorArg(dtype_b, "b", with_offset=True), ], "out[i] = a[i] %s b[i]" % operator, 
name="comparison_kernel") @context_dependent_memoize def get_fmod_kernel(context): return get_elwise_kernel(context, "float *z, float *arg, float *mod", "z[i] = fmod(arg[i], mod[i])", name="fmod_kernel") @context_dependent_memoize def get_modf_kernel(context): return get_elwise_kernel(context, "float *intpart ,float *fracpart, float *x", "fracpart[i] = modf(x[i], &intpart[i])", name="modf_kernel") @context_dependent_memoize def get_frexp_kernel(context): return get_elwise_kernel(context, "float *significand, float *exponent, float *x", """ int expt = 0; significand[i] = frexp(x[i], &expt); exponent[i] = expt; """, name="frexp_kernel") @context_dependent_memoize def get_ldexp_kernel(context): return get_elwise_kernel(context, "float *z, float *sig, float *expt", "z[i] = ldexp(sig[i], (int) expt[i])", name="ldexp_kernel") @context_dependent_memoize def get_bessel_kernel(context, which_func): return get_elwise_kernel(context, "double *z, int ord_n, double *x", "z[i] = bessel_%sn(ord_n, x[i])" % which_func, name="bessel_%sn_kernel" % which_func, preamble=""" #include """ % which_func) @context_dependent_memoize def get_unary_func_kernel(context, func_name, in_dtype, out_dtype=None): if out_dtype is None: out_dtype = in_dtype return get_elwise_kernel(context, "%(tp_out)s *z, %(tp_in)s *y" % { "tp_in": dtype_to_ctype(in_dtype), "tp_out": dtype_to_ctype(out_dtype), }, "z[i] = %s(y[i])" % func_name, name="%s_kernel" % func_name) @context_dependent_memoize def get_binary_func_kernel(context, func_name, x_dtype, y_dtype, out_dtype): return get_elwise_kernel(context, [ VectorArg(out_dtype, "z", with_offset=True), VectorArg(x_dtype, "x", with_offset=True), VectorArg(y_dtype, "y", with_offset=True), ], "z[i] = %s(x[i], y[i])" % func_name, name="%s_kernel" % func_name) @context_dependent_memoize def get_diff_kernel(context, dtype): return get_elwise_kernel(context, [ VectorArg(dtype, "result", with_offset=True), VectorArg(dtype, "array", with_offset=True), ], "result[i] = array[i+1] - array[i]", name="diff") @context_dependent_memoize def get_if_positive_kernel(context, crit_dtype, dtype): return get_elwise_kernel(context, [ VectorArg(dtype, "result", with_offset=True), VectorArg(crit_dtype, "crit", with_offset=True), VectorArg(dtype, "then_", with_offset=True), VectorArg(dtype, "else_", with_offset=True), ], "result[i] = crit[i] > 0 ? then_[i] : else_[i]", name="if_positive") # }}} # vim: fdm=marker:filetype=pyopencl pyopencl-2013.2/pyopencl/version.py0000644000175000000500000000015312245716342016037 0ustar tomussrcVERSION = (2013, 2) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS pyopencl-2013.2/pyopencl/characterize/0002755000175000000500000000000012245716340016445 5ustar tomussrcpyopencl-2013.2/pyopencl/characterize/__init__.py0000644000175000000500000002501312245716340020555 0ustar tomussrcfrom __future__ import division __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import pyopencl as cl from pytools import memoize class CLCharacterizationWarning(UserWarning): pass @memoize def has_double_support(dev): for ext in dev.extensions.split(" "): if ext == "cl_khr_fp64": return True return False def has_amd_double_support(dev): """"Fix to allow incomplete amd double support in low end boards""" for ext in dev.extensions.split(" "): if ext == "cl_amd_fp64": return True return False def reasonable_work_group_size_multiple(dev, ctx=None): try: return dev.warp_size_nv except: pass if ctx is None: ctx = cl.Context([dev]) prg = cl.Program(ctx, """ __kernel void knl(__global float *a) { a[get_global_id(0)] = 0; } """) prg.build() return prg.knl.get_work_group_info( cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, dev) def nv_compute_capability(dev): """If *dev* is an Nvidia GPU :class:`pyopencl.Device`, return a tuple *(major, minor)* indicating the device's compute capability. """ try: return (dev.compute_capability_major_nv, dev.compute_capability_minor_nv) except: return None def usable_local_mem_size(dev, nargs=None): """Return an estimate of the usable local memory size. :arg nargs: Number of 32-bit arguments passed. """ usable_local_mem_size = dev.local_mem_size nv_compute_cap = nv_compute_capability(dev) if (nv_compute_cap is not None and nv_compute_cap < (2, 0)): # pre-Fermi use local mem for parameter passing if nargs is None: # assume maximum usable_local_mem_size -= 256 else: usable_local_mem_size -= 4*nargs return usable_local_mem_size def simultaneous_work_items_on_local_access(dev): """Return the number of work items that access local memory simultaneously and thereby may conflict with each other. """ nv_compute_cap = nv_compute_capability(dev) if nv_compute_cap is not None: if nv_compute_cap < (2, 0): return 16 else: if nv_compute_cap >= (3, 0): from warnings import warn warn("wildly guessing conflicting local access size on '%s'" % dev, CLCharacterizationWarning) return 32 if dev.type & cl.device_type.GPU: from warnings import warn warn("wildly guessing conflicting local access size on '%s'" % dev, CLCharacterizationWarning) return 16 elif dev.type & cl.device_type.CPU: return 1 else: from warnings import warn warn("wildly guessing conflicting local access size on '%s'" % dev, CLCharacterizationWarning) return 16 def local_memory_access_granularity(dev): """Return the number of bytes per bank in local memory.""" return 4 def local_memory_bank_count(dev): """Return the number of banks present in local memory. 
""" nv_compute_cap = nv_compute_capability(dev) if nv_compute_cap is not None: if nv_compute_cap < (2, 0): return 16 else: if nv_compute_cap >= (3, 0): from warnings import warn warn("wildly guessing local memory bank count on '%s'" % dev, CLCharacterizationWarning) return 32 if dev.type & cl.device_type.GPU: from warnings import warn warn("wildly guessing local memory bank count on '%s'" % dev, CLCharacterizationWarning) return 16 elif dev.type & cl.device_type.CPU: if dev.local_mem_type == cl.device_local_mem_type.GLOBAL: raise RuntimeError("asking for a bank count is " "meaningless for cache-based lmem") from warnings import warn warn("wildly guessing conflicting local access size on '%s'" % dev, CLCharacterizationWarning) return 16 def why_not_local_access_conflict_free(dev, itemsize, array_shape, array_stored_shape=None): """ :param itemsize: size of accessed data in bytes :param array_shape: array dimensions, fastest-moving last (C order) :returns: a tuple (multiplicity, explanation), where *multiplicity* is the number of work items that will conflict on a bank when accessing local memory. *explanation* is a string detailing the found conflict. """ # FIXME: Treat 64-bit access on NV CC 2.x + correctly if array_stored_shape is None: array_stored_shape = array_shape rank = len(array_shape) array_shape = array_shape[::-1] array_stored_shape = array_stored_shape[::-1] gran = local_memory_access_granularity(dev) if itemsize != gran: from warnings import warn warn("local conflict info might be inaccurate " "for itemsize != %d" % gran, CLCharacterizationWarning) sim_wi = simultaneous_work_items_on_local_access(dev) bank_count = local_memory_bank_count(dev) conflicts = [] for work_item_axis in range(rank): bank_accesses = {} for work_item_id in xrange(sim_wi): addr = 0 addr_mult = itemsize idx = [] left_over_idx = work_item_id for axis, (ax_size, ax_stor_size) in enumerate( zip(array_shape, array_stored_shape)): if axis >= work_item_axis: left_over_idx, ax_idx = divmod(left_over_idx, ax_size) addr += addr_mult*ax_idx idx.append(ax_idx) else: idx.append(0) addr_mult *= ax_stor_size if left_over_idx: # out-of-bounds, assume not taking place continue bank = (addr // gran) % bank_count bank_accesses.setdefault(bank, []).append( "w.item %s -> %s" % (work_item_id, idx[::-1])) conflict_multiplicity = max( len(acc) for acc in bank_accesses.itervalues()) if conflict_multiplicity > 1: for bank, acc in bank_accesses.iteritems(): if len(acc) == conflict_multiplicity: conflicts.append( (conflict_multiplicity, "%dx conflict on axis %d (from right, 0-based): " "%s access bank %d" % ( conflict_multiplicity, work_item_axis, ", ".join(acc), bank))) if conflicts: return max(conflicts) else: return 1, None def get_fast_inaccurate_build_options(dev): """Return a list of flags valid on device *dev* that enable fast, but potentially inaccurate floating point math. """ return ["-cl-mad-enable", "-cl-fast-relaxed-math", "-cl-no-signed-zeros", "-cl-strict-aliasing"] def get_simd_group_size(dev, type_size): """Return an estimate of how many work items will be executed across SIMD lanes. This returns the size of what Nvidia calls a warp and what AMD calls a wavefront. Only refers to implicit SIMD. :arg type_size: number of bytes in vector entry type. 
""" try: return dev.warp_size_nv except: pass lc_vendor = dev.platform.vendor.lower() if "nvidia" in lc_vendor: return 32 if ("advanced micro" in lc_vendor or "ati" in lc_vendor): if dev.type & cl.device_type.GPU: # Tomasz Rybak says, in response to reduction mishbehaving on the AMD # 'Loveland' APU: # # Like in CUDA reduction bug (related to Fermi) it again seems # to be related to too eager concurrency when reducing results. # According to http://oscarbg.blogspot.com/2009/10/news-from-web.html # "Actually the wavefront size is only 64 for the highend cards(48XX, # 58XX, 57XX), but 32 for the middleend cards and 16 for the lowend # cards." # IMO we should use PREFERRED_WORK_GROUP_SIZE_MULTIPLE to get # non_sync_size. At the same size we lose SIMD CPU optimisation, # but I do not know for now how to fix those two at the same time. # Attached patch fixes problem on Loveland, not breaking anything on # NVIDIA ION. # This is therefore our best guess as to the SIMD group size. return reasonable_work_group_size_multiple(dev) elif dev.type & cl.device_type.CPU: return 1 else: raise RuntimeError("unexpected AMD device type") if dev.type & cl.device_type.CPU: # implicit assumption: Impl. will vectorize if type_size == 1: return dev.preferred_vector_width_char elif type_size == 2: return dev.preferred_vector_width_short elif type_size == 4: return dev.preferred_vector_width_float elif type_size == 8: return dev.preferred_vector_width_double else: from warnings import warn warn("unexpected dtype size in get_simd_group on CPU device, " "guessing group width 1") return 1 return None pyopencl-2013.2/pyopencl/characterize/performance.py0000644000175000000500000001567212245716340021331 0ustar tomussrcfrom __future__ import division __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" import pyopencl as cl import numpy as np # {{{ timing helpers class Timer: def __init__(self, queue): self.queue = queue def start(self): pass def stop(self): pass def add_event(self, evt): pass def get_elapsed(self): pass class WallTimer(Timer): def start(self): from time import time self.queue.finish() self.start = time() def stop(self): from time import time self.queue.finish() self.end = time() def get_elapsed(self): return self.end-self.start def _get_time(queue, f, timer_factory=None, desired_duration=0.1, warmup_rounds=3): if timer_factory is None: timer_factory = WallTimer count = 1 while True: timer = timer_factory(queue) for i in xrange(warmup_rounds): f() warmup_rounds = 0 timer.start() for i in xrange(count): timer.add_event(f()) timer.stop() elapsed = timer.get_elapsed() if elapsed < desired_duration: if elapsed == 0: count *= 5 else: new_count = int(desired_duration/elapsed) new_count = max(2*count, new_count) new_count = min(10*count, new_count) count = new_count else: return elapsed/count # }}} # {{{ transfer measurements class HostDeviceTransferBase(object): def __init__(self, queue, block_size): self.queue = queue self.host_buf = np.empty(block_size, dtype=np.uint8) self.dev_buf = cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, block_size) class HostToDeviceTransfer(HostDeviceTransferBase): def do(self): return cl.enqueue_copy(self. queue, self.dev_buf, self.host_buf) class DeviceToHostTransfer(HostDeviceTransferBase): def do(self): return cl.enqueue_copy(self. queue, self.host_buf, self.dev_buf) class DeviceToDeviceTransfer(object): def __init__(self, queue, block_size): self.queue = queue self.dev_buf_1 = cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, block_size) self.dev_buf_2 = cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, block_size) def do(self): return cl.enqueue_copy(self. queue, self.dev_buf_2, self.dev_buf_1) class HostToDeviceTransfer(HostDeviceTransferBase): def do(self): return cl.enqueue_copy(self. 
queue, self.dev_buf, self.host_buf) def transfer_latency(queue, transfer_type, timer_factory=None): transfer = transfer_type(queue, 1) return _get_time(queue, transfer.do, timer_factory=timer_factory) def transfer_bandwidth(queue, transfer_type, block_size, timer_factory=None): """Measures one-sided bandwidth.""" transfer = transfer_type(queue, block_size) return block_size/_get_time(queue, transfer.do, timer_factory=timer_factory) # }}} def get_profiling_overhead(ctx, timer_factory=None): no_prof_queue = cl.CommandQueue(ctx) transfer = DeviceToDeviceTransfer(no_prof_queue, 1) no_prof_time = _get_time(no_prof_queue, transfer.do, timer_factory=timer_factory) prof_queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) transfer = DeviceToDeviceTransfer(prof_queue, 1) prof_time = _get_time(prof_queue, transfer.do, timer_factory=timer_factory) return prof_time - no_prof_time, prof_time def get_empty_kernel_time(queue, timer_factory=None): prg = cl.Program(queue.context, """ __kernel void empty() { } """).build() knl = prg.empty def f(): knl(queue, (1,), None) return _get_time(queue, f, timer_factory=timer_factory) def _get_full_machine_kernel_rate(queue, src, args, name="benchmark", timer_factory=None): prg = cl.Program(queue.context, src).build() knl = getattr(prg, name) dev = queue.device global_size = 4 * dev.max_compute_units def f(): knl(queue, (global_size,), None, *args) rates = [] num_dips = 0 while True: elapsed = _get_time(queue, f, timer_factory=timer_factory) rate = global_size/elapsed print global_size, rate, num_dips keep_trying = not rates if rates and rate > 1.05*max(rates): # big improvement keep_trying = True num_dips = 0 if rates and rate < 0.9*max(rates) and num_dips < 3: # big dip keep_trying = True num_dips += 1 if keep_trying: global_size *= 2 last_rate = rate rates.append(rate) else: rates.append(rate) return max(rates) def get_add_rate(queue, type="float", timer_factory=None): return 50*10*_get_full_machine_kernel_rate(queue, """ typedef %(op_t)s op_t; __kernel void benchmark() { local op_t tgt[1024]; op_t val = get_global_id(0); for (int i = 0; i < 10; ++i) { val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; val += val; } tgt[get_local_id(0)] = val; } """ % dict(op_t=type), ()) # vim: foldmethod=marker:filetype=pyopencl pyopencl-2013.2/pyopencl/reduction.py0000644000175000000500000004725312245716340016360 0ustar tomussrc"""Computation of reductions on vectors.""" from __future__ import division __copyright__ = "Copyright (C) 2010 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright 
notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Based on code/ideas by Mark Harris . None of the original source code remains. """ import pyopencl as cl from pyopencl.tools import ( context_dependent_memoize, dtype_to_ctype, KernelTemplateBase, _process_code_for_macro) import numpy as np # {{{ kernel source KERNEL = """//CL// #define GROUP_SIZE ${group_size} #define READ_AND_MAP(i) (${map_expr}) #define REDUCE(a, b) (${reduce_expr}) % if double_support: #pragma OPENCL EXTENSION cl_khr_fp64: enable #define PYOPENCL_DEFINE_CDOUBLE % endif #include ${preamble} typedef ${out_type} out_type; __kernel void ${name}( __global out_type *out, ${arguments}, unsigned int seq_count, unsigned int n) { ${arg_prep} __local out_type ldata[GROUP_SIZE]; unsigned int lid = get_local_id(0); unsigned int i = get_group_id(0)*GROUP_SIZE*seq_count + lid; out_type acc = ${neutral}; for (unsigned s = 0; s < seq_count; ++s) { if (i >= n) break; acc = REDUCE(acc, READ_AND_MAP(i)); i += GROUP_SIZE; } ldata[lid] = acc; <% cur_size = group_size %> % while cur_size > no_sync_size: barrier(CLK_LOCAL_MEM_FENCE); <% new_size = cur_size // 2 assert new_size * 2 == cur_size %> if (lid < ${new_size}) { ldata[lid] = REDUCE( ldata[lid], ldata[lid + ${new_size}]); } <% cur_size = new_size %> % endwhile % if cur_size > 1: ## we need to synchronize one last time for entry into the ## no-sync region. barrier(CLK_LOCAL_MEM_FENCE); <% # NB: There's an exact duplicate of this calculation in the # %while loop below. 
new_size = cur_size // 2 assert new_size * 2 == cur_size %> if (lid < ${new_size}) { __local volatile out_type *lvdata = ldata; % while cur_size > 1: <% new_size = cur_size // 2 assert new_size * 2 == cur_size %> lvdata[lid] = REDUCE( lvdata[lid], lvdata[lid + ${new_size}]); <% cur_size = new_size %> % endwhile } % endif if (lid == 0) out[get_group_id(0)] = ldata[0]; } """ # }}} # {{{ internal codegen frontends def _get_reduction_source( ctx, out_type, out_type_size, neutral, reduce_expr, map_expr, parsed_args, name="reduce_kernel", preamble="", arg_prep="", device=None, max_group_size=None): if device is not None: devices = [device] else: devices = ctx.devices # {{{ compute group size def get_dev_group_size(device): # dirty fix for the RV770 boards max_work_group_size = device.max_work_group_size if "RV770" in device.name: max_work_group_size = 64 # compute lmem limit from pytools import div_ceil lmem_wg_size = div_ceil(max_work_group_size, out_type_size) result = min(max_work_group_size, lmem_wg_size) # round down to power of 2 from pyopencl.tools import bitlog2 return 2**bitlog2(result) group_size = min(get_dev_group_size(dev) for dev in devices) if max_group_size is not None: group_size = min(max_group_size, group_size) # }}} # {{{ compute synchronization-less group size def get_dev_no_sync_size(device): from pyopencl.characterize import get_simd_group_size result = get_simd_group_size(device, out_type_size) if result is None: from warnings import warn warn("Reduction might be unnecessarily slow: " "can't query SIMD group size") return 1 return result no_sync_size = min(get_dev_no_sync_size(dev) for dev in devices) # }}} from mako.template import Template from pytools import all from pyopencl.characterize import has_double_support src = str(Template(KERNEL).render( out_type=out_type, arguments=", ".join(arg.declarator() for arg in parsed_args), group_size=group_size, no_sync_size=no_sync_size, neutral=neutral, reduce_expr=_process_code_for_macro(reduce_expr), map_expr=_process_code_for_macro(map_expr), name=name, preamble=preamble, arg_prep=arg_prep, double_support=all(has_double_support(dev) for dev in devices), )) from pytools import Record class ReductionInfo(Record): pass return ReductionInfo( context=ctx, source=src, group_size=group_size) def get_reduction_kernel(stage, ctx, dtype_out, neutral, reduce_expr, map_expr=None, arguments=None, name="reduce_kernel", preamble="", device=None, options=[], max_group_size=None): if map_expr is None: if stage == 2: map_expr = "pyopencl_reduction_inp[i]" else: map_expr = "in[i]" from pyopencl.tools import ( parse_arg_list, get_arg_list_scalar_arg_dtypes, get_arg_offset_adjuster_code, VectorArg) arg_prep = "" if stage == 1 and arguments is not None: arguments = parse_arg_list(arguments, with_offset=True) arg_prep = get_arg_offset_adjuster_code(arguments) if stage == 2 and arguments is not None: arguments = parse_arg_list(arguments) arguments = ( [VectorArg(dtype_out, "pyopencl_reduction_inp")] + arguments) inf = _get_reduction_source( ctx, dtype_to_ctype(dtype_out), dtype_out.itemsize, neutral, reduce_expr, map_expr, arguments, name, preamble, arg_prep, device, max_group_size) inf.program = cl.Program(ctx, inf.source) inf.program.build(options) inf.kernel = getattr(inf.program, name) inf.arg_types = arguments inf.kernel.set_scalar_arg_dtypes( [None] + get_arg_list_scalar_arg_dtypes(inf.arg_types) + [np.uint32]*2) return inf # }}} # {{{ main reduction kernel class ReductionKernel: def __init__(self, ctx, dtype_out, neutral, reduce_expr, 
map_expr=None, arguments=None, name="reduce_kernel", options=[], preamble=""): dtype_out = self.dtype_out = np.dtype(dtype_out) max_group_size = None trip_count = 0 while True: self.stage_1_inf = get_reduction_kernel(1, ctx, dtype_out, neutral, reduce_expr, map_expr, arguments, name=name+"_stage1", options=options, preamble=preamble, max_group_size=max_group_size) kernel_max_wg_size = self.stage_1_inf.kernel.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, ctx.devices[0]) if self.stage_1_inf.group_size <= kernel_max_wg_size: break else: max_group_size = kernel_max_wg_size trip_count += 1 assert trip_count <= 2 self.stage_2_inf = get_reduction_kernel(2, ctx, dtype_out, neutral, reduce_expr, arguments=arguments, name=name+"_stage2", options=options, preamble=preamble, max_group_size=max_group_size) from pytools import any from pyopencl.tools import VectorArg assert any( isinstance(arg_tp, VectorArg) for arg_tp in self.stage_1_inf.arg_types), \ "ReductionKernel can only be used with functions " \ "that have at least one vector argument" def __call__(self, *args, **kwargs): MAX_GROUP_COUNT = 1024 SMALL_SEQ_COUNT = 4 from pyopencl.array import empty stage_inf = self.stage_1_inf queue = kwargs.pop("queue", None) wait_for = kwargs.pop("wait_for", None) return_event = kwargs.pop("return_event", False) if kwargs: raise TypeError("invalid keyword argument to reduction kernel") stage1_args = args while True: invocation_args = [] vectors = [] from pyopencl.tools import VectorArg for arg, arg_tp in zip(args, stage_inf.arg_types): if isinstance(arg_tp, VectorArg): if not arg.flags.forc: raise RuntimeError("ReductionKernel cannot " "deal with non-contiguous arrays") vectors.append(arg) invocation_args.append(arg.base_data) if arg_tp.with_offset: invocation_args.append(arg.offset) else: invocation_args.append(arg) repr_vec = vectors[0] sz = repr_vec.size if queue is not None: use_queue = queue else: use_queue = repr_vec.queue if sz <= stage_inf.group_size*SMALL_SEQ_COUNT*MAX_GROUP_COUNT: total_group_size = SMALL_SEQ_COUNT*stage_inf.group_size group_count = (sz + total_group_size - 1) // total_group_size seq_count = SMALL_SEQ_COUNT else: group_count = MAX_GROUP_COUNT macrogroup_size = group_count*stage_inf.group_size seq_count = (sz + macrogroup_size - 1) // macrogroup_size if group_count == 1: result = empty(use_queue, (), self.dtype_out, allocator=repr_vec.allocator) else: result = empty(use_queue, (group_count,), self.dtype_out, allocator=repr_vec.allocator) last_evt = stage_inf.kernel( use_queue, (group_count*stage_inf.group_size,), (stage_inf.group_size,), *([result.data]+invocation_args+[seq_count, sz]), **dict(wait_for=wait_for)) wait_for = [last_evt] if group_count == 1: if return_event: return result, last_evt else: return result else: stage_inf = self.stage_2_inf args = (result,) + stage1_args # }}} # {{{ template class ReductionTemplate(KernelTemplateBase): def __init__(self, arguments, neutral, reduce_expr, map_expr=None, is_segment_start_expr=None, input_fetch_exprs=[], name_prefix="reduce", preamble="", template_processor=None): KernelTemplateBase.__init__( self, template_processor=template_processor) self.arguments = arguments self.reduce_expr = reduce_expr self.neutral = neutral self.map_expr = map_expr self.name_prefix = name_prefix self.preamble = preamble def build_inner(self, context, type_aliases=(), var_values=(), more_preamble="", more_arguments=(), declare_types=(), options=(), devices=None): renderer = self.get_renderer( type_aliases, var_values, context, options) 
arg_list = renderer.render_argument_list( self.arguments, more_arguments) type_decl_preamble = renderer.get_type_decl_preamble( context.devices[0], declare_types, arg_list) return ReductionKernel(context, renderer.type_aliases["reduction_t"], renderer(self.neutral), renderer(self.reduce_expr), renderer(self.map_expr), renderer.render_argument_list(self.arguments, more_arguments), name=renderer(self.name_prefix), options=list(options), preamble=( type_decl_preamble + "\n" + renderer(self.preamble + "\n" + more_preamble))) # }}} # {{{ array reduction kernel getters @context_dependent_memoize def get_any_kernel(ctx, dtype_in): from pyopencl.tools import VectorArg return ReductionKernel(ctx, np.int8, "false", "a || b", map_expr="(bool) (in[i])", arguments=[VectorArg(dtype_in, "in")]) @context_dependent_memoize def get_all_kernel(ctx, dtype_in): from pyopencl.tools import VectorArg return ReductionKernel(ctx, np.int8, "true", "a && b", map_expr="(bool) (in[i])", arguments=[VectorArg(dtype_in, "in")]) @context_dependent_memoize def get_sum_kernel(ctx, dtype_out, dtype_in): if dtype_out is None: dtype_out = dtype_in return ReductionKernel(ctx, dtype_out, "0", "a+b", arguments="const %(tp)s *in" % {"tp": dtype_to_ctype(dtype_in)}) def _get_dot_expr(dtype_out, dtype_a, dtype_b, conjugate_first, has_double_support, index_expr="i"): if dtype_b is None: if dtype_a is None: dtype_b = dtype_out else: dtype_b = dtype_a if dtype_out is None: from pyopencl.compyte.array import get_common_dtype dtype_out = get_common_dtype( dtype_a.type(0), dtype_b.type(0), has_double_support) a_real_dtype = dtype_a.type(0).real.dtype b_real_dtype = dtype_b.type(0).real.dtype out_real_dtype = dtype_out.type(0).real.dtype a_is_complex = dtype_a.kind == "c" b_is_complex = dtype_b.kind == "c" out_is_complex = dtype_out.kind == "c" from pyopencl.elementwise import complex_dtype_to_name if a_is_complex and b_is_complex: a = "a[%s]" % index_expr b = "b[%s]" % index_expr if dtype_a != dtype_out: a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a) if dtype_b != dtype_out: b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b) if conjugate_first and a_is_complex: a = "%s_conj(%s)" % ( complex_dtype_to_name(dtype_out), a) map_expr = "%s_mul(%s, %s)" % ( complex_dtype_to_name(dtype_out), a, b) else: a = "a[%s]" % index_expr b = "b[%s]" % index_expr if out_is_complex: if a_is_complex and dtype_a != dtype_out: a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a) if b_is_complex and dtype_b != dtype_out: b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b) if not a_is_complex and a_real_dtype != out_real_dtype: a = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), a) if not b_is_complex and b_real_dtype != out_real_dtype: b = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), b) if conjugate_first and a_is_complex: a = "%s_conj(%s)" % ( complex_dtype_to_name(dtype_out), a) map_expr = "%s*%s" % (a, b) return map_expr, dtype_out, dtype_b @context_dependent_memoize def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None, conjugate_first=False): from pyopencl.characterize import has_double_support map_expr, dtype_out, dtype_b = _get_dot_expr( dtype_out, dtype_a, dtype_b, conjugate_first, has_double_support=has_double_support(ctx.devices[0])) return ReductionKernel(ctx, dtype_out, neutral="0", reduce_expr="a+b", map_expr=map_expr, arguments= "const %(tp_a)s *a, " "const %(tp_b)s *b" % { "tp_a": dtype_to_ctype(dtype_a), "tp_b": dtype_to_ctype(dtype_b), }) @context_dependent_memoize def get_subset_dot_kernel(ctx, dtype_out, 
dtype_subset, dtype_a=None, dtype_b=None, conjugate_first=False): from pyopencl.characterize import has_double_support map_expr, dtype_out, dtype_b = _get_dot_expr( dtype_out, dtype_a, dtype_b, conjugate_first, has_double_support=has_double_support(ctx.devices[0]), index_expr="lookup_tbl[i]") # important: lookup_tbl must be first--it controls the length return ReductionKernel(ctx, dtype_out, neutral="0", reduce_expr="a+b", map_expr=map_expr, arguments= "const %(tp_lut)s *lookup_tbl, " "const %(tp_a)s *a, " "const %(tp_b)s *b" % { "tp_lut": dtype_to_ctype(dtype_subset), "tp_a": dtype_to_ctype(dtype_a), "tp_b": dtype_to_ctype(dtype_b), }) def get_minmax_neutral(what, dtype): dtype = np.dtype(dtype) if issubclass(dtype.type, np.inexact): if what == "min": return "MY_INFINITY" elif what == "max": return "-MY_INFINITY" else: raise ValueError("what is not min or max.") else: if what == "min": return str(np.iinfo(dtype).max) elif what == "max": return str(np.iinfo(dtype).min) else: raise ValueError("what is not min or max.") @context_dependent_memoize def get_minmax_kernel(ctx, what, dtype): if dtype.kind == "f": reduce_expr = "f%s(a,b)" % what elif dtype.kind in "iu": reduce_expr = "%s(a,b)" % what else: raise TypeError("unsupported dtype specified") return ReductionKernel(ctx, dtype, neutral=get_minmax_neutral(what, dtype), reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr}, arguments="const %(tp)s *in" % { "tp": dtype_to_ctype(dtype), }, preamble="#define MY_INFINITY (1./0)") @context_dependent_memoize def get_subset_minmax_kernel(ctx, what, dtype, dtype_subset): if dtype.kind == "f": reduce_expr = "f%s(a,b)" % what elif dtype.kind in "iu": reduce_expr = "%s(a,b)" % what else: raise TypeError("unsupported dtype specified") return ReductionKernel(ctx, dtype, neutral=get_minmax_neutral(what, dtype), reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr}, map_expr="in[lookup_tbl[i]]", arguments= "const %(tp_lut)s *lookup_tbl, " "const %(tp)s *in" % { "tp": dtype_to_ctype(dtype), "tp_lut": dtype_to_ctype(dtype_subset), }, preamble="#define MY_INFINITY (1./0)") # }}} # vim: filetype=pyopencl:fdm=marker pyopencl-2013.2/pyopencl/clmath.py0000644000175000000500000001520012245716340015617 0ustar tomussrc__copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" import pyopencl.array as cl_array import pyopencl.elementwise as elementwise from pyopencl.array import _get_common_dtype def _make_unary_array_func(name): @cl_array.elwise_kernel_runner def knl_runner(result, arg): if arg.dtype.kind == "c": from pyopencl.elementwise import complex_dtype_to_name fname = "%s_%s" % (complex_dtype_to_name(arg.dtype), name) else: fname = name return elementwise.get_unary_func_kernel( result.context, fname, arg.dtype) def f(array, queue=None): result = array._new_like_me(queue=queue) knl_runner(result, array, queue=queue) return result return f # See table 6.8 in the CL 1.1 spec acos = _make_unary_array_func("acos") acosh = _make_unary_array_func("acosh") acospi = _make_unary_array_func("acospi") asin = _make_unary_array_func("asin") asinh = _make_unary_array_func("asinh") asinpi = _make_unary_array_func("asinpi") @cl_array.elwise_kernel_runner def _atan2(result, arg1, arg2): return elementwise.get_binary_func_kernel(result.context, "atan2", arg1.dtype, arg2.dtype, result.dtype) @cl_array.elwise_kernel_runner def _atan2pi(result, arg1, arg2): return elementwise.get_binary_func_kernel(result.context, "atan2pi", arg1.dtype, arg2.dtype, result.dtype) atan = _make_unary_array_func("atan") def atan2(y, x, queue=None): """ .. versionadded:: 2013.1 """ queue = queue or y.queue result = y._new_like_me(_get_common_dtype(y, x, queue)) _atan2(result, y, x, queue=queue) return result atanh = _make_unary_array_func("atanh") atanpi = _make_unary_array_func("atanpi") def atan2pi(y, x, queue=None): """ .. versionadded:: 2013.1 """ queue = queue or y.queue result = y._new_like_me(_get_common_dtype(y, x, queue)) _atan2pi(result, y, x, queue=queue) return result cbrt = _make_unary_array_func("cbrt") ceil = _make_unary_array_func("ceil") # TODO: copysign cos = _make_unary_array_func("cos") cosh = _make_unary_array_func("cosh") cospi = _make_unary_array_func("cospi") erfc = _make_unary_array_func("erfc") erf = _make_unary_array_func("erf") exp = _make_unary_array_func("exp") exp2 = _make_unary_array_func("exp2") exp10 = _make_unary_array_func("exp10") expm1 = _make_unary_array_func("expm1") fabs = _make_unary_array_func("fabs") # TODO: fdim floor = _make_unary_array_func("floor") # TODO: fma # TODO: fmax # TODO: fmin @cl_array.elwise_kernel_runner def _fmod(result, arg, mod): return elementwise.get_fmod_kernel(result.context) def fmod(arg, mod, queue=None): """Return the floating point remainder of the division `arg/mod`, for each element in `arg` and `mod`.""" result = arg._new_like_me(queue=queue) _fmod(result, arg, mod, queue=queue) return result # TODO: fract @cl_array.elwise_kernel_runner def _frexp(sig, expt, arg): return elementwise.get_frexp_kernel(sig.context) def frexp(arg, queue=None): """Return a tuple `(significands, exponents)` such that `arg == significand * 2**exponent`. """ sig = arg._new_like_me(queue=queue) expt = arg._new_like_me(queue=queue) _frexp(sig, expt, arg, queue=queue) return sig, expt # TODO: hypot ilogb = _make_unary_array_func("ilogb") @cl_array.elwise_kernel_runner def _ldexp(result, sig, exp): return elementwise.get_ldexp_kernel(result.context) def ldexp(significand, exponent, queue=None): """Return a new array of floating point values composed from the entries of `significand` and `exponent`, paired together as `result = significand * 2**exponent`. 
""" result = significand._new_like_me(queue=queue) _ldexp(result, significand, exponent) return result lgamma = _make_unary_array_func("lgamma") # TODO: lgamma_r log = _make_unary_array_func("log") log2 = _make_unary_array_func("log2") log10 = _make_unary_array_func("log10") log1p = _make_unary_array_func("log1p") logb = _make_unary_array_func("logb") # TODO: mad # TODO: maxmag # TODO: minmag @cl_array.elwise_kernel_runner def _modf(intpart, fracpart, arg): return elementwise.get_modf_kernel(intpart.context) def modf(arg, queue=None): """Return a tuple `(fracpart, intpart)` of arrays containing the integer and fractional parts of `arg`. """ intpart = arg._new_like_me(queue=queue) fracpart = arg._new_like_me(queue=queue) _modf(intpart, fracpart, arg, queue=queue) return fracpart, intpart nan = _make_unary_array_func("nan") # TODO: nextafter # TODO: remainder # TODO: remquo rint = _make_unary_array_func("rint") # TODO: rootn round = _make_unary_array_func("round") sin = _make_unary_array_func("sin") # TODO: sincos sinh = _make_unary_array_func("sinh") sinpi = _make_unary_array_func("sinpi") sqrt = _make_unary_array_func("sqrt") tan = _make_unary_array_func("tan") tanh = _make_unary_array_func("tanh") tanpi = _make_unary_array_func("tanpi") tgamma = _make_unary_array_func("tgamma") trunc = _make_unary_array_func("trunc") # no point wrapping half_ or native_ # TODO: table 6.10, integer functions # TODO: table 6.12, clamp et al @cl_array.elwise_kernel_runner def _bessel_jn(result, sig, exp): return elementwise.get_bessel_kernel(result.context, "j") @cl_array.elwise_kernel_runner def _bessel_yn(result, sig, exp): return elementwise.get_bessel_kernel(result.context, "y") def bessel_jn(n, x, queue=None): result = x._new_like_me(queue=queue) _bessel_jn(result, n, x) return result def bessel_yn(n, x, queue=None): result = x._new_like_me(queue=queue) _bessel_yn(result, n, x) return result pyopencl-2013.2/pyopencl/cl/0002755000175000000500000000000012245716340014377 5ustar tomussrcpyopencl-2013.2/pyopencl/cl/pyopencl-bessel-j.cl0000644000175000000500000005535212245716340020262 0ustar tomussrc// Pieced together from Boost C++ and Cephes by // Andreas Kloeckner (C) 2012 // // Pieces from: // // Copyright (c) 2006 Xiaogang Zhang, John Maddock // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See // http://www.boost.org/LICENSE_1_0.txt) // // Cephes Math Library Release 2.8: June, 2000 // Copyright 1984, 1987, 1989, 1992, 2000 by Stephen L. Moshier // What you see here may be used freely, but it comes with no support or // guarantee. 
#pragma once #include #include typedef double bessel_j_scalar_type; // FIXME: T is really a bad name typedef bessel_j_scalar_type T; // {{{ bessel_j0 __constant const bessel_j_scalar_type bessel_j0_P1[] = { -4.1298668500990866786e+11, 2.7282507878605942706e+10, -6.2140700423540120665e+08, 6.6302997904833794242e+06, -3.6629814655107086448e+04, 1.0344222815443188943e+02, -1.2117036164593528341e-01 }; __constant const bessel_j_scalar_type bessel_j0_Q1[] = { 2.3883787996332290397e+12, 2.6328198300859648632e+10, 1.3985097372263433271e+08, 4.5612696224219938200e+05, 9.3614022392337710626e+02, 1.0, 0.0 }; __constant const bessel_j_scalar_type bessel_j0_P2[] = { -1.8319397969392084011e+03, -1.2254078161378989535e+04, -7.2879702464464618998e+03, 1.0341910641583726701e+04, 1.1725046279757103576e+04, 4.4176707025325087628e+03, 7.4321196680624245801e+02, 4.8591703355916499363e+01 }; __constant const bessel_j_scalar_type bessel_j0_Q2[] = { -3.5783478026152301072e+05, 2.4599102262586308984e+05, -8.4055062591169562211e+04, 1.8680990008359188352e+04, -2.9458766545509337327e+03, 3.3307310774649071172e+02, -2.5258076240801555057e+01, 1.0 }; __constant const bessel_j_scalar_type bessel_j0_PC[] = { 2.2779090197304684302e+04, 4.1345386639580765797e+04, 2.1170523380864944322e+04, 3.4806486443249270347e+03, 1.5376201909008354296e+02, 8.8961548424210455236e-01 }; __constant const bessel_j_scalar_type bessel_j0_QC[] = { 2.2779090197304684318e+04, 4.1370412495510416640e+04, 2.1215350561880115730e+04, 3.5028735138235608207e+03, 1.5711159858080893649e+02, 1.0 }; __constant const bessel_j_scalar_type bessel_j0_PS[] = { -8.9226600200800094098e+01, -1.8591953644342993800e+02, -1.1183429920482737611e+02, -2.2300261666214198472e+01, -1.2441026745835638459e+00, -8.8033303048680751817e-03 }; __constant const bessel_j_scalar_type bessel_j0_QS[] = { 5.7105024128512061905e+03, 1.1951131543434613647e+04, 7.2642780169211018836e+03, 1.4887231232283756582e+03, 9.0593769594993125859e+01, 1.0 }; bessel_j_scalar_type bessel_j0(bessel_j_scalar_type x) { const bessel_j_scalar_type x1 = 2.4048255576957727686e+00, x2 = 5.5200781102863106496e+00, x11 = 6.160e+02, x12 = -1.42444230422723137837e-03, x21 = 1.4130e+03, x22 = 5.46860286310649596604e-04; bessel_j_scalar_type value, factor, r, rc, rs; if (x < 0) { x = -x; // even function } if (x == 0) { return 1; } if (x <= 4) // x in (0, 4] { bessel_j_scalar_type y = x * x; r = boost_evaluate_rational(bessel_j0_P1, bessel_j0_Q1, y); factor = (x + x1) * ((x - x11/256) - x12); value = factor * r; } else if (x <= 8.0) // x in (4, 8] { bessel_j_scalar_type y = 1 - (x * x)/64; r = boost_evaluate_rational(bessel_j0_P2, bessel_j0_Q2, y); factor = (x + x2) * ((x - x21/256) - x22); value = factor * r; } else // x in (8, \infty) { bessel_j_scalar_type y = 8 / x; bessel_j_scalar_type y2 = y * y; bessel_j_scalar_type z = x - 0.25f * M_PI; rc = boost_evaluate_rational(bessel_j0_PC, bessel_j0_QC, y2); rs = boost_evaluate_rational(bessel_j0_PS, bessel_j0_QS, y2); factor = sqrt(2 / (x * M_PI)); value = factor * (rc * cos(z) - y * rs * sin(z)); } return value; } // }}} // {{{ bessel_j1 __constant const bessel_j_scalar_type bessel_j1_P1[] = { -1.4258509801366645672e+11, 6.6781041261492395835e+09, -1.1548696764841276794e+08, 9.8062904098958257677e+05, -4.4615792982775076130e+03, 1.0650724020080236441e+01, -1.0767857011487300348e-02 }; __constant const bessel_j_scalar_type bessel_j1_Q1[] = { 4.1868604460820175290e+12, 4.2091902282580133541e+10, 2.0228375140097033958e+08, 5.9117614494174794095e+05, 
1.0742272239517380498e+03, 1.0, 0.0 }; __constant const bessel_j_scalar_type bessel_j1_P2[] = { -1.7527881995806511112e+16, 1.6608531731299018674e+15, -3.6658018905416665164e+13, 3.5580665670910619166e+11, -1.8113931269860667829e+09, 5.0793266148011179143e+06, -7.5023342220781607561e+03, 4.6179191852758252278e+00 }; __constant const bessel_j_scalar_type bessel_j1_Q2[] = { 1.7253905888447681194e+18, 1.7128800897135812012e+16, 8.4899346165481429307e+13, 2.7622777286244082666e+11, 6.4872502899596389593e+08, 1.1267125065029138050e+06, 1.3886978985861357615e+03, 1.0 }; __constant const bessel_j_scalar_type bessel_j1_PC[] = { -4.4357578167941278571e+06, -9.9422465050776411957e+06, -6.6033732483649391093e+06, -1.5235293511811373833e+06, -1.0982405543459346727e+05, -1.6116166443246101165e+03, 0.0 }; __constant const bessel_j_scalar_type bessel_j1_QC[] = { -4.4357578167941278568e+06, -9.9341243899345856590e+06, -6.5853394797230870728e+06, -1.5118095066341608816e+06, -1.0726385991103820119e+05, -1.4550094401904961825e+03, 1.0 }; __constant const bessel_j_scalar_type bessel_j1_PS[] = { 3.3220913409857223519e+04, 8.5145160675335701966e+04, 6.6178836581270835179e+04, 1.8494262873223866797e+04, 1.7063754290207680021e+03, 3.5265133846636032186e+01, 0.0 }; __constant const bessel_j_scalar_type bessel_j1_QS[] = { 7.0871281941028743574e+05, 1.8194580422439972989e+06, 1.4194606696037208929e+06, 4.0029443582266975117e+05, 3.7890229745772202641e+04, 8.6383677696049909675e+02, 1.0 }; bessel_j_scalar_type bessel_j1(bessel_j_scalar_type x) { const bessel_j_scalar_type x1 = 3.8317059702075123156e+00, x2 = 7.0155866698156187535e+00, x11 = 9.810e+02, x12 = -3.2527979248768438556e-04, x21 = 1.7960e+03, x22 = -3.8330184381246462950e-05; bessel_j_scalar_type value, factor, r, rc, rs, w; w = fabs(x); if (x == 0) { return 0; } if (w <= 4) // w in (0, 4] { bessel_j_scalar_type y = x * x; r = boost_evaluate_rational(bessel_j1_P1, bessel_j1_Q1, y); factor = w * (w + x1) * ((w - x11/256) - x12); value = factor * r; } else if (w <= 8) // w in (4, 8] { bessel_j_scalar_type y = x * x; r = boost_evaluate_rational(bessel_j1_P2, bessel_j1_Q2, y); factor = w * (w + x2) * ((w - x21/256) - x22); value = factor * r; } else // w in (8, \infty) { bessel_j_scalar_type y = 8 / w; bessel_j_scalar_type y2 = y * y; bessel_j_scalar_type z = w - 0.75f * M_PI; rc = boost_evaluate_rational(bessel_j1_PC, bessel_j1_QC, y2); rs = boost_evaluate_rational(bessel_j1_PS, bessel_j1_QS, y2); factor = sqrt(2 / (w * M_PI)); value = factor * (rc * cos(z) - y * rs * sin(z)); } if (x < 0) { value *= -1; // odd function } return value; } // }}} // {{{ bessel_recur /* Reduce the order by backward recurrence. * AMS55 #9.1.27 and 9.1.73. 
*/ #define BESSEL_BIG 1.44115188075855872E+17 double bessel_recur(double *n, double x, double *newn, int cancel ) { double pkm2, pkm1, pk, qkm2, qkm1; /* double pkp1; */ double k, ans, qk, xk, yk, r, t, kf; const double big = BESSEL_BIG; int nflag, ctr; /* continued fraction for Jn(x)/Jn-1(x) */ if( *n < 0.0 ) nflag = 1; else nflag = 0; fstart: #if DEBUG printf( "recur: n = %.6e, newn = %.6e, cfrac = ", *n, *newn ); #endif pkm2 = 0.0; qkm2 = 1.0; pkm1 = x; qkm1 = *n + *n; xk = -x * x; yk = qkm1; ans = 1.0; ctr = 0; do { yk += 2.0; pk = pkm1 * yk + pkm2 * xk; qk = qkm1 * yk + qkm2 * xk; pkm2 = pkm1; pkm1 = pk; qkm2 = qkm1; qkm1 = qk; if( qk != 0 ) r = pk/qk; else r = 0.0; if( r != 0 ) { t = fabs( (ans - r)/r ); ans = r; } else t = 1.0; if( ++ctr > 1000 ) { //mtherr( "jv", UNDERFLOW ); pk = nan((uint)24); goto done; } if( t < DBL_EPSILON ) goto done; if( fabs(pk) > big ) { pkm2 /= big; pkm1 /= big; qkm2 /= big; qkm1 /= big; } } while( t > DBL_EPSILON ); done: #if DEBUG printf( "%.6e\n", ans ); #endif /* Change n to n-1 if n < 0 and the continued fraction is small */ if( nflag > 0 ) { if( fabs(ans) < 0.125 ) { nflag = -1; *n = *n - 1.0; goto fstart; } } kf = *newn; /* backward recurrence * 2k * J (x) = --- J (x) - J (x) * k-1 x k k+1 */ pk = 1.0; pkm1 = 1.0/ans; k = *n - 1.0; r = 2 * k; do { pkm2 = (pkm1 * r - pk * x) / x; /* pkp1 = pk; */ pk = pkm1; pkm1 = pkm2; r -= 2.0; /* t = fabs(pkp1) + fabs(pk); if( (k > (kf + 2.5)) && (fabs(pkm1) < 0.25*t) ) { k -= 1.0; t = x*x; pkm2 = ( (r*(r+2.0)-t)*pk - r*x*pkp1 )/t; pkp1 = pk; pk = pkm1; pkm1 = pkm2; r -= 2.0; } */ k -= 1.0; } while( k > (kf + 0.5) ); /* Take the larger of the last two iterates * on the theory that it may have less cancellation error. */ if( cancel ) { if( (kf >= 0.0) && (fabs(pk) > fabs(pkm1)) ) { k += 1.0; pkm2 = pk; } } *newn = k; #if DEBUG printf( "newn %.6e rans %.6e\n", k, pkm2 ); #endif return( pkm2 ); } // }}} // {{{ bessel_jvs #define BESSEL_MAXGAM 171.624376956302725 #define BESSEL_MAXLOG 7.09782712893383996843E2 /* Ascending power series for Jv(x). * AMS55 #9.1.10. 
*/ double bessel_jvs(double n, double x) { double t, u, y, z, k; int ex; int sgngam = 1; z = -x * x / 4.0; u = 1.0; y = u; k = 1.0; t = 1.0; while( t > DBL_EPSILON ) { u *= z / (k * (n+k)); y += u; k += 1.0; if( y != 0 ) t = fabs( u/y ); } #if DEBUG printf( "power series=%.5e ", y ); #endif t = frexp( 0.5*x, &ex ); ex = ex * n; if( (ex > -1023) && (ex < 1023) && (n > 0.0) && (n < (BESSEL_MAXGAM-1.0)) ) { t = pow( 0.5*x, n ) / tgamma( n + 1.0 ); #if DEBUG printf( "pow(.5*x, %.4e)/gamma(n+1)=%.5e\n", n, t ); #endif y *= t; } else { #if DEBUG z = n * log(0.5*x); k = lgamma( n+1.0 ); t = z - k; printf( "log pow=%.5e, lgam(%.4e)=%.5e\n", z, n+1.0, k ); #else t = n * log(0.5*x) - lgamma(n + 1.0); #endif if( y < 0 ) { sgngam = -sgngam; y = -y; } t += log(y); #if DEBUG printf( "log y=%.5e\n", log(y) ); #endif if( t < -BESSEL_MAXLOG ) { return( 0.0 ); } if( t > BESSEL_MAXLOG ) { // mtherr( "Jv", OVERFLOW ); return( DBL_MAX); } y = sgngam * exp( t ); } return(y); } // }}} // {{{ bessel_jnt __constant const double bessel_jnt_PF2[] = { -9.0000000000000000000e-2, 8.5714285714285714286e-2 }; __constant const double bessel_jnt_PF3[] = { 1.3671428571428571429e-1, -5.4920634920634920635e-2, -4.4444444444444444444e-3 }; __constant const double bessel_jnt_PF4[] = { 1.3500000000000000000e-3, -1.6036054421768707483e-1, 4.2590187590187590188e-2, 2.7330447330447330447e-3 }; __constant const double bessel_jnt_PG1[] = { -2.4285714285714285714e-1, 1.4285714285714285714e-2 }; __constant const double bessel_jnt_PG2[] = { -9.0000000000000000000e-3, 1.9396825396825396825e-1, -1.1746031746031746032e-2 }; __constant const double bessel_jnt_PG3[] = { 1.9607142857142857143e-2, -1.5983694083694083694e-1, 6.3838383838383838384e-3 }; double bessel_jnt(double n, double x) { double z, zz, z3; double cbn, n23, cbtwo; double ai, aip, bi, bip; /* Airy functions */ double nk, fk, gk, pp, qq; double F[5], G[4]; int k; cbn = cbrt(n); z = (x - n)/cbn; cbtwo = cbrt( 2.0 ); /* Airy function */ zz = -cbtwo * z; airy( zz, &ai, &aip, &bi, &bip ); /* polynomials in expansion */ zz = z * z; z3 = zz * z; F[0] = 1.0; F[1] = -z/5.0; F[2] = cephes_polevl( z3, bessel_jnt_PF2, 1 ) * zz; F[3] = cephes_polevl( z3, bessel_jnt_PF3, 2 ); F[4] = cephes_polevl( z3, bessel_jnt_PF4, 3 ) * z; G[0] = 0.3 * zz; G[1] = cephes_polevl( z3, bessel_jnt_PG1, 1 ); G[2] = cephes_polevl( z3, bessel_jnt_PG2, 2 ) * z; G[3] = cephes_polevl( z3, bessel_jnt_PG3, 2 ) * zz; #if DEBUG for( k=0; k<=4; k++ ) printf( "F[%d] = %.5E\n", k, F[k] ); for( k=0; k<=3; k++ ) printf( "G[%d] = %.5E\n", k, G[k] ); #endif pp = 0.0; qq = 0.0; nk = 1.0; n23 = cbrt( n * n ); for( k=0; k<=4; k++ ) { fk = F[k]*nk; pp += fk; if( k != 4 ) { gk = G[k]*nk; qq += gk; } #if DEBUG printf("fk[%d] %.5E, gk[%d] %.5E\n", k, fk, k, gk ); #endif nk /= n23; } fk = cbtwo * ai * pp/cbn + cbrt(4.0) * aip * qq/n; return(fk); } // }}} // {{{ bessel_jnx __constant const double bessel_jnx_lambda[] = { 1.0, 1.041666666666666666666667E-1, 8.355034722222222222222222E-2, 1.282265745563271604938272E-1, 2.918490264641404642489712E-1, 8.816272674437576524187671E-1, 3.321408281862767544702647E+0, 1.499576298686255465867237E+1, 7.892301301158651813848139E+1, 4.744515388682643231611949E+2, 3.207490090890661934704328E+3 }; __constant const double bessel_jnx_mu[] = { 1.0, -1.458333333333333333333333E-1, -9.874131944444444444444444E-2, -1.433120539158950617283951E-1, -3.172272026784135480967078E-1, -9.424291479571202491373028E-1, -3.511203040826354261542798E+0, -1.572726362036804512982712E+1, -8.228143909718594444224656E+1, 
-4.923553705236705240352022E+2, -3.316218568547972508762102E+3 }; __constant const double bessel_jnx_P1[] = { -2.083333333333333333333333E-1, 1.250000000000000000000000E-1 }; __constant const double bessel_jnx_P2[] = { 3.342013888888888888888889E-1, -4.010416666666666666666667E-1, 7.031250000000000000000000E-2 }; __constant const double bessel_jnx_P3[] = { -1.025812596450617283950617E+0, 1.846462673611111111111111E+0, -8.912109375000000000000000E-1, 7.324218750000000000000000E-2 }; __constant const double bessel_jnx_P4[] = { 4.669584423426247427983539E+0, -1.120700261622299382716049E+1, 8.789123535156250000000000E+0, -2.364086914062500000000000E+0, 1.121520996093750000000000E-1 }; __constant const double bessel_jnx_P5[] = { -2.8212072558200244877E1, 8.4636217674600734632E1, -9.1818241543240017361E1, 4.2534998745388454861E1, -7.3687943594796316964E0, 2.27108001708984375E-1 }; __constant const double bessel_jnx_P6[] = { 2.1257013003921712286E2, -7.6525246814118164230E2, 1.0599904525279998779E3, -6.9957962737613254123E2, 2.1819051174421159048E2, -2.6491430486951555525E1, 5.7250142097473144531E-1 }; __constant const double bessel_jnx_P7[] = { -1.9194576623184069963E3, 8.0617221817373093845E3, -1.3586550006434137439E4, 1.1655393336864533248E4, -5.3056469786134031084E3, 1.2009029132163524628E3, -1.0809091978839465550E2, 1.7277275025844573975E0 }; double bessel_jnx(double n, double x) { double zeta, sqz, zz, zp, np; double cbn, n23, t, z, sz; double pp, qq, z32i, zzi; double ak, bk, akl, bkl; int sign, doa, dob, nflg, k, s, tk, tkp1, m; double u[8]; double ai, aip, bi, bip; /* Test for x very close to n. * Use expansion for transition region if so. */ cbn = cbrt(n); z = (x - n)/cbn; if( fabs(z) <= 0.7 ) return( bessel_jnt(n,x) ); z = x/n; zz = 1.0 - z*z; if( zz == 0.0 ) return(0.0); if( zz > 0.0 ) { sz = sqrt( zz ); t = 1.5 * (log( (1.0+sz)/z ) - sz ); /* zeta ** 3/2 */ zeta = cbrt( t * t ); nflg = 1; } else { sz = sqrt(-zz); t = 1.5 * (sz - acos(1.0/z)); zeta = -cbrt( t * t ); nflg = -1; } z32i = fabs(1.0/t); sqz = cbrt(t); /* Airy function */ n23 = cbrt( n * n ); t = n23 * zeta; #if DEBUG printf("zeta %.5E, Airy(%.5E)\n", zeta, t ); #endif airy( t, &ai, &aip, &bi, &bip ); /* polynomials in expansion */ u[0] = 1.0; zzi = 1.0/zz; u[1] = cephes_polevl( zzi, bessel_jnx_P1, 1 )/sz; u[2] = cephes_polevl( zzi, bessel_jnx_P2, 2 )/zz; u[3] = cephes_polevl( zzi, bessel_jnx_P3, 3 )/(sz*zz); pp = zz*zz; u[4] = cephes_polevl( zzi, bessel_jnx_P4, 4 )/pp; u[5] = cephes_polevl( zzi, bessel_jnx_P5, 5 )/(pp*sz); pp *= zz; u[6] = cephes_polevl( zzi, bessel_jnx_P6, 6 )/pp; u[7] = cephes_polevl( zzi, bessel_jnx_P7, 7 )/(pp*sz); #if DEBUG for( k=0; k<=7; k++ ) printf( "u[%d] = %.5E\n", k, u[k] ); #endif pp = 0.0; qq = 0.0; np = 1.0; /* flags to stop when terms get larger */ doa = 1; dob = 1; akl = DBL_MAX; bkl = DBL_MAX; for( k=0; k<=3; k++ ) { tk = 2 * k; tkp1 = tk + 1; zp = 1.0; ak = 0.0; bk = 0.0; for( s=0; s<=tk; s++ ) { if( doa ) { if( (s & 3) > 1 ) sign = nflg; else sign = 1; ak += sign * bessel_jnx_mu[s] * zp * u[tk-s]; } if( dob ) { m = tkp1 - s; if( ((m+1) & 3) > 1 ) sign = nflg; else sign = 1; bk += sign * bessel_jnx_lambda[s] * zp * u[m]; } zp *= z32i; } if( doa ) { ak *= np; t = fabs(ak); if( t < akl ) { akl = t; pp += ak; } else doa = 0; } if( dob ) { bk += bessel_jnx_lambda[tkp1] * zp * u[0]; bk *= -np/sqz; t = fabs(bk); if( t < bkl ) { bkl = t; qq += bk; } else dob = 0; } #if DEBUG printf("a[%d] %.5E, b[%d] %.5E\n", k, ak, k, bk ); #endif if( np < DBL_EPSILON ) break; np /= n*n; } /* normalizing 
factor ( 4*zeta/(1 - z**2) )**1/4 */ t = 4.0 * zeta/zz; t = sqrt( sqrt(t) ); t *= ai*pp/cbrt(n) + aip*qq/(n23*n); return(t); } // }}} // {{{ bessel_hankel /* Hankel's asymptotic expansion * for large x. * AMS55 #9.2.5. */ double bessel_hankel( double n, double x ) { double t, u, z, k, sign, conv; double p, q, j, m, pp, qq; int flag; m = 4.0*n*n; j = 1.0; z = 8.0 * x; k = 1.0; p = 1.0; u = (m - 1.0)/z; q = u; sign = 1.0; conv = 1.0; flag = 0; t = 1.0; pp = 1.0e38; qq = 1.0e38; while( t > DBL_EPSILON ) { k += 2.0; j += 1.0; sign = -sign; u *= (m - k * k)/(j * z); p += sign * u; k += 2.0; j += 1.0; u *= (m - k * k)/(j * z); q += sign * u; t = fabs(u/p); if( t < conv ) { conv = t; qq = q; pp = p; flag = 1; } /* stop if the terms start getting larger */ if( (flag != 0) && (t > conv) ) { #if DEBUG printf( "Hankel: convergence to %.4E\n", conv ); #endif goto hank1; } } hank1: u = x - (0.5*n + 0.25) * M_PI; t = sqrt( 2.0/(M_PI*x) ) * ( pp * cos(u) - qq * sin(u) ); #if DEBUG printf( "hank: %.6e\n", t ); #endif return( t ); } // }}} // {{{ bessel_jv // SciPy says jn has no advantage over jv, so alias the two. #define bessel_jn bessel_jv double bessel_jv(double n, double x) { double k, q, t, y, an; int i, sign, nint; nint = 0; /* Flag for integer n */ sign = 1; /* Flag for sign inversion */ an = fabs( n ); y = floor( an ); if( y == an ) { nint = 1; i = an - 16384.0 * floor( an/16384.0 ); if( n < 0.0 ) { if( i & 1 ) sign = -sign; n = an; } if( x < 0.0 ) { if( i & 1 ) sign = -sign; x = -x; } if( n == 0.0 ) return( bessel_j0(x) ); if( n == 1.0 ) return( sign * bessel_j1(x) ); } if( (x < 0.0) && (y != an) ) { // mtherr( "Jv", DOMAIN ); // y = 0.0; y = nan((uint)22); goto done; } y = fabs(x); if( y < DBL_EPSILON ) goto underf; k = 3.6 * sqrt(y); t = 3.6 * sqrt(an); if( (y < t) && (an > 21.0) ) return( sign * bessel_jvs(n,x) ); if( (an < k) && (y > 21.0) ) return( sign * bessel_hankel(n,x) ); if( an < 500.0 ) { /* Note: if x is too large, the continued * fraction will fail; but then the * Hankel expansion can be used. */ if( nint != 0 ) { k = 0.0; q = bessel_recur( &n, x, &k, 1 ); if( k == 0.0 ) { y = bessel_j0(x)/q; goto done; } if( k == 1.0 ) { y = bessel_j1(x)/q; goto done; } } if( an > 2.0 * y ) goto rlarger; if( (n >= 0.0) && (n < 20.0) && (y > 6.0) && (y < 20.0) ) { /* Recur backwards from a larger value of n */ rlarger: k = n; y = y + an + 1.0; if( y < 30.0 ) y = 30.0; y = n + floor(y-n); q = bessel_recur( &y, x, &k, 0 ); y = bessel_jvs(y,x) * q; goto done; } if( k <= 30.0 ) { k = 2.0; } else if( k < 90.0 ) { k = (3*k)/4; } if( an > (k + 3.0) ) { if( n < 0.0 ) k = -k; q = n - floor(n); k = floor(k) + q; if( n > 0.0 ) q = bessel_recur( &n, x, &k, 1 ); else { t = k; k = n; q = bessel_recur( &t, x, &k, 1 ); k = t; } if( q == 0.0 ) { underf: y = 0.0; goto done; } } else { k = n; q = 1.0; } /* boundary between convergence of * power series and Hankel expansion */ y = fabs(k); if( y < 26.0 ) t = (0.0083*y + 0.09)*y + 12.9; else t = 0.9 * y; if( x > t ) y = bessel_hankel(k,x); else y = bessel_jvs(k,x); #if DEBUG printf( "y = %.16e, recur q = %.16e\n", y, q ); #endif if( n > 0.0 ) y /= q; else y *= q; } else { /* For large n, use the uniform expansion * or the transitional expansion. * But if x is of the order of n**2, * these may blow up, whereas the * Hankel expansion will then work. 
*/ if( n < 0.0 ) { //mtherr( "Jv", TLOSS ); //y = 0.0; y = nan((uint)23); goto done; } t = x/n; t /= n; if( t > 0.3 ) y = bessel_hankel(n,x); else y = bessel_jnx(n,x); } done: return( sign * y); } // }}} // vim: fdm=marker pyopencl-2013.2/pyopencl/cl/pyopencl-eval-tbl.cl0000644000175000000500000000507012245716340020254 0ustar tomussrc// Pieced together from Boost C++ and Cephes by // Andreas Kloeckner (C) 2012 // // Pieces from: // // Copyright (c) 2006 Xiaogang Zhang, John Maddock // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See // http://www.boost.org/LICENSE_1_0.txt) // // Cephes Math Library Release 2.8: June, 2000 // Copyright 1984, 1987, 1989, 1992, 2000 by Stephen L. Moshier // What you see here may be used freely, but it comes with no support or // guarantee. #pragma once typedef double special_func_scalar_type; // {{{ cephes_polevl /* * DESCRIPTION: * * Evaluates polynomial of degree N: * * 2 N * y = C + C x + C x +...+ C x * 0 1 2 N * * Coefficients are stored in reverse order: * * coef[0] = C , ..., coef[N] = C . * N 0 * * The function p1evl() assumes that coef[N] = 1.0 and is * omitted from the array. Its calling arguments are * otherwise the same as polevl(). * */ special_func_scalar_type cephes_polevl(special_func_scalar_type x, __constant const special_func_scalar_type *coef, int N) { special_func_scalar_type ans; int i; __constant const special_func_scalar_type *p; p = coef; ans = *p++; i = N; do ans = ans * x + *p++; while( --i ); return( ans ); } // }}} // {{{ cephes_p1evl special_func_scalar_type cephes_p1evl( special_func_scalar_type x, __constant const special_func_scalar_type *coef, int N ) { special_func_scalar_type ans; __constant const special_func_scalar_type *p; int i; p = coef; ans = x + *p++; i = N-1; do ans = ans * x + *p++; while( --i ); return( ans ); } // }}} // {{{ boost_evaluate_rational special_func_scalar_type boost_evaluate_rational_backend(__constant const special_func_scalar_type* num, __constant const special_func_scalar_type* denom, special_func_scalar_type z, int count) { special_func_scalar_type s1, s2; if(z <= 1) { s1 = num[count-1]; s2 = denom[count-1]; for(int i = (int)count - 2; i >= 0; --i) { s1 *= z; s2 *= z; s1 += num[i]; s2 += denom[i]; } } else { z = 1 / z; s1 = num[0]; s2 = denom[0]; for(unsigned i = 1; i < count; ++i) { s1 *= z; s2 *= z; s1 += num[i]; s2 += denom[i]; } } return s1 / s2; } #define boost_evaluate_rational(num, denom, z) \ boost_evaluate_rational_backend(num, denom, z, sizeof(num)/sizeof(special_func_scalar_type)) // }}} // vim: fdm=marker pyopencl-2013.2/pyopencl/cl/pyopencl-ranluxcl.cl0000644000175000000500000011303012245716340020372 0ustar tomussrc#ifndef RANLUXCL_CL #define RANLUXCL_CL /**** RANLUXCL v1.3.1 MODIFIED ************************************************* Implements the RANLUX generator of Matrin Luscher, based on the Fortran 77 implementation by Fred James. This OpenCL code is a complete implementation which should perfectly replicate the numbers generated by the original Fortran 77 implementation (if using the legacy initialization routine). ***** QUICK USAGE DESCRIPTION ************************************************** 1. Create an OpenCL buffer with room for at least 28 32-bit variables (112 byte) per work-item. I.e., in C/C++: size_t buffSize = numWorkitems * 112; 2. Pass the buffer and an unsigned integer seed to a kernel that launches the ranluxcl_initialization function. 
The seed can be any unsigned 32-bit integer, and must be different on different OpenCL devices/NDRanges to ensure different sequences. As long as the number of work-items on each device/NDRange is less than 2^32 = 4294967296 all sequences will be different. An examle initialization kernel would be: #include "ranluxcl.cl" kernel void Kernel_Ranluxcl_Init(private uint ins, global ranluxcl_state_t *ranluxcltab) { ranluxcl_initialization(ins, ranluxcltab); } 3. Now the generator is ready for use. Remember to download the seeds first, and upload them again when done. Example kernel that downloads seeds, generates a float4 where each component is uniformly distributed between 0 and 1, end points not included, then uploads the seeds again: #include "ranluxcl.cl" kernel void Kernel_Example(global ranluxcl_state_t *ranluxcltab) { //ranluxclstate stores the state of the generator. ranluxcl_state_t ranluxclstate; //Download state into ranluxclstate struct. ranluxcl_download_seed(&ranluxclstate, ranluxcltab); //Generate a float4 with each component on (0,1), //end points not included. We can call ranluxcl as many //times as we like until we upload the state again. float4 randomnr = ranluxcl32(&ranluxclstate); //Upload state again so that we don't get the same //numbers over again the next time we use ranluxcl. ranluxcl_upload_seed(&ranluxclstate, ranluxcltab); } ***** MACROS ******************************************************************* The following macros can optionally be defined: RANLUXCL_LUX: Sets the luxury level of the generator. Should be 0-4, or if it is 24 or larger it sets the p-value of the generator (generally not needed). If this macro is not set then lux=4 is the default (highest quality). For many applications the high quality of lux=4 may not be needed. Indeed if two values (each value having 24 random bits) are glued together to form a 48-bit value the generator passes all tests in the TestU01 suite already with lux=2. See "TestU01: A C Library for Empirical Testing of Random Number Generators" by PIERRE LAeECUYER and RICHARD SIMARD. SWB(224, 10, 24)[24, l] is RANLUX with two values glued together to create 48-bit numbers, and we see that it passes all tests already at luxury value 2. RANLUXCL_NO_WARMUP: Turns off the warmup functionality in ranluxcl_initialization. This macro should generally not be used, since the generators will initially be correlated if it is defined. The only advantage is that the numbers generated will exactly correspond to those of the original Fortran 77 implementation. RANLUXCL_SUPPORT_DOUBLE: Enables double precision functions. Please enable the OpenCL double precision extension yourself, usually by "#pragma OPENCL EXTENSION cl_khr_fp64 : enable". RANLUXCL_USE_LEGACY_INITIALIZATION Uses exactly the same initialization routine as in the original Fortran 77 code, leading to the same sequences. If using legacy initialization there are some restrictions on what the seed can be, and it may also be necessary to define RANLUXCL_MAXWORKITEMS if several sequences are to be run in parallel. RANLUXCL_MAXWORKITEMS: When RANLUXCL_USE_LEGACY_INITIALIZATION is defined we may need this macro. If several OpenCL NDRanges will be running in parallel and the parallel sequences should be different then this macro should have a value equal or larger than the largest number of work-items in any of the parallel runs. The default is to use the current global size, so if all NDRanges are of the same size this need not be defined. 
Each parallel instance must also have different seeds . For example if we are launching 5120 work-items on GPU1 and 10240 work-items on GPU2 we would use different seeds for the two generators, and RANLUXCL_MAXWORKITEMS must be defined to be at least 10240. If GPU1 and GPU2 had the same number of work-items this would not be necessary. An underestimate of the highest permissible seed is given by the smallest of: ( = 10^9 / ) or ( = 10^9 / RANLUXCL_MAXWORKITEMS). Please make sure that is never higher than this since it could cause undetected problems. For example with 10240 work-items the highest permissible is about 100 000. Again note that this is only relevant when using the legacy initialization function enabled by RANLUXCL_USE_LEGACY_INITIALIZATION. When not using the legacy initialization this macro is effectively set to a very high value of 2^32-1. ***** FUNCTIONS: INITIALIZATION ************************************************ The initialization function is defined as: void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab) Run once at the very beginning. ranluxcltab should be a buffer with space for 112 byte per work-item in the NDRange. is the seed to the generator. For a given each work-item in the NDRange will generate a different sequence. If more than one NDRange is used in parallel then must be different for each NDRange to avoid identical sequences. ***** FUNCTIONS: SEED UPLOAD/DOWNLOAD ****************************************** The following two functions should be launced at the beginning and end of a kernel that uses ranluxcl to generate numbers, respectively: void ranluxcl_download_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab) Run at the beginning of a kernel to download ranluxcl state data void ranluxcl_upload_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab) Run at the end of a kernel to upload state data ***** FUNCTIONS: GENERATION AND SYNCHRONIZATION ******************************** float4 ranluxcl32(ranluxcl_state_t *rst) Run to generate a pseudo-random float4 where each component is a number between 0 and 1, end points not included (meaning the number will never be exactly 0 or 1). double4 ranluxcl64(ranluxcl_state_t *rst) Double precision version of the above function. The preprocessor macro RANLUXCL_SUPPORT_DOUBLE must be defined for this function to be available. This function "glues" together two single-precision numbers to make one double precision number. Most of the work is still done in single precision, so the performance will be roughly halved regardless of the double precision performance of the hardware. float4 ranluxcl32norm(ranluxcl_state_t *rst) Run to generate a pseudo-random float4 where each component is normally distributed with mean 0 and standard deviation 1. double4 ranluxcl64norm(ranluxcl_state_t *rst) Double precision version of the above function. The preprocessor macro RANLUXCL_SUPPORT_DOUBLE must be defined for this function to be available. void ranluxcl_synchronize(ranluxcl_state_t *rst) Run to synchronize execution in case different work-items have made a different number of calls to ranluxcl. On SIMD machines this could lead to inefficient execution. ranluxcl_synchronize allows us to make sure all generators are SIMD-friendly again. Not needed if all work-items always call ranluxcl the same number of times. 
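As a further illustrative sketch (the kernel name, the output buffer and the
per-item condition below are examples only, not part of ranluxcl itself), the
functions above can be combined in a kernel that generates normally
distributed values in only some work-items and then re-aligns the generators:

    #include "ranluxcl.cl"

    kernel void Kernel_Example_Norm(global ranluxcl_state_t *ranluxcltab,
                                    global float4 *out,
                                    global const int *want_number)
    {
        ranluxcl_state_t ranluxclstate;

        //Download state into ranluxclstate struct.
        ranluxcl_download_seed(&ranluxclstate, ranluxcltab);

        //Each component of the float4 is normally distributed
        //with mean 0 and standard deviation 1.
        if(want_number[get_global_id(0)])
            out[get_global_id(0)] = ranluxcl32norm(&ranluxclstate);

        //Work-items have now made a different number of calls to
        //ranluxcl, so re-align the generators before any further use.
        ranluxcl_synchronize(&ranluxclstate);

        //Upload state again so later kernels continue the sequences.
        ranluxcl_upload_seed(&ranluxclstate, ranluxcltab);
    }
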
***** PERFORMANCE ************************************************************** For luxury setting 4, performance on AMD Cypress should be ~4.5*10^9 pseudo- random values per second, when not downloading values to host memory (i.e. the values are just generated, but not used for anything in particular). ***** DESCRIPTION OF THE IMPLEMENTATION **************************************** This code closely follows the original Fortran 77 code (see credit section). Here the differences (and similarities) between RANLUXCL (this implementation) and the original RANLUX are discussed. The Fortran 77 implementation uses a simple LCG to initialize the generator, and so the same approach is taken here. If RANLUXCL is initialized with = 0 as seed, the first work-item behaves like the original RANLUX with seed equal 1, the second work-item as if with seed equal 2 and so on. If = 1 then the first work-item behaves like the original RANLUX with seed equal to + 1, and so on for higher so that we never have overlapping sequences. This is why the RANLUXCL_MAXWORKITEMS macro must be set if we have different NDRanges with a different number of work-items. RANLUX is based on chaos theory, and what we are actually doing when selecting a luxury value is setting how many values to skip over (causing decorrelation). The number of values to skip is controlled by the so-called p-value of the generator. After generating 24 values we skip p - 24 values until again generating 24 values. This implementation is somewhat modified from the original fortran implementation by F. James. Because of the way the OpenCL code is optimized with 4-component 32-bit float vectors, it is most convenient to always throw away some multiple of 24 values (i.e. p is always a multiple of 24). However, there might be some resonances if we always throw away a multiple of the seeds table size. Therefore the implementation is slightly more intricate where p can be a multiple of 4 instead, at a cost to performance (only about 10% lower than the cleaner 24 values approach on AMD Cypress). These two approaches are termed planar and planar shift respectively. The idea for the planar approach comes from the following paper: Vadim Demchik, Pseudo-random number generators for Monte Carlo simulations on Graphics Processing Units, arXiv:1003.1898v1 [hep-lat] Below the p-values for the original reference implementation are listed along with those of the planar shift implementation. Suggested values for the planar approach are also presented. When this function is called with RANLUXCL_LUX set to 0-4, the planar shift values are used. To use the pure planar approach (for some extra performance with likely undetectable quality decrease), set lux equal to the specific p-value. Luxury setting (RANLUXCL_LUX): 0 1 2 3 4 Original fortran77 implementation by F. James: 24 48 97 223 389 Planar (suggested): 24 48 120 240 408 Planar shift: 24 48 100 224 404 Note that levels 0 and 1 are the same as in the original implementation for both planar and planar shift. Level 4 of planar shift where p=404 is the same as chosen for luxury level 1 by Martin Luescher for his v3 version of RANLUX. Therefore if it is considered important to only use "official" values, luxury settings 0, 1 or 4 of planar shift should be used. It is however unlikely that the other values are bad, they just haven't been as extensively used and tested by others. 
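As an illustrative sketch (240 is simply the suggested planar p-value for
luxury level 3 in the table above), the pure planar approach is selected by
defining the macro before this file is included:

    #define RANLUXCL_LUX 240  //pure planar approach, p = 240
    #include "ranluxcl.cl"

Leaving RANLUXCL_LUX undefined keeps the default planar shift setting of
lux = 4 (p = 404).
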
Variable names are generally the same as in the fortran77 implementation, however because of the way the generator is implemented, the i24 and j24 variables are no longer needed. ***** CREDIT ******************************************************************* I have been told by Fred James (the coder) that the original Fortran 77 implementation (which is the subject of the second paper below) is free to use and share. Therefore I am using the MIT license (below). But most importantly please always remember to give credit to the two articles by Martin Luscher and Fred James, describing the generator and the Fortran 77 implementation on which this implementation is based, respectively: Martin Luescher, A portable high-quality random number generator for lattice field theory simulations, Computer Physics Communications 79 (1994) 100-110 F. James, RANLUX: A Fortran implementation of the high-quality pseudorandom number generator of Luescher, Computer Physics Communications 79 (1994) 111-114 ***** LICENSE ****************************************************************** Copyright (c) 2011 Ivar Ursin Nikolaisen Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. *******************************************************************************/ typedef struct{ float s01, s02, s03, s04, s05, s06, s07, s08, s09, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21, s22, s23, s24; float carry; float dummy; //Causes struct to be a multiple of 128 bits int in24; int stepnr; } ranluxcl_state_t; //Initial prototypes makes Apple's compiler happy void ranluxcl_download_seed(ranluxcl_state_t *, global ranluxcl_state_t *); void ranluxcl_upload_seed(ranluxcl_state_t *, global ranluxcl_state_t *); float ranluxcl_os(float, float, float *, float *); float4 ranluxcl32(ranluxcl_state_t *); void ranluxcl_synchronize(ranluxcl_state_t *); void ranluxcl_initialization(uint, global ranluxcl_state_t *); float4 ranluxcl32norm(ranluxcl_state_t *); #ifdef RANLUXCL_SUPPORT_DOUBLE double4 ranluxcl64(ranluxcl_state_t *); double4 ranluxcl64norm(ranluxcl_state_t *); #endif #define RANLUXCL_TWOM24 0.000000059604644775f #define RANLUXCL_TWOM12 0.000244140625f #ifdef RANLUXCL_LUX #if RANLUXCL_LUX < 0 #error ranluxcl: lux must be zero or positive. 
#endif #else #define RANLUXCL_LUX 4 //Default to high quality #endif //RANLUXCL_LUX //Here the luxury values are defined #if RANLUXCL_LUX == 0 #define RANLUXCL_NSKIP 0 #elif RANLUXCL_LUX == 1 #define RANLUXCL_NSKIP 24 #elif RANLUXCL_LUX == 2 #define RANLUXCL_NSKIP 76 #elif RANLUXCL_LUX == 3 #define RANLUXCL_NSKIP 200 #elif RANLUXCL_LUX == 4 #define RANLUXCL_NSKIP 380 #else #define RANLUXCL_NSKIP (RANLUXCL_LUX - 24) #endif //RANLUXCL_LUX == 0 //Check that nskip is a permissible value #if RANLUXCL_NSKIP % 4 != 0 #error nskip must be divisible by 4! #endif #if RANLUXCL_NSKIP < 24 && RANLUXCL_NSKIP != 0 #error nskip must be either 0 or >= 24! #endif #if RANLUXCL_NSKIP < 0 #error nskip is negative! #endif //Check if planar scheme is recovered #if RANLUXCL_NSKIP % 24 == 0 #define RANLUXCL_PLANAR #endif //Check if we will skip at all #if RANLUXCL_NSKIP == 0 #define RANLUXCL_NOSKIP #endif //Single-value global size and id #define RANLUXCL_NUMWORKITEMS \ (get_global_size(0) * get_global_size(1) * get_global_size(2)) #define RANLUXCL_MYID \ (get_global_id(0) + get_global_id(1) * get_global_size(0) + \ get_global_id(2) * get_global_size(0) * get_global_size(1)) void ranluxcl_download_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab) { (*rst) = ranluxcltab[RANLUXCL_MYID]; } void ranluxcl_upload_seed(ranluxcl_state_t *rst, global ranluxcl_state_t *ranluxcltab) { ranluxcltab[RANLUXCL_MYID] = (*rst); } /* * Performs one "step" (generates a single value or skip). Only used internally, * not intended to be called from user code. */ float ranluxcl_os(float sj24m1, float sj24, float *si24, float *carry) { float uni, out; uni = sj24 - (*si24) - (*carry); if(uni < 0.0f){ uni += 1.0f; (*carry) = RANLUXCL_TWOM24; } else (*carry) = 0.0f; out = ((*si24) = uni); if(uni < RANLUXCL_TWOM12){ out += RANLUXCL_TWOM24 * sj24m1; if(out == 0.0f) out = RANLUXCL_TWOM24 * RANLUXCL_TWOM24; } return out; } /* * Return a float4 where each component is a uniformly distributed pseudo- * random value between 0 and 1, end points not included. 
*/ float4 ranluxcl32(ranluxcl_state_t *rst) { float4 out; if(rst->stepnr == 0){ out.x = ranluxcl_os(rst->s09, rst->s10, &(rst->s24), &(rst->carry)); out.y = ranluxcl_os(rst->s08, rst->s09, &(rst->s23), &(rst->carry)); out.z = ranluxcl_os(rst->s07, rst->s08, &(rst->s22), &(rst->carry)); out.w = ranluxcl_os(rst->s06, rst->s07, &(rst->s21), &(rst->carry)); rst->stepnr += 4; } else if(rst->stepnr == 4){ out.x = ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry)); out.y = ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry)); out.z = ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry)); out.w = ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry)); rst->stepnr += 4; } else if(rst->stepnr == 8){ out.x = ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry)); out.y = ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry)); out.z = ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry)); out.w = ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry)); rst->stepnr += 4; } else if(rst->stepnr == 12){ out.x = ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry)); out.y = ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry)); out.z = ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry)); out.w = ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry)); rst->stepnr += 4; } else if(rst->stepnr == 16){ out.x = ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry)); out.y = ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry)); out.z = ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry)); out.w = ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry)); rst->stepnr += 4; } else if(rst->stepnr == 20){ out.x = ranluxcl_os(rst->s13, rst->s14, &(rst->s04), &(rst->carry)); out.y = ranluxcl_os(rst->s12, rst->s13, &(rst->s03), &(rst->carry)); out.z = ranluxcl_os(rst->s11, rst->s12, &(rst->s02), &(rst->carry)); out.w = ranluxcl_os(rst->s10, rst->s11, &(rst->s01), &(rst->carry)); rst->stepnr = 0; // The below preprocessor directives are here to recover the simpler planar // scheme when nskip is a multiple of 24. For the most general planar shift // approach, just ignore all #if's below. #ifndef RANLUXCL_PLANAR } (*&(rst->in24)) += 4; if((*&(rst->in24)) == 24){ (*&(rst->in24)) = 0; #endif //RANLUXCL_PLANAR int initialskips = (rst->stepnr) ? (24 - rst->stepnr) : 0; int bulkskips = ((RANLUXCL_NSKIP - initialskips)/24) * 24; int remainingskips = RANLUXCL_NSKIP - initialskips - bulkskips; //We know there won't be any initial skips in the planar scheme #ifndef RANLUXCL_PLANAR //Do initial skips (lack of breaks in switch is intentional). 
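	/* Added annotation (not in the original source), as a worked example of
	 * the skip bookkeeping above: with RANLUXCL_NSKIP = 100 and stepnr = 8
	 * at this point, initialskips = 24 - 8 = 16,
	 * bulkskips = ((100 - 16)/24)*24 = 72 and
	 * remainingskips = 100 - 16 - 72 = 12, so 16 + 72 + 12 = 100 values are
	 * skipped in total, as required. */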
	switch(initialskips){
	case(20):
		ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry));
		ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry));
		ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry));
		ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry));
	case(16):
		ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry));
		ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry));
		ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry));
		ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry));
	case(12):
		ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry));
		ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry));
		ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry));
		ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry));
	case(8):
		ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry));
		ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry));
		ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry));
		ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry));
	case(4):
		ranluxcl_os(rst->s13, rst->s14, &(rst->s04), &(rst->carry));
		ranluxcl_os(rst->s12, rst->s13, &(rst->s03), &(rst->carry));
		ranluxcl_os(rst->s11, rst->s12, &(rst->s02), &(rst->carry));
		ranluxcl_os(rst->s10, rst->s11, &(rst->s01), &(rst->carry));
	}
#endif //RANLUXCL_PLANAR

	//Also check if we will ever need to skip at all
#ifndef RANLUXCL_NOSKIP
	//Do the bulk skips, one full cycle of 24 values per iteration.
	for(int i=0; i<bulkskips/24; i++){
		ranluxcl_os(rst->s09, rst->s10, &(rst->s24), &(rst->carry));
		ranluxcl_os(rst->s08, rst->s09, &(rst->s23), &(rst->carry));
		ranluxcl_os(rst->s07, rst->s08, &(rst->s22), &(rst->carry));
		ranluxcl_os(rst->s06, rst->s07, &(rst->s21), &(rst->carry));
		ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry));
		ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry));
		ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry));
		ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry));
		ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry));
		ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry));
		ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry));
		ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry));
		ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry));
		ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry));
		ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry));
		ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry));
		ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry));
		ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry));
		ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry));
		ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry));
		ranluxcl_os(rst->s13, rst->s14, &(rst->s04), &(rst->carry));
		ranluxcl_os(rst->s12, rst->s13, &(rst->s03), &(rst->carry));
		ranluxcl_os(rst->s11, rst->s12, &(rst->s02), &(rst->carry));
		ranluxcl_os(rst->s10, rst->s11, &(rst->s01), &(rst->carry));
	}
#endif //RANLUXCL_NOSKIP

	//There also won't be any remaining skips in the planar scheme
#ifndef RANLUXCL_PLANAR
	//Do remaining skips
	if(remainingskips){
		ranluxcl_os(rst->s09, rst->s10, &(rst->s24), &(rst->carry));
		ranluxcl_os(rst->s08, rst->s09, &(rst->s23), &(rst->carry));
		ranluxcl_os(rst->s07, rst->s08, &(rst->s22), &(rst->carry));
		ranluxcl_os(rst->s06, rst->s07, &(rst->s21), &(rst->carry));

		if(remainingskips > 4){
			ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry));
			ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry));
			ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry));
			ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry));
		}

		if(remainingskips > 8){
ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry)); ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry)); ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry)); ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry)); } if(remainingskips > 12){ ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry)); ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry)); ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry)); ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry)); } if(remainingskips > 16){ ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry)); ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry)); ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry)); ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry)); } } #endif //RANLUXCL_PLANAR // Initial skips brought stepnr down to 0. The bulk skips did only // full cycles. Therefore stepnr is now equal to remainingskips. rst->stepnr = remainingskips; } return out; } /* * Perform the necessary operations to set the generator to the "beginning", * i.e., ready to generate 24 numbers before the next skipping sequence. This * is useful if different work-items have called ranluxcl a different number * of times. Since that would lead to out of sync execution on different work- * items it could be rather inefficient on SIMD architectures (like current * GPUs). This function thus allows us to resynchronize execution across work- * items. */ void ranluxcl_synchronize(ranluxcl_state_t *rst) { // Do necessary number of calls to ranluxcl so that stepnr == 0 at the end. if(rst->stepnr == 4) ranluxcl32(rst); if(rst->stepnr == 8) ranluxcl32(rst); if(rst->stepnr == 12) ranluxcl32(rst); if(rst->stepnr == 16) ranluxcl32(rst); if(rst->stepnr == 20) ranluxcl32(rst); } /* * Uses a 64-bit xorshift PRNG by George Marsaglia to initialize the generator. * * This function can be used instead of ranluxcl_initialization if manual * control of the seed of each generator is desired. x must be unique for each * time this function is called, and *ranluxcltab should point to the specific * entry in the table to be initialized. Compare this to ranluxcl_initialization * where ins needs only be unique for each NDRange, and *ranluxcltab points * to the base address of the table for the entire NDRange. Also note that * depending on what you are doing the ranluxcl_upload_seed and * ranluxcl_download_seed functions may not do what you want, so make sure * you know what you are doing! */ void ranluxcl_init(ulong x, global ranluxcl_state_t *ranluxcltab) { ranluxcl_state_t rst; #define RANLUXCL_POW2_24 16777216 #define RANLUXCL_56 0x00FFFFFFFFFFFFFF #define RANLUXCL_48 0x0000FFFFFFFFFFFF #define RANLUXCL_40 0x000000FFFFFFFFFF #define RANLUXCL_32 0x00000000FFFFFFFF #define RANLUXCL_24 0x0000000000FFFFFF #define RANLUXCL_16 0x000000000000FFFF #define RANLUXCL_8 0x00000000000000FF ulong x1, x2, x3; //Logical shifts used so that all 64 bits of output are used (24 bits //per float), to be certain that all initial states are different. 
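	//Added annotation (not in the original source): each 64-bit xorshift
	//output below is chopped into 24-bit pieces, e.g. s01 takes bits 63..40
	//of x1, s02 takes bits 39..16, and s03 combines bits 15..0 of x1 with
	//bits 63..56 of x2. Three outputs (192 bits) thus fill eight 24-bit
	//seeds, each scaled by 2^-24 into [0, 1).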
x^=(x<<13);x^=(x>>7);x^=(x<<17);x1=x; x^=(x<<13);x^=(x>>7);x^=(x<<17);x2=x; x^=(x<<13);x^=(x>>7);x^=(x<<17);x3=x; rst.s01 = (float) (x1 >> 40) / (float)RANLUXCL_POW2_24; rst.s02 = (float) ((x1 & RANLUXCL_40) >> 16) / (float)RANLUXCL_POW2_24; rst.s03 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56)) / (float)RANLUXCL_POW2_24; rst.s04 = (float) ((x2 & RANLUXCL_56) >> 32) / (float)RANLUXCL_POW2_24; rst.s05 = (float) ((x2 & RANLUXCL_32) >> 8) / (float)RANLUXCL_POW2_24; rst.s06 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) / (float)RANLUXCL_POW2_24; rst.s07 = (float) ((x3 & RANLUXCL_48) >> 24) / (float)RANLUXCL_POW2_24; rst.s08 = (float) (x3 & RANLUXCL_24) / (float)RANLUXCL_POW2_24; x^=(x<<13);x^=(x>>7);x^=(x<<17);x1=x; x^=(x<<13);x^=(x>>7);x^=(x<<17);x2=x; x^=(x<<13);x^=(x>>7);x^=(x<<17);x3=x; rst.s09 = (float) (x1 >> 40) / (float)RANLUXCL_POW2_24; rst.s10 = (float) ((x1 & RANLUXCL_40) >> 16) / (float)RANLUXCL_POW2_24; rst.s11 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56)) / (float)RANLUXCL_POW2_24; rst.s12 = (float) ((x2 & RANLUXCL_56) >> 32) / (float)RANLUXCL_POW2_24; rst.s13 = (float) ((x2 & RANLUXCL_32) >> 8) / (float)RANLUXCL_POW2_24; rst.s14 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) / (float)RANLUXCL_POW2_24; rst.s15 = (float) ((x3 & RANLUXCL_48) >> 24) / (float)RANLUXCL_POW2_24; rst.s16 = (float) (x3 & RANLUXCL_24) / (float)RANLUXCL_POW2_24; x^=(x<<13);x^=(x>>7);x^=(x<<17);x1=x; x^=(x<<13);x^=(x>>7);x^=(x<<17);x2=x; x^=(x<<13);x^=(x>>7);x^=(x<<17);x3=x; rst.s17 = (float) (x1 >> 40) / (float)RANLUXCL_POW2_24; rst.s18 = (float) ((x1 & RANLUXCL_40) >> 16) / (float)RANLUXCL_POW2_24; rst.s19 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56)) / (float)RANLUXCL_POW2_24; rst.s20 = (float) ((x2 & RANLUXCL_56) >> 32) / (float)RANLUXCL_POW2_24; rst.s21 = (float) ((x2 & RANLUXCL_32) >> 8) / (float)RANLUXCL_POW2_24; rst.s22 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48)) / (float)RANLUXCL_POW2_24; rst.s23 = (float) ((x3 & RANLUXCL_48) >> 24) / (float)RANLUXCL_POW2_24; rst.s24 = (float) (x3 & RANLUXCL_24) / (float)RANLUXCL_POW2_24; #undef RANLUXCL_POW2_24 #undef RANLUXCL_56 #undef RANLUXCL_48 #undef RANLUXCL_40 #undef RANLUXCL_32 #undef RANLUXCL_24 #undef RANLUXCL_16 #undef RANLUXCL_8 rst.in24 = 0; rst.stepnr = 0; rst.carry = 0.0f; if(rst.s24 == 0.0f) rst.carry = RANLUXCL_TWOM24; #ifndef RANLUXCL_NO_WARMUP //Warming up the generator, ensuring there are no initial correlations. //16 is a "magic number". It is the number of times we must generate //a batch of 24 numbers to ensure complete decorrelation, however it //seems like it is necessary to double this for the special case when //the generator is initialized to all zeros. 
for(int i=0; i<16 * 2; i++){ ranluxcl_os(rst.s09, rst.s10, &(rst.s24), &(rst.carry)); ranluxcl_os(rst.s08, rst.s09, &(rst.s23), &(rst.carry)); ranluxcl_os(rst.s07, rst.s08, &(rst.s22), &(rst.carry)); ranluxcl_os(rst.s06, rst.s07, &(rst.s21), &(rst.carry)); ranluxcl_os(rst.s05, rst.s06, &(rst.s20), &(rst.carry)); ranluxcl_os(rst.s04, rst.s05, &(rst.s19), &(rst.carry)); ranluxcl_os(rst.s03, rst.s04, &(rst.s18), &(rst.carry)); ranluxcl_os(rst.s02, rst.s03, &(rst.s17), &(rst.carry)); ranluxcl_os(rst.s01, rst.s02, &(rst.s16), &(rst.carry)); ranluxcl_os(rst.s24, rst.s01, &(rst.s15), &(rst.carry)); ranluxcl_os(rst.s23, rst.s24, &(rst.s14), &(rst.carry)); ranluxcl_os(rst.s22, rst.s23, &(rst.s13), &(rst.carry)); ranluxcl_os(rst.s21, rst.s22, &(rst.s12), &(rst.carry)); ranluxcl_os(rst.s20, rst.s21, &(rst.s11), &(rst.carry)); ranluxcl_os(rst.s19, rst.s20, &(rst.s10), &(rst.carry)); ranluxcl_os(rst.s18, rst.s19, &(rst.s09), &(rst.carry)); ranluxcl_os(rst.s17, rst.s18, &(rst.s08), &(rst.carry)); ranluxcl_os(rst.s16, rst.s17, &(rst.s07), &(rst.carry)); ranluxcl_os(rst.s15, rst.s16, &(rst.s06), &(rst.carry)); ranluxcl_os(rst.s14, rst.s15, &(rst.s05), &(rst.carry)); ranluxcl_os(rst.s13, rst.s14, &(rst.s04), &(rst.carry)); ranluxcl_os(rst.s12, rst.s13, &(rst.s03), &(rst.carry)); ranluxcl_os(rst.s11, rst.s12, &(rst.s02), &(rst.carry)); ranluxcl_os(rst.s10, rst.s11, &(rst.s01), &(rst.carry)); } #endif //RANLUXCL_NO_WARMUP //Upload the state *ranluxcltab = rst; } void ranluxcl_init_legacy(uint ins, global ranluxcl_state_t *ranluxcltab) { //Using legacy initialization from original Fortan 77 implementation //ins is scaled so that if the user makes another call somewhere else //with ins + 1 there should be no overlap. Also adding one //allows us to use ins = 0. int k, maxWorkitems; ranluxcl_state_t rst; #ifdef RANLUXCL_MAXWORKITEMS maxWorkitems = RANLUXCL_MAXWORKITEMS; #else maxWorkitems = RANLUXCL_NUMWORKITEMS; #endif //RANLUXCL_MAXWORKITEMS int scaledins = ins * maxWorkitems + 1; int js = scaledins + RANLUXCL_MYID; //Make sure js is not too small (should really be an error) if(js < 1) js = 1; #define IC 2147483563 #define ITWO24 16777216 k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s01=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s02=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s03=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s04=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s05=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s06=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s07=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s08=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s09=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s10=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s11=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s12=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s13=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; 
rst.s14=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s15=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s16=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s17=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s18=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s19=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s20=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s21=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s22=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s23=(js%ITWO24)*RANLUXCL_TWOM24; k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC; rst.s24=(js%ITWO24)*RANLUXCL_TWOM24; #undef IC #undef ITWO24 rst.in24 = 0; rst.stepnr = 0; rst.carry = 0.0f; if(rst.s24 == 0.0f) rst.carry = RANLUXCL_TWOM24; #ifndef RANLUXCL_NO_WARMUP //Warming up the generator, ensuring there are no initial correlations. //16 is a "magic number". It is the number of times we must generate //a batch of 24 numbers to ensure complete decorrelation. for(int i=0; i<16; i++){ ranluxcl_os(rst.s09, rst.s10, &(rst.s24), &(rst.carry)); ranluxcl_os(rst.s08, rst.s09, &(rst.s23), &(rst.carry)); ranluxcl_os(rst.s07, rst.s08, &(rst.s22), &(rst.carry)); ranluxcl_os(rst.s06, rst.s07, &(rst.s21), &(rst.carry)); ranluxcl_os(rst.s05, rst.s06, &(rst.s20), &(rst.carry)); ranluxcl_os(rst.s04, rst.s05, &(rst.s19), &(rst.carry)); ranluxcl_os(rst.s03, rst.s04, &(rst.s18), &(rst.carry)); ranluxcl_os(rst.s02, rst.s03, &(rst.s17), &(rst.carry)); ranluxcl_os(rst.s01, rst.s02, &(rst.s16), &(rst.carry)); ranluxcl_os(rst.s24, rst.s01, &(rst.s15), &(rst.carry)); ranluxcl_os(rst.s23, rst.s24, &(rst.s14), &(rst.carry)); ranluxcl_os(rst.s22, rst.s23, &(rst.s13), &(rst.carry)); ranluxcl_os(rst.s21, rst.s22, &(rst.s12), &(rst.carry)); ranluxcl_os(rst.s20, rst.s21, &(rst.s11), &(rst.carry)); ranluxcl_os(rst.s19, rst.s20, &(rst.s10), &(rst.carry)); ranluxcl_os(rst.s18, rst.s19, &(rst.s09), &(rst.carry)); ranluxcl_os(rst.s17, rst.s18, &(rst.s08), &(rst.carry)); ranluxcl_os(rst.s16, rst.s17, &(rst.s07), &(rst.carry)); ranluxcl_os(rst.s15, rst.s16, &(rst.s06), &(rst.carry)); ranluxcl_os(rst.s14, rst.s15, &(rst.s05), &(rst.carry)); ranluxcl_os(rst.s13, rst.s14, &(rst.s04), &(rst.carry)); ranluxcl_os(rst.s12, rst.s13, &(rst.s03), &(rst.carry)); ranluxcl_os(rst.s11, rst.s12, &(rst.s02), &(rst.carry)); ranluxcl_os(rst.s10, rst.s11, &(rst.s01), &(rst.carry)); } #endif //RANLUXCL_NO_WARMUP //Upload the state ranluxcl_upload_seed(&rst, ranluxcltab); } void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab) { #ifdef RANLUXCL_USE_LEGACY_INITIALIZATION ranluxcl_init_legacy(ins, ranluxcltab); #else // Not RANLUXCL_USE_LEGACY_INITIALIZATION // We scale ins by 2^32. As long as we never use more than (2^32)-1 // work-items per NDRange the initial states should never be the same. ulong x = (ulong)RANLUXCL_MYID + (ulong)ins * ((ulong)UINT_MAX + 1); ranluxcl_init(x, ranluxcltab + RANLUXCL_MYID); #endif // RANLUXCL_USE_LEGACY_INITIALIZATION } float4 ranluxcl32norm(ranluxcl_state_t *rst) { //Returns a vector where each component is a normally //distributed PRN centered on 0, with standard deviation 1. 
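	//Added annotation (not in the original source): this is the standard
	//Box-Muller transform, Z1 = sqrt(-2 ln U1) cos(2 pi U2) and
	//Z2 = sqrt(-2 ln U1) sin(2 pi U2), applied to the pairs (U.x, U.y) and
	//(U.z, U.w) of uniform variates.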
//Roll our own since M_PI_F does not exist in OpenCL 1.0. #define RANLUXCL_PI_F 3.1415926535f float4 U = ranluxcl32(rst); float4 Z; float R, phi; R = sqrt(-2 * log(U.x)); phi = 2 * RANLUXCL_PI_F * U.y; Z.x = R * cos(phi); Z.y = R * sin(phi); R = sqrt(-2 * log(U.z)); phi = 2 * RANLUXCL_PI_F * U.w; Z.z = R * cos(phi); Z.w = R * sin(phi); return Z; #undef RANLUXCL_PI_F } #ifdef RANLUXCL_SUPPORT_DOUBLE double4 ranluxcl64(ranluxcl_state_t *rst) { double4 out; float4 randvec; //We know this value is caused by the never-zero part //of the original algorithm, but we want to allow zero for //the most significant bits in the double precision result. randvec = ranluxcl32(rst); if(randvec.x == RANLUXCL_TWOM24 * RANLUXCL_TWOM24) randvec.x = 0.0f; if(randvec.z == RANLUXCL_TWOM24 * RANLUXCL_TWOM24) randvec.z = 0.0f; out.x = (double)(randvec.x) + (double)(randvec.y) / 16777216; out.y = (double)(randvec.z) + (double)(randvec.w) / 16777216; randvec = ranluxcl32(rst); if(randvec.x == RANLUXCL_TWOM24 * RANLUXCL_TWOM24) randvec.x = 0.0f; if(randvec.z == RANLUXCL_TWOM24 * RANLUXCL_TWOM24) randvec.z = 0.0f; out.z = (double)(randvec.x) + (double)(randvec.y) / 16777216; out.w = (double)(randvec.z) + (double)(randvec.w) / 16777216; return out; } double4 ranluxcl64norm(ranluxcl_state_t *rst) { //Returns a vector where each component is a normally //distributed PRN centered on 0, with standard deviation //1. double4 U = ranluxcl64(rst); double4 Z; double R, phi; R = sqrt(-2 * log(U.x)); phi = 2 * M_PI * U.y; Z.x = R * cos(phi); Z.y = R * sin(phi); R = sqrt(-2 * log(U.z)); phi = 2 * M_PI * U.w; Z.z = R * cos(phi); Z.w = R * sin(phi); return Z; } #endif //RANLUXCL_SUPPORT_DOUBLE #undef RANLUXCL_TWOM24 #undef RANLUXCL_TWOM12 #undef RANLUXCL_NUMWORKITEMS #undef RANLUXCL_MYID #undef RANLUXCL_PLANAR #undef RANLUXCL_NOSKIP #endif //RANLUXCL_CL pyopencl-2013.2/pyopencl/cl/pyopencl-complex.h0000644000175000000500000001447112245716340020053 0ustar tomussrc/* * Copyright (c) 1999 * Silicon Graphics Computer Systems, Inc. * * Copyright (c) 1999 * Boris Fomitchev * * Copyright (c) 2012 * Andreas Kloeckner * * This material is provided "as is", with absolutely no warranty expressed * or implied. Any use is at your own risk. * * Permission to use or copy this software for any purpose is hereby granted * without fee, provided the above notices are retained on all copies. * Permission to modify the code and to distribute modified code is granted, * provided the above notices are retained, and a notice that the code was * modified is included with the above copyright notice. * */ // This file is available for inclusion in pyopencl kernels and provides // complex types 'cfloat_t' and 'cdouble_t', along with a number of special // functions as visible below, e.g. cdouble_log(z). // // Under the hood, the complex types are simply float2 and double2. // Note that native (operator-based) addition (float + float2) and // multiplication (float2*float1) is defined for these types, // but do not match the rules of complex arithmetic. 
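//
// As a rough usage sketch (added illustration, not part of the original
// header; the kernel and variable names below are made up), complex values
// should be combined through the helper functions generated below rather than
// through the native float2/double2 operators:
//
//     kernel void scale_and_square(global cfloat_t *z, cfloat_t factor)
//     {
//         int i = get_global_id(0);
//         cfloat_t w = cfloat_mul(z[i], factor);
//         z[i] = cfloat_add(cfloat_mul(w, w), cfloat_fromreal(1.f));
//     }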
#define PYOPENCL_DECLARE_COMPLEX_TYPE_INT(REAL_TP, REAL_3LTR, TPROOT, TP) \ \ REAL_TP TPROOT##_real(TP a) { return a.x; } \ REAL_TP TPROOT##_imag(TP a) { return a.y; } \ REAL_TP TPROOT##_abs(TP a) { return hypot(a.x, a.y); } \ \ TP TPROOT##_fromreal(REAL_TP a) { return (TP)(a, 0); } \ TP TPROOT##_new(REAL_TP a, REAL_TP b) { return (TP)(a, b); } \ TP TPROOT##_conj(TP a) { return (TP)(a.x, -a.y); } \ \ TP TPROOT##_add(TP a, TP b) \ { \ return a+b; \ } \ TP TPROOT##_addr(TP a, REAL_TP b) \ { \ return (TP)(b+a.x, a.y); \ } \ TP TPROOT##_radd(REAL_TP a, TP b) \ { \ return (TP)(a+b.x, b.y); \ } \ \ TP TPROOT##_mul(TP a, TP b) \ { \ return (TP)( \ a.x*b.x - a.y*b.y, \ a.x*b.y + a.y*b.x); \ } \ \ TP TPROOT##_mulr(TP a, REAL_TP b) \ { \ return a*b; \ } \ \ TP TPROOT##_rmul(REAL_TP a, TP b) \ { \ return a*b; \ } \ \ TP TPROOT##_rdivide(REAL_TP z1, TP z2) \ { \ if (fabs(z2.x) <= fabs(z2.y)) { \ REAL_TP ratio = z2.x / z2.y; \ REAL_TP denom = z2.y * (1 + ratio * ratio); \ return (TP)((z1 * ratio) / denom, - z1 / denom); \ } \ else { \ REAL_TP ratio = z2.y / z2.x; \ REAL_TP denom = z2.x * (1 + ratio * ratio); \ return (TP)(z1 / denom, - (z1 * ratio) / denom); \ } \ } \ \ TP TPROOT##_divide(TP z1, TP z2) \ { \ REAL_TP ratio, denom, a, b, c, d; \ \ if (fabs(z2.x) <= fabs(z2.y)) { \ ratio = z2.x / z2.y; \ denom = z2.y; \ a = z1.y; \ b = z1.x; \ c = -z1.x; \ d = z1.y; \ } \ else { \ ratio = z2.y / z2.x; \ denom = z2.x; \ a = z1.x; \ b = z1.y; \ c = z1.y; \ d = -z1.x; \ } \ denom *= (1 + ratio * ratio); \ return (TP)( \ (a + b * ratio) / denom, \ (c + d * ratio) / denom); \ } \ \ TP TPROOT##_divider(TP a, REAL_TP b) \ { \ return a/b; \ } \ \ TP TPROOT##_pow(TP a, TP b) \ { \ REAL_TP logr = log(hypot(a.x, a.y)); \ REAL_TP logi = atan2(a.y, a.x); \ REAL_TP x = exp(logr * b.x - logi * b.y); \ REAL_TP y = logr * b.y + logi * b.x; \ \ REAL_TP cosy; \ REAL_TP siny = sincos(y, &cosy); \ return (TP) (x*cosy, x*siny); \ } \ \ TP TPROOT##_powr(TP a, REAL_TP b) \ { \ REAL_TP logr = log(hypot(a.x, a.y)); \ REAL_TP logi = atan2(a.y, a.x); \ REAL_TP x = exp(logr * b); \ REAL_TP y = logi * b; \ \ REAL_TP cosy; \ REAL_TP siny = sincos(y, &cosy); \ \ return (TP)(x * cosy, x*siny); \ } \ \ TP TPROOT##_rpow(REAL_TP a, TP b) \ { \ REAL_TP logr = log(a); \ REAL_TP x = exp(logr * b.x); \ REAL_TP y = logr * b.y; \ \ REAL_TP cosy; \ REAL_TP siny = sincos(y, &cosy); \ return (TP) (x * cosy, x * siny); \ } \ \ TP TPROOT##_sqrt(TP a) \ { \ REAL_TP re = a.x; \ REAL_TP im = a.y; \ REAL_TP mag = hypot(re, im); \ TP result; \ \ if (mag == 0.f) { \ result.x = result.y = 0.f; \ } else if (re > 0.f) { \ result.x = sqrt(0.5f * (mag + re)); \ result.y = im/result.x/2.f; \ } else { \ result.y = sqrt(0.5f * (mag - re)); \ if (im < 0.f) \ result.y = - result.y; \ result.x = im/result.y/2.f; \ } \ return result; \ } \ \ TP TPROOT##_exp(TP a) \ { \ REAL_TP expr = exp(a.x); \ REAL_TP cosi; \ REAL_TP sini = sincos(a.y, &cosi); \ return (TP)(expr * cosi, expr * sini); \ } \ \ TP TPROOT##_log(TP a) \ { return (TP)(log(hypot(a.x, a.y)), atan2(a.y, a.x)); } \ \ TP TPROOT##_sin(TP a) \ { \ REAL_TP cosr; \ REAL_TP sinr = sincos(a.x, &cosr); \ return (TP)(sinr*cosh(a.y), cosr*sinh(a.y)); \ } \ \ TP TPROOT##_cos(TP a) \ { \ REAL_TP cosr; \ REAL_TP sinr = sincos(a.x, &cosr); \ return (TP)(cosr*cosh(a.y), -sinr*sinh(a.y)); \ } \ \ TP TPROOT##_tan(TP a) \ { \ REAL_TP re2 = 2.f * a.x; \ REAL_TP im2 = 2.f * a.y; \ \ const REAL_TP limit = log(REAL_3LTR##_MAX); \ \ if (fabs(im2) > limit) \ return (TP)(0.f, (im2 > 0 ? 
1.f : -1.f)); \ else \ { \ REAL_TP den = cos(re2) + cosh(im2); \ return (TP) (sin(re2) / den, sinh(im2) / den); \ } \ } \ \ TP TPROOT##_sinh(TP a) \ { \ REAL_TP cosi; \ REAL_TP sini = sincos(a.y, &cosi); \ return (TP)(sinh(a.x)*cosi, cosh(a.x)*sini); \ } \ \ TP TPROOT##_cosh(TP a) \ { \ REAL_TP cosi; \ REAL_TP sini = sincos(a.y, &cosi); \ return (TP)(cosh(a.x)*cosi, sinh(a.x)*sini); \ } \ \ TP TPROOT##_tanh(TP a) \ { \ REAL_TP re2 = 2.f * a.x; \ REAL_TP im2 = 2.f * a.y; \ \ const REAL_TP limit = log(REAL_3LTR##_MAX); \ \ if (fabs(re2) > limit) \ return (TP)((re2 > 0 ? 1.f : -1.f), 0.f); \ else \ { \ REAL_TP den = cosh(re2) + cos(im2); \ return (TP) (sinh(re2) / den, sin(im2) / den); \ } \ } \ #define PYOPENCL_DECLARE_COMPLEX_TYPE(BASE, BASE_3LTR) \ typedef BASE##2 c##BASE##_t; \ \ PYOPENCL_DECLARE_COMPLEX_TYPE_INT(BASE, BASE_3LTR, c##BASE, c##BASE##_t) PYOPENCL_DECLARE_COMPLEX_TYPE(float, FLT); #define cfloat_cast(a) ((cfloat_t) ((a).x, (a).y)) #ifdef PYOPENCL_DEFINE_CDOUBLE PYOPENCL_DECLARE_COMPLEX_TYPE(double, DBL); #define cdouble_cast(a) ((cdouble_t) ((a).x, (a).y)) #endif pyopencl-2013.2/pyopencl/cl/pyopencl-airy.cl0000644000175000000500000001767212245716340017525 0ustar tomussrc// Ported from Cephes by // Andreas Kloeckner (C) 2012 // // Cephes Math Library Release 2.8: June, 2000 // Copyright 1984, 1987, 1989, 1992, 2000 by Stephen L. Moshier // What you see here may be used freely, but it comes with no support or // guarantee. #pragma once #include __constant const double airy_maxairy = 103.892; __constant const double airy_sqrt3 = 1.732050807568877293527; __constant const double airy_sqpii = 5.64189583547756286948E-1; __constant const double airy_c1 = 0.35502805388781723926; __constant const double airy_c2 = 0.258819403792806798405; __constant const unsigned short AN[32] = { 0x3fd6,0x2dae,0x2537,0xb658, 0x4028,0x03e3,0x871a,0x9067, 0x4053,0x11e5,0x0de2,0xe1e3, 0x4065,0x02da,0xee40,0x073c, 0x4063,0xf834,0x5ba1,0xfddf, 0x4051,0xa24f,0x4f4c,0xea4f, 0x402c,0x0d8d,0x5c2a,0x0f4d, 0x3ff0,0x0000,0x0000,0x0000, }; __constant const unsigned short AD[32] = { 0x3fe2,0x29bc,0x0262,0x4d31, 0x402d,0x8334,0x0533,0x2ca5, 0x4055,0x20e3,0xb04d,0x51a0, 0x4066,0x2a2d,0xc730,0xb7b0, 0x4064,0x8782,0x9a9f,0xfa61, 0x4051,0xde94,0xee91,0xd35f, 0x402c,0x311b,0x950d,0x9d81, 0x3ff0,0x0000,0x0000,0x0000, }; __constant const unsigned short APN[32] = { 0x3fe3,0xa3ea,0x4d4c,0xab3e, 0x402d,0x7dad,0xdc67,0x2bcf, 0x4054,0x83bd,0x0724,0xa9a6, 0x4065,0x65e9,0xba99,0xc9ba, 0x4063,0xea2b,0xcdc2,0x64d7, 0x4051,0x7e95,0x41d4,0x1646, 0x402b,0xe4e8,0x6aa7,0x4099, 0x3ff0,0x0000,0x0000,0x0000, }; __constant const unsigned short APD[32] = { 0x3fd5,0x6397,0xd288,0xd5b3, 0x4026,0x5caf,0xedc9,0x327e, 0x4051,0xcb0e,0x1800,0x97e6, 0x4063,0xd8e6,0x1132,0xdbd1, 0x4063,0x269b,0x0dcb,0x3316, 0x4051,0x2b36,0xf9d0,0xf72f, 0x402b,0xb321,0x4e35,0x7982, 0x3ff0,0x0000,0x0000,0x0000, }; __constant const unsigned short BN16[20] = { 0xbfd0,0x3518,0xe211,0x6751, 0x3fe2,0x68bc,0x7072,0x2383, 0xbfd5,0x1d32,0x6785,0xcf29, 0x3fb0,0x7f2a,0xa027,0x78a8, 0xbf6f,0x5604,0x2dba,0xcd1b, }; __constant const unsigned short BD16[20] = { /*0x3ff0,0x0000,0x0000,0x0000,*/ 0xc01c,0xa09d,0x891b,0xab58, 0x4025,0x3539,0xfe0b,0x1101, 0xc014,0xee0b,0xa9a7,0x70e8, 0x3fee,0xa2fc,0xa6da,0x95ff, 0xbfac,0x33d0,0x8f8e,0x86c9, }; __constant const unsigned short BPPN[20] = { 0x3fdd,0xca1d,0x9deb,0x377b, 0xbff1,0x7051,0xc6be,0xe420, 0x3fe4,0x710c,0xf199,0x5ff3, 0xbfc0,0x3c6f,0x8681,0xa8fa, 0x3f7f,0x3b43,0xb8ce,0xb896, }; __constant const unsigned short BPPD[20] = { 
/*0x3ff0,0x0000,0x0000,0x0000,*/ 0xc021,0x6996,0xb340,0xbc45, 0x402b,0xcc73,0x2ea4,0xbb8b, 0xc01c,0x908c,0xa04a,0xed59, 0x3ff5,0x70fd,0xf9a5,0x70a9, 0xbfb4,0x13d0,0x1b60,0x52e8, }; __constant const unsigned short AFN[36] = { 0xbfc0,0xdb6c,0xd50a,0xe6fb, 0xbfe4,0x0bee,0x9856,0x6852, 0xbfe6,0x2e59,0xc2f7,0x9f7d, 0xbfd1,0xe7ea,0x4bb3,0xf40b, 0xbfa9,0x2f6e,0xf47d,0xbd8a, 0xbf70,0xa401,0xc8d9,0xe090, 0xbf24,0xe06e,0xaf4b,0x009c, 0xbec7,0x4a78,0x1d42,0x366d, 0xbe52,0x041c,0xf68e,0xa2d2, }; __constant const unsigned short AFD[36] = { /*0x3ff0,0x0000,0x0000,0x0000,*/ 0x402a,0xb64b,0x2572,0xedf2, 0x4040,0x575c,0x4478,0x7b1a, 0x403a,0xbc98,0xa3b7,0x3410, 0x4022,0x5fc8,0x2ac9,0x9873, 0x3ff7,0x9acb,0x39de,0x9319, 0x3fbd,0x9dac,0xb404,0x5a2b, 0x3f72,0x08ca,0xe03a,0xf617, 0x3f13,0xc8d7,0xaf76,0xe73b, 0x3e9e,0x52b9,0xb995,0x18a7, }; __constant const unsigned short AGN[44] = { 0x3f94,0x3525,0xddcf,0xbbde, 0x3fd9,0x07d5,0x0064,0x37b7, 0x3ff1,0x0d83,0x3a20,0x34eb, 0x3fee,0x0dac,0xa0ef,0x1acb, 0x3fd6,0x7e69,0xcea8,0xfe1d, 0x3fb0,0x3a41,0x21e9,0x0978, 0x3f77,0xfe99,0xf12f,0x5043, 0x3f32,0x8976,0x600e,0x17a2, 0x3edd,0x4f3d,0x69f8,0x574e, 0x3e75,0xca92,0xbbad,0x11c8, 0x3df7,0x78a4,0x7d97,0xee7a, }; __constant const unsigned short AGD[40] = { /*0x3ff0,0x0000,0x0000,0x0000,*/ 0x4022,0x9e2b,0xf3d5,0x6b40, 0x4033,0xd5d5,0xc0ef,0x18d4, 0x402f,0x211b,0x7ea7,0xdc35, 0x4015,0xe84e,0x2b79,0xdbce, 0x3fee,0x8992,0xc195,0xece3, 0x3fb6,0x221d,0xed64,0xa9ee, 0x3f70,0xe704,0x6be3,0x93bb, 0x3f1a,0x8b61,0xd603,0xa5a0, 0x3eb3,0xa845,0xdb07,0x24e8, 0x3e35,0x1fc7,0x3dd5,0x89d4, }; __constant const unsigned short APFN[36] = { 0x3fc7,0xba0f,0x8e7d,0x5db5, 0x3fec,0x5ff2,0x3d14,0xd07e, 0x3fef,0x98b7,0x11be,0x01af, 0x3fd9,0xadef,0x1397,0x84a1, 0x3fb2,0x2f0d,0xeadc,0x33d1, 0x3f78,0x3115,0xe347,0xa140, 0x3f2e,0x8be8,0x5d03,0x8059, 0x3ed1,0x2495,0x9f80,0x12af, 0x3e5a,0xab6a,0x654d,0x7d86, }; __constant const unsigned short APFD[36] = { /*0x3ff0,0x0000,0x0000,0x0000,*/ 0x402d,0x781b,0x9628,0xcc60, 0x4042,0xc56d,0x2524,0x0e31, 0x403f,0x773d,0x09cc,0xffb8, 0x4025,0xfe6b,0x5163,0x03f7, 0x3ffc,0x9f21,0xc07a,0xc9fd, 0x3fc2,0x2450,0xe40e,0xf796, 0x3f76,0x48f2,0x3a5a,0x351a, 0x3f18,0xa059,0x7cfb,0x63a1, 0x3ea2,0xfdb8,0x5a24,0x1e2e, }; __constant const unsigned short APGN[44] = { 0xbfa2,0x351f,0x5f87,0xaf5b, 0xbfe4,0x64db,0x1ff7,0x5c76, 0xbffb,0x564a,0xc221,0x7e49, 0xbff8,0x0916,0x7f6e,0x0b07, 0xbfe2,0x0910,0xd8b0,0x6edb, 0xbfba,0x234b,0x0d8c,0x9903, 0xbf83,0x6c54,0x7f6c,0x50df, 0xbf3e,0x2afa,0x2424,0x2ad0, 0xbee7,0xf87a,0xbc17,0xf631, 0xbe81,0xe81f,0x501e,0x6c10, 0xbe03,0x5f45,0x5e46,0x870d, }; __constant const unsigned short APGD[40] = { /*0x3ff0,0x0000,0x0000,0x0000,*/ 0x4023,0xb7a2,0x060a,0x9812, 0x4035,0xa3e3,0x4724,0xfc96, 0x4031,0x5025,0xdb2c,0x819a, 0x4018,0xb702,0xd5cd,0x94e2, 0x3ff1,0x6a71,0x4927,0x1eb1, 0x3fb9,0x78de,0x4ad7,0x7bc5, 0x3f73,0x991a,0x4b2b,0xc1d7, 0x3f1e,0xf98f,0x0b16,0xbe1c, 0x3eb7,0x10bf,0xfdde,0x4ef3, 0x3e38,0xe834,0x9dc8,0x647e, }; int airy( double x, double *ai, double *aip, double *bi, double *bip ) { typedef __constant const double *data_t; double z, zz, t, f, g, uf, ug, k, zeta, theta; int domflg; domflg = 0; if( x > airy_maxairy ) { *ai = 0; *aip = 0; *bi = DBL_MAX; *bip = DBL_MAX; return(-1); } if( x < -2.09 ) { domflg = 15; t = sqrt(-x); zeta = -2.0 * x * t / 3.0; t = sqrt(t); k = airy_sqpii / t; z = 1.0/zeta; zz = z * z; uf = 1.0 + zz * cephes_polevl( zz, (data_t) AFN, 8 ) / cephes_p1evl( zz, (data_t) AFD, 9 ); ug = z * cephes_polevl( zz, (data_t) AGN, 10 ) / cephes_p1evl( zz, (data_t) AGD, 10 ); theta = zeta + 
0.25 * M_PI; f = sin( theta ); g = cos( theta ); *ai = k * (f * uf - g * ug); *bi = k * (g * uf + f * ug); uf = 1.0 + zz * cephes_polevl( zz, (data_t) APFN, 8 ) / cephes_p1evl( zz, (data_t) APFD, 9 ); ug = z * cephes_polevl( zz, (data_t) APGN, 10 ) / cephes_p1evl( zz, (data_t) APGD, 10 ); k = airy_sqpii * t; *aip = -k * (g * uf + f * ug); *bip = k * (f * uf - g * ug); return(0); } if( x >= 2.09 ) /* cbrt(9) */ { domflg = 5; t = sqrt(x); zeta = 2.0 * x * t / 3.0; g = exp( zeta ); t = sqrt(t); k = 2.0 * t * g; z = 1.0/zeta; f = cephes_polevl( z, (data_t) AN, 7 ) / cephes_polevl( z, (data_t) AD, 7 ); *ai = airy_sqpii * f / k; k = -0.5 * airy_sqpii * t / g; f = cephes_polevl( z, (data_t) APN, 7 ) / cephes_polevl( z, (data_t) APD, 7 ); *aip = f * k; if( x > 8.3203353 ) /* zeta > 16 */ { f = z * cephes_polevl( z, (data_t) BN16, 4 ) / cephes_p1evl( z, (data_t) BD16, 5 ); k = airy_sqpii * g; *bi = k * (1.0 + f) / t; f = z * cephes_polevl( z, (data_t) BPPN, 4 ) / cephes_p1evl( z, (data_t) BPPD, 5 ); *bip = k * t * (1.0 + f); return(0); } } f = 1.0; g = x; t = 1.0; uf = 1.0; ug = x; k = 1.0; z = x * x * x; while( t > DBL_EPSILON ) { uf *= z; k += 1.0; uf /=k; ug *= z; k += 1.0; ug /=k; uf /=k; f += uf; k += 1.0; ug /=k; g += ug; t = fabs(uf/f); } uf = airy_c1 * f; ug = airy_c2 * g; if( (domflg & 1) == 0 ) *ai = uf - ug; if( (domflg & 2) == 0 ) *bi = airy_sqrt3 * (uf + ug); /* the deriviative of ai */ k = 4.0; uf = x * x/2.0; ug = z/3.0; f = uf; g = 1.0 + ug; uf /= 3.0; t = 1.0; while( t > DBL_EPSILON ) { uf *= z; ug /=k; k += 1.0; ug *= z; uf /=k; f += uf; k += 1.0; ug /=k; uf /=k; g += ug; k += 1.0; t = fabs(ug/g); } uf = airy_c1 * f; ug = airy_c2 * g; if( (domflg & 4) == 0 ) *aip = uf - ug; if( (domflg & 8) == 0 ) *bip = airy_sqrt3 * (uf + ug); return(0); } pyopencl-2013.2/pyopencl/cl/pyopencl-bessel-y.cl0000644000175000000500000003001112245716340020262 0ustar tomussrc// Pieced together from Boost C++ and Cephes by // Andreas Kloeckner (C) 2012 // // Pieces from: // // Copyright (c) 2006 Xiaogang Zhang, John Maddock // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See // http://www.boost.org/LICENSE_1_0.txt) // // Cephes Math Library Release 2.8: June, 2000 // Copyright 1984, 1987, 1989, 1992, 2000 by Stephen L. Moshier // What you see here may be used freely, but it comes with no support or // guarantee. 
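//
// Rough usage sketch (added illustration, not part of the original file; the
// kernel below is made up): the functions defined here are plain device-side
// helpers, so a kernel built with double precision enabled can call them
// directly:
//
//     kernel void tabulate_y0(global const double *x, global double *result)
//     {
//         int i = get_global_id(0);
//         result[i] = bessel_y0(x[i]);
//     }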
#pragma once #include #include typedef double bessel_y_scalar_type; // {{{ bessel_y0 __constant const bessel_y_scalar_type bessel_y0_P1[] = { 1.0723538782003176831e+11, -8.3716255451260504098e+09, 2.0422274357376619816e+08, -2.1287548474401797963e+06, 1.0102532948020907590e+04, -1.8402381979244993524e+01, }; __constant const bessel_y_scalar_type bessel_y0_Q1[] = { 5.8873865738997033405e+11, 8.1617187777290363573e+09, 5.5662956624278251596e+07, 2.3889393209447253406e+05, 6.6475986689240190091e+02, 1.0, }; __constant const bessel_y_scalar_type bessel_y0_P2[] = { -2.2213976967566192242e+13, -5.5107435206722644429e+11, 4.3600098638603061642e+10, -6.9590439394619619534e+08, 4.6905288611678631510e+06, -1.4566865832663635920e+04, 1.7427031242901594547e+01, }; __constant const bessel_y_scalar_type bessel_y0_Q2[] = { 4.3386146580707264428e+14, 5.4266824419412347550e+12, 3.4015103849971240096e+10, 1.3960202770986831075e+08, 4.0669982352539552018e+05, 8.3030857612070288823e+02, 1.0, }; __constant const bessel_y_scalar_type bessel_y0_P3[] = { -8.0728726905150210443e+15, 6.7016641869173237784e+14, -1.2829912364088687306e+11, -1.9363051266772083678e+11, 2.1958827170518100757e+09, -1.0085539923498211426e+07, 2.1363534169313901632e+04, -1.7439661319197499338e+01, }; __constant const bessel_y_scalar_type bessel_y0_Q3[] = { 3.4563724628846457519e+17, 3.9272425569640309819e+15, 2.2598377924042897629e+13, 8.6926121104209825246e+10, 2.4727219475672302327e+08, 5.3924739209768057030e+05, 8.7903362168128450017e+02, 1.0, }; __constant const bessel_y_scalar_type bessel_y0_PC[] = { 2.2779090197304684302e+04, 4.1345386639580765797e+04, 2.1170523380864944322e+04, 3.4806486443249270347e+03, 1.5376201909008354296e+02, 8.8961548424210455236e-01, }; __constant const bessel_y_scalar_type bessel_y0_QC[] = { 2.2779090197304684318e+04, 4.1370412495510416640e+04, 2.1215350561880115730e+04, 3.5028735138235608207e+03, 1.5711159858080893649e+02, 1.0, }; __constant const bessel_y_scalar_type bessel_y0_PS[] = { -8.9226600200800094098e+01, -1.8591953644342993800e+02, -1.1183429920482737611e+02, -2.2300261666214198472e+01, -1.2441026745835638459e+00, -8.8033303048680751817e-03, }; __constant const bessel_y_scalar_type bessel_y0_QS[] = { 5.7105024128512061905e+03, 1.1951131543434613647e+04, 7.2642780169211018836e+03, 1.4887231232283756582e+03, 9.0593769594993125859e+01, 1.0, }; bessel_y_scalar_type bessel_y0(bessel_y_scalar_type x) { const bessel_y_scalar_type x1 = 8.9357696627916752158e-01, x2 = 3.9576784193148578684e+00, x3 = 7.0860510603017726976e+00, x11 = 2.280e+02, x12 = 2.9519662791675215849e-03, x21 = 1.0130e+03, x22 = 6.4716931485786837568e-04, x31 = 1.8140e+03, x32 = 1.1356030177269762362e-04; bessel_y_scalar_type value, factor, r, rc, rs; if (x < 0) { //return policies::raise_domain_error(function, // "Got x = %1% but x must be non-negative, complex result not supported.", x, pol); return nan((uint)22); } if (x == 0) { return -DBL_MAX; } if (x <= 3) // x in (0, 3] { bessel_y_scalar_type y = x * x; bessel_y_scalar_type z = 2 * log(x/x1) * bessel_j0(x) / M_PI; r = boost_evaluate_rational(bessel_y0_P1, bessel_y0_Q1, y); factor = (x + x1) * ((x - x11/256) - x12); value = z + factor * r; } else if (x <= 5.5f) // x in (3, 5.5] { bessel_y_scalar_type y = x * x; bessel_y_scalar_type z = 2 * log(x/x2) * bessel_j0(x) / M_PI; r = boost_evaluate_rational(bessel_y0_P2, bessel_y0_Q2, y); factor = (x + x2) * ((x - x21/256) - x22); value = z + factor * r; } else if (x <= 8) // x in (5.5, 8] { bessel_y_scalar_type y = x * x; 
bessel_y_scalar_type z = 2 * log(x/x3) * bessel_j0(x) / M_PI; r = boost_evaluate_rational(bessel_y0_P3, bessel_y0_Q3, y); factor = (x + x3) * ((x - x31/256) - x32); value = z + factor * r; } else // x in (8, \infty) { bessel_y_scalar_type y = 8 / x; bessel_y_scalar_type y2 = y * y; bessel_y_scalar_type z = x - 0.25f * M_PI; rc = boost_evaluate_rational(bessel_y0_PC, bessel_y0_QC, y2); rs = boost_evaluate_rational(bessel_y0_PS, bessel_y0_QS, y2); factor = sqrt(2 / (x * M_PI)); value = factor * (rc * sin(z) + y * rs * cos(z)); } return value; } // }}} // {{{ bessel_y1 __constant const bessel_y_scalar_type bessel_y1_P1[] = { 4.0535726612579544093e+13, 5.4708611716525426053e+12, -3.7595974497819597599e+11, 7.2144548214502560419e+09, -5.9157479997408395984e+07, 2.2157953222280260820e+05, -3.1714424660046133456e+02, }; __constant const bessel_y_scalar_type bessel_y1_Q1[] = { 3.0737873921079286084e+14, 4.1272286200406461981e+12, 2.7800352738690585613e+10, 1.2250435122182963220e+08, 3.8136470753052572164e+05, 8.2079908168393867438e+02, 1.0, }; __constant const bessel_y_scalar_type bessel_y1_P2[] = { 1.1514276357909013326e+19, -5.6808094574724204577e+18, -2.3638408497043134724e+16, 4.0686275289804744814e+15, -5.9530713129741981618e+13, 3.7453673962438488783e+11, -1.1957961912070617006e+09, 1.9153806858264202986e+06, -1.2337180442012953128e+03, }; __constant const bessel_y_scalar_type bessel_y1_Q2[] = { 5.3321844313316185697e+20, 5.6968198822857178911e+18, 3.0837179548112881950e+16, 1.1187010065856971027e+14, 3.0221766852960403645e+11, 6.3550318087088919566e+08, 1.0453748201934079734e+06, 1.2855164849321609336e+03, 1.0, }; __constant const bessel_y_scalar_type bessel_y1_PC[] = { -4.4357578167941278571e+06, -9.9422465050776411957e+06, -6.6033732483649391093e+06, -1.5235293511811373833e+06, -1.0982405543459346727e+05, -1.6116166443246101165e+03, 0.0, }; __constant const bessel_y_scalar_type bessel_y1_QC[] = { -4.4357578167941278568e+06, -9.9341243899345856590e+06, -6.5853394797230870728e+06, -1.5118095066341608816e+06, -1.0726385991103820119e+05, -1.4550094401904961825e+03, 1.0, }; __constant const bessel_y_scalar_type bessel_y1_PS[] = { 3.3220913409857223519e+04, 8.5145160675335701966e+04, 6.6178836581270835179e+04, 1.8494262873223866797e+04, 1.7063754290207680021e+03, 3.5265133846636032186e+01, 0.0, }; __constant const bessel_y_scalar_type bessel_y1_QS[] = { 7.0871281941028743574e+05, 1.8194580422439972989e+06, 1.4194606696037208929e+06, 4.0029443582266975117e+05, 3.7890229745772202641e+04, 8.6383677696049909675e+02, 1.0, }; bessel_y_scalar_type bessel_y1(bessel_y_scalar_type x) { const bessel_y_scalar_type x1 = 2.1971413260310170351e+00, x2 = 5.4296810407941351328e+00, x11 = 5.620e+02, x12 = 1.8288260310170351490e-03, x21 = 1.3900e+03, x22 = -6.4592058648672279948e-06 ; bessel_y_scalar_type value, factor, r, rc, rs; if (x <= 0) { // domain error return nan((uint)22); } if (x <= 4) // x in (0, 4] { bessel_y_scalar_type y = x * x; bessel_y_scalar_type z = 2 * log(x/x1) * bessel_j1(x) / M_PI; r = boost_evaluate_rational(bessel_y1_P1, bessel_y1_Q1, y); factor = (x + x1) * ((x - x11/256) - x12) / x; value = z + factor * r; } else if (x <= 8) // x in (4, 8] { bessel_y_scalar_type y = x * x; bessel_y_scalar_type z = 2 * log(x/x2) * bessel_j1(x) / M_PI; r = boost_evaluate_rational(bessel_y1_P2, bessel_y1_Q2, y); factor = (x + x2) * ((x - x21/256) - x22) / x; value = z + factor * r; } else // x in (8, \infty) { bessel_y_scalar_type y = 8 / x; bessel_y_scalar_type y2 = y * y; bessel_y_scalar_type z = x - 
0.75f * M_PI; rc = boost_evaluate_rational(bessel_y1_PC, bessel_y1_QC, y2); rs = boost_evaluate_rational(bessel_y1_PS, bessel_y1_QS, y2); factor = sqrt(2 / (x * M_PI)); value = factor * (rc * sin(z) + y * rs * cos(z)); } return value; } // }}} // {{{ bessel_yn bessel_y_scalar_type bessel_yn_small_z(int n, bessel_y_scalar_type z, bessel_y_scalar_type* scale) { // // See http://functions.wolfram.com/Bessel-TypeFunctions/BesselY/06/01/04/01/02/ // // Note that when called we assume that x < epsilon and n is a positive integer. // // BOOST_ASSERT(n >= 0); // BOOST_ASSERT((z < policies::get_epsilon())); if(n == 0) { return (2 / M_PI) * (log(z / 2) + M_E); } else if(n == 1) { return (z / M_PI) * log(z / 2) - 2 / (M_PI * z) - (z / (2 * M_PI)) * (1 - 2 * M_E); } else if(n == 2) { return (z * z) / (4 * M_PI) * log(z / 2) - (4 / (M_PI * z * z)) - ((z * z) / (8 * M_PI)) * (3./2 - 2 * M_E); } else { bessel_y_scalar_type p = pow(z / 2, (bessel_y_scalar_type) n); bessel_y_scalar_type result = -((tgamma((bessel_y_scalar_type) n) / M_PI)); if(p * DBL_MAX < result) { bessel_y_scalar_type div = DBL_MAX / 8; result /= div; *scale /= div; if(p * DBL_MAX < result) { return -DBL_MAX; } } return result / p; } } bessel_y_scalar_type bessel_yn(int n, bessel_y_scalar_type x) { //BOOST_MATH_STD_USING bessel_y_scalar_type value, factor, current, prev; //using namespace boost::math::tools; if ((x == 0) && (n == 0)) { return -DBL_MAX; } if (x <= 0) { //return policies::raise_domain_error(function, //"Got x = %1%, but x must be > 0, complex result not supported.", x, pol); return nan((uint)22); } // // Reflection comes first: // if (n < 0) { factor = (n & 0x1) ? -1 : 1; // Y_{-n}(z) = (-1)^n Y_n(z) n = -n; } else { factor = 1; } if(x < DBL_EPSILON) { bessel_y_scalar_type scale = 1; value = bessel_yn_small_z(n, x, &scale); if(DBL_MAX * fabs(scale) < fabs(value)) return copysign((bessel_y_scalar_type) 1, scale) * copysign((bessel_y_scalar_type) 1, value) * DBL_MAX; value /= scale; } else if (n == 0) { value = bessel_y0(x); } else if (n == 1) { value = factor * bessel_y1(x); } else { prev = bessel_y0(x); current = bessel_y1(x); int k = 1; // BOOST_ASSERT(k < n); do { bessel_y_scalar_type fact = 2 * k / x; if((DBL_MAX - fabs(prev)) / fact < fabs(current)) { prev /= current; factor /= current; current = 1; } value = fact * current - prev; prev = current; current = value; ++k; } while(k < n); if(fabs(DBL_MAX * factor) < fabs(value)) return sign(value) * sign(value) * DBL_MAX; value /= factor; } return value; } // }}} // vim: fdm=marker pyopencl-2013.2/pyopencl/_mymako.py0000644000175000000500000000110012245716340015775 0ustar tomussrctry: import mako.template except ImportError: raise ImportError( "Some of PyOpenCL's facilities require the Mako templating engine.\n" "You or a piece of software you have used has tried to call such a\n" "part of PyOpenCL, but there was a problem importing Mako.\n\n" "You may install mako now by typing one of:\n" "- easy_install Mako\n" "- pip install Mako\n" "- aptitude install python-mako\n" "\nor whatever else is appropriate for your system.") from mako import * pyopencl-2013.2/pyopencl/cache.py0000644000175000000500000003314412245716340015421 0ustar tomussrc"""PyOpenCL compiler cache.""" from __future__ import division __copyright__ = "Copyright (C) 2011 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including 
without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import pyopencl._cl as _cl import re import sys import os from pytools import Record try: import hashlib new_hash = hashlib.md5 except ImportError: # for Python << 2.5 import md5 new_hash = md5.new def _erase_dir(dir): from os import listdir, unlink, rmdir from os.path import join for name in listdir(dir): unlink(join(dir, name)) rmdir(dir) def update_checksum(checksum, obj): if isinstance(obj, unicode): checksum.update(obj.encode("utf8")) else: checksum.update(obj) # {{{ cleanup class CleanupBase(object): pass class CleanupManager(CleanupBase): def __init__(self): self.cleanups = [] def register(self, c): self.cleanups.insert(0, c) def clean_up(self): for c in self.cleanups: c.clean_up() def error_clean_up(self): for c in self.cleanups: c.error_clean_up() class CacheLockManager(CleanupBase): def __init__(self, cleanup_m, cache_dir): if cache_dir is not None: self.lock_file = os.path.join(cache_dir, "lock") attempts = 0 while True: try: self.fd = os.open(self.lock_file, os.O_CREAT | os.O_WRONLY | os.O_EXCL) break except OSError: pass from time import sleep sleep(1) attempts += 1 if attempts > 10: from warnings import warn warn("could not obtain cache lock--delete '%s' if necessary" % self.lock_file) cleanup_m.register(self) def clean_up(self): import os os.close(self.fd) os.unlink(self.lock_file) def error_clean_up(self): pass class ModuleCacheDirManager(CleanupBase): def __init__(self, cleanup_m, path): from os import mkdir self.path = path try: mkdir(self.path) cleanup_m.register(self) self.existed = False except OSError: self.existed = True def sub(self, n): from os.path import join return join(self.path, n) def reset(self): import os _erase_dir(self.path) os.mkdir(self.path) def clean_up(self): pass def error_clean_up(self): _erase_dir(self.path) # }}} # {{{ #include dependency handling C_INCLUDE_RE = re.compile(r'^\s*\#\s*include\s+[<"](.+)[">]\s*$', re.MULTILINE) def get_dependencies(src, include_path): result = {} from os.path import realpath, join def _inner(src): for match in C_INCLUDE_RE.finditer(src): included = match.group(1) found = False for ipath in include_path: included_file_name = realpath(join(ipath, included)) if included_file_name not in result: try: src_file = open(included_file_name, "rt") except IOError: continue try: included_src = src_file.read() finally: src_file.close() # jrevent infinite recursion if some header file appears to # include itself result[included_file_name] = None checksum = new_hash() update_checksum(checksum, included_src) _inner(included_src) result[included_file_name] = ( os.stat(included_file_name).st_mtime, checksum.hexdigest(), ) found = True break # stop searching the include path if not found: pass _inner(src) result = list((name,) 
+ vals for name, vals in result.iteritems()) result.sort() return result def get_file_md5sum(fname): checksum = new_hash() inf = open(fname) try: contents = inf.read() finally: inf.close() update_checksum(checksum, contents) return checksum.hexdigest() def check_dependencies(deps): for name, date, md5sum in deps: try: possibly_updated = os.stat(name).st_mtime != date except OSError: return False else: if possibly_updated and md5sum != get_file_md5sum(name): return False return True # }}} # {{{ key generation def get_device_cache_id(device): from pyopencl.version import VERSION platform = device.platform return (VERSION, platform.vendor, platform.name, platform.version, device.vendor, device.name, device.version, device.driver_version) def get_cache_key(device, options, src): checksum = new_hash() update_checksum(checksum, src) update_checksum(checksum, " ".join(options)) update_checksum(checksum, str(get_device_cache_id(device))) return checksum.hexdigest() # }}} def retrieve_from_cache(cache_dir, cache_key): class _InvalidInfoFile(RuntimeError): pass from os.path import join, isdir module_cache_dir = join(cache_dir, cache_key) if not isdir(module_cache_dir): return None cleanup_m = CleanupManager() try: try: CacheLockManager(cleanup_m, cache_dir) mod_cache_dir_m = ModuleCacheDirManager(cleanup_m, module_cache_dir) info_path = mod_cache_dir_m.sub("info") binary_path = mod_cache_dir_m.sub("binary") # {{{ load info file try: from cPickle import load try: info_file = open(info_path, "rb") except IOError: raise _InvalidInfoFile() try: try: info = load(info_file) except EOFError: raise _InvalidInfoFile() finally: info_file.close() except _InvalidInfoFile: mod_cache_dir_m.reset() from warnings import warn warn("PyOpenCL encountered an invalid info file for cache key %s" % cache_key) return None # }}} # {{{ load binary binary_file = open(binary_path, "rb") try: binary = binary_file.read() finally: binary_file.close() # }}} if check_dependencies(info.dependencies): return binary, info.log else: mod_cache_dir_m.reset() except: cleanup_m.error_clean_up() raise finally: cleanup_m.clean_up() # {{{ top-level driver class _SourceInfo(Record): pass def _create_built_program_from_source_cached(ctx, src, options, devices, cache_dir): from os.path import join include_path = ["."] option_idx = 0 while option_idx < len(options): option = options[option_idx].strip() if option.startswith("-I") or option.startswith("/I"): if len(option) == 2: if option_idx+1 < len(options): include_path.append(options[option_idx+1]) option_idx += 2 else: include_path.append(option[2:].lstrip()) option_idx += 1 else: option_idx += 1 if cache_dir is None: from tempfile import gettempdir import getpass cache_dir = join(gettempdir(), "pyopencl-compiler-cache-v2-uid%s-py%s" % ( getpass.getuser(), ".".join(str(i) for i in sys.version_info))) # {{{ ensure cache directory exists try: os.mkdir(cache_dir) except OSError, e: from errno import EEXIST if e.errno != EEXIST: raise # }}} if devices is None: devices = ctx.devices cache_keys = [get_cache_key(device, options, src) for device in devices] binaries = [] to_be_built_indices = [] logs = [] for i, (device, cache_key) in enumerate(zip(devices, cache_keys)): cache_result = retrieve_from_cache(cache_dir, cache_key) if cache_result is None: to_be_built_indices.append(i) binaries.append(None) logs.append(None) else: binary, log = cache_result binaries.append(binary) logs.append(log) message = (75*"="+"\n").join( "Build on %s succeeded, but said:\n\n%s" % (dev, log) for dev, log in 
zip(devices, logs) if log is not None and log.strip()) if message: from pyopencl import compiler_output compiler_output( "Built kernel retrieved from cache. Original from-source " "build had warnings:\n"+message) # {{{ build on the build-needing devices, in one go result = None already_built = False if to_be_built_indices: # defeat implementation caches: from uuid import uuid4 src = src + "\n\n__constant int pyopencl_defeat_cache_%s = 0;" % ( uuid4().hex) prg = _cl._Program(ctx, src) prg.build(options, [devices[i] for i in to_be_built_indices]) prg_devs = prg.get_info(_cl.program_info.DEVICES) prg_bins = prg.get_info(_cl.program_info.BINARIES) prg_logs = prg._get_build_logs() for dest_index in to_be_built_indices: dev = devices[dest_index] src_index = prg_devs.index(dev) binaries[dest_index] = prg_bins[src_index] _, logs[dest_index] = prg_logs[src_index] if len(to_be_built_indices) == len(devices): # Important special case: if code for all devices was built, # then we may simply use the program that we just built as the # final result. result = prg already_built = True if result is None: result = _cl._Program(ctx, devices, binaries) # }}} # {{{ save binaries to cache if to_be_built_indices: cleanup_m = CleanupManager() try: try: CacheLockManager(cleanup_m, cache_dir) for i in to_be_built_indices: cache_key = cache_keys[i] device = devices[i] binary = binaries[i] mod_cache_dir_m = ModuleCacheDirManager(cleanup_m, join(cache_dir, cache_key)) info_path = mod_cache_dir_m.sub("info") binary_path = mod_cache_dir_m.sub("binary") source_path = mod_cache_dir_m.sub("source.cl") outf = open(source_path, "wt") outf.write(src) outf.close() outf = open(binary_path, "wb") outf.write(binary) outf.close() from cPickle import dump info_file = open(info_path, "wb") dump(_SourceInfo( dependencies=get_dependencies(src, include_path), log=logs[i]), info_file) info_file.close() except: cleanup_m.error_clean_up() raise finally: cleanup_m.clean_up() # }}} return result, already_built def create_built_program_from_source_cached(ctx, src, options=[], devices=None, cache_dir=None): try: if cache_dir is not False: prg, already_built = _create_built_program_from_source_cached( ctx, src, options, devices, cache_dir) else: prg = _cl._Program(ctx, src) already_built = False except Exception, e: raise from pyopencl import Error if (isinstance(e, Error) and e.code == _cl.status_code.BUILD_PROGRAM_FAILURE): # no need to try again raise from warnings import warn from traceback import format_exc warn("PyOpenCL compiler caching failed with an exception:\n" "[begin exception]\n%s[end exception]" % format_exc()) prg = _cl._Program(ctx, src) already_built = False if not already_built: prg.build(options, devices) return prg # }}} # vim: foldmethod=marker pyopencl-2013.2/pyopencl/tools.py0000644000175000000500000006753012245716340015524 0ustar tomussrc"""Various helpful bits and pieces without much of a common theme.""" from __future__ import division __copyright__ = "Copyright (C) 2010 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies 
or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import numpy as np from decorator import decorator import pyopencl as cl from pytools import memoize, memoize_method import re from pyopencl.compyte.dtypes import ( # noqa get_or_register_dtype, TypeNameNotKnown, register_dtype, dtype_to_ctype) def _register_types(): from pyopencl.compyte.dtypes import _fill_dtype_registry _fill_dtype_registry(respect_windows=False, include_bool=False) get_or_register_dtype("cfloat_t", np.complex64) get_or_register_dtype("cdouble_t", np.complex128) is_64_bit = tuple.__itemsize__ * 8 == 64 if not is_64_bit: get_or_register_dtype( ["unsigned long", "unsigned long int"], np.uint64) get_or_register_dtype( ["signed long", "signed long int", "long int"], np.int64) _register_types() # {{{ imported names bitlog2 = cl.bitlog2 PooledBuffer = cl.PooledBuffer from pyopencl._cl import _tools_DeferredAllocator as DeferredAllocator from pyopencl._cl import ( # noqa _tools_ImmediateAllocator as ImmediateAllocator) class CLAllocator(DeferredAllocator): def __init__(self, *args, **kwargs): from warnings import warn warn("pyopencl.tools.CLAllocator is deprecated. " "It will be continue to exist throughout the 2013.x " "versions of PyOpenCL. Use {Deferred,Immediate}Allocator.", DeprecationWarning, 2) DeferredAllocator.__init__(self, *args, **kwargs) MemoryPool = cl.MemoryPool # }}} # {{{ first-arg caches _first_arg_dependent_caches = [] @decorator def first_arg_dependent_memoize(func, cl_object, *args): """Provides memoization for a function. Typically used to cache things that get created inside a :class:`pyopencl.Context`, e.g. programs and kernels. Assumes that the first argument of the decorated function is an OpenCL object that might go away, such as a :class:`pyopencl.Context` or a :class:`pyopencl.CommandQueue`, and based on which we might want to clear the cache. .. versionadded:: 2011.2 """ try: ctx_dict = func._pyopencl_first_arg_dep_memoize_dic except AttributeError: # FIXME: This may keep contexts alive longer than desired. # But I guess since the memory in them is freed, who cares. ctx_dict = func._pyopencl_first_arg_dep_memoize_dic = {} _first_arg_dependent_caches.append(ctx_dict) try: return ctx_dict[cl_object][args] except KeyError: arg_dict = ctx_dict.setdefault(cl_object, {}) result = func(cl_object, *args) arg_dict[args] = result return result context_dependent_memoize = first_arg_dependent_memoize def first_arg_dependent_memoize_nested(nested_func): """Provides memoization for nested functions. Typically used to cache things that get created inside a :class:`pyopencl.Context`, e.g. programs and kernels. Assumes that the first argument of the decorated function is an OpenCL object that might go away, such as a :class:`pyopencl.Context` or a :class:`pyopencl.CommandQueue`, and will therefore respond to :func:`clear_first_arg_caches`. .. versionadded:: 2013.1 Requires Python 2.5 or newer. 
""" from functools import wraps cache_dict_name = intern("_memoize_inner_dic_%s_%s_%d" % (nested_func.__name__, nested_func.func_code.co_filename, nested_func.func_code.co_firstlineno)) from inspect import currentframe # prevent ref cycle try: caller_frame = currentframe().f_back cache_context = caller_frame.f_globals[ caller_frame.f_code.co_name] finally: #del caller_frame pass try: cache_dict = getattr(cache_context, cache_dict_name) except AttributeError: cache_dict = {} _first_arg_dependent_caches.append(cache_dict) setattr(cache_context, cache_dict_name, cache_dict) @wraps(nested_func) def new_nested_func(cl_object, *args): try: return cache_dict[cl_object][args] except KeyError: arg_dict = cache_dict.setdefault(cl_object, {}) result = nested_func(cl_object, *args) arg_dict[args] = result return result return new_nested_func def clear_first_arg_caches(): """Empties all first-argument-dependent memoization caches. Also releases all held reference contexts. If it is important to you that the program detaches from its context, you might need to call this function to free all remaining references to your context. .. versionadded:: 2011.2 """ for cache in _first_arg_dependent_caches: cache.clear() import atexit atexit.register(clear_first_arg_caches) # }}} def get_test_platforms_and_devices(plat_dev_string=None): """Parse a string of the form 'PYOPENCL_TEST=0:0,1;intel:i5'. :return: list of tuples (platform, [device, device, ...]) """ if plat_dev_string is None: import os plat_dev_string = os.environ.get("PYOPENCL_TEST", None) def find_cl_obj(objs, identifier): try: num = int(identifier) except Exception: pass else: return objs[num] found = False for obj in objs: if identifier.lower() in (obj.name + ' ' + obj.vendor).lower(): return obj if not found: raise RuntimeError("object '%s' not found" % identifier) if plat_dev_string: result = [] for entry in plat_dev_string.split(";"): lhsrhs = entry.split(":") if len(lhsrhs) == 1: platform = find_cl_obj(cl.get_platforms(), lhsrhs[0]) result.append((platform, platform.get_devices())) elif len(lhsrhs) != 2: raise RuntimeError("invalid syntax of PYOPENCL_TEST") else: plat_str, dev_strs = lhsrhs platform = find_cl_obj(cl.get_platforms(), plat_str) devs = platform.get_devices() result.append( (platform, [find_cl_obj(devs, dev_id) for dev_id in dev_strs.split(",")])) return result else: return [ (platform, platform.get_devices()) for platform in cl.get_platforms()] def pytest_generate_tests_for_pyopencl(metafunc): class ContextFactory: def __init__(self, device): self.device = device def __call__(self): # Get rid of leftovers from past tests. # CL implementations are surprisingly limited in how many # simultaneous contexts they allow... 
clear_first_arg_caches() from gc import collect collect() return cl.Context([self.device]) def __str__(self): return "" % self.device test_plat_and_dev = get_test_platforms_and_devices() if ("device" in metafunc.funcargnames or "ctx_factory" in metafunc.funcargnames or "ctx_getter" in metafunc.funcargnames): arg_dict = {} for platform, plat_devs in test_plat_and_dev: if "platform" in metafunc.funcargnames: arg_dict["platform"] = platform for device in plat_devs: if "device" in metafunc.funcargnames: arg_dict["device"] = device if "ctx_factory" in metafunc.funcargnames: arg_dict["ctx_factory"] = ContextFactory(device) if "ctx_getter" in metafunc.funcargnames: from warnings import warn warn("The 'ctx_getter' arg is deprecated in " "favor of 'ctx_factory'.", DeprecationWarning) arg_dict["ctx_getter"] = ContextFactory(device) metafunc.addcall(funcargs=arg_dict.copy(), id=", ".join("%s=%s" % (arg, value) for arg, value in arg_dict.iteritems())) elif "platform" in metafunc.funcargnames: for platform, plat_devs in test_plat_and_dev: metafunc.addcall( funcargs=dict(platform=platform), id=str(platform)) # {{{ C argument lists class Argument(object): pass class DtypedArgument(Argument): def __init__(self, dtype, name): self.dtype = np.dtype(dtype) self.name = name def __repr__(self): return "%s(%r, %s)" % ( self.__class__.__name__, self.name, self.dtype) class VectorArg(DtypedArgument): def __init__(self, dtype, name, with_offset=False): DtypedArgument.__init__(self, dtype, name) self.with_offset = with_offset def declarator(self): if self.with_offset: # Two underscores -> less likelihood of a name clash. return "__global %s *%s__base, long %s__offset" % ( dtype_to_ctype(self.dtype), self.name, self.name) else: result = "__global %s *%s" % (dtype_to_ctype(self.dtype), self.name) return result class ScalarArg(DtypedArgument): def declarator(self): return "%s %s" % (dtype_to_ctype(self.dtype), self.name) class OtherArg(Argument): def __init__(self, declarator, name): self.decl = declarator self.name = name def declarator(self): return self.decl def parse_c_arg(c_arg, with_offset=False): for aspace in ["__local", "__constant"]: if aspace in c_arg: raise RuntimeError("cannot deal with local or constant " "OpenCL address spaces in C argument lists ") c_arg = c_arg.replace("__global", "") if with_offset: vec_arg_factory = lambda dtype, name: \ VectorArg(dtype, name, with_offset=True) else: vec_arg_factory = VectorArg from pyopencl.compyte.dtypes import parse_c_arg_backend return parse_c_arg_backend(c_arg, ScalarArg, vec_arg_factory) def parse_arg_list(arguments, with_offset=False): """Parse a list of kernel arguments. *arguments* may be a comma-separate list of C declarators in a string, a list of strings representing C declarators, or :class:`Argument` objects. 
""" if isinstance(arguments, str): arguments = arguments.split(",") def parse_single_arg(obj): if isinstance(obj, str): from pyopencl.tools import parse_c_arg return parse_c_arg(obj, with_offset=with_offset) else: return obj return [parse_single_arg(arg) for arg in arguments] def get_arg_list_scalar_arg_dtypes(arg_types): result = [] for arg_type in arg_types: if isinstance(arg_type, ScalarArg): result.append(arg_type.dtype) elif isinstance(arg_type, VectorArg): result.append(None) if arg_type.with_offset: result.append(np.int64) else: raise RuntimeError("arg type not understood: %s" % type(arg_type)) return result def get_arg_offset_adjuster_code(arg_types): result = [] for arg_type in arg_types: if isinstance(arg_type, VectorArg) and arg_type.with_offset: result.append("__global %(type)s *%(name)s = " "(__global %(type)s *) " "((__global char *) %(name)s__base + %(name)s__offset);" % dict( type=dtype_to_ctype(arg_type.dtype), name=arg_type.name)) return "\n".join(result) # }}} def get_gl_sharing_context_properties(): ctx_props = cl.context_properties from OpenGL import platform as gl_platform, GLX, WGL props = [] import sys if sys.platform in ["linux", "linux2"]: props.append( (ctx_props.GL_CONTEXT_KHR, gl_platform.GetCurrentContext())) props.append( (ctx_props.GLX_DISPLAY_KHR, GLX.glXGetCurrentDisplay())) elif sys.platform == "win32": props.append( (ctx_props.GL_CONTEXT_KHR, gl_platform.GetCurrentContext())) props.append( (ctx_props.WGL_HDC_KHR, WGL.wglGetCurrentDC())) elif sys.platform == "darwin": props.append( (ctx_props.CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, cl.get_apple_cgl_share_group())) else: raise NotImplementedError("platform '%s' not yet supported" % sys.platform) return props class _CDeclList: def __init__(self, device): self.device = device self.declared_dtypes = set() self.declarations = [] self.saw_double = False self.saw_complex = False def add_dtype(self, dtype): dtype = np.dtype(dtype) if dtype in [np.float64 or np.complex128]: self.saw_double = True if dtype.kind == "c": self.saw_complex = True if dtype.kind != "V": return if dtype in self.declared_dtypes: return from pyopencl.array import vec if dtype in vec.type_to_scalar_and_count: return for name, field_data in dtype.fields.iteritems(): field_dtype, offset = field_data[:2] self.add_dtype(field_dtype) _, cdecl = match_dtype_to_c_struct( self.device, dtype_to_ctype(dtype), dtype) self.declarations.append(cdecl) self.declared_dtypes.add(dtype) def visit_arguments(self, arguments): for arg in arguments: dtype = arg.dtype if dtype in [np.float64 or np.complex128]: self.saw_double = True if dtype.kind == "c": self.saw_complex = True def get_declarations(self): result = "\n\n".join(self.declarations) if self.saw_complex: result = ( "#include \n\n" + result) if self.saw_double: result = ( "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n" "#define PYOPENCL_DEFINE_CDOUBLE\n" + result) return result @memoize def match_dtype_to_c_struct(device, name, dtype, context=None): """Return a tuple `(dtype, c_decl)` such that the C struct declaration in `c_decl` and the structure :class:`numpy.dtype` instance `dtype` have the same memory layout. Note that *dtype* may be modified from the value that was passed in, for example to insert padding. (As a remark on implementation, this routine runs a small kernel on the given *device* to ensure that :mod:`numpy` and C offsets and sizes match.) .. 
versionadded: 2013.1 This example explains the use of this function:: >>> import numpy as np >>> import pyopencl as cl >>> import pyopencl.tools >>> ctx = cl.create_some_context() >>> dtype = np.dtype([("id", np.uint32), ("value", np.float32)]) >>> dtype, c_decl = pyopencl.tools.match_dtype_to_c_struct( ... ctx.devices[0], 'id_val', dtype) >>> print c_decl typedef struct { unsigned id; float value; } id_val; >>> print dtype [('id', '>> cl.tools.get_or_register_dtype('id_val', dtype) As this example shows, it is important to call :func:`get_or_register_dtype` on the modified `dtype` returned by this function, not the original one. """ fields = sorted(dtype.fields.iteritems(), key=lambda (name, (dtype, offset)): offset) c_fields = [] for field_name, (field_dtype, offset) in fields: c_fields.append(" %s %s;" % (dtype_to_ctype(field_dtype), field_name)) c_decl = "typedef struct {\n%s\n} %s;\n\n" % ( "\n".join(c_fields), name) cdl = _CDeclList(device) for field_name, (field_dtype, offset) in fields: cdl.add_dtype(field_dtype) pre_decls = cdl.get_declarations() offset_code = "\n".join( "result[%d] = pycl_offsetof(%s, %s);" % (i+1, name, field_name) for i, (field_name, (field_dtype, offset)) in enumerate(fields)) src = r""" #define pycl_offsetof(st, m) \ ((size_t) ((__local char *) &(dummy.m) \ - (__local char *)&dummy )) %(pre_decls)s %(my_decl)s __kernel void get_size_and_offsets(__global size_t *result) { result[0] = sizeof(%(my_type)s); __local %(my_type)s dummy; %(offset_code)s } """ % dict( pre_decls=pre_decls, my_decl=c_decl, my_type=name, offset_code=offset_code) if context is None: context = cl.Context([device]) queue = cl.CommandQueue(context) prg = cl.Program(context, src) knl = prg.build(devices=[device]).get_size_and_offsets import pyopencl.array # noqa result_buf = cl.array.empty(queue, 1+len(fields), np.uintp) knl(queue, (1,), (1,), result_buf.data) queue.finish() size_and_offsets = result_buf.get() size = int(size_and_offsets[0]) from pytools import any offsets = size_and_offsets[1:] if any(ofs >= size for ofs in offsets): # offsets not plausible if dtype.itemsize == size: # If sizes match, use numpy's idea of the offsets. offsets = [offset for field_name, (field_dtype, offset) in fields] else: raise RuntimeError( "cannot discover struct layout on '%s'" % device) result_buf.data.release() del knl del prg del queue del context dtype_arg_dict = dict( names=[field_name for field_name, (field_dtype, offset) in fields], formats=[field_dtype for field_name, (field_dtype, offset) in fields], offsets=[int(x) for x in offsets], itemsize=int(size_and_offsets[0]), ) dtype = np.dtype(dtype_arg_dict) if dtype.itemsize != size_and_offsets[0]: # "Old" versions of numpy (1.6.x?) silently ignore "itemsize". Boo. 
dtype_arg_dict["names"].append("_pycl_size_fixer") dtype_arg_dict["formats"].append(np.uint8) dtype_arg_dict["offsets"].append(int(size_and_offsets[0])-1) dtype = np.dtype(dtype_arg_dict) assert dtype.itemsize == size_and_offsets[0] return dtype, c_decl @memoize def dtype_to_c_struct(device, dtype): matched_dtype, c_decl = match_dtype_to_c_struct( device, dtype_to_ctype(dtype), dtype) def dtypes_match(): result = len(dtype.fields) == len(matched_dtype.fields) for name, val in dtype.fields.iteritems(): result = result and matched_dtype.fields[name] == val return result assert dtypes_match() return c_decl # {{{ code generation/templating helper def _process_code_for_macro(code): code = code.replace("//CL//", "\n") if "//" in code: raise RuntimeError("end-of-line comments ('//') may not be used in " "code snippets") return code.replace("\n", " \\\n") class _SimpleTextTemplate: def __init__(self, txt): self.txt = txt def render(self, context): return self.txt class _PrintfTextTemplate: def __init__(self, txt): self.txt = txt def render(self, context): return self.txt % context class _MakoTextTemplate: def __init__(self, txt): from mako.template import Template self.template = Template(txt, strict_undefined=True) def render(self, context): return self.template.render(**context) class _ArgumentPlaceholder: """A placeholder for subclasses of :class:`DtypedArgument`. This is needed because the concrete dtype of the argument is not known at template creation time--it may be a type alias that will only be filled in at run time. These types take the place of these proto-arguments until all types are known. See also :class:`_TemplateRenderer.render_arg`. """ def __init__(self, typename, name, **extra_kwargs): self.typename = typename self.name = name self.extra_kwargs = extra_kwargs class _VectorArgPlaceholder(_ArgumentPlaceholder): target_class = VectorArg class _ScalarArgPlaceholder(_ArgumentPlaceholder): target_class = ScalarArg class _TemplateRenderer(object): def __init__(self, template, type_aliases, var_values, context=None, options=[]): self.template = template self.type_aliases = dict(type_aliases) self.var_dict = dict(var_values) for name in self.var_dict: if name.startswith("macro_"): self.var_dict[name] = _process_code_for_macro( self.var_dict[name]) self.context = context self.options = options def __call__(self, txt): if txt is None: return txt result = self.template.get_text_template(txt).render(self.var_dict) return str(result) def get_rendered_kernel(self, txt, kernel_name): prg = cl.Program(self.context, self(txt)).build(self.options) kernel_name_prefix = self.var_dict.get("kernel_name_prefix") if kernel_name_prefix is not None: kernel_name = kernel_name_prefix+kernel_name return getattr(prg, kernel_name) def parse_type(self, typename): if isinstance(typename, str): try: return self.type_aliases[typename] except KeyError: from pyopencl.compyte.dtypes import NAME_TO_DTYPE return NAME_TO_DTYPE[typename] else: return np.dtype(typename) def render_arg(self, arg_placeholder): return arg_placeholder.target_class( self.parse_type(arg_placeholder.typename), arg_placeholder.name, **arg_placeholder.extra_kwargs) _C_COMMENT_FINDER = re.compile(r"/\*.*?\*/") def render_argument_list(self, *arg_lists, **kwargs): with_offset = kwargs.pop("with_offset", False) if kwargs: raise TypeError("unrecognized kwargs: " + ", ".join(kwargs)) all_args = [] for arg_list in arg_lists: if isinstance(arg_list, str): arg_list = str( self.template .get_text_template(arg_list).render(self.var_dict)) arg_list = 
self._C_COMMENT_FINDER.sub("", arg_list) arg_list = arg_list.replace("\n", " ") all_args.extend(arg_list.split(",")) else: all_args.extend(arg_list) if with_offset: vec_arg_factory = lambda typename, name: \ _VectorArgPlaceholder(typename, name, with_offset=True) else: vec_arg_factory = _VectorArgPlaceholder from pyopencl.compyte.dtypes import parse_c_arg_backend parsed_args = [] for arg in all_args: if isinstance(arg, str): arg = arg.strip() if not arg: continue ph = parse_c_arg_backend(arg, _ScalarArgPlaceholder, vec_arg_factory, name_to_dtype=lambda x: x) parsed_arg = self.render_arg(ph) elif isinstance(arg, Argument): parsed_arg = arg elif isinstance(arg, tuple): parsed_arg = ScalarArg(self.parse_type(arg[0]), arg[1]) parsed_args.append(parsed_arg) return parsed_args def get_type_decl_preamble(self, device, decl_type_names, arguments=None): cdl = _CDeclList(device) for typename in decl_type_names: cdl.add_dtype(self.parse_type(typename)) if arguments is not None: cdl.visit_arguments(arguments) for tv in self.type_aliases.itervalues(): cdl.add_dtype(tv) type_alias_decls = [ "typedef %s %s;" % (dtype_to_ctype(val), name) for name, val in self.type_aliases.iteritems() ] return cdl.get_declarations() + "\n" + "\n".join(type_alias_decls) class KernelTemplateBase(object): def __init__(self, template_processor=None): self.template_processor = template_processor self.build_cache = {} _first_arg_dependent_caches.append(self.build_cache) def get_preamble(self): pass _TEMPLATE_PROCESSOR_PATTERN = re.compile(r"^//CL(?::([a-zA-Z0-9_]+))?//") @memoize_method def get_text_template(self, txt): proc_match = self._TEMPLATE_PROCESSOR_PATTERN.match(txt) tpl_processor = None if proc_match is not None: tpl_processor = proc_match.group(1) # chop off //CL// mark txt = txt[len(proc_match.group(0)):] if tpl_processor is None: tpl_processor = self.template_processor if tpl_processor is None or tpl_processor == "none": return _SimpleTextTemplate(txt) elif tpl_processor == "printf": return _PrintfTextTemplate(txt) elif tpl_processor == "mako": return _MakoTextTemplate(txt) else: raise RuntimeError( "unknown template processor '%s'" % proc_match.group(1)) def get_renderer(self, type_aliases, var_values, context=None, options=[]): return _TemplateRenderer(self, type_aliases, var_values) def build(self, context, *args, **kwargs): """Provide caching for an :meth:`build_inner`.""" cache_key = (context, args, tuple(sorted(kwargs.iteritems()))) try: return self.build_cache[cache_key] except KeyError: result = self.build_inner(context, *args, **kwargs) self.build_cache[cache_key] = result return result # }}} # {{{ array_module class _CLFakeArrayModule: def __init__(self, queue): self.queue = queue @property def ndarray(self): from pyopencl.array import Array return Array def dot(self, x, y): from pyopencl.array import dot return dot(x, y, queue=self.queue).get() def vdot(self, x, y): from pyopencl.array import vdot return vdot(x, y, queue=self.queue).get() def empty(self, shape, dtype, order="C"): from pyopencl.array import empty return empty(self.queue, shape, dtype, order=order) def array_module(a): if isinstance(a, np.ndarray): return np else: from pyopencl.array import Array if isinstance(a, Array): return _CLFakeArrayModule(a.queue) else: raise TypeError("array type not understood: %s" % type(a)) # }}} # vim: foldmethod=marker pyopencl-2013.2/pyopencl/_cluda.py0000644000175000000500000000351312245716340015602 0ustar tomussrc__copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ Permission is hereby 
granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ CLUDA_PREAMBLE = """ #define local_barrier() barrier(CLK_LOCAL_MEM_FENCE); #define WITHIN_KERNEL /* empty */ #define KERNEL __kernel #define GLOBAL_MEM __global #define LOCAL_MEM __local #define LOCAL_MEM_ARG __local #define REQD_WG_SIZE(X,Y,Z) __attribute__((reqd_work_group_size(X, Y, Z))) #define LID_0 get_local_id(0) #define LID_1 get_local_id(1) #define LID_2 get_local_id(2) #define GID_0 get_group_id(0) #define GID_1 get_group_id(1) #define GID_2 get_group_id(2) #define LDIM_0 get_local_size(0) #define LDIM_1 get_local_size(1) #define LDIM_2 get_local_size(2) #define GDIM_0 get_num_groups(0) #define GDIM_1 get_num_groups(1) #define GDIM_2 get_num_groups(2) % if double_support: #pragma OPENCL EXTENSION cl_khr_fp64: enable % endif """ pyopencl-2013.2/examples/0002755000175000000500000000000012245716340013766 5ustar tomussrcpyopencl-2013.2/examples/transpose.py0000644000175000000500000001356512245716340016366 0ustar tomussrc# Transposition of a matrix # originally for PyCUDA by Hendrik Riedmann from __future__ import division import pyopencl as cl import numpy import numpy.linalg as la block_size = 16 class NaiveTranspose: def __init__(self, ctx): self.kernel = cl.Program(ctx, """ __kernel void transpose( __global float *a_t, __global float *a, unsigned a_width, unsigned a_height) { int read_idx = get_global_id(0) + get_global_id(1) * a_width; int write_idx = get_global_id(1) + get_global_id(0) * a_height; a_t[write_idx] = a[read_idx]; } """% {"block_size": block_size}).build().transpose def __call__(self, queue, tgt, src, shape): w, h = shape assert w % block_size == 0 assert h % block_size == 0 return self.kernel(queue, (w, h), (block_size, block_size), tgt, src, numpy.uint32(w), numpy.uint32(h)) class SillyTranspose(NaiveTranspose): def __call__(self, queue, tgt, src, shape): w, h = shape assert w % block_size == 0 assert h % block_size == 0 return self.kernel(queue, (w, h), None, tgt, src, numpy.uint32(w), numpy.uint32(h)) class TransposeWithLocal: def __init__(self, ctx): self.kernel = cl.Program(ctx, """ #define BLOCK_SIZE %(block_size)d #define A_BLOCK_STRIDE (BLOCK_SIZE * a_width) #define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height) __kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, BLOCK_SIZE, 1))) void transpose( __global float *a_t, __global float *a, unsigned a_width, unsigned a_height, __local float *a_local) { int base_idx_a = get_group_id(0) * BLOCK_SIZE + get_group_id(1) * A_BLOCK_STRIDE; int base_idx_a_t = get_group_id(1) * BLOCK_SIZE + get_group_id(0) * A_T_BLOCK_STRIDE; int 
glob_idx_a = base_idx_a + get_local_id(0) + a_width * get_local_id(1); int glob_idx_a_t = base_idx_a_t + get_local_id(0) + a_height * get_local_id(1); a_local[get_local_id(1)*BLOCK_SIZE+get_local_id(0)] = a[glob_idx_a]; barrier(CLK_LOCAL_MEM_FENCE); a_t[glob_idx_a_t] = a_local[get_local_id(0)*BLOCK_SIZE+get_local_id(1)]; } """% {"block_size": block_size}).build().transpose def __call__(self, queue, tgt, src, shape): w, h = shape assert w % block_size == 0 assert h % block_size == 0 return self.kernel(queue, (w, h), (block_size, block_size), tgt, src, numpy.uint32(w), numpy.uint32(h), cl.LocalMemory(4*block_size*(block_size+1))) def transpose_using_cl(ctx, queue, cpu_src, cls): mf = cl.mem_flags a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cpu_src) a_t_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size=cpu_src.nbytes) cls(ctx)(queue, a_t_buf, a_buf, cpu_src.shape) w, h = cpu_src.shape result = numpy.empty((h, w), dtype=cpu_src.dtype) cl.enqueue_read_buffer(queue, a_t_buf, result).wait() a_buf.release() a_t_buf.release() return result def check_transpose(): for cls in [NaiveTranspose, SillyTranspose, TransposeWithLocal]: print("checking", cls.__name__) ctx = cl.create_some_context() for dev in ctx.devices: assert dev.local_mem_size > 0 queue = cl.CommandQueue(ctx) for i in numpy.arange(10, 13, 0.125): size = int(((2**i) // 32) * 32) print(size) source = numpy.random.rand(size, size).astype(numpy.float32) result = transpose_using_cl(ctx, queue, source, NaiveTranspose) err = source.T - result err_norm = la.norm(err) assert err_norm == 0, (size, err_norm) def benchmark_transpose(): ctx = cl.create_some_context() for dev in ctx.devices: assert dev.local_mem_size > 0 queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) sizes = [int(((2**i) // 32) * 32) for i in numpy.arange(10, 13, 0.125)] #for i in numpy.arange(10, 10.5, 0.125)] mem_bandwidths = {} methods = [SillyTranspose, NaiveTranspose, TransposeWithLocal] for cls in methods: name = cls.__name__.replace("Transpose", "") mem_bandwidths[cls] = meth_mem_bws = [] for size in sizes: source = numpy.random.rand(size, size).astype(numpy.float32) mf = cl.mem_flags a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=source) a_t_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size=source.nbytes) method = cls(ctx) for i in range(4): method(queue, a_t_buf, a_buf, source.shape) count = 12 events = [] for i in range(count): events.append(method(queue, a_t_buf, a_buf, source.shape)) events[-1].wait() time = sum(evt.profile.end - evt.profile.start for evt in events) mem_bw = 2*source.nbytes*count/(time*1e-9) print("benchmarking", name, size, mem_bw/1e9, "GB/s") meth_mem_bws.append(mem_bw) a_buf.release() a_t_buf.release() from matplotlib.pyplot import clf, plot, title, xlabel, ylabel, \ savefig, legend, grid for i in range(len(methods)): clf() for j in range(i+1): method = methods[j] name = method.__name__.replace("Transpose", "") plot(sizes, numpy.array(mem_bandwidths[method])/1e9, "o-", label=name) xlabel("Matrix width/height $N$") ylabel("Memory Bandwidth [GB/s]") legend(loc="best") grid() savefig("transpose-benchmark-%d.pdf" % i) #check_transpose() benchmark_transpose() pyopencl-2013.2/examples/demo_mandelbrot.py0000644000175000000500000001156012245716340017474 0ustar tomussrc# I found this example for PyCuda here: # http://wiki.tiker.net/PyCuda/Examples/Mandelbrot # # An improved sequential/pure Python code was contributed # by CRVSADER//KY . # # I adapted it for PyOpenCL. Hopefully it is useful to someone. 
# July 2010, HolgerRapp@gmx.net # # Original readme below these lines. # Mandelbrot calculate using GPU, Serial numpy and faster numpy # Use to show the speed difference between CPU and GPU calculations # ian@ianozsvald.com March 2010 # Based on vegaseat's TKinter/numpy example code from 2006 # http://www.daniweb.com/code/snippet216851.html# # with minor changes to move to numpy from the obsolete Numeric import time import numpy as np import pyopencl as cl # You can choose a calculation routine below (calc_fractal), uncomment # one of the three lines to test the three variations # Speed notes are listed in the same place # set width and height of window, more pixels take longer to calculate w = 2048 h = 2048 def calc_fractal_opencl(q, maxiter): ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) output = np.empty(q.shape, dtype=np.uint16) mf = cl.mem_flags q_opencl = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q) output_opencl = cl.Buffer(ctx, mf.WRITE_ONLY, output.nbytes) prg = cl.Program(ctx, """ #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable __kernel void mandelbrot(__global float2 *q, __global ushort *output, ushort const maxiter) { int gid = get_global_id(0); float nreal, real = 0; float imag = 0; output[gid] = 0; for(int curiter = 0; curiter < maxiter; curiter++) { nreal = real*real - imag*imag + q[gid].x; imag = 2* real*imag + q[gid].y; real = nreal; if (real*real + imag*imag > 4.0f) output[gid] = curiter; } } """).build() prg.mandelbrot(queue, output.shape, None, q_opencl, output_opencl, np.uint16(maxiter)) cl.enqueue_copy(queue, output, output_opencl).wait() return output def calc_fractal_serial(q, maxiter): # calculate z using pure python on a numpy array # note that, unlike the other two implementations, # the number of iterations per point is NOT constant z = np.zeros(q.shape, complex) output = np.resize(np.array(0,), q.shape) for i in range(len(q)): for iter in range(maxiter): z[i] = z[i]*z[i] + q[i] if abs(z[i]) > 2.0: output[i] = iter break return output def calc_fractal_numpy(q, maxiter): # calculate z using numpy, this is the original # routine from vegaseat's URL output = np.resize(np.array(0,), q.shape) z = np.zeros(q.shape, np.complex64) for it in range(maxiter): z = z*z + q done = np.greater(abs(z), 2.0) q = np.where(done, 0+0j, q) z = np.where(done, 0+0j, z) output = np.where(done, it, output) return output # choose your calculation routine here by uncommenting one of the options calc_fractal = calc_fractal_opencl # calc_fractal = calc_fractal_serial # calc_fractal = calc_fractal_numpy if __name__ == '__main__': try: import Tkinter as tk except ImportError: # Python 3 import tkinter as tk from PIL import Image, ImageTk class Mandelbrot(object): def __init__(self): # create window self.root = tk.Tk() self.root.title("Mandelbrot Set") self.create_image() self.create_label() # start event loop self.root.mainloop() def draw(self, x1, x2, y1, y2, maxiter=30): # draw the Mandelbrot set, from numpy example xx = np.arange(x1, x2, (x2-x1)/w) yy = np.arange(y2, y1, (y1-y2)/h) * 1j q = np.ravel(xx+yy[:, np.newaxis]).astype(np.complex64) start_main = time.time() output = calc_fractal(q, maxiter) end_main = time.time() secs = end_main - start_main print("Main took", secs) self.mandel = (output.reshape((h, w)) / float(output.max()) * 255.).astype(np.uint8) def create_image(self): """" create the image from the draw() string """ # you can experiment with these x and y ranges self.draw(-2.13, 0.77, -1.3, 1.3) self.im = Image.fromarray(self.mandel) 
self.im.putpalette([i for rgb in ((j, 0, 0) for j in range(255)) for i in rgb]) def create_label(self): # put the image on a label widget self.image = ImageTk.PhotoImage(self.im) self.label = tk.Label(self.root, image=self.image) self.label.pack() # test the class test = Mandelbrot() pyopencl-2013.2/examples/.gitignore0000644000175000000500000000001612245716340015751 0ustar tomussrcwiki-examples pyopencl-2013.2/examples/gl_interop_demo.py0000644000175000000500000000460112245716340017505 0ustar tomussrcfrom OpenGL.GL import * from OpenGL.GLUT import * from OpenGL.raw.GL.VERSION.GL_1_5 import glBufferData as rawGlBufferData import pyopencl as cl n_vertices = 10000 src = """ __kernel void generate_sin(__global float2* a) { int id = get_global_id(0); int n = get_global_size(0); float r = (float)id / (float)n; float x = r * 16.0f * 3.1415f; a[id].x = r * 2.0f - 1.0f; a[id].y = native_sin(x); } """ def initialize(): platform = cl.get_platforms()[0] from pyopencl.tools import get_gl_sharing_context_properties import sys if sys.platform == "darwin": ctx = cl.Context(properties=get_gl_sharing_context_properties(), devices=[]) else: # Some OSs prefer clCreateContextFromType, some prefer # clCreateContext. Try both. try: ctx = cl.Context(properties=[ (cl.context_properties.PLATFORM, platform)] + get_gl_sharing_context_properties()) except: ctx = cl.Context(properties=[ (cl.context_properties.PLATFORM, platform)] + get_gl_sharing_context_properties(), devices = [platform.get_devices()[0]]) glClearColor(1, 1, 1, 1) glColor(0, 0, 1) vbo = glGenBuffers(1) glBindBuffer(GL_ARRAY_BUFFER, vbo) rawGlBufferData(GL_ARRAY_BUFFER, n_vertices * 2 * 4, None, GL_STATIC_DRAW) glEnableClientState(GL_VERTEX_ARRAY) glVertexPointer(2, GL_FLOAT, 0, None) coords_dev = cl.GLBuffer(ctx, cl.mem_flags.READ_WRITE, int(vbo)) prog = cl.Program(ctx, src).build() queue = cl.CommandQueue(ctx) cl.enqueue_acquire_gl_objects(queue, [coords_dev]) prog.generate_sin(queue, (n_vertices,), None, coords_dev) cl.enqueue_release_gl_objects(queue, [coords_dev]) queue.finish() glFlush() def display(): glClear(GL_COLOR_BUFFER_BIT) glDrawArrays(GL_LINE_STRIP, 0, n_vertices) glFlush() def reshape(w, h): glViewport(0, 0, w, h) glMatrixMode(GL_PROJECTION) glLoadIdentity() glMatrixMode(GL_MODELVIEW) if __name__ == '__main__': import sys glutInit(sys.argv) if len(sys.argv) > 1: n_vertices = int(sys.argv[1]) glutInitWindowSize(800, 160) glutInitWindowPosition(0, 0) glutCreateWindow('OpenCL/OpenGL Interop Tutorial: Sin Generator') glutDisplayFunc(display) glutReshapeFunc(reshape) initialize() glutMainLoop() pyopencl-2013.2/examples/demo.py0000644000175000000500000000147512245716340015271 0ustar tomussrcimport pyopencl as cl import numpy import numpy.linalg as la a = numpy.random.rand(50000).astype(numpy.float32) b = numpy.random.rand(50000).astype(numpy.float32) ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) mf = cl.mem_flags a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a) b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b) dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes) prg = cl.Program(ctx, """ __kernel void sum(__global const float *a, __global const float *b, __global float *c) { int gid = get_global_id(0); c[gid] = a[gid] + b[gid]; } """).build() prg.sum(queue, a.shape, None, a_buf, b_buf, dest_buf) a_plus_b = numpy.empty_like(a) cl.enqueue_copy(queue, a_plus_b, dest_buf) print(la.norm(a_plus_b - (a+b)), la.norm(a_plus_b)) 
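# ---------------------------------------------------------------------------
# Optional sketch (not part of the original demo): the same build step,
# wrapped so that the device build log is printed if compilation fails.
# The get_build_info()/program_build_info.LOG calls follow the pattern used
# in examples/narray.py elsewhere in this distribution; KERNEL_SRC is simply
# a name for the kernel source string used above.
KERNEL_SRC = """
__kernel void sum(__global const float *a, __global const float *b,
                  __global float *c)
{
  int gid = get_global_id(0);
  c[gid] = a[gid] + b[gid];
}
"""

prg = cl.Program(ctx, KERNEL_SRC)
try:
    prg.build()
except:
    print("Build failed. Build log:")
    print(prg.get_build_info(ctx.devices[0], cl.program_build_info.LOG))
    raise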
pyopencl-2013.2/examples/demo_elementwise_complex.py0000644000175000000500000000332112245716340021411 0ustar tomussrcimport pyopencl as cl import pyopencl.array as cl_array import numpy import numpy.linalg as la ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) n = 10 a_gpu = cl_array.to_device(queue, ( numpy.random.randn(n) + 1j*numpy.random.randn(n) ).astype(numpy.complex64)) b_gpu = cl_array.to_device(queue, ( numpy.random.randn(n) + 1j*numpy.random.randn(n) ).astype(numpy.complex64)) from pyopencl.elementwise import ElementwiseKernel complex_prod = ElementwiseKernel(ctx, "float a, " "float2 *x, " "float2 *y, " "float2 *z", "z[i] = a * complex_mul(x[i], y[i])", "complex_prod", preamble=""" #define complex_ctr(x, y) (float2)(x, y) #define complex_mul(a, b) complex_ctr(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)) #define complex_div_scalar(a, b) complex_ctr((a).x / (b), (a).y / (b)) #define conj(a) complex_ctr((a).x, -(a).y) #define conj_transp(a) complex_ctr(-(a).y, (a).x) #define conj_transp_and_mul(a, b) complex_ctr(-(a).y * (b), (a).x * (b)) """) complex_add = ElementwiseKernel(ctx, "float2 *x, " "float2 *y, " "float2 *z", "z[i] = x[i] + y[i]", "complex_add") real_part = ElementwiseKernel(ctx, "float2 *x, float *z", "z[i] = x[i].x", "real_part") c_gpu = cl_array.empty_like(a_gpu) complex_prod(5, a_gpu, b_gpu, c_gpu) c_gpu_real = cl_array.empty(queue, len(a_gpu), dtype=numpy.float32) real_part(c_gpu, c_gpu_real) print c_gpu.get().real - c_gpu_real.get() print la.norm(c_gpu.get() - (5*a_gpu.get()*b_gpu.get())) assert la.norm(c_gpu.get() - (5*a_gpu.get()*b_gpu.get())) < 1e-5 pyopencl-2013.2/examples/gl_particle_animation.py0000644000175000000500000001417612245716340020673 0ustar tomussrc# Visualization of particles with gravity # Source: http://enja.org/2010/08/27/adventures-in-opencl-part-2-particles-with-opengl/ import pyopencl as cl # OpenCL - GPU computing interface mf = cl.mem_flags from pyopencl.tools import get_gl_sharing_context_properties from OpenGL.GL import * # OpenGL - GPU rendering interface from OpenGL.GLU import * # OpenGL tools (mipmaps, NURBS, perspective projection, shapes) from OpenGL.GLUT import * # OpenGL tool to make a visualization window from OpenGL.arrays import vbo import numpy # Number tools import sys # System tools (path, modules, maxint) width = 800 height = 600 num_particles = 100000 time_step = .005 mouse_down = False mouse_old = {'x': 0., 'y': 0.} rotate = {'x': 0., 'y': 0., 'z': 0.} translate = {'x': 0., 'y': 0., 'z': 0.} initial_translate = {'x': 0., 'y': 0., 'z': -2.5} def glut_window(): glutInit(sys.argv) glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE | GLUT_DEPTH) glutInitWindowSize(width, height) glutInitWindowPosition(0, 0) window = glutCreateWindow("Particle Simulation") glutDisplayFunc(on_display) # Called by GLUT every frame glutKeyboardFunc(on_key) glutMouseFunc(on_click) glutMotionFunc(on_mouse_move) glutTimerFunc(10, on_timer, 10) # Call draw every 30 ms glViewport(0, 0, width, height) glMatrixMode(GL_PROJECTION) glLoadIdentity() gluPerspective(60., width / float(height), .1, 1000.) return(window) def initial_buffers(num_particles): np_position = numpy.ndarray((num_particles, 4), dtype=numpy.float32) np_color = numpy.ndarray((num_particles, 4), dtype=numpy.float32) np_velocity = numpy.ndarray((num_particles, 4), dtype=numpy.float32) np_position[:,0] = numpy.sin(numpy.arange(0., num_particles) * 2.001 * numpy.pi / num_particles) np_position[:,0] *= numpy.random.random_sample((num_particles,)) / 3. 
+ .2 np_position[:,1] = numpy.cos(numpy.arange(0., num_particles) * 2.001 * numpy.pi / num_particles) np_position[:,1] *= numpy.random.random_sample((num_particles,)) / 3. + .2 np_position[:,2] = 0. np_position[:,3] = 1. np_color[:,:] = [1.,1.,1.,1.] # White particles np_velocity[:,0] = np_position[:,0] * 2. np_velocity[:,1] = np_position[:,1] * 2. np_velocity[:,2] = 3. np_velocity[:,3] = numpy.random.random_sample((num_particles, )) gl_position = vbo.VBO(data=np_position, usage=GL_DYNAMIC_DRAW, target=GL_ARRAY_BUFFER) gl_position.bind() gl_color = vbo.VBO(data=np_color, usage=GL_DYNAMIC_DRAW, target=GL_ARRAY_BUFFER) gl_color.bind() return (np_position, np_velocity, gl_position, gl_color) def on_timer(t): glutTimerFunc(t, on_timer, t) glutPostRedisplay() def on_key(*args): if args[0] == '\033' or args[0] == 'q': sys.exit() def on_click(button, state, x, y): mouse_old['x'] = x mouse_old['y'] = y def on_mouse_move(x, y): rotate['x'] += (y - mouse_old['y']) * .2 rotate['y'] += (x - mouse_old['x']) * .2 mouse_old['x'] = x mouse_old['y'] = y def on_display(): """Render the particles""" # Update or particle positions by calling the OpenCL kernel cl.enqueue_acquire_gl_objects(queue, [cl_gl_position, cl_gl_color]) kernelargs = (cl_gl_position, cl_gl_color, cl_velocity, cl_start_position, cl_start_velocity, numpy.float32(time_step)) program.particle_fountain(queue, (num_particles,), None, *(kernelargs)) cl.enqueue_release_gl_objects(queue, [cl_gl_position, cl_gl_color]) queue.finish() glFlush() glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT) glMatrixMode(GL_MODELVIEW) glLoadIdentity() # Handle mouse transformations glTranslatef(initial_translate['x'], initial_translate['y'], initial_translate['z']) glRotatef(rotate['x'], 1, 0, 0) glRotatef(rotate['y'], 0, 1, 0) #we switched around the axis so make this rotate_z glTranslatef(translate['x'], translate['y'], translate['z']) # Render the particles glEnable(GL_POINT_SMOOTH) glPointSize(2) glEnable(GL_BLEND) glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA) # Set up the VBOs gl_color.bind() glColorPointer(4, GL_FLOAT, 0, gl_color) gl_position.bind() glVertexPointer(4, GL_FLOAT, 0, gl_position) glEnableClientState(GL_VERTEX_ARRAY) glEnableClientState(GL_COLOR_ARRAY) # Draw the VBOs glDrawArrays(GL_POINTS, 0, num_particles) glDisableClientState(GL_COLOR_ARRAY) glDisableClientState(GL_VERTEX_ARRAY) glDisable(GL_BLEND) glutSwapBuffers() window = glut_window() (np_position, np_velocity, gl_position, gl_color) = initial_buffers(num_particles) platform = cl.get_platforms()[0] context = cl.Context(properties=[(cl.context_properties.PLATFORM, platform)] + get_gl_sharing_context_properties()) queue = cl.CommandQueue(context) cl_velocity = cl.Buffer(context, mf.COPY_HOST_PTR, hostbuf=np_velocity) cl_start_position = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np_position) cl_start_velocity = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np_velocity) cl_gl_position = cl.GLBuffer(context, mf.READ_WRITE, int(gl_position.buffers[0])) cl_gl_color = cl.GLBuffer(context, mf.READ_WRITE, int(gl_color.buffers[0])) kernel = """__kernel void particle_fountain(__global float4* position, __global float4* color, __global float4* velocity, __global float4* start_position, __global float4* start_velocity, float time_step) { unsigned int i = get_global_id(0); float4 p = position[i]; float4 v = velocity[i]; float life = velocity[i].w; life -= time_step; if (life <= 0.f) { p = start_position[i]; v = start_velocity[i]; life = 1.0f; } v.z -= 
9.8f*time_step; p.x += v.x*time_step; p.y += v.y*time_step; p.z += v.z*time_step; v.w = life; position[i] = p; velocity[i] = v; color[i].w = life; /* Fade points as life decreases */ }""" program = cl.Program(context, kernel).build() glutMainLoop()pyopencl-2013.2/examples/dump-properties.py0000644000175000000500000000625412245716340017504 0ustar tomussrcimport pyopencl as cl from optparse import OptionParser parser = OptionParser() parser.add_option("-s", "--short", action="store_true", help="don't print all device properties") (options, args) = parser.parse_args() def print_info(obj, info_cls): for info_name in sorted(dir(info_cls)): if not info_name.startswith("_") and info_name != "to_string": info = getattr(info_cls, info_name) try: info_value = obj.get_info(info) except: info_value = "" if (info_cls == cl.device_info and info_name == "PARTITION_TYPES_EXT" and isinstance(info_value, list)): print("%s: %s" % (info_name, [ cl.device_partition_property_ext.to_string(v, "") for v in info_value])) else: try: print("%s: %s" % (info_name, info_value)) except: print("%s: ") % info_name for platform in cl.get_platforms(): print(75*"=") print(platform) print(75*"=") if not options.short: print_info(platform, cl.platform_info) for device in platform.get_devices(): if not options.short: print(75*"-") print(device) if not options.short: print(75*"-") print_info(device, cl.device_info) ctx = cl.Context([device]) for mf in [ cl.mem_flags.READ_ONLY, #cl.mem_flags.READ_WRITE, #cl.mem_flags.WRITE_ONLY ]: for itype in [ cl.mem_object_type.IMAGE2D, cl.mem_object_type.IMAGE3D ]: try: formats = cl.get_supported_image_formats(ctx, mf, itype) except: formats = "" else: def str_chd_type(chdtype): result = cl.channel_type.to_string(chdtype, "") result = result.replace("_INT", "") result = result.replace("UNSIGNED", "U") result = result.replace("SIGNED", "S") result = result.replace("NORM", "N") result = result.replace("FLOAT", "F") return result formats = ", ".join( "%s-%s" % ( cl.channel_order.to_string(iform.channel_order, ""), str_chd_type(iform.channel_data_type)) for iform in formats) print "%s %s FORMATS: %s\n" % ( cl.mem_object_type.to_string(itype), cl.mem_flags.to_string(mf), formats) del ctx pyopencl-2013.2/examples/demo-struct-reduce.py0000644000175000000500000000343112245716340020052 0ustar tomussrcimport numpy as np import pyopencl as cl def make_collector_dtype(device): dtype = np.dtype([ ("cur_min", np.int32), ("cur_max", np.int32), ("pad", np.int32), ]) name = "minmax_collector" from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct dtype, c_decl = match_dtype_to_c_struct(device, name, dtype) dtype = get_or_register_dtype(name, dtype) return dtype, c_decl ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) mmc_dtype, mmc_c_decl = make_collector_dtype(ctx.devices[0]) preamble = mmc_c_decl + r"""//CL// minmax_collector mmc_neutral() { // FIXME: needs infinity literal in real use, ok here minmax_collector result; result.cur_min = 1<<30; result.cur_max = -(1<<30); return result; } minmax_collector mmc_from_scalar(float x) { minmax_collector result; result.cur_min = x; result.cur_max = x; return result; } minmax_collector agg_mmc(minmax_collector a, minmax_collector b) { minmax_collector result = a; if (b.cur_min < result.cur_min) result.cur_min = b.cur_min; if (b.cur_max > result.cur_max) result.cur_max = b.cur_max; return result; } """ from pyopencl.clrandom import rand as clrand a_gpu = clrand(queue, (20000,), dtype=np.int32, a=0, b=10**6) a = a_gpu.get() from 
pyopencl.reduction import ReductionKernel red = ReductionKernel(ctx, mmc_dtype, neutral="mmc_neutral()", reduce_expr="agg_mmc(a, b)", map_expr="mmc_from_scalar(x[i])", arguments="__global int *x", preamble=preamble) minmax = red(a_gpu).get() assert abs(minmax["cur_min"] - np.min(a)) < 1e-5 assert abs(minmax["cur_max"] - np.max(a)) < 1e-5 pyopencl-2013.2/examples/benchmark.py0000644000175000000500000001011012245716340016261 0ustar tomussrc# example provided by Roger Pau Monn'e from __future__ import print_function import pyopencl as cl import numpy import numpy.linalg as la import datetime from time import time data_points = 2**23 # ~8 million data points, ~32 MB data workers = 2**8 # 256 workers, play with this to see performance differences # eg: 2**0 => 1 worker will be non-parallel execution on gpu # data points must be a multiple of workers a = numpy.random.rand(data_points).astype(numpy.float32) b = numpy.random.rand(data_points).astype(numpy.float32) c_result = numpy.empty_like(a) # Speed in normal CPU usage time1 = time() c_temp = (a+b) # adds each element in a to its corresponding element in b c_temp = c_temp * c_temp # element-wise multiplication c_result = c_temp * (a/2.0) # element-wise half a and multiply time2 = time() print("Execution time of test without OpenCL: ", time2 - time1, "s") for platform in cl.get_platforms(): for device in platform.get_devices(): print("===============================================================") print("Platform name:", platform.name) print("Platform profile:", platform.profile) print("Platform vendor:", platform.vendor) print("Platform version:", platform.version) print("---------------------------------------------------------------") print("Device name:", device.name) print("Device type:", cl.device_type.to_string(device.type)) print("Device memory: ", device.global_mem_size//1024//1024, 'MB') print("Device max clock speed:", device.max_clock_frequency, 'MHz') print("Device compute units:", device.max_compute_units) print("Device max work group size:", device.max_work_group_size) print("Device max work item sizes:", device.max_work_item_sizes) # Simnple speed test ctx = cl.Context([device]) queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) mf = cl.mem_flags a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a) b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b) dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes) prg = cl.Program(ctx, """ __kernel void sum(__global const float *a, __global const float *b, __global float *c) { int gid = get_global_id(0); float a_temp; float b_temp; float c_temp; a_temp = a[gid]; // my a element (by global ref) b_temp = b[gid]; // my b element (by global ref) c_temp = a_temp+b_temp; // sum of my elements c_temp = c_temp * c_temp; // product of sums c_temp = c_temp * (a_temp/2.0); // times 1/2 my a c[gid] = c_temp; // store result in global memory } """).build() global_size=(data_points,) local_size=(workers,) preferred_multiple = cl.Kernel(prg, 'sum').get_work_group_info( \ cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, \ device) print("Data points:", data_points) print("Workers:", workers) print("Preferred work group size multiple:", preferred_multiple) if (workers % preferred_multiple): print("Number of workers not a preferred multiple (%d*N)." 
\ % (preferred_multiple)) print("Performance may be reduced.") exec_evt = prg.sum(queue, global_size, local_size, a_buf, b_buf, dest_buf) exec_evt.wait() elapsed = 1e-9*(exec_evt.profile.end - exec_evt.profile.start) print("Execution time of test: %g s" % elapsed) c = numpy.empty_like(a) cl.enqueue_read_buffer(queue, dest_buf, c).wait() equal = numpy.all( c == c_result) if not equal: print("Results doesn't match!!") else: print("Results OK") pyopencl-2013.2/examples/demo_array.py0000644000175000000500000000130512245716340016457 0ustar tomussrcimport pyopencl as cl import pyopencl.array as cl_array import numpy import numpy.linalg as la a = numpy.random.rand(50000).astype(numpy.float32) b = numpy.random.rand(50000).astype(numpy.float32) ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) a_dev = cl_array.to_device(queue, a) b_dev = cl_array.to_device(queue, b) dest_dev = cl_array.empty_like(a_dev) prg = cl.Program(ctx, """ __kernel void sum(__global const float *a, __global const float *b, __global float *c) { int gid = get_global_id(0); c[gid] = a[gid] + b[gid]; } """).build() prg.sum(queue, a.shape, None, a_dev.data, b_dev.data, dest_dev.data) print(la.norm((dest_dev - (a_dev+b_dev)).get())) pyopencl-2013.2/examples/narray.py0000644000175000000500000000127712245716340015641 0ustar tomussrc# example by Roger Pau Monn'e import pyopencl as cl import numpy as np demo_r = np.empty( (500,5), dtype=np.uint32) ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) mf = cl.mem_flags demo_buf = cl.Buffer(ctx, mf.WRITE_ONLY, demo_r.nbytes) prg = cl.Program(ctx, """ __kernel void demo(__global uint *demo) { int i; int gid = get_global_id(0); for(i=0; i<5;i++) { demo[gid*5+i] = (uint) 1; } }""") try: prg.build() except: print("Error:") print(prg.get_build_info(ctx.devices[0], cl.program_build_info.LOG)) raise prg.demo(queue, (500,), None, demo_buf) cl.enqueue_read_buffer(queue, demo_buf, demo_r).wait() for res in demo_r: print(res) pyopencl-2013.2/examples/demo_elementwise.py0000644000175000000500000000127412245716340017667 0ustar tomussrcimport pyopencl as cl import pyopencl.array as cl_array import numpy ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) n = 10 a_gpu = cl_array.to_device( queue, numpy.random.randn(n).astype(numpy.float32)) b_gpu = cl_array.to_device( queue, numpy.random.randn(n).astype(numpy.float32)) from pyopencl.elementwise import ElementwiseKernel lin_comb = ElementwiseKernel(ctx, "float a, float *x, " "float b, float *y, " "float *z", "z[i] = a*x[i] + b*y[i]", "linear_combination") c_gpu = cl_array.empty_like(a_gpu) lin_comb(5, a_gpu, 6, b_gpu, c_gpu) import numpy.linalg as la assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5 pyopencl-2013.2/examples/demo_meta_codepy.py0000644000175000000500000000330312245716340017632 0ustar tomussrcimport pyopencl as cl import numpy import numpy.linalg as la local_size = 256 thread_strides = 32 macroblock_count = 33 dtype = numpy.float32 total_size = local_size*thread_strides*macroblock_count ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) a = numpy.random.randn(total_size).astype(dtype) b = numpy.random.randn(total_size).astype(dtype) mf = cl.mem_flags a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a) b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b) c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes) from codepy.cgen import FunctionBody, \ FunctionDeclaration, Typedef, POD, Value, \ Pointer, Module, Block, Initializer, Assign, Const from codepy.cgen.opencl import 
CLKernel, CLGlobal, \ CLRequiredWorkGroupSize mod = Module([ FunctionBody( CLKernel(CLRequiredWorkGroupSize((local_size,), FunctionDeclaration( Value("void", "add"), arg_decls=[CLGlobal(Pointer(Const(POD(dtype, name)))) for name in ["tgt", "op1", "op2"]]))), Block([ Initializer(POD(numpy.int32, "idx"), "get_local_id(0) + %d * get_group_id(0)" % (local_size*thread_strides)) ]+[ Assign( "tgt[idx+%d]" % (o*local_size), "op1[idx+%d] + op2[idx+%d]" % ( o*local_size, o*local_size)) for o in range(thread_strides)]))]) knl = cl.Program(ctx, str(mod)).build().add knl(queue, (local_size*macroblock_count,), (local_size,), c_buf, a_buf, b_buf) c = numpy.empty_like(a) cl.enqueue_read_buffer(queue, c_buf, c).wait() assert la.norm(c-(a+b)) == 0 pyopencl-2013.2/examples/dump-performance.py0000644000175000000500000000223012245716340017577 0ustar tomussrcfrom __future__ import division import pyopencl as cl import pyopencl.characterize.performance as perf def main(): ctx = cl.create_some_context() prof_overhead, latency = perf.get_profiling_overhead(ctx) print "command latency: %g s" % latency print "profiling overhead: %g s -> %.1f %%" % ( prof_overhead, 100*prof_overhead/latency) queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) print "empty kernel: %g s" % perf.get_empty_kernel_time(queue) print "float32 add: %g GOps/s" % (perf.get_add_rate(queue)/1e9) for tx_type in [ perf.HostToDeviceTransfer, perf.DeviceToHostTransfer, perf.DeviceToDeviceTransfer]: print "----------------------------------------" print tx_type.__name__ print "----------------------------------------" print "latency: %g s" % perf.transfer_latency(queue, tx_type) for i in range(6, 28, 2): bs = 1< tgt[idx + ${ offset }] = op1[idx + ${ offset }] + op2[idx + ${ offset } ]; % endfor }""") rendered_tpl = tpl.render(type_name="float", local_size=local_size, thread_strides=thread_strides) knl = cl.Program(ctx, str(rendered_tpl)).build().add knl(queue, (local_size*macroblock_count,), (local_size,), c_buf, a_buf, b_buf) c = numpy.empty_like(a) cl.enqueue_read_buffer(queue, c_buf, c).wait() assert la.norm(c-(a+b)) == 0 pyopencl-2013.2/README.rst0000644000175000000500000000271312245716342013642 0ustar tomussrcPyOpenCL lets you access GPUs and other massively parallel compute devices from Python. It tries to offer computing goodness in the spirit of its sister project `PyCUDA `_: * Object cleanup tied to lifetime of objects. This idiom, often called `RAII `_ in C++, makes it much easier to write correct, leak- and crash-free code. * Completeness. PyOpenCL puts the full power of OpenCL's API at your disposal, if you wish. Every obscure `get_info()` query and all CL calls are accessible. * Automatic Error Checking. All CL errors are automatically translated into Python exceptions. * Speed. PyOpenCL's base layer is written in C++, so all the niceties above are virtually free. * Helpful and complete `Documentation `_ as well as a `Wiki `_. * Liberal license. PyOpenCL is open-source under the `MIT license `_ and free for commercial, academic, and private use. * Broad support. PyOpenCL was tested and works with Apple's, AMD's, and Nvidia's CL implementations. .. image:: https://badge.fury.io/py/pyopencl.png :target: http://pypi.python.org/pypi/pyopencl To use PyOpenCL, you just need `numpy `_ and an OpenCL implementation. (See this `howto `_ for how to get one.) 
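A minimal example (a condensed copy of ``examples/demo.py`` from this
distribution) shows the basic workflow: create a context and queue, move
data to the device, build a kernel, run it, and copy the result back::

    import numpy
    import pyopencl as cl

    a = numpy.random.rand(50000).astype(numpy.float32)
    b = numpy.random.rand(50000).astype(numpy.float32)

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    mf = cl.mem_flags
    a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
    b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
    dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

    prg = cl.Program(ctx, """
        __kernel void sum(__global const float *a,
                          __global const float *b,
                          __global float *c)
        {
          int gid = get_global_id(0);
          c[gid] = a[gid] + b[gid];
        }
        """).build()

    prg.sum(queue, a.shape, None, a_buf, b_buf, dest_buf)

    a_plus_b = numpy.empty_like(a)
    cl.enqueue_copy(queue, a_plus_b, dest_buf)

    assert numpy.allclose(a_plus_b, a + b)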
pyopencl-2013.2/test/0002755000175000000500000000000012245716340013127 5ustar tomussrcpyopencl-2013.2/test/empty-header.h0000644000175000000500000000003312245716340015656 0ustar tomussrc/* what did you expect? */ pyopencl-2013.2/test/test_algorithm.py0000644000175000000500000005735712245716340016545 0ustar tomussrc#! /usr/bin/env python from __future__ import division, with_statement __copyright__ = "Copyright (C) 2013 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import numpy as np import numpy.linalg as la import sys from pytools import memoize from test_array import general_clrand import pytest import pyopencl as cl import pyopencl.array as cl_array # noqa from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) from pyopencl.characterize import has_double_support from pyopencl.scan import InclusiveScanKernel, ExclusiveScanKernel # {{{ elementwise def test_elwise_kernel(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) from pyopencl.clrandom import rand as clrand a_gpu = clrand(queue, (50,), np.float32) b_gpu = clrand(queue, (50,), np.float32) from pyopencl.elementwise import ElementwiseKernel lin_comb = ElementwiseKernel(context, "float a, float *x, float b, float *y, float *z", "z[i] = a*x[i] + b*y[i]", "linear_combination") c_gpu = cl_array.empty_like(a_gpu) lin_comb(5, a_gpu, 6, b_gpu, c_gpu) assert la.norm((c_gpu - (5 * a_gpu + 6 * b_gpu)).get()) < 1e-5 def test_elwise_kernel_with_options(ctx_factory): from pyopencl.clrandom import rand as clrand from pyopencl.elementwise import ElementwiseKernel context = ctx_factory() queue = cl.CommandQueue(context) in_gpu = clrand(queue, (50,), np.float32) options = ['-D', 'ADD_ONE'] add_one = ElementwiseKernel( context, "float* out, const float *in", """ out[i] = in[i] #ifdef ADD_ONE +1 #endif ; """, options=options, ) out_gpu = cl_array.empty_like(in_gpu) add_one(out_gpu, in_gpu) gt = in_gpu.get() + 1 gv = out_gpu.get() assert la.norm(gv - gt) < 1e-5 def test_ranged_elwise_kernel(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) from pyopencl.elementwise import ElementwiseKernel set_to_seven = ElementwiseKernel(context, "float *z", "z[i] = 7", "set_to_seven") for i, slc in enumerate([ slice(5, 20000), slice(5, 20000, 17), slice(3000, 5, -1), slice(1000, -1), ]): a_gpu = cl_array.zeros(queue, (50000,), dtype=np.float32) a_cpu = np.zeros(a_gpu.shape, a_gpu.dtype) a_cpu[slc] = 7 set_to_seven(a_gpu, slice=slc) assert (a_cpu == a_gpu.get()).all() def 
test_take(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) idx = cl_array.arange(queue, 0, 200000, 2, dtype=np.uint32) a = cl_array.arange(queue, 0, 600000, 3, dtype=np.float32) result = cl_array.take(a, idx) assert ((3 * idx).get() == result.get()).all() def test_arange(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) n = 5000 a = cl_array.arange(queue, n, dtype=np.float32) assert (np.arange(n, dtype=np.float32) == a.get()).all() def test_reverse(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) n = 5000 a = np.arange(n).astype(np.float32) a_gpu = cl_array.to_device(queue, a) a_gpu = a_gpu.reverse() assert (a[::-1] == a_gpu.get()).all() def test_if_positive(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) from pyopencl.clrandom import rand as clrand l = 20000 a_gpu = clrand(queue, (l,), np.float32) b_gpu = clrand(queue, (l,), np.float32) a = a_gpu.get() b = b_gpu.get() max_a_b_gpu = cl_array.maximum(a_gpu, b_gpu) min_a_b_gpu = cl_array.minimum(a_gpu, b_gpu) print(max_a_b_gpu) print(np.maximum(a, b)) assert la.norm(max_a_b_gpu.get() - np.maximum(a, b)) == 0 assert la.norm(min_a_b_gpu.get() - np.minimum(a, b)) == 0 def test_take_put(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) for n in [5, 17, 333]: one_field_size = 8 buf_gpu = cl_array.zeros(queue, n * one_field_size, dtype=np.float32) dest_indices = cl_array.to_device(queue, np.array([0, 1, 2, 3, 32, 33, 34, 35], dtype=np.uint32)) read_map = cl_array.to_device(queue, np.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=np.uint32)) cl_array.multi_take_put( arrays=[buf_gpu for i in range(n)], dest_indices=dest_indices, src_indices=read_map, src_offsets=[i * one_field_size for i in range(n)], dest_shape=(96,)) def test_astype(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) from pyopencl.clrandom import rand as clrand if not has_double_support(context.devices[0]): from pytest import skip skip("double precision not supported on %s" % context.devices[0]) a_gpu = clrand(queue, (2000,), dtype=np.float32) a = a_gpu.get().astype(np.float64) a2 = a_gpu.astype(np.float64).get() assert a2.dtype == np.float64 assert la.norm(a - a2) == 0, (a, a2) a_gpu = clrand(queue, (2000,), dtype=np.float64) a = a_gpu.get().astype(np.float32) a2 = a_gpu.astype(np.float32).get() assert a2.dtype == np.float32 assert la.norm(a - a2) / la.norm(a) < 1e-7 # }}} # {{{ reduction def test_sum(ctx_factory): from pytest import importorskip importorskip("mako") context = ctx_factory() queue = cl.CommandQueue(context) n = 200000 for dtype in [np.float32, np.complex64]: a_gpu = general_clrand(queue, (n,), dtype) a = a_gpu.get() for slc in [ slice(None), slice(1000, 3000), slice(1000, -3000), slice(1000, None), ]: sum_a = np.sum(a[slc]) sum_a_gpu = cl_array.sum(a_gpu[slc]).get() assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4 def test_minmax(ctx_factory): from pytest import importorskip importorskip("mako") context = ctx_factory() queue = cl.CommandQueue(context) from pyopencl.clrandom import rand as clrand if has_double_support(context.devices[0]): dtypes = [np.float64, np.float32, np.int32] else: dtypes = [np.float32, np.int32] for what in ["min", "max"]: for dtype in dtypes: a_gpu = clrand(queue, (200000,), dtype) a = a_gpu.get() op_a = getattr(np, what)(a) op_a_gpu = getattr(cl_array, what)(a_gpu).get() assert op_a_gpu == op_a, (op_a_gpu, op_a, dtype, what) def test_subset_minmax(ctx_factory): from pytest import importorskip importorskip("mako") context 
= ctx_factory() queue = cl.CommandQueue(context) from pyopencl.clrandom import rand as clrand l_a = 200000 gran = 5 l_m = l_a - l_a // gran + 1 if has_double_support(context.devices[0]): dtypes = [np.float64, np.float32, np.int32] else: dtypes = [np.float32, np.int32] for dtype in dtypes: a_gpu = clrand(queue, (l_a,), dtype) a = a_gpu.get() meaningful_indices_gpu = cl_array.zeros( queue, l_m, dtype=np.int32) meaningful_indices = meaningful_indices_gpu.get() j = 0 for i in range(len(meaningful_indices)): meaningful_indices[i] = j j = j + 1 if j % gran == 0: j = j + 1 meaningful_indices_gpu = cl_array.to_device( queue, meaningful_indices) b = a[meaningful_indices] min_a = np.min(b) min_a_gpu = cl_array.subset_min(meaningful_indices_gpu, a_gpu).get() assert min_a_gpu == min_a def test_dot(ctx_factory): from pytest import importorskip importorskip("mako") context = ctx_factory() queue = cl.CommandQueue(context) dtypes = [np.float32, np.complex64] if has_double_support(context.devices[0]): dtypes.extend([np.float64, np.complex128]) for a_dtype in dtypes: for b_dtype in dtypes: print(a_dtype, b_dtype) a_gpu = general_clrand(queue, (200000,), a_dtype) a = a_gpu.get() b_gpu = general_clrand(queue, (200000,), b_dtype) b = b_gpu.get() dot_ab = np.dot(a, b) dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get() assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4 vdot_ab = np.vdot(a, b) vdot_ab_gpu = cl_array.vdot(a_gpu, b_gpu).get() assert abs(vdot_ab_gpu - vdot_ab) / abs(vdot_ab) < 1e-4 @memoize def make_mmc_dtype(device): dtype = np.dtype([ ("cur_min", np.int32), ("cur_max", np.int32), ("pad", np.int32), ]) name = "minmax_collector" from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct dtype, c_decl = match_dtype_to_c_struct(device, name, dtype) dtype = get_or_register_dtype(name, dtype) return dtype, c_decl def test_struct_reduce(ctx_factory): pytest.importorskip("mako") context = ctx_factory() queue = cl.CommandQueue(context) dev, = context.devices if (dev.vendor == "NVIDIA" and dev.platform.vendor == "Apple" and dev.driver_version == "8.12.47 310.40.00.05f01"): pytest.skip("causes a compiler hang on Apple/Nv GPU") mmc_dtype, mmc_c_decl = make_mmc_dtype(context.devices[0]) preamble = mmc_c_decl + r"""//CL// minmax_collector mmc_neutral() { // FIXME: needs infinity literal in real use, ok here minmax_collector result; result.cur_min = 1<<30; result.cur_max = -(1<<30); return result; } minmax_collector mmc_from_scalar(float x) { minmax_collector result; result.cur_min = x; result.cur_max = x; return result; } minmax_collector agg_mmc(minmax_collector a, minmax_collector b) { minmax_collector result = a; if (b.cur_min < result.cur_min) result.cur_min = b.cur_min; if (b.cur_max > result.cur_max) result.cur_max = b.cur_max; return result; } """ from pyopencl.clrandom import rand as clrand a_gpu = clrand(queue, (20000,), dtype=np.int32, a=0, b=10**6) a = a_gpu.get() from pyopencl.reduction import ReductionKernel red = ReductionKernel(context, mmc_dtype, neutral="mmc_neutral()", reduce_expr="agg_mmc(a, b)", map_expr="mmc_from_scalar(x[i])", arguments="__global int *x", preamble=preamble) minmax = red(a_gpu).get() #print minmax["cur_min"], minmax["cur_max"] #print np.min(a), np.max(a) assert abs(minmax["cur_min"] - np.min(a)) < 1e-5 assert abs(minmax["cur_max"] - np.max(a)) < 1e-5 # }}} # {{{ scan-related def summarize_error(obtained, desired, orig, thresh=1e-5): from pytest import importorskip importorskip("mako") err = obtained - desired ok_count = 0 bad_count = 0 bad_limit = 200 def 
summarize_counts(): if ok_count: entries.append("<%d ok>" % ok_count) if bad_count >= bad_limit: entries.append("<%d more bad>" % (bad_count-bad_limit)) entries = [] for i, val in enumerate(err): if abs(val) > thresh: if ok_count: summarize_counts() ok_count = 0 bad_count += 1 if bad_count < bad_limit: entries.append("%r (want: %r, got: %r, orig: %r)" % ( obtained[i], desired[i], obtained[i], orig[i])) else: if bad_count: summarize_counts() bad_count = 0 ok_count += 1 summarize_counts() return " ".join(entries) scan_test_counts = [ 10, 2 ** 8 - 1, 2 ** 8, 2 ** 8 + 1, 2 ** 10 - 5, 2 ** 10, 2 ** 10 + 5, 2 ** 12 - 5, 2 ** 12, 2 ** 12 + 5, 2 ** 20 - 2 ** 18, 2 ** 20 - 2 ** 18 + 5, 2 ** 20 + 1, 2 ** 20, 2 ** 23 + 3, # larger sizes cause out of memory on low-end AMD APUs ] @pytest.mark.parametrize("dtype", [np.int32, np.int64]) @pytest.mark.parametrize("scan_cls", [InclusiveScanKernel, ExclusiveScanKernel]) def test_scan(ctx_factory, dtype, scan_cls): from pytest import importorskip importorskip("mako") context = ctx_factory() queue = cl.CommandQueue(context) knl = scan_cls(context, dtype, "a+b", "0") for n in scan_test_counts: host_data = np.random.randint(0, 10, n).astype(dtype) dev_data = cl_array.to_device(queue, host_data) # /!\ fails on Nv GT2?? for some drivers assert (host_data == dev_data.get()).all() knl(dev_data) desired_result = np.cumsum(host_data, axis=0) if scan_cls is ExclusiveScanKernel: desired_result -= host_data is_ok = (dev_data.get() == desired_result).all() if 1 and not is_ok: print("something went wrong, summarizing error...") print(summarize_error(dev_data.get(), desired_result, host_data)) print("dtype:%s n:%d %s worked:%s" % (dtype, n, scan_cls, is_ok)) assert is_ok from gc import collect collect() def test_copy_if(ctx_factory): from pytest import importorskip importorskip("mako") context = ctx_factory() queue = cl.CommandQueue(context) from pyopencl.clrandom import rand as clrand for n in scan_test_counts: a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000) a = a_dev.get() from pyopencl.algorithm import copy_if crit = a_dev.dtype.type(300) selected = a[a > crit] selected_dev, count_dev, evt = copy_if( a_dev, "ary[i] > myval", [("myval", crit)]) assert (selected_dev.get()[:count_dev.get()] == selected).all() from gc import collect collect() def test_partition(ctx_factory): from pytest import importorskip importorskip("mako") context = ctx_factory() queue = cl.CommandQueue(context) from pyopencl.clrandom import rand as clrand for n in scan_test_counts: print("part", n) a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000) a = a_dev.get() crit = a_dev.dtype.type(300) true_host = a[a > crit] false_host = a[a <= crit] from pyopencl.algorithm import partition true_dev, false_dev, count_true_dev, evt = partition( a_dev, "ary[i] > myval", [("myval", crit)]) count_true_dev = count_true_dev.get() assert (true_dev.get()[:count_true_dev] == true_host).all() assert (false_dev.get()[:n-count_true_dev] == false_host).all() def test_unique(ctx_factory): from pytest import importorskip importorskip("mako") context = ctx_factory() queue = cl.CommandQueue(context) from pyopencl.clrandom import rand as clrand for n in scan_test_counts: a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000) a = a_dev.get() a = np.sort(a) a_dev = cl_array.to_device(queue, a) a_unique_host = np.unique(a) from pyopencl.algorithm import unique a_unique_dev, count_unique_dev, evt = unique(a_dev) count_unique_dev = count_unique_dev.get() assert (a_unique_dev.get()[:count_unique_dev] == 
a_unique_host).all() from gc import collect collect() def test_index_preservation(ctx_factory): from pytest import importorskip importorskip("mako") context = ctx_factory() queue = cl.CommandQueue(context) from pyopencl.scan import GenericScanKernel, GenericDebugScanKernel classes = [GenericScanKernel] dev = context.devices[0] if dev.type & cl.device_type.CPU: classes.append(GenericDebugScanKernel) for cls in classes: for n in scan_test_counts: knl = cls( context, np.int32, arguments="__global int *out", input_expr="i", scan_expr="b", neutral="0", output_statement=""" out[i] = item; """) out = cl_array.empty(queue, n, dtype=np.int32) knl(out) assert (out.get() == np.arange(n)).all() from gc import collect collect() def test_segmented_scan(ctx_factory): from pytest import importorskip importorskip("mako") context = ctx_factory() queue = cl.CommandQueue(context) from pyopencl.tools import dtype_to_ctype dtype = np.int32 ctype = dtype_to_ctype(dtype) #for is_exclusive in [False, True]: for is_exclusive in [True, False]: if is_exclusive: output_statement = "out[i] = prev_item" else: output_statement = "out[i] = item" from pyopencl.scan import GenericScanKernel knl = GenericScanKernel(context, dtype, arguments="__global %s *ary, __global char *segflags, " "__global %s *out" % (ctype, ctype), input_expr="ary[i]", scan_expr="across_seg_boundary ? b : (a+b)", neutral="0", is_segment_start_expr="segflags[i]", output_statement=output_statement, options=[]) np.set_printoptions(threshold=2000) from random import randrange from pyopencl.clrandom import rand as clrand for n in scan_test_counts: a_dev = clrand(queue, (n,), dtype=dtype, a=0, b=10) a = a_dev.get() if 10 <= n < 20: seg_boundaries_values = [ [0, 9], [0, 3], [4, 6], ] else: seg_boundaries_values = [] for i in range(10): seg_boundary_count = max(2, min(100, randrange(0, int(0.4*n)))) seg_boundaries = [ randrange(n) for i in range(seg_boundary_count)] if n >= 1029: seg_boundaries.insert(0, 1028) seg_boundaries.sort() seg_boundaries_values.append(seg_boundaries) for seg_boundaries in seg_boundaries_values: #print "BOUNDARIES", seg_boundaries #print a seg_boundary_flags = np.zeros(n, dtype=np.uint8) seg_boundary_flags[seg_boundaries] = 1 seg_boundary_flags_dev = cl_array.to_device( queue, seg_boundary_flags) seg_boundaries.insert(0, 0) result_host = a.copy() for i, seg_start in enumerate(seg_boundaries): if i+1 < len(seg_boundaries): seg_end = seg_boundaries[i+1] else: seg_end = None if is_exclusive: result_host[seg_start+1:seg_end] = np.cumsum( a[seg_start:seg_end][:-1]) result_host[seg_start] = 0 else: result_host[seg_start:seg_end] = np.cumsum( a[seg_start:seg_end]) #print "REF", result_host result_dev = cl_array.empty_like(a_dev) knl(a_dev, seg_boundary_flags_dev, result_dev) #print "RES", result_dev is_correct = (result_dev.get() == result_host).all() if not is_correct: diff = result_dev.get() - result_host print("RES-REF", diff) print("ERRWHERE", np.where(diff)) print(n, list(seg_boundaries)) assert is_correct from gc import collect collect() print("%d excl:%s done" % (n, is_exclusive)) def test_sort(ctx_factory): from pytest import importorskip importorskip("mako") context = ctx_factory() queue = cl.CommandQueue(context) dtype = np.int32 from pyopencl.algorithm import RadixSort sort = RadixSort(context, "int *ary", key_expr="ary[i]", sort_arg_names=["ary"]) from pyopencl.clrandom import RanluxGenerator rng = RanluxGenerator(queue, seed=15) from time import time # intermediate arrays for largest size cause out-of-memory on low-end GPUs for 
n in scan_test_counts[:-1]: print(n) print(" rng") a_dev = rng.uniform(queue, (n,), dtype=dtype, a=0, b=2**16) a = a_dev.get() dev_start = time() print(" device") (a_dev_sorted,), evt = sort(a_dev, key_bits=16) queue.finish() dev_end = time() print(" numpy") a_sorted = np.sort(a) numpy_end = time() numpy_elapsed = numpy_end-dev_end dev_elapsed = dev_end-dev_start print (" dev: %.2f MKeys/s numpy: %.2f MKeys/s ratio: %.2fx" % ( 1e-6*n/dev_elapsed, 1e-6*n/numpy_elapsed, numpy_elapsed/dev_elapsed)) assert (a_dev_sorted.get() == a_sorted).all() def test_list_builder(ctx_factory): from pytest import importorskip importorskip("mako") context = ctx_factory() queue = cl.CommandQueue(context) from pyopencl.algorithm import ListOfListsBuilder builder = ListOfListsBuilder(context, [("mylist", np.int32)], """//CL// void generate(LIST_ARG_DECL USER_ARG_DECL index_type i) { int count = i % 4; for (int j = 0; j < count; ++j) { APPEND_mylist(count); } } """, arg_decls=[]) result, evt = builder(queue, 2000) inf = result["mylist"] assert inf.count == 3000 assert (inf.lists.get()[-6:] == [1, 2, 2, 3, 3, 3]).all() def test_key_value_sorter(ctx_factory): from pytest import importorskip importorskip("mako") context = ctx_factory() queue = cl.CommandQueue(context) n = 10**5 nkeys = 2000 from pyopencl.clrandom import rand as clrand keys = clrand(queue, n, np.int32, b=nkeys) values = clrand(queue, n, np.int32, b=n).astype(np.int64) assert np.max(keys.get()) < nkeys from pyopencl.algorithm import KeyValueSorter kvs = KeyValueSorter(context) starts, lists, evt = kvs(queue, keys, values, nkeys, starts_dtype=np.int32) starts = starts.get() lists = lists.get() mydict = dict() for k, v in zip(keys.get(), values.get()): mydict.setdefault(k, []).append(v) for i in range(nkeys): start, end = starts[i:i+2] assert sorted(mydict[i]) == sorted(lists[start:end]) # }}} if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) else: from py.test.cmdline import main main([__file__]) # vim: filetype=pyopencl:fdm=marker pyopencl-2013.2/test/test_wrapper.py0000644000175000000500000004347312245716340016231 0ustar tomussrcfrom __future__ import division __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import numpy as np import numpy.linalg as la import pyopencl as cl import pyopencl.array as cl_array from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) # Are CL implementations crashy? You be the judge. 
:) try: import faulthandler # noqa except ImportError: pass else: faulthandler.enable() def test_get_info(ctx_factory): ctx = ctx_factory() device, = ctx.devices platform = device.platform failure_count = [0] pocl_quirks = [ (cl.Buffer, cl.mem_info.OFFSET), (cl.Program, cl.program_info.KERNEL_NAMES), (cl.Program, cl.program_info.NUM_KERNELS), ] CRASH_QUIRKS = [ (("NVIDIA Corporation", "NVIDIA CUDA", "OpenCL 1.0 CUDA 3.0.1"), [ (cl.Event, cl.event_info.COMMAND_QUEUE), ]), (("The pocl project", "Portable Computing Language", "OpenCL 1.2 pocl 0.8-pre"), pocl_quirks), (("The pocl project", "Portable Computing Language", "OpenCL 1.2 pocl 0.8"), pocl_quirks), (("The pocl project", "Portable Computing Language", "OpenCL 1.2 pocl 0.9-pre"), pocl_quirks), (("Apple", "Apple", "OpenCL 1.2 (Apr 25 2013 18:32:06)"), [ (cl.Program, cl.program_info.SOURCE), ]), ] QUIRKS = [] plat_quirk_key = ( platform.vendor, platform.name, platform.version) def find_quirk(quirk_list, cl_obj, info): for entry_plat_key, quirks in quirk_list: if entry_plat_key == plat_quirk_key: for quirk_cls, quirk_info in quirks: if (isinstance(cl_obj, quirk_cls) and quirk_info == info): return True return False def do_test(cl_obj, info_cls, func=None, try_attr_form=True): if func is None: def func(info): cl_obj.get_info(info) for info_name in dir(info_cls): if not info_name.startswith("_") and info_name != "to_string": print(info_cls, info_name) info = getattr(info_cls, info_name) if find_quirk(CRASH_QUIRKS, cl_obj, info): print("not executing get_info", type(cl_obj), info_name) print("(known crash quirk for %s)" % platform.name) continue try: func(info) except: msg = "failed get_info", type(cl_obj), info_name if find_quirk(QUIRKS, cl_obj, info): msg += ("(known quirk for %s)" % platform.name) else: failure_count[0] += 1 if try_attr_form: try: getattr(cl_obj, info_name.lower()) except: print("failed attr-based get_info", type(cl_obj), info_name) if find_quirk(QUIRKS, cl_obj, info): print("(known quirk for %s)" % platform.name) else: failure_count[0] += 1 do_test(platform, cl.platform_info) do_test(device, cl.device_info) do_test(ctx, cl.context_info) props = 0 if (device.queue_properties & cl.command_queue_properties.PROFILING_ENABLE): profiling = True props = cl.command_queue_properties.PROFILING_ENABLE queue = cl.CommandQueue(ctx, properties=props) do_test(queue, cl.command_queue_info) prg = cl.Program(ctx, """ __kernel void sum(__global float *a) { a[get_global_id(0)] *= 2; } """).build() do_test(prg, cl.program_info) do_test(prg, cl.program_build_info, lambda info: prg.get_build_info(device, info), try_attr_form=False) n = 2000 a_buf = cl.Buffer(ctx, 0, n*4) do_test(a_buf, cl.mem_info) kernel = prg.sum do_test(kernel, cl.kernel_info) evt = kernel(queue, (n,), None, a_buf) do_test(evt, cl.event_info) if profiling: evt.wait() do_test(evt, cl.profiling_info, lambda info: evt.get_profiling_info(info), try_attr_form=False) # crashes on intel... 
if device.image_support and platform.vendor not in [ "Intel(R) Corporation", "The pocl project", ]: smp = cl.Sampler(ctx, False, cl.addressing_mode.CLAMP, cl.filter_mode.NEAREST) do_test(smp, cl.sampler_info) img_format = cl.get_supported_image_formats( ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D)[0] img = cl.Image(ctx, cl.mem_flags.READ_ONLY, img_format, (128, 256)) assert img.shape == (128, 256) img.depth img.image.depth do_test(img, cl.image_info, lambda info: img.get_image_info(info)) def test_int_ptr(ctx_factory): def do_test(obj): new_obj = type(obj).from_int_ptr(obj.int_ptr) assert obj == new_obj assert type(obj) is type(new_obj) ctx = ctx_factory() device, = ctx.devices platform = device.platform do_test(device) do_test(platform) do_test(ctx) queue = cl.CommandQueue(ctx) do_test(queue) evt = cl.enqueue_marker(queue) do_test(evt) prg = cl.Program(ctx, """ __kernel void sum(__global float *a) { a[get_global_id(0)] *= 2; } """).build() do_test(prg) do_test(prg.sum) n = 2000 a_buf = cl.Buffer(ctx, 0, n*4) do_test(a_buf) # crashes on intel... if device.image_support and platform.vendor not in [ "Intel(R) Corporation", "The pocl project", ]: smp = cl.Sampler(ctx, False, cl.addressing_mode.CLAMP, cl.filter_mode.NEAREST) do_test(smp) img_format = cl.get_supported_image_formats( ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D)[0] img = cl.Image(ctx, cl.mem_flags.READ_ONLY, img_format, (128, 256)) do_test(img) def test_invalid_kernel_names_cause_failures(ctx_factory): ctx = ctx_factory() device = ctx.devices[0] prg = cl.Program(ctx, """ __kernel void sum(__global float *a) { a[get_global_id(0)] *= 2; } """).build() if ctx.devices[0].platform.vendor == "The pocl project": # https://bugs.launchpad.net/pocl/+bug/1184464 import pytest pytest.skip("pocl doesn't like invalid kernel names") try: prg.sam raise RuntimeError("invalid kernel name did not cause error") except AttributeError: pass except RuntimeError: if "Intel" in device.platform.vendor: from pytest import xfail xfail("weird exception from OpenCL implementation " "on invalid kernel name--are you using " "Intel's implementation? 
(if so, known bug in Intel CL)") else: raise def test_image_format_constructor(): # doesn't need image support to succeed iform = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT) assert iform.channel_order == cl.channel_order.RGBA assert iform.channel_data_type == cl.channel_type.FLOAT assert not iform.__dict__ def test_nonempty_supported_image_formats(ctx_factory): context = ctx_factory() device = context.devices[0] if device.image_support: assert len(cl.get_supported_image_formats( context, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D)) > 0 else: from pytest import skip skip("images not supported on %s" % device.name) def test_that_python_args_fail(ctx_factory): context = ctx_factory() prg = cl.Program(context, """ __kernel void mult(__global float *a, float b, int c) { a[get_global_id(0)] *= (b+c); } """).build() a = np.random.rand(50000) queue = cl.CommandQueue(context) mf = cl.mem_flags a_buf = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a) knl = cl.Kernel(prg, "mult") try: knl(queue, a.shape, None, a_buf, 2, 3) assert False, "PyOpenCL should not accept bare Python types as arguments" except cl.LogicError: pass try: prg.mult(queue, a.shape, None, a_buf, float(2), 3) assert False, "PyOpenCL should not accept bare Python types as arguments" except cl.LogicError: pass prg.mult(queue, a.shape, None, a_buf, np.float32(2), np.int32(3)) a_result = np.empty_like(a) cl.enqueue_read_buffer(queue, a_buf, a_result).wait() def test_image_2d(ctx_factory): context = ctx_factory() device, = context.devices if not device.image_support: from pytest import skip skip("images not supported on %s" % device) if "Intel" in device.vendor and "31360.31426" in device.version: from pytest import skip skip("images crashy on %s" % device) if "pocl" in device.platform.vendor and ( "0.8" in device.platform.version or "0.9" in device.platform.version ): from pytest import skip skip("images crashy on %s" % device) prg = cl.Program(context, """ __kernel void copy_image( __global float *dest, __read_only image2d_t src, sampler_t samp, int stride0) { int d0 = get_global_id(0); int d1 = get_global_id(1); /* const sampler_t samp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; */ dest[d0*stride0 + d1] = read_imagef(src, samp, (float2)(d1, d0)).x; } """).build() num_channels = 1 a = np.random.rand(1024, 512, num_channels).astype(np.float32) if num_channels == 1: a = a[:, :, 0] queue = cl.CommandQueue(context) try: a_img = cl.image_from_array(context, a, num_channels) except cl.RuntimeError: import sys exc = sys.exc_info()[1] if exc.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED: from pytest import skip skip("required image format not supported on %s" % device.name) else: raise a_dest = cl.Buffer(context, cl.mem_flags.READ_WRITE, a.nbytes) samp = cl.Sampler(context, False, cl.addressing_mode.CLAMP, cl.filter_mode.NEAREST) prg.copy_image(queue, a.shape, None, a_dest, a_img, samp, np.int32(a.strides[0]/a.dtype.itemsize)) a_result = np.empty_like(a) cl.enqueue_copy(queue, a_result, a_dest) good = la.norm(a_result - a) == 0 if not good: if queue.device.type & cl.device_type.CPU: assert good, ("The image implementation on your CPU CL platform '%s' " "returned bad values. This is bad, but common." 
% queue.device.platform) else: assert good def test_image_3d(ctx_factory): #test for image_from_array for 3d image of float2 context = ctx_factory() device, = context.devices if not device.image_support: from pytest import skip skip("images not supported on %s" % device) if device.platform.vendor == "Intel(R) Corporation": from pytest import skip skip("images crashy on %s" % device) prg = cl.Program(context, """ __kernel void copy_image_plane( __global float2 *dest, __read_only image3d_t src, sampler_t samp, int stride0, int stride1) { int d0 = get_global_id(0); int d1 = get_global_id(1); int d2 = get_global_id(2); /* const sampler_t samp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; */ dest[d0*stride0 + d1*stride1 + d2] = read_imagef( src, samp, (float4)(d2, d1, d0, 0)).xy; } """).build() num_channels = 2 shape = (3, 4, 2) a = np.random.random(shape + (num_channels,)).astype(np.float32) queue = cl.CommandQueue(context) try: a_img = cl.image_from_array(context, a, num_channels) except cl.RuntimeError: import sys exc = sys.exc_info()[1] if exc.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED: from pytest import skip skip("required image format not supported on %s" % device.name) else: raise a_dest = cl.Buffer(context, cl.mem_flags.READ_WRITE, a.nbytes) samp = cl.Sampler(context, False, cl.addressing_mode.CLAMP, cl.filter_mode.NEAREST) prg.copy_image_plane(queue, shape, None, a_dest, a_img, samp, np.int32(a.strides[0]/a.itemsize/num_channels), np.int32(a.strides[1]/a.itemsize/num_channels), ) a_result = np.empty_like(a) cl.enqueue_copy(queue, a_result, a_dest) good = la.norm(a_result - a) == 0 if not good: if queue.device.type & cl.device_type.CPU: assert good, ("The image implementation on your CPU CL platform '%s' " "returned bad values. This is bad, but common." 
% queue.device.platform) else: assert good def test_copy_buffer(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) mf = cl.mem_flags a = np.random.rand(50000).astype(np.float32) b = np.empty_like(a) buf1 = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a) buf2 = cl.Buffer(context, mf.WRITE_ONLY, b.nbytes) cl.enqueue_copy_buffer(queue, buf1, buf2).wait() cl.enqueue_read_buffer(queue, buf2, b).wait() assert la.norm(a - b) == 0 def test_mempool(ctx_factory): from pyopencl.tools import MemoryPool, CLAllocator context = ctx_factory() pool = MemoryPool(CLAllocator(context)) queue = [] e0 = 12 for e in range(e0-6, e0-4): for i in range(100): queue.append(pool.allocate(1 << e)) if len(queue) > 10: queue.pop(0) del queue pool.stop_holding() def test_mempool_2(): from pyopencl.tools import MemoryPool from random import randrange for i in range(2000): s = randrange(1 << 31) >> randrange(32) bin_nr = MemoryPool.bin_number(s) asize = MemoryPool.alloc_size(bin_nr) assert asize >= s, s assert MemoryPool.bin_number(asize) == bin_nr, s assert asize < asize*(1+1/8) def test_vector_args(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) prg = cl.Program(context, """ __kernel void set_vec(float4 x, __global float4 *dest) { dest[get_global_id(0)] = x; } """).build() x = cl_array.vec.make_float4(1, 2, 3, 4) dest = np.empty(50000, cl_array.vec.float4) mf = cl.mem_flags dest_buf = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=dest) prg.set_vec(queue, dest.shape, None, x, dest_buf) cl.enqueue_read_buffer(queue, dest_buf, dest).wait() assert (dest == x).all() def test_header_dep_handling(ctx_factory): context = ctx_factory() from os.path import exists assert exists("empty-header.h") # if this fails, change dir to pyopencl/test kernel_src = """ #include kernel void zonk(global int *a) { *a = 5; } """ import os cl.Program(context, kernel_src).build(["-I", os.getcwd()]) cl.Program(context, kernel_src).build(["-I", os.getcwd()]) def test_context_dep_memoize(ctx_factory): context = ctx_factory() from pyopencl.tools import context_dependent_memoize counter = [0] @context_dependent_memoize def do_something(ctx): counter[0] += 1 do_something(context) do_something(context) assert counter[0] == 1 def test_can_build_binary(ctx_factory): ctx = ctx_factory() device, = ctx.devices program = cl.Program(ctx, """ __kernel void simple(__global float *in, __global float *out) { out[get_global_id(0)] = in[get_global_id(0)]; }""") program.build() binary = program.get_info(cl.program_info.BINARIES)[0] foo = cl.Program(ctx, [device], [binary]) foo.build() if __name__ == "__main__": # make sure that import failures get reported, instead of skipping the tests. import pyopencl # noqa import sys if len(sys.argv) > 1: exec(sys.argv[1]) else: from py.test.cmdline import main main([__file__]) pyopencl-2013.2/test/test_array.py0000644000175000000500000004637612245716340015674 0ustar tomussrc#! 
/usr/bin/env python from __future__ import division, with_statement __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import numpy as np import numpy.linalg as la import sys import pyopencl as cl import pyopencl.array as cl_array import pyopencl.tools as cl_tools from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) from pyopencl.characterize import has_double_support # {{{ helpers TO_REAL = { np.dtype(np.complex64): np.float32, np.dtype(np.complex128): np.float64 } def general_clrand(queue, shape, dtype): from pyopencl.clrandom import rand as clrand dtype = np.dtype(dtype) if dtype.kind == "c": real_dtype = dtype.type(0).real.dtype return clrand(queue, shape, real_dtype) + 1j*clrand(queue, shape, real_dtype) else: return clrand(queue, shape, dtype) def make_random_array(queue, dtype, size): from pyopencl.clrandom import rand dtype = np.dtype(dtype) if dtype.kind == "c": real_dtype = TO_REAL[dtype] return (rand(queue, shape=(size,), dtype=real_dtype).astype(dtype) + rand(queue, shape=(size,), dtype=real_dtype).astype(dtype) * dtype.type(1j)) else: return rand(queue, shape=(size,), dtype=dtype) # }}} # {{{ dtype-related def test_basic_complex(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) from pyopencl.clrandom import rand size = 500 ary = (rand(queue, shape=(size,), dtype=np.float32).astype(np.complex64) + rand(queue, shape=(size,), dtype=np.float32).astype(np.complex64) * 1j) c = np.complex64(5+7j) host_ary = ary.get() assert la.norm((ary*c).get() - c*host_ary) < 1e-5 * la.norm(host_ary) def test_mix_complex(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) size = 10 dtypes = [ (np.float32, np.complex64), #(np.int32, np.complex64), ] if has_double_support(context.devices[0]): dtypes.extend([ (np.float32, np.float64), (np.float32, np.complex128), (np.float64, np.complex64), (np.float64, np.complex128), ]) from operator import add, mul, sub, truediv for op in [add, sub, mul, truediv, pow]: for dtype_a0, dtype_b0 in dtypes: for dtype_a, dtype_b in [ (dtype_a0, dtype_b0), (dtype_b0, dtype_a0), ]: for is_scalar_a, is_scalar_b in [ (False, False), (False, True), (True, False), ]: if is_scalar_a: ary_a = make_random_array(queue, dtype_a, 1).get()[0] host_ary_a = ary_a else: ary_a = make_random_array(queue, dtype_a, size) host_ary_a = ary_a.get() if is_scalar_b: ary_b = make_random_array(queue, dtype_b, 1).get()[0] host_ary_b = ary_b else: ary_b = make_random_array(queue, dtype_b, 
size) host_ary_b = ary_b.get() print(op, dtype_a, dtype_b, is_scalar_a, is_scalar_b) dev_result = op(ary_a, ary_b).get() host_result = op(host_ary_a, host_ary_b) if host_result.dtype != dev_result.dtype: # This appears to be a numpy bug, where we get # served a Python complex that is really a # smaller numpy complex. print("HOST_DTYPE: %s DEV_DTYPE: %s" % ( host_result.dtype, dev_result.dtype)) dev_result = dev_result.astype(host_result.dtype) err = la.norm(host_result-dev_result)/la.norm(host_result) print(err) correct = err < 1e-5 if not correct: print(host_result) print(dev_result) print(host_result - dev_result) assert correct def test_pow_neg1_vs_inv(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) device = ctx.devices[0] if not has_double_support(device): from pytest import skip skip("double precision not supported on %s" % device) a_dev = make_random_array(queue, np.complex128, 20000) res1 = (a_dev ** (-1)).get() res2 = (1/a_dev).get() ref = 1/a_dev.get() assert la.norm(res1-ref, np.inf) / la.norm(ref) < 1e-13 assert la.norm(res2-ref, np.inf) / la.norm(ref) < 1e-13 def test_vector_fill(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) a_gpu = cl_array.Array(queue, 100, dtype=cl_array.vec.float4) a_gpu.fill(cl_array.vec.make_float4(0.0, 0.0, 1.0, 0.0)) a = a_gpu.get() assert a.dtype is cl_array.vec.float4 a_gpu = cl_array.zeros(queue, 100, dtype=cl_array.vec.float4) def test_absrealimag(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) def real(x): return x.real def imag(x): return x.imag def conj(x): return x.conj() n = 111 for func in [abs, real, imag, conj]: for dtype in [np.int32, np.float32, np.complex64]: print(func, dtype) a = -make_random_array(queue, dtype, n) host_res = func(a.get()) dev_res = func(a).get() correct = np.allclose(dev_res, host_res) if not correct: print(dev_res) print(host_res) print(dev_res-host_res) assert correct # }}} # {{{ operators def test_rmul_yields_right_type(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) a = np.array([1, 2, 3, 4, 5]).astype(np.float32) a_gpu = cl_array.to_device(queue, a) two_a = 2*a_gpu assert isinstance(two_a, cl_array.Array) two_a = np.float32(2)*a_gpu assert isinstance(two_a, cl_array.Array) def test_pow_array(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) a = np.array([1, 2, 3, 4, 5]).astype(np.float32) a_gpu = cl_array.to_device(queue, a) result = pow(a_gpu, a_gpu).get() assert (np.abs(a ** a - result) < 1e-3).all() result = (a_gpu ** a_gpu).get() assert (np.abs(pow(a, a) - result) < 1e-3).all() def test_pow_number(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = cl_array.to_device(queue, a) result = pow(a_gpu, 2).get() assert (np.abs(a ** 2 - result) < 1e-3).all() def test_multiply(ctx_factory): """Test the muliplication of an array with a scalar. 
""" context = ctx_factory() queue = cl.CommandQueue(context) for sz in [10, 50000]: for dtype, scalars in [ (np.float32, [2]), (np.complex64, [2j]), ]: for scalar in scalars: a_gpu = make_random_array(queue, dtype, sz) a = a_gpu.get() a_mult = (scalar * a_gpu).get() assert (a * scalar == a_mult).all() def test_multiply_array(ctx_factory): """Test the multiplication of two arrays.""" context = ctx_factory() queue = cl.CommandQueue(context) a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = cl_array.to_device(queue, a) b_gpu = cl_array.to_device(queue, a) a_squared = (b_gpu * a_gpu).get() assert (a * a == a_squared).all() def test_addition_array(ctx_factory): """Test the addition of two arrays.""" context = ctx_factory() queue = cl.CommandQueue(context) a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = cl_array.to_device(queue, a) a_added = (a_gpu + a_gpu).get() assert (a + a == a_added).all() def test_addition_scalar(ctx_factory): """Test the addition of an array and a scalar.""" context = ctx_factory() queue = cl.CommandQueue(context) a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = cl_array.to_device(queue, a) a_added = (7 + a_gpu).get() assert (7 + a == a_added).all() def test_substract_array(ctx_factory): """Test the substraction of two arrays.""" #test data a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) b = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32) context = ctx_factory() queue = cl.CommandQueue(context) a_gpu = cl_array.to_device(queue, a) b_gpu = cl_array.to_device(queue, b) result = (a_gpu - b_gpu).get() assert (a - b == result).all() result = (b_gpu - a_gpu).get() assert (b - a == result).all() def test_substract_scalar(ctx_factory): """Test the substraction of an array and a scalar.""" context = ctx_factory() queue = cl.CommandQueue(context) #test data a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) #convert a to a gpu object a_gpu = cl_array.to_device(queue, a) result = (a_gpu - 7).get() assert (a - 7 == result).all() result = (7 - a_gpu).get() assert (7 - a == result).all() def test_divide_scalar(ctx_factory): """Test the division of an array and a scalar.""" context = ctx_factory() queue = cl.CommandQueue(context) a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = cl_array.to_device(queue, a) result = (a_gpu / 2).get() assert (a / 2 == result).all() result = (2 / a_gpu).get() assert (np.abs(2 / a - result) < 1e-5).all() def test_divide_array(ctx_factory): """Test the division of an array and a scalar. 
""" context = ctx_factory() queue = cl.CommandQueue(context) #test data a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32) b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(np.float32) a_gpu = cl_array.to_device(queue, a) b_gpu = cl_array.to_device(queue, b) a_divide = (a_gpu / b_gpu).get() assert (np.abs(a / b - a_divide) < 1e-3).all() a_divide = (b_gpu / a_gpu).get() assert (np.abs(b / a - a_divide) < 1e-3).all() # }}} # {{{ RNG def test_random(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) from pyopencl.clrandom import RanluxGenerator if has_double_support(context.devices[0]): dtypes = [np.float32, np.float64] else: dtypes = [np.float32] gen = RanluxGenerator(queue, 5120) for ary_size in [300, 301, 302, 303, 10007]: for dtype in dtypes: ran = cl_array.zeros(queue, ary_size, dtype) gen.fill_uniform(ran) assert (0 < ran.get()).all() assert (ran.get() < 1).all() gen.synchronize(queue) ran = cl_array.zeros(queue, ary_size, dtype) gen.fill_uniform(ran, a=4, b=7) assert (4 < ran.get()).all() assert (ran.get() < 7).all() ran = gen.normal(queue, (10007,), dtype, mu=4, sigma=3) dtypes = [np.int32] for dtype in dtypes: ran = gen.uniform(queue, (10000007,), dtype, a=200, b=300) assert (200 <= ran.get()).all() assert (ran.get() < 300).all() #from matplotlib import pyplot as pt #pt.hist(ran.get()) #pt.show() # }}} # {{{ misc def test_numpy_integer_shape(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) cl_array.empty(queue, np.int32(17), np.float32) cl_array.empty(queue, (np.int32(17), np.int32(17)), np.float32) def test_len(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_cpu = cl_array.to_device(queue, a) assert len(a_cpu) == 10 def test_stride_preservation(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) A = np.random.rand(3, 3) AT = A.T print(AT.flags.f_contiguous, AT.flags.c_contiguous) AT_GPU = cl_array.to_device(queue, AT) print(AT_GPU.flags.f_contiguous, AT_GPU.flags.c_contiguous) assert np.allclose(AT_GPU.get(), AT) def test_nan_arithmetic(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) def make_nan_contaminated_vector(size): shape = (size,) a = np.random.randn(*shape).astype(np.float32) from random import randrange for i in range(size // 10): a[randrange(0, size)] = float('nan') return a size = 1 << 20 a = make_nan_contaminated_vector(size) a_gpu = cl_array.to_device(queue, a) b = make_nan_contaminated_vector(size) b_gpu = cl_array.to_device(queue, b) ab = a * b ab_gpu = (a_gpu * b_gpu).get() assert (np.isnan(ab) == np.isnan(ab_gpu)).all() def test_mem_pool_with_arrays(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) mem_pool = cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)) a_dev = cl_array.arange(queue, 2000, dtype=np.float32, allocator=mem_pool) b_dev = cl_array.to_device(queue, np.arange(2000), allocator=mem_pool) + 4000 assert a_dev.allocator is mem_pool assert b_dev.allocator is mem_pool def test_view(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) a = np.arange(128).reshape(8, 16).astype(np.float32) a_dev = cl_array.to_device(queue, a) # same dtype view = a_dev.view() assert view.shape == a_dev.shape and view.dtype == a_dev.dtype # larger dtype view = a_dev.view(np.complex64) assert view.shape == (8, 8) and view.dtype == np.complex64 # smaller dtype view = a_dev.view(np.int16) assert view.shape == (8, 32) and view.dtype 
== np.int16 def test_diff(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) from pyopencl.clrandom import rand as clrand l = 20000 a_dev = clrand(queue, (l,), dtype=np.float32) a = a_dev.get() err = la.norm( (cl.array.diff(a_dev).get() - np.diff(a))) assert err < 1e-4 # }}} # {{{ slices, concatenation def test_slice(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) from pyopencl.clrandom import rand as clrand l = 20000 a_gpu = clrand(queue, (l,), dtype=np.float32) b_gpu = clrand(queue, (l,), dtype=np.float32) a = a_gpu.get() b = b_gpu.get() from random import randrange for i in range(20): start = randrange(l) end = randrange(start, l) a_gpu_slice = 2*a_gpu[start:end] a_slice = 2*a[start:end] assert la.norm(a_gpu_slice.get() - a_slice) == 0 for i in range(20): start = randrange(l) end = randrange(start, l) a_gpu[start:end] = 2*b[start:end] a[start:end] = 2*b[start:end] assert la.norm(a_gpu.get() - a) == 0 for i in range(20): start = randrange(l) end = randrange(start, l) a_gpu[start:end] = 2*b_gpu[start:end] a[start:end] = 2*b[start:end] assert la.norm(a_gpu.get() - a) == 0 def test_concatenate(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) from pyopencl.clrandom import rand as clrand a_dev = clrand(queue, (5, 15, 20), dtype=np.float32) b_dev = clrand(queue, (4, 15, 20), dtype=np.float32) c_dev = clrand(queue, (3, 15, 20), dtype=np.float32) a = a_dev.get() b = b_dev.get() c = c_dev.get() cat_dev = cl.array.concatenate((a_dev, b_dev, c_dev)) cat = np.concatenate((a, b, c)) assert la.norm(cat - cat_dev.get()) == 0 # }}} # {{{ conditionals, any, all def test_comparisons(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) from pyopencl.clrandom import rand as clrand l = 20000 a_dev = clrand(queue, (l,), dtype=np.float32) b_dev = clrand(queue, (l,), dtype=np.float32) a = a_dev.get() b = b_dev.get() import operator as o for op in [o.eq, o.ne, o.le, o.lt, o.ge, o.gt]: res_dev = op(a_dev, b_dev) res = op(a, b) assert (res_dev.get() == res).all() res_dev = op(a_dev, 0) res = op(a, 0) assert (res_dev.get() == res).all() res_dev = op(0, b_dev) res = op(0, b) assert (res_dev.get() == res).all() def test_any_all(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) l = 20000 a_dev = cl_array.zeros(queue, (l,), dtype=np.int8) assert not a_dev.all() assert not a_dev.any() a_dev[15213] = 1 assert not a_dev.all() assert a_dev.any() a_dev.fill(1) assert a_dev.all() assert a_dev.any() # }}} def test_map_to_host(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) if context.devices[0].type & cl.device_type.GPU: mf = cl.mem_flags allocator = cl_tools.DeferredAllocator( context, mf.READ_WRITE | mf.ALLOC_HOST_PTR) else: allocator = None a_dev = cl_array.zeros(queue, (5, 6, 7,), dtype=np.float32, allocator=allocator) a_dev[3, 2, 1] = 10 a_host = a_dev.map_to_host() a_host[1, 2, 3] = 10 a_host_saved = a_host.copy() a_host.base.release(queue) a_dev.finish() print("DEV[HOST_WRITE]", a_dev.get()[1, 2, 3]) print("HOST[DEV_WRITE]", a_host_saved[3, 2, 1]) assert (a_host_saved == a_dev.get()).all() def test_view_and_strides(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) from pyopencl.clrandom import rand as clrand X = clrand(queue, (5, 10), dtype=np.float32) Y = X[:3, :5] y = Y.view() assert y.shape == Y.shape assert y.strides == Y.strides import pytest with pytest.raises(AssertionError): assert (y.get() == X.get()[:3, :5]).all() if __name__ == "__main__": # make sure that import 
failures get reported, instead of skipping the # tests. if len(sys.argv) > 1: exec(sys.argv[1]) else: from py.test.cmdline import main main([__file__]) # vim: filetype=pyopencl:fdm=marker pyopencl-2013.2/test/test_clmath.py0000644000175000000500000002226112245716340016011 0ustar tomussrcfrom __future__ import division __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import math import numpy as np def have_cl(): try: import pyopencl return True except: return False if have_cl(): import pyopencl.array as cl_array import pyopencl as cl import pyopencl.clmath as clmath from pyopencl.tools import pytest_generate_tests_for_pyopencl \ as pytest_generate_tests from pyopencl.characterize import has_double_support sizes = [10, 128, 1<<10, 1<<11, 1<<13] numpy_func_names = { "asin": "arcsin", "acos": "arccos", "atan": "arctan", } def make_unary_function_test(name, limits=(0, 1), threshold=0, use_complex=False): (a, b) = limits a = float(a) b = float(b) def test(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) gpu_func = getattr(clmath, name) cpu_func = getattr(np, numpy_func_names.get(name, name)) if has_double_support(context.devices[0]): if use_complex: dtypes = [np.float32, np.float64, np.complex64, np.complex128] else: dtypes = [np.float32, np.float64] else: if use_complex: dtypes = [np.float32, np.complex64] else: dtypes = [np.float32] for s in sizes: for dtype in dtypes: dtype = np.dtype(dtype) args = cl_array.arange(queue, a, b, (b-a)/s, dtype=dtype) if dtype.kind == "c": args = args+dtype.type(1j)*args gpu_results = gpu_func(args).get() cpu_results = cpu_func(args.get()) my_threshold = threshold if dtype.kind == "c" and isinstance(use_complex, float): my_threshold = use_complex max_err = np.max(np.abs(cpu_results - gpu_results)) assert (max_err <= my_threshold).all(), \ (max_err, name, dtype) return test if have_cl(): test_ceil = make_unary_function_test("ceil", (-10, 10)) test_floor = make_unary_function_test("ceil", (-10, 10)) test_fabs = make_unary_function_test("fabs", (-10, 10)) test_exp = make_unary_function_test("exp", (-3, 3), 1e-5, use_complex=True) test_log = make_unary_function_test("log", (1e-5, 1), 1e-6, use_complex=True) test_log10 = make_unary_function_test("log10", (1e-5, 1), 5e-7) test_sqrt = make_unary_function_test("sqrt", (1e-5, 1), 3e-7, use_complex=True) test_sin = make_unary_function_test("sin", (-10, 10), 2e-7, use_complex=2e-2) test_cos = make_unary_function_test("cos", (-10, 10), 2e-7, use_complex=2e-2) test_asin = 
make_unary_function_test("asin", (-0.9, 0.9), 5e-7) test_acos = make_unary_function_test("acos", (-0.9, 0.9), 5e-7) test_tan = make_unary_function_test("tan", (-math.pi/2 + 0.1, math.pi/2 - 0.1), 4e-5, use_complex=True) test_atan = make_unary_function_test("atan", (-10, 10), 2e-7) test_sinh = make_unary_function_test("sinh", (-3, 3), 2e-6, use_complex=2e-3) test_cosh = make_unary_function_test("cosh", (-3, 3), 2e-6, use_complex=2e-3) test_tanh = make_unary_function_test("tanh", (-3, 3), 2e-6, use_complex=True) def test_fmod(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) for s in sizes: a = cl_array.arange(queue, s, dtype=np.float32)/10 a2 = cl_array.arange(queue, s, dtype=np.float32)/45.2 + 0.1 b = clmath.fmod(a, a2) a = a.get() a2 = a2.get() b = b.get() for i in range(s): assert math.fmod(a[i], a2[i]) == b[i] def test_ldexp(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) for s in sizes: a = cl_array.arange(queue, s, dtype=np.float32) a2 = cl_array.arange(queue, s, dtype=np.float32)*1e-3 b = clmath.ldexp(a,a2) a = a.get() a2 = a2.get() b = b.get() for i in range(s): assert math.ldexp(a[i], int(a2[i])) == b[i] def test_modf(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) for s in sizes: a = cl_array.arange(queue, s, dtype=np.float32)/10 fracpart, intpart = clmath.modf(a) a = a.get() intpart = intpart.get() fracpart = fracpart.get() for i in range(s): fracpart_true, intpart_true = math.modf(a[i]) assert intpart_true == intpart[i] assert abs(fracpart_true - fracpart[i]) < 1e-4 def test_frexp(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) for s in sizes: a = cl_array.arange(queue, s, dtype=np.float32)/10 significands, exponents = clmath.frexp(a) a = a.get() significands = significands.get() exponents = exponents.get() for i in range(s): sig_true, ex_true = math.frexp(a[i]) assert sig_true == significands[i] assert ex_true == exponents[i] def test_bessel(ctx_factory): try: import scipy.special as spec except ImportError: from pytest import skip skip("scipy not present--cannot test Bessel function") ctx = ctx_factory() queue = cl.CommandQueue(ctx) if not has_double_support(ctx.devices[0]): from pytest import skip skip("no double precision support--cannot test bessel function") nterms = 30 try: from pyfmmlib import jfuns2d, hank103_vec except ImportError: use_pyfmmlib = False else: use_pyfmmlib = True print("PYFMMLIB", use_pyfmmlib) if use_pyfmmlib: a = np.logspace(-3, 3, 10**6) else: a = np.logspace(-5, 5, 10**6) for which_func, cl_func, scipy_func, is_rel in [ ("j", clmath.bessel_jn, spec.jn, False), ("y", clmath.bessel_yn, spec.yn, True) ]: if is_rel: def get_err(check, ref): return np.max(np.abs(check-ref)) / np.max(np.abs(ref)) else: def get_err(check, ref): return np.max(np.abs(check-ref)) if use_pyfmmlib: pfymm_result = np.empty((len(a), nterms), dtype=np.complex128) if which_func == "j": for i, a_i in enumerate(a): if i % 100000 == 0: print("%.1f %%" % (100 * i/len(a))) ier, fjs, _, _ = jfuns2d(nterms, a_i, 1, 0, 10000) pfymm_result[i] = fjs[:nterms] assert ier == 0 elif which_func == "y": h0, h1 = hank103_vec(a, ifexpon=1) pfymm_result[:, 0] = h0.imag pfymm_result[:, 1] = h1.imag a_dev = cl_array.to_device(queue, a) for n in range(0, nterms): cl_bessel = cl_func(n, a_dev).get() scipy_bessel = scipy_func(n, a) error_scipy = get_err(cl_bessel, scipy_bessel) assert error_scipy < 1e-10, error_scipy if use_pyfmmlib and ( which_func == "j" or (which_func == "y" and n in [0, 1])): pyfmm_bessel = 
pfymm_result[:, n] error_pyfmm = get_err(cl_bessel, pyfmm_bessel) assert error_pyfmm < 1e-10, error_pyfmm error_pyfmm_scipy = get_err(scipy_bessel, pyfmm_bessel) print(which_func, n, error_scipy, error_pyfmm, error_pyfmm_scipy) else: print(which_func, n, error_scipy) assert not np.isnan(cl_bessel).any() if 0 and n == 15: import matplotlib.pyplot as pt #pt.plot(scipy_bessel) #pt.plot(cl_bessel) pt.loglog(a, np.abs(cl_bessel-scipy_bessel), label="vs scipy") if use_pyfmmlib: pt.loglog(a, np.abs(cl_bessel-hk_bessel), label="vs pyfmmlib") pt.legend() pt.show() if __name__ == "__main__": import sys if len(sys.argv) > 1: exec(sys.argv[1]) else: from py.test.cmdline import main main([__file__]) pyopencl-2013.2/setup.cfg0000644000175000000500000000011012245716340013757 0ustar tomussrc[flake8] ignore = E126,E127,E128,E123,E226,E241,E242 max-line-length=85 pyopencl-2013.2/MANIFEST.in0000644000175000000500000000122512245716340013704 0ustar tomussrcinclude pyopencl/cl/*.h include pyopencl/cl/*.cl include src/wrapper/*.hpp include src/wrapper/*.cpp include test/*.py include test/*.h include examples/*.py include doc/source/*.rst include doc/Makefile include doc/*.py include doc/source/conf.py include doc/source/_static/*.css include doc/source/_templates/*.html include configure.py include Makefile.in include aksetup_helper.py include README_SETUP.txt include README.rst recursive-include bpl-subset/bpl_subset/boost *.h *.hpp *.cpp *.html *.inl *.ipp *.pl *.txt recursive-include bpl-subset/bpl_subset/libs *.h *.hpp *.cpp *.html *.inl *.ipp *.pl *.txt recursive-include contrib *.vim *.py README pyopencl-2013.2/doc/0002755000175000000500000000000012245716342012717 5ustar tomussrcpyopencl-2013.2/doc/algorithm.rst0000644000175000000500000002415212245716342015441 0ustar tomussrcParallel Algorithms =================== .. include:: subst.rst Element-wise expression evalution ("map") ----------------------------------------- .. module:: pyopencl.elementwise Evaluating involved expressions on :class:`pyopencl.array.Array` instances by using overloaded operators can be somewhat inefficient, because a new temporary is created for each intermediate result. The functionality in the module :mod:`pyopencl.elementwise` contains tools to help generate kernels that evaluate multi-stage expressions on one or several operands in a single pass. .. autoclass:: ElementwiseKernel(context, arguments, operation, name="kernel", preamble="", options=[]) .. method:: __call__(*args, wait_for=None) Invoke the generated scalar kernel. The arguments may either be scalars or :class:`GPUArray` instances. |std-enqueue-blurb| Here's a usage example:: import pyopencl as cl import pyopencl.array as cl_array import numpy ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) n = 10 a_gpu = cl_array.to_device( ctx, queue, numpy.random.randn(n).astype(numpy.float32)) b_gpu = cl_array.to_device( ctx, queue, numpy.random.randn(n).astype(numpy.float32)) from pyopencl.elementwise import ElementwiseKernel lin_comb = ElementwiseKernel(ctx, "float a, float *x, " "float b, float *y, " "float *z", "z[i] = a*x[i] + b*y[i]", "linear_combination") c_gpu = cl_array.empty_like(a_gpu) lin_comb(5, a_gpu, 6, b_gpu, c_gpu) import numpy.linalg as la assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5 (You can find this example as :file:`examples/demo_elementwise.py` in the PyOpenCL distribution.) .. _custom-reductions: Sums and counts ("reduce") -------------------------- .. module:: pyopencl.reduction .. 
class:: ReductionKernel(ctx, dtype_out, neutral, reduce_expr, map_expr=None, arguments=None, name="reduce_kernel", options=[], preamble="") Generate a kernel that takes a number of scalar or vector *arguments* (at least one vector argument), performs the *map_expr* on each entry of the vector argument and then the *reduce_expr* on the outcome of that. *neutral* serves as an initial value. *preamble* offers the possibility to add preprocessor directives and other code (such as helper functions) to be added before the actual reduction kernel code. Vectors in *map_expr* should be indexed by the variable *i*. *reduce_expr* uses the formal values "a" and "b" to indicate two operands of a binary reduction operation. If you do not specify a *map_expr*, "in[i]" -- and therefore the presence of only one input argument -- is automatically assumed. *dtype_out* specifies the :class:`numpy.dtype` in which the reduction is performed and in which the result is returned. *neutral* is specified as float or integer formatted as string. *reduce_expr* and *map_expr* are specified as string formatted operations and *arguments* is specified as a string formatted as a C argument list. *name* specifies the name as which the kernel is compiled. *options* are passed unmodified to :meth:`pyopencl.Program.build`. *preamble* specifies a string of code that is inserted before the actual kernels. .. method:: __call__(*args, queue=None, wait_for=None, return_event=False) |explain-waitfor| :return: the resulting scalar as a single-entry :class:`pyopencl.array.Array` if *return_event* is *False*, otherwise a tuple ``(scalar_array, event)``. .. note:: The returned :class:`pyopencl.Event` corresponds only to part of the execution of the reduction. It is not suitable for profiling. .. versionadded: 2011.1 Here's a usage example:: a = pyopencl.array.arange(queue, 400, dtype=numpy.float32) b = pyopencl.array.arange(queue, 400, dtype=numpy.float32) krnl = ReductionKernel(ctx, numpy.float32, neutral="0", reduce_expr="a+b", map_expr="x[i]*y[i]", arguments="__global float *x, __global float *y") my_dot_prod = krnl(a, b).get() .. _custom-scan: Prefix Sums ("scan") -------------------- .. module:: pyopencl.scan .. |scan_extra_args| replace:: a list of tuples *(name, value)* specifying extra arguments to pass to the scan procedure. *value* must be :mod:`numpy` sized type. .. |preamble| replace:: A snippet of C that is inserted into the compiled kernel before the actual kernel function. May be used for, e.g. type definitions or include statements. A prefix sum is a running sum of an array, as provided by e.g. :mod:`numpy.cumsum`:: >>> import numpy as np >>> a = [1,1,1,1,1,2,2,2,2,2] >>> np.cumsum(a) array([ 1, 2, 3, 4, 5, 7, 9, 11, 13, 15]) This is a very simple example of what a scan can do. It turns out that scans are significantly more versatile. They are a basic building block of many non-trivial parallel algorithms. Many of the operations enabled by scans seem difficult to parallelize because of loop-carried dependencies. .. seealso:: `Prefix sums and their applications `_, by Guy Blelloch. This article gives an overview of some surprising applications of scans. :ref:`predefined-scans` These operations built into PyOpenCL are realized using :class:`GenericScanKernel`. 
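As a first orientation (a minimal sketch, not taken from the shipped PyOpenCL examples; it assumes a context *ctx*, a command queue *queue*, and an ``int32`` device array *a* already exist), the :func:`numpy.cumsum` computation above could be reproduced on the compute device with :class:`GenericScanKernel` roughly like this::

    import numpy as np
    from pyopencl.scan import GenericScanKernel

    sknl = GenericScanKernel(
            ctx, np.int32,
            arguments="__global int *ary",
            input_expr="ary[i]",
            scan_expr="a+b", neutral="0",
            # "item" is the inclusive scan result at index i
            output_statement="ary[i] = item;")

    sknl(a, queue=queue)  # a now holds its inclusive prefix sum

The example in the next section exercises more of the interface.
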
Usage Example ^^^^^^^^^^^^^ This example illustrates the implementation of a simplified version of :func:`pyopencl.algorithm.copy_if`, which copies integers from an array into the (variable-size) output if they are greater than 300:: knl = GenericScanKernel( ctx, np.int32, arguments="__global int *ary, __global int *out", input_expr="(ary[i] > 300) ? 1 : 0", scan_expr="a+b", neutral="0", output_statement=""" if (prev_item != item) out[item-1] = ary[i]; """) out = a.copy() knl(a, out) a_host = a.get() out_host = a_host[a_host > 300] assert (out_host == out.get()[:len(out_host)]).all() The value being scanned over is a number of flags indicating whether each array element is greater than 300. These flags are computed by *input_expr*. The prefix sum over this array gives a running count of array items greater than 300. The *output_statement* then compares `prev_item` (the previous item's scan result, i.e. index) to `item` (the current item's scan result, i.e. index). If they differ, i.e. if the predicate was satisfied at this position, then the item is stored in the output at the computed index. This example does not make use of the following advanced features also available in PyOpenCL: * Segmented scans * Access to the previous item in *input_expr* (e.g. for comparisons) See the `implementation `_ of :func:`unique` for an example. Making Custom Scan Kernels ^^^^^^^^^^^^^^^^^^^^^^^^^^ .. versionadded:: 2013.1 .. autoclass:: GenericScanKernel .. method:: __call__(*args, allocator=None, queue=None, size=None, wait_for=None) *queue* and *allocator* default to the ones provided on the first :class:`pyopencl.array.Array` in *args*. *size* may specify the length of the scan to be carried out. If not given, this length is inferred from the first array argument passed. |std-enqueue-blurb| .. note:: The returned :class:`pyopencl.Event` corresponds only to part of the execution of the scan. It is not suitable for profiling. Debugging aids ~~~~~~~~~~~~~~ .. class:: GenericDebugScanKernel Performs the same function and has the same interface as :class:`GenericScanKernel`, but uses a dead-simple, sequential scan. Works best on CPU platforms, and helps isolate bugs in scans by removing the potential for issues originating in parallel execution. .. _predefined-scans: Simple / Legacy Interface ^^^^^^^^^^^^^^^^^^^^^^^^^ .. class:: ExclusiveScanKernel(ctx, dtype, scan_expr, neutral, name_prefix="scan", options=[], preamble="", devices=None) Generates a kernel that can compute a `prefix sum `_ using any associative operation given as *scan_expr*. *scan_expr* uses the formal values "a" and "b" to indicate two operands of an associative binary operation. *neutral* is the neutral element of *scan_expr*, obeying *scan_expr(a, neutral) == a*. *dtype* specifies the type of the arrays being operated on. *name_prefix* is used for kernel names to ensure recognizability in profiles and logs. *options* is a list of compiler options to use when building. *preamble* specifies a string of code that is inserted before the actual kernels. *devices* may be used to restrict the set of devices on which the kernel is meant to run. (defaults to all devices in the context *ctx*.) .. method:: __call__(self, input_ary, output_ary=None, allocator=None, queue=None) .. class:: InclusiveScanKernel(dtype, scan_expr, neutral=None, name_prefix="scan", options=[], preamble="", devices=None) Works like :class:`ExclusiveScanKernel`. .. versionchanged:: 2013.1 *neutral* is now always required.
For the array `[1,2,3]`, inclusive scan results in `[1,3,6]`, and exclusive scan results in `[0,1,3]`. Here's a usage example:: knl = InclusiveScanKernel(context, np.int32, "a+b") n = 2**20-2**18+5 host_data = np.random.randint(0, 10, n).astype(np.int32) dev_data = cl_array.to_device(queue, host_data) knl(dev_data) assert (dev_data.get() == np.cumsum(host_data, axis=0)).all() Predicated copies ("partition", "unique", ...) ---------------------------------------------- .. module:: pyopencl.algorithm .. autofunction:: copy_if .. autofunction:: remove_if .. autofunction:: partition .. autofunction:: unique Sorting (radix sort) -------------------- .. autoclass:: RadixSort .. automethod:: __call__ Building many variable-size lists --------------------------------- .. autoclass:: ListOfListsBuilder .. automethod:: __call__ pyopencl-2013.2/doc/array.rst0000644000175000000500000001723512245716342014575 0ustar tomussrcMulti-dimensional arrays ======================== .. module:: pyopencl.array The functionality in this module provides something of a work-alike for :mod:`numpy` arrays, but with all operations executed on the CL compute device. Data Types ---------- PyOpenCL provides some amount of integration between the :mod:`numpy` type system, as represented by :class:`numpy.dtype`, and the types available in OpenCL. All the simple scalar types map straightforwardly to their CL counterparts. .. _vector-types: Vector Types ^^^^^^^^^^^^ .. class :: vec All of OpenCL's supported vector types, such as `float3` and `long4` are available as :mod:`numpy` data types within this class. These :class:`numpy.dtype` instances have field names of `x`, `y`, `z`, and `w` just like their OpenCL counterparts. They will work both for parameter passing to kernels as well as for passing data back and forth between kernels and Python code. For each type, a `make_type` function is also provided (e.g. `make_float3(x,y,z)`). Custom data types ^^^^^^^^^^^^^^^^^ If you would like to use your own (struct/union/whatever) data types in array operations where you supply operation source code, define those types in the *preamble* passed to :class:`pyopencl.elementwise.ElementwiseKernel`, :class:`pyopencl.reduction.ReductionKernel` (or similar), and let PyOpenCL know about them using this function: .. currentmodule:: pyopencl.tools .. autofunction:: get_or_register_dtype .. exception:: TypeNameNotKnown .. versionadded:: 2013.1 .. function:: register_dtype(dtype, name) .. versionchanged:: 2013.1 This function has been deprecated. It is recommended that you develop against the new interface, :func:`get_or_register_dtype`. .. function:: dtype_to_ctype(dtype) Returns a C name registered for *dtype*. .. versionadded: 2013.1 This function helps with producing C/OpenCL declarations for structured :class:`numpy.dtype` instances: .. autofunction:: match_dtype_to_c_struct A more complete example of how to use custom structured types can be found in :file:`examples/demo-struct-reduce.py` in the PyOpenCL distribution. .. currentmodule:: pyopencl.array Complex Numbers ^^^^^^^^^^^^^^^ PyOpenCL's :class:`Array` type supports complex numbers out of the box, by simply using the corresponding :mod:`numpy` types. If you would like to use this support in your own kernels, here's how to proceed: Since OpenCL 1.2 (and earlier) do not specify native complex number support, PyOpenCL works around that deficiency. 
By saying:: #include <pyopencl-complex.h> in your kernel, you get complex types `cfloat_t` and `cdouble_t`, along with functions defined on them such as `cfloat_mul(a, b)` or `cdouble_log(z)`. Elementwise kernels automatically include the header if your kernel has complex input or output. See the `source file `_ for a precise list of what's available. If you need double precision support, please:: #define PYOPENCL_DEFINE_CDOUBLE before including the header, as DP support apparently cannot be reliably autodetected. Under the hood, the complex types are simply `float2` and `double2`. .. warning:: Note that addition (real + complex) and multiplication (complex*complex) are defined for e.g. `float2`, but yield wrong results, so that you need to use the corresponding functions. .. versionadded:: 2012.1 The :class:`Array` Class ------------------------ .. autoclass:: Array .. autoexception:: ArrayHasOffsetError Constructing :class:`Array` Instances ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autofunction:: to_device .. function:: empty(queue, shape, dtype, order="C", allocator=None, data=None) A synonym for the :class:`Array` constructor. .. autofunction:: zeros .. autofunction:: empty_like .. autofunction:: zeros_like .. autofunction:: arange .. autofunction:: take .. autofunction:: concatenate Conditionals ^^^^^^^^^^^^ .. autofunction:: if_positive .. autofunction:: maximum .. autofunction:: minimum .. _reductions: Reductions ^^^^^^^^^^ .. autofunction:: sum .. autofunction:: dot .. autofunction:: vdot .. autofunction:: subset_dot .. autofunction:: max .. autofunction:: min .. autofunction:: subset_max .. autofunction:: subset_min See also :ref:`custom-reductions`. Elementwise Functions on :class:`Array` Instances -------------------------------------------------- .. module:: pyopencl.clmath The :mod:`pyopencl.clmath` module exposes array versions of the C functions available in the OpenCL standard. (See table 6.8 in the spec.) .. function:: acos(array, queue=None) .. function:: acosh(array, queue=None) .. function:: acospi(array, queue=None) .. function:: asin(array, queue=None) .. function:: asinh(array, queue=None) .. function:: asinpi(array, queue=None) .. function:: atan(array, queue=None) .. autofunction:: atan2 .. function:: atanh(array, queue=None) .. function:: atanpi(array, queue=None) .. autofunction:: atan2pi .. function:: cbrt(array, queue=None) .. function:: ceil(array, queue=None) .. TODO: copysign .. function:: cos(array, queue=None) .. function:: cosh(array, queue=None) .. function:: cospi(array, queue=None) .. function:: erfc(array, queue=None) .. function:: erf(array, queue=None) .. function:: exp(array, queue=None) .. function:: exp2(array, queue=None) .. function:: exp10(array, queue=None) .. function:: expm1(array, queue=None) .. function:: fabs(array, queue=None) .. TODO: fdim .. function:: floor(array, queue=None) .. TODO: fma .. TODO: fmax .. TODO: fmin .. function:: fmod(arg, mod, queue=None) Return the floating point remainder of the division `arg/mod`, for each element in `arg` and `mod`. .. TODO: fract .. function:: frexp(arg, queue=None) Return a tuple `(significands, exponents)` such that `arg == significand * 2**exponent`. .. TODO: hypot .. function:: ilogb(array, queue=None) .. function:: ldexp(significand, exponent, queue=None) Return a new array of floating point values composed from the entries of `significand` and `exponent`, paired together as `result = significand * 2**exponent`. .. function:: lgamma(array, queue=None) .. TODO: lgamma_r ..
function:: log(array, queue=None) .. function:: log2(array, queue=None) .. function:: log10(array, queue=None) .. function:: log1p(array, queue=None) .. function:: logb(array, queue=None) .. TODO: mad .. TODO: maxmag .. TODO: minmag .. function:: modf(arg, queue=None) Return a tuple `(fracpart, intpart)` of arrays containing the integer and fractional parts of `arg`. .. function:: nan(array, queue=None) .. TODO: nextafter .. TODO: remainder .. TODO: remquo .. function:: rint(array, queue=None) .. TODO: rootn .. function:: round(array, queue=None) .. function:: sin(array, queue=None) .. TODO: sincos .. function:: sinh(array, queue=None) .. function:: sinpi(array, queue=None) .. function:: sqrt(array, queue=None) .. function:: tan(array, queue=None) .. function:: tanh(array, queue=None) .. function:: tanpi(array, queue=None) .. function:: tgamma(array, queue=None) .. function:: trunc(array, queue=None) Generating Arrays of Random Numbers ----------------------------------- .. automodule:: pyopencl.clrandom .. autoclass:: RanluxGenerator .. automethod:: fill_uniform .. automethod:: uniform .. automethod:: fill_normal .. automethod:: normal .. automethod:: synchronize .. autofunction:: rand .. autofunction:: fill_rand GPGPU Algorithms ---------------- Bogdan Opanchuk's `reikna `_ offers a variety of GPU-based algorithms (FFT, RNG, matrix multiplication) designed to work with :class:`pyopencl.array.Array` objects. pyopencl-2013.2/doc/.gitignore0000644000175000000500000000001612245716340014700 0ustar tomussrcconstants.inc pyopencl-2013.2/doc/tools.rst0000644000175000000500000001154112245716340014607 0ustar tomussrcBuilt-in Utilities ================== .. module:: pyopencl.tools .. _memory-pools: Memory Pools ------------ The constructor :func:`pyopencl.Buffer` can consume a fairly large amount of processing time if it is invoked very frequently. For example, code based on :class:`pyopencl.array.Array` can easily run into this issue because a fresh memory area is allocated for each intermediate result. Memory pools are a remedy for this problem based on the observation that often many of the block allocations are of the same sizes as previously used ones. Then, instead of fully returning the memory to the system and incurring the associated reallocation overhead, the pool holds on to the memory and uses it to satisfy future allocations of similarly-sized blocks. The pool reacts appropriately to out-of-memory conditions as long as all memory allocations are made through it. Allocations performed from outside of the pool may run into spurious out-of-memory conditions due to the pool owning much or all of the available memory. Using :class:`pyopencl.array.Array` instances with a :class:`MemoryPool` is not complicated:: mem_pool = cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)) a_dev = cl_array.arange(queue, 2000, dtype=np.float32, allocator=mem_pool) .. class:: PooledBuffer An object representing a :class:`MemoryPool`-based allocation of device memory. Once this object is deleted, its associated device memory is returned to the pool. This supports the same interface as :class:`pyopencl.Buffer`. .. class:: DeferredAllocator(context, mem_flags=pyopencl.mem_flags.READ_WRITE) *mem_flags* takes its values from :class:`pyopencl.mem_flags` and corresponds to the *flags* argument of :class:`pyopencl.Buffer`. DeferredAllocator has the same semantics as regular OpenCL buffer allocation, i.e. 
it may promise memory to be available that may (in any call to a buffer-using CL function) turn out to not exist later on. (Allocations in CL are bound to contexts, not devices, and memory availability depends on which device the buffer is used with.) .. versionchanged:: In version 2013.1, :class:`CLAllocator` was deprecated and replaced by :class:`DeferredAllocator`. .. method:: __call__(size) Allocate a :class:`pyopencl.Buffer` of the given *size*. .. class:: ImmediateAllocator(queue, mem_flags=pyopencl.mem_flags.READ_WRITE) *mem_flags* takes its values from :class:`pyopencl.mem_flags` and corresponds to the *flags* argument of :class:`pyopencl.Buffer`. Unlike :class:`DeferredAllocator`, this allocator attempts to ensure at allocation time that the allocated memory is actually available, so that out-of-memory conditions are reported immediately rather than at the first use of the buffer. .. versionadded:: 2013.1 .. method:: __call__(size) Allocate a :class:`pyopencl.Buffer` of the given *size*. .. class:: MemoryPool(allocator) A memory pool for OpenCL device memory. *allocator* must be an instance of one of the above classes, and should be an :class:`ImmediateAllocator`. The memory pool assumes that allocation failures are reported by the allocator immediately, and not in the OpenCL-typical deferred manner. .. attribute:: held_blocks The number of unused blocks being held by this pool. .. attribute:: active_blocks The number of blocks in active use that have been allocated through this pool. .. method:: allocate(size) Return a :class:`PooledBuffer` of the given *size*. .. method:: __call__(size) Synonym for :meth:`allocate` to match :class:`CLAllocator` interface. .. versionadded:: 2011.2 .. method:: free_held Free all unused memory that the pool is currently holding. .. method:: stop_holding Instruct the memory pool to start immediately freeing memory returned to it, instead of holding it for future allocations. Implicitly calls :meth:`free_held`. This is useful as a cleanup action when a memory pool falls out of use. CL-Object-dependent Caching --------------------------- .. autofunction:: first_arg_dependent_memoize .. autofunction:: clear_first_arg_caches Testing ------- .. function:: pytest_generate_tests_for_pyopencl(metafunc) Using the line:: from pyopencl.tools import pytest_generate_tests_for_pyopencl \ as pytest_generate_tests in your `pytest `_ test scripts allows you to use the arguments *ctx_factory*, *device*, or *platform* in your test functions, and they will automatically be run for each OpenCL device/platform in the system, as appropriate. The following two environment variables are also supported to control device/platform choice:: PYOPENCL_TEST=0:0,1;intel=i5,i7 Device Characterization ----------------------- .. automodule:: pyopencl.characterize :members: pyopencl-2013.2/doc/misc.rst0000644000175000000500000004341112245716342014405 0ustar tomussrcInstallation ============ Installation information is maintained collaboratively on the `PyOpenCL Wiki `_. Acknowledgments =============== Too many to list. Please see the `commit log `_ for detailed acknowledgments. Tips ==== Syntax highlighting ------------------- You can obtain Vim syntax highlighting for OpenCL C inlined in Python by checking `this file `_. Note that the triple-quoted strings containing the source must start with `"""//CL// ..."""`. Guidelines ========== .. _api-compatibility: API Stability ------------- I consider PyOpenCL's API "stable". That doesn't mean it can't change. But if it does, your code will generally continue to run.
It may however start spewing warnings about things you need to change to stay compatible with future versions. Deprecation warnings will be around for a whole year, as identified by the first number in the release name. (the "2014" in "2014.1") I.e. a function that was deprecated in 2014.n will generally be removed in 2015.n (or perhaps later). Further, the stability promise applies for any code that's part of a released version. It doesn't apply to undocumented bits of the API, and it doesn't apply to unreleased code downloaded from git. .. _versus-c: Relation with OpenCL's C Bindings --------------------------------- We've tried to follow these guidelines when binding the OpenCL's C interface to Python: * Remove the `cl_`, `CL_` and `cl` prefix from data types, macros and function names. * Follow :pep:`8`, i.e. * Make function names lowercase. * If a data type or function name is composed of more than one word, separate the words with a single underscore. * `get_info` functions become attributes. * Object creation is done by constructors, to the extent possible. (i.e. minimize use of "factory functions") * If an operation involves two or more "complex" objects (like e.g. a kernel enqueue involves a kernel and a queue), refuse the temptation to guess which one should get a method for the operation. Instead, simply leave that command to be a function. .. _interoperability: Interoperability with other OpenCL software ------------------------------------------- Just about every object in :mod:`pyopncl` supports the following interface (here shown as an example for :class:`pyopencl.MemoryObject`, from which :class:`pyopencl.Buffer` and :class:`pyopencl.Image` inherit): * :meth:`pyopencl.MemoryObject.from_int_ptr` * :attr:`pyopencl.MemoryObject.int_ptr` This allows retrieving the C-level pointer to an OpenCL object as a Python integer, which may then be passed to other C libraries whose interfaces expose OpenCL objects. It also allows turning C-level OpenCL objects obtained from other software to be turned into the corresponding :mod:`pyopencl` objects. .. versionadded:: 2013.2 User-visible Changes ==================== Version 2013.2 -------------- .. note:: This version is currently under development. You can get snapshots from PyOpenCL's `git repository `_ * Add :meth:`pyopencl.array.Array.map_to_host`. * Support *strides* on :func:`pyopencl.enqueue_map_buffer` and :func:`pyopencl.enqueue_map_image`. * :class:`pyopencl.ImageFormat` was made comparable and hashable. * :mod:`pyopencl.reduction` supports slicing (contributed by Alex Nitz) * Added :ref:`interoperability` * Bug fixes Version 2013.1 -------------- * Vastly improved :ref:`custom-scan`. * Add :func:`pyopencl.tools.match_dtype_to_c_struct`, for better integration of the CL and :mod:`numpy` type systems. * More/improved Bessel functions. See `the source `_. * Add :envvar:`PYOPENCL_NO_CACHE` environment variable to aid debugging. (e.g. with AMD's CPU implementation, see `their programming guide `_) * Deprecated :func:`pyopencl.tools.register_dtype` in favor of :func:`pyopencl.tools.get_or_register_dtype`. * Clean up the :class:`pyopencl.array.Array` constructor interface. * Deprecate :class:`pyopencl.array.DefaultAllocator`. * Deprecate :class:`pyopencl.tools.CLAllocator`. * Introduce :class:`pyopencl.tools.DeferredAllocator`, :class:`pyopencl.tools.ImmediateAllocator`. * Allow arrays whose beginning does not coincide with the beginning of their :attr:`pyopencl.array.Array.data` :class:`pyopencl.Buffer`. 
See :attr:`pyopencl.array.Array.base_data` and :attr:`pyopencl.array.Array.offset`. Note that not all functions in PyOpenCL support such arrays just yet. These will fail with :exc:`pyopencl.array.ArrayHasOffsetError`. * Add :meth:`pyopencl.array.Array.__getitem__` and :meth:`pyopencl.array.Array.__setitem__`, supporting generic slicing. It is *possible* to create non-contiguous arrays using this functionality. Most operations (elementwise etc.) will not work on such arrays. Note also that some operations (specifically, reductions and scans) on sliced arrays that start past the beginning of the original array will fail for now. This will be fixed in a future release. * :class:`pyopencl.CommandQueue` may be used as a context manager (in a ``with`` statement) * Add :func:`pyopencl.clmath.atan2`, :func:`pyopencl.clmath.atan2pi`. * Add :func:`pyopencl.array.concatenate`. * Add :meth:`pyopencl.Kernel.capture_call`. .. note:: The addition of :meth:`pyopencl.array.Array.__getitem__` has an unintended consequence due to `numpy bug 3375 `_. For instance, this expression:: numpy.float32(5) * some_pyopencl_array may take a very long time to execute. This is because :mod:`numpy` first builds an object array of (compute-device) scalars (!) before it decides that that's probably not such a bright idea and finally calls :meth:`pyopencl.array.Array.__rmul__`. Note that only left arithmetic operations of :class:`pyopencl.array.Array` by :mod:`numpy` scalars are affected. Python's number types (:class:`float` etc.) are unaffected, as are right multiplications. If a program that used to run fast suddenly runs extremely slowly, it is likely that this bug is to blame. Here's what you can do: * Use Python scalars instead of :mod:`numpy` scalars. * Switch to right multiplications if possible. * Use a patched :mod:`numpy`. See the bug report linked above for a pull request with a fix. * Switch to a fixed version of :mod:`numpy` when available. Version 2012.1 -------------- * Support for complex numbers. * Support for Bessel functions. (experimental) * Numerous fixes. Version 2011.2 -------------- * Add :func:`pyopencl.enqueue_migrate_mem_object`. * Add :func:`pyopencl.image_from_array`. * IMPORTANT BUGFIX: Kernel caching was broken for all the 2011.1.x releases, with severe consequences on the execution time of :class:`pyopencl.array.Array` operations. Henrik Andresen at a `PyOpenCL workshop at DTU `_ first noticed the strange timings. * All comparable PyOpenCL objects are now also hashable. * Add :func:`pyopencl.tools.context_dependent_memoize` to the documented functionality. * Base :mod:`pyopencl.clrandom` on `RANLUXCL `_, add functionality. * Add :class:`pyopencl.NannyEvent` objects. * Add :mod:`pyopencl.characterize`. * Ensure compatibility with OS X Lion. * Add :func:`pyopencl.tools.register_dtype` to enable scan/reduction on struct types. * :func:`pyopencl.enqueue_migrate_mem_object` was renamed :func:`pyopencl.enqueue_migrate_mem_object_ext`. :func:`pyopencl.enqueue_migrate_mem_object` now refers to the OpenCL 1.2 function of this name, if available. * :func:`pyopencl.create_sub_devices` was renamed :func:`pyopencl.create_sub_devices_ext`. :func:`pyopencl.create_sub_devices` now refers to the OpenCL 1.2 function of this name, if available. * Alpha support for OpenCL 1.2. Version 2011.1.2 ---------------- * More bug fixes. Version 2011.1.1 ---------------- * Fixes for Python 3 compatibility. 
(with work by Christoph Gohlke) Version 2011.1 -------------- * All *is_blocking* parameters now default to *True* to avoid crashy-by-default behavior. (suggested by Jan Meinke) In particular, this change affects :func:`pyopencl.enqueue_read_buffer`, :func:`pyopencl.enqueue_write_buffer`, :func:`pyopencl.enqueue_read_buffer_rect`, :func:`pyopencl.enqueue_write_buffer_rect`, :func:`pyopencl.enqueue_read_image`, :func:`pyopencl.enqueue_write_image`, :func:`pyopencl.enqueue_map_buffer`, :func:`pyopencl.enqueue_map_image`. * Add :mod:`pyopencl.reduction`. * Add :ref:`reductions`. * Add :mod:`pyopencl.scan`. * Add :meth:`pyopencl.MemoryObject.get_host_array`. * Deprecate context arguments of :func:`pyopencl.array.to_device`, :func:`pyopencl.array.zeros`, :func:`pyopencl.array.arange`. * Make construction of :class:`pyopencl.array.Array` more flexible (*cqa* argument.) * Add :ref:`memory-pools`. * Add vector types, see :class:`pyopencl.array.vec`. * Add :attr:`pyopencl.array.Array.strides`, :attr:`pyopencl.array.Array.flags`. Allow the creation of arrys in C and Fortran order. * Add :func:`pyopencl.enqueue_copy`. Deprecate all other transfer functions. * Add support for numerous extensions, among them device fission. * Add a compiler cache. * Add the 'g_times_l' keyword arg to kernel execution. Version 0.92 ------------ * Add support for OpenCL 1.1. * Add support for the `cl_khr_gl_sharing `_ extension, leading to working GL interoperability. * Add :meth:`pyopencl.Kernel.set_args`. * The call signature of :meth:`pyopencl.Kernel.__call__` changed to emphasize the importance of *local_size*. * Add :meth:`pyopencl.Kernel.set_scalar_arg_dtypes`. * Add support for the `cl_nv_device_attribute_query `_ extension. * Add :meth:`pyopencl.array.Array` and related functionality. * Make build not depend on Boost C++. Version 0.91.5 -------------- * Add :attr:`pyopencl.ImageFormat.channel_count`, :attr:`pyopencl.ImageFormat.dtype_size`, :attr:`pyopencl.ImageFormat.itemsize`. * Add missing :func:`pyopencl.enqueue_copy_buffer`. * Add :func:`pyopencl.create_some_context`. * Add :func:`pyopencl.enqueue_barrier`, which was previously missing. Version 0.91.4 -------------- A bugfix release. No user-visible changes. Version 0.91.3 -------------- * All parameters named *host_buffer* were renamed *hostbuf* for consistency with the :class:`pyopencl.Buffer` constructor introduced in 0.91. Compatibility code is in place. * The :class:`pyopencl.Image` constructor does not need a *shape* parameter if the given *hostbuf* has *hostbuf.shape*. * The :class:`pyopencl.Context` constructor can now be called without parameters. Version 0.91.2 -------------- * :meth:`pyopencl.Program.build` now captures build logs and adds them to the exception text. * Deprecate :func:`pyopencl.create_context_from_type` in favor of second form of :class:`pyopencl.Context` constructor * Introduce :class:`pyopencl.LocalMemory`. * Document kernel invocation and :meth:`pyopencl.Kernel.set_arg`. Version 0.91.1 -------------- * Fixed a number of bugs, notably involving :class:`pyopencl.Sampler`. * :class:`pyopencl.Device`, :class:`pyopencl.Platform`, :class:`pyopencl.Context` now have nicer string representations. * Add :attr:`Image.shape`. (suggested by David Garcia) Version 0.91 ------------ * Add :ref:`gl-interop`. * Add a test suite. * Fix numerous `get_info` bugs. (reports by David Garcia and the test suite) * Add :meth:`pyopencl.ImageFormat.__repr__`. * Add :meth:`pyopencl.addressing_mode.to_string` and colleagues. 
* The `pitch` arguments to :func:`pyopencl.create_image_2d`, :func:`pyopencl.create_image_3d`, :func:`pyopencl.enqueue_read_image`, and :func:`pyopencl.enqueue_write_image` are now defaulted to zero. The argument order of `enqueue_{read,write}_image` has changed for this reason. * Deprecate :func:`pyopencl.create_image_2d`, :func:`pyopencl.create_image_3d` in favor of the :class:`pyopencl.Image` constructor. * Deprecate :func:`pyopencl.create_program_with_source`, :func:`pyopencl.create_program_with_binary` in favor of the :class:`pyopencl.Program` constructor. * Deprecate :func:`pyopencl.create_buffer`, :func:`pyopencl.create_host_buffer` in favor of the :class:`pyopencl.Buffer` constructor. * :meth:`pyopencl.MemoryObject.get_image_info` now actually exists. * Add :attr:`pyopencl.MemoryObject.image.info`. * Fix API tracing. * Add constructor arguments to :class:`pyopencl.ImageFormat`. (suggested by David Garcia) Version 0.90.4 -------------- * Add build fixes for Windows and OS X. Version 0.90.3 -------------- * Fix a GNU-ism in the C++ code of the wrapper. Version 0.90.2 -------------- * Fix :meth:`pyopencl.Platform.get_info`. * Fix passing properties to :class:`pyopencl.CommandQueue`. Also fix related documentation. Version 0.90.1 -------------- * Fix building on the Mac. Version 0.90 ------------ * Initial release. .. _license: License ======= PyOpenCL is licensed to you under the MIT/X Consortium license: Copyright (c) 2009-13 Andreas Klöckner and Contributors. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. PyOpenCL includes derivatives of parts of the `Thrust `_ computing package (in particular the scan implementation). These parts are licensed as follows: Copyright 2008-2011 NVIDIA Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. .. note:: If you use Apache-licensed parts, be aware that these may be incompatible with software licensed exclusively under GPL2. (Most software is licensed as GPL2 or later, in which case this is not an issue.) 
PyOpenCL includes the RANLUXCL random number generator: Copyright (c) 2011 Ivar Ursin Nikolaisen Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Frequently Asked Questions ========================== The FAQ is maintained collaboratively on the `Wiki FAQ page `_. Citing PyOpenCL =============== We are not asking you to gratuitously cite PyOpenCL in work that is otherwise unrelated to software. That said, if you do discuss some of the development aspects of your code and would like to highlight a few of the ideas behind PyOpenCL, feel free to cite this article: Andreas Klöckner, Nicolas Pinto, Yunsup Lee, Bryan Catanzaro, Paul Ivanov, Ahmed Fasih, PyCUDA and PyOpenCL: A scripting-based approach to GPU run-time code generation, Parallel Computing, Volume 38, Issue 3, March 2012, Pages 157-174. Here's a Bibtex entry for your convenience:: @article{kloeckner_pycuda_2012, author = {{Kl{\"o}ckner}, Andreas and {Pinto}, Nicolas and {Lee}, Yunsup and {Catanzaro}, B. and {Ivanov}, Paul and {Fasih}, Ahmed }, title = "{PyCUDA and PyOpenCL: A Scripting-Based Approach to GPU Run-Time Code Generation}", journal = "Parallel Computing", volume = "38", number = "3", pages = "157--174", year = "2012", issn = "0167-8191", doi = "10.1016/j.parco.2011.09.001", } pyopencl-2013.2/doc/_static/0002755000175000000500000000000012245716340014343 5ustar tomussrcpyopencl-2013.2/doc/_static/akdoc.css0000644000175000000500000000125112245716340016133 0ustar tomussrcpre { line-height: 110%; } .footer { background-color: #eee; } body > div.container { margin-top:10px; } dd { margin-left: 40px; } tt.descname { font-size: 100%; } code { color: rgb(51,51,51); } h1 { padding-bottom:5px; border-bottom: 1px solid #ccc; } h2 { padding-bottom:1px; border-bottom: 1px solid #ccc; } h3 { padding-bottom:1px; border-bottom: 1px solid #ccc; } .rubric { font-size: 120%; padding-bottom:1px; border-bottom: 1px solid #ccc; } .headerlink { padding-left: 1ex; padding-right: 1ex; } a.headerlink:hover { text-decoration: none; } blockquote p { font-size: 100%; font-weight: normal; line-height: normal; }; pyopencl-2013.2/doc/subst.rst0000644000175000000500000000130412245716340014603 0ustar tomussrc.. |comparable| replace:: Instances of this class are hashable, and two instances of this class may be compared using *"=="* and *"!="*. (Hashability was added in version 2011.2.) .. |buf-iface| replace:: must implement the Python buffer interface. (e.g. by being an :class:`numpy.ndarray`) .. 
|explain-waitfor| replace:: *wait_for* may either be *None* or a list of :class:`pyopencl.Event` instances for whose completion this command waits before starting exeuction. .. |std-enqueue-blurb| replace:: Returns a new :class:`pyopencl.Event`. |explain-waitfor| .. |copy-depr| replace:: **Note:** This function is deprecated as of PyOpenCL 2011.1. Use :func:`enqueue_copy` instead. pyopencl-2013.2/doc/runtime.rst0000644000175000000500000007175112245716340015143 0ustar tomussrc.. _reference-doc: .. include:: subst.rst OpenCL Runtime ============== Version Queries --------------- .. module:: pyopencl .. moduleauthor:: Andreas Kloeckner .. data:: VERSION Gives the numeric version of PyOpenCL as a variable-length tuple of integers. Enables easy version checks such as *VERSION >= (0, 93)*. .. data:: VERSION_STATUS A text string such as `"rc4"` or `"beta"` qualifying the status of the release. .. data:: VERSION_TEXT The full release name (such as `"0.93rc4"`) in string form. .. function:: get_cl_header_version() Return a variable-length tuple of integers representing the version of the OpenCL header against which PyOpenCL was compiled. .. versionadded:: 0.92 .. _errors: Error Reporting --------------- .. class:: Error Base class for all PyOpenCL exceptions. .. class:: MemoryError .. class:: LogicError .. class:: RuntimeError Constants --------- .. include:: constants.inc Platforms, Devices and Contexts ------------------------------- .. function:: get_platforms() Return a list of :class:`Platform` instances. .. class:: Platform .. attribute:: info Lower case versions of the :class:`platform_info` constants may be used as attributes on instances of this class to directly query info attributes. .. method:: get_info(param) See :class:`platform_info` for values of *param*. .. method:: get_devices(device_type=device_type.ALL) Return a list of devices matching *device_type*. See :class:`device_type` for values of *device_type*. .. versionchanged:: 2013.2 This used to raise an exception if no matching devices were found. Now, it will simply return an empty list. .. automethod:: from_int_ptr .. autoattribute:: int_ptr |comparable| .. class:: Device .. attribute:: info Lower case versions of the :class:`device_info` constants may be used as attributes on instances of this class to directly query info attributes. .. method:: get_info(param) See :class:`device_info` for values of *param*. .. automethod:: from_int_ptr .. autoattribute:: int_ptr Two instances of this class may be compared using *=="* and *"!="*. .. class:: Context(devices=None, properties=None, dev_type=None) Create a new context. *properties* is a list of key-value tuples, where each key must be one of :class:`context_properties`. At most one of *devices* and *dev_type* may be not `None`, where *devices* is a list of :class:`Device` instances, and *dev_type* is one of the :class:`device_type` constants. If neither is specified, a context with a *dev_type* of :attr:`device_type.DEFAULT` is created. .. note:: Calling the constructor with no arguments will fail for recent CL drivers that support the OpenCL ICD. If you want similar, just-give-me-a-context-already behavior, we recommend :func:`create_some_context`. See, e.g. this `explanation by AMD `_. .. 
note:: For :attr:`context_properties.CL_GL_CONTEXT_KHR`, :attr:`context_properties.CL_EGL_DISPLAY_KHR`, :attr:`context_properties.CL_GLX_DISPLAY_KHR`, :attr:`context_properties.CL_WGL_HDC_KHR`, and :attr:`context_properties.CL_CGL_SHAREGROUP_KHR` :attr:`context_properties.CL_CGL_SHAREGROUP_APPLE` the value in the key-value pair is a PyOpenGL context or display instance. .. versionchanged:: 0.91.2 Constructor arguments *dev_type* added. .. attribute:: info Lower case versions of the :class:`context_info` constants may be used as attributes on instances of this class to directly query info attributes. .. method:: get_info(param) See :class:`context_info` for values of *param*. .. method:: create_sub_devices(properties) *properties* is an array of one (or more) of the forms:: [ dpp.EQUALLY, 8] [ dpp.BY_COUNTS, 5, 7, 9, dpp.PARTITION_BY_COUNTS_LIST_END] [ dpp.BY_NAMES, 5, 7, 9, dpp.PARTITION_BY_NAMES_LIST_END] [ dpp.BY_AFFINITY_DOMAIN, dad.L1_CACHE] where `dpp` represents :class:`device_partition_property` and `dad` represent :class:`device_affinity_domain`. `PROPERTIES_LIST_END_EXT` is added automatically. Only available with CL 1.2. .. versionadded:: 2011.2 .. method:: create_sub_devices_ext(properties) *properties* is an array of one (or more) of the forms:: [ dppe.EQUALLY, 8] [ dppe.BY_COUNTS, 5, 7, 9, dppe.PARTITION_BY_COUNTS_LIST_END] [ dppe.BY_NAMES, 5, 7, 9, dppe.PARTITION_BY_NAMES_LIST_END] [ dppe.BY_AFFINITY_DOMAIN, ad.L1_CACHE] where `dppe` represents :class:`device_partition_property_ext` and `ad` represent :class:`affinity_domain_ext`. `PROPERTIES_LIST_END_EXT` is added automatically. Only available with the `cl_ext_device_fission` extension. .. versionadded:: 2011.1 .. automethod:: from_int_ptr .. autoattribute:: int_ptr |comparable| .. function:: create_some_context(interactive=True) Create a :class:`Context` 'somehow'. If multiple choices for platform and/or device exist, *interactive* is True, and *sys.stdin.isatty()* is also True, then the user is queried about which device should be chosen. Otherwise, a device is chosen in an implementation-defined manner. Command Queues and Events ------------------------- .. class:: CommandQueue(context, device=None, properties=None) Create a new command queue. *properties* is a bit field consisting of :class:`command_queue_properties` values. if *device* is None, one of the devices in *context* is chosen in an implementation-defined manner. A :class:`CommandQueue` may be used as a context manager, like this:: with cl.CommandQueue(self.cl_context) as queue: enqueue_stuff(queue, ...) :meth:`finish` is automatically called at the end of the context. .. versionadded:: 2013.1 Context manager capability. .. attribute:: info Lower case versions of the :class:`command_queue_info` constants may be used as attributes on instances of this class to directly query info attributes. .. method:: get_info(param) See :class:`command_queue_info` for values of *param*. .. method:: set_property(prop, enable) See :class:`command_queue_properties` for possible values of *prop*. *enable* is a :class:`bool`. Unavailable in OpenCL 1.1 and newer. .. method:: flush() .. method:: finish() .. automethod:: from_int_ptr .. autoattribute:: int_ptr |comparable| .. class:: Event .. attribute:: info Lower case versions of the :class:`event_info` constants may be used as attributes on instances of this class to directly query info attributes. .. 
attribute:: profile.info Lower case versions of the :class:`profiling_info` constants may be used as attributes on the attribute `profile` of this class to directly query profiling info. For example, you may use *evt.profile.end* instead of *evt.get_profiling_info(pyopencl.profiling_info.END)*. .. method:: get_info(param) See :class:`event_info` for values of *param*. .. method:: get_profiling_info(param) See :class:`profiling_info` for values of *param*. See :attr:`profile` for an easier way of obtaining the same information. .. method:: wait() .. automethod:: from_int_ptr .. autoattribute:: int_ptr |comparable| .. function:: wait_for_events(events) .. function:: enqueue_barrier(queue, wait_for=None) Enqueues a barrier operation. which ensures that all queued commands in command_queue have finished execution. This command is a synchronization point. .. versionadded:: 0.91.5 .. versionchanged:: 2011.2 Takes *wait_for* and returns an :class:`Event` .. function:: enqueue_marker(queue, wait_for=None) Returns an :class:`Event`. .. versionchanged:: 2011.2 Takes *wait_for*. .. class:: UserEvent(context) A subclass of :class:`Event`. Only available with OpenCL 1.1 and newer. .. versionadded:: 0.92 .. method:: set_status(status) See :class:`command_execution_status` for possible values of *status*. .. class:: NannyEvent Transfers between host and device return events of this type. They hold a reference to the host-side buffer and wait for the transfer to complete when they are freed. Therefore, they can safely release the reference to the object they're guarding upon destruction. A subclass of :class:`Event`. .. versionadded:: 2011.2 .. method:: get_ward() .. method:: wait() In addition to performing the same wait as :meth:`Event.wait()`, this method also releases the reference to the guarded object. Memory ------ .. class:: MemoryObject .. attribute:: info Lower case versions of the :class:`mem_info` constants may be used as attributes on instances of this class to directly query info attributes. .. attribute:: hostbuf .. method:: get_info(param) See :class:`mem_info` for values of *param*. .. method:: release() .. method:: get_host_array(shape, dtype, order="C") Return the memory object's associated host memory area as a :class:`numpy.ndarray` of the given *shape*, *dtype* and *order*. .. automethod:: from_int_ptr .. autoattribute:: int_ptr |comparable| .. function:: enqueue_migrate_mem_objects(queue, mem_objects, flags=0, wait_for=None) :param flags: from :class:`mem_migration_flags` .. versionadded:: 2011.2 Only available with CL 1.2. .. function:: enqueue_migrate_mem_object_ext(queue, mem_objects, flags=0, wait_for=None) :param flags: from :class:`migrate_mem_object_flags_ext` .. versionadded:: 2011.2 Only available with the `cl_ext_migrate_memobject` extension. Buffers ^^^^^^^ .. class:: Buffer(context, flags, size=0, hostbuf=None) Create a :class:`Buffer`. See :class:`mem_flags` for values of *flags*. If *hostbuf* is specified, *size* defaults to the size of the specified buffer if it is passed as zero. :class:`Buffer` inherits from :class:`MemoryObject`. Note that actual memory allocation in OpenCL may be deferred. Buffers are attached to a :class:`Context` and are only moved to a device once the buffer is used on that device. That is also the point when out-of-memory errors will occur. If you'd like to be sure that there's enough memory for your allocation, either use :func:`enqueue_migrate_mem_objects` (if available) or simply perform a small transfer to the buffer. 
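For instance, a minimal sketch of the latter approach (assuming a context *ctx* and a queue *queue* already exist; the buffer size and the one-byte staging array are only illustrations) might look like::

    import numpy as np
    import pyopencl as cl

    buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, size=1 << 20)

    # A tiny host-to-device transfer makes the device-side
    # allocation come into existence right away, so any
    # out-of-memory error surfaces here rather than later.
    cl.enqueue_copy(queue, buf, np.zeros(1, dtype=np.uint8)).wait()
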
See also :class:`pyopencl.tools.ImmediateAllocator`. .. method:: get_sub_region(origin, size, flags=0) Only available in OpenCL 1.1 and newer. .. method:: __getitem__(slc) *slc* is a :class:`slice` object indicating from which byte index range a sub-buffer is to be created. The *flags* argument of :meth:`get_sub_region` is set to the same flags with which *self* was created. .. function:: enqueue_fill_buffer(queue, mem, pattern, offset, size, wait_for=None) :arg pattern: a buffer object (likely a :class:`numpy.ndarray`) |std-enqueue-blurb| Only available with CL 1.2. .. versionadded:: 2011.2 Image Formats ^^^^^^^^^^^^^ .. class:: ImageFormat([channel_order, channel_type]) .. attribute:: channel_order See :class:`channel_order` for possible values. .. attribute:: channel_data_type See :class:`channel_type` for possible values. .. attribute:: channel_count .. versionadded:: 0.91.5 .. attribute:: dtype_size .. versionadded:: 0.91.5 .. attribute:: itemsize .. versionadded:: 0.91.5 .. method:: __repr__ Returns a :class:`str` representation of the image format. .. versionadded:: 0.91 |comparable| .. versionchanged:: 0.91 Constructor arguments added. .. versionchanged:: 2013.2 :class:`ImageFormat` was made comparable and hashable .. function:: get_supported_image_formats(context, flags, image_type) See :class:`mem_flags` for possible values of *flags* and :class:`mem_object_type` for possible values of *image_type*. Images ^^^^^^ .. class:: Image(context, flags, format, shape=None, pitches=None, hostbuf=None, is_array=False, buffer=None): See :class:`mem_flags` for values of *flags*. *shape* is a 2- or 3-tuple. *format* is an instance of :class:`ImageFormat`. *pitches* is a 1-tuple for 2D images and a 2-tuple for 3D images, indicating the distance in bytes from one scan line to the next, and from one 2D image slice to the next. If *hostbuf* is given and *shape* is `None`, then *hostbuf.shape* is used as the *shape* parameter. :class:`Image` inherits from :class:`MemoryObject`. .. note:: If you want to load images from :mod:`numpy.ndarray` instances or read images back into them, be aware that OpenCL images expect the *x* dimension to vary fastest, whereas in the default (C) order of :mod:`numpy` arrays, the last index varies fastest. If your array is arranged in the wrong order in memory, there are two possible fixes for this: * Convert the array to Fortran (column-major) order using :func:`numpy.asarray`. * Pass *ary.T.copy()* to the image creation function. .. versionadded:: 0.91 .. versionchanged:: 2011.2 Added *is_array* and *buffer*, which are only available on CL 1.2 and newer. .. attribute:: info Lower case versions of the :class:`mem_info` and :class:`image_info` constants may be used as attributes on instances of this class to directly query info attributes. .. attribute:: shape Return the value of the *shape* constructor argument as a :class:`tuple`. .. method:: get_image_info(param) See :class:`image_info` for values of *param*. .. method:: release() |comparable| .. function:: image_from_array(ctx, ary, num_channels=None, mode="r", norm_int=False) Build a 2D or 3D :class:`Image` from the :class:`numpy.ndarray` *ary*. If *num_channels* is greater than one, the last dimension of *ary* must be identical to *num_channels*. *ary* must be in C order. If *num_channels* is not given, it defaults to 1 for scalar types and the number of entries for :ref:`vector-types`. The :class:`ImageFormat` is chosen as the first *num_channels* components of "RGBA". :param mode: "r" or "w" for read/write .. 
note:: When reading from the image object, the indices passed to `read_imagef` are in the reverse order from what they would be when accessing *ary* from Python. If *norm_int* is `True`, then the integer values are normalized to a floating point scale of 0..1 when read. .. versionadded:: 2011.2 .. function:: enqueue_fill_image(queue, mem, color, origin, region, wait_for=None) :arg color: a buffer object (likely a :class:`numpy.ndarray`) |std-enqueue-blurb| Only available with CL 1.2. .. versionadded:: 2011.2 Transfers ^^^^^^^^^ .. autofunction:: enqueue_copy(queue, dest, src, **kwargs) Mapping Memory into Host Address Space ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. class:: MemoryMap .. method:: release(queue=None, wait_for=None) .. function:: enqueue_map_buffer(queue, buf, flags, offset, shape, dtype, order="C", strides=None, wait_for=None, is_blocking=True) |explain-waitfor| *shape*, *dtype*, and *order* have the same meaning as in :func:`numpy.empty`. See :class:`map_flags` for possible values of *flags*. *strides*, if given, overrides *order*. :return: a tuple *(array, event)*. *array* is a :class:`numpy.ndarray` representing the host side of the map. Its *.base* member contains a :class:`MemoryMap`. .. versionchanged:: 2011.1 *is_blocking* now defaults to True. .. versionchanged:: 2013.1 *order* now defaults to "C". .. versionchanged:: 2013.2 Added *strides* argument. .. function:: enqueue_map_image(queue, buf, flags, origin, region, shape, dtype, order="C", strides=None, wait_for=None, is_blocking=True) |explain-waitfor| *shape*, *dtype*, and *order* have the same meaning as in :func:`numpy.empty`. See :class:`map_flags` for possible values of *flags*. *strides*, if given, overrides *order*. :return: a tuple *(array, event)*. *array* is a :class:`numpy.ndarray` representing the host side of the map. Its *.base* member contains a :class:`MemoryMap`. .. versionchanged:: 2011.1 *is_blocking* now defaults to True. .. versionchanged:: 2013.1 *order* now defaults to "C". .. versionchanged:: 2013.2 Added *strides* argument. Samplers ^^^^^^^^ .. class:: Sampler(context, normalized_coords, addressing_mode, filter_mode) *normalized_coords* is a :class:`bool` indicating whether to use coordinates between 0 and 1 (*True*) or the texture's natural pixel size (*False*). See :class:`addressing_mode` and :class:`filter_mode` for possible argument values. .. attribute:: info Lower case versions of the :class:`sampler_info` constants may be used as attributes on instances of this class to directly query info attributes. .. method:: get_info(param) See :class:`sampler_info` for values of *param*. .. automethod:: from_int_ptr .. autoattribute:: int_ptr |comparable| Programs and Kernels -------------------- .. class:: Program(context, src) Program(context, devices, binaries) *binaries* must contain one binary for each entry in *devices*. .. attribute:: info Lower case versions of the :class:`program_info` constants may be used as attributes on instances of this class to directly query info attributes. .. method:: get_info(param) See :class:`program_info` for values of *param*. .. method:: get_build_info(device, param) See :class:`program_build_info` for values of *param*. .. method:: build(options=[], devices=None) *options* is a string of compiler flags. Returns *self*. By default, built binaries are cached in an on-disk cache called :file:`pyopencl-compiler-cache-vN-uidNAME-pyVERSION` in the directory returned by :func:`tempfile.gettempdir`. 
By setting the environment variable :envvar:`PYOPENCL_NO_CACHE` to any non-empty value, this caching is suppressed. Any options found in the environment variable :envvar:`PYOPENCL_BUILD_OPTIONS` will be appended to *options*. .. versionchanged:: 2011.1 *options* may now also be a :class:`list` of :class:`str`. .. versionchanged:: 2013.1 Added :envvar:`PYOPENCL_NO_CACHE`. Added :envvar:`PYOPENCL_BUILD_OPTIONS`. .. method:: compile(self, options=[], devices=None, headers=[]) :param headers: a list of tuples *(name, program)*. Only available with CL 1.2. .. versionadded:: 2011.2 .. attribute:: kernel_name :class:`Kernel` objects can be produced from a built (see :meth:`build`) program simply by attribute lookup. .. note:: The :class:`program_info` attributes live in the same name space and take precedence over :class:`Kernel` names. .. method:: all_kernels() Returns a list of all :class:`Kernel` objects in the :class:`Program`. .. automethod:: from_int_ptr .. autoattribute:: int_ptr |comparable| .. function:: create_program_with_built_in_kernels(context, devices, kernel_names) Only available with CL 1.2. .. versionadded:: 2011.2 .. function:: link_program(context, programs, options=[], devices=None) Only available with CL 1.2. .. versionadded:: 2011.2 .. function:: unload_platform_compiler(platform) Only available with CL 1.2. .. versionadded:: 2011.2 .. class:: Kernel(program, name) .. attribute:: info Lower case versions of the :class:`kernel_info` constants may be used as attributes on instances of this class to directly query info attributes. .. method:: get_info(param) See :class:`kernel_info` for values of *param*. .. method:: get_work_group_info(param, device) See :class:`kernel_work_group_info` for values of *param*. .. method:: get_arg_info(arg_index, param) See :class:`kernel_arg_info` for values of *param*. Only available in OpenCL 1.2 and newer. .. method:: set_arg(self, index, arg) *arg* may be * `None`: This may be passed for `__global` memory references to pass a NULL pointer to the kernel. * Anything that satisfies the Python buffer interface, in particular :class:`numpy.ndarray`, :class:`str`, or :mod:`numpy`'s sized scalars, such as :class:`numpy.int32` or :class:`numpy.float64`. .. note:: Note that Python's own :class:`int` or :class:`float` objects will not work out of the box. See :meth:`Kernel.set_scalar_arg_dtypes` for a way to make them work. Alternatively, the standard library module :mod:`struct` can be used to convert Python's native number types to binary data in a :class:`str`. * An instance of :class:`MemoryObject`. (e.g. :class:`Buffer`, :class:`Image`, etc.) * An instance of :class:`LocalMemory`. * An instance of :class:`Sampler`. .. method:: set_args(self, *args) Invoke :meth:`set_arg` on each element of *args* in turn. .. versionadded:: 0.92 .. method:: set_scalar_arg_dtypes(arg_dtypes) Inform the wrapper about the sized types of scalar :class:`Kernel` arguments. For each argument, *arg_dtypes* contains an entry. For non-scalars, this must be *None*. For scalars, it must be an object acceptable to the :class:`numpy.dtype` constructor, indicating that the corresponding scalar argument is of that type. After invoking this function with the proper information, most suitable number types will automatically be cast to the right type for kernel invocation. .. note :: The information set by this rountine is attached to a single kernel instance. A new kernel instance is created every time you use `program.kernel` attribute access. 
The following will therefore not work:: prg = cl.Program(...).build() prg.kernel.set_scalar_arg_dtypes(...) prg.kernel(queue, n_globals, None, args) .. method:: __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False) Use :func:`enqueue_nd_range_kernel` to enqueue a kernel execution, after using :meth:`set_args` to set each argument in turn. See the documentation for :meth:`set_arg` to see what argument types are allowed. |std-enqueue-blurb| *None* may be passed for local_size. If *g_times_l* is specified, the global size will be multiplied by the local size. (which makes the behavior more like Nvidia CUDA) In this case, *global_size* and *local_size* also do not have to have the same number of dimensions. .. note:: :meth:`__call__` is *not* thread-safe. It sets the arguments using :meth:`set_args` and then runs :func:`enqueue_nd_range_kernel`. Another thread could race it in doing the same things, with undefined outcome. This issue is inherited from the C-level OpenCL API. The recommended solution is to make a kernel (i.e. access `prg.kernel_name`, which corresponds to making a new kernel) for every thread that may enqueue calls to the kernel. A solution involving implicit locks was discussed and decided against on the mailing list in `October 2012 `_. .. versionchanged:: 0.92 *local_size* was promoted to third positional argument from being a keyword argument. The old keyword argument usage will continue to be accepted with a warning throughout the 0.92 release cycle. This is a backward-compatible change (just barely!) because *local_size* as third positional argument can only be a :class:`tuple` or *None*. :class:`tuple` instances are never valid :class:`Kernel` arguments, and *None* is valid as an argument, but its treatment in the wrapper had a bug (now fixed) that prevented it from working. .. versionchanged:: 2011.1 Added the *g_times_l* keyword arg. .. method:: capture_call(filename, queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False) This method supports the exact same interface as :meth:`__call__`, but instead of invoking the kernel, it writes a self-contained PyOpenCL program to *filename* that reproduces this invocation. Data and kernel source code will be packaged up in *filename*'s source code. This is mainly intended as a debugging aid. For example, it can be used to automate the task of creating a small, self-contained test case for an observed problem. It can also help separate a misbehaving kernel from a potentially large or time-consuming outer code. To use, simply change:: evt = my_kernel(queue, gsize, lsize, arg1, arg2, ...) to:: evt = my_kernel.capture_call("bug.py", queue, gsize, lsize, arg1, arg2, ...) .. versionadded:: 2013.1 .. automethod:: from_int_ptr .. autoattribute:: int_ptr |comparable| .. class:: LocalMemory(size) A helper class to pass `__local` memory arguments to kernels. .. versionadded:: 0.91.2 .. attribute:: size The size of local buffer in bytes to be provided. .. function:: enqueue_nd_range_kernel(queue, kernel, global_work_size, local_work_size, global_work_offset=None, wait_for=None, g_times_l=False) |std-enqueue-blurb| If *g_times_l* is specified, the global size will be multiplied by the local size. (which makes the behavior more like Nvidia CUDA) In this case, *global_size* and *local_size* also do not have to have the same number of dimensions. .. versionchanged:: 2011.1 Added the *g_times_l* keyword arg. .. 
function:: enqueue_task(queue, kernel, wait_for=None) |std-enqueue-blurb| .. _gl-interop: GL Interoperability ------------------- Functionality in this section is only available when PyOpenCL is compiled with GL support. See :func:`have_gl`. .. versionadded:: 0.91 .. function:: have_gl() Return *True* if PyOpenCL was compiled with OpenGL interoperability, otherwise *False*. .. function:: get_gl_sharing_context_properties() Return a :class:`list` of :class:`context_properties` that will allow a newly created context to share the currently active GL context. .. function:: get_apple_cgl_share_group() Get share group handle for current CGL context. Apple OS X only. .. versionadded:: 2011.1 .. class:: GLBuffer(context, flags, bufobj) :class:`GLBuffer` inherits from :class:`MemoryObject`. .. attribute:: gl_object .. class:: GLRenderBuffer(context, flags, bufobj) :class:`GLRenderBuffer` inherits from :class:`MemoryObject`. .. attribute:: gl_object .. class:: GLTexture(context, flags, texture_target, miplevel, texture, dims) *dims* is either 2 or 3. :class:`GLTexture` inherits from :class:`Image`. .. attribute:: gl_object .. method:: get_gl_texture_info(param) See :class:`gl_texture_info` for values of *param*. Only available when PyOpenCL is compiled with GL support. See :func:`have_gl`. .. function:: enqueue_acquire_gl_objects(queue, mem_objects, wait_for=None) *mem_objects* is a list of :class:`MemoryObject` instances. |std-enqueue-blurb| .. function:: enqueue_release_gl_objects(queue, mem_objects, wait_for=None) *mem_objects* is a list of :class:`MemoryObject` instances. |std-enqueue-blurb| .. function:: get_gl_context_info_khr(properties, param_name, platform=None) Get information on which CL device corresponds to a given GL/EGL/WGL/CGL device. See the :class:`Context` constructor for the meaning of *properties* and :class:`gl_context_info` for *param_name*. .. versionchanged:: 2011.2 Accepts the *platform* argument. Using *platform* equal to None is deprecated as of PyOpenCL 2011.2. pyopencl-2013.2/doc/upload-docs.sh0000755000175000000500000000014012245716340015457 0ustar tomussrc#! /bin/sh rsync --progress --verbose --archive --delete _build/html/* doc-upload:doc/pyopencl pyopencl-2013.2/doc/conf.py0000644000175000000500000001424512245716342014222 0ustar tomussrc# -*- coding: utf-8 -*- # # PyOpenCL documentation build configuration file, created by # sphinx-quickstart on Fri Jun 13 00:51:19 2008. # # This file is execfile()d with the current directory set to its containing dir. # # The contents of this file are pickled, so don't put values in the namespace # that aren't pickleable (module imports are okay, they're removed automatically). # # All configuration values have a default value; values that are commented out # serve to show the default value. #import sys, os # If your extensions are in another directory, add it here. If the directory # is relative to the documentation root, use os.path.abspath to make it # absolute, like shown here. #sys.path.append(os.path.abspath('some/directory')) # General configuration # --------------------- # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ 'sphinx.ext.intersphinx', 'sphinx.ext.autodoc', 'sphinx.ext.doctest', ] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] exclude_patterns = ['subst.rst'] # The suffix of source filenames. source_suffix = '.rst' # The master toctree document. 
master_doc = 'index' # General substitutions. project = 'PyOpenCL' copyright = '2009, Andreas Kloeckner' # The default replacements for |version| and |release|, also used in various # other places throughout the built documents. # # The short X.Y version. ver_dic = {} execfile("../pyopencl/version.py", ver_dic) version = ".".join(str(x) for x in ver_dic["VERSION"]) # The full version, including alpha/beta/rc tags. release = ver_dic["VERSION_TEXT"] # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: #today = '' # Else, today_fmt is used as the format for a strftime call. today_fmt = '%B %d, %Y' # List of documents that shouldn't be included in the build. #unused_docs = [] # List of directories, relative to source directories, that shouldn't be searched # for source files. #exclude_dirs = [] # If true, '()' will be appended to :func: etc. cross-reference text. #add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). #add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. #show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # Options for HTML output # ----------------------- try: import sphinx_bootstrap_theme except: from warnings import warn warn("I would like to use the sphinx bootstrap theme, but can't find it.\n" "'pip install sphinx_bootstrap_theme' to fix.") else: # Activate the theme. html_theme = 'bootstrap' html_theme_path = sphinx_bootstrap_theme.get_html_theme_path() # Theme options are theme-specific and customize the look and feel of a # theme further. For a list of options available for each theme, see the # documentation. html_theme_options = { "navbar_fixed_top": "true", "navbar_class": "navbar navbar-inverse", "navbar_site_name": "Contents", } # The style sheet to use for HTML and HTML Help pages. A file of that name # must exist either in Sphinx' static/ path, or in one of the custom paths # given in html_static_path. #html_style = 'default.css' # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". #html_title = None # The name of an image file (within the static path) to place at the top of # the sidebar. #html_logo = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. #html_use_smartypants = True # Custom sidebar templates, maps document names to template names. #html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. #html_additional_pages = {} # If false, no module index is generated. #html_use_modindex = True # If true, the reST sources are included in the HTML build as _sources/. html_copy_source = False # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. 
#html_use_opensearch = '' # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). #html_file_suffix = '' # Output file base name for HTML help builder. htmlhelp_basename = 'PyCudadoc' # Options for LaTeX output # ------------------------ # The paper size ('letter' or 'a4'). #latex_paper_size = 'letter' # The font size ('10pt', '11pt' or '12pt'). #latex_font_size = '10pt' # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, document class [howto/manual]). latex_documents = [ ('index', 'pyopencl.tex', 'PyOpenCL Documentation', 'Andreas Kloeckner', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of # the title page. #latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. #latex_use_parts = False # Additional stuff for the LaTeX preamble. #latex_preamble = '' # Documents to append as an appendix to all manuals. #latex_appendices = [] # If false, no module index is generated. #latex_use_modindex = True intersphinx_mapping = { 'http://docs.python.org/dev': None, 'http://docs.scipy.org/doc/numpy/': None, 'http://docs.makotemplates.org/en/latest/': None, } autoclass_content = "both" pyopencl-2013.2/doc/_templates/0002755000175000000500000000000012245716340015052 5ustar tomussrcpyopencl-2013.2/doc/_templates/layout.html0000644000175000000500000000012412245716340017250 0ustar tomussrc{% extends "!layout.html" %} {% set css_files = css_files + ['_static/akdoc.css']%} pyopencl-2013.2/doc/index.rst0000644000175000000500000000564412245716342014567 0ustar tomussrcWelcome to PyOpenCL's documentation! ==================================== PyOpenCL gives you easy, Pythonic access to the `OpenCL `_ parallel computation API. What makes PyOpenCL special? * Object cleanup tied to lifetime of objects. This idiom, often called `RAII `_ in C++, makes it much easier to write correct, leak- and crash-free code. * Completeness. PyOpenCL puts the full power of OpenCL's API at your disposal, if you wish. Every obscure `get_info()` query and all CL calls are accessible. * Automatic Error Checking. All errors are automatically translated into Python exceptions. * Speed. PyOpenCL's base layer is written in C++, so all the niceties above are virtually free. * Helpful Documentation. You're looking at it. ;) * Liberal license. PyOpenCL is open-source under the :ref:`MIT license ` and free for commercial, academic, and private use. Here's an example, to give you an impression: .. literalinclude:: ../examples/demo.py (You can find this example as :download:`examples/demo.py <../examples/demo.py>` in the PyOpenCL source distribution.) Tutorials ========= * `Simon McIntosh-Smith `_ and `Tom Deakin `_'s course `Hands-on OpenCL `_ contains both `lecture slides `_ and `excercises (with solutions) `_ (The course covers PyOpenCL as well as OpenCL's C and C++ APIs.) * PyOpenCL course at `PASI `_: Parts `1 `_ `2 `_ `3 `_ `4 `_ (YouTube, 2011) * PyOpenCL course at `DTU GPULab `_ and `Simula `_ (2011): `Lecture 1 `_ `Lecture 2 `_ `Problem set 1 `_ `Problem set 2 `_ * Ian Johnson's `PyOpenCL tutorial `_. Contents ======== .. toctree:: :maxdepth: 2 runtime array algorithm howto tools misc Note that this guide does not explain OpenCL programming and technology. Please refer to the official `Khronos OpenCL documentation `_ for that. PyOpenCL also has its own `web site `_, where you can find updates, new versions, documentation, and support. 
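For quick reference, the demo shown above boils down to roughly the following
workflow. This is a condensed sketch rather than the demo's actual code; the kernel
and all variable names are illustrative::

    import numpy as np
    import pyopencl as cl

    a_np = np.random.rand(50000).astype(np.float32)

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    mf = cl.mem_flags
    a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)

    prg = cl.Program(ctx, """
        __kernel void twice(__global const float *a, __global float *dest)
        {
          int gid = get_global_id(0);
          dest[gid] = 2*a[gid];
        }
        """).build()

    prg.twice(queue, a_np.shape, None, a_buf, dest_buf)

    result = np.empty_like(a_np)
    cl.enqueue_copy(queue, result, dest_buf)

Each of the calls used here is documented in the runtime section linked under
Contents above.
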
Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` pyopencl-2013.2/doc/make_constants.py0000644000175000000500000002646512245716340016313 0ustar tomussrc__copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import pyopencl as cl fission = ("cl_ext_device_fission", "2011.1") nv_devattr = ("cl_nv_device_attribute_query", "0.92") gl_sharing = ("cl_khr_gl_sharing", "0.92") cl_11 = ("CL_1.1", "0.92") cl_12 = ("CL_1.2", "2011.2") amd_devattr = ("cl_amd_device_attribute_query", "2013.2") def get_extra_lines(tup): ext_name, pyopencl_ver = tup if ext_name is not None: if ext_name.startswith("CL_"): # capital letters -> CL version, not extension yield "" yield " Available with OpenCL %s." % ( ext_name[3:]) yield "" else: yield "" yield " Available with the ``%s`` extension." % ext_name yield "" if pyopencl_ver is not None: yield "" yield " .. 
versionadded:: %s" % pyopencl_ver yield "" const_ext_lookup = { cl.status_code: { "PLATFORM_NOT_FOUND_KHR": ("cl_khr_icd", "2011.1"), "INVALID_GL_SHAREGROUP_REFERENCE_KHR": gl_sharing, "MISALIGNED_SUB_BUFFER_OFFSET": cl_11, "EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST": cl_11, "INVALID_GLOBAL_WORK_SIZE": cl_11, "COMPILE_PROGRAM_FAILURE": cl_12, "LINKER_NOT_AVAILABLE": cl_12, "LINK_PROGRAM_FAILURE": cl_12, "DEVICE_PARTITION_FAILED": cl_12, "KERNEL_ARG_INFO_NOT_AVAILABLE": cl_12, "INVALID_IMAGE_DESCRIPTOR": cl_12, "INVALID_COMPILER_OPTIONS": cl_12, "INVALID_LINKER_OPTIONS": cl_12, "INVALID_DEVICE_PARTITION_COUNT": cl_12, }, cl.device_info: { "PREFERRED_VECTOR_WIDTH_HALF": cl_11, "HOST_UNIFIED_MEMORY": cl_11, "NATIVE_VECTOR_WIDTH_CHAR": cl_11, "NATIVE_VECTOR_WIDTH_SHORT": cl_11, "NATIVE_VECTOR_WIDTH_INT": cl_11, "NATIVE_VECTOR_WIDTH_LONG": cl_11, "NATIVE_VECTOR_WIDTH_FLOAT": cl_11, "NATIVE_VECTOR_WIDTH_DOUBLE": cl_11, "NATIVE_VECTOR_WIDTH_HALF": cl_11, "OPENCL_C_VERSION": cl_11, "COMPUTE_CAPABILITY_MAJOR_NV": nv_devattr, "COMPUTE_CAPABILITY_MINOR_NV": nv_devattr, "REGISTERS_PER_BLOCK_NV": nv_devattr, "WARP_SIZE_NV": nv_devattr, "GPU_OVERLAP_NV": nv_devattr, "KERNEL_EXEC_TIMEOUT_NV": nv_devattr, "INTEGRATED_MEMORY_NV": nv_devattr, "DOUBLE_FP_CONFIG": ("cl_khr_fp64", "2011.1"), "HALF_FP_CONFIG": ("cl_khr_fp16", "2011.1"), "PROFILING_TIMER_OFFSET_AMD": amd_devattr, "TOPOLOGY_AMD": amd_devattr, "BOARD_NAME_AMD": amd_devattr, "GLOBAL_FREE_MEMORY_AMD": amd_devattr, "SIMD_PER_COMPUTE_UNIT_AMD": amd_devattr, "SIMD_WIDTH_AMD": amd_devattr, "SIMD_INSTRUCTION_WIDTH_AMD": amd_devattr, "WAVEFRONT_WIDTH_AMD": amd_devattr, "GLOBAL_MEM_CHANNELS_AMD": amd_devattr, "GLOBAL_MEM_CHANNEL_BANKS_AMD": amd_devattr, "GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD": amd_devattr, "LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD": amd_devattr, "LOCAL_MEM_BANKS_AMD": amd_devattr, "MAX_ATOMIC_COUNTERS_EXT": ("cl_ext_atomic_counters_64", "2013.2"), "PARENT_DEVICE_EXT": fission, "PARTITION_TYPES_EXT": fission, "AFFINITY_DOMAINS_EXT": fission, "REFERENCE_COUNT_EXT": fission, "PARTITION_STYLE_EXT": fission, "LINKER_AVAILABLE": cl_12, "BUILT_IN_KERNELS": cl_12, "IMAGE_MAX_BUFFER_SIZE": cl_12, "IMAGE_MAX_ARRAY_SIZE": cl_12, "PARENT_DEVICE": cl_12, "PARTITION_MAX_SUB_DEVICES": cl_12, "PARTITION_PROPERTIES": cl_12, "PARTITION_AFFINITY_DOMAIN": cl_12, "PARTITION_TYPE": cl_12, "REFERENCE_COUNT": cl_12, "PREFERRED_INTEROP_USER_SYNC": cl_12, "PRINTF_BUFFER_SIZE": cl_12, }, cl.mem_object_type: { "IMAGE2D_ARRAY": cl_12, "IMAGE1D": cl_12, "IMAGE1D_ARRAY": cl_12, "IMAGE1D_BUFFER": cl_12, }, cl.device_type: { "CUSTOM": cl_12, }, cl.context_properties: { "GL_CONTEXT_KHR": gl_sharing, "EGL_DISPLAY_KHR": gl_sharing, "GLX_DISPLAY_KHR": gl_sharing, "WGL_HDC_KHR": gl_sharing, "CGL_SHAREGROUP_KHR": gl_sharing, "OFFLINE_DEVICES_AMD": ("cl_amd_offline_devices", "2011.1"), }, cl.device_fp_config: { "SOFT_FLOAT": cl_11, "CORRECTLY_ROUNDED_DIVIDE_SQRT": cl_12, }, cl.context_info: { "NUM_DEVICES": cl_11, "INTEROP_USER_SYNC": cl_12, }, cl.channel_order: { "Rx": cl_11, "RGx": cl_11, "RGBx": cl_11, }, cl.kernel_work_group_info: { "PREFERRED_WORK_GROUP_SIZE_MULTIPLE": cl_11, "PRIVATE_MEM_SIZE": cl_11, "GLOBAL_WORK_SIZE": cl_12, }, cl.addressing_mode: { "MIRRORED_REPEAT": cl_11, }, cl.event_info: { "CONTEXT": cl_11, }, cl.mem_info: { "ASSOCIATED_MEMOBJECT": cl_11, "OFFSET": cl_11, }, cl.image_info: { "ARRAY_SIZE": cl_12, "BUFFER": cl_12, "NUM_MIP_LEVELS": cl_12, "NUM_SAMPLES": cl_12, }, cl.map_flags: { "WRITE_INVALIDATE_REGION": cl_12, }, cl.program_info: { "NUM_KERNELS": cl_12, 
"KERNEL_NAMES": cl_12, }, cl.program_build_info: { "BINARY_TYPE": cl_12, }, cl.program_binary_type: { "NONE": cl_12, "COMPILED_OBJECT": cl_12, "LIBRARY": cl_12, "EXECUTABLE": cl_12, }, cl.kernel_info: { "ATTRIBUTES": cl_12, }, cl.kernel_arg_info: { "ADDRESS_QUALIFIER": cl_12, "ACCESS_QUALIFIER": cl_12, "TYPE_NAME": cl_12, "ARG_NAME": cl_12, }, cl.kernel_arg_address_qualifier: { "GLOBAL": cl_12, "LOCAL": cl_12, "CONSTANT": cl_12, "PRIVATE": cl_12, }, cl.kernel_arg_access_qualifier: { "READ_ONLY": cl_12, "WRITE_ONLY": cl_12, "READ_WRITE": cl_12, "NONE": cl_12, }, cl.command_type: { "READ_BUFFER_RECT": cl_11, "WRITE_BUFFER_RECT": cl_11, "COPY_BUFFER_RECT": cl_11, "USER": cl_11, "MIGRATE_MEM_OBJECT_EXT": ("cl_ext_migrate_memobject", "2011.2"), "BARRIER": cl_12, "MIGRATE_MEM_OBJECTS": cl_12, "FILL_BUFFER": cl_12, "FILL_IMAGE": cl_12, }, cl.mem_flags: { "USE_PERSISTENT_MEM_AMD": ("cl_amd_device_memory_flags", "2011.1"), "HOST_WRITE_ONLY": cl_12, }, cl.device_partition_property: { "EQUALLY": cl_12, "BY_COUNTS": cl_12, "BY_NAMES": cl_12, "BY_AFFINITY_DOMAIN": cl_12, "PROPERTIES_LIST_END": cl_12, "PARTITION_BY_COUNTS_LIST_END": cl_12, "PARTITION_BY_NAMES_LIST_END": cl_12, }, cl.device_affinity_domain: { "NUMA": cl_12, "L4_CACHE": cl_12, "L3_CACHE": cl_12, "L2_CACHE": cl_12, "L1_CACHE": cl_12, "NEXT_PARITIONNABLE": cl_12, }, cl.device_partition_property_ext: { "EQUALLY": fission, "BY_COUNTS": fission, "BY_NAMES": fission, "BY_AFFINITY_DOMAIN": fission, "PROPERTIES_LIST_END": fission, "PARTITION_BY_COUNTS_LIST_END": fission, "PARTITION_BY_NAMES_LIST_END": fission, }, cl.affinity_domain_ext: { "L1_CACHE": fission, "L2_CACHE": fission, "L3_CACHE": fission, "L4_CACHE": fission, "NUMA": fission, "NEXT_FISSIONABLE": fission, }, cl.mem_migration_flags: { "HOST": cl_12, "CONTENT_UNDEFINED": cl_12, }, cl.migrate_mem_object_flags_ext: { "HOST": ("cl_ext_migrate_memobject", "2011.2"), }, } try: gl_ci = cl.gl_context_info except AttributeError: pass else: const_ext_lookup[gl_ci] = { getattr(gl_ci, "CURRENT_DEVICE_FOR_GL_CONTEXT_KHR", None): gl_sharing, getattr(gl_ci, "DEVICES_FOR_GL_CONTEXT_KHR", None): gl_sharing, } cls_ext_lookup = { #cl.buffer_create_type: ("CL_1.1", "0.92"), } def doc_class(cls): print ".. class :: %s" % cls.__name__ print if cls.__name__.startswith("gl_"): print " Only available when PyOpenCL is compiled with GL support." print " See :func:`have_gl`." print if cls in cls_ext_lookup: for l in get_extra_lines(cls_ext_lookup[cls]): print l cls_const_ext = const_ext_lookup.get(cls, {}) for name in sorted(dir(cls)): if not name.startswith("_") and not name in ["to_string", "names", "values"]: print " .. attribute :: %s" % name if name in cls_const_ext: for l in get_extra_lines(cls_const_ext[name]): print " "+l print " .. method :: to_string(value)" print print " Returns a :class:`str` representing *value*." print print " .. versionadded:: 0.91" print if not cl.have_gl(): print ".. warning::" print print " This set of PyOpenCL documentation is incomplete because it" print " was generated on a PyOpenCL build that did not support OpenGL." print print ".. This is an automatically generated file. DO NOT EDIT" print for cls in cl.CONSTANT_CLASSES: doc_class(cls) pyopencl-2013.2/doc/howto.rst0000644000175000000500000000635612245716340014617 0ustar tomussrcHow-tos ======= How to use struct types with PyOpenCL ------------------------------------- We import and initialize PyOpenCL as usual: .. 
doctest:: :options: +ELLIPSIS >>> import numpy as np >>> import pyopencl as cl >>> import pyopencl.tools >>> import pyopencl.array >>> ctx = cl.create_some_context(interactive=False) >>> queue = cl.CommandQueue(ctx) Then, suppose we would like to declare a struct consisting of an integer and a floating point number. We first create a :class:`numpy.dtype` along these lines: .. doctest:: >>> my_struct = np.dtype([("field1", np.int32), ("field2", np.float32)]) >>> print my_struct [('field1', '`_. So as a first step, we match our dtype against CL's version: .. doctest:: >>> my_struct, my_struct_c_decl = cl.tools.match_dtype_to_c_struct( ... ctx.devices[0], "my_struct", my_struct) >>> print my_struct_c_decl typedef struct { int field1; float field2; } my_struct; We then tell PyOpenCL about our new type. .. doctest:: >>> my_struct = cl.tools.get_or_register_dtype("my_struct", my_struct) Next, we can create some data of that type on the host and transfer it to the device: .. doctest:: >>> ary_host = np.empty(20, my_struct) >>> ary_host["field1"].fill(217) >>> ary_host["field2"].fill(1000) >>> ary_host[13]["field2"] = 12 >>> print ary_host [(217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 12.0) (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0)] >>> ary = cl.array.to_device(queue, ary_host) We can then operate on the array with our own kernels: .. doctest:: >>> prg = cl.Program(ctx, my_struct_c_decl + """ ... __kernel void set_to_1(__global my_struct *a) ... { ... a[get_global_id(0)].field1 = 1; ... } ... """).build() >>> evt = prg.set_to_1(queue, ary.shape, None, ary.data) >>> print ary [(1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 12.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0)] as well as with PyOpenCL's built-in operations: >>> from pyopencl.elementwise import ElementwiseKernel >>> elwise = ElementwiseKernel(ctx, "my_struct *a", "a[i].field1 = 2;", ... preamble=my_struct_c_decl) >>> evt = elwise(ary) >>> print ary [(2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 12.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0)] pyopencl-2013.2/doc/Makefile0000644000175000000500000001115012245716340014351 0ustar tomussrc# Makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = python `which sphinx-build` PAPER = BUILDDIR = _build # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d build/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" @echo " singlehtml to make a single large HTML file" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " latexpdf to make LaTeX files and run them through pdflatex" @echo " text to make text files" @echo " man to make manual pages" @echo " changes to make an overview of all changed/added/deprecated items" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" .PHONY: help clean html web pickle htmlhelp latex changes linkcheck constants: python make_constants.py > constants.inc clean: -rm -rf build/* html: constants $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." singlehtml: constants $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml @echo @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." pickle: constants $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." json: constants $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." htmlhelp: constants $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." qthelp: constants $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/loopy.qhcp" @echo "To view the help file:" @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/loopy.qhc" devhelp: constants $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @echo "Build finished." @echo "To view the help file:" @echo "# mkdir -p $$HOME/.local/share/devhelp/loopy" @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/loopy" @echo "# devhelp" epub: constants $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub @echo @echo "Build finished. The epub file is in $(BUILDDIR)/epub." latex: constants $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." @echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." latexpdf: constants $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." make -C $(BUILDDIR)/latex all-pdf @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." text: constants $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text @echo @echo "Build finished. The text files are in $(BUILDDIR)/text." man: constants $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man @echo @echo "Build finished. 
The manual pages are in $(BUILDDIR)/man." changes: constants $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." linkcheck: constants $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." doctest: constants $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." pyopencl-2013.2/bpl-subset/0002755000175000000500000000000012245716340014230 5ustar tomussrcpyopencl-2013.2/README_SETUP.txt0000644000175000000500000000210212245716340014617 0ustar tomussrcHi, welcome. This Python package uses aksetup for installation, which means that installation should be easy and quick. If you don't want to continue reading, just try the regular ./configure.py --help ./configure.py --some-options make sudo make install That should do the trick. (By the way: If a config option says "several ok", then you may specify several values, separated by commas.) aksetup also supports regular distutils installation, without using configure: python setup.py build sudo python setup.py install In this case, configuration is obtained from files in this order: /etc/aksetup-defaults.py $HOME/.aksetup-defaults.py $PACKAGEDIR/siteconf.py Once you've run configure, you can copy options from your siteconf.py file to one of these files, and you won't ever have to configure them again manually. In fact, you may pass the options "--update-user" and "--update-global" to configure, and it will automatically update these files for you. This is particularly handy if you want to perform an unattended or automatic installation via easy_install. pyopencl-2013.2/contrib/0002755000175000000500000000000012245716340013610 5ustar tomussrcpyopencl-2013.2/contrib/pyopencl.vim0000644000175000000500000000451312245716340016157 0ustar tomussrc" Vim highlighting for PyOpenCL " ----------------------------- " " (C) Andreas Kloeckner 2011, MIT license " " Uses parts of mako.vim by Armin Ronacher. " " Installation: " Just drop this file into ~/.vim/syntax/pyopencl.vim " " Then do " :set filetype=pyopencl " and use " """//CL// ...code...""" " for OpenCL code included in your Python file. " " You may also include a line " vim: filetype=pyopencl.python " at the end of your file to set the file type automatically. " " Optional: Install opencl.vim from " http://www.vim.org/scripts/script.php?script_id=3157 runtime! syntax/python.vim unlet b:current_syntax try syntax include @clCode syntax/opencl.vim catch syntax include @clCode syntax/c.vim endtry unlet b:current_syntax syn include @pythonTop syntax/python.vim " {{{ mako syn region clmakoLine start="^\s*%" skip="\\$" end="$" syn region clmakoVariable start=#\${# end=#}# contains=@pythonTop syn region clmakoBlock start=#<%!# end=#%># keepend contains=@pythonTop syn match clmakoAttributeKey containedin=clmakoTag contained "[a-zA-Z_][a-zA-Z0-9_]*=" syn region clmakoAttributeValue containedin=clmakoTag contained start=/"/ skip=/\\"/ end=/"/ syn region clmakoAttributeValue containedin=clmakoTag contained start=/'/ skip=/\\'/ end=/'/ syn region clmakoTag start="" end="/\?>" " The C highlighter's paren error detection screws up highlighting of " Mako variables in C parens--turn it off. 
syn clear cParen syn clear cParenError if !exists("c_no_bracket_error") syn clear cBracket endif syn cluster clmakoCode contains=clmakoLine,clmakoVariable,clmakoBlock,clmakoTag hi link clmakoLine Preproc hi link clmakoVariable Preproc hi link clmakoBlock Preproc hi link clmakoTag Define hi link clmakoAttributeKey String hi link clmakoAttributeValue String " }}} syn region pythonCLString \ start=+[uU]\=\z('''\|"""\)//CL\(:[a-zA-Z_0-9]\+\)\?//+ end="\z1" keepend \ contains=@clCode,@clmakoCode syn region pythonCLRawString \ start=+[uU]\=[rR]\z('''\|"""\)//CL\(:[a-zA-Z_0-9]\+\)\?//+ end="\z1" keepend \ contains=@clCode,@clmakoCode " Uncomment if you still want the code highlighted as a string. " hi link pythonCLString String " hi link pythonCLRawString String syntax sync fromstart let b:current_syntax = "pyopencl" " vim: foldmethod=marker pyopencl-2013.2/contrib/fortran-to-opencl/0002755000175000000500000000000012245716340017161 5ustar tomussrcpyopencl-2013.2/contrib/fortran-to-opencl/translate.py0000644000175000000500000012160112245716340021527 0ustar tomussrcfrom __future__ import division, with_statement __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" import cgen import numpy as np import re from pymbolic.parser import Parser as ExpressionParserBase from pymbolic.mapper import CombineMapper import pymbolic.primitives from pymbolic.mapper.c_code import CCodeMapper as CCodeMapperBase from warnings import warn import pytools.lex import re class TranslatorWarning(UserWarning): pass class TranslationError(RuntimeError): pass # {{{ AST components def dtype_to_ctype(dtype): if dtype is None: raise ValueError("dtype may not be None") dtype = np.dtype(dtype) if dtype == np.int64: return "long" elif dtype == np.uint64: return "unsigned long" elif dtype == np.int32: return "int" elif dtype == np.uint32: return "unsigned int" elif dtype == np.int16: return "short int" elif dtype == np.uint16: return "short unsigned int" elif dtype == np.int8: return "signed char" elif dtype == np.uint8: return "unsigned char" elif dtype == np.float32: return "float" elif dtype == np.float64: return "double" elif dtype == np.complex64: return "cfloat_t" elif dtype == np.complex128: return "cdouble_t" else: raise ValueError, "unable to map dtype '%s'" % dtype class POD(cgen.POD): def get_decl_pair(self): return [dtype_to_ctype(self.dtype)], self.name # }}} # {{{ expression parser _less_than = intern("less_than") _greater_than = intern("greater_than") _less_equal = intern("less_equal") _greater_equal = intern("greater_equal") _equal = intern("equal") _not_equal = intern("not_equal") _not = intern("not") _and = intern("and") _or = intern("or") class TypedLiteral(pymbolic.primitives.Leaf): def __init__(self, value, dtype): self.value = value self.dtype = np.dtype(dtype) def __getinitargs__(self): return self.value, self.dtype mapper_method = intern("map_literal") class FortranExpressionParser(ExpressionParserBase): # FIXME double/single prec literals lex_table = [ (_less_than, pytools.lex.RE(r"\.lt\.", re.I)), (_greater_than, pytools.lex.RE(r"\.gt\.", re.I)), (_less_equal, pytools.lex.RE(r"\.le\.", re.I)), (_greater_equal, pytools.lex.RE(r"\.ge\.", re.I)), (_equal, pytools.lex.RE(r"\.eq\.", re.I)), (_not_equal, pytools.lex.RE(r"\.ne\.", re.I)), (_not, pytools.lex.RE(r"\.not\.", re.I)), (_and, pytools.lex.RE(r"\.and\.", re.I)), (_or, pytools.lex.RE(r"\.or\.", re.I)), ] + ExpressionParserBase.lex_table def __init__(self, tree_walker): self.tree_walker = tree_walker _PREC_FUNC_ARGS = 1 def parse_terminal(self, pstate): scope = self.tree_walker.scope_stack[-1] from pymbolic.primitives import Subscript, Call, Variable from pymbolic.parser import ( _identifier, _openpar, _closepar, _float) next_tag = pstate.next_tag() if next_tag is _float: value = pstate.next_str_and_advance().lower() if "d" in value: dtype = np.float64 else: dtype = np.float32 value = value.replace("d", "e") if value.startswith("."): prev_value = value value = "0"+value print value, prev_value elif value.startswith("-."): prev_value = value value = "-0"+value[1:] print value, prev_value return TypedLiteral(value, dtype) elif next_tag is _identifier: name = pstate.next_str_and_advance() if pstate.is_at_end() or pstate.next_tag() is not _openpar: # not a subscript scope.use_name(name) return Variable(name) left_exp = Variable(name) pstate.advance() pstate.expect_not_end() if scope.is_known(name): cls = Subscript else: cls = Call if pstate.next_tag is _closepar: pstate.advance() left_exp = cls(left_exp, ()) else: args = self.parse_expression(pstate, self._PREC_FUNC_ARGS) if not isinstance(args, tuple): args = (args,) left_exp = cls(left_exp, args) pstate.expect(_closepar) pstate.advance() return left_exp 
else: return ExpressionParserBase.parse_terminal( self, pstate) COMP_MAP = { _less_than: "<", _less_equal: "<=", _greater_than: ">", _greater_equal: ">=", _equal: "==", _not_equal: "!=", } def parse_prefix(self, pstate, min_precedence=0): from pymbolic.parser import _PREC_UNARY import pymbolic.primitives as primitives pstate.expect_not_end() if pstate.is_next(_not): pstate.advance() return primitives.LogicalNot( self.parse_expression(pstate, _PREC_UNARY)) else: return ExpressionParserBase.parse_prefix(self, pstate) def parse_postfix(self, pstate, min_precedence, left_exp): from pymbolic.parser import ( _PREC_CALL, _PREC_COMPARISON, _openpar, _PREC_LOGICAL_OR, _PREC_LOGICAL_AND) from pymbolic.primitives import ( ComparisonOperator, LogicalAnd, LogicalOr) next_tag = pstate.next_tag() if next_tag is _openpar and _PREC_CALL > min_precedence: raise TranslationError("parenthesis operator only works on names") elif next_tag in self.COMP_MAP and _PREC_COMPARISON > min_precedence: pstate.advance() left_exp = ComparisonOperator( left_exp, self.COMP_MAP[next_tag], self.parse_expression(pstate, _PREC_COMPARISON)) did_something = True elif next_tag is _and and _PREC_LOGICAL_AND > min_precedence: pstate.advance() left_exp = LogicalAnd((left_exp, self.parse_expression(pstate, _PREC_LOGICAL_AND))) did_something = True elif next_tag is _or and _PREC_LOGICAL_OR > min_precedence: pstate.advance() left_exp = LogicalOr((left_exp, self.parse_expression(pstate, _PREC_LOGICAL_OR))) did_something = True else: left_exp, did_something = ExpressionParserBase.parse_postfix( self, pstate, min_precedence, left_exp) if isinstance(left_exp, tuple) and min_precedence < self._PREC_FUNC_ARGS: # this must be a complex literal assert len(left_exp) == 2 r, i = left_exp dtype = (r.dtype.type(0) + i.dtype.type(0)) if dtype == np.float32: dtype = np.complex64 else: dtype = np.complex128 left_exp = TypedLiteral(left_exp, dtype) return left_exp, did_something # }}} # {{{ expression generator class TypeInferenceMapper(CombineMapper): def __init__(self, scope): self.scope = scope def combine(self, dtypes): return sum(dtype.type(1) for dtype in dtypes).dtype def map_literal(self, expr): return expr.dtype def map_constant(self, expr): return np.asarray(expr).dtype def map_variable(self, expr): return self.scope.get_type(expr.name) def map_call(self, expr): name = expr.function.name if name == "fromreal": arg, = expr.parameters base_dtype = self.rec(arg) tgt_real_dtype = (np.float32(0)+base_dtype.type(0)).dtype assert tgt_real_dtype.kind == "f" if tgt_real_dtype == np.float32: return np.dtype(np.complex64) elif tgt_real_dtype == np.float64: return np.dtype(np.complex128) else: raise RuntimeError("unexpected complex type") else: return CombineMapper.map_call(self, expr) class ComplexCCodeMapper(CCodeMapperBase): def __init__(self, infer_type): CCodeMapperBase.__init__(self) self.infer_type = infer_type def complex_type_name(self, dtype): if dtype == np.complex64: return "cfloat" if dtype == np.complex128: return "cdouble" else: raise RuntimeError def map_sum(self, expr, enclosing_prec): tgt_dtype = self.infer_type(expr) is_complex = tgt_dtype.kind == 'c' if not is_complex: return CCodeMapperBase.map_sum(self, expr, enclosing_prec) else: tgt_name = self.complex_type_name(tgt_dtype) reals = [child for child in expr.children if 'c' != self.infer_type(child).kind] complexes = [child for child in expr.children if 'c' == self.infer_type(child).kind] from pymbolic.mapper.stringifier import PREC_SUM real_sum = self.join_rec(" + ", reals, 
PREC_SUM) complex_sum = self.join_rec(" + ", complexes, PREC_SUM) if real_sum: result = "%s_fromreal(%s) + %s" % (tgt_name, real_sum, complex_sum) else: result = complex_sum return self.parenthesize_if_needed(result, enclosing_prec, PREC_SUM) def map_product(self, expr, enclosing_prec): tgt_dtype = self.infer_type(expr) is_complex = 'c' == tgt_dtype.kind if not is_complex: return CCodeMapperBase.map_product(self, expr, enclosing_prec) else: tgt_name = self.complex_type_name(tgt_dtype) reals = [child for child in expr.children if 'c' != self.infer_type(child).kind] complexes = [child for child in expr.children if 'c' == self.infer_type(child).kind] from pymbolic.mapper.stringifier import PREC_PRODUCT, PREC_NONE real_prd = self.join_rec("*", reals, PREC_PRODUCT) if len(complexes) == 1: myprec = PREC_PRODUCT else: myprec = PREC_NONE complex_prd = self.rec(complexes[0], myprec) for child in complexes[1:]: complex_prd = "%s_mul(%s, %s)" % ( tgt_name, complex_prd, self.rec(child, PREC_NONE)) if real_prd: # elementwise semantics are correct result = "%s * %s" % (real_prd, complex_prd) else: result = complex_prd return self.parenthesize_if_needed(result, enclosing_prec, PREC_PRODUCT) def map_quotient(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_NONE n_complex = 'c' == self.infer_type(expr.numerator).kind d_complex = 'c' == self.infer_type(expr.denominator).kind tgt_dtype = self.infer_type(expr) if not (n_complex or d_complex): return CCodeMapperBase.map_quotient(self, expr, enclosing_prec) elif n_complex and not d_complex: # elementwise semnatics are correct return CCodeMapperBase.map_quotient(self, expr, enclosing_prec) elif not n_complex and d_complex: return "%s_rdivide(%s, %s)" % ( self.complex_type_name(tgt_dtype), self.rec(expr.numerator, PREC_NONE), self.rec(expr.denominator, PREC_NONE)) else: return "%s_divide(%s, %s)" % ( self.complex_type_name(tgt_dtype), self.rec(expr.numerator, PREC_NONE), self.rec(expr.denominator, PREC_NONE)) def map_remainder(self, expr, enclosing_prec): tgt_dtype = self.infer_type(expr) if 'c' == tgt_dtype.kind: raise RuntimeError("complex remainder not defined") return CCodeMapperBase.map_remainder(self, expr, enclosing_prec) def map_power(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_NONE tgt_dtype = self.infer_type(expr) if 'c' == tgt_dtype.kind: if expr.exponent in [2, 3, 4]: value = expr.base for i in range(expr.exponent-1): value = value * expr.base return self.rec(value, enclosing_prec) else: b_complex = 'c' == self.infer_type(expr.base).kind e_complex = 'c' == self.infer_type(expr.exponent).kind if b_complex and not e_complex: return "%s_powr(%s, %s)" % ( self.complex_type_name(tgt_dtype), self.rec(expr.base, PREC_NONE), self.rec(expr.exponent, PREC_NONE)) else: return "%s_pow(%s, %s)" % ( self.complex_type_name(tgt_dtype), self.rec(expr.base, PREC_NONE), self.rec(expr.exponent, PREC_NONE)) return CCodeMapperBase.map_power(self, expr, enclosing_prec) class CCodeMapper(ComplexCCodeMapper): # Whatever is needed to mop up after Fortran goes here. # Stuff that deals with generating real-valued code # from complex code goes above. 
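    # Specifically, the methods below
    #  - rename Fortran intrinsics to their C / pyopencl-complex counterparts
    #    (conjg/dconjg -> conj, aimag -> imag, dble -> real,
    #    cdexp/cdlog/cdsqrt -> exp/log/sqrt), prefixing complex-valued calls
    #    with the complex type name (cfloat_/cdouble_),
    #  - insert an (int) cast on subscript indices that are not already plain
    #    (signed) integers,
    #  - dereference arguments that the argument-analysis pass says must be
    #    passed as pointers, and
    #  - emit complex literals as brace initializers.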
def __init__(self, translator, scope): ComplexCCodeMapper.__init__(self, scope.get_type_inference_mapper()) self.translator = translator self.scope = scope def map_subscript(self, expr, enclosing_prec): idx_dtype = self.infer_type(expr.index) if not 'i' == idx_dtype.kind or 'u' == idx_dtype.kind: ind_prefix = "(int) " else: ind_prefix = "" idx = expr.index if isinstance(idx, tuple) and len(idx) == 1: idx, = idx from pymbolic.mapper.stringifier import PREC_NONE, PREC_CALL return self.parenthesize_if_needed( self.format("%s[%s%s]", self.scope.translate_var_name(expr.aggregate.name), ind_prefix, self.rec(idx, PREC_NONE)), enclosing_prec, PREC_CALL) def map_call(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_NONE tgt_dtype = self.infer_type(expr) name = expr.function.name if 'f' == tgt_dtype.kind and name == "abs": name = "fabs" if 'c' == tgt_dtype.kind: if name in ["conjg", "dconjg"]: name = "conj" if name[:2] == "cd" and name[2:] in ["log", "exp", "sqrt"]: name = name[2:] if name == "aimag": name = "imag" if name == "dble": name = "real" name = "%s_%s" % ( self.complex_type_name(tgt_dtype), name) return self.format("%s(%s)", name, self.join_rec(", ", expr.parameters, PREC_NONE)) def map_variable(self, expr, enclosing_prec): # guaranteed to not be a subscript or a call name = expr.name shape = self.scope.get_shape(name) name = self.scope.translate_var_name(name) if expr.name in self.scope.arg_names: arg_idx = self.scope.arg_names.index(name) if self.translator.arg_needs_pointer( self.scope.subprogram_name, arg_idx): return "*"+name else: return name elif shape not in [(), None]: return "*"+name else: return name def map_literal(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_NONE if expr.dtype.kind == "c": r, i = expr.value return "{ %s, %s }" % (self.rec(r, PREC_NONE), self.rec(i, PREC_NONE)) else: return expr.value def map_wildcard(self, expr, enclosing_prec): return ":" # }}} class Scope(object): def __init__(self, subprogram_name, arg_names=set()): self.subprogram_name = subprogram_name # map name to data self.data_statements = {} # map first letter to type self.implicit_types = {} # map name to dim tuple self.dim_map = {} # map name to dim tuple self.type_map = {} # map name to data self.data = {} self.arg_names = arg_names self.used_names = set() self.type_inf_mapper = None def known_names(self): return (self.used_names | set(self.dim_map.iterkeys()) | set(self.type_map.iterkeys())) def is_known(self, name): return (name in self.used_names or name in self.dim_map or name in self.type_map) def use_name(self, name): self.used_names.add(name) def get_type(self, name): try: return self.type_map[name] except KeyError: if self.implicit_types is None: raise TranslationError( "no type for '%s' found in implict none routine" % name) return self.implicit_types.get(name[0], np.dtype(np.int32)) def get_shape(self, name): return self.dim_map.get(name, ()) def get_type_inference_mapper(self): if self.type_inf_mapper is None: self.type_inf_mapper = TypeInferenceMapper(self) return self.type_inf_mapper def translate_var_name(self, name): shape = self.dim_map.get(name) if name in self.data and shape is not None: return "%s_%s" % (self.subprogram_name, name) else: return name class FTreeWalkerBase(object): def __init__(self): self.scope_stack = [] self.expr_parser = FortranExpressionParser(self) def rec(self, expr, *args, **kwargs): mro = list(type(expr).__mro__) dispatch_class = kwargs.pop("dispatch_class", type(self)) while mro: method_name = 
"map_"+mro.pop(0).__name__ try: method = getattr(dispatch_class, method_name) except AttributeError: pass else: return method(self, expr, *args, **kwargs) raise NotImplementedError( "%s does not know how to map type '%s'" % (type(self).__name__, type(expr))) ENTITY_RE = re.compile( r"^(?P[_0-9a-zA-Z]+)" "(\((?P[-+*0-9:a-zA-Z,]+)\))?$") def parse_dimension_specs(self, dim_decls): def parse_bounds(bounds_str): start_end = bounds_str.split(":") assert 1 <= len(start_end) <= 2 return (self.parse_expr(s) for s in start_end) for decl in dim_decls: entity_match = self.ENTITY_RE.match(decl) assert entity_match groups = entity_match.groupdict() name = groups["name"] assert name if groups["shape"]: shape = [parse_bounds(s) for s in groups["shape"].split(",")] else: shape = None yield name, shape def __call__(self, expr, *args, **kwargs): return self.rec(expr, *args, **kwargs) # {{{ expressions def parse_expr(self, expr_str): return self.expr_parser(expr_str) # }}} class ArgumentAnalayzer(FTreeWalkerBase): def __init__(self): FTreeWalkerBase.__init__(self) # map (func, arg_nr) to # 'w' for 'needs pointer' # [] for no obstacle to de-pointerification known # [(func_name, arg_nr), ...] # depends on how this arg is used self.arg_usage_info = {} def arg_needs_pointer(self, func, arg_nr): data = self.arg_usage_info.get((func, arg_nr), []) if isinstance(data, list): return any( self.arg_needs_pointer(sub_func, sub_arg_nr) for sub_func, sub_arg_nr in data) return True # {{{ map_XXX functions def map_BeginSource(self, node): scope = Scope(None) self.scope_stack.append(scope) for c in node.content: self.rec(c) def map_Subroutine(self, node): scope = Scope(node.name, list(node.args)) self.scope_stack.append(scope) for c in node.content: self.rec(c) self.scope_stack.pop() def map_EndSubroutine(self, node): pass def map_Implicit(self, node): pass # {{{ types, declarations def map_Equivalence(self, node): raise NotImplementedError("equivalence") def map_Dimension(self, node): scope = self.scope_stack[-1] for name, shape in self.parse_dimension_specs(node.items): if name in scope.arg_names: arg_idx = scope.arg_names.index(name) self.arg_usage_info[scope.subprogram_name, arg_idx] = "w" def map_External(self, node): pass def map_type_decl(self, node): scope = self.scope_stack[-1] for name, shape in self.parse_dimension_specs(node.entity_decls): if shape is not None and name in scope.arg_names: arg_idx = scope.arg_names.index(name) self.arg_usage_info[scope.subprogram_name, arg_idx] = "w" map_Logical = map_type_decl map_Integer = map_type_decl map_Real = map_type_decl map_Complex = map_type_decl # }}} def map_Data(self, node): pass def map_Parameter(self, node): raise NotImplementedError("parameter") # {{{ I/O def map_Open(self, node): pass def map_Format(self, node): pass def map_Write(self, node): pass def map_Print(self, node): pass def map_Read1(self, node): pass # }}} def map_Assignment(self, node): scope = self.scope_stack[-1] lhs = self.parse_expr(node.variable) from pymbolic.primitives import Subscript, Call if isinstance(lhs, Subscript): lhs_name = lhs.aggregate.name elif isinstance(lhs, Call): # in absence of dim info, subscripts get parsed as calls lhs_name = lhs.function.name else: lhs_name = lhs.name if lhs_name in scope.arg_names: arg_idx = scope.arg_names.index(lhs_name) self.arg_usage_info[scope.subprogram_name, arg_idx] = "w" def map_Allocate(self, node): raise NotImplementedError("allocate") def map_Deallocate(self, node): raise NotImplementedError("deallocate") def map_Save(self, node): raise 
NotImplementedError("save") def map_Line(self, node): raise NotImplementedError def map_Program(self, node): raise NotImplementedError def map_Entry(self, node): raise NotImplementedError # {{{ control flow def map_Goto(self, node): pass def map_Call(self, node): scope = self.scope_stack[-1] from pymbolic.primitives import Subscript, Variable for i, arg_str in enumerate(node.items): arg = self.parse_expr(arg_str) if isinstance(arg, (Variable, Subscript)): if isinstance(arg, Subscript): arg_name = arg.aggregate.name else: arg_name = arg.name if arg_name in scope.arg_names: arg_idx = scope.arg_names.index(arg_name) arg_usage = self.arg_usage_info.setdefault( (scope.subprogram_name, arg_idx), []) if isinstance(arg_usage, list): arg_usage.append((node.designator, i)) def map_Return(self, node): pass def map_ArithmeticIf(self, node): pass def map_If(self, node): for c in node.content: self.rec(c) def map_IfThen(self, node): for c in node.content: self.rec(c) def map_ElseIf(self, node): pass def map_Else(self, node): pass def map_EndIfThen(self, node): pass def map_Do(self, node): for c in node.content: self.rec(c) def map_EndDo(self, node): pass def map_Continue(self, node): pass def map_Stop(self, node): pass def map_Comment(self, node): pass # }}} # }}} # {{{ translator class F2CLTranslator(FTreeWalkerBase): def __init__(self, addr_space_hints, force_casts, arg_info, use_restrict_pointers): FTreeWalkerBase.__init__(self) self.addr_space_hints = addr_space_hints self.force_casts = force_casts self.arg_info = arg_info self.use_restrict_pointers = use_restrict_pointers def arg_needs_pointer(self, subprogram_name, arg_index): return self.arg_info.arg_needs_pointer(subprogram_name, arg_index) # {{{ declaration helpers def get_declarator(self, name): scope = self.scope_stack[-1] return POD(scope.get_type(name), name) def get_declarations(self): scope = self.scope_stack[-1] result = [] pre_func_decl = [] def gen_shape(start_end): return ":".join(self.gen_expr(s) for s in start_end) for name in sorted(scope.known_names()): shape = scope.dim_map.get(name) if shape is not None: dim_stmt = cgen.Statement( "dimension \"fortran\" %s[%s]" % ( scope.translate_var_name(name), ", ".join(gen_shape(s) for s in shape) )) # cannot omit 'dimension' decl even for rank-1 args: result.append(dim_stmt) if name in scope.data: assert name not in scope.arg_names data = scope.data[name] if shape is None: assert len(data) == 1 result.append( cgen.Initializer( self.get_declarator(name), self.gen_expr(data[0]) )) else: from cgen.opencl import CLConstant pre_func_decl.append( cgen.Initializer( CLConstant( cgen.ArrayOf(self.get_declarator( "%s_%s" % (scope.subprogram_name, name)))), "{ %s }" % ",\n".join(self.gen_expr(x) for x in data) )) else: if name not in scope.arg_names: if shape is not None: result.append(cgen.Statement( "%s %s[nitemsof(%s)]" % ( dtype_to_ctype(scope.get_type(name)), name, name))) else: result.append(self.get_declarator(name)) return pre_func_decl, result def map_statement_list(self, content): body = [] for c in content: mapped = self.rec(c) if mapped is None: warn("mapping '%s' returned None" % type(c)) elif isinstance(mapped, list): body.extend(mapped) else: body.append(mapped) return body # }}} # {{{ map_XXX functions def map_BeginSource(self, node): scope = Scope(None) self.scope_stack.append(scope) return self.map_statement_list(node.content) def map_Subroutine(self, node): assert not node.prefix assert not hasattr(node, "suffix") scope = Scope(node.name, list(node.args)) 
        self.scope_stack.append(scope)

        body = self.map_statement_list(node.content)
        pre_func_decl, in_func_decl = self.get_declarations()
        body = in_func_decl + [cgen.Line()] + body

        if isinstance(body[-1], cgen.Statement) and body[-1].text == "return":
            body.pop()

        def get_arg_decl(arg_idx, arg_name):
            decl = self.get_declarator(arg_name)

            if self.arg_needs_pointer(node.name, arg_idx):
                hint = self.addr_space_hints.get((node.name, arg_name))
                if hint:
                    decl = hint(cgen.Pointer(decl))
                else:
                    if self.use_restrict_pointers:
                        decl = cgen.RestrictPointer(decl)
                    else:
                        decl = cgen.Pointer(decl)

            return decl

        result = cgen.FunctionBody(
                cgen.FunctionDeclaration(
                    cgen.Value("void", node.name),
                    [get_arg_decl(i, arg) for i, arg in enumerate(node.args)]
                    ),
                cgen.Block(body))

        self.scope_stack.pop()

        if pre_func_decl:
            return pre_func_decl + [cgen.Line(), result]
        else:
            return result

    def map_EndSubroutine(self, node):
        return []

    def map_Implicit(self, node):
        scope = self.scope_stack[-1]

        if not node.items:
            assert not scope.implicit_types
            scope.implicit_types = None

        for stmt, specs in node.items:
            tp = self.dtype_from_stmt(stmt)
            for start, end in specs:
                for char_code in range(ord(start), ord(end)+1):
                    scope.implicit_types[chr(char_code)] = tp

        return []

    # {{{ types, declarations

    def map_Equivalence(self, node):
        raise NotImplementedError("equivalence")

    TYPE_MAP = {
            ("real", "4"): np.float32,
            ("real", "8"): np.float64,
            ("real", "16"): np.float128,
            ("complex", "8"): np.complex64,
            ("complex", "16"): np.complex128,
            ("complex", "32"): np.complex256,
            ("integer", ""): np.int32,
            ("integer", "4"): np.int32,
            ("integer", "8"): np.int64,
            }

    def dtype_from_stmt(self, stmt):
        length, kind = stmt.selector
        assert not kind
        return np.dtype(self.TYPE_MAP[(type(stmt).__name__.lower(), length)])

    def map_type_decl(self, node):
        scope = self.scope_stack[-1]

        tp = self.dtype_from_stmt(node)

        for name, shape in self.parse_dimension_specs(node.entity_decls):
            if shape is not None:
                assert name not in scope.dim_map
                scope.dim_map[name] = shape

            scope.use_name(name)

            assert name not in scope.type_map
            scope.type_map[name] = tp

        return []

    map_Logical = map_type_decl
    map_Integer = map_type_decl
    map_Real = map_type_decl
    map_Complex = map_type_decl

    def map_Dimension(self, node):
        scope = self.scope_stack[-1]

        for name, shape in self.parse_dimension_specs(node.items):
            if shape is not None:
                assert name not in scope.dim_map
                scope.dim_map[name] = shape

            scope.use_name(name)

        return []

    def map_External(self, node):
        raise NotImplementedError("external")

    # }}}

    def map_Data(self, node):
        scope = self.scope_stack[-1]

        for name, data in node.stmts:
            name, = name
            assert name not in scope.data
            scope.data[name] = [self.parse_expr(i) for i in data]

        return []

    def map_Parameter(self, node):
        raise NotImplementedError("parameter")

    # {{{ I/O

    def map_Open(self, node):
        raise NotImplementedError

    def map_Format(self, node):
        warn("'format' unsupported", TranslatorWarning)

    def map_Write(self, node):
        warn("'write' unsupported", TranslatorWarning)

    def map_Print(self, node):
        warn("'print' unsupported", TranslatorWarning)

    def map_Read1(self, node):
        warn("'read' unsupported", TranslatorWarning)

    # }}}

    def map_Assignment(self, node):
        lhs = self.parse_expr(node.variable)

        from pymbolic.primitives import Subscript
        if isinstance(lhs, Subscript):
            lhs_name = lhs.aggregate.name
        else:
            lhs_name = lhs.name

        scope = self.scope_stack[-1]
        scope.use_name(lhs_name)
        infer_type = scope.get_type_inference_mapper()

        rhs = self.parse_expr(node.expr)
        lhs_dtype = infer_type(lhs)
        rhs_dtype = infer_type(rhs)

        # check for silent truncation of complex
        if lhs_dtype.kind != 'c' and rhs_dtype.kind == 'c':
            from pymbolic import var
            rhs = var("real")(rhs)

        # check for silent widening of real
        if lhs_dtype.kind == 'c' and rhs_dtype.kind != 'c':
            from pymbolic import var
            rhs = var("fromreal")(rhs)

        return cgen.Assign(self.gen_expr(lhs), self.gen_expr(rhs))

    def map_Allocate(self, node):
        raise NotImplementedError("allocate")

    def map_Deallocate(self, node):
        raise NotImplementedError("deallocate")

    def map_Save(self, node):
        raise NotImplementedError("save")

    def map_Line(self, node):
        #from warnings import warn
        #warn("Encountered a 'line': %s" % node)
        raise NotImplementedError

    def map_Program(self, node):
        raise NotImplementedError

    def map_Entry(self, node):
        raise NotImplementedError

    # {{{ control flow

    def map_Goto(self, node):
        return cgen.Statement("goto label_%s" % node.label)

    def map_Call(self, node):
        def transform_arg(i, arg_str):
            expr = self.parse_expr(arg_str)
            result = self.gen_expr(expr)

            if self.arg_needs_pointer(node.designator, i):
                result = "&"+result

            cast = self.force_casts.get(
                    (node.designator, i))
            if cast is not None:
                result = "(%s) (%s)" % (cast, result)

            return result

        return cgen.Statement("%s(%s)" % (
            node.designator,
            ", ".join(transform_arg(i, arg_str)
                for i, arg_str in enumerate(node.items))))

    def map_Return(self, node):
        return cgen.Statement("return")

    def map_ArithmeticIf(self, node):
        raise NotImplementedError

    def map_If(self, node):
        return cgen.If(self.transform_expr(node.expr),
                self.rec(node.content[0]))

    def map_IfThen(self, node):
        current_cond = self.transform_expr(node.expr)

        blocks_and_conds = []
        else_block = []

        def end_block():
            if current_body:
                if current_cond is None:
                    else_block[:] = self.map_statement_list(current_body)
                else:
                    blocks_and_conds.append(
                            (current_cond, cgen.block_if_necessary(
                                self.map_statement_list(current_body))))

            del current_body[:]

        from fparser.statements import Else, ElseIf

        i = 0
        current_body = []
        while i < len(node.content):
            c = node.content[i]
            if isinstance(c, ElseIf):
                end_block()
                current_cond = self.transform_expr(c.expr)
            elif isinstance(c, Else):
                end_block()
                current_cond = None
            else:
                current_body.append(c)

            i += 1
        end_block()

        def block_or_none(body):
            if not body:
                return None
            else:
                return cgen.block_if_necessary(body)

        return cgen.make_multiple_ifs(
                blocks_and_conds,
                block_or_none(else_block))

    def map_EndIfThen(self, node):
        return []

    def map_Do(self, node):
        scope = self.scope_stack[-1]

        body = self.map_statement_list(node.content)

        if node.loopcontrol:
            loop_var, loop_bounds = node.loopcontrol.split("=")
            loop_var = loop_var.strip()
            scope.use_name(loop_var)
            loop_bounds = [self.parse_expr(s) for s in loop_bounds.split(",")]

            if len(loop_bounds) == 2:
                start, stop = loop_bounds
                step = 1
            elif len(loop_bounds) == 3:
                start, stop, step = loop_bounds
            else:
                raise RuntimeError("loop bounds not understood: %s"
                        % node.loopcontrol)

            if not isinstance(step, int):
                print(type(step))
                raise TranslationError("non-constant steps not yet supported: %s"
                        % step)

            if step < 0:
                comp_op = ">="
            else:
                comp_op = "<="

            return cgen.For(
                    "%s = %s" % (loop_var, self.gen_expr(start)),
                    "%s %s %s" % (loop_var, comp_op, self.gen_expr(stop)),
                    "%s += %s" % (loop_var, self.gen_expr(step)),
                    cgen.block_if_necessary(body))

        else:
            raise NotImplementedError("unbounded do loop")

    def map_EndDo(self, node):
        return []

    def map_Continue(self, node):
        return cgen.Statement("label_%s:" % node.label)

    def map_Stop(self, node):
        raise NotImplementedError("stop")

    def map_Comment(self, node):
        if node.content:
            return cgen.LineComment(node.content.strip())
        else:
            return []

    # 
}}} # }}} # {{{ expressions def gen_expr(self, expr): scope = self.scope_stack[-1] return CCodeMapper(self, scope)(expr) def transform_expr(self, expr_str): return self.gen_expr(self.expr_parser(expr_str)) # }}} # }}} def f2cl(source, free_form=False, strict=True, addr_space_hints={}, force_casts={}, do_arg_analysis=True, use_restrict_pointers=False, try_compile=False): from fparser import api tree = api.parse(source, isfree=free_form, isstrict=strict, analyze=False, ignore_comments=False) arg_info = ArgumentAnalayzer() if do_arg_analysis: arg_info(tree) source = F2CLTranslator(addr_space_hints, force_casts, arg_info, use_restrict_pointers=use_restrict_pointers)(tree) func_decls = [] for entry in source: if isinstance(entry, cgen.FunctionBody): func_decls.append(entry.fdecl) mod = cgen.Module(func_decls + [cgen.Line()] + source) #open("pre-cnd.cl", "w").write(str(mod)) from cnd import transform_cl str_mod = transform_cl(str(mod)) if try_compile: import pyopencl as cl ctx = cl.create_some_context() cl.Program(ctx, """ #pragma OPENCL EXTENSION cl_khr_fp64: enable #include """).build() return str_mod def f2cl_files(source_file, target_file, **kwargs): mod = f2cl(open(source_file).read(), **kwargs) open(target_file, "w").write(mod) if __name__ == "__main__": from cgen.opencl import CLConstant if 0: f2cl_files("hank107.f", "hank107.cl", addr_space_hints={ ("hank107p", "p"): CLConstant, ("hank107pc", "p"): CLConstant, }, force_casts={ ("hank107p", 0): "__constant cdouble_t *", }) f2cl_files("cdjseval2d.f", "cdjseval2d.cl") f2cl_files("hank103.f", "hank103.cl", addr_space_hints={ ("hank103p", "p"): CLConstant, ("hank103pc", "p"): CLConstant, }, force_casts={ ("hank103p", 0): "__constant cdouble_t *", }, try_compile=True) # vim: foldmethod=marker pyopencl-2013.2/contrib/fortran-to-opencl/README0000644000175000000500000000157112245716340020043 0ustar tomussrcExperimental Fortran-to-OpenCL translator ----------------------------------------- This is a highly experimental Fortran-to-OpenCL translator. Its purpose is to translate computational kernels into OpenCL-like C. It doesn't auto-parallelize. My purpose in writing this was to convert a few special-function evaluators. The best it can hope for at the moment is to automate most of the process so that you'll only have to fix up a few things manually afterwards. It further only deals with the subset of Fortran 77 that I needed. Quite a number of things are unimplemented. Patches are welcome. Andreas Kloeckner Dependencies: - cnd http://github.com/inducer/cnd - cgen http://github.com/inducer/cgen - pymbolic http://github.com/inducer/pymbolic - fparser http://code.google.com/p/f2py with fix from http://code.google.com/p/f2py/issues/detail?id=32 pyopencl-2013.2/configure.py0000755000175000000500000000013312245716340014501 0ustar tomussrc#! 
/usr/bin/env python from aksetup_helper import configure_frontend configure_frontend() pyopencl-2013.2/setup.py0000644000175000000500000002153212245716340013663 0ustar tomussrc#!/usr/bin/env python # -*- coding: latin-1 -*- def get_config_schema(): from aksetup_helper import ConfigSchema, Option, \ IncludeDir, LibraryDir, Libraries, BoostLibraries, \ Switch, StringListOption, make_boost_base_options import sys if 'darwin' in sys.platform: import platform osx_ver, _, _ = platform.mac_ver() osx_ver = '.'.join(osx_ver.split('.')[:2]) sysroot_paths = [ "/Applications/Xcode.app/Contents/Developer/Platforms/" "MacOSX.platform/Developer/SDKs/MacOSX%s.sdk" % osx_ver, "/Developer/SDKs/MacOSX%s.sdk" % osx_ver ] default_libs = [] default_cxxflags = ['-arch', 'i386', '-arch', 'x86_64'] from os.path import isdir for srp in sysroot_paths: if isdir(srp): default_cxxflags.extend(['-isysroot', srp]) break default_ldflags = default_cxxflags[:] + ["-Wl,-framework,OpenCL"] else: default_libs = ["OpenCL"] default_cxxflags = [] default_ldflags = [] return ConfigSchema(make_boost_base_options() + [ BoostLibraries("python"), Switch("USE_SHIPPED_BOOST", True, "Use included Boost library"), Switch("CL_TRACE", False, "Enable OpenCL API tracing"), Switch("CL_ENABLE_GL", False, "Enable OpenCL<->OpenGL interoperability"), Switch("CL_ENABLE_DEVICE_FISSION", True, "Enable device fission extension, if present"), Option("CL_PRETEND_VERSION", None, "Dotted CL version (e.g. 1.2) which you'd like to use."), IncludeDir("CL", []), LibraryDir("CL", []), Libraries("CL", default_libs), StringListOption("CXXFLAGS", default_cxxflags, help="Any extra C++ compiler options to include"), StringListOption("LDFLAGS", default_ldflags, help="Any extra linker options to include"), ]) def main(): from aksetup_helper import (hack_distutils, get_config, setup, NumpyExtension, set_up_shipped_boost_if_requested, check_git_submodules) check_git_submodules() hack_distutils() conf = get_config(get_config_schema(), warn_about_no_config=False) EXTRA_OBJECTS, EXTRA_DEFINES = \ set_up_shipped_boost_if_requested("pyopencl", conf) LIBRARY_DIRS = conf["BOOST_LIB_DIR"] LIBRARIES = conf["BOOST_PYTHON_LIBNAME"] EXTRA_INCLUDE_DIRS = [] EXTRA_DEFINES["PYGPU_PACKAGE"] = "pyopencl" EXTRA_DEFINES["PYGPU_PYOPENCL"] = "1" if conf["CL_TRACE"]: EXTRA_DEFINES["PYOPENCL_TRACE"] = 1 INCLUDE_DIRS = conf["BOOST_INC_DIR"] + conf["CL_INC_DIR"] if conf["CL_ENABLE_GL"]: EXTRA_DEFINES["HAVE_GL"] = 1 if conf["CL_ENABLE_DEVICE_FISSION"]: EXTRA_DEFINES["PYOPENCL_USE_DEVICE_FISSION"] = 1 if conf["CL_PRETEND_VERSION"]: try: major, minor = [int(x) for x in conf["CL_PRETEND_VERSION"].split(".")] EXTRA_DEFINES["PYOPENCL_PRETEND_CL_VERSION"] = \ 0x1000*major + 0x10 * minor except: print("CL_PRETEND_VERSION must be of the form M.N, " "with two integers M and N") raise ver_dic = {} version_file = open("pyopencl/version.py") try: version_file_contents = version_file.read() finally: version_file.close() exec(compile(version_file_contents, "pyopencl/version.py", 'exec'), ver_dic) SEPARATOR = "-"*75 try: from distutils.command.build_py import build_py_2to3 as build_py except ImportError: # 2.x from distutils.command.build_py import build_py try: import mako # noqa except ImportError: print(SEPARATOR) print("Mako is not installed.") print(SEPARATOR) print("That is not a problem, as most of PyOpenCL will be just fine ") print("without it.Some higher-level parts of pyopencl (such as ") print("pyopencl.reduction) will not function without the templating engine ") print("Mako [1] being installed. 
If you would like this functionality to ") print("work, you might want to install Mako after you finish ") print("installing PyOpenCL.") print("") print("[1] http://www.makotemplates.org/") print(SEPARATOR) print("Hit Ctrl-C now if you'd like to think about the situation.") print(SEPARATOR) from aksetup_helper import count_down_delay count_down_delay(delay=5) might_be_cuda = False for inc_dir in conf["CL_INC_DIR"]: inc_dir = inc_dir.lower() if "nv" in inc_dir or "cuda" in inc_dir: might_be_cuda = True if might_be_cuda and conf["CL_ENABLE_DEVICE_FISSION"]: print(SEPARATOR) print("You might be compiling against Nvidia CUDA with device " "fission enabled.") print(SEPARATOR) print("That is not a problem on CUDA 4.0 and newer. If you are " "using CUDA 3.2,") print("your build will break, because Nvidia shipped a broken CL header in") print("in your version. The fix is to set CL_ENABLE_DEVICE_FISSION to False") print("in your PyOpenCL configuration.") print(SEPARATOR) print("Hit Ctrl-C now if you'd like to think about the situation.") print(SEPARATOR) from aksetup_helper import count_down_delay count_down_delay(delay=5) import sys if sys.version_info >= (3,): pvt_struct_source = "src/wrapper/_pvt_struct_v3.cpp" else: pvt_struct_source = "src/wrapper/_pvt_struct_v2.cpp" setup(name="pyopencl", # metadata version=ver_dic["VERSION_TEXT"], description="Python wrapper for OpenCL", long_description=open("README.rst", "rt").read(), author="Andreas Kloeckner", author_email="inform@tiker.net", license="MIT", url="http://mathema.tician.de/software/pyopencl", classifiers=[ 'Environment :: Console', 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Developers', 'Intended Audience :: Other Audience', 'Intended Audience :: Science/Research', 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Programming Language :: C++', 'Programming Language :: Python', 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.4', 'Programming Language :: Python :: 2.5', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.2', 'Programming Language :: Python :: 3.3', 'Topic :: Scientific/Engineering', 'Topic :: Scientific/Engineering :: Mathematics', 'Topic :: Scientific/Engineering :: Physics', ], # build info packages=["pyopencl", "pyopencl.characterize", "pyopencl.compyte"], install_requires=[ "pytools>=2013.5.2", "pytest>=2", "decorator>=3.2.0", # "Mako>=0.3.6", ], ext_package="pyopencl", ext_modules=[ NumpyExtension("_cl", [ "src/wrapper/wrap_cl.cpp", "src/wrapper/wrap_cl_part_1.cpp", "src/wrapper/wrap_cl_part_2.cpp", "src/wrapper/wrap_constants.cpp", "src/wrapper/wrap_mempool.cpp", "src/wrapper/bitlog.cpp", ]+EXTRA_OBJECTS, include_dirs=INCLUDE_DIRS + EXTRA_INCLUDE_DIRS, library_dirs=LIBRARY_DIRS + conf["CL_LIB_DIR"], libraries=LIBRARIES + conf["CL_LIBNAME"], define_macros=list(EXTRA_DEFINES.items()), extra_compile_args=conf["CXXFLAGS"], extra_link_args=conf["LDFLAGS"], ), NumpyExtension("_pvt_struct", [pvt_struct_source], extra_compile_args=conf["CXXFLAGS"], extra_link_args=conf["LDFLAGS"], ), ], include_package_data=True, package_data={ "pyopencl": [ "cl/*.cl", "cl/*.h", ] }, # 2to3 invocation cmdclass={'build_py': build_py}, zip_safe=False) if __name__ == '__main__': main()
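The option names declared in get_config_schema() above are exactly the names that main() later reads back via get_config(). For reference, a hand-written configuration file along those lines (conventionally siteconf.py, which configure.py generates) might look like the sketch below. All paths and values are illustrative placeholders, not defaults shipped with PyOpenCL; adjust them to the local OpenCL installation.

# siteconf.py -- illustrative sketch only; paths and values are placeholders
USE_SHIPPED_BOOST = True
CL_TRACE = False
CL_ENABLE_GL = False
CL_ENABLE_DEVICE_FISSION = True
#CL_PRETEND_VERSION = "1.1"    # dotted CL version to report, per the schema help
CL_INC_DIR = ["/usr/local/cuda/include"]   # placeholder: directory with CL/cl.h
CL_LIB_DIR = ["/usr/local/cuda/lib64"]     # placeholder: directory with libOpenCL
CL_LIBNAME = ["OpenCL"]
CXXFLAGS = []
LDFLAGS = []

Any option left out of such a file keeps the default declared in the schema (Switch, Option, IncludeDir, LibraryDir, Libraries, StringListOption entries above).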