pycuda-2013.1.1+git20140310/0002755000175000000500000000000012313360366013333 5ustar tomussrcpycuda-2013.1.1+git20140310/aksetup_helper.py0000644000175000000500000006274312313360364016730 0ustar tomussrcimport setuptools # noqa from setuptools import Extension def count_down_delay(delay): from time import sleep import sys while delay: sys.stdout.write("Continuing in %d seconds... \r" % delay) sys.stdout.flush() delay -= 1 sleep(1) print("") DASH_SEPARATOR = 75 * "-" def setup(*args, **kwargs): from setuptools import setup try: setup(*args, **kwargs) except KeyboardInterrupt: raise except SystemExit: raise except: print(DASH_SEPARATOR) print("Sorry, your build failed. Try rerunning configure.py with " "different options.") print(DASH_SEPARATOR) raise class NumpyExtension(Extension): # nicked from # http://mail.python.org/pipermail/distutils-sig/2007-September/008253.html # solution by Michael Hoffmann def __init__(self, *args, **kwargs): Extension.__init__(self, *args, **kwargs) self._include_dirs = self.include_dirs del self.include_dirs # restore overwritten property def get_numpy_incpath(self): from imp import find_module # avoid actually importing numpy, it screws up distutils file, pathname, descr = find_module("numpy") from os.path import join return join(pathname, "core", "include") def get_include_dirs(self): return self._include_dirs + [self.get_numpy_incpath()] def set_include_dirs(self, value): self._include_dirs = value def del_include_dirs(self): pass include_dirs = property(get_include_dirs, set_include_dirs, del_include_dirs) class PyUblasExtension(NumpyExtension): def get_module_include_path(self, name): from pkg_resources import Requirement, resource_filename return resource_filename(Requirement.parse(name), "%s/include" % name) @property def include_dirs(self): return self._include_dirs + [ self.get_numpy_incpath(), self.get_module_include_path("pyublas"), ] class HedgeExtension(PyUblasExtension): @property def include_dirs(self): return self._include_dirs + [ self.get_numpy_incpath(), self.get_module_include_path("pyublas"), self.get_module_include_path("hedge"), ] # {{{ tools def flatten(list): """For an iterable of sub-iterables, generate each member of each sub-iterable in turn, i.e. a flattened version of that super-iterable. Example: Turn [[a,b,c],[d,e,f]] into [a,b,c,d,e,f]. """ for sublist in list: for j in sublist: yield j def humanize(sym_str): words = sym_str.lower().replace("_", " ").split(" ") return " ".join([word.capitalize() for word in words]) # }}} # {{{ siteconf handling def get_config(schema=None, warn_about_no_config=True): if schema is None: from setup import get_config_schema schema = get_config_schema() if (not schema.have_config() and not schema.have_global_config() and warn_about_no_config): print("*************************************************************") print("*** I have detected that you have not run configure.py.") print("*************************************************************") print("*** Additionally, no global config files were found.") print("*** I will go ahead with the default configuration.") print("*** In all likelihood, this will not work out.") print("*** ") print("*** See README_SETUP.txt for more information.") print("*** ") print("*** If the build does fail, just re-run configure.py with the") print("*** correct arguments, and then retry. 
Good luck!") print("*************************************************************") print("*** HIT Ctrl-C NOW IF THIS IS NOT WHAT YOU WANT") print("*************************************************************") count_down_delay(delay=10) return expand_options(schema.read_config()) def hack_distutils(debug=False, fast_link=True, what_opt=3): # hack distutils.sysconfig to eliminate debug flags # stolen from mpi4py def remove_prefixes(optlist, bad_prefixes): for bad_prefix in bad_prefixes: for i, flag in enumerate(optlist): if flag.startswith(bad_prefix): optlist.pop(i) break return optlist import sys if not sys.platform.lower().startswith("win"): from distutils import sysconfig cvars = sysconfig.get_config_vars() cflags = cvars.get('OPT') if cflags: cflags = remove_prefixes(cflags.split(), ['-g', '-O', '-Wstrict-prototypes', '-DNDEBUG']) if debug: cflags.append("-g") else: if what_opt is None: pass else: cflags.append("-O%s" % what_opt) cflags.append("-DNDEBUG") cvars['OPT'] = str.join(' ', cflags) cvars["CFLAGS"] = cvars["BASECFLAGS"] + " " + cvars["OPT"] if fast_link: for varname in ["LDSHARED", "BLDSHARED"]: ldsharedflags = cvars.get(varname) if ldsharedflags: ldsharedflags = remove_prefixes(ldsharedflags.split(), ['-Wl,-O']) cvars[varname] = str.join(' ', ldsharedflags) # }}} # {{{ configure guts def default_or(a, b): if a is None: return b else: return a def expand_str(s, options): import re def my_repl(match): sym = match.group(1) try: repl = options[sym] except KeyError: from os import environ repl = environ[sym] return expand_str(repl, options) return re.subn(r"\$\{([a-zA-Z0-9_]+)\}", my_repl, s)[0] def expand_value(v, options): if isinstance(v, str): return expand_str(v, options) elif isinstance(v, list): result = [] for i in v: try: exp_i = expand_value(i, options) except: pass else: result.append(exp_i) return result else: return v def expand_options(options): return dict( (k, expand_value(v, options)) for k, v in options.items()) class ConfigSchema: def __init__(self, options, conf_file="siteconf.py", conf_dir="."): self.optdict = dict((opt.name, opt) for opt in options) self.options = options self.conf_dir = conf_dir self.conf_file = conf_file from os.path import expanduser self.user_conf_file = expanduser("~/.aksetup-defaults.py") import sys if not sys.platform.lower().startswith("win"): self.global_conf_file = "/etc/aksetup-defaults.py" else: self.global_conf_file = None def get_conf_file(self): import os return os.path.join(self.conf_dir, self.conf_file) def set_conf_dir(self, conf_dir): self.conf_dir = conf_dir def get_default_config(self): return dict((opt.name, opt.default) for opt in self.options) def read_config_from_pyfile(self, filename): result = {} filevars = {} infile = open(filename, "r") try: contents = infile.read() finally: infile.close() exec(compile(contents, filename, "exec"), filevars) for key, value in filevars.items(): if key in self.optdict: result[key] = value return result def update_conf_file(self, filename, config): result = {} filevars = {} try: exec(compile(open(filename, "r").read(), filename, "exec"), filevars) except IOError: pass if "__builtins__" in filevars: del filevars["__builtins__"] for key, value in config.items(): if value is not None: filevars[key] = value keys = filevars.keys() keys.sort() outf = open(filename, "w") for key in keys: outf.write("%s = %s\n" % (key, repr(filevars[key]))) outf.close() return result def update_user_config(self, config): self.update_conf_file(self.user_conf_file, config) def update_global_config(self, 
config): if self.global_conf_file is not None: self.update_conf_file(self.global_conf_file, config) def get_default_config_with_files(self): result = self.get_default_config() import os confignames = [] if self.global_conf_file is not None: confignames.append(self.global_conf_file) confignames.append(self.user_conf_file) for fn in confignames: if os.access(fn, os.R_OK): result.update(self.read_config_from_pyfile(fn)) return result def have_global_config(self): import os result = os.access(self.user_conf_file, os.R_OK) if self.global_conf_file is not None: result = result or os.access(self.global_conf_file, os.R_OK) return result def have_config(self): import os return os.access(self.get_conf_file(), os.R_OK) def read_config(self, warn_if_none=True): import os cfile = self.get_conf_file() result = self.get_default_config_with_files() if os.access(cfile, os.R_OK): filevars = {} exec(compile(open(cfile, "r").read(), cfile, "exec"), filevars) for key, value in filevars.items(): if key in self.optdict: result[key] = value elif key == "__builtins__": pass else: raise KeyError("invalid config key in %s: %s" % ( cfile, key)) return result def add_to_configparser(self, parser, def_config=None): if def_config is None: def_config = self.get_default_config_with_files() for opt in self.options: default = default_or(def_config.get(opt.name), opt.default) opt.add_to_configparser(parser, default) def get_from_configparser(self, options): result = {} for opt in self.options: result[opt.name] = opt.take_from_configparser(options) return result def write_config(self, config): outf = open(self.get_conf_file(), "w") for opt in self.options: value = config[opt.name] if value is not None: outf.write("%s = %s\n" % (opt.name, repr(config[opt.name]))) outf.close() def make_substitutions(self, config): return dict((opt.name, opt.value_to_str(config[opt.name])) for opt in self.options) class Option(object): def __init__(self, name, default=None, help=None): self.name = name self.default = default self.help = help def as_option(self): return self.name.lower().replace("_", "-") def metavar(self): last_underscore = self.name.rfind("_") return self.name[last_underscore+1:] def get_help(self, default): result = self.help if self.default: result += " (default: %s)" % self.value_to_str( default_or(default, self.default)) return result def value_to_str(self, default): return default def add_to_configparser(self, parser, default=None): default = default_or(default, self.default) default_str = self.value_to_str(default) parser.add_option( "--" + self.as_option(), dest=self.name, default=default_str, metavar=self.metavar(), help=self.get_help(default)) def take_from_configparser(self, options): return getattr(options, self.name) class Switch(Option): def add_to_configparser(self, parser, default=None): if not isinstance(self.default, bool): raise ValueError("Switch options must have a default") if default is None: default = self.default option_name = self.as_option() if default: option_name = "no-" + option_name action = "store_false" else: action = "store_true" parser.add_option( "--" + option_name, dest=self.name, help=self.get_help(default), default=default, action=action) class StringListOption(Option): def value_to_str(self, default): if default is None: return None return ",".join([str(el).replace(",", r"\,") for el in default]) def get_help(self, default): return Option.get_help(self, default) + " (several ok)" def take_from_configparser(self, options): opt = getattr(options, self.name) if opt is None: return None else: 
if opt: import re sep = re.compile(r"(? #include #include #include "bitlog.hpp" namespace PYGPU_PACKAGE { template inline T signed_left_shift(T x, signed shift_amount) { if (shift_amount < 0) return x >> -shift_amount; else return x << shift_amount; } template inline T signed_right_shift(T x, signed shift_amount) { if (shift_amount < 0) return x << -shift_amount; else return x >> shift_amount; } template class memory_pool { public: typedef typename Allocator::pointer_type pointer_type; typedef typename Allocator::size_type size_type; private: typedef boost::uint32_t bin_nr_t; typedef std::vector bin_t; typedef boost::ptr_map container_t; container_t m_container; typedef typename container_t::value_type bin_pair_t; std::auto_ptr m_allocator; // A held block is one that's been released by the application, but that // we are keeping around to dish out again. unsigned m_held_blocks; // An active block is one that is in use by the application. unsigned m_active_blocks; bool m_stop_holding; int m_trace; public: memory_pool(Allocator const &alloc=Allocator()) : m_allocator(alloc.copy()), m_held_blocks(0), m_active_blocks(0), m_stop_holding(false), m_trace(false) { if (m_allocator->is_deferred()) { PyErr_WarnEx(PyExc_UserWarning, "Memory pools expect non-deferred " "semantics from their allocators. You passed a deferred " "allocator, i.e. an allocator whose allocations can turn out to " "be unavailable long after allocation.", 1); } } virtual ~memory_pool() { free_held(); } static const unsigned mantissa_bits = 2; static const unsigned mantissa_mask = (1 << mantissa_bits) - 1; static bin_nr_t bin_number(size_type size) { signed l = bitlog2(size); size_type shifted = signed_right_shift(size, l-signed(mantissa_bits)); if (size && (shifted & (1 << mantissa_bits)) == 0) throw std::runtime_error("memory_pool::bin_number: bitlog2 fault"); size_type chopped = shifted & mantissa_mask; return l << mantissa_bits | chopped; } void set_trace(bool flag) { if (flag) ++m_trace; else --m_trace; } static size_type alloc_size(bin_nr_t bin) { bin_nr_t exponent = bin >> mantissa_bits; bin_nr_t mantissa = bin & mantissa_mask; size_type ones = signed_left_shift(1, signed(exponent)-signed(mantissa_bits) ); if (ones) ones -= 1; size_type head = signed_left_shift( (1<second; } void inc_held_blocks() { if (m_held_blocks == 0) start_holding_blocks(); ++m_held_blocks; } void dec_held_blocks() { --m_held_blocks; if (m_held_blocks == 0) stop_holding_blocks(); } virtual void start_holding_blocks() { } virtual void stop_holding_blocks() { } public: pointer_type allocate(size_type size) { bin_nr_t bin_nr = bin_number(size); bin_t &bin = get_bin(bin_nr); if (bin.size()) { if (m_trace) std::cout << "[pool] allocation of size " << size << " served from bin " << bin_nr << " which contained " << bin.size() << " entries" << std::endl; return pop_block_from_bin(bin, size); } size_type alloc_sz = alloc_size(bin_nr); assert(bin_number(alloc_sz) == bin_nr); if (m_trace) std::cout << "[pool] allocation of size " << size << " required new memory" << std::endl; try { return get_from_allocator(alloc_sz); } catch (PYGPU_PACKAGE::error &e) { if (!e.is_out_of_memory()) throw; } if (m_trace) std::cout << "[pool] allocation triggered OOM, running GC" << std::endl; m_allocator->try_release_blocks(); if (bin.size()) return pop_block_from_bin(bin, size); if (m_trace) std::cout << "[pool] allocation still OOM after GC" << std::endl; while (try_to_free_memory()) { try { return get_from_allocator(alloc_sz); } catch (PYGPU_PACKAGE::error &e) { if 
(!e.is_out_of_memory()) throw; } } throw PYGPU_PACKAGE::error( "memory_pool::allocate", #ifdef PYGPU_PYCUDA CUDA_ERROR_OUT_OF_MEMORY, #endif #ifdef PYGPU_PYOPENCL CL_MEM_OBJECT_ALLOCATION_FAILURE, #endif "failed to free memory for allocation"); } void free(pointer_type p, size_type size) { --m_active_blocks; bin_nr_t bin_nr = bin_number(size); if (!m_stop_holding) { inc_held_blocks(); get_bin(bin_nr).push_back(p); if (m_trace) std::cout << "[pool] block of size " << size << " returned to bin " << bin_nr << " which now contains " << get_bin(bin_nr).size() << " entries" << std::endl; } else m_allocator->free(p); } void free_held() { BOOST_FOREACH(bin_pair_t bin_pair, m_container) { bin_t &bin = *bin_pair.second; while (bin.size()) { m_allocator->free(bin.back()); bin.pop_back(); dec_held_blocks(); } } assert(m_held_blocks == 0); } void stop_holding() { m_stop_holding = true; free_held(); } unsigned active_blocks() { return m_active_blocks; } unsigned held_blocks() { return m_held_blocks; } bool try_to_free_memory() { BOOST_FOREACH(bin_pair_t bin_pair, // free largest stuff first std::make_pair(m_container.rbegin(), m_container.rend())) { bin_t &bin = *bin_pair.second; if (bin.size()) { m_allocator->free(bin.back()); bin.pop_back(); dec_held_blocks(); return true; } } return false; } private: pointer_type get_from_allocator(size_type alloc_sz) { pointer_type result = m_allocator->allocate(alloc_sz); ++m_active_blocks; return result; } pointer_type pop_block_from_bin(bin_t &bin, size_type size) { pointer_type result = bin.back(); bin.pop_back(); dec_held_blocks(); ++m_active_blocks; return result; } }; template class pooled_allocation : public boost::noncopyable { public: typedef Pool pool_type; typedef typename Pool::pointer_type pointer_type; typedef typename Pool::size_type size_type; private: boost::shared_ptr m_pool; pointer_type m_ptr; size_type m_size; bool m_valid; public: pooled_allocation(boost::shared_ptr p, size_type size) : m_pool(p), m_ptr(p->allocate(size)), m_size(size), m_valid(true) { } ~pooled_allocation() { if (m_valid) free(); } void free() { if (m_valid) { m_pool->free(m_ptr, m_size); m_valid = false; } else throw PYGPU_PACKAGE::error( "pooled_device_allocation::free", #ifdef PYGPU_PYCUDA CUDA_ERROR_INVALID_HANDLE #endif #ifdef PYGPU_PYOPENCL CL_INVALID_VALUE #endif ); } pointer_type ptr() const { return m_ptr; } size_type size() const { return m_size; } }; } #endif pycuda-2013.1.1+git20140310/src/cpp/cuda.cpp0000644000175000000500000000014212313360364016315 0ustar tomussrc#include "cuda.hpp" boost::thread_specific_ptr pycuda::context_stack_ptr; pycuda-2013.1.1+git20140310/src/cpp/cuda_gl.hpp0000644000175000000500000001740712313360364017020 0ustar tomussrc#ifndef _AFJDFJSDFSD_PYCUDA_HEADER_SEEN_CUDA_GL_HPP #define _AFJDFJSDFSD_PYCUDA_HEADER_SEEN_CUDA_GL_HPP #include #if defined(__APPLE__) || defined(MACOSX) #include #else /* __APPLE__ */ #include #endif #include namespace pycuda { namespace gl { // {{{ pre-3.0-style API inline void gl_init() { CUDAPP_CALL_GUARDED(cuGLInit, ()); PyErr_Warn( PyExc_DeprecationWarning, "gl_init() has been deprecated since CUDA 3.0 " "and PyCUDA 2011.1."); } inline boost::shared_ptr make_gl_context(device const &dev, unsigned int flags) { CUcontext ctx; CUDAPP_CALL_GUARDED(cuGLCtxCreate, (&ctx, flags, dev.handle())); boost::shared_ptr result(new context(ctx)); context_stack::get().push(result); return result; } class buffer_object : public context_dependent { private: GLuint m_handle; bool m_valid; public: buffer_object(GLuint handle) : 
m_handle(handle), m_valid(true) { CUDAPP_CALL_GUARDED(cuGLRegisterBufferObject, (handle)); PyErr_Warn( PyExc_DeprecationWarning, "buffer_object has been deprecated since CUDA 3.0 " "and PyCUDA 2011.1."); } ~buffer_object() { if (m_valid) unregister(); } GLuint handle() { return m_handle; } void unregister() { if (m_valid) { try { scoped_context_activation ca(get_context()); CUDAPP_CALL_GUARDED_CLEANUP(cuGLUnregisterBufferObject, (m_handle)); m_valid = false; } CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(buffer_object); } else throw pycuda::error("buffer_object::unregister", CUDA_ERROR_INVALID_HANDLE); } }; class buffer_object_mapping : public context_dependent { private: boost::shared_ptr m_buffer_object; CUdeviceptr m_devptr; size_t m_size; bool m_valid; public: buffer_object_mapping( boost::shared_ptr bobj, CUdeviceptr devptr, size_t size) : m_buffer_object(bobj), m_devptr(devptr), m_size(size), m_valid(true) { PyErr_Warn( PyExc_DeprecationWarning, "buffer_object_mapping has been deprecated since CUDA 3.0 " "and PyCUDA 2011.1."); } ~buffer_object_mapping() { if (m_valid) unmap(); } void unmap() { if (m_valid) { try { scoped_context_activation ca(get_context()); CUDAPP_CALL_GUARDED_CLEANUP(cuGLUnmapBufferObject, (m_buffer_object->handle())); m_valid = false; } CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(buffer_object_mapping) } else throw pycuda::error("buffer_object_mapping::unmap", CUDA_ERROR_INVALID_HANDLE); } CUdeviceptr device_ptr() const { return m_devptr; } size_t size() const { return m_size; } }; inline buffer_object_mapping *map_buffer_object( boost::shared_ptr bobj) { CUdeviceptr devptr; pycuda_size_t size; CUDAPP_CALL_GUARDED(cuGLMapBufferObject, (&devptr, &size, bobj->handle())); PyErr_Warn( PyExc_DeprecationWarning, "map_buffer_object has been deprecated since CUDA 3.0 " "and PyCUDA 2011.1."); return new buffer_object_mapping(bobj, devptr, size); } // }}} // {{{ new-style (3.0+) API #if CUDAPP_CUDA_VERSION >= 3000 class registered_object : public context_dependent { protected: GLuint m_gl_handle; bool m_valid; CUgraphicsResource m_resource; public: registered_object(GLuint gl_handle) : m_gl_handle(gl_handle), m_valid(true) { } ~registered_object() { if (m_valid) unregister(); } GLuint gl_handle() { return m_gl_handle; } CUgraphicsResource resource() { return m_resource; } void unregister() { if (m_valid) { try { scoped_context_activation ca(get_context()); CUDAPP_CALL_GUARDED_CLEANUP( cuGraphicsUnregisterResource, (m_resource)); m_valid = false; } CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(registered_object); } else throw pycuda::error("registered_object::unregister", CUDA_ERROR_INVALID_HANDLE); } }; class registered_buffer : public registered_object { public: registered_buffer(GLuint gl_handle, CUgraphicsMapResourceFlags flags=CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE) : registered_object(gl_handle) { CUDAPP_CALL_GUARDED(cuGraphicsGLRegisterBuffer, (&m_resource, gl_handle, flags)); } }; class registered_image : public registered_object { public: registered_image(GLuint gl_handle, GLenum target, CUgraphicsMapResourceFlags flags=CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE) : registered_object(gl_handle) { CUDAPP_CALL_GUARDED(cuGraphicsGLRegisterImage, (&m_resource, gl_handle, target, flags)); } }; class registered_mapping : public context_dependent { private: boost::shared_ptr m_object; boost::shared_ptr m_stream; bool m_valid; public: registered_mapping( boost::shared_ptr robj, boost::shared_ptr strm) : m_object(robj), m_stream(strm), m_valid(true) { } ~registered_mapping() { if (m_valid) unmap_no_strm(); } 
void unmap_no_strm() { unmap(m_stream); } void unmap(boost::shared_ptr const &strm) { CUstream s_handle; if (!strm.get()) s_handle = 0; else s_handle = strm->handle(); if (m_valid) { try { scoped_context_activation ca(get_context()); CUgraphicsResource res = m_object->resource(); CUDAPP_CALL_GUARDED_CLEANUP(cuGraphicsUnmapResources, (1, &res, s_handle)); m_valid = false; } CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(registered_mapping) } else throw pycuda::error("registered_mapping::unmap", CUDA_ERROR_INVALID_HANDLE); } py::tuple device_ptr_and_size() const { CUdeviceptr devptr; pycuda_size_t size; CUDAPP_CALL_GUARDED(cuGraphicsResourceGetMappedPointer, (&devptr, &size, m_object->resource())); return py::make_tuple(devptr, size); } inline pycuda::array *array(unsigned int index, unsigned int level) const { CUarray devptr; CUDAPP_CALL_GUARDED(cuGraphicsSubResourceGetMappedArray, (&devptr, m_object->resource(), index, level)); std::auto_ptr result( new pycuda::array(devptr, false)); return result.release(); } }; inline registered_mapping *map_registered_object( boost::shared_ptr const &robj, py::object strm_py) { CUstream s_handle; boost::shared_ptr strm_sptr; if (strm_py.ptr() == Py_None) { s_handle = 0; } else { strm_sptr = py::extract >(strm_py); s_handle = strm_sptr->handle(); } CUgraphicsResource res = robj->resource(); CUDAPP_CALL_GUARDED(cuGraphicsMapResources, (1, &res, s_handle)); return new registered_mapping(robj, strm_sptr); } #endif // }}} } } #endif // vim: foldmethod=marker pycuda-2013.1.1+git20140310/src/cpp/bitlog.hpp0000644000175000000500000000146112313360364016673 0ustar tomussrc// Base-2 logarithm bithack. #ifndef _AFJDFJSDFSD_PYCUDA_HEADER_SEEN_BITLOG_HPP #define _AFJDFJSDFSD_PYCUDA_HEADER_SEEN_BITLOG_HPP #include #include namespace pycuda { extern const char log_table_8[]; inline unsigned bitlog2_16(boost::uint16_t v) { if (unsigned long t = v >> 8) return 8+log_table_8[t]; else return log_table_8[v]; } inline unsigned bitlog2_32(boost::uint32_t v) { if (boost::uint16_t t = v >> 16) return 16+bitlog2_16(t); else return bitlog2_16(boost::uint16_t(v)); } inline unsigned bitlog2(size_t v) { #if (ULONG_MAX != 4294967295) || defined(_WIN64) if (boost::uint32_t t = v >> 32) return 32+bitlog2_32(t); else #endif return bitlog2_32(unsigned(v)); } } #endif pycuda-2013.1.1+git20140310/src/cpp/cuda.hpp0000644000175000000500000015756212313360364016345 0ustar tomussrc// A C++ wrapper for CUDA #ifndef _AFJDFJSDFSD_PYCUDA_HEADER_SEEN_CUDA_HPP #define _AFJDFJSDFSD_PYCUDA_HEADER_SEEN_CUDA_HPP #include #ifdef CUDAPP_PRETEND_CUDA_VERSION #define CUDAPP_CUDA_VERSION CUDAPP_PRETEND_CUDA_VERSION #else #define CUDAPP_CUDA_VERSION CUDA_VERSION #endif #if CUDAPP_CUDA_VERSION >= 4000 #include #endif #ifndef _MSC_VER #include #endif #include #include #include #include #include #include #include #include #include #include #include #if (BOOST_VERSION/100) < 1035 #warning ***************************************************************** #warning **** Your version of Boost C++ is likely too old for PyCUDA. **** #warning ***************************************************************** #endif // MAYBE? 
cuMemcpy, cuPointerGetAttribute // TODO: cuCtxSetCurrent, cuCtxGetCurrent // (use once the old, deprecated functions have been removed from CUDA) // #define CUDAPP_TRACE_CUDA #define CUDAPP_POST_30_BETA #ifdef CUDAPP_PRETEND_CUDA_VERSION #define CUDAPP_CUDA_VERSION CUDAPP_PRETEND_CUDA_VERSION #else #define CUDAPP_CUDA_VERSION CUDA_VERSION #endif #if (PY_VERSION_HEX >= 0x03000000) && (PY_VERSION_HEX < 0x03030000) #error PyCUDA does not support Python 3 versions earlier than 3.3. #endif #if PY_VERSION_HEX >= 0x02050000 typedef Py_ssize_t PYCUDA_BUFFER_SIZE_T; #else typedef int PYCUDA_BUFFER_SIZE_T; #endif #define PYCUDA_PARSE_STREAM_PY \ CUstream s_handle; \ if (stream_py.ptr() != Py_None) \ { \ const stream &s = py::extract(stream_py); \ s_handle = s.handle(); \ } \ else \ s_handle = 0; // {{{ tracing and error guards #ifdef CUDAPP_TRACE_CUDA #define CUDAPP_PRINT_CALL_TRACE(NAME) \ std::cerr << NAME << std::endl; #define CUDAPP_PRINT_CALL_TRACE_INFO(NAME, EXTRA_INFO) \ std::cerr << NAME << " (" << EXTRA_INFO << ')' << std::endl; #define CUDAPP_PRINT_ERROR_TRACE(NAME, CODE) \ if (CODE != CUDA_SUCCESS) \ std::cerr << NAME << " failed with code " << CODE << std::endl; #else #define CUDAPP_PRINT_CALL_TRACE(NAME) /*nothing*/ #define CUDAPP_PRINT_CALL_TRACE_INFO(NAME, EXTRA_INFO) /*nothing*/ #define CUDAPP_PRINT_ERROR_TRACE(NAME, CODE) /*nothing*/ #endif #define CUDAPP_CALL_GUARDED_THREADED_WITH_TRACE_INFO(NAME, ARGLIST, TRACE_INFO) \ { \ CUDAPP_PRINT_CALL_TRACE_INFO(#NAME, TRACE_INFO); \ CUresult cu_status_code; \ Py_BEGIN_ALLOW_THREADS \ cu_status_code = NAME ARGLIST; \ Py_END_ALLOW_THREADS \ if (cu_status_code != CUDA_SUCCESS) \ throw pycuda::error(#NAME, cu_status_code);\ } #define CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(NAME, ARGLIST, TRACE_INFO) \ { \ CUDAPP_PRINT_CALL_TRACE_INFO(#NAME, TRACE_INFO); \ CUresult cu_status_code; \ cu_status_code = NAME ARGLIST; \ CUDAPP_PRINT_ERROR_TRACE(#NAME, cu_status_code); \ if (cu_status_code != CUDA_SUCCESS) \ throw pycuda::error(#NAME, cu_status_code);\ } #define CUDAPP_CALL_GUARDED_THREADED(NAME, ARGLIST) \ { \ CUDAPP_PRINT_CALL_TRACE(#NAME); \ CUresult cu_status_code; \ Py_BEGIN_ALLOW_THREADS \ cu_status_code = NAME ARGLIST; \ Py_END_ALLOW_THREADS \ CUDAPP_PRINT_ERROR_TRACE(#NAME, cu_status_code); \ if (cu_status_code != CUDA_SUCCESS) \ throw pycuda::error(#NAME, cu_status_code);\ } #define CUDAPP_CALL_GUARDED(NAME, ARGLIST) \ { \ CUDAPP_PRINT_CALL_TRACE(#NAME); \ CUresult cu_status_code; \ cu_status_code = NAME ARGLIST; \ CUDAPP_PRINT_ERROR_TRACE(#NAME, cu_status_code); \ if (cu_status_code != CUDA_SUCCESS) \ throw pycuda::error(#NAME, cu_status_code);\ } #define CUDAPP_CALL_GUARDED_CLEANUP(NAME, ARGLIST) \ { \ CUDAPP_PRINT_CALL_TRACE(#NAME); \ CUresult cu_status_code; \ cu_status_code = NAME ARGLIST; \ CUDAPP_PRINT_ERROR_TRACE(#NAME, cu_status_code); \ if (cu_status_code != CUDA_SUCCESS) \ std::cerr \ << "PyCUDA WARNING: a clean-up operation failed (dead context maybe?)" \ << std::endl \ << pycuda::error::make_message(#NAME, cu_status_code) \ << std::endl; \ } #define CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(TYPE) \ catch (pycuda::cannot_activate_out_of_thread_context) \ { } \ catch (pycuda::cannot_activate_dead_context) \ { \ /* PyErr_Warn( \ PyExc_UserWarning, #TYPE " in dead context was implicitly cleaned up");*/ \ } // In all likelihood, this TYPE's managing thread has exited, and // therefore its context has already been deleted. No need to harp // on the fact that we still thought there was cleanup to do. 
// }}} namespace pycuda { namespace py = boost::python; typedef #if CUDAPP_CUDA_VERSION >= 3020 size_t #else unsigned int #endif pycuda_size_t; typedef #if defined(_WIN32) && defined(_WIN64) long long #else long #endif hash_type; // {{{ error reporting class error : public std::runtime_error { private: const char *m_routine; CUresult m_code; public: static std::string make_message(const char *rout, CUresult c, const char *msg=0) { std::string result = rout; result += " failed: "; result += curesult_to_str(c); if (msg) { result += " - "; result += msg; } return result; } error(const char *rout, CUresult c, const char *msg=0) : std::runtime_error(make_message(rout, c, msg)), m_routine(rout), m_code(c) { } const char *routine() const { return m_routine; } CUresult code() const { return m_code; } bool is_out_of_memory() const { return code() == CUDA_ERROR_OUT_OF_MEMORY; } static const char *curesult_to_str(CUresult e) { switch (e) { case CUDA_SUCCESS: return "success"; case CUDA_ERROR_INVALID_VALUE: return "invalid value"; case CUDA_ERROR_OUT_OF_MEMORY: return "out of memory"; case CUDA_ERROR_NOT_INITIALIZED: return "not initialized"; #if CUDAPP_CUDA_VERSION >= 2000 case CUDA_ERROR_DEINITIALIZED: return "deinitialized"; #endif #if CUDAPP_CUDA_VERSION >= 4000 case CUDA_ERROR_PROFILER_DISABLED: return "profiler disabled"; case CUDA_ERROR_PROFILER_NOT_INITIALIZED: return "profiler not initialized"; case CUDA_ERROR_PROFILER_ALREADY_STARTED: return "profiler already started"; case CUDA_ERROR_PROFILER_ALREADY_STOPPED: return "profiler already stopped"; #endif case CUDA_ERROR_NO_DEVICE: return "no device"; case CUDA_ERROR_INVALID_DEVICE: return "invalid device"; case CUDA_ERROR_INVALID_IMAGE: return "invalid image"; case CUDA_ERROR_INVALID_CONTEXT: return "invalid context"; case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "context already current"; case CUDA_ERROR_MAP_FAILED: return "map failed"; case CUDA_ERROR_UNMAP_FAILED: return "unmap failed"; case CUDA_ERROR_ARRAY_IS_MAPPED: return "array is mapped"; case CUDA_ERROR_ALREADY_MAPPED: return "already mapped"; case CUDA_ERROR_NO_BINARY_FOR_GPU: return "no binary for gpu"; case CUDA_ERROR_ALREADY_ACQUIRED: return "already acquired"; case CUDA_ERROR_NOT_MAPPED: return "not mapped"; #if CUDAPP_CUDA_VERSION >= 3000 case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "not mapped as array"; case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "not mapped as pointer"; #ifdef CUDAPP_POST_30_BETA case CUDA_ERROR_ECC_UNCORRECTABLE: return "ECC uncorrectable"; #endif #endif #if CUDAPP_CUDA_VERSION >= 3010 case CUDA_ERROR_UNSUPPORTED_LIMIT: return "unsupported limit"; #endif #if CUDAPP_CUDA_VERSION >= 4000 case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: return "context already in use"; #endif case CUDA_ERROR_INVALID_SOURCE: return "invalid source"; case CUDA_ERROR_FILE_NOT_FOUND: return "file not found"; #if CUDAPP_CUDA_VERSION >= 3010 case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "shared object symbol not found"; case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "shared object init failed"; #endif case CUDA_ERROR_INVALID_HANDLE: return "invalid handle"; case CUDA_ERROR_NOT_FOUND: return "not found"; case CUDA_ERROR_NOT_READY: return "not ready"; case CUDA_ERROR_LAUNCH_FAILED: return "launch failed"; case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "launch out of resources"; case CUDA_ERROR_LAUNCH_TIMEOUT: return "launch timeout"; case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "launch incompatible texturing"; #if CUDAPP_CUDA_VERSION >= 4000 case 
CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: return "peer access already enabled"; case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: return "peer access not enabled"; case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: return "primary context active"; case CUDA_ERROR_CONTEXT_IS_DESTROYED: return "context is destroyed"; #endif #if (CUDAPP_CUDA_VERSION >= 3000) && (CUDAPP_CUDA_VERSION < 3020) case CUDA_ERROR_POINTER_IS_64BIT: return "attempted to retrieve 64-bit pointer via 32-bit api function"; case CUDA_ERROR_SIZE_IS_64BIT: return "attempted to retrieve 64-bit size via 32-bit api function"; #endif #if CUDAPP_CUDA_VERSION >= 4010 case CUDA_ERROR_ASSERT: return "device-side assert triggered"; case CUDA_ERROR_TOO_MANY_PEERS: return "too many peers"; case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: return "host memory already registered"; case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: return "host memory not registered"; #endif #if CUDAPP_CUDA_VERSION >= 5000 case CUDA_ERROR_NOT_SUPPORTED: return "operation not supported on current system or device"; #endif case CUDA_ERROR_UNKNOWN: return "unknown"; default: return "invalid/unknown error code"; } } }; struct cannot_activate_out_of_thread_context : public std::logic_error { cannot_activate_out_of_thread_context(std::string const &w) : std::logic_error(w) { } }; struct cannot_activate_dead_context : public std::logic_error { cannot_activate_dead_context(std::string const &w) : std::logic_error(w) { } }; // }}} // {{{ version query ------------------------------------------------------------ #if CUDAPP_CUDA_VERSION >= 2020 inline int get_driver_version() { int result; CUDAPP_CALL_GUARDED(cuDriverGetVersion, (&result)); return result; } #endif // }}} // {{{ device class context; class device { private: CUdevice m_device; public: device(CUdevice dev) : m_device(dev) { } static int count() { int result; CUDAPP_CALL_GUARDED(cuDeviceGetCount, (&result)); return result; } std::string name() { char buffer[1024]; CUDAPP_CALL_GUARDED(cuDeviceGetName, (buffer, sizeof(buffer), m_device)); return buffer; } #if CUDAPP_CUDA_VERSION >= 4010 std::string pci_bus_id() { char buffer[1024]; CUDAPP_CALL_GUARDED(cuDeviceGetPCIBusId, (buffer, sizeof(buffer), m_device)); return buffer; } #endif py::tuple compute_capability() { int major, minor; CUDAPP_CALL_GUARDED(cuDeviceComputeCapability, (&major, &minor, m_device)); return py::make_tuple(major, minor); } pycuda_size_t total_memory() { pycuda_size_t bytes; CUDAPP_CALL_GUARDED(cuDeviceTotalMem, (&bytes, m_device)); return bytes; } int get_attribute(CUdevice_attribute attr) const { int result; CUDAPP_CALL_GUARDED(cuDeviceGetAttribute, (&result, attr, m_device)); return result; } bool operator==(const device &other) const { return m_device == other.m_device; } bool operator!=(const device &other) const { return m_device != other.m_device; } hash_type hash() const { return m_device; } boost::shared_ptr make_context(unsigned int flags); CUdevice handle() const { return m_device; } #if CUDAPP_CUDA_VERSION >= 4000 bool can_access_peer(device const &other) { int result; CUDAPP_CALL_GUARDED(cuDeviceCanAccessPeer, (&result, handle(), other.handle())); return result; } #endif }; inline void init(unsigned int flags) { CUDAPP_CALL_GUARDED(cuInit, (flags)); } inline device *make_device(int ordinal) { CUdevice result; CUDAPP_CALL_GUARDED(cuDeviceGet, (&result, ordinal)); return new device(result); } #if CUDAPP_CUDA_VERSION >= 4010 inline device *make_device_from_pci_bus_id(std::string const pci_bus_id) { CUdevice result; 
CUDAPP_CALL_GUARDED(cuDeviceGetByPCIBusId, (&result, const_cast(pci_bus_id.c_str()))); return new device(result); } #endif // }}} // {{{ context /* A word on context management: We don't let CUDA's context stack get more * than one deep. CUDA only supports pushing floating contexts. We may wish * to push contexts that are already active at a deeper stack level, so we * maintain all contexts floating other than the top one. */ // for friend decl namespace gl { boost::shared_ptr make_gl_context(device const &dev, unsigned int flags); } class context_stack; extern boost::thread_specific_ptr context_stack_ptr; class context_stack { /* This wrapper is necessary because we need to pop the contents * off the stack before we destroy each of the contexts. This, in turn, * is because the contexts need to be able to access the stack in order * to be destroyed. */ private: typedef std::stack > stack_t; typedef stack_t::value_type value_type;; stack_t m_stack; public: ~context_stack(); bool empty() const { return m_stack.empty(); } value_type &top() { return m_stack.top(); } void pop() { m_stack.pop(); } void push(value_type v) { m_stack.push(v); } static context_stack &get() { if (context_stack_ptr.get() == 0) context_stack_ptr.reset(new context_stack); return *context_stack_ptr; } }; class context : boost::noncopyable { private: CUcontext m_context; bool m_valid; unsigned m_use_count; boost::thread::id m_thread; public: context(CUcontext ctx) : m_context(ctx), m_valid(true), m_use_count(1), m_thread(boost::this_thread::get_id()) { } ~context() { if (m_valid) { /* It's possible that we get here with a non-zero m_use_count. Since the context * stack holds shared_ptrs, this must mean that the context stack itself is getting * destroyed, which means it's ok for this context to sign off, too. */ detach(); } } CUcontext handle() const { return m_context; } bool operator==(const context &other) const { return m_context == other.m_context; } bool operator!=(const context &other) const { return m_context != other.m_context; } hash_type hash() const { return hash_type(m_context) ^ hash_type(this); } boost::thread::id thread_id() const { return m_thread; } bool is_valid() const { return m_valid; } static boost::shared_ptr attach(unsigned int flags) { CUcontext current; CUDAPP_CALL_GUARDED(cuCtxAttach, (¤t, flags)); boost::shared_ptr result(new context(current)); context_stack::get().push(result); return result; } void detach() { if (m_valid) { bool active_before_destruction = current_context().get() == this; if (active_before_destruction) { CUDAPP_CALL_GUARDED_CLEANUP(cuCtxDetach, (m_context)); } else { if (m_thread == boost::this_thread::get_id()) { CUDAPP_CALL_GUARDED_CLEANUP(cuCtxPushCurrent, (m_context)); CUDAPP_CALL_GUARDED_CLEANUP(cuCtxDetach, (m_context)); /* pop is implicit in detach */ } else { // In all likelihood, this context's managing thread has exited, and // therefore this context has already been deleted. No need to harp // on the fact that we still thought there was cleanup to do. 
// std::cerr << "PyCUDA WARNING: leaked out-of-thread context " << std::endl; } } m_valid = false; if (active_before_destruction) { boost::shared_ptr new_active = current_context(this); if (new_active.get()) { CUDAPP_CALL_GUARDED(cuCtxPushCurrent, (new_active->m_context)); } } } else throw error("context::detach", CUDA_ERROR_INVALID_CONTEXT, "cannot detach from invalid context"); } static device get_device() { CUdevice dev; CUDAPP_CALL_GUARDED(cuCtxGetDevice, (&dev)); return device(dev); } #if CUDAPP_CUDA_VERSION >= 2000 static void prepare_context_switch() { if (!context_stack::get().empty()) { CUcontext popped; CUDAPP_CALL_GUARDED(cuCtxPopCurrent, (&popped)); } } static void pop() { prepare_context_switch(); context_stack &ctx_stack = context_stack::get(); if (ctx_stack.empty()) { throw error("context::pop", CUDA_ERROR_INVALID_CONTEXT, "cannot pop non-current context"); } boost::shared_ptr current = current_context(); if (current) --current->m_use_count; ctx_stack.pop(); current = current_context(); if (current) CUDAPP_CALL_GUARDED(cuCtxPushCurrent, (current_context()->m_context)); } #else static void prepare_context_switch() { } #endif static void synchronize() { CUDAPP_CALL_GUARDED_THREADED(cuCtxSynchronize, ()); } static boost::shared_ptr current_context(context *except=0) { while (true) { if (context_stack::get().empty()) return boost::shared_ptr(); boost::shared_ptr result(context_stack::get().top()); if (result.get() != except && result->is_valid()) { // good, weak pointer didn't expire return result; } // context invalid, pop it and try again. context_stack::get().pop(); } } #if CUDAPP_CUDA_VERSION >= 3010 static void set_limit(CUlimit limit, size_t value) { CUDAPP_CALL_GUARDED(cuCtxSetLimit, (limit, value)); } static size_t get_limit(CUlimit limit) { size_t value; CUDAPP_CALL_GUARDED(cuCtxGetLimit, (&value, limit)); return value; } #endif #if CUDAPP_CUDA_VERSION >= 3020 static CUfunc_cache get_cache_config() { CUfunc_cache value; CUDAPP_CALL_GUARDED(cuCtxGetCacheConfig, (&value)); return value; } static void set_cache_config(CUfunc_cache cc) { CUDAPP_CALL_GUARDED(cuCtxSetCacheConfig, (cc)); } unsigned int get_api_version() { unsigned int value; CUDAPP_CALL_GUARDED(cuCtxGetApiVersion, (m_context, &value)); return value; } #endif #if CUDAPP_CUDA_VERSION >= 4000 static void enable_peer_access(context const &peer, unsigned int flags) { CUDAPP_CALL_GUARDED(cuCtxEnablePeerAccess, (peer.handle(), flags)); } static void disable_peer_access(context const &peer) { CUDAPP_CALL_GUARDED(cuCtxDisablePeerAccess, (peer.handle())); } #endif #if CUDAPP_CUDA_VERSION >= 4020 static CUsharedconfig get_shared_config() { CUsharedconfig config; CUDAPP_CALL_GUARDED(cuCtxGetSharedMemConfig, (&config)); return config; } static void set_shared_config(CUsharedconfig config) { CUDAPP_CALL_GUARDED(cuCtxSetSharedMemConfig, (config)); } #endif friend class device; friend void context_push(boost::shared_ptr ctx); friend boost::shared_ptr gl::make_gl_context(device const &dev, unsigned int flags); }; inline boost::shared_ptr device::make_context(unsigned int flags) { context::prepare_context_switch(); CUcontext ctx; CUDAPP_CALL_GUARDED(cuCtxCreate, (&ctx, flags, m_device)); boost::shared_ptr result(new context(ctx)); context_stack::get().push(result); return result; } #if CUDAPP_CUDA_VERSION >= 2000 inline void context_push(boost::shared_ptr ctx) { context::prepare_context_switch(); CUDAPP_CALL_GUARDED(cuCtxPushCurrent, (ctx->m_context)); context_stack::get().push(ctx); ++ctx->m_use_count; } #endif inline 
context_stack::~context_stack() { if (!m_stack.empty()) { std::cerr << "-------------------------------------------------------------------" << std::endl << "PyCUDA ERROR: The context stack was not empty upon module cleanup." << std::endl << "-------------------------------------------------------------------" << std::endl << "A context was still active when the context stack was being" << std::endl << "cleaned up. At this point in our execution, CUDA may already" << std::endl << "have been deinitialized, so there is no way we can finish" << std::endl << "cleanly. The program will be aborted now." << std::endl << "Use Context.pop() to avoid this problem." << std::endl << "-------------------------------------------------------------------" << std::endl; abort(); } } class explicit_context_dependent { private: boost::shared_ptr m_ward_context; public: void acquire_context() { m_ward_context = context::current_context(); if (m_ward_context.get() == 0) throw error("explicit_context_dependent", CUDA_ERROR_INVALID_CONTEXT, "no currently active context?"); } void release_context() { m_ward_context.reset(); } boost::shared_ptr get_context() { return m_ward_context; } }; class context_dependent : public explicit_context_dependent { private: boost::shared_ptr m_ward_context; public: context_dependent() { acquire_context(); } }; class scoped_context_activation { private: boost::shared_ptr m_context; bool m_did_switch; public: scoped_context_activation(boost::shared_ptr ctx) : m_context(ctx) { if (!m_context->is_valid()) throw pycuda::cannot_activate_dead_context( "cannot activate dead context"); m_did_switch = context::current_context() != m_context; if (m_did_switch) { if (boost::this_thread::get_id() != m_context->thread_id()) throw pycuda::cannot_activate_out_of_thread_context( "cannot activate out-of-thread context"); #if CUDAPP_CUDA_VERSION >= 2000 context_push(m_context); #else throw pycuda::error("scoped_context_activation", CUDA_ERROR_INVALID_CONTEXT, "not available in CUDA < 2.0"); #endif } } ~scoped_context_activation() { #if CUDAPP_CUDA_VERSION >= 2000 if (m_did_switch) m_context->pop(); #endif } }; // }}} // {{{ stream class event; class stream : public boost::noncopyable, public context_dependent { private: CUstream m_stream; public: stream(unsigned int flags=0) { CUDAPP_CALL_GUARDED(cuStreamCreate, (&m_stream, flags)); } ~stream() { try { scoped_context_activation ca(get_context()); CUDAPP_CALL_GUARDED_CLEANUP(cuStreamDestroy, (m_stream)); } CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(stream); } void synchronize() { CUDAPP_CALL_GUARDED_THREADED(cuStreamSynchronize, (m_stream)); } CUstream handle() const { return m_stream; } intptr_t handle_int() const { return (intptr_t) m_stream; } #if CUDAPP_CUDA_VERSION >= 3020 void wait_for_event(const event &evt); #endif bool is_done() const { CUDAPP_PRINT_CALL_TRACE("cuStreamQuery"); CUresult result = cuStreamQuery(m_stream); switch (result) { case CUDA_SUCCESS: return true; case CUDA_ERROR_NOT_READY: return false; default: CUDAPP_PRINT_ERROR_TRACE("cuStreamQuery", result); throw error("cuStreamQuery", result); } } }; // }}} // {{{ array class array : public boost::noncopyable, public context_dependent { private: CUarray m_array; bool m_managed; public: array(const CUDA_ARRAY_DESCRIPTOR &descr) : m_managed(true) { CUDAPP_CALL_GUARDED(cuArrayCreate, (&m_array, &descr)); } #if CUDAPP_CUDA_VERSION >= 2000 array(const CUDA_ARRAY3D_DESCRIPTOR &descr) : m_managed(true) { CUDAPP_CALL_GUARDED(cuArray3DCreate, (&m_array, &descr)); } #endif array(CUarray ary, 
bool managed) : m_array(ary), m_managed(managed) { } ~array() { free(); } void free() { if (m_managed) { try { scoped_context_activation ca(get_context()); CUDAPP_CALL_GUARDED_CLEANUP(cuArrayDestroy, (m_array)); } CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(array); m_managed = false; release_context(); } } CUDA_ARRAY_DESCRIPTOR get_descriptor() { CUDA_ARRAY_DESCRIPTOR result; CUDAPP_CALL_GUARDED(cuArrayGetDescriptor, (&result, m_array)); return result; } #if CUDAPP_CUDA_VERSION >= 2000 CUDA_ARRAY3D_DESCRIPTOR get_descriptor_3d() { CUDA_ARRAY3D_DESCRIPTOR result; CUDAPP_CALL_GUARDED(cuArray3DGetDescriptor, (&result, m_array)); return result; } #endif CUarray handle() const { return m_array; } }; // }}} // {{{ texture reference class module; class texture_reference : public boost::noncopyable { private: CUtexref m_texref; bool m_managed; // life support for array and module boost::shared_ptr m_array; boost::shared_ptr m_module; public: texture_reference() : m_managed(true) { CUDAPP_CALL_GUARDED(cuTexRefCreate, (&m_texref)); } texture_reference(CUtexref tr, bool managed) : m_texref(tr), m_managed(managed) { } ~texture_reference() { if (m_managed) { CUDAPP_CALL_GUARDED_CLEANUP(cuTexRefDestroy, (m_texref)); } } void set_module(boost::shared_ptr mod) { m_module = mod; } CUtexref handle() const { return m_texref; } void set_array(boost::shared_ptr ary) { CUDAPP_CALL_GUARDED(cuTexRefSetArray, (m_texref, ary->handle(), CU_TRSA_OVERRIDE_FORMAT)); m_array = ary; } pycuda_size_t set_address(CUdeviceptr dptr, unsigned int bytes, bool allow_offset=false) { pycuda_size_t byte_offset; CUDAPP_CALL_GUARDED(cuTexRefSetAddress, (&byte_offset, m_texref, dptr, bytes)); if (!allow_offset && byte_offset != 0) throw pycuda::error("texture_reference::set_address", CUDA_ERROR_INVALID_VALUE, "texture binding resulted in offset, but allow_offset was false"); m_array.reset(); return byte_offset; } #if CUDAPP_CUDA_VERSION >= 2020 void set_address_2d(CUdeviceptr dptr, const CUDA_ARRAY_DESCRIPTOR &descr, unsigned int pitch) { CUDAPP_CALL_GUARDED(cuTexRefSetAddress2D, (m_texref, &descr, dptr, pitch)); } #endif void set_format(CUarray_format fmt, int num_packed_components) { CUDAPP_CALL_GUARDED(cuTexRefSetFormat, (m_texref, fmt, num_packed_components)); } void set_address_mode(int dim, CUaddress_mode am) { CUDAPP_CALL_GUARDED(cuTexRefSetAddressMode, (m_texref, dim, am)); } void set_filter_mode(CUfilter_mode fm) { CUDAPP_CALL_GUARDED(cuTexRefSetFilterMode, (m_texref, fm)); } void set_flags(unsigned int flags) { CUDAPP_CALL_GUARDED(cuTexRefSetFlags, (m_texref, flags)); } CUdeviceptr get_address() { CUdeviceptr result; CUDAPP_CALL_GUARDED(cuTexRefGetAddress, (&result, m_texref)); return result; } array *get_array() { CUarray result; CUDAPP_CALL_GUARDED(cuTexRefGetArray, (&result, m_texref)); return new array(result, false); } CUaddress_mode get_address_mode(int dim) { CUaddress_mode result; CUDAPP_CALL_GUARDED(cuTexRefGetAddressMode, (&result, m_texref, dim)); return result; } CUfilter_mode get_filter_mode() { CUfilter_mode result; CUDAPP_CALL_GUARDED(cuTexRefGetFilterMode, (&result, m_texref)); return result; } #if CUDAPP_CUDA_VERSION >= 2000 py::tuple get_format() { CUarray_format fmt; int num_channels; CUDAPP_CALL_GUARDED(cuTexRefGetFormat, (&fmt, &num_channels, m_texref)); return py::make_tuple(fmt, num_channels); } #endif unsigned int get_flags() { unsigned int result; CUDAPP_CALL_GUARDED(cuTexRefGetFlags, (&result, m_texref)); return result; } }; // }}} // {{{ surface reference #if CUDAPP_CUDA_VERSION >= 3010 class module; 
class surface_reference : public boost::noncopyable { private: CUsurfref m_surfref; // life support for array and module boost::shared_ptr m_array; boost::shared_ptr m_module; public: surface_reference(CUsurfref sr) : m_surfref(sr) { } void set_module(boost::shared_ptr mod) { m_module = mod; } CUsurfref handle() const { return m_surfref; } void set_array(boost::shared_ptr ary, unsigned int flags) { CUDAPP_CALL_GUARDED(cuSurfRefSetArray, (m_surfref, ary->handle(), flags)); m_array = ary; } array *get_array() { CUarray result; CUDAPP_CALL_GUARDED(cuSurfRefGetArray, (&result, m_surfref)); return new array(result, false); } }; #endif // }}} // {{{ module class function; class module : public boost::noncopyable, public context_dependent { private: CUmodule m_module; public: module(CUmodule mod) : m_module(mod) { } ~module() { try { scoped_context_activation ca(get_context()); CUDAPP_CALL_GUARDED_CLEANUP(cuModuleUnload, (m_module)); } CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(module); } CUmodule handle() const { return m_module; } function get_function(const char *name); py::tuple get_global(const char *name) { CUdeviceptr devptr; pycuda_size_t bytes; CUDAPP_CALL_GUARDED(cuModuleGetGlobal, (&devptr, &bytes, m_module, name)); return py::make_tuple(devptr, bytes); } }; inline module *module_from_file(const char *filename) { CUmodule mod; CUDAPP_CALL_GUARDED(cuModuleLoad, (&mod, filename)); return new module(mod); } inline texture_reference *module_get_texref( boost::shared_ptr mod, const char *name) { CUtexref tr; CUDAPP_CALL_GUARDED(cuModuleGetTexRef, (&tr, mod->handle(), name)); std::auto_ptr result( new texture_reference(tr, false)); result->set_module(mod); return result.release(); } #if CUDAPP_CUDA_VERSION >= 3010 inline surface_reference *module_get_surfref( boost::shared_ptr mod, const char *name) { CUsurfref sr; CUDAPP_CALL_GUARDED(cuModuleGetSurfRef, (&sr, mod->handle(), name)); std::auto_ptr result( new surface_reference(sr)); result->set_module(mod); return result.release(); } #endif // }}} // {{{ function class function { private: CUfunction m_function; std::string m_symbol; public: function(CUfunction func, std::string const &sym) : m_function(func), m_symbol(sym) { } void set_block_shape(int x, int y, int z) { CUDAPP_CALL_GUARDED_WITH_TRACE_INFO( cuFuncSetBlockShape, (m_function, x, y, z), m_symbol); } void set_shared_size(unsigned int bytes) { CUDAPP_CALL_GUARDED_WITH_TRACE_INFO( cuFuncSetSharedSize, (m_function, bytes), m_symbol); } void param_set_size(unsigned int bytes) { CUDAPP_CALL_GUARDED_WITH_TRACE_INFO( cuParamSetSize, (m_function, bytes), m_symbol); } void param_set(int offset, unsigned int value) { CUDAPP_CALL_GUARDED_WITH_TRACE_INFO( cuParamSeti, (m_function, offset, value), m_symbol); } void param_set(int offset, float value) { CUDAPP_CALL_GUARDED_WITH_TRACE_INFO( cuParamSetf, (m_function, offset, value), m_symbol); } void param_setv(int offset, void *buf, size_t len) { // maybe the unsigned int will change, it does not seem right CUDAPP_CALL_GUARDED_WITH_TRACE_INFO( cuParamSetv, (m_function, offset, buf, (unsigned int) len), m_symbol); } void param_set_texref(const texture_reference &tr) { CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(cuParamSetTexRef, (m_function, CU_PARAM_TR_DEFAULT, tr.handle()), m_symbol); } void launch() { CUDAPP_CALL_GUARDED_THREADED_WITH_TRACE_INFO( cuLaunch, (m_function), m_symbol); } void launch_grid(int grid_width, int grid_height) { CUDAPP_CALL_GUARDED_THREADED_WITH_TRACE_INFO( cuLaunchGrid, (m_function, grid_width, grid_height), m_symbol); } void 
launch_grid_async(int grid_width, int grid_height, const stream &s) { CUDAPP_CALL_GUARDED_THREADED_WITH_TRACE_INFO( cuLaunchGridAsync, (m_function, grid_width, grid_height, s.handle()), m_symbol); } #if CUDAPP_CUDA_VERSION >= 2020 int get_attribute(CUfunction_attribute attr) const { int result; CUDAPP_CALL_GUARDED_WITH_TRACE_INFO( cuFuncGetAttribute, (&result, attr, m_function), m_symbol); return result; } #endif #if CUDAPP_CUDA_VERSION >= 3000 && defined(CUDAPP_POST_30_BETA) void set_cache_config(CUfunc_cache fc) { CUDAPP_CALL_GUARDED_WITH_TRACE_INFO( cuFuncSetCacheConfig, (m_function, fc), m_symbol); } #endif #if CUDAPP_CUDA_VERSION >= 4000 void launch_kernel(py::tuple grid_dim_py, py::tuple block_dim_py, py::object parameter_buffer, unsigned shared_mem_bytes, py::object stream_py) { const unsigned axis_count = 3; unsigned grid_dim[axis_count]; unsigned block_dim[axis_count]; for (unsigned i = 0; i < axis_count; ++i) { grid_dim[i] = 1; block_dim[i] = 1; } pycuda_size_t gd_length = py::len(grid_dim_py); if (gd_length > axis_count) throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE, "too many grid dimensions in kernel launch"); for (unsigned i = 0; i < gd_length; ++i) grid_dim[i] = py::extract(grid_dim_py[i]); pycuda_size_t bd_length = py::len(block_dim_py); if (bd_length > axis_count) throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE, "too many block dimensions in kernel launch"); for (unsigned i = 0; i < bd_length; ++i) block_dim[i] = py::extract(block_dim_py[i]); PYCUDA_PARSE_STREAM_PY; const void *par_buf; PYCUDA_BUFFER_SIZE_T py_par_len; if (PyObject_AsReadBuffer(parameter_buffer.ptr(), &par_buf, &py_par_len)) throw py::error_already_set(); size_t par_len = py_par_len; void *config[] = { CU_LAUNCH_PARAM_BUFFER_POINTER, const_cast(par_buf), CU_LAUNCH_PARAM_BUFFER_SIZE, &par_len, CU_LAUNCH_PARAM_END }; CUDAPP_CALL_GUARDED( cuLaunchKernel, (m_function, grid_dim[0], grid_dim[1], grid_dim[2], block_dim[0], block_dim[1], block_dim[2], shared_mem_bytes, s_handle, 0, config )); } #endif #if CUDAPP_CUDA_VERSION >= 4020 void set_shared_config(CUsharedconfig config) { CUDAPP_CALL_GUARDED_WITH_TRACE_INFO( cuFuncSetSharedMemConfig, (m_function, config), m_symbol); } #endif }; inline function module::get_function(const char *name) { CUfunction func; CUDAPP_CALL_GUARDED(cuModuleGetFunction, (&func, m_module, name)); return function(func, name); } // }}} // {{{ device memory inline py::tuple mem_get_info() { pycuda_size_t free, total; CUDAPP_CALL_GUARDED(cuMemGetInfo, (&free, &total)); return py::make_tuple(free, total); } inline CUdeviceptr mem_alloc(size_t bytes) { CUdeviceptr devptr; CUDAPP_CALL_GUARDED(cuMemAlloc, (&devptr, bytes)); return devptr; } inline void mem_free(CUdeviceptr devptr) { CUDAPP_CALL_GUARDED_CLEANUP(cuMemFree, (devptr)); } // A class the user can override to make device_allocation- // workalikes. 
class pointer_holder_base { public: virtual ~pointer_holder_base() { } virtual CUdeviceptr get_pointer() = 0; operator CUdeviceptr() { return get_pointer(); } py::object as_buffer(size_t size, size_t offset) { return py::object( py::handle<>( #if PY_VERSION_HEX >= 0x03030000 PyMemoryView_FromMemory((char *) (get_pointer() + offset), size, PyBUF_READ | PyBUF_WRITE) #else /* Py2 */ PyBuffer_FromReadWriteMemory((void *) (get_pointer() + offset), size) #endif )); } }; class device_allocation : public boost::noncopyable, public context_dependent { private: bool m_valid; protected: CUdeviceptr m_devptr; public: device_allocation(CUdeviceptr devptr) : m_valid(true), m_devptr(devptr) { } void free() { if (m_valid) { try { scoped_context_activation ca(get_context()); mem_free(m_devptr); } CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(device_allocation); release_context(); m_valid = false; } else throw pycuda::error("device_allocation::free", CUDA_ERROR_INVALID_HANDLE); } ~device_allocation() { if (m_valid) free(); } operator CUdeviceptr() const { return m_devptr; } py::object as_buffer(size_t size, size_t offset) { return py::object( py::handle<>( #if PY_VERSION_HEX >= 0x03030000 PyMemoryView_FromMemory((char *) (m_devptr + offset), size, PyBUF_READ | PyBUF_WRITE) #else /* Py2 */ PyBuffer_FromReadWriteMemory((void *) (m_devptr + offset), size) #endif )); } }; inline Py_ssize_t mem_alloc_pitch( std::auto_ptr &da, unsigned int width, unsigned int height, unsigned int access_size) { CUdeviceptr devptr; pycuda_size_t pitch; CUDAPP_CALL_GUARDED(cuMemAllocPitch, (&devptr, &pitch, width, height, access_size)); da = std::auto_ptr(new device_allocation(devptr)); return pitch; } inline py::tuple mem_get_address_range(CUdeviceptr ptr) { CUdeviceptr base; pycuda_size_t size; CUDAPP_CALL_GUARDED(cuMemGetAddressRange, (&base, &size, ptr)); return py::make_tuple(base, size); } inline void memcpy_dtoa(array const &ary, unsigned int index, CUdeviceptr src, unsigned int len) { CUDAPP_CALL_GUARDED_THREADED(cuMemcpyDtoA, (ary.handle(), index, src, len)); } inline void memcpy_atod(CUdeviceptr dst, array const &ary, unsigned int index, unsigned int len) { CUDAPP_CALL_GUARDED_THREADED(cuMemcpyAtoD, (dst, ary.handle(), index, len)); } inline void memcpy_atoa( array const &dst, unsigned int dst_index, array const &src, unsigned int src_index, unsigned int len) { CUDAPP_CALL_GUARDED_THREADED(cuMemcpyAtoA, (dst.handle(), dst_index, src.handle(), src_index, len)); } // }}} // {{{ ipc_mem_handle #if CUDAPP_CUDA_VERSION >= 4010 && PY_VERSION_HEX >= 0x02060000 class ipc_mem_handle : public boost::noncopyable, public context_dependent { private: bool m_valid; protected: CUdeviceptr m_devptr; public: ipc_mem_handle(py::object obj, CUipcMem_flags flags=CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS) : m_valid(true) { if (!PyByteArray_Check(obj.ptr())) throw pycuda::error("event_from_ipc_handle", CUDA_ERROR_INVALID_VALUE, "argument is not a bytes array"); CUipcMemHandle handle; if (PyByteArray_GET_SIZE(obj.ptr()) != sizeof(handle)) throw pycuda::error("event_from_ipc_handle", CUDA_ERROR_INVALID_VALUE, "handle has the wrong size"); memcpy(&handle, PyByteArray_AS_STRING(obj.ptr()), sizeof(handle)); CUDAPP_CALL_GUARDED(cuIpcOpenMemHandle, (&m_devptr, handle, flags)); } void close() { if (m_valid) { try { scoped_context_activation ca(get_context()); CUDAPP_CALL_GUARDED_CLEANUP(cuIpcCloseMemHandle, (m_devptr)); mem_free(m_devptr); } CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(ipc_mem_handle); release_context(); m_valid = false; } else throw 
pycuda::error("ipc_mem_handle::close", CUDA_ERROR_INVALID_HANDLE); } ~ipc_mem_handle() { if (m_valid) close(); } operator CUdeviceptr() const { return m_devptr; } }; inline py::object mem_get_ipc_handle(CUdeviceptr devptr) { CUipcMemHandle handle; CUDAPP_CALL_GUARDED(cuIpcGetMemHandle, (&handle, devptr)); return py::object(py::handle<>(PyByteArray_FromStringAndSize( reinterpret_cast(&handle), sizeof(handle)))); } #endif // }}} // {{{ structured memcpy #define MEMCPY_SETTERS \ void set_src_host(py::object buf_py) \ { \ srcMemoryType = CU_MEMORYTYPE_HOST; \ PYCUDA_BUFFER_SIZE_T len; \ if (PyObject_AsReadBuffer(buf_py.ptr(), &srcHost, &len)) \ throw py::error_already_set(); \ } \ \ void set_src_array(array const &ary) \ { \ srcMemoryType = CU_MEMORYTYPE_ARRAY; \ srcArray = ary.handle(); \ } \ \ void set_src_device(CUdeviceptr devptr) \ { \ srcMemoryType = CU_MEMORYTYPE_DEVICE; \ srcDevice = devptr; \ } \ \ void set_dst_host(py::object buf_py) \ { \ dstMemoryType = CU_MEMORYTYPE_HOST; \ PYCUDA_BUFFER_SIZE_T len; \ if (PyObject_AsWriteBuffer(buf_py.ptr(), &dstHost, &len)) \ throw py::error_already_set(); \ } \ \ void set_dst_array(array const &ary) \ { \ dstMemoryType = CU_MEMORYTYPE_ARRAY; \ dstArray = ary.handle(); \ } \ \ void set_dst_device(CUdeviceptr devptr) \ { \ dstMemoryType = CU_MEMORYTYPE_DEVICE; \ dstDevice = devptr; \ } #if CUDAPP_CUDA_VERSION >= 4000 #define MEMCPY_SETTERS_UNIFIED \ void set_src_unified(py::object buf_py) \ { \ srcMemoryType = CU_MEMORYTYPE_UNIFIED; \ PYCUDA_BUFFER_SIZE_T len; \ if (PyObject_AsReadBuffer(buf_py.ptr(), &srcHost, &len)) \ throw py::error_already_set(); \ } \ \ void set_dst_unified(py::object buf_py) \ { \ dstMemoryType = CU_MEMORYTYPE_UNIFIED; \ PYCUDA_BUFFER_SIZE_T len; \ if (PyObject_AsWriteBuffer(buf_py.ptr(), &dstHost, &len)) \ throw py::error_already_set(); \ } #else #define MEMCPY_SETTERS_UNIFIED /* empty */ #endif struct memcpy_2d : public CUDA_MEMCPY2D { memcpy_2d() { srcXInBytes = 0; srcY = 0; dstXInBytes = 0; dstY = 0; } MEMCPY_SETTERS; MEMCPY_SETTERS_UNIFIED; void execute(bool aligned=false) const { if (aligned) { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy2D, (this)); } else { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy2DUnaligned, (this)); } } void execute_async(const stream &s) const { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy2DAsync, (this, s.handle())); } }; #if CUDAPP_CUDA_VERSION >= 2000 struct memcpy_3d : public CUDA_MEMCPY3D { memcpy_3d() { reserved0 = 0; reserved1 = 0; srcXInBytes = 0; srcY = 0; srcZ = 0; srcLOD = 0; dstXInBytes = 0; dstY = 0; dstZ = 0; dstLOD = 0; } MEMCPY_SETTERS; MEMCPY_SETTERS_UNIFIED; void execute() const { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy3D, (this)); } void execute_async(const stream &s) const { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy3DAsync, (this, s.handle())); } }; #endif #if CUDAPP_CUDA_VERSION >= 4000 struct memcpy_3d_peer : public CUDA_MEMCPY3D_PEER { memcpy_3d_peer() { srcXInBytes = 0; srcY = 0; srcZ = 0; srcLOD = 0; dstXInBytes = 0; dstY = 0; dstZ = 0; dstLOD = 0; } MEMCPY_SETTERS; MEMCPY_SETTERS_UNIFIED; void set_src_context(context const &ctx) { srcContext = ctx.handle(); } void set_dst_context(context const &ctx) { dstContext = ctx.handle(); } void execute() const { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy3DPeer, (this)); } void execute_async(const stream &s) const { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy3DPeerAsync, (this, s.handle())); } }; #endif // }}} // {{{ host memory inline void *mem_host_alloc(size_t size, unsigned flags=0) { void *m_data; #if CUDAPP_CUDA_VERSION >= 2020 
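    // On CUDA 2.2 and newer, cuMemHostAlloc honors the requested flags
    // (CU_MEMHOSTALLOC_PORTABLE, _DEVICEMAP, _WRITECOMBINED); the pre-2.2
    // fallback below accepts only flags == 0 and uses cuMemAllocHost.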
    CUDAPP_CALL_GUARDED(cuMemHostAlloc, (&m_data, size, flags));
#else
    if (flags != 0)
      throw pycuda::error("mem_host_alloc", CUDA_ERROR_INVALID_VALUE,
          "nonzero flags in mem_host_alloc not allowed in CUDA 2.1 and older");

    CUDAPP_CALL_GUARDED(cuMemAllocHost, (&m_data, size));
#endif
    return m_data;
  }

  inline void mem_host_free(void *ptr)
  {
    CUDAPP_CALL_GUARDED_CLEANUP(cuMemFreeHost, (ptr));
  }

#if CUDAPP_CUDA_VERSION >= 6000
  inline CUdeviceptr mem_managed_alloc(size_t size, unsigned flags=0)
  {
    CUdeviceptr m_data;
    CUDAPP_CALL_GUARDED(cuMemAllocManaged, (&m_data, size, flags));
    return m_data;
  }
#endif

#if CUDAPP_CUDA_VERSION >= 4000
  inline void *mem_host_register(void *ptr, size_t bytes, unsigned int flags=0)
  {
    CUDAPP_CALL_GUARDED(cuMemHostRegister, (ptr, bytes, flags));
    return ptr;
  }

  inline void mem_host_unregister(void *ptr)
  {
    CUDAPP_CALL_GUARDED_CLEANUP(cuMemHostUnregister, (ptr));
  }
#endif

  inline void *aligned_malloc(size_t size, size_t alignment, void **original_pointer)
  {
    // alignment must be a power of two.
    if ((alignment & (alignment - 1)) != 0)
      throw pycuda::error("aligned_malloc", CUDA_ERROR_INVALID_VALUE,
          "alignment must be a power of two");

    if (alignment == 0)
      throw pycuda::error("aligned_malloc", CUDA_ERROR_INVALID_VALUE,
          "alignment must be non-zero");

    void *p = malloc(size + (alignment - 1));
    if (!p)
      throw pycuda::error("aligned_malloc", CUDA_ERROR_OUT_OF_MEMORY,
          "aligned malloc failed");

    *original_pointer = p;
    p = (void *)((((ptrdiff_t)(p)) + (alignment-1)) & -alignment);
    return p;
  }

  struct host_pointer : public boost::noncopyable, public context_dependent
  {
    protected:
      bool m_valid;
      void *m_data;

    public:
      host_pointer()
        : m_valid(false)
      { }

      host_pointer(void *ptr)
        : m_valid(true), m_data(ptr)
      { }

      virtual ~host_pointer()
      { }

      void *data()
      { return m_data; }

#if CUDAPP_CUDA_VERSION >= 2020
      CUdeviceptr get_device_pointer()
      {
        CUdeviceptr result;
        CUDAPP_CALL_GUARDED(cuMemHostGetDevicePointer, (&result, m_data, 0));
        return result;
      }
#endif
  };

  struct pagelocked_host_allocation : public host_pointer
  {
    public:
      pagelocked_host_allocation(size_t bytesize, unsigned flags=0)
        : host_pointer(mem_host_alloc(bytesize, flags))
      { }

      /* Don't try to be clever and coalesce these in the base class.
       * Won't work: Destructors may not call virtual functions. */
      ~pagelocked_host_allocation()
      {
        if (m_valid)
          free();
      }

      void free()
      {
        if (m_valid)
        {
          try
          {
            scoped_context_activation ca(get_context());
            mem_host_free(m_data);
          }
          CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(pagelocked_host_allocation);

          release_context();
          m_valid = false;
        }
        else
          throw pycuda::error("pagelocked_host_allocation::free",
              CUDA_ERROR_INVALID_HANDLE);
      }

#if CUDAPP_CUDA_VERSION >= 3020
      unsigned int get_flags()
      {
        unsigned int flags;
        CUDAPP_CALL_GUARDED(cuMemHostGetFlags, (&flags, m_data));
        return flags;
      }
#endif
  };

  struct aligned_host_allocation : public host_pointer
  {
    void *m_original_pointer;

    public:
      aligned_host_allocation(size_t size, size_t alignment)
        : host_pointer(aligned_malloc(size, alignment, &m_original_pointer))
      { }

      /* Don't try to be clever and coalesce these in the base class.
       * Won't work: Destructors may not call virtual functions.
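       * (By the time the base-class destructor runs, the derived part of the
       * object is already gone, so a virtual free() would no longer dispatch
       * to the derived override. Hence each derived class repeats its own
       * destructor/free pair.)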
*/ ~aligned_host_allocation() { if (m_valid) free(); } void free() { if (m_valid) { ::free(m_original_pointer); m_valid = false; } else throw pycuda::error("aligned_host_allocation::free", CUDA_ERROR_INVALID_HANDLE); } }; #if CUDAPP_CUDA_VERSION >= 6000 struct managed_allocation : public device_allocation { public: managed_allocation(size_t bytesize, unsigned flags=0) : device_allocation(mem_managed_alloc(bytesize, flags)) { } // The device pointer is also valid on the host void *data() { return (void *) m_devptr; } CUdeviceptr get_device_pointer() { return m_devptr; } void attach(unsigned flags, py::object stream_py) { PYCUDA_PARSE_STREAM_PY; CUDAPP_CALL_GUARDED(cuStreamAttachMemAsync, (s_handle, m_devptr, 0, flags)); } }; #endif #if CUDAPP_CUDA_VERSION >= 4000 struct registered_host_memory : public host_pointer { private: py::object m_base; public: registered_host_memory(void *p, size_t bytes, unsigned int flags=0, py::object base=py::object()) : host_pointer(mem_host_register(p, bytes, flags)), m_base(base) { } /* Don't try to be clever and coalesce these in the base class. * Won't work: Destructors may not call virtual functions. */ ~registered_host_memory() { if (m_valid) free(); } void free() { if (m_valid) { try { scoped_context_activation ca(get_context()); mem_host_unregister(m_data); } CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(host_allocation); release_context(); m_valid = false; } else throw pycuda::error("registered_host_memory::free", CUDA_ERROR_INVALID_HANDLE); } py::object base() const { return m_base; } }; #endif // }}} // {{{ event class event : public boost::noncopyable, public context_dependent { private: CUevent m_event; public: event(unsigned int flags=0) { CUDAPP_CALL_GUARDED(cuEventCreate, (&m_event, flags)); } event(CUevent evt) : m_event(evt) { } ~event() { try { scoped_context_activation ca(get_context()); CUDAPP_CALL_GUARDED_CLEANUP(cuEventDestroy, (m_event)); } CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(event); } event *record(py::object stream_py) { PYCUDA_PARSE_STREAM_PY; CUDAPP_CALL_GUARDED(cuEventRecord, (m_event, s_handle)); return this; } CUevent handle() const { return m_event; } event *synchronize() { CUDAPP_CALL_GUARDED_THREADED(cuEventSynchronize, (m_event)); return this; } bool query() const { CUDAPP_PRINT_CALL_TRACE("cuEventQuery"); CUresult result = cuEventQuery(m_event); switch (result) { case CUDA_SUCCESS: return true; case CUDA_ERROR_NOT_READY: return false; default: CUDAPP_PRINT_ERROR_TRACE("cuEventQuery", result); throw error("cuEventQuery", result); } } float time_since(event const &start) { float result; CUDAPP_CALL_GUARDED(cuEventElapsedTime, (&result, start.m_event, m_event)); return result; } float time_till(event const &end) { float result; CUDAPP_CALL_GUARDED(cuEventElapsedTime, (&result, m_event, end.m_event)); return result; } #if CUDAPP_CUDA_VERSION >= 4010 && PY_VERSION_HEX >= 0x02060000 py::object ipc_handle() { CUipcEventHandle handle; CUDAPP_CALL_GUARDED(cuIpcGetEventHandle, (&handle, m_event)); return py::object(py::handle<>(PyByteArray_FromStringAndSize( reinterpret_cast(&handle), sizeof(handle)))); } #endif }; #if CUDAPP_CUDA_VERSION >= 3020 inline void stream::wait_for_event(const event &evt) { CUDAPP_CALL_GUARDED(cuStreamWaitEvent, (m_stream, evt.handle(), 0)); } #endif #if CUDAPP_CUDA_VERSION >= 4010 && PY_VERSION_HEX >= 0x02060000 inline event *event_from_ipc_handle(py::object obj) { if (!PyByteArray_Check(obj.ptr())) throw pycuda::error("event_from_ipc_handle", CUDA_ERROR_INVALID_VALUE, "argument is not a bytes array"); 
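  // The bytearray passed in is expected to contain a CUipcEventHandle that
  // another process obtained via event::ipc_handle(); it is unpacked below
  // and reopened with cuIpcOpenEventHandle.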
CUipcEventHandle handle; if (PyByteArray_GET_SIZE(obj.ptr()) != sizeof(handle)) throw pycuda::error("event_from_ipc_handle", CUDA_ERROR_INVALID_VALUE, "handle has the wrong size"); memcpy(&handle, PyByteArray_AS_STRING(obj.ptr()), sizeof(handle)); CUevent evt; CUDAPP_CALL_GUARDED(cuIpcOpenEventHandle, (&evt, handle)); return new event(evt); } #endif // }}} // {{{ profiler #if CUDAPP_CUDA_VERSION >= 4000 inline void initialize_profiler( const char *config_file, const char *output_file, CUoutput_mode output_mode) { CUDAPP_CALL_GUARDED(cuProfilerInitialize, (config_file, output_file, output_mode)); } inline void start_profiler() { CUDAPP_CALL_GUARDED(cuProfilerStart, ()); } inline void stop_profiler() { CUDAPP_CALL_GUARDED(cuProfilerStop, ()); } #endif // }}} } #endif // vim: foldmethod=marker pycuda-2013.1.1+git20140310/src/cpp/bitlog.cpp0000644000175000000500000000164212313360364016667 0ustar tomussrc#include /* from http://graphics.stanford.edu/~seander/bithacks.html */ const char pycuda::log_table_8[] = { 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 }; pycuda-2013.1.1+git20140310/src/cpp/curand.hpp0000644000175000000500000000766512313360364016703 0ustar tomussrc#ifndef _AFJDFJSDFSD_PYCUDA_HEADER_SEEN_CURAND_HPP #define _AFJDFJSDFSD_PYCUDA_HEADER_SEEN_CURAND_HPP #if CUDAPP_CUDA_VERSION >= 3020 #include #ifdef CUDAPP_TRACE_CUDA #define CURAND_PRINT_ERROR_TRACE(NAME, CODE) \ if (CODE != CURAND_STATUS_SUCCESS) \ std::cerr << NAME << " failed with code " << CODE << std::endl; #else #define CURAND_PRINT_ERROR_TRACE(NAME, CODE) /*nothing*/ #endif #define CURAND_CALL_GUARDED(NAME, ARGLIST) \ { \ CUDAPP_PRINT_CALL_TRACE(#NAME); \ curandStatus_t cu_status_code; \ cu_status_code = NAME ARGLIST; \ CURAND_PRINT_ERROR_TRACE(#NAME, cu_status_code); \ if (cu_status_code != CURAND_STATUS_SUCCESS) \ throw pycuda::error(#NAME, CUDA_SUCCESS);\ } #else #define CURAND_PRINT_ERROR_TRACE(NAME, CODE) /*nothing*/ #define CURAND_CALL_GUARDED(NAME, ARGLIST) /*nothing*/ #endif namespace pycuda { namespace curandom { py::tuple py_curand_version() { int version = 0; #if CUDAPP_CUDA_VERSION >= 3020 curandGetVersion(&version); #endif return py::make_tuple( version / 1000, (version % 1000)/10, version % 10); } #if CUDAPP_CUDA_VERSION >= 3020 void py_curand_get_direction_vectors( curandDirectionVectorSet_t set, py::object dst, int count) { void *buf; PYCUDA_BUFFER_SIZE_T len; int n = 0; if (PyObject_AsWriteBuffer(dst.ptr(), &buf, &len)) throw py::error_already_set(); if (CURAND_DIRECTION_VECTORS_32_JOEKUO6 == set #if CUDAPP_CUDA_VERSION >= 4000 || CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6 == set #endif ) { curandDirectionVectors32_t *vectors; CURAND_CALL_GUARDED(curandGetDirectionVectors32, (&vectors, set)); while (count > 0) { int size = ((count > 20000) ? 
20000 : count)*sizeof(curandDirectionVectors32_t); memcpy((unsigned int *)buf+n*20000*sizeof(curandDirectionVectors32_t)/sizeof(unsigned int), vectors, size); count -= size/sizeof(curandDirectionVectors32_t); n++; } } #if CUDAPP_CUDA_VERSION >= 4000 if (CURAND_DIRECTION_VECTORS_64_JOEKUO6 == set || CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6 == set) { curandDirectionVectors64_t *vectors; CURAND_CALL_GUARDED(curandGetDirectionVectors64, (&vectors, set)); while (count > 0) { int size = ((count > 20000) ? 20000 : count)*sizeof(curandDirectionVectors64_t); memcpy((unsigned long long *)buf+n*20000*sizeof(curandDirectionVectors64_t)/sizeof(unsigned long long), vectors, size); count -= size/sizeof(curandDirectionVectors64_t); n++; } } #endif } #endif #if CUDAPP_CUDA_VERSION >= 4000 void py_curand_get_scramble_constants32(py::object dst, int count) { void *buf; PYCUDA_BUFFER_SIZE_T len; int n = 0; if (PyObject_AsWriteBuffer(dst.ptr(), &buf, &len)) throw py::error_already_set(); unsigned int *vectors; CURAND_CALL_GUARDED(curandGetScrambleConstants32, (&vectors)); // Documentation does not mention number of dimensions // Assuming the same as in getDirectionVectors* while (count > 0) { int size = ((count > 20000) ? 20000 : count)*sizeof(unsigned int); memcpy((unsigned int *)buf+n*20000, vectors, size); count -= size/sizeof(unsigned int); n++; } } void py_curand_get_scramble_constants64(py::object dst, int count) { void *buf; PYCUDA_BUFFER_SIZE_T len; int n = 0; if (PyObject_AsWriteBuffer(dst.ptr(), &buf, &len)) throw py::error_already_set(); unsigned long long *vectors; CURAND_CALL_GUARDED(curandGetScrambleConstants64, (&vectors)); // Documentation does not mention number of dimensions // Assuming the same as in getDirectionVectors* while (count > 0) { int size = ((count > 20000) ? 20000 : count)*sizeof(unsigned long long); memcpy((unsigned long long *)buf+n*20000, vectors, size); count -= size/sizeof(unsigned long long); n++; } } #endif } } #endif pycuda-2013.1.1+git20140310/src/wrapper/0002755000175000000500000000000012313360364015600 5ustar tomussrcpycuda-2013.1.1+git20140310/src/wrapper/wrap_cudadrv.cpp0000644000175000000500000013512412313360364020771 0ustar tomussrc#include #include #include #include #include "tools.hpp" #include "wrap_helpers.hpp" #include #if CUDAPP_CUDA_VERSION < 1010 #error PyCuda only works with CUDA 1.1 or newer. 
#endif using namespace pycuda; using boost::shared_ptr; namespace { // {{{ error handling py::handle<> CudaError, CudaMemoryError, CudaLogicError, CudaRuntimeError, CudaLaunchError; void translate_cuda_error(const pycuda::error &err) { if (err.code() == CUDA_ERROR_LAUNCH_FAILED || err.code() == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES || err.code() == CUDA_ERROR_LAUNCH_TIMEOUT || err.code() == CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING ) PyErr_SetString(CudaLaunchError.get(), err.what()); else if (err.code() == CUDA_ERROR_OUT_OF_MEMORY) PyErr_SetString(CudaMemoryError.get(), err.what()); else if (err.code() == CUDA_ERROR_NO_DEVICE || err.code() == CUDA_ERROR_NO_BINARY_FOR_GPU || err.code() == CUDA_ERROR_NO_BINARY_FOR_GPU || err.code() == CUDA_ERROR_FILE_NOT_FOUND || err.code() == CUDA_ERROR_NOT_READY #if CUDAPP_CUDA_VERSION >= 3000 && defined(CUDAPP_POST_30_BETA) || err.code() == CUDA_ERROR_ECC_UNCORRECTABLE #endif ) PyErr_SetString(CudaRuntimeError.get(), err.what()); else if (err.code() == CUDA_ERROR_UNKNOWN) PyErr_SetString(CudaError.get(), err.what()); else PyErr_SetString(CudaLogicError.get(), err.what()); } // }}} py::tuple cuda_version() { return py::make_tuple( CUDAPP_CUDA_VERSION / 1000, (CUDAPP_CUDA_VERSION % 1000)/10, CUDAPP_CUDA_VERSION % 10); } class host_alloc_flags { }; class mem_host_register_flags { }; class mem_peer_register_flags { }; class array3d_flags { }; // {{{ "python-aware" wrappers py::object device_get_attribute(device const &dev, CUdevice_attribute attr) { #if CUDAPP_CUDA_VERSION >= 2020 if (attr == CU_DEVICE_ATTRIBUTE_COMPUTE_MODE) return py::object(CUcomputemode(dev.get_attribute(attr))); else #endif return py::object(dev.get_attribute(attr)); } device_allocation *mem_alloc_wrap(unsigned long bytes) { return new device_allocation(pycuda::mem_alloc_gc(bytes)); } class pointer_holder_base_wrap : public pointer_holder_base, public py::wrapper { public: CUdeviceptr get_pointer() { return this->get_override("get_pointer")(); } }; py::tuple mem_alloc_pitch_wrap( unsigned int width, unsigned int height, unsigned int access_size) { std::auto_ptr da; Py_ssize_t pitch = mem_alloc_pitch( da, width, height, access_size); return py::make_tuple( handle_from_new_ptr(da.release()), pitch); } // {{{ memory set void py_memset_d8(CUdeviceptr dst, unsigned char uc, unsigned int n ) { CUDAPP_CALL_GUARDED_THREADED(cuMemsetD8, (dst, uc, n )); } void py_memset_d16(CUdeviceptr dst, unsigned short us, unsigned int n ) { CUDAPP_CALL_GUARDED_THREADED(cuMemsetD16, (dst, us, n )); } void py_memset_d32(CUdeviceptr dst, unsigned int ui, unsigned int n ) { CUDAPP_CALL_GUARDED_THREADED(cuMemsetD32, (dst, ui, n )); } void py_memset_d2d8(CUdeviceptr dst, unsigned int dst_pitch, unsigned char uc, unsigned int width, unsigned int height ) { CUDAPP_CALL_GUARDED_THREADED(cuMemsetD2D8, (dst, dst_pitch, uc, width, height)); } void py_memset_d2d16(CUdeviceptr dst, unsigned int dst_pitch, unsigned short us, unsigned int width, unsigned int height ) { CUDAPP_CALL_GUARDED_THREADED(cuMemsetD2D16, (dst, dst_pitch, us, width, height)); } void py_memset_d2d32(CUdeviceptr dst, unsigned int dst_pitch, unsigned int ui, unsigned int width, unsigned int height ) { CUDAPP_CALL_GUARDED_THREADED(cuMemsetD2D32, (dst, dst_pitch, ui, width, height)); } // }}} // {{{ memory copies void py_memcpy_htod(CUdeviceptr dst, py::object src) { const void *buf; PYCUDA_BUFFER_SIZE_T len; if (PyObject_AsReadBuffer(src.ptr(), &buf, &len)) throw py::error_already_set(); CUDAPP_CALL_GUARDED_THREADED(cuMemcpyHtoD, (dst, buf, len)); } void 
py_memcpy_htod_async(CUdeviceptr dst, py::object src, py::object stream_py) { const void *buf; PYCUDA_BUFFER_SIZE_T len; if (PyObject_AsReadBuffer(src.ptr(), &buf, &len)) throw py::error_already_set(); PYCUDA_PARSE_STREAM_PY; CUDAPP_CALL_GUARDED_THREADED(cuMemcpyHtoDAsync, (dst, buf, len, s_handle)); } void py_memcpy_dtoh(py::object dest, CUdeviceptr src) { void *buf; PYCUDA_BUFFER_SIZE_T len; if (PyObject_AsWriteBuffer(dest.ptr(), &buf, &len)) throw py::error_already_set(); CUDAPP_CALL_GUARDED_THREADED(cuMemcpyDtoH, (buf, src, len)); } void py_memcpy_dtoh_async(py::object dest, CUdeviceptr src, py::object stream_py) { void *buf; PYCUDA_BUFFER_SIZE_T len; if (PyObject_AsWriteBuffer(dest.ptr(), &buf, &len)) throw py::error_already_set(); PYCUDA_PARSE_STREAM_PY; CUDAPP_CALL_GUARDED_THREADED(cuMemcpyDtoHAsync, (buf, src, len, s_handle)); } void py_memcpy_htoa(array const &ary, unsigned int index, py::object src) { const void *buf; PYCUDA_BUFFER_SIZE_T len; if (PyObject_AsReadBuffer(src.ptr(), &buf, &len)) throw py::error_already_set(); CUDAPP_CALL_GUARDED_THREADED(cuMemcpyHtoA, (ary.handle(), index, buf, len)); } void py_memcpy_atoh(py::object dest, array const &ary, unsigned int index) { void *buf; PYCUDA_BUFFER_SIZE_T len; if (PyObject_AsWriteBuffer(dest.ptr(), &buf, &len)) throw py::error_already_set(); CUDAPP_CALL_GUARDED_THREADED(cuMemcpyAtoH, (buf, ary.handle(), index, len)); } void py_memcpy_dtod(CUdeviceptr dest, CUdeviceptr src, unsigned int byte_count) { CUDAPP_CALL_GUARDED_THREADED(cuMemcpyDtoD, (dest, src, byte_count)); } #if CUDAPP_CUDA_VERSION >= 3000 void py_memcpy_dtod_async(CUdeviceptr dest, CUdeviceptr src, unsigned int byte_count, py::object stream_py) { PYCUDA_PARSE_STREAM_PY; CUDAPP_CALL_GUARDED_THREADED(cuMemcpyDtoDAsync, (dest, src, byte_count, s_handle)); } #endif #if CUDAPP_CUDA_VERSION >= 4000 void py_memcpy_peer(CUdeviceptr dest, CUdeviceptr src, unsigned int byte_count, py::object dest_context_py, py::object src_context_py ) { boost::shared_ptr dest_context = context::current_context(); boost::shared_ptr src_context = dest_context; if (dest_context_py.ptr() == Py_None) dest_context = py::extract >(dest_context_py); if (src_context_py.ptr() == Py_None) src_context = py::extract >(src_context_py); CUDAPP_CALL_GUARDED_THREADED(cuMemcpyPeer, ( dest, dest_context->handle(), src, src_context->handle(), byte_count)); } void py_memcpy_peer_async(CUdeviceptr dest, CUdeviceptr src, unsigned int byte_count, py::object dest_context_py, py::object src_context_py, py::object stream_py) { boost::shared_ptr dest_context = context::current_context(); boost::shared_ptr src_context = dest_context; if (dest_context_py.ptr() == Py_None) dest_context = py::extract >(dest_context_py); if (src_context_py.ptr() == Py_None) src_context = py::extract >(src_context_py); PYCUDA_PARSE_STREAM_PY CUDAPP_CALL_GUARDED_THREADED(cuMemcpyPeerAsync, ( dest, dest_context->handle(), src, src_context->handle(), byte_count, s_handle)); } #endif // }}} // }}} void function_param_setv(function &f, int offset, py::object buffer) { const void *buf; PYCUDA_BUFFER_SIZE_T len; if (PyObject_AsReadBuffer(buffer.ptr(), &buf, &len)) throw py::error_already_set(); f.param_setv(offset, const_cast(buf), len); } // {{{ module_from_buffer module *module_from_buffer(py::object buffer, py::object py_options, py::object message_handler) { const char *mod_buf; PYCUDA_BUFFER_SIZE_T len; if (PyObject_AsCharBuffer(buffer.ptr(), &mod_buf, &len)) throw py::error_already_set(); CUmodule mod; #if CUDAPP_CUDA_VERSION >= 2010 const 
size_t buf_size = 32768; char info_buf[buf_size], error_buf[buf_size]; std::vector options; std::vector option_values; #define ADD_OPTION_PTR(KEY, PTR) \ { \ options.push_back(KEY); \ option_values.push_back(PTR); \ } ADD_OPTION_PTR(CU_JIT_INFO_LOG_BUFFER, info_buf); ADD_OPTION_PTR(CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, (void *) buf_size); ADD_OPTION_PTR(CU_JIT_ERROR_LOG_BUFFER, error_buf); ADD_OPTION_PTR(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, (void *) buf_size); PYTHON_FOREACH(key_value, py_options) ADD_OPTION_PTR( py::extract(key_value[0]), (void *) py::extract(key_value[1])()); #undef ADD_OPTION CUDAPP_PRINT_CALL_TRACE("cuModuleLoadDataEx"); CUresult cu_status_code; \ cu_status_code = cuModuleLoadDataEx(&mod, mod_buf, (unsigned int) options.size(), const_cast(&*options.begin()), const_cast(&*option_values.begin())); size_t info_buf_size = size_t(option_values[1]); size_t error_buf_size = size_t(option_values[3]); if (message_handler != py::object()) message_handler(cu_status_code == CUDA_SUCCESS, std::string(info_buf, info_buf_size), std::string(error_buf, error_buf_size)); if (cu_status_code != CUDA_SUCCESS) throw pycuda::error("cuModuleLoadDataEx", cu_status_code, std::string(error_buf, error_buf_size).c_str()); #else if (py::len(py_options)) throw pycuda::error("module_from_buffer", CUDA_ERROR_INVALID_VALUE, "non-empty options argument only supported on CUDA 2.1 and newer"); CUDAPP_CALL_GUARDED(cuModuleLoadData, (&mod, mod_buf)); #endif return new module(mod); } // }}} template PyObject *mem_obj_to_long(T const &mo) { #if defined(_WIN32) && defined(_WIN64) return PyLong_FromUnsignedLongLong((CUdeviceptr) mo); #else return PyLong_FromUnsignedLong((CUdeviceptr) mo); #endif } // {{{ special host memory <-> numpy template py::handle<> numpy_empty(py::object shape, py::object dtype, py::object order_py, unsigned par1) { PyArray_Descr *tp_descr; if (PyArray_DescrConverter(dtype.ptr(), &tp_descr) != NPY_SUCCEED) throw py::error_already_set(); py::extract shape_as_int(shape); std::vector dims; if (shape_as_int.check()) dims.push_back(shape_as_int()); else std::copy( py::stl_input_iterator(shape), py::stl_input_iterator(), back_inserter(dims)); std::auto_ptr alloc( new Allocation( tp_descr->elsize*pycuda::size_from_dims(dims.size(), &dims.front()), par1) ); NPY_ORDER order = PyArray_CORDER; PyArray_OrderConverter(order_py.ptr(), &order); int ary_flags = 0; if (order == PyArray_FORTRANORDER) ary_flags |= NPY_FARRAY; else if (order == PyArray_CORDER) ary_flags |= NPY_CARRAY; else throw pycuda::error("numpy_empty", CUDA_ERROR_INVALID_VALUE, "unrecognized order specifier"); py::handle<> result = py::handle<>(PyArray_NewFromDescr( &PyArray_Type, tp_descr, int(dims.size()), &dims.front(), /*strides*/ NULL, alloc->data(), ary_flags, /*obj*/NULL)); py::handle<> alloc_py(handle_from_new_ptr(alloc.release())); PyArray_BASE(result.get()) = alloc_py.get(); Py_INCREF(alloc_py.get()); return result; } #if CUDAPP_CUDA_VERSION >= 4000 py::handle<> register_host_memory(py::object ary, unsigned flags) { if (!PyArray_Check(ary.ptr())) throw pycuda::error("register_host_memory", CUDA_ERROR_INVALID_VALUE, "ary argument is not a numpy array"); if (!PyArray_ISCONTIGUOUS(ary.ptr())) throw pycuda::error("register_host_memory", CUDA_ERROR_INVALID_VALUE, "ary argument is not contiguous"); std::auto_ptr regmem( new registered_host_memory( PyArray_DATA(ary.ptr()), PyArray_SIZE(ary.ptr()), flags, ary)); PyObject *new_array_ptr = PyArray_FromInterface(ary.ptr()); if (new_array_ptr == Py_NotImplemented) throw 
pycuda::error("register_host_memory", CUDA_ERROR_INVALID_VALUE, "ary argument does not expose array interface"); py::handle<> result(new_array_ptr); py::handle<> regmem_py(handle_from_new_ptr(regmem.release())); PyArray_BASE(result.get()) = regmem_py.get(); Py_INCREF(regmem_py.get()); return result; } #endif // }}} // }}} bool have_gl_ext() { #ifdef HAVE_GL return true; #else return false; #endif } } void pycuda_expose_tools(); void pycuda_expose_gl(); void pycuda_expose_curand(); BOOST_PYTHON_MODULE(_driver) { py::def("get_version", cuda_version); #if CUDAPP_CUDA_VERSION >= 2020 py::def("get_driver_version", pycuda::get_driver_version); #endif // {{{ exceptions #define DECLARE_EXC(NAME, BASE) \ Cuda##NAME = py::handle<>(PyErr_NewException("pycuda._driver." #NAME, BASE, NULL)); \ py::scope().attr(#NAME) = Cuda##NAME; { DECLARE_EXC(Error, NULL); DECLARE_EXC(MemoryError, CudaError.get()); DECLARE_EXC(LogicError, CudaError.get()); DECLARE_EXC(LaunchError, CudaError.get()); DECLARE_EXC(RuntimeError, CudaError.get()); py::register_exception_translator(translate_cuda_error); } // }}} // {{{ constants #if CUDAPP_CUDA_VERSION >= 4010 py::enum_("ipc_mem_flags") .value("LAZY_ENABLE_PEER_ACCESS", CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS) ; #endif #if CUDAPP_CUDA_VERSION >= 2000 py::enum_("ctx_flags") .value("SCHED_AUTO", CU_CTX_SCHED_AUTO) .value("SCHED_SPIN", CU_CTX_SCHED_SPIN) .value("SCHED_YIELD", CU_CTX_SCHED_YIELD) .value("SCHED_MASK", CU_CTX_SCHED_MASK) #if CUDAPP_CUDA_VERSION >= 2020 && CUDAPP_CUDA_VERSION < 4000 .value("BLOCKING_SYNC", CU_CTX_BLOCKING_SYNC) .value("SCHED_BLOCKING_SYNC", CU_CTX_BLOCKING_SYNC) #endif #if CUDAPP_CUDA_VERSION >= 4000 .value("BLOCKING_SYNC", CU_CTX_SCHED_BLOCKING_SYNC) .value("SCHED_BLOCKING_SYNC", CU_CTX_SCHED_BLOCKING_SYNC) #endif #if CUDAPP_CUDA_VERSION >= 2020 .value("MAP_HOST", CU_CTX_MAP_HOST) #endif #if CUDAPP_CUDA_VERSION >= 3020 .value("LMEM_RESIZE_TO_MAX", CU_CTX_LMEM_RESIZE_TO_MAX) #endif .value("FLAGS_MASK", CU_CTX_FLAGS_MASK) ; #endif #if CUDAPP_CUDA_VERSION >= 2020 py::enum_("event_flags") .value("DEFAULT", CU_EVENT_DEFAULT) .value("BLOCKING_SYNC", CU_EVENT_BLOCKING_SYNC) #if CUDAPP_CUDA_VERSION >= 3020 .value("DISABLE_TIMING", CU_EVENT_DISABLE_TIMING) #endif #if CUDAPP_CUDA_VERSION >= 4010 .value("INTERPROCESS", CU_EVENT_INTERPROCESS) #endif ; #endif py::enum_("array_format") .value("UNSIGNED_INT8", CU_AD_FORMAT_UNSIGNED_INT8) .value("UNSIGNED_INT16", CU_AD_FORMAT_UNSIGNED_INT16) .value("UNSIGNED_INT32", CU_AD_FORMAT_UNSIGNED_INT32) .value("SIGNED_INT8" , CU_AD_FORMAT_SIGNED_INT8) .value("SIGNED_INT16" , CU_AD_FORMAT_SIGNED_INT16) .value("SIGNED_INT32" , CU_AD_FORMAT_SIGNED_INT32) .value("HALF" , CU_AD_FORMAT_HALF) .value("FLOAT" , CU_AD_FORMAT_FLOAT) ; #if CUDAPP_CUDA_VERSION >= 3000 { py::class_ cls("array3d_flags", py::no_init); // deprecated cls.attr("ARRAY3D_2DARRAY") = CUDA_ARRAY3D_2DARRAY; #if CUDAPP_CUDA_VERSION >= 4000 cls.attr("ARRAY3D_LAYERED") = CUDA_ARRAY3D_LAYERED; #endif cls.attr("2DARRAY") = CUDA_ARRAY3D_2DARRAY; #if CUDAPP_CUDA_VERSION >= 3010 cls.attr("SURFACE_LDST") = CUDA_ARRAY3D_SURFACE_LDST; #endif #if CUDAPP_CUDA_VERSION >= 4010 cls.attr("CUBEMAP") = CUDA_ARRAY3D_CUBEMAP; cls.attr("TEXTURE_GATHER") = CUDA_ARRAY3D_TEXTURE_GATHER; #endif } #endif py::enum_("address_mode") .value("WRAP", CU_TR_ADDRESS_MODE_WRAP) .value("CLAMP", CU_TR_ADDRESS_MODE_CLAMP) .value("MIRROR", CU_TR_ADDRESS_MODE_MIRROR) #if CUDAPP_CUDA_VERSION >= 3020 .value("BORDER", CU_TR_ADDRESS_MODE_BORDER) #endif ; py::enum_("filter_mode") .value("POINT", 
CU_TR_FILTER_MODE_POINT) .value("LINEAR", CU_TR_FILTER_MODE_LINEAR) ; py::enum_("device_attribute") .value("MAX_THREADS_PER_BLOCK", CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK) .value("MAX_BLOCK_DIM_X", CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X) .value("MAX_BLOCK_DIM_Y", CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y) .value("MAX_BLOCK_DIM_Z", CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z) .value("MAX_GRID_DIM_X", CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X) .value("MAX_GRID_DIM_Y", CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y) .value("MAX_GRID_DIM_Z", CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z) #if CUDAPP_CUDA_VERSION >= 2000 .value("MAX_SHARED_MEMORY_PER_BLOCK", CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK) #endif .value("SHARED_MEMORY_PER_BLOCK", CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK) .value("TOTAL_CONSTANT_MEMORY", CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY) .value("WARP_SIZE", CU_DEVICE_ATTRIBUTE_WARP_SIZE) .value("MAX_PITCH", CU_DEVICE_ATTRIBUTE_MAX_PITCH) #if CUDAPP_CUDA_VERSION >= 2000 .value("MAX_REGISTERS_PER_BLOCK", CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK) #endif .value("REGISTERS_PER_BLOCK", CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK) .value("CLOCK_RATE", CU_DEVICE_ATTRIBUTE_CLOCK_RATE) .value("TEXTURE_ALIGNMENT", CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT) .value("GPU_OVERLAP", CU_DEVICE_ATTRIBUTE_GPU_OVERLAP) #if CUDAPP_CUDA_VERSION >= 2000 .value("MULTIPROCESSOR_COUNT", CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) #endif #if CUDAPP_CUDA_VERSION >= 2020 .value("KERNEL_EXEC_TIMEOUT", CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT) .value("INTEGRATED", CU_DEVICE_ATTRIBUTE_INTEGRATED) .value("CAN_MAP_HOST_MEMORY", CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY) .value("COMPUTE_MODE", CU_DEVICE_ATTRIBUTE_COMPUTE_MODE) #endif #if CUDAPP_CUDA_VERSION >= 3000 .value("MAXIMUM_TEXTURE1D_WIDTH", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH) .value("MAXIMUM_TEXTURE2D_WIDTH", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH) .value("MAXIMUM_TEXTURE2D_HEIGHT", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT) .value("MAXIMUM_TEXTURE3D_WIDTH", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH) .value("MAXIMUM_TEXTURE3D_HEIGHT", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT) .value("MAXIMUM_TEXTURE3D_DEPTH", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH) .value("MAXIMUM_TEXTURE2D_ARRAY_WIDTH", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH) .value("MAXIMUM_TEXTURE2D_ARRAY_HEIGHT", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT) .value("MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES) #ifdef CUDAPP_POST_30_BETA .value("SURFACE_ALIGNMENT", CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT) .value("CONCURRENT_KERNELS", CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS) .value("ECC_ENABLED", CU_DEVICE_ATTRIBUTE_ECC_ENABLED) #endif #endif #if CUDAPP_CUDA_VERSION >= 4000 .value("MAXIMUM_TEXTURE2D_LAYERED_WIDTH", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH) .value("MAXIMUM_TEXTURE2D_LAYERED_HEIGHT", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT) .value("MAXIMUM_TEXTURE2D_LAYERED_LAYERS", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS) .value("MAXIMUM_TEXTURE1D_LAYERED_WIDTH", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH) .value("MAXIMUM_TEXTURE1D_LAYERED_LAYERS", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS) #endif #if CUDAPP_CUDA_VERSION >= 3020 .value("PCI_BUS_ID", CU_DEVICE_ATTRIBUTE_PCI_BUS_ID) .value("PCI_DEVICE_ID", CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID) .value("TCC_DRIVER", CU_DEVICE_ATTRIBUTE_TCC_DRIVER) #endif #if CUDAPP_CUDA_VERSION >= 4000 .value("MEMORY_CLOCK_RATE", CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE) 
.value("GLOBAL_MEMORY_BUS_WIDTH", CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH) .value("L2_CACHE_SIZE", CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE) .value("MAX_THREADS_PER_MULTIPROCESSOR", CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR) .value("ASYNC_ENGINE_COUNT", CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT) .value("UNIFIED_ADDRESSING", CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) #endif #if CUDAPP_CUDA_VERSION >= 4010 .value("MAXIMUM_TEXTURE2D_GATHER_WIDTH", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH) .value("MAXIMUM_TEXTURE2D_GATHER_HEIGHT", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT) .value("MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE) .value("MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE) .value("MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE) .value("PCI_DOMAIN_ID", CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID) .value("TEXTURE_PITCH_ALIGNMENT", CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT) .value("MAXIMUM_TEXTURECUBEMAP_WIDTH", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH) .value("MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH) .value("MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS) .value("MAXIMUM_SURFACE1D_WIDTH", CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH) .value("MAXIMUM_SURFACE2D_WIDTH", CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH) .value("MAXIMUM_SURFACE2D_HEIGHT", CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT) .value("MAXIMUM_SURFACE3D_WIDTH", CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH) .value("MAXIMUM_SURFACE3D_HEIGHT", CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT) .value("MAXIMUM_SURFACE3D_DEPTH", CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH) .value("MAXIMUM_SURFACE1D_LAYERED_WIDTH", CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH) .value("MAXIMUM_SURFACE1D_LAYERED_LAYERS", CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS) .value("MAXIMUM_SURFACE2D_LAYERED_WIDTH", CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH) .value("MAXIMUM_SURFACE2D_LAYERED_HEIGHT", CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT) .value("MAXIMUM_SURFACE2D_LAYERED_LAYERS", CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS) .value("MAXIMUM_SURFACECUBEMAP_WIDTH", CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH) .value("MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH", CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH) .value("MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS", CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS) .value("MAXIMUM_TEXTURE1D_LINEAR_WIDTH", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH) .value("MAXIMUM_TEXTURE2D_LINEAR_WIDTH", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH) .value("MAXIMUM_TEXTURE2D_LINEAR_HEIGHT", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT) .value("MAXIMUM_TEXTURE2D_LINEAR_PITCH", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH) #endif #if CUDAPP_CUDA_VERSION >= 5000 .value("MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH) .value("MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT) .value("COMPUTE_CAPABILITY_MAJOR", CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) .value("COMPUTE_CAPABILITY_MINOR", CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR) .value("MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH", CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH) #endif #if CUDAPP_CUDA_VERSION >= 5050 .value("STREAM_PRIORITIES_SUPPORTED", 
CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED) #endif #if CUDAPP_CUDA_VERSION >= 6000 .value("GLOBAL_L1_CACHE_SUPPORTED", CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED) .value("LOCAL_L1_CACHE_SUPPORTED", CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED) .value("MAX_SHARED_MEMORY_PER_MULTIPROCESSOR", CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR) .value("MAX_REGISTERS_PER_MULTIPROCESSOR", CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR) .value("MANAGED_MEMORY", CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY) .value("MULTI_GPU_BOARD", CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD) .value("MULTI_GPU_BOARD_GROUP_ID", CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID) #endif ; #if CUDAPP_CUDA_VERSION >= 4000 py::enum_("pointer_attribute") .value("CONTEXT", CU_POINTER_ATTRIBUTE_CONTEXT) .value("MEMORY_TYPE", CU_POINTER_ATTRIBUTE_MEMORY_TYPE) .value("DEVICE_POINTER", CU_POINTER_ATTRIBUTE_DEVICE_POINTER) .value("HOST_POINTER", CU_POINTER_ATTRIBUTE_HOST_POINTER) ; #endif #if CUDAPP_CUDA_VERSION >= 4000 py::enum_("profiler_output_mode") .value("KEY_VALUE_PAIR", CU_OUT_KEY_VALUE_PAIR) .value("CSV", CU_OUT_CSV) ; #endif #if CUDAPP_CUDA_VERSION >= 3000 && defined(CUDAPP_POST_30_BETA) py::enum_("func_cache") .value("PREFER_NONE", CU_FUNC_CACHE_PREFER_NONE) .value("PREFER_SHARED", CU_FUNC_CACHE_PREFER_SHARED) .value("PREFER_L1", CU_FUNC_CACHE_PREFER_L1) #if CUDAPP_CUDA_VERSION >= 4010 .value("PREFER_EQUAL", CU_FUNC_CACHE_PREFER_EQUAL) #endif ; #endif #if CUDAPP_CUDA_VERSION >= 4020 py::enum_("shared_config") .value("DEFAULT_BANK_SIZE", CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE) .value("FOUR_BYTE_BANK_SIZE", CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE) .value("EIGHT_BYTE_BANK_SIZE", CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE) ; #endif #if CUDAPP_CUDA_VERSION >= 2020 py::enum_("function_attribute") .value("MAX_THREADS_PER_BLOCK", CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK) .value("SHARED_SIZE_BYTES", CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES) .value("CONST_SIZE_BYTES", CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES) .value("LOCAL_SIZE_BYTES", CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES) .value("NUM_REGS", CU_FUNC_ATTRIBUTE_NUM_REGS) #if CUDAPP_CUDA_VERSION >= 3000 && defined(CUDAPP_POST_30_BETA) .value("PTX_VERSION", CU_FUNC_ATTRIBUTE_PTX_VERSION) .value("BINARY_VERSION", CU_FUNC_ATTRIBUTE_BINARY_VERSION) #endif .value("MAX", CU_FUNC_ATTRIBUTE_MAX) ; #endif py::enum_("memory_type") .value("HOST", CU_MEMORYTYPE_HOST) .value("DEVICE", CU_MEMORYTYPE_DEVICE) .value("ARRAY", CU_MEMORYTYPE_ARRAY) #if CUDAPP_CUDA_VERSION >= 4000 .value("UNIFIED", CU_MEMORYTYPE_UNIFIED) #endif ; #if CUDAPP_CUDA_VERSION >= 2020 py::enum_("compute_mode") .value("DEFAULT", CU_COMPUTEMODE_DEFAULT) .value("EXCLUSIVE", CU_COMPUTEMODE_EXCLUSIVE) .value("PROHIBITED", CU_COMPUTEMODE_PROHIBITED) #if CUDAPP_CUDA_VERSION >= 4000 .value("EXCLUSIVE_PROCESS", CU_COMPUTEMODE_EXCLUSIVE_PROCESS) #endif ; #endif #if CUDAPP_CUDA_VERSION >= 2010 py::enum_("jit_option") .value("MAX_REGISTERS", CU_JIT_MAX_REGISTERS) .value("THREADS_PER_BLOCK", CU_JIT_THREADS_PER_BLOCK) .value("WALL_TIME", CU_JIT_WALL_TIME) .value("INFO_LOG_BUFFER", CU_JIT_INFO_LOG_BUFFER) .value("INFO_LOG_BUFFER_SIZE_BYTES", CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES) .value("ERROR_LOG_BUFFER", CU_JIT_ERROR_LOG_BUFFER) .value("ERROR_LOG_BUFFER_SIZE_BYTES", CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES) .value("OPTIMIZATION_LEVEL", CU_JIT_OPTIMIZATION_LEVEL) .value("TARGET_FROM_CUCONTEXT", CU_JIT_TARGET_FROM_CUCONTEXT) .value("TARGET", CU_JIT_TARGET) .value("FALLBACK_STRATEGY", CU_JIT_FALLBACK_STRATEGY) ; py::enum_("jit_target") .value("COMPUTE_10", 
CU_TARGET_COMPUTE_10) .value("COMPUTE_11", CU_TARGET_COMPUTE_11) .value("COMPUTE_12", CU_TARGET_COMPUTE_12) .value("COMPUTE_13", CU_TARGET_COMPUTE_13) #if CUDAPP_CUDA_VERSION >= 3000 .value("COMPUTE_20", CU_TARGET_COMPUTE_20) #endif #if CUDAPP_CUDA_VERSION >= 3020 .value("COMPUTE_21", CU_TARGET_COMPUTE_21) #endif ; py::enum_("jit_fallback") .value("PREFER_PTX", CU_PREFER_PTX) .value("PREFER_BINARY", CU_PREFER_BINARY) ; #endif #if CUDAPP_CUDA_VERSION >= 2020 { py::class_ cls("host_alloc_flags", py::no_init); cls.attr("PORTABLE") = CU_MEMHOSTALLOC_PORTABLE; cls.attr("DEVICEMAP") = CU_MEMHOSTALLOC_DEVICEMAP; cls.attr("WRITECOMBINED") = CU_MEMHOSTALLOC_WRITECOMBINED; } #endif #if CUDAPP_CUDA_VERSION >= 4000 { py::class_ cls("mem_host_register_flags", py::no_init); cls.attr("PORTABLE") = CU_MEMHOSTREGISTER_PORTABLE; cls.attr("DEVICEMAP") = CU_MEMHOSTREGISTER_DEVICEMAP; } #endif #if CUDAPP_CUDA_VERSION >= 3010 py::enum_("limit") .value("STACK_SIZE", CU_LIMIT_STACK_SIZE) .value("PRINTF_FIFO_SIZE", CU_LIMIT_PRINTF_FIFO_SIZE) #if CUDAPP_CUDA_VERSION >= 3020 .value("MALLOC_HEAP_SIZE", CU_LIMIT_MALLOC_HEAP_SIZE) #endif ; #endif #if CUDAPP_CUDA_VERSION >= 6000 py::enum_("mem_attach_flags") .value("GLOBAL", CU_MEM_ATTACH_GLOBAL) .value("HOST", CU_MEM_ATTACH_HOST) .value("SINGLE", CU_MEM_ATTACH_SINGLE) ; #endif // graphics enums ----------------------------------------------------------- #if CUDAPP_CUDA_VERSION >= 3000 py::enum_("graphics_register_flags") .value("NONE", CU_GRAPHICS_REGISTER_FLAGS_NONE) #if CUDAPP_CUDA_VERSION >= 4000 .value("READ_ONLY", CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY) .value("WRITE_DISCARD", CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD) .value("SURFACE_LDST", CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST) #endif #if CUDAPP_CUDA_VERSION >= 4010 .value("TEXTURE_GATHER", CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER) #endif ; py::enum_("array_cubemap_face") .value("POSITIVE_X", CU_CUBEMAP_FACE_POSITIVE_X) .value("NEGATIVE_X", CU_CUBEMAP_FACE_NEGATIVE_X) .value("POSITIVE_Y", CU_CUBEMAP_FACE_POSITIVE_Y) .value("NEGATIVE_Y", CU_CUBEMAP_FACE_NEGATIVE_Y) .value("POSITIVE_Z", CU_CUBEMAP_FACE_POSITIVE_Z) .value("NEGATIVE_Z", CU_CUBEMAP_FACE_NEGATIVE_Z) ; #endif // }}} py::def("init", init, py::arg("flags")=0); // {{{ device { typedef device cl; py::class_("Device", py::no_init) .def("__init__", py::make_constructor(make_device)) #if CUDAPP_CUDA_VERSION >= 4010 .def("__init__", py::make_constructor(make_device_from_pci_bus_id)) #endif .DEF_SIMPLE_METHOD(count) .staticmethod("count") .DEF_SIMPLE_METHOD(name) #if CUDAPP_CUDA_VERSION >= 4010 .DEF_SIMPLE_METHOD(pci_bus_id) #endif .DEF_SIMPLE_METHOD(compute_capability) .DEF_SIMPLE_METHOD(total_memory) .def("get_attribute", device_get_attribute) .def(py::self == py::self) .def(py::self != py::self) .def("__hash__", &cl::hash) .def("make_context", &cl::make_context, (py::args("self"), py::args("flags")=0)) #if CUDAPP_CUDA_VERSION >= 4000 .DEF_SIMPLE_METHOD(can_access_peer) #endif ; } // }}} // {{{ context { typedef context cl; py::class_, boost::noncopyable >("Context", py::no_init) .def(py::self == py::self) .def(py::self != py::self) .def("__hash__", &cl::hash) .def("attach", &cl::attach, (py::arg("flags")=0)) .staticmethod("attach") .DEF_SIMPLE_METHOD(detach) #if CUDAPP_CUDA_VERSION >= 2000 .def("push", context_push) .DEF_SIMPLE_METHOD(pop) .staticmethod("pop") .DEF_SIMPLE_METHOD(get_device) .staticmethod("get_device") #endif .DEF_SIMPLE_METHOD(synchronize) .staticmethod("synchronize") .def("get_current", (boost::shared_ptr (*)()) &cl::current_context) 
.staticmethod("get_current") #if CUDAPP_CUDA_VERSION >= 3010 .DEF_SIMPLE_METHOD(set_limit) .staticmethod("set_limit") .DEF_SIMPLE_METHOD(get_limit) .staticmethod("get_limit") #endif #if CUDAPP_CUDA_VERSION >= 3020 .DEF_SIMPLE_METHOD(get_cache_config) .staticmethod("get_cache_config") .DEF_SIMPLE_METHOD(set_cache_config) .staticmethod("set_cache_config") .DEF_SIMPLE_METHOD(get_api_version) #endif #if CUDAPP_CUDA_VERSION >= 4000 .def("enable_peer_access", &cl::enable_peer_access, (py::arg("peer"), py::arg("flags")=0)) .staticmethod("enable_peer_access") .DEF_SIMPLE_METHOD(disable_peer_access) .staticmethod("disable_peer_access") #endif #if CUDAPP_CUDA_VERSION >= 4020 .DEF_SIMPLE_METHOD(get_shared_config) .staticmethod("get_shared_config") .DEF_SIMPLE_METHOD(set_shared_config) .staticmethod("set_shared_config") #endif ; } // }}} // {{{ stream { typedef stream cl; py::class_ > ("Stream", py::init(py::arg("flags")=0)) .DEF_SIMPLE_METHOD(synchronize) .DEF_SIMPLE_METHOD(is_done) #if CUDAPP_CUDA_VERSION >= 3020 .DEF_SIMPLE_METHOD(wait_for_event) #endif .add_property("handle", &cl::handle_int) ; } // }}} // {{{ module { typedef module cl; py::class_ >("Module", py::no_init) .def("get_function", &cl::get_function, (py::args("self", "name")), py::with_custodian_and_ward_postcall<0, 1>()) .def("get_global", &cl::get_global, (py::args("self", "name"))) .def("get_texref", module_get_texref, (py::args("self", "name")), py::return_value_policy()) #if CUDAPP_CUDA_VERSION >= 3010 .def("get_surfref", module_get_surfref, (py::args("self", "name")), py::return_value_policy()) #endif ; } py::def("module_from_file", module_from_file, (py::arg("filename")), py::return_value_policy()); py::def("module_from_buffer", module_from_buffer, (py::arg("buffer"), py::arg("options")=py::list(), py::arg("message_handler")=py::object()), py::return_value_policy()); // }}} // {{{ function { typedef function cl; py::class_("Function", py::no_init) .def("_set_block_shape", &cl::set_block_shape) .def("_set_shared_size", &cl::set_shared_size) .def("_param_set_size", &cl::param_set_size) .def("_param_seti", (void (cl::*)(int, unsigned int)) &cl::param_set) .def("_param_setf", (void (cl::*)(int, float )) &cl::param_set) .def("_param_setv", function_param_setv) .DEF_SIMPLE_METHOD(param_set_texref) .def("_launch", &cl::launch) .def("_launch_grid", &cl::launch_grid, py::args("grid_width", "grid_height")) .def("_launch_grid_async", &cl::launch_grid_async, py::args("grid_width", "grid_height", "s")) #if CUDAPP_CUDA_VERSION >= 2020 .DEF_SIMPLE_METHOD(get_attribute) #endif #if CUDAPP_CUDA_VERSION >= 3000 && defined(CUDAPP_POST_30_BETA) .DEF_SIMPLE_METHOD(set_cache_config) #endif #if CUDAPP_CUDA_VERSION >= 4000 .def("_launch_kernel", &cl::launch_kernel) #endif ; } // }}} // {{{ pointer holder { typedef pointer_holder_base cl; py::class_( "PointerHolderBase") .def("get_pointer", py::pure_virtual(&cl::get_pointer)) .def("as_buffer", &cl::as_buffer, (py::arg("size"), py::arg("offset")=0)) ; py::implicitly_convertible(); } { typedef device_allocation cl; py::class_("DeviceAllocation", py::no_init) .def("__int__", &cl::operator CUdeviceptr) .def("__long__", mem_obj_to_long) .def("__index__", mem_obj_to_long) .def("as_buffer", &cl::as_buffer, (py::arg("size"), py::arg("offset")=0)) .DEF_SIMPLE_METHOD(free) ; py::implicitly_convertible(); } #if CUDAPP_CUDA_VERSION >= 4010 && PY_VERSION_HEX >= 0x02060000 { typedef ipc_mem_handle cl; py::class_("IPCMemoryHandle", py::init >()) .def("__int__", &cl::operator CUdeviceptr) .def("__long__", 
mem_obj_to_long) .DEF_SIMPLE_METHOD(close) ; py::implicitly_convertible(); } DEF_SIMPLE_FUNCTION(mem_get_ipc_handle); #endif // }}} // {{{ host memory { typedef host_pointer cl; py::class_("HostPointer", py::no_init) #if CUDAPP_CUDA_VERSION >= 2020 .DEF_SIMPLE_METHOD(get_device_pointer) #endif ; } { typedef pagelocked_host_allocation cl; py::class_ > wrp( "PagelockedHostAllocation", py::no_init); wrp .DEF_SIMPLE_METHOD(free) #if CUDAPP_CUDA_VERSION >= 3020 .DEF_SIMPLE_METHOD(get_flags) #endif ; py::scope().attr("HostAllocation") = wrp; } { typedef aligned_host_allocation cl; py::class_ > wrp( "AlignedHostAllocation", py::no_init); wrp .DEF_SIMPLE_METHOD(free) ; } #if CUDAPP_CUDA_VERSION >= 6000 { typedef managed_allocation cl; py::class_ > wrp( "ManagedAllocation", py::no_init); wrp .DEF_SIMPLE_METHOD(get_device_pointer) .def("attach", &cl::attach, (py::arg("mem_flags"), py::arg("stream")=py::object())) ; } #endif #if CUDAPP_CUDA_VERSION >= 4000 { typedef registered_host_memory cl; py::class_ >( "RegisteredHostMemory", py::no_init) .def("unregister", &cl::free) ; } #endif py::def("pagelocked_empty", numpy_empty, (py::arg("shape"), py::arg("dtype"), py::arg("order")="C", py::arg("mem_flags")=0)); py::def("aligned_empty", numpy_empty, (py::arg("shape"), py::arg("dtype"), py::arg("order")="C", py::arg("alignment")=4096)); #if CUDAPP_CUDA_VERSION >= 6000 py::def("managed_empty", numpy_empty, (py::arg("shape"), py::arg("dtype"), py::arg("order")="C", py::arg("mem_flags")=0)); #endif #if CUDAPP_CUDA_VERSION >= 4000 py::def("register_host_memory", register_host_memory, (py::arg("ary"), py::arg("flags")=0)); #endif // }}} DEF_SIMPLE_FUNCTION(mem_get_info); py::def("mem_alloc", mem_alloc_wrap, py::return_value_policy()); py::def("mem_alloc_pitch", mem_alloc_pitch_wrap, py::args("width", "height", "access_size")); DEF_SIMPLE_FUNCTION(mem_get_address_range); // {{{ memset/memcpy py::def("memset_d8", py_memset_d8, py::args("dest", "data", "size")); py::def("memset_d16", py_memset_d16, py::args("dest", "data", "size")); py::def("memset_d32", py_memset_d32, py::args("dest", "data", "size")); py::def("memset_d2d8", py_memset_d2d8, py::args("dest", "pitch", "data", "width", "height")); py::def("memset_d2d16", py_memset_d2d16, py::args("dest", "pitch", "data", "width", "height")); py::def("memset_d2d32", py_memset_d2d32, py::args("dest", "pitch", "data", "width", "height")); py::def("memcpy_htod", py_memcpy_htod, (py::args("dest"), py::arg("src"))); py::def("memcpy_htod_async", py_memcpy_htod_async, (py::args("dest"), py::arg("src"), py::arg("stream")=py::object())); py::def("memcpy_dtoh", py_memcpy_dtoh, (py::args("dest"), py::arg("src"))); py::def("memcpy_dtoh_async", py_memcpy_dtoh_async, (py::args("dest"), py::arg("src"), py::arg("stream")=py::object())); py::def("memcpy_dtod", py_memcpy_dtod, py::args("dest", "src", "size")); #if CUDAPP_CUDA_VERSION >= 3000 py::def("memcpy_dtod_async", py_memcpy_dtod_async, (py::args("dest", "src", "size"), py::arg("stream")=py::object())); #endif #if CUDAPP_CUDA_VERSION >= 4000 py::def("memcpy_peer", py_memcpy_peer, (py::args("dest", "src", "size"), py::arg("dest_context")=py::object(), py::arg("src_context")=py::object())); /* py::def("memcpy_peer_async", py_memcpy_peer_async, (py::args("dest", "src", "size"), py::arg("dest_context")=py::object(), py::arg("src_context")=py::object(), py::arg("stream")=py::object())); */ #endif DEF_SIMPLE_FUNCTION_WITH_ARGS(memcpy_dtoa, ("ary", "index", "src", "len")); DEF_SIMPLE_FUNCTION_WITH_ARGS(memcpy_atod, ("dest", "ary", 
"index", "len")); DEF_SIMPLE_FUNCTION_WITH_ARGS(py_memcpy_htoa, ("ary", "index", "src")); DEF_SIMPLE_FUNCTION_WITH_ARGS(py_memcpy_atoh, ("dest", "ary", "index")); DEF_SIMPLE_FUNCTION_WITH_ARGS(memcpy_atoa, ("dest", "dest_index", "src", "src_index", "len")); #if CUDAPP_CUDA_VERSION >= 4000 #define WRAP_MEMCPY_2D_UNIFIED_SETTERS \ .DEF_SIMPLE_METHOD(set_src_unified) \ .DEF_SIMPLE_METHOD(set_dst_unified) #else #define WRAP_MEMCPY_2D_UNIFIED_SETTERS /* empty */ #endif #define WRAP_MEMCPY_2D_PROPERTIES \ .def_readwrite("src_x_in_bytes", &cl::srcXInBytes) \ .def_readwrite("src_y", &cl::srcY) \ .def_readwrite("src_memory_type", &cl::srcMemoryType) \ .def_readwrite("src_device", &cl::srcDevice) \ .def_readwrite("src_pitch", &cl::srcPitch) \ \ .DEF_SIMPLE_METHOD(set_src_host) \ .DEF_SIMPLE_METHOD(set_src_array) \ .DEF_SIMPLE_METHOD(set_src_device) \ \ .def_readwrite("dst_x_in_bytes", &cl::dstXInBytes) \ .def_readwrite("dst_y", &cl::dstY) \ .def_readwrite("dst_memory_type", &cl::dstMemoryType) \ .def_readwrite("dst_device", &cl::dstDevice) \ .def_readwrite("dst_pitch", &cl::dstPitch) \ \ .DEF_SIMPLE_METHOD(set_dst_host) \ .DEF_SIMPLE_METHOD(set_dst_array) \ .DEF_SIMPLE_METHOD(set_dst_device) \ \ .def_readwrite("width_in_bytes", &cl::WidthInBytes) \ .def_readwrite("height", &cl::Height) \ \ WRAP_MEMCPY_2D_UNIFIED_SETTERS { typedef memcpy_2d cl; py::class_("Memcpy2D") WRAP_MEMCPY_2D_PROPERTIES .def("__call__", &cl::execute, py::args("self", "aligned")) .def("__call__", &cl::execute_async) ; } #if CUDAPP_CUDA_VERSION >= 2000 #define WRAP_MEMCPY_3D_PROPERTIES \ WRAP_MEMCPY_2D_PROPERTIES \ .def_readwrite("src_z", &cl::srcZ) \ .def_readwrite("src_lod", &cl::srcLOD) \ .def_readwrite("src_height", &cl::srcHeight) \ \ .def_readwrite("dst_z", &cl::dstZ) \ .def_readwrite("dst_lod", &cl::dstLOD) \ .def_readwrite("dst_height", &cl::dstHeight) \ \ .def_readwrite("depth", &cl::Depth) \ { typedef memcpy_3d cl; py::class_("Memcpy3D") WRAP_MEMCPY_3D_PROPERTIES .def("__call__", &cl::execute) .def("__call__", &cl::execute_async) ; } #endif #if CUDAPP_CUDA_VERSION >= 4000 { typedef memcpy_3d_peer cl; py::class_("Memcpy3DPeer") WRAP_MEMCPY_3D_PROPERTIES .DEF_SIMPLE_METHOD(set_src_context) .DEF_SIMPLE_METHOD(set_dst_context) .def("__call__", &cl::execute) .def("__call__", &cl::execute_async) ; } #endif // }}} // {{{ event { typedef event cl; py::class_ ("Event", py::init >(py::arg("flags"))) .def("record", &cl::record, py::arg("stream")=py::object(), py::return_self<>()) .def("synchronize", &cl::synchronize, py::return_self<>()) .DEF_SIMPLE_METHOD(query) .DEF_SIMPLE_METHOD(time_since) .DEF_SIMPLE_METHOD(time_till) #if CUDAPP_CUDA_VERSION >= 4010 && PY_VERSION_HEX >= 0x02060000 .DEF_SIMPLE_METHOD(ipc_handle) .def("from_ipc_handle", event_from_ipc_handle, py::return_value_policy()) .staticmethod("from_ipc_handle") #endif ; } // }}} // {{{ arrays { typedef CUDA_ARRAY_DESCRIPTOR cl; py::class_("ArrayDescriptor") .def_readwrite("width", &cl::Width) .def_readwrite("height", &cl::Height) .def_readwrite("format", &cl::Format) .def_readwrite("num_channels", &cl::NumChannels) ; } #if CUDAPP_CUDA_VERSION >= 2000 { typedef CUDA_ARRAY3D_DESCRIPTOR cl; py::class_("ArrayDescriptor3D") .def_readwrite("width", &cl::Width) .def_readwrite("height", &cl::Height) .def_readwrite("depth", &cl::Depth) .def_readwrite("format", &cl::Format) .def_readwrite("num_channels", &cl::NumChannels) .def_readwrite("flags", &cl::Flags) ; } #endif { typedef array cl; py::class_, boost::noncopyable> ("Array", py::init()) .DEF_SIMPLE_METHOD(free) 
.DEF_SIMPLE_METHOD(get_descriptor) #if CUDAPP_CUDA_VERSION >= 2000 .def(py::init()) .DEF_SIMPLE_METHOD(get_descriptor_3d) #endif ; } // }}} // {{{ texture reference { typedef texture_reference cl; py::class_("TextureReference") .DEF_SIMPLE_METHOD(set_array) .def("set_address", &cl::set_address, (py::arg("devptr"), py::arg("bytes"), py::arg("allow_offset")=false)) #if CUDAPP_CUDA_VERSION >= 2020 .DEF_SIMPLE_METHOD_WITH_ARGS(set_address_2d, ("devptr", "descr", "pitch")) #endif .DEF_SIMPLE_METHOD_WITH_ARGS(set_format, ("format", "num_components")) .DEF_SIMPLE_METHOD_WITH_ARGS(set_address_mode, ("dim", "am")) .DEF_SIMPLE_METHOD(set_filter_mode) .DEF_SIMPLE_METHOD(set_flags) .DEF_SIMPLE_METHOD(get_address) .def("get_array", &cl::get_array, py::return_value_policy()) .DEF_SIMPLE_METHOD(get_address_mode) .DEF_SIMPLE_METHOD(get_filter_mode) #if CUDAPP_CUDA_VERSION >= 2000 .DEF_SIMPLE_METHOD(get_format) #endif .DEF_SIMPLE_METHOD(get_flags) ; } // }}} // {{{ surface reference #if CUDAPP_CUDA_VERSION >= 3010 { typedef surface_reference cl; py::class_("SurfaceReference", py::no_init) .def("set_array", &cl::set_array, (py::arg("array"), py::arg("flags")=0)) .def("get_array", &cl::get_array, py::return_value_policy()) ; } #endif // }}} // {{{ profiler control #if CUDAPP_CUDA_VERSION >= 4000 DEF_SIMPLE_FUNCTION(initialize_profiler); DEF_SIMPLE_FUNCTION(start_profiler); DEF_SIMPLE_FUNCTION(stop_profiler); #endif // }}} py::scope().attr("TRSA_OVERRIDE_FORMAT") = CU_TRSA_OVERRIDE_FORMAT; py::scope().attr("TRSF_READ_AS_INTEGER") = CU_TRSF_READ_AS_INTEGER; py::scope().attr("TRSF_NORMALIZED_COORDINATES") = CU_TRSF_NORMALIZED_COORDINATES; py::scope().attr("TR_DEFAULT") = CU_PARAM_TR_DEFAULT; DEF_SIMPLE_FUNCTION(have_gl_ext); pycuda_expose_tools(); #ifdef HAVE_GL pycuda_expose_gl(); #endif #ifdef HAVE_CURAND pycuda_expose_curand(); #endif } // vim: foldmethod=marker pycuda-2013.1.1+git20140310/src/wrapper/wrap_helpers.hpp0000644000175000000500000000251612313360364021006 0ustar tomussrc#ifndef PYCUDA_WRAP_HELPERS_HEADER_SEEN #define PYCUDA_WRAP_HELPERS_HEADER_SEEN #include #include #include #define PYTHON_ERROR(TYPE, REASON) \ { \ PyErr_SetString(PyExc_##TYPE, REASON); \ throw boost::python::error_already_set(); \ } #define ENUM_VALUE(NAME) \ value(#NAME, NAME) #define DEF_SIMPLE_METHOD(NAME) \ def(#NAME, &cl::NAME) #define DEF_SIMPLE_METHOD_WITH_ARGS(NAME, ARGS) \ def(#NAME, &cl::NAME, boost::python::args ARGS) #define DEF_SIMPLE_FUNCTION(NAME) \ boost::python::def(#NAME, &NAME) #define DEF_SIMPLE_FUNCTION_WITH_ARGS(NAME, ARGS) \ boost::python::def(#NAME, &NAME, boost::python::args ARGS) #define DEF_SIMPLE_RO_MEMBER(NAME) \ def_readonly(#NAME, &cl::m_##NAME) #define DEF_SIMPLE_RW_MEMBER(NAME) \ def_readwrite(#NAME, &cl::m_##NAME) #define PYTHON_FOREACH(NAME, ITERABLE) \ BOOST_FOREACH(boost::python::object NAME, \ std::make_pair( \ boost::python::stl_input_iterator(ITERABLE), \ boost::python::stl_input_iterator())) namespace { template inline boost::python::handle<> handle_from_new_ptr(T *ptr) { return boost::python::handle<>( typename boost::python::manage_new_object::apply::type()(ptr)); } } #endif pycuda-2013.1.1+git20140310/src/wrapper/tools.hpp0000644000175000000500000000216012313360364017446 0ustar tomussrc#ifndef _ASDFDAFVVAFF_PYCUDA_HEADER_SEEN_TOOLS_HPP #define _ASDFDAFVVAFF_PYCUDA_HEADER_SEEN_TOOLS_HPP #include #include #include #include "numpy_init.hpp" namespace pycuda { inline npy_intp size_from_dims(size_t ndim, const npy_intp *dims) { if (ndim != 0) return std::accumulate(dims, dims+ndim, 
npy_intp(1), std::multiplies()); else return 1; } inline void run_python_gc() { namespace py = boost::python; py::object gc_mod( py::handle<>( PyImport_ImportModule("gc"))); gc_mod.attr("collect")(); } inline CUdeviceptr mem_alloc_gc(size_t bytes) { try { return pycuda::mem_alloc(bytes); } catch (pycuda::error &e) { if (e.code() != CUDA_ERROR_OUT_OF_MEMORY) throw; } // If we get here, we got OUT_OF_MEMORY from CUDA. // We should run the Python GC to try and free up // some memory references. run_python_gc(); // Now retry the allocation. If it fails again, // let it fail. return pycuda::mem_alloc(bytes); } } #endif pycuda-2013.1.1+git20140310/src/wrapper/wrap_curand.cpp0000644000175000000500000000223312313360364020607 0ustar tomussrc#include #include #include "tools.hpp" #include "wrap_helpers.hpp" #if CUDAPP_CUDA_VERSION >= 3020 #include #endif using namespace pycuda; using namespace pycuda::curandom; void pycuda_expose_curand() { using py::arg; using py::args; #if CUDAPP_CUDA_VERSION >= 3020 py::enum_("direction_vector_set") .value("VECTOR_32", CURAND_DIRECTION_VECTORS_32_JOEKUO6) #if CUDAPP_CUDA_VERSION >= 4000 .value("SCRAMBLED_VECTOR_32", CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6) .value("VECTOR_64", CURAND_DIRECTION_VECTORS_64_JOEKUO6) .value("SCRAMBLED_VECTOR_64", CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6) #endif ; #endif py::def("get_curand_version", py_curand_version); #if CUDAPP_CUDA_VERSION >= 3020 py::def("_get_direction_vectors", py_curand_get_direction_vectors, (arg("set"), arg("dst"), arg("count"))); #endif #if CUDAPP_CUDA_VERSION >= 4000 py::def("_get_scramble_constants32", py_curand_get_scramble_constants32, (arg("dst"), arg("count"))); py::def("_get_scramble_constants64", py_curand_get_scramble_constants64, (arg("dst"), arg("count"))); #endif } pycuda-2013.1.1+git20140310/src/wrapper/_pvt_struct_v3.cpp0000644000175000000500000013433412313360364021276 0ustar tomussrc/* struct module -- pack values into and (out of) bytes objects */ /* New version supporting byte order, alignment and size options, character strings, and unsigned numbers */ #define PY_SSIZE_T_CLEAN #include "Python.h" #include "structmember.h" #include #include "numpy_init.hpp" namespace { extern PyTypeObject PyStructType; } /* The translation function for each format character is table driven */ typedef struct _formatdef { char format; Py_ssize_t size; Py_ssize_t alignment; PyObject* (*unpack)(const char *, const struct _formatdef *); int (*pack)(char *, PyObject *, const struct _formatdef *); } formatdef; typedef struct _formatcode { const struct _formatdef *fmtdef; Py_ssize_t offset; Py_ssize_t size; } formatcode; /* Struct object interface */ typedef struct { PyObject_HEAD Py_ssize_t s_size; Py_ssize_t s_len; formatcode *s_codes; PyObject *s_format; PyObject *weakreflist; /* List of weak references */ } PyStructObject; #define PyStruct_Check(op) PyObject_TypeCheck(op, &PyStructType) #define PyStruct_CheckExact(op) (Py_TYPE(op) == &PyStructType) /* Exception */ static PyObject *StructError; /* Define various structs to figure out the alignments of types */ typedef struct { char c; short x; } st_short; typedef struct { char c; int x; } st_int; typedef struct { char c; long x; } st_long; typedef struct { char c; float x; } st_float; typedef struct { char c; double x; } st_double; typedef struct { char c; void *x; } st_void_p; typedef struct { char c; size_t x; } st_size_t; #define SHORT_ALIGN (sizeof(st_short) - sizeof(short)) #define INT_ALIGN (sizeof(st_int) - sizeof(int)) #define LONG_ALIGN 
(sizeof(st_long) - sizeof(long)) #define FLOAT_ALIGN (sizeof(st_float) - sizeof(float)) #define DOUBLE_ALIGN (sizeof(st_double) - sizeof(double)) #define VOID_P_ALIGN (sizeof(st_void_p) - sizeof(void *)) #define SIZE_T_ALIGN (sizeof(st_size_t) - sizeof(size_t)) /* We can't support q and Q in native mode unless the compiler does; in std mode, they're 8 bytes on all platforms. */ #ifdef HAVE_LONG_LONG typedef struct { char c; PY_LONG_LONG x; } s_long_long; #define LONG_LONG_ALIGN (sizeof(s_long_long) - sizeof(PY_LONG_LONG)) #endif #if !defined(__cplusplus) && defined(HAVE_C99_BOOL) #define BOOL_TYPE _Bool typedef struct { char c; _Bool x; } s_bool; #define BOOL_ALIGN (sizeof(s_bool) - sizeof(BOOL_TYPE)) #else #define BOOL_TYPE char #define BOOL_ALIGN 0 #endif #define STRINGIFY(x) #x #ifdef __powerc #pragma options align=reset #endif /* Helper for integer format codes: converts an arbitrary Python object to a PyLongObject if possible, otherwise fails. Caller should decref. */ static PyObject * get_pylong(PyObject *v) { assert(v != NULL); if (!PyLong_Check(v)) { /* Not an integer; try to use __index__ to convert. */ if (PyIndex_Check(v)) { v = PyNumber_Index(v); if (v == NULL) return NULL; } else { PyErr_SetString(StructError, "required argument is not an integer"); return NULL; } } else Py_INCREF(v); assert(PyLong_Check(v)); return v; } /* Helper routine to get a C long and raise the appropriate error if it isn't one */ static int get_long(PyObject *v, long *p) { long x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsLong(v); Py_DECREF(v); if (x == (long)-1 && PyErr_Occurred()) { if (PyErr_ExceptionMatches(PyExc_OverflowError)) PyErr_SetString(StructError, "argument out of range"); return -1; } *p = x; return 0; } /* Same, but handling unsigned long */ static int get_ulong(PyObject *v, unsigned long *p) { unsigned long x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsUnsignedLong(v); Py_DECREF(v); if (x == (unsigned long)-1 && PyErr_Occurred()) { if (PyErr_ExceptionMatches(PyExc_OverflowError)) PyErr_SetString(StructError, "argument out of range"); return -1; } *p = x; return 0; } #ifdef HAVE_LONG_LONG /* Same, but handling native long long. */ static int get_longlong(PyObject *v, PY_LONG_LONG *p) { PY_LONG_LONG x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsLongLong(v); Py_DECREF(v); if (x == (PY_LONG_LONG)-1 && PyErr_Occurred()) { if (PyErr_ExceptionMatches(PyExc_OverflowError)) PyErr_SetString(StructError, "argument out of range"); return -1; } *p = x; return 0; } /* Same, but handling native unsigned long long. 
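   (This is the pack-side helper behind the native 'Q' format code; see
   np_ulonglong and the corresponding native_table entry further down.)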
*/ static int get_ulonglong(PyObject *v, unsigned PY_LONG_LONG *p) { unsigned PY_LONG_LONG x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsUnsignedLongLong(v); Py_DECREF(v); if (x == (unsigned PY_LONG_LONG)-1 && PyErr_Occurred()) { if (PyErr_ExceptionMatches(PyExc_OverflowError)) PyErr_SetString(StructError, "argument out of range"); return -1; } *p = x; return 0; } #endif /* Same, but handling Py_ssize_t */ static int get_ssize_t(PyObject *v, Py_ssize_t *p) { Py_ssize_t x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsSsize_t(v); Py_DECREF(v); if (x == (Py_ssize_t)-1 && PyErr_Occurred()) { if (PyErr_ExceptionMatches(PyExc_OverflowError)) PyErr_SetString(StructError, "argument out of range"); return -1; } *p = x; return 0; } /* Same, but handling size_t */ static int get_size_t(PyObject *v, size_t *p) { size_t x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsSize_t(v); Py_DECREF(v); if (x == (size_t)-1 && PyErr_Occurred()) { if (PyErr_ExceptionMatches(PyExc_OverflowError)) PyErr_SetString(StructError, "argument out of range"); return -1; } *p = x; return 0; } #define RANGE_ERROR(x, f, flag, mask) return _range_error(f, flag) /* Floating point helpers */ #if 0 static PyObject * unpack_float(const char *p, /* start of 4-byte string */ int le) /* true for little-endian, false for big-endian */ { double x; x = _PyFloat_Unpack4((unsigned char *)p, le); if (x == -1.0 && PyErr_Occurred()) return NULL; return PyFloat_FromDouble(x); } static PyObject * unpack_double(const char *p, /* start of 8-byte string */ int le) /* true for little-endian, false for big-endian */ { double x; x = _PyFloat_Unpack8((unsigned char *)p, le); if (x == -1.0 && PyErr_Occurred()) return NULL; return PyFloat_FromDouble(x); } #endif /* Helper to format the range error exceptions */ static int _range_error(const formatdef *f, int is_unsigned) { /* ulargest is the largest unsigned value with f->size bytes. * Note that the simpler: * ((size_t)1 << (f->size * 8)) - 1 * doesn't work when f->size == sizeof(size_t) because C doesn't * define what happens when a left shift count is >= the number of * bits in the integer being shifted; e.g., on some boxes it doesn't * shift at all when they're equal. */ const size_t ulargest = (size_t)-1 >> ((SIZEOF_SIZE_T - f->size)*8); assert(f->size >= 1 && f->size <= SIZEOF_SIZE_T); if (is_unsigned) PyErr_Format(StructError, "'%c' format requires 0 <= number <= %zu", f->format, ulargest); else { const Py_ssize_t largest = (Py_ssize_t)(ulargest >> 1); PyErr_Format(StructError, "'%c' format requires %zd <= number <= %zd", f->format, ~ largest, largest); } return -1; } /* A large number of small routines follow, with names of the form [bln][up]_TYPE [bln] distiguishes among big-endian, little-endian and native. [pu] distiguishes between pack (to struct) and unpack (from struct). TYPE is one of char, byte, ubyte, etc. */ // {{{ /* Native mode routines. ****************************************************/ /* NOTE: In all n[up]_ routines handling types larger than 1 byte, there is *no* guarantee that the p pointer is properly aligned for each type, therefore memcpy is called. An intermediate variable is used to compensate for big-endian architectures. Normally both the intermediate variable and the memcpy call will be skipped by C optimisation in little-endian architectures (gcc >= 2.91 does this). 
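   For example, the native 'i' unpacker (nu_int below) copies the bytes out
   before interpreting them:

       int x;
       memcpy((char *)&x, p, sizeof x);   // p may be misaligned within the packed buffer
       return PyLong_FromLong((long)x);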
*/ static PyObject * nu_char(const char *p, const formatdef *f) { return PyBytes_FromStringAndSize(p, 1); } static PyObject * nu_byte(const char *p, const formatdef *f) { return PyLong_FromLong((long) *(signed char *)p); } static PyObject * nu_ubyte(const char *p, const formatdef *f) { return PyLong_FromLong((long) *(unsigned char *)p); } static PyObject * nu_short(const char *p, const formatdef *f) { short x; memcpy((char *)&x, p, sizeof x); return PyLong_FromLong((long)x); } static PyObject * nu_ushort(const char *p, const formatdef *f) { unsigned short x; memcpy((char *)&x, p, sizeof x); return PyLong_FromLong((long)x); } static PyObject * nu_int(const char *p, const formatdef *f) { int x; memcpy((char *)&x, p, sizeof x); return PyLong_FromLong((long)x); } static PyObject * nu_uint(const char *p, const formatdef *f) { unsigned int x; memcpy((char *)&x, p, sizeof x); #if (SIZEOF_LONG > SIZEOF_INT) return PyLong_FromLong((long)x); #else if (x <= ((unsigned int)LONG_MAX)) return PyLong_FromLong((long)x); return PyLong_FromUnsignedLong((unsigned long)x); #endif } static PyObject * nu_long(const char *p, const formatdef *f) { long x; memcpy((char *)&x, p, sizeof x); return PyLong_FromLong(x); } static PyObject * nu_ulong(const char *p, const formatdef *f) { unsigned long x; memcpy((char *)&x, p, sizeof x); if (x <= LONG_MAX) return PyLong_FromLong((long)x); return PyLong_FromUnsignedLong(x); } static PyObject * nu_ssize_t(const char *p, const formatdef *f) { Py_ssize_t x; memcpy((char *)&x, p, sizeof x); return PyLong_FromSsize_t(x); } static PyObject * nu_size_t(const char *p, const formatdef *f) { size_t x; memcpy((char *)&x, p, sizeof x); return PyLong_FromSize_t(x); } /* Native mode doesn't support q or Q unless the platform C supports long long (or, on Windows, __int64). 
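   When HAVE_LONG_LONG is available, the 'q'/'Q' codes are backed by
   nu_longlong/nu_ulonglong and np_longlong/np_ulonglong below; otherwise
   those entries are compiled out of native_table and getentry() rejects the
   format character with "bad char in struct format".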
*/ #ifdef HAVE_LONG_LONG static PyObject * nu_longlong(const char *p, const formatdef *f) { PY_LONG_LONG x; memcpy((char *)&x, p, sizeof x); if (x >= LONG_MIN && x <= LONG_MAX) return PyLong_FromLong(Py_SAFE_DOWNCAST(x, PY_LONG_LONG, long)); return PyLong_FromLongLong(x); } static PyObject * nu_ulonglong(const char *p, const formatdef *f) { unsigned PY_LONG_LONG x; memcpy((char *)&x, p, sizeof x); if (x <= LONG_MAX) return PyLong_FromLong(Py_SAFE_DOWNCAST(x, unsigned PY_LONG_LONG, long)); return PyLong_FromUnsignedLongLong(x); } #endif static PyObject * nu_bool(const char *p, const formatdef *f) { BOOL_TYPE x; memcpy((char *)&x, p, sizeof x); return PyBool_FromLong(x != 0); } static PyObject * nu_float(const char *p, const formatdef *f) { float x; memcpy((char *)&x, p, sizeof x); return PyFloat_FromDouble((double)x); } static PyObject * nu_double(const char *p, const formatdef *f) { double x; memcpy((char *)&x, p, sizeof x); return PyFloat_FromDouble(x); } static PyObject * nu_complex_float(const char *p, const formatdef *f) { float re, im; memcpy((char *)&re, p, sizeof re); memcpy((char *)&im, p+sizeof re, sizeof im); return PyComplex_FromDoubles((double)re, (double) im); } static PyObject * nu_complex_double(const char *p, const formatdef *f) { double re, im; memcpy((char *)&re, p, sizeof re); memcpy((char *)&im, p+sizeof re, sizeof im); return PyComplex_FromDoubles(re, im); } static PyObject * nu_void_p(const char *p, const formatdef *f) { void *x; memcpy((char *)&x, p, sizeof x); return PyLong_FromVoidPtr(x); } static int np_byte(char *p, PyObject *v, const formatdef *f) { long x; if (get_long(v, &x) < 0) return -1; if (x < -128 || x > 127){ PyErr_SetString(StructError, "byte format requires -128 <= number <= 127"); return -1; } *p = (char)x; return 0; } static int np_ubyte(char *p, PyObject *v, const formatdef *f) { long x; if (get_long(v, &x) < 0) return -1; if (x < 0 || x > 255){ PyErr_SetString(StructError, "ubyte format requires 0 <= number <= 255"); return -1; } *p = (char)x; return 0; } static int np_char(char *p, PyObject *v, const formatdef *f) { if (!PyBytes_Check(v) || PyBytes_Size(v) != 1) { PyErr_SetString(StructError, "char format requires a bytes object of length 1"); return -1; } *p = *PyBytes_AsString(v); return 0; } static int np_short(char *p, PyObject *v, const formatdef *f) { long x; short y; if (get_long(v, &x) < 0) return -1; if (x < SHRT_MIN || x > SHRT_MAX){ PyErr_SetString(StructError, "short format requires " STRINGIFY(SHRT_MIN) " <= number <= " STRINGIFY(SHRT_MAX)); return -1; } y = (short)x; memcpy(p, (char *)&y, sizeof y); return 0; } static int np_ushort(char *p, PyObject *v, const formatdef *f) { long x; unsigned short y; if (get_long(v, &x) < 0) return -1; if (x < 0 || x > USHRT_MAX){ PyErr_SetString(StructError, "ushort format requires 0 <= number <= " STRINGIFY(USHRT_MAX)); return -1; } y = (unsigned short)x; memcpy(p, (char *)&y, sizeof y); return 0; } static int np_int(char *p, PyObject *v, const formatdef *f) { long x; int y; if (get_long(v, &x) < 0) return -1; #if (SIZEOF_LONG > SIZEOF_INT) if ((x < ((long)INT_MIN)) || (x > ((long)INT_MAX))) RANGE_ERROR(x, f, 0, -1); #endif y = (int)x; memcpy(p, (char *)&y, sizeof y); return 0; } static int np_uint(char *p, PyObject *v, const formatdef *f) { unsigned long x; unsigned int y; if (get_ulong(v, &x) < 0) return -1; y = (unsigned int)x; #if (SIZEOF_LONG > SIZEOF_INT) if (x > ((unsigned long)UINT_MAX)) RANGE_ERROR(y, f, 1, -1); #endif memcpy(p, (char *)&y, sizeof y); return 0; } static int np_long(char 
*p, PyObject *v, const formatdef *f) { long x; if (get_long(v, &x) < 0) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } static int np_ulong(char *p, PyObject *v, const formatdef *f) { unsigned long x; if (get_ulong(v, &x) < 0) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } static int np_ssize_t(char *p, PyObject *v, const formatdef *f) { Py_ssize_t x; if (get_ssize_t(v, &x) < 0) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } static int np_size_t(char *p, PyObject *v, const formatdef *f) { size_t x; if (get_size_t(v, &x) < 0) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } #ifdef HAVE_LONG_LONG static int np_longlong(char *p, PyObject *v, const formatdef *f) { PY_LONG_LONG x; if (get_longlong(v, &x) < 0) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } static int np_ulonglong(char *p, PyObject *v, const formatdef *f) { unsigned PY_LONG_LONG x; if (get_ulonglong(v, &x) < 0) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } #endif static int np_bool(char *p, PyObject *v, const formatdef *f) { int y; BOOL_TYPE x; y = PyObject_IsTrue(v); if (y < 0) return -1; x = y; memcpy(p, (char *)&x, sizeof x); return 0; } static int np_float(char *p, PyObject *v, const formatdef *f) { float x = (float)PyFloat_AsDouble(v); if (x == -1 && PyErr_Occurred()) { PyErr_SetString(StructError, "required argument is not a float"); return -1; } memcpy(p, (char *)&x, sizeof x); return 0; } static int np_double(char *p, PyObject *v, const formatdef *f) { double x = PyFloat_AsDouble(v); if (x == -1 && PyErr_Occurred()) { PyErr_SetString(StructError, "required argument is not a float"); return -1; } memcpy(p, (char *)&x, sizeof(double)); return 0; } static int np_complex_float(char *p, PyObject *v, const formatdef *f) { if (PyArray_IsZeroDim(v)) { PyObject *v_cast = PyArray_Cast( reinterpret_cast(v), NPY_CFLOAT); if (!v_cast) return -1; memcpy(p, PyArray_DATA(v_cast), PyArray_NBYTES(v_cast)); Py_DECREF(v_cast); } else { float re = 0.0f; float im = 0.0f; Py_complex cplx = PyComplex_AsCComplex(v); if (PyErr_Occurred()) { PyErr_SetString(StructError, "required argument is not a complex"); return -1; } re = (float)cplx.real; im = (float)cplx.imag; memcpy(p, (char *)&re, sizeof re); memcpy(p+sizeof re, (char *)&im, sizeof im); } return 0; } static int np_complex_double(char *p, PyObject *v, const formatdef *f) { if (PyArray_IsZeroDim(v)) { PyObject *v_cast = PyArray_Cast( reinterpret_cast(v), NPY_CDOUBLE); if (!v_cast) return -1; memcpy(p, PyArray_DATA(v_cast), PyArray_NBYTES(v_cast)); Py_DECREF(v_cast); } else { double re = 0.0; double im = 0.0; Py_complex cplx = PyComplex_AsCComplex(v); if (PyErr_Occurred()) { PyErr_SetString(StructError, "required argument is not a complex"); return -1; } re = cplx.real; im = cplx.imag; memcpy(p, (char *)&re, sizeof re); memcpy(p+sizeof re, (char *)&im, sizeof im); } return 0; } static int np_void_p(char *p, PyObject *v, const formatdef *f) { void *x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsVoidPtr(v); Py_DECREF(v); if (x == NULL && PyErr_Occurred()) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } static formatdef native_table[] = { {'x', sizeof(char), 0, NULL}, {'b', sizeof(char), 0, nu_byte, np_byte}, {'B', sizeof(char), 0, nu_ubyte, np_ubyte}, {'c', sizeof(char), 0, nu_char, np_char}, {'s', sizeof(char), 0, NULL}, {'p', sizeof(char), 0, NULL}, {'h', sizeof(short), SHORT_ALIGN, nu_short, np_short}, {'H', sizeof(short), SHORT_ALIGN, nu_ushort, np_ushort}, {'i', sizeof(int), INT_ALIGN, nu_int, 
np_int}, {'I', sizeof(int), INT_ALIGN, nu_uint, np_uint}, {'l', sizeof(long), LONG_ALIGN, nu_long, np_long}, {'L', sizeof(long), LONG_ALIGN, nu_ulong, np_ulong}, {'n', sizeof(size_t), SIZE_T_ALIGN, nu_ssize_t, np_ssize_t}, {'N', sizeof(size_t), SIZE_T_ALIGN, nu_size_t, np_size_t}, #ifdef HAVE_LONG_LONG {'q', sizeof(PY_LONG_LONG), LONG_LONG_ALIGN, nu_longlong, np_longlong}, {'Q', sizeof(PY_LONG_LONG), LONG_LONG_ALIGN, nu_ulonglong,np_ulonglong}, #endif {'?', sizeof(BOOL_TYPE), BOOL_ALIGN, nu_bool, np_bool}, {'f', sizeof(float), FLOAT_ALIGN, nu_float, np_float}, {'d', sizeof(double), DOUBLE_ALIGN, nu_double, np_double}, {'F', 2*sizeof(float), FLOAT_ALIGN, nu_complex_float, np_complex_float}, {'D', 2*sizeof(double), DOUBLE_ALIGN, nu_complex_double, np_complex_double}, {'P', sizeof(void *), VOID_P_ALIGN, nu_void_p, np_void_p}, {0} }; // }}} static const formatdef * whichtable(char **pfmt) { const char *fmt = (*pfmt)++; /* May be backed out of later */ switch (*fmt) { default: --*pfmt; /* Back out of pointer increment */ /* Fall through */ case '@': return native_table; } } /* Get the table entry for a format code */ static const formatdef * getentry(int c, const formatdef *f) { for (; f->format != '\0'; f++) { if (f->format == c) { return f; } } PyErr_SetString(StructError, "bad char in struct format"); return NULL; } /* Align a size according to a format code. Return -1 on overflow. */ static Py_ssize_t align(Py_ssize_t size, char c, const formatdef *e) { Py_ssize_t extra; if (e->format == c) { if (e->alignment && size > 0) { extra = (e->alignment - 1) - (size - 1) % (e->alignment); if (extra > PY_SSIZE_T_MAX - size) return -1; size += extra; } } return size; } /* calculate the size of a format string */ static int prepare_s(PyStructObject *self) { const formatdef *f; const formatdef *e; formatcode *codes; const char *s; const char *fmt; char c; Py_ssize_t size, len, num, itemsize; fmt = PyBytes_AS_STRING(self->s_format); f = whichtable((char **)&fmt); s = fmt; size = 0; len = 0; while ((c = *s++) != '\0') { if (isspace(Py_CHARMASK(c))) continue; if ('0' <= c && c <= '9') { num = c - '0'; while ('0' <= (c = *s++) && c <= '9') { /* overflow-safe version of if (num*10 + (c - '0') > PY_SSIZE_T_MAX) { ... } */ if (num >= PY_SSIZE_T_MAX / 10 && ( num > PY_SSIZE_T_MAX / 10 || (c - '0') > PY_SSIZE_T_MAX % 10)) goto overflow; num = num*10 + (c - '0'); } if (c == '\0') { PyErr_SetString(StructError, "repeat count given without format specifier"); return -1; } } else num = 1; e = getentry(c, f); if (e == NULL) return -1; switch (c) { case 's': /* fall through */ case 'p': len++; break; case 'x': break; default: len += num; break; } itemsize = e->size; size = align(size, c, e); if (size == -1) goto overflow; /* if (size + num * itemsize > PY_SSIZE_T_MAX) { ... } */ if (num > (PY_SSIZE_T_MAX - size) / itemsize) goto overflow; size += num * itemsize; } /* check for overflow */ if ((len + 1) > (PY_SSIZE_T_MAX / sizeof(formatcode))) { PyErr_NoMemory(); return -1; } self->s_size = size; self->s_len = len; codes = (formatcode *) PyMem_MALLOC((len + 1) * sizeof(formatcode)); if (codes == NULL) { PyErr_NoMemory(); return -1; } /* Free any s_codes value left over from a previous initialization. 
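   (Struct.__init__ can be invoked more than once on the same object, and
   each call re-runs prepare_s, so the previously allocated code array must
   be released here rather than leaked.)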
*/ if (self->s_codes != NULL) PyMem_FREE(self->s_codes); self->s_codes = codes; s = fmt; size = 0; while ((c = *s++) != '\0') { if (isspace(Py_CHARMASK(c))) continue; if ('0' <= c && c <= '9') { num = c - '0'; while ('0' <= (c = *s++) && c <= '9') num = num*10 + (c - '0'); if (c == '\0') break; } else num = 1; e = getentry(c, f); size = align(size, c, e); if (c == 's' || c == 'p') { codes->offset = size; codes->size = num; codes->fmtdef = e; codes++; size += num; } else if (c == 'x') { size += num; } else { while (--num >= 0) { codes->offset = size; codes->size = e->size; codes->fmtdef = e; codes++; size += e->size; } } } codes->fmtdef = NULL; codes->offset = size; codes->size = 0; return 0; overflow: PyErr_SetString(StructError, "total struct size too long"); return -1; } static PyObject * s_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { PyObject *self; assert(type != NULL && type->tp_alloc != NULL); self = type->tp_alloc(type, 0); if (self != NULL) { PyStructObject *s = (PyStructObject*)self; Py_INCREF(Py_None); s->s_format = Py_None; s->s_codes = NULL; s->s_size = -1; s->s_len = -1; } return self; } static int s_init(PyObject *self, PyObject *args, PyObject *kwds) { PyStructObject *soself = (PyStructObject *)self; PyObject *o_format = NULL; int ret = 0; static char *kwlist[] = {"format", 0}; assert(PyStruct_Check(self)); if (!PyArg_ParseTupleAndKeywords(args, kwds, "O:Struct", kwlist, &o_format)) return -1; if (PyUnicode_Check(o_format)) { o_format = PyUnicode_AsASCIIString(o_format); if (o_format == NULL) return -1; } /* XXX support buffer interface, too */ else { Py_INCREF(o_format); } if (!PyBytes_Check(o_format)) { Py_DECREF(o_format); PyErr_Format(PyExc_TypeError, "Struct() argument 1 must be a bytes object, not %.200s", Py_TYPE(o_format)->tp_name); return -1; } Py_CLEAR(soself->s_format); soself->s_format = o_format; ret = prepare_s(soself); return ret; } static void s_dealloc(PyStructObject *s) { if (s->weakreflist != NULL) PyObject_ClearWeakRefs((PyObject *)s); if (s->s_codes != NULL) { PyMem_FREE(s->s_codes); } Py_XDECREF(s->s_format); Py_TYPE(s)->tp_free((PyObject *)s); } static PyObject * s_unpack_internal(PyStructObject *soself, char *startfrom) { formatcode *code; Py_ssize_t i = 0; PyObject *result = PyTuple_New(soself->s_len); if (result == NULL) return NULL; for (code = soself->s_codes; code->fmtdef != NULL; code++) { PyObject *v; const formatdef *e = code->fmtdef; const char *res = startfrom + code->offset; if (e->format == 's') { v = PyBytes_FromStringAndSize(res, code->size); } else if (e->format == 'p') { Py_ssize_t n = *(unsigned char*)res; if (n >= code->size) n = code->size - 1; v = PyBytes_FromStringAndSize(res + 1, n); } else { v = e->unpack(res, e); } if (v == NULL) goto fail; PyTuple_SET_ITEM(result, i++, v); } return result; fail: Py_DECREF(result); return NULL; } PyDoc_STRVAR(s_unpack__doc__, "S.unpack(buffer) -> (v1, v2, ...)\n\ \n\ Return a tuple containing values unpacked according to the format\n\ string S.format. Requires len(buffer) == S.size. 
See help(struct)\n\ for more on format strings."); static PyObject * s_unpack(PyObject *self, PyObject *input) { Py_buffer vbuf; PyObject *result; PyStructObject *soself = (PyStructObject *)self; assert(PyStruct_Check(self)); assert(soself->s_codes != NULL); if (PyObject_GetBuffer(input, &vbuf, PyBUF_SIMPLE) < 0) return NULL; if (vbuf.len != soself->s_size) { PyErr_Format(StructError, "unpack requires a bytes object of length %zd", soself->s_size); PyBuffer_Release(&vbuf); return NULL; } result = s_unpack_internal(soself, (char *) vbuf.buf); PyBuffer_Release(&vbuf); return result; } PyDoc_STRVAR(s_unpack_from__doc__, "S.unpack_from(buffer, offset=0) -> (v1, v2, ...)\n\ \n\ Return a tuple containing values unpacked according to the format\n\ string S.format. Requires len(buffer[offset:]) >= S.size. See\n\ help(struct) for more on format strings."); static PyObject * s_unpack_from(PyObject *self, PyObject *args, PyObject *kwds) { static char *kwlist[] = {"buffer", "offset", 0}; PyObject *input; Py_ssize_t offset = 0; Py_buffer vbuf; PyObject *result; PyStructObject *soself = (PyStructObject *)self; assert(PyStruct_Check(self)); assert(soself->s_codes != NULL); if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|n:unpack_from", kwlist, &input, &offset)) return NULL; if (PyObject_GetBuffer(input, &vbuf, PyBUF_SIMPLE) < 0) return NULL; if (offset < 0) offset += vbuf.len; if (offset < 0 || vbuf.len - offset < soself->s_size) { PyErr_Format(StructError, "unpack_from requires a buffer of at least %zd bytes", soself->s_size); PyBuffer_Release(&vbuf); return NULL; } result = s_unpack_internal(soself, (char*)vbuf.buf + offset); PyBuffer_Release(&vbuf); return result; } /* * Guts of the pack function. * * Takes a struct object, a tuple of arguments, and offset in that tuple of * argument for where to start processing the arguments for packing, and a * character buffer for writing the packed string. The caller must insure * that the buffer may contain the required length for packing the arguments. * 0 is returned on success, 1 is returned if there is an error. * */ static int s_pack_internal(PyStructObject *soself, PyObject *args, int offset, char* buf) { formatcode *code; /* XXX(nnorwitz): why does i need to be a local? can we use the offset parameter or do we need the wider width? 
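   (offset is an int parameter while tuple indices are Py_ssize_t, so the
   local keeps the wider type for the index arithmetic below.)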
*/ Py_ssize_t i; memset(buf, '\0', soself->s_size); i = offset; for (code = soself->s_codes; code->fmtdef != NULL; code++) { Py_ssize_t n; PyObject *v = PyTuple_GET_ITEM(args, i++); const formatdef *e = code->fmtdef; char *res = buf + code->offset; if (e->format == 's') { int isstring; void *p; if (PyBytes_Check(v)) { n = PyBytes_GET_SIZE(v); p = PyBytes_AS_STRING(v); if (n > code->size) n = code->size; if (n > 0) memcpy(res, p, n); } else if (PyByteArray_Check(v)) { n = PyByteArray_GET_SIZE(v); p = PyByteArray_AS_STRING(v); if (n > code->size) n = code->size; if (n > 0) memcpy(res, p, n); } else if (PyObject_CheckBuffer(v)) { Py_buffer view; int gb_result = PyObject_GetBuffer(v, &view, PyBUF_SIMPLE); if (gb_result == -1) return gb_result; n = view.len; if (n > code->size) n = code->size; if (n > 0) memcpy(res, view.buf, n); PyBuffer_Release(&view); } else { PyErr_SetString(StructError, "argument for 's' must be a bytes object"); return -1; } } else if (e->format == 'p') { int isstring; void *p; isstring = PyBytes_Check(v); if (!isstring && !PyByteArray_Check(v)) { PyErr_SetString(StructError, "argument for 'p' must be a bytes object"); return -1; } if (isstring) { n = PyBytes_GET_SIZE(v); p = PyBytes_AS_STRING(v); } else { n = PyByteArray_GET_SIZE(v); p = PyByteArray_AS_STRING(v); } if (n > (code->size - 1)) n = code->size - 1; if (n > 0) memcpy(res + 1, p, n); if (n > 255) n = 255; *res = Py_SAFE_DOWNCAST(n, Py_ssize_t, unsigned char); } else { if (e->pack(res, v, e) < 0) { if (PyLong_Check(v) && PyErr_ExceptionMatches(PyExc_OverflowError)) PyErr_SetString(StructError, "long too large to convert to int"); return -1; } } } /* Success */ return 0; } PyDoc_STRVAR(s_pack__doc__, "S.pack(v1, v2, ...) -> bytes\n\ \n\ Return a bytes object containing values v1, v2, ... packed according\n\ to the format string S.format. See help(struct) for more on format\n\ strings."); static PyObject * s_pack(PyObject *self, PyObject *args) { PyStructObject *soself; PyObject *result; /* Validate arguments. */ soself = (PyStructObject *)self; assert(PyStruct_Check(self)); assert(soself->s_codes != NULL); if (PyTuple_GET_SIZE(args) != soself->s_len) { PyErr_Format(StructError, "pack requires exactly %zd arguments", soself->s_len); return NULL; } /* Allocate a new string */ result = PyBytes_FromStringAndSize((char *)NULL, soself->s_size); if (result == NULL) return NULL; /* Call the guts */ if ( s_pack_internal(soself, args, 0, PyBytes_AS_STRING(result)) != 0 ) { Py_DECREF(result); return NULL; } return result; } PyDoc_STRVAR(s_pack_into__doc__, "S.pack_into(buffer, offset, v1, v2, ...)\n\ \n\ Pack the values v1, v2, ... according to the format string S.format\n\ and write the packed bytes into the writable buffer buf starting at\n\ offset. Note that the offset is a required argument. See\n\ help(struct) for more on format strings."); static PyObject * s_pack_into(PyObject *self, PyObject *args) { PyStructObject *soself; char *buffer; Py_ssize_t buffer_len, offset; /* Validate arguments. +1 is for the first arg as buffer. 
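   In fact two leading arguments precede the packed values here - the
   writable buffer and the offset - which is why the length check below
   compares against s_len + 2 and s_pack_internal is started at offset 2.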
*/ soself = (PyStructObject *)self; assert(PyStruct_Check(self)); assert(soself->s_codes != NULL); if (PyTuple_GET_SIZE(args) != (soself->s_len + 2)) { PyErr_Format(StructError, "pack_into requires exactly %zd arguments", (soself->s_len + 2)); return NULL; } /* Extract a writable memory buffer from the first argument */ if ( PyObject_AsWriteBuffer(PyTuple_GET_ITEM(args, 0), (void**)&buffer, &buffer_len) == -1 ) { return NULL; } assert( buffer_len >= 0 ); /* Extract the offset from the first argument */ offset = PyNumber_AsSsize_t(PyTuple_GET_ITEM(args, 1), PyExc_IndexError); if (offset == -1 && PyErr_Occurred()) return NULL; /* Support negative offsets. */ if (offset < 0) offset += buffer_len; /* Check boundaries */ if (offset < 0 || (buffer_len - offset) < soself->s_size) { PyErr_Format(StructError, "pack_into requires a buffer of at least %zd bytes", soself->s_size); return NULL; } /* Call the guts */ if ( s_pack_internal(soself, args, 2, buffer + offset) != 0 ) { return NULL; } Py_RETURN_NONE; } static PyObject * s_get_format(PyStructObject *self, void *unused) { Py_INCREF(self->s_format); return self->s_format; } static PyObject * s_get_size(PyStructObject *self, void *unused) { return PyLong_FromSsize_t(self->s_size); } /* List of functions */ static struct PyMethodDef s_methods[] = { {"pack", s_pack, METH_VARARGS, s_pack__doc__}, {"pack_into", s_pack_into, METH_VARARGS, s_pack_into__doc__}, {"unpack", s_unpack, METH_O, s_unpack__doc__}, {"unpack_from", (PyCFunction)s_unpack_from, METH_VARARGS|METH_KEYWORDS, s_unpack_from__doc__}, {NULL, NULL} /* sentinel */ }; PyDoc_STRVAR(s__doc__, "Struct(fmt) --> compiled struct object\n" "\n" "Return a new Struct object which writes and reads binary data according to\n" "the format string fmt. See help(struct) for more on format strings."); #define OFF(x) offsetof(PyStructObject, x) static PyGetSetDef s_getsetlist[] = { {"format", (getter)s_get_format, (setter)NULL, "struct format string", NULL}, {"size", (getter)s_get_size, (setter)NULL, "struct size in bytes", NULL}, {NULL} /* sentinel */ }; namespace { PyTypeObject PyStructType = { PyVarObject_HEAD_INIT(NULL, 0) "Struct", sizeof(PyStructObject), 0, (destructor)s_dealloc, /* tp_dealloc */ 0, /* tp_print */ 0, /* tp_getattr */ 0, /* tp_setattr */ 0, /* tp_reserved */ 0, /* tp_repr */ 0, /* tp_as_number */ 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ 0, /* tp_hash */ 0, /* tp_call */ 0, /* tp_str */ PyObject_GenericGetAttr, /* tp_getattro */ PyObject_GenericSetAttr, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ s__doc__, /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ offsetof(PyStructObject, weakreflist), /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ s_methods, /* tp_methods */ NULL, /* tp_members */ s_getsetlist, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ s_init, /* tp_init */ PyType_GenericAlloc,/* tp_alloc */ s_new, /* tp_new */ PyObject_Del, /* tp_free */ }; } /* ---- Standalone functions ---- */ #define MAXCACHE 100 static PyObject *cache = NULL; static PyObject * cache_struct(PyObject *fmt) { PyObject * s_object; if (cache == NULL) { cache = PyDict_New(); if (cache == NULL) return NULL; } s_object = PyDict_GetItem(cache, fmt); if (s_object != NULL) { Py_INCREF(s_object); return s_object; } s_object = PyObject_CallFunctionObjArgs((PyObject *)(&PyStructType), fmt, NULL); if (s_object != NULL) { if (PyDict_Size(cache) >= 
MAXCACHE) PyDict_Clear(cache); /* Attempt to cache the result */ if (PyDict_SetItem(cache, fmt, s_object) == -1) PyErr_Clear(); } return s_object; } PyDoc_STRVAR(clearcache_doc, "Clear the internal cache."); static PyObject * clearcache(PyObject *self) { Py_CLEAR(cache); Py_RETURN_NONE; } PyDoc_STRVAR(calcsize_doc, "calcsize(fmt) -> integer\n\ \n\ Return size in bytes of the struct described by the format string fmt."); static PyObject * calcsize(PyObject *self, PyObject *fmt) { Py_ssize_t n; PyObject *s_object = cache_struct(fmt); if (s_object == NULL) return NULL; n = ((PyStructObject *)s_object)->s_size; Py_DECREF(s_object); return PyLong_FromSsize_t(n); } PyDoc_STRVAR(pack_doc, "pack(fmt, v1, v2, ...) -> bytes\n\ \n\ Return a bytes object containing the values v1, v2, ... packed according\n\ to the format string fmt. See help(struct) for more on format strings."); static PyObject * pack(PyObject *self, PyObject *args) { PyObject *s_object, *fmt, *newargs, *result; Py_ssize_t n = PyTuple_GET_SIZE(args); if (n == 0) { PyErr_SetString(PyExc_TypeError, "missing format argument"); return NULL; } fmt = PyTuple_GET_ITEM(args, 0); newargs = PyTuple_GetSlice(args, 1, n); if (newargs == NULL) return NULL; s_object = cache_struct(fmt); if (s_object == NULL) { Py_DECREF(newargs); return NULL; } result = s_pack(s_object, newargs); Py_DECREF(newargs); Py_DECREF(s_object); return result; } PyDoc_STRVAR(pack_into_doc, "pack_into(fmt, buffer, offset, v1, v2, ...)\n\ \n\ Pack the values v1, v2, ... according to the format string fmt and write\n\ the packed bytes into the writable buffer buf starting at offset. Note\n\ that the offset is a required argument. See help(struct) for more\n\ on format strings."); static PyObject * pack_into(PyObject *self, PyObject *args) { PyObject *s_object, *fmt, *newargs, *result; Py_ssize_t n = PyTuple_GET_SIZE(args); if (n == 0) { PyErr_SetString(PyExc_TypeError, "missing format argument"); return NULL; } fmt = PyTuple_GET_ITEM(args, 0); newargs = PyTuple_GetSlice(args, 1, n); if (newargs == NULL) return NULL; s_object = cache_struct(fmt); if (s_object == NULL) { Py_DECREF(newargs); return NULL; } result = s_pack_into(s_object, newargs); Py_DECREF(newargs); Py_DECREF(s_object); return result; } PyDoc_STRVAR(unpack_doc, "unpack(fmt, buffer) -> (v1, v2, ...)\n\ \n\ Return a tuple containing values unpacked according to the format string\n\ fmt. Requires len(buffer) == calcsize(fmt). See help(struct) for more\n\ on format strings."); static PyObject * unpack(PyObject *self, PyObject *args) { PyObject *s_object, *fmt, *inputstr, *result; if (!PyArg_UnpackTuple(args, "unpack", 2, 2, &fmt, &inputstr)) return NULL; s_object = cache_struct(fmt); if (s_object == NULL) return NULL; result = s_unpack(s_object, inputstr); Py_DECREF(s_object); return result; } PyDoc_STRVAR(unpack_from_doc, "unpack_from(fmt, buffer, offset=0) -> (v1, v2, ...)\n\ \n\ Return a tuple containing values unpacked according to the format string\n\ fmt. Requires len(buffer[offset:]) >= calcsize(fmt). 
See help(struct)\n\ for more on format strings."); static PyObject * unpack_from(PyObject *self, PyObject *args, PyObject *kwds) { PyObject *s_object, *fmt, *newargs, *result; Py_ssize_t n = PyTuple_GET_SIZE(args); if (n == 0) { PyErr_SetString(PyExc_TypeError, "missing format argument"); return NULL; } fmt = PyTuple_GET_ITEM(args, 0); newargs = PyTuple_GetSlice(args, 1, n); if (newargs == NULL) return NULL; s_object = cache_struct(fmt); if (s_object == NULL) { Py_DECREF(newargs); return NULL; } result = s_unpack_from(s_object, newargs, kwds); Py_DECREF(newargs); Py_DECREF(s_object); return result; } static struct PyMethodDef module_functions[] = { {"_clearcache", (PyCFunction)clearcache, METH_NOARGS, clearcache_doc}, {"calcsize", calcsize, METH_O, calcsize_doc}, {"pack", pack, METH_VARARGS, pack_doc}, {"pack_into", pack_into, METH_VARARGS, pack_into_doc}, {"unpack", unpack, METH_VARARGS, unpack_doc}, {"unpack_from", (PyCFunction)unpack_from, METH_VARARGS|METH_KEYWORDS, unpack_from_doc}, {NULL, NULL} /* sentinel */ }; /* Module initialization */ PyDoc_STRVAR(module_doc, "Functions to convert between Python values and C structs.\n\ Python bytes objects are used to hold the data representing the C struct\n\ and also as format strings (explained below) to describe the layout of data\n\ in the C struct.\n\ \n\ The optional first format char indicates byte order, size and alignment:\n\ @: native order, size & alignment (default)\n\ =: native order, std. size & alignment\n\ <: little-endian, std. size & alignment\n\ >: big-endian, std. size & alignment\n\ !: same as >\n\ \n\ The remaining chars indicate types of args and must match exactly;\n\ these can be preceded by a decimal repeat count:\n\ x: pad byte (no data); c:char; b:signed byte; B:unsigned byte;\n\ ?: _Bool (requires C99; if not available, char is used instead)\n\ h:short; H:unsigned short; i:int; I:unsigned int;\n\ l:long; L:unsigned long; f:float; d:double.\n\ Special cases (preceding decimal count indicates length):\n\ s:string (array of char); p: pascal string (with count byte).\n\ Special cases (only available in native format):\n\ n:ssize_t; N:size_t;\n\ P:an integer type that is wide enough to hold a pointer.\n\ Special case (not in native mode unless 'long long' in platform C):\n\ q:long long; Q:unsigned long long\n\ Whitespace between formats is ignored.\n\ \n\ The variable struct.error is an exception raised on errors.\n"); static struct PyModuleDef _structmodule = { PyModuleDef_HEAD_INIT, "_struct", module_doc, -1, module_functions, NULL, NULL, NULL, NULL }; extern "C" PyMODINIT_FUNC PyInit__pvt_struct(void) { PyObject *m; m = PyModule_Create(&_structmodule); if (m == NULL) return NULL; Py_TYPE(&PyStructType) = &PyType_Type; if (PyType_Ready(&PyStructType) < 0) return NULL; /* Add some symbolic constants to the module */ if (StructError == NULL) { StructError = PyErr_NewException("struct.error", NULL, NULL); if (StructError == NULL) return NULL; } Py_INCREF(StructError); PyModule_AddObject(m, "error", StructError); Py_INCREF((PyObject*)&PyStructType); PyModule_AddObject(m, "Struct", (PyObject*)&PyStructType); return m; } // vim: fdm=marker pycuda-2013.1.1+git20140310/src/wrapper/mempool.cpp0000644000175000000500000001512112313360364017752 0ustar tomussrc#include #include "tools.hpp" #include "wrap_helpers.hpp" #include #include #include namespace py = boost::python; namespace { class device_allocator : public pycuda::context_dependent { public: typedef CUdeviceptr pointer_type; typedef size_t size_type; bool is_deferred() 
const { return false; } device_allocator *copy() const { return new device_allocator(*this); } pointer_type allocate(size_type s) { pycuda::scoped_context_activation ca(get_context()); return pycuda::mem_alloc(s); } void free(pointer_type p) { try { pycuda::scoped_context_activation ca(get_context()); pycuda::mem_free(p); } CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(pooled_device_allocation); } void try_release_blocks() { pycuda::run_python_gc(); } }; class host_allocator { private: unsigned m_flags; public: typedef void *pointer_type; typedef size_t size_type; bool is_deferred() const { return false; } host_allocator *copy() const { return new host_allocator(*this); } host_allocator(unsigned flags=0) : m_flags(flags) { } pointer_type allocate(size_type s) { return pycuda::mem_host_alloc(s, m_flags); } void free(pointer_type p) { pycuda::mem_host_free(p); } void try_release_blocks() { pycuda::run_python_gc(); } }; template class context_dependent_memory_pool : public pycuda::memory_pool, public pycuda::explicit_context_dependent { protected: void start_holding_blocks() { acquire_context(); } void stop_holding_blocks() { release_context(); } }; class pooled_device_allocation : public pycuda::context_dependent, public pycuda::pooled_allocation > { private: typedef pycuda::pooled_allocation > super; public: pooled_device_allocation( boost::shared_ptr p, super::size_type s) : super(p, s) { } operator CUdeviceptr() { return ptr(); } }; pooled_device_allocation *device_pool_allocate( boost::shared_ptr > pool, context_dependent_memory_pool::size_type sz) { return new pooled_device_allocation(pool, sz); } PyObject *pooled_device_allocation_to_long(pooled_device_allocation const &da) { #if defined(_WIN32) && defined(_WIN64) return PyLong_FromUnsignedLongLong(da.ptr()); #else return PyLong_FromUnsignedLong(da.ptr()); #endif } class pooled_host_allocation : public pycuda::pooled_allocation > { private: typedef pycuda::pooled_allocation > super; public: pooled_host_allocation( boost::shared_ptr p, super::size_type s) : super(p, s) { } }; py::handle<> host_pool_allocate( boost::shared_ptr > pool, py::object shape, py::object dtype, py::object order_py) { PyArray_Descr *tp_descr; if (PyArray_DescrConverter(dtype.ptr(), &tp_descr) != NPY_SUCCEED) throw py::error_already_set(); std::vector dims; std::copy( py::stl_input_iterator(shape), py::stl_input_iterator(), back_inserter(dims)); std::auto_ptr alloc( new pooled_host_allocation( pool, tp_descr->elsize*pycuda::size_from_dims(dims.size(), &dims.front()))); NPY_ORDER order = PyArray_CORDER; PyArray_OrderConverter(order_py.ptr(), &order); int flags = 0; if (order == PyArray_FORTRANORDER) flags |= NPY_FARRAY; else if (order == PyArray_CORDER) flags |= NPY_CARRAY; else throw std::runtime_error("unrecognized order specifier"); py::handle<> result = py::handle<>(PyArray_NewFromDescr( &PyArray_Type, tp_descr, int(dims.size()), &dims.front(), /*strides*/ NULL, alloc->ptr(), flags, /*obj*/NULL)); py::handle<> alloc_py(handle_from_new_ptr(alloc.release())); PyArray_BASE(result.get()) = alloc_py.get(); Py_INCREF(alloc_py.get()); return result; } template void expose_memory_pool(Wrapper &wrapper) { typedef typename Wrapper::wrapped_type cl; wrapper .add_property("held_blocks", &cl::held_blocks) .add_property("active_blocks", &cl::active_blocks) .DEF_SIMPLE_METHOD(bin_number) .DEF_SIMPLE_METHOD(alloc_size) .DEF_SIMPLE_METHOD(free_held) .DEF_SIMPLE_METHOD(stop_holding) .staticmethod("bin_number") .staticmethod("alloc_size") ; } } void pycuda_expose_tools() { 
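  // Registers the pooled-memory machinery defined above with Boost.Python:
  // bitlog2, DeviceMemoryPool, PageLockedAllocator, PageLockedMemoryPool,
  // and the PooledDeviceAllocation/PooledHostAllocation result types.
  //
  // Illustrative Python-side use of these bindings (not part of this file;
  // assumes an active context via pycuda.autoinit and that the classes are
  // reachable through pycuda.tools, as in the released package):
  //
  //     import pycuda.autoinit  # noqa
  //     import pycuda.tools as tools
  //
  //     pool = tools.DeviceMemoryPool()
  //     devptr = pool.allocate(1 << 20)  # PooledDeviceAllocation, usable as a device pointer
  //     del devptr                       # block returns to the pool, not to CUDA
  //     pool.free_held()                 # hand any held blocks back to the driver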
py::def("bitlog2", pycuda::bitlog2); { typedef context_dependent_memory_pool cl; py::class_< cl, boost::noncopyable, boost::shared_ptr > wrapper("DeviceMemoryPool"); wrapper .def("allocate", device_pool_allocate, py::return_value_policy()) ; expose_memory_pool(wrapper); } { typedef host_allocator cl; py::class_ wrapper("PageLockedAllocator", py::init >()); } { typedef pycuda::memory_pool cl; py::class_< cl, boost::noncopyable, boost::shared_ptr > wrapper( "PageLockedMemoryPool", py::init >() ); wrapper .def("allocate", host_pool_allocate, (py::arg("shape"), py::arg("dtype"), py::arg("order")="C")); ; expose_memory_pool(wrapper); } { typedef pooled_device_allocation cl; py::class_( "PooledDeviceAllocation", py::no_init) .DEF_SIMPLE_METHOD(free) .def("__int__", &cl::ptr) .def("__long__", pooled_device_allocation_to_long) .def("__len__", &cl::size) ; py::implicitly_convertible(); } { typedef pooled_host_allocation cl; py::class_( "PooledHostAllocation", py::no_init) .DEF_SIMPLE_METHOD(free) .def("__len__", &cl::size) ; } } pycuda-2013.1.1+git20140310/src/wrapper/numpy_init.hpp0000644000175000000500000000065712313360364020512 0ustar tomussrc#ifndef _FAYHVVAAA_PYCUDA_HEADER_SEEN_NUMPY_INIT_HPP #include #include namespace { static struct pyublas_array_importer { static bool do_import_array() { import_array1(false); return true; } pyublas_array_importer() { if (!do_import_array()) throw std::runtime_error("numpy failed to initialize"); } } _array_importer; } #endif pycuda-2013.1.1+git20140310/src/wrapper/_pvt_struct_v2.cpp0000644000175000000500000011241412313360364021270 0ustar tomussrc/* struct module -- pack values into and (out of) strings */ /* New version supporting byte order, alignment and size options, character strings, and unsigned numbers */ /* Compared with vanilla Python's struct module, this adds support * for packing complex values and only supports native packing. * (the minimum that's needed for PyOpenCL/PyCUDA.) */ #define PY_SSIZE_T_CLEAN #include "Python.h" #include "structseq.h" #include "structmember.h" #include #include "numpy_init.hpp" // static PyTypeObject PyStructType; /* compatibility macros */ #if (PY_VERSION_HEX < 0x02050000) #ifndef PY_SSIZE_T_MIN typedef long int Py_ssize_t; #endif #define PyInt_FromSsize_t(x) PyInt_FromLong(x) #define PyInt_AsSsize_t(x) PyInt_AsLong(x) #endif /* If PY_STRUCT_FLOAT_COERCE is defined, the struct module will allow float arguments for integer formats with a warning for backwards compatibility. 
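   (FLOAT_COERCE below carries the corresponding message text, "integer
   argument expected, got float".)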
*/ #define PY_STRUCT_FLOAT_COERCE 1 #ifdef PY_STRUCT_FLOAT_COERCE #define FLOAT_COERCE "integer argument expected, got float" #endif /* Compatibility with Py2.5 and older */ #ifndef Py_TYPE # define Py_TYPE(o) ((o)->ob_type) #endif #ifndef PyVarObject_HEAD_INIT #define PyVarObject_HEAD_INIT(type, size) \ PyObject_HEAD_INIT(type) size, #endif #ifndef SIZEOF_SIZE_T #define SIZEOF_SIZE_T sizeof(size_t) #endif #ifndef PY_SSIZE_T_MAX #define PY_SSIZE_T_MAX LONG_MAX #endif /* The translation function for each format character is table driven */ typedef struct _formatdef { char format; Py_ssize_t size; Py_ssize_t alignment; PyObject* (*unpack)(const char *, const struct _formatdef *); int (*pack)(char *, PyObject *, const struct _formatdef *); } formatdef; typedef struct _formatcode { const struct _formatdef *fmtdef; Py_ssize_t offset; Py_ssize_t size; } formatcode; /* Struct object interface */ typedef struct { PyObject_HEAD Py_ssize_t s_size; Py_ssize_t s_len; formatcode *s_codes; PyObject *s_format; PyObject *weakreflist; /* List of weak references */ } PyStructObject; #define PyStruct_Check(op) PyObject_TypeCheck(op, &PyStructType) #define PyStruct_CheckExact(op) (Py_TYPE(op) == &PyStructType) /* Exception */ static PyObject *StructError; /* Define various structs to figure out the alignments of types */ typedef struct { char c; short x; } st_short; typedef struct { char c; int x; } st_int; typedef struct { char c; long x; } st_long; typedef struct { char c; float x; } st_float; typedef struct { char c; double x; } st_double; typedef struct { char c; void *x; } st_void_p; #define SHORT_ALIGN (sizeof(st_short) - sizeof(short)) #define INT_ALIGN (sizeof(st_int) - sizeof(int)) #define LONG_ALIGN (sizeof(st_long) - sizeof(long)) #define FLOAT_ALIGN (sizeof(st_float) - sizeof(float)) #define DOUBLE_ALIGN (sizeof(st_double) - sizeof(double)) #define VOID_P_ALIGN (sizeof(st_void_p) - sizeof(void *)) /* We can't support q and Q in native mode unless the compiler does; in std mode, they're 8 bytes on all platforms. */ #ifdef HAVE_LONG_LONG typedef struct { char c; PY_LONG_LONG x; } s_long_long; #define LONG_LONG_ALIGN (sizeof(s_long_long) - sizeof(PY_LONG_LONG)) #endif #define BOOL_TYPE bool typedef struct { char c; bool x; } s_bool; #define BOOL_ALIGN (sizeof(s_bool) - sizeof(BOOL_TYPE)) #define STRINGIFY(x) #x #ifdef __powerc #pragma options align=reset #endif static char *integer_codes = "bBhHiIlLqQ"; static void s_dealloc(PyStructObject *s); static int s_init(PyObject *self, PyObject *args, PyObject *kwds); static PyObject *s_new(PyTypeObject *type, PyObject *args, PyObject *kwds); static PyObject *s_pack(PyObject *self, PyObject *args); static PyObject *s_pack_into(PyObject *self, PyObject *args); static PyObject *s_unpack(PyObject *self, PyObject *inputstr); static PyObject *s_unpack_from(PyObject *self, PyObject *args, PyObject *kwds); static PyObject *s_get_format(PyStructObject *self, void *unused); static PyObject *s_get_size(PyStructObject *self, void *unused); PyDoc_STRVAR(s__doc__, "Compiled struct object"); /* List of functions */ PyDoc_STRVAR(s_pack__doc__, "S.pack(v1, v2, ...) -> string\n\ \n\ Return a string containing values v1, v2, ... packed according to this\n\ Struct's format. See struct.__doc__ for more on format strings."); PyDoc_STRVAR(s_pack_into__doc__, "S.pack_into(buffer, offset, v1, v2, ...)\n\ \n\ Pack the values v1, v2, ... according to this Struct's format, write \n\ the packed bytes into the writable buffer buf starting at offset. 
Note\n\ that the offset is not an optional argument. See struct.__doc__ for \n\ more on format strings."); PyDoc_STRVAR(s_unpack__doc__, "S.unpack(str) -> (v1, v2, ...)\n\ \n\ Return tuple containing values unpacked according to this Struct's format.\n\ Requires len(str) == self.size. See struct.__doc__ for more on format\n\ strings."); PyDoc_STRVAR(s_unpack_from__doc__, "S.unpack_from(buffer[, offset]) -> (v1, v2, ...)\n\ \n\ Return tuple containing values unpacked according to this Struct's format.\n\ Unlike unpack, unpack_from can unpack values from any object supporting\n\ the buffer API, not just str. Requires len(buffer[offset:]) >= self.size.\n\ See struct.__doc__ for more on format strings."); static struct PyMethodDef s_methods[] = { {"pack", s_pack, METH_VARARGS, s_pack__doc__}, {"pack_into", s_pack_into, METH_VARARGS, s_pack_into__doc__}, {"unpack", s_unpack, METH_O, s_unpack__doc__}, {"unpack_from", (PyCFunction)s_unpack_from, METH_VARARGS|METH_KEYWORDS, s_unpack_from__doc__}, {NULL, NULL} /* sentinel */ }; #define OFF(x) offsetof(PyStructObject, x) static PyGetSetDef s_getsetlist[] = { {"format", (getter)s_get_format, (setter)NULL, "struct format string", NULL}, {"size", (getter)s_get_size, (setter)NULL, "struct size in bytes", NULL}, {NULL} /* sentinel */ }; static PyTypeObject PyStructType = { PyVarObject_HEAD_INIT(NULL, 0) "Struct", sizeof(PyStructObject), 0, (destructor)s_dealloc, /* tp_dealloc */ 0, /* tp_print */ 0, /* tp_getattr */ 0, /* tp_setattr */ 0, /* tp_compare */ 0, /* tp_repr */ 0, /* tp_as_number */ 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ 0, /* tp_hash */ 0, /* tp_call */ 0, /* tp_str */ PyObject_GenericGetAttr, /* tp_getattro */ PyObject_GenericSetAttr, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_WEAKREFS,/* tp_flags */ s__doc__, /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ offsetof(PyStructObject, weakreflist), /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ s_methods, /* tp_methods */ NULL, /* tp_members */ s_getsetlist, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ s_init, /* tp_init */ PyType_GenericAlloc,/* tp_alloc */ s_new, /* tp_new */ PyObject_Del, /* tp_free */ }; /* Helper to get a PyLongObject by hook or by crook. Caller should decref. */ static PyObject * get_pylong(PyObject *v) { PyNumberMethods *m; assert(v != NULL); if (PyInt_Check(v)) return PyLong_FromLong(PyInt_AS_LONG(v)); if (PyLong_Check(v)) { Py_INCREF(v); return v; } m = Py_TYPE(v)->tp_as_number; if (m != NULL && m->nb_long != NULL) { v = m->nb_long(v); if (v == NULL) return NULL; if (PyLong_Check(v)) return v; Py_DECREF(v); } PyErr_SetString(StructError, "cannot convert argument to long"); return NULL; } /* Helper to convert a Python object to a C long. Sets an exception (struct.error for an inconvertible type, OverflowError for out-of-range values) and returns -1 on error. 
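   Unlike the Python 3 variant in _pvt_struct_v3.cpp above, this helper does
   not rewrite the OverflowError into struct.error; it simply propagates
   whatever PyLong_AsLong raised.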
*/ static int get_long(PyObject *v, long *p) { long x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsLong(v); Py_DECREF(v); if (x == (long)-1 && PyErr_Occurred()) return -1; *p = x; return 0; } /* Same, but handling unsigned long */ static int get_ulong(PyObject *v, unsigned long *p) { unsigned long x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsUnsignedLong(v); Py_DECREF(v); if (x == (unsigned long)-1 && PyErr_Occurred()) return -1; *p = x; return 0; } #ifdef HAVE_LONG_LONG /* Same, but handling native long long. */ static int get_longlong(PyObject *v, PY_LONG_LONG *p) { PY_LONG_LONG x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsLongLong(v); Py_DECREF(v); if (x == (PY_LONG_LONG)-1 && PyErr_Occurred()) return -1; *p = x; return 0; } /* Same, but handling native unsigned long long. */ static int get_ulonglong(PyObject *v, unsigned PY_LONG_LONG *p) { unsigned PY_LONG_LONG x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsUnsignedLongLong(v); Py_DECREF(v); if (x == (unsigned PY_LONG_LONG)-1 && PyErr_Occurred()) return -1; *p = x; return 0; } #endif #if (SIZEOF_LONG > SIZEOF_INT) /* Helper to format the range error exceptions */ static int _range_error(const formatdef *f, int is_unsigned) { /* ulargest is the largest unsigned value with f->size bytes. * Note that the simpler: * ((size_t)1 << (f->size * 8)) - 1 * doesn't work when f->size == sizeof(size_t) because C doesn't * define what happens when a left shift count is >= the number of * bits in the integer being shifted; e.g., on some boxes it doesn't * shift at all when they're equal. */ const size_t ulargest = (size_t)-1 >> ((SIZEOF_SIZE_T - f->size)*8); assert(f->size >= 1 && f->size <= SIZEOF_SIZE_T); if (is_unsigned) PyErr_Format(StructError, "'%c' format requires 0 <= number <= %zu", f->format, ulargest); else { const Py_ssize_t largest = (Py_ssize_t)(ulargest >> 1); PyErr_Format(StructError, "'%c' format requires %zd <= number <= %zd", f->format, ~ largest, largest); } return -1; } #endif /* A large number of small routines follow, with names of the form [bln][up]_TYPE [bln] distiguishes among big-endian, little-endian and native. [pu] distiguishes between pack (to struct) and unpack (from struct). TYPE is one of char, byte, ubyte, etc. */ /* Native mode routines. ****************************************************/ /* NOTE: In all n[up]_ routines handling types larger than 1 byte, there is *no* guarantee that the p pointer is properly aligned for each type, therefore memcpy is called. An intermediate variable is used to compensate for big-endian architectures. Normally both the intermediate variable and the memcpy call will be skipped by C optimisation in little-endian architectures (gcc >= 2.91 does this). 
*/ static PyObject * nu_char(const char *p, const formatdef *f) { return PyString_FromStringAndSize(p, 1); } static PyObject * nu_byte(const char *p, const formatdef *f) { return PyInt_FromLong((long) *(signed char *)p); } static PyObject * nu_ubyte(const char *p, const formatdef *f) { return PyInt_FromLong((long) *(unsigned char *)p); } static PyObject * nu_short(const char *p, const formatdef *f) { short x; memcpy((char *)&x, p, sizeof x); return PyInt_FromLong((long)x); } static PyObject * nu_ushort(const char *p, const formatdef *f) { unsigned short x; memcpy((char *)&x, p, sizeof x); return PyInt_FromLong((long)x); } static PyObject * nu_int(const char *p, const formatdef *f) { int x; memcpy((char *)&x, p, sizeof x); return PyInt_FromLong((long)x); } static PyObject * nu_uint(const char *p, const formatdef *f) { unsigned int x; memcpy((char *)&x, p, sizeof x); #if (SIZEOF_LONG > SIZEOF_INT) return PyInt_FromLong((long)x); #else if (x <= ((unsigned int)LONG_MAX)) return PyInt_FromLong((long)x); return PyLong_FromUnsignedLong((unsigned long)x); #endif } static PyObject * nu_long(const char *p, const formatdef *f) { long x; memcpy((char *)&x, p, sizeof x); return PyInt_FromLong(x); } static PyObject * nu_ulong(const char *p, const formatdef *f) { unsigned long x; memcpy((char *)&x, p, sizeof x); if (x <= LONG_MAX) return PyInt_FromLong((long)x); return PyLong_FromUnsignedLong(x); } /* Native mode doesn't support q or Q unless the platform C supports long long (or, on Windows, __int64). */ #ifdef HAVE_LONG_LONG static PyObject * nu_longlong(const char *p, const formatdef *f) { PY_LONG_LONG x; memcpy((char *)&x, p, sizeof x); if (x >= LONG_MIN && x <= LONG_MAX) return PyInt_FromLong(Py_SAFE_DOWNCAST(x, PY_LONG_LONG, long)); return PyLong_FromLongLong(x); } static PyObject * nu_ulonglong(const char *p, const formatdef *f) { unsigned PY_LONG_LONG x; memcpy((char *)&x, p, sizeof x); if (x <= LONG_MAX) return PyInt_FromLong(Py_SAFE_DOWNCAST(x, unsigned PY_LONG_LONG, long)); return PyLong_FromUnsignedLongLong(x); } #endif static PyObject * nu_bool(const char *p, const formatdef *f) { BOOL_TYPE x; memcpy((char *)&x, p, sizeof x); return PyBool_FromLong(x != 0); } static PyObject * nu_float(const char *p, const formatdef *f) { float x; memcpy((char *)&x, p, sizeof x); return PyFloat_FromDouble((double)x); } static PyObject * nu_double(const char *p, const formatdef *f) { double x; memcpy((char *)&x, p, sizeof x); return PyFloat_FromDouble(x); } static PyObject * nu_complex_float(const char *p, const formatdef *f) { float re, im; memcpy((char *)&re, p, sizeof re); memcpy((char *)&im, p+sizeof re, sizeof im); return PyComplex_FromDoubles((double)re, (double) im); } static PyObject * nu_complex_double(const char *p, const formatdef *f) { double re, im; memcpy((char *)&re, p, sizeof re); memcpy((char *)&im, p+sizeof re, sizeof im); return PyComplex_FromDoubles(re, im); } static PyObject * nu_void_p(const char *p, const formatdef *f) { void *x; memcpy((char *)&x, p, sizeof x); return PyLong_FromVoidPtr(x); } static int np_byte(char *p, PyObject *v, const formatdef *f) { long x; if (get_long(v, &x) < 0) return -1; if (x < -128 || x > 127){ PyErr_SetString(StructError, "byte format requires -128 <= number <= 127"); return -1; } *p = (char)x; return 0; } static int np_ubyte(char *p, PyObject *v, const formatdef *f) { long x; if (get_long(v, &x) < 0) return -1; if (x < 0 || x > 255){ PyErr_SetString(StructError, "ubyte format requires 0 <= number <= 255"); return -1; } *p = (char)x; return 0; } static int 
np_char(char *p, PyObject *v, const formatdef *f) { if (!PyString_Check(v) || PyString_Size(v) != 1) { PyErr_SetString(StructError, "char format require string of length 1"); return -1; } *p = *PyString_AsString(v); return 0; } static int np_short(char *p, PyObject *v, const formatdef *f) { long x; short y; if (get_long(v, &x) < 0) return -1; if (x < SHRT_MIN || x > SHRT_MAX){ PyErr_SetString(StructError, "short format requires " STRINGIFY(SHRT_MIN) " <= number <= " STRINGIFY(SHRT_MAX)); return -1; } y = (short)x; memcpy(p, (char *)&y, sizeof y); return 0; } static int np_ushort(char *p, PyObject *v, const formatdef *f) { long x; unsigned short y; if (get_long(v, &x) < 0) return -1; if (x < 0 || x > USHRT_MAX){ PyErr_SetString(StructError, "ushort format requires 0 <= number <= " STRINGIFY(USHRT_MAX)); return -1; } y = (unsigned short)x; memcpy(p, (char *)&y, sizeof y); return 0; } static int np_int(char *p, PyObject *v, const formatdef *f) { long x; int y; if (get_long(v, &x) < 0) return -1; #if (SIZEOF_LONG > SIZEOF_INT) if ((x < ((long)INT_MIN)) || (x > ((long)INT_MAX))) return _range_error(f, 0); #endif y = (int)x; memcpy(p, (char *)&y, sizeof y); return 0; } static int np_uint(char *p, PyObject *v, const formatdef *f) { unsigned long x; unsigned int y; if (get_ulong(v, &x) < 0) return -1; y = (unsigned int)x; #if (SIZEOF_LONG > SIZEOF_INT) if (x > ((unsigned long)UINT_MAX)) return _range_error(f, 1); #endif memcpy(p, (char *)&y, sizeof y); return 0; } static int np_long(char *p, PyObject *v, const formatdef *f) { long x; if (get_long(v, &x) < 0) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } static int np_ulong(char *p, PyObject *v, const formatdef *f) { unsigned long x; if (get_ulong(v, &x) < 0) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } #ifdef HAVE_LONG_LONG static int np_longlong(char *p, PyObject *v, const formatdef *f) { PY_LONG_LONG x; if (get_longlong(v, &x) < 0) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } static int np_ulonglong(char *p, PyObject *v, const formatdef *f) { unsigned PY_LONG_LONG x; if (get_ulonglong(v, &x) < 0) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } #endif static int np_bool(char *p, PyObject *v, const formatdef *f) { BOOL_TYPE y; y = PyObject_IsTrue(v) != 0; memcpy(p, (char *)&y, sizeof y); return 0; } static int np_float(char *p, PyObject *v, const formatdef *f) { float x = (float)PyFloat_AsDouble(v); if (x == -1 && PyErr_Occurred()) { PyErr_SetString(StructError, "required argument is not a float"); return -1; } memcpy(p, (char *)&x, sizeof x); return 0; } static int np_double(char *p, PyObject *v, const formatdef *f) { double x = PyFloat_AsDouble(v); if (x == -1 && PyErr_Occurred()) { PyErr_SetString(StructError, "required argument is not a float"); return -1; } memcpy(p, (char *)&x, sizeof(double)); return 0; } static int np_complex_float(char *p, PyObject *v, const formatdef *f) { if (PyArray_IsZeroDim(v)) { PyObject *v_cast = PyArray_Cast( reinterpret_cast(v), NPY_CFLOAT); if (!v_cast) return -1; memcpy(p, PyArray_DATA(v_cast), PyArray_NBYTES(v_cast)); Py_DECREF(v_cast); } else { float re = 0.0f; float im = 0.0f; Py_complex cplx; #if (PY_VERSION_HEX < 0x02060000) if (PyComplex_Check(v)) cplx = PyComplex_AsCComplex(v); else if (PyObject_HasAttrString(v, "__complex__")) { PyObject *v2 = PyObject_CallMethod(v, "__complex__", ""); cplx = PyComplex_AsCComplex(v2); Py_DECREF(v2); } else cplx = PyComplex_AsCComplex(v); #else cplx = PyComplex_AsCComplex(v); #endif if (PyErr_Occurred()) { 
PyErr_SetString(StructError, "required argument is not a complex"); return -1; } re = (float)cplx.real; im = (float)cplx.imag; memcpy(p, (char *)&re, sizeof re); memcpy(p+sizeof re, (char *)&im, sizeof im); } return 0; } static int np_complex_double(char *p, PyObject *v, const formatdef *f) { if (PyArray_IsZeroDim(v)) { PyObject *v_cast = PyArray_Cast( reinterpret_cast(v), NPY_CDOUBLE); if (!v_cast) return -1; memcpy(p, PyArray_DATA(v_cast), PyArray_NBYTES(v_cast)); Py_DECREF(v_cast); } else { double re = 0.0; double im = 0.0; Py_complex cplx; #if (PY_VERSION_HEX < 0x02060000) if (PyComplex_Check(v)) cplx = PyComplex_AsCComplex(v); else if (PyObject_HasAttrString(v, "__complex__")) { PyObject *v2 = PyObject_CallMethod(v, "__complex__", ""); cplx = PyComplex_AsCComplex(v2); Py_DECREF(v2); } else cplx = PyComplex_AsCComplex(v); #else cplx = PyComplex_AsCComplex(v); #endif if (PyErr_Occurred()) { PyErr_SetString(StructError, "required argument is not a complex"); return -1; } re = cplx.real; im = cplx.imag; memcpy(p, (char *)&re, sizeof re); memcpy(p+sizeof re, (char *)&im, sizeof im); } return 0; } static int np_void_p(char *p, PyObject *v, const formatdef *f) { void *x; v = get_pylong(v); if (v == NULL) return -1; assert(PyLong_Check(v)); x = PyLong_AsVoidPtr(v); Py_DECREF(v); if (x == NULL && PyErr_Occurred()) return -1; memcpy(p, (char *)&x, sizeof x); return 0; } static formatdef native_table[] = { {'x', sizeof(char), 0, NULL}, {'b', sizeof(char), 0, nu_byte, np_byte}, {'B', sizeof(char), 0, nu_ubyte, np_ubyte}, {'c', sizeof(char), 0, nu_char, np_char}, {'s', sizeof(char), 0, NULL}, {'p', sizeof(char), 0, NULL}, {'h', sizeof(short), SHORT_ALIGN, nu_short, np_short}, {'H', sizeof(short), SHORT_ALIGN, nu_ushort, np_ushort}, {'i', sizeof(int), INT_ALIGN, nu_int, np_int}, {'I', sizeof(int), INT_ALIGN, nu_uint, np_uint}, {'l', sizeof(long), LONG_ALIGN, nu_long, np_long}, {'L', sizeof(long), LONG_ALIGN, nu_ulong, np_ulong}, #ifdef HAVE_LONG_LONG {'q', sizeof(PY_LONG_LONG), LONG_LONG_ALIGN, nu_longlong, np_longlong}, {'Q', sizeof(PY_LONG_LONG), LONG_LONG_ALIGN, nu_ulonglong,np_ulonglong}, #endif {'?', sizeof(BOOL_TYPE), BOOL_ALIGN, nu_bool, np_bool}, {'f', sizeof(float), FLOAT_ALIGN, nu_float, np_float}, {'d', sizeof(double), DOUBLE_ALIGN, nu_double, np_double}, {'F', 2*sizeof(float), FLOAT_ALIGN, nu_complex_float, np_complex_float}, {'D', 2*sizeof(double), DOUBLE_ALIGN, nu_complex_double, np_complex_double}, {'P', sizeof(void *), VOID_P_ALIGN, nu_void_p, np_void_p}, {0} }; /* Get the table entry for a format code */ static const formatdef * getentry(int c, const formatdef *f) { for (; f->format != '\0'; f++) { if (f->format == c) { return f; } } PyErr_SetString(StructError, "bad char in struct format"); return NULL; } /* Align a size according to a format code */ static Py_ssize_t align(Py_ssize_t size, char c, const formatdef *e) { if (e->format == c) { if (e->alignment) { size = ((size + e->alignment - 1) / e->alignment) * e->alignment; } } return size; } /* calculate the size of a format string */ static int prepare_s(PyStructObject *self) { const formatdef *f; const formatdef *e; formatcode *codes; const char *s; const char *fmt; char c; Py_ssize_t size, len, num, itemsize, x; fmt = PyString_AS_STRING(self->s_format); f = native_table; s = fmt; size = 0; len = 0; while ((c = *s++) != '\0') { if (isspace(Py_CHARMASK(c))) continue; if ('0' <= c && c <= '9') { num = c - '0'; while ('0' <= (c = *s++) && c <= '9') { x = num*10 + (c - '0'); if (x/10 != num) { PyErr_SetString( StructError, 
"overflow in item count"); return -1; } num = x; } if (c == '\0') break; } else num = 1; e = getentry(c, f); if (e == NULL) return -1; switch (c) { case 's': /* fall through */ case 'p': len++; break; case 'x': break; default: len += num; break; } itemsize = e->size; size = align(size, c, e); x = num * itemsize; size += x; if (x/itemsize != num || size < 0) { PyErr_SetString(StructError, "total struct size too long"); return -1; } } /* check for overflow */ if ((len + 1) > (PY_SSIZE_T_MAX / sizeof(formatcode))) { PyErr_NoMemory(); return -1; } self->s_size = size; self->s_len = len; codes = (formatcode *) PyMem_MALLOC((len + 1) * sizeof(formatcode)); if (codes == NULL) { PyErr_NoMemory(); return -1; } self->s_codes = codes; s = fmt; size = 0; while ((c = *s++) != '\0') { if (isspace(Py_CHARMASK(c))) continue; if ('0' <= c && c <= '9') { num = c - '0'; while ('0' <= (c = *s++) && c <= '9') num = num*10 + (c - '0'); if (c == '\0') break; } else num = 1; e = getentry(c, f); size = align(size, c, e); if (c == 's' || c == 'p') { codes->offset = size; codes->size = num; codes->fmtdef = e; codes++; size += num; } else if (c == 'x') { size += num; } else { while (--num >= 0) { codes->offset = size; codes->size = e->size; codes->fmtdef = e; codes++; size += e->size; } } } codes->fmtdef = NULL; codes->offset = size; codes->size = 0; return 0; } static PyObject * s_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { PyObject *self; assert(type != NULL && type->tp_alloc != NULL); self = type->tp_alloc(type, 0); if (self != NULL) { PyStructObject *s = (PyStructObject*)self; Py_INCREF(Py_None); s->s_format = Py_None; s->s_codes = NULL; s->s_size = -1; s->s_len = -1; } return self; } static int s_init(PyObject *self, PyObject *args, PyObject *kwds) { PyStructObject *soself = (PyStructObject *)self; PyObject *o_format = NULL; int ret = 0; static char *kwlist[] = {"format", 0}; assert(PyStruct_Check(self)); if (!PyArg_ParseTupleAndKeywords(args, kwds, "S:Struct", kwlist, &o_format)) return -1; Py_INCREF(o_format); Py_CLEAR(soself->s_format); soself->s_format = o_format; ret = prepare_s(soself); return ret; } static void s_dealloc(PyStructObject *s) { if (s->weakreflist != NULL) PyObject_ClearWeakRefs((PyObject *)s); if (s->s_codes != NULL) { PyMem_FREE(s->s_codes); } Py_XDECREF(s->s_format); Py_TYPE(s)->tp_free((PyObject *)s); } static PyObject * s_unpack_internal(PyStructObject *soself, char *startfrom) { formatcode *code; Py_ssize_t i = 0; PyObject *result = PyTuple_New(soself->s_len); if (result == NULL) return NULL; for (code = soself->s_codes; code->fmtdef != NULL; code++) { PyObject *v; const formatdef *e = code->fmtdef; const char *res = startfrom + code->offset; if (e->format == 's') { v = PyString_FromStringAndSize(res, code->size); } else if (e->format == 'p') { Py_ssize_t n = *(unsigned char*)res; if (n >= code->size) n = code->size - 1; v = PyString_FromStringAndSize(res + 1, n); } else { v = e->unpack(res, e); } if (v == NULL) goto fail; PyTuple_SET_ITEM(result, i++, v); } return result; fail: Py_DECREF(result); return NULL; } static PyObject * s_unpack(PyObject *self, PyObject *inputstr) { char *start; Py_ssize_t len; PyObject *args=NULL, *result; PyStructObject *soself = (PyStructObject *)self; assert(PyStruct_Check(self)); assert(soself->s_codes != NULL); if (inputstr == NULL) goto fail; if (PyString_Check(inputstr) && PyString_GET_SIZE(inputstr) == soself->s_size) { return s_unpack_internal(soself, PyString_AS_STRING(inputstr)); } args = PyTuple_Pack(1, inputstr); if (args == NULL) 
return NULL; if (!PyArg_ParseTuple(args, "s#:unpack", &start, &len)) goto fail; if (soself->s_size != len) goto fail; result = s_unpack_internal(soself, start); Py_DECREF(args); return result; fail: Py_XDECREF(args); PyErr_Format(StructError, "unpack requires a string argument of length %zd", soself->s_size); return NULL; } static PyObject * s_unpack_from(PyObject *self, PyObject *args, PyObject *kwds) { static char *kwlist[] = {"buffer", "offset", 0}; #if (PY_VERSION_HEX < 0x02050000) static char *fmt = "z#|i:unpack_from"; #else static char *fmt = "z#|n:unpack_from"; #endif Py_ssize_t buffer_len = 0, offset = 0; char *buffer = NULL; PyStructObject *soself = (PyStructObject *)self; assert(PyStruct_Check(self)); assert(soself->s_codes != NULL); if (!PyArg_ParseTupleAndKeywords(args, kwds, fmt, kwlist, &buffer, &buffer_len, &offset)) return NULL; if (buffer == NULL) { PyErr_Format(StructError, "unpack_from requires a buffer argument"); return NULL; } if (offset < 0) offset += buffer_len; if (offset < 0 || (buffer_len - offset) < soself->s_size) { PyErr_Format(StructError, "unpack_from requires a buffer of at least %zd bytes", soself->s_size); return NULL; } return s_unpack_internal(soself, buffer + offset); } /* * Guts of the pack function. * * Takes a struct object, a tuple of arguments, and offset in that tuple of * argument for where to start processing the arguments for packing, and a * character buffer for writing the packed string. The caller must insure * that the buffer may contain the required length for packing the arguments. * 0 is returned on success, 1 is returned if there is an error. * */ static int s_pack_internal(PyStructObject *soself, PyObject *args, int offset, char* buf) { formatcode *code; /* XXX(nnorwitz): why does i need to be a local? can we use the offset parameter or do we need the wider width? */ Py_ssize_t i; memset(buf, '\0', soself->s_size); i = offset; for (code = soself->s_codes; code->fmtdef != NULL; code++) { Py_ssize_t n; PyObject *v = PyTuple_GET_ITEM(args, i++); const formatdef *e = code->fmtdef; char *res = buf + code->offset; if (e->format == 's') { if (!PyString_Check(v)) { if (!PyObject_CheckReadBuffer(v)) { PyErr_SetString(StructError, "argument for 's' must " "be a string or a buffer"); return -1; } else { const void *buf; Py_ssize_t len; if (PyObject_AsReadBuffer(v, &buf, &len)) return -1; if (len > code->size) len = code->size; if (len > 0) memcpy(res, buf, len); } } else { n = PyString_GET_SIZE(v); if (n > code->size) n = code->size; if (n > 0) memcpy(res, PyString_AS_STRING(v), n); } } else if (e->format == 'p') { if (!PyString_Check(v)) { PyErr_SetString(StructError, "argument for 'p' must " "be a string"); return -1; } n = PyString_GET_SIZE(v); if (n > (code->size - 1)) n = code->size - 1; if (n > 0) memcpy(res + 1, PyString_AS_STRING(v), n); if (n > 255) n = 255; *res = Py_SAFE_DOWNCAST(n, Py_ssize_t, unsigned char); } else if (e->pack(res, v, e) < 0) { if (strchr(integer_codes, e->format) != NULL && PyErr_ExceptionMatches(PyExc_OverflowError)) PyErr_Format(StructError, "integer out of range for " "'%c' format code", e->format); return -1; } } /* Success */ return 0; } static PyObject * s_pack(PyObject *self, PyObject *args) { PyStructObject *soself; PyObject *result; /* Validate arguments. 
*/ soself = (PyStructObject *)self; assert(PyStruct_Check(self)); assert(soself->s_codes != NULL); if (PyTuple_GET_SIZE(args) != soself->s_len) { PyErr_Format(StructError, "pack requires exactly %zd arguments", soself->s_len); return NULL; } /* Allocate a new string */ result = PyString_FromStringAndSize((char *)NULL, soself->s_size); if (result == NULL) return NULL; /* Call the guts */ if ( s_pack_internal(soself, args, 0, PyString_AS_STRING(result)) != 0 ) { Py_DECREF(result); return NULL; } return result; } static PyObject * s_pack_into(PyObject *self, PyObject *args) { PyStructObject *soself; char *buffer; Py_ssize_t buffer_len, offset; /* Validate arguments. +1 is for the first arg as buffer. */ soself = (PyStructObject *)self; assert(PyStruct_Check(self)); assert(soself->s_codes != NULL); if (PyTuple_GET_SIZE(args) != (soself->s_len + 2)) { PyErr_Format(StructError, "pack_into requires exactly %zd arguments", (soself->s_len + 2)); return NULL; } /* Extract a writable memory buffer from the first argument */ if ( PyObject_AsWriteBuffer(PyTuple_GET_ITEM(args, 0), (void**)&buffer, &buffer_len) == -1 ) { return NULL; } assert( buffer_len >= 0 ); /* Extract the offset from the first argument */ offset = PyInt_AsSsize_t(PyTuple_GET_ITEM(args, 1)); if (offset == -1 && PyErr_Occurred()) return NULL; /* Support negative offsets. */ if (offset < 0) offset += buffer_len; /* Check boundaries */ if (offset < 0 || (buffer_len - offset) < soself->s_size) { PyErr_Format(StructError, "pack_into requires a buffer of at least %zd bytes", soself->s_size); return NULL; } /* Call the guts */ if ( s_pack_internal(soself, args, 2, buffer + offset) != 0 ) { return NULL; } Py_RETURN_NONE; } static PyObject * s_get_format(PyStructObject *self, void *unused) { Py_INCREF(self->s_format); return self->s_format; } static PyObject * s_get_size(PyStructObject *self, void *unused) { return PyInt_FromSsize_t(self->s_size); } /* ---- Standalone functions ---- */ #define MAXCACHE 100 static PyObject *cache = NULL; static PyObject * cache_struct(PyObject *fmt) { PyObject * s_object; if (cache == NULL) { cache = PyDict_New(); if (cache == NULL) return NULL; } s_object = PyDict_GetItem(cache, fmt); if (s_object != NULL) { Py_INCREF(s_object); return s_object; } s_object = PyObject_CallFunctionObjArgs((PyObject *)(&PyStructType), fmt, NULL); if (s_object != NULL) { if (PyDict_Size(cache) >= MAXCACHE) PyDict_Clear(cache); /* Attempt to cache the result */ if (PyDict_SetItem(cache, fmt, s_object) == -1) PyErr_Clear(); } return s_object; } PyDoc_STRVAR(clearcache_doc, "Clear the internal cache."); static PyObject * clearcache(PyObject *self) { Py_CLEAR(cache); Py_RETURN_NONE; } PyDoc_STRVAR(calcsize_doc, "Return size of C struct described by format string fmt."); static PyObject * calcsize(PyObject *self, PyObject *fmt) { Py_ssize_t n; PyObject *s_object = cache_struct(fmt); if (s_object == NULL) return NULL; n = ((PyStructObject *)s_object)->s_size; Py_DECREF(s_object); return PyInt_FromSsize_t(n); } PyDoc_STRVAR(pack_doc, "Return string containing values v1, v2, ... 
packed according to fmt."); static PyObject * pack(PyObject *self, PyObject *args) { PyObject *s_object, *fmt, *newargs, *result; Py_ssize_t n = PyTuple_GET_SIZE(args); if (n == 0) { PyErr_SetString(PyExc_TypeError, "missing format argument"); return NULL; } fmt = PyTuple_GET_ITEM(args, 0); newargs = PyTuple_GetSlice(args, 1, n); if (newargs == NULL) return NULL; s_object = cache_struct(fmt); if (s_object == NULL) { Py_DECREF(newargs); return NULL; } result = s_pack(s_object, newargs); Py_DECREF(newargs); Py_DECREF(s_object); return result; } PyDoc_STRVAR(pack_into_doc, "Pack the values v1, v2, ... according to fmt.\n\ Write the packed bytes into the writable buffer buf starting at offset."); static PyObject * pack_into(PyObject *self, PyObject *args) { PyObject *s_object, *fmt, *newargs, *result; Py_ssize_t n = PyTuple_GET_SIZE(args); if (n == 0) { PyErr_SetString(PyExc_TypeError, "missing format argument"); return NULL; } fmt = PyTuple_GET_ITEM(args, 0); newargs = PyTuple_GetSlice(args, 1, n); if (newargs == NULL) return NULL; s_object = cache_struct(fmt); if (s_object == NULL) { Py_DECREF(newargs); return NULL; } result = s_pack_into(s_object, newargs); Py_DECREF(newargs); Py_DECREF(s_object); return result; } PyDoc_STRVAR(unpack_doc, "Unpack the string containing packed C structure data, according to fmt.\n\ Requires len(string) == calcsize(fmt)."); static PyObject * unpack(PyObject *self, PyObject *args) { PyObject *s_object, *fmt, *inputstr, *result; if (!PyArg_UnpackTuple(args, "unpack", 2, 2, &fmt, &inputstr)) return NULL; s_object = cache_struct(fmt); if (s_object == NULL) return NULL; result = s_unpack(s_object, inputstr); Py_DECREF(s_object); return result; } PyDoc_STRVAR(unpack_from_doc, "Unpack the buffer, containing packed C structure data, according to\n\ fmt, starting at offset. Requires len(buffer[offset:]) >= calcsize(fmt)."); static PyObject * unpack_from(PyObject *self, PyObject *args, PyObject *kwds) { PyObject *s_object, *fmt, *newargs, *result; Py_ssize_t n = PyTuple_GET_SIZE(args); if (n == 0) { PyErr_SetString(PyExc_TypeError, "missing format argument"); return NULL; } fmt = PyTuple_GET_ITEM(args, 0); newargs = PyTuple_GetSlice(args, 1, n); if (newargs == NULL) return NULL; s_object = cache_struct(fmt); if (s_object == NULL) { Py_DECREF(newargs); return NULL; } result = s_unpack_from(s_object, newargs, kwds); Py_DECREF(newargs); Py_DECREF(s_object); return result; } static struct PyMethodDef module_functions[] = { {"_clearcache", (PyCFunction)clearcache, METH_NOARGS, clearcache_doc}, {"calcsize", calcsize, METH_O, calcsize_doc}, {"pack", pack, METH_VARARGS, pack_doc}, {"pack_into", pack_into, METH_VARARGS, pack_into_doc}, {"unpack", unpack, METH_VARARGS, unpack_doc}, {"unpack_from", (PyCFunction)unpack_from, METH_VARARGS|METH_KEYWORDS, unpack_from_doc}, {NULL, NULL} /* sentinel */ }; /* Module initialization */ PyDoc_STRVAR(module_doc, "Functions to convert between Python values and C structs represented\n\ as Python strings. 
It uses format strings (explained below) as compact\n\ descriptions of the lay-out of the C structs and the intended conversion\n\ to/from Python values.\n\ \n\ The remaining chars indicate types of args and must match exactly;\n\ these can be preceded by a decimal repeat count:\n\ x: pad byte (no data); c:char; b:signed byte; B:unsigned byte;\n\ ?: _Bool (requires C99; if not available, char is used instead)\n\ h:short; H:unsigned short; i:int; I:unsigned int;\n\ l:long; L:unsigned long; f:float; d:double.\n\ Special cases (preceding decimal count indicates length):\n\ s:string (array of char); p: pascal string (with count byte).\n\ Special case (only available in native format):\n\ P:an integer type that is wide enough to hold a pointer.\n\ Special case (not in native mode unless 'long long' in platform C):\n\ q:long long; Q:unsigned long long\n\ Whitespace between formats is ignored.\n\ \n\ The variable struct.error is an exception raised on errors.\n"); PyMODINIT_FUNC init_pvt_struct(void) { PyObject *ver, *m; ver = PyString_FromString("0.2"); if (ver == NULL) return; m = Py_InitModule3("_pvt_struct", module_functions, module_doc); if (m == NULL) return; Py_TYPE(&PyStructType) = &PyType_Type; if (PyType_Ready(&PyStructType) < 0) return; /* This speed trick can't be used until overflow masking goes away, because native endian always raises exceptions instead of overflow masking. */ /* Add some symbolic constants to the module */ if (StructError == NULL) { StructError = PyErr_NewException("pycuda._pvt_struct.error", NULL, NULL); if (StructError == NULL) return; } Py_INCREF(StructError); PyModule_AddObject(m, "error", StructError); Py_INCREF((PyObject*)&PyStructType); PyModule_AddObject(m, "Struct", (PyObject*)&PyStructType); PyModule_AddObject(m, "__version__", ver); PyModule_AddIntConstant(m, "_PY_STRUCT_RANGE_CHECKING", 1); #ifdef PY_STRUCT_FLOAT_COERCE PyModule_AddIntConstant(m, "_PY_STRUCT_FLOAT_COERCE", 1); #endif } // vim: noexpandtab:sw=8 pycuda-2013.1.1+git20140310/src/wrapper/wrap_cudagl.cpp0000644000175000000500000000460012313360364020572 0ustar tomussrc#ifdef _WIN32 #include #endif #include #include #include "tools.hpp" #include "wrap_helpers.hpp" using namespace pycuda; using namespace pycuda::gl; using boost::shared_ptr; void pycuda_expose_gl() { using py::arg; using py::args; py::def("make_gl_context", make_gl_context, (arg("dev"), arg("flags")=0)); // {{{ new-style #if CUDAPP_CUDA_VERSION >= 3000 py::enum_("graphics_map_flags") .value("NONE", CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE) .value("READ_ONLY", CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY) .value("WRITE_DISCARD", CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD) ; { typedef registered_object cl; py::class_ >("RegisteredObject", py::no_init) .DEF_SIMPLE_METHOD(gl_handle) .DEF_SIMPLE_METHOD(unregister) .def("map", map_registered_object, (arg("robj"), arg("stream")=py::object()), py::return_value_policy()) ; } { typedef registered_buffer cl; py::class_, py::bases >( "RegisteredBuffer", py::init >()) ; } { typedef registered_image cl; py::class_, py::bases >( "RegisteredImage", py::init >()) ; } { typedef registered_mapping cl; py::class_("RegisteredMapping", py::no_init) .def("unmap", &cl::unmap_no_strm) .def("unmap", &cl::unmap) .DEF_SIMPLE_METHOD(device_ptr_and_size) .def("array", &cl::array, (py::args("self", "index", "level")), py::return_value_policy()) ; } #endif // }}} // {{{ old-style DEF_SIMPLE_FUNCTION(gl_init); { typedef buffer_object cl; py::class_ >("BufferObject", py::init()) .DEF_SIMPLE_METHOD(handle) 
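      // handle() exposes the OpenGL buffer id this object was registered
      // with; together with map() and unregister() this is the legacy
      // (pre-CUDA 3.0) GL interop path retained alongside the new-style
      // RegisteredBuffer/RegisteredImage classes above.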
.DEF_SIMPLE_METHOD(unregister) .def("map", map_buffer_object, py::return_value_policy()) ; } { typedef buffer_object_mapping cl; py::class_("BufferObjectMapping", py::no_init) .DEF_SIMPLE_METHOD(unmap) .DEF_SIMPLE_METHOD(device_ptr) .DEF_SIMPLE_METHOD(size) ; } // }}} } // vim: foldmethod=marker pycuda-2013.1.1+git20140310/Makefile.in0000644000175000000500000000050312313360364015372 0ustar tomussrc.PHONY : all install clean tags dist userdoc devdoc all: tags ${PYTHON_EXE} setup.py build dist: ${PYTHON_EXE} setup.py sdist install: tags ${PYTHON_EXE} setup.py install clean: rm -Rf build rm -f tags tags: ctags -R src || true tests: echo "running tests" find ./test -type f -name "*.py" -exec python {} \; pycuda-2013.1.1+git20140310/pycuda/0002755000175000000500000000000012313360364014616 5ustar tomussrcpycuda-2013.1.1+git20140310/pycuda/curandom.py0000644000175000000500000011241512313360364017002 0ustar tomussrcfrom __future__ import division import numpy as np import pycuda.compiler import pycuda.driver as drv import pycuda.gpuarray as array from pytools import memoize_method # {{{ MD5-based random number generation md5_code = """ /* ********************************************************************** ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. ** ** ** ** License to copy and use this software is granted provided that ** ** it is identified as the "RSA Data Security, Inc. MD5 Message ** ** Digest Algorithm" in all material mentioning or referencing this ** ** software or this function. ** ** ** ** License is also granted to make and use derivative works ** ** provided that such works are identified as "derived from the RSA ** ** Data Security, Inc. MD5 Message Digest Algorithm" in all ** ** material mentioning or referencing the derived work. ** ** ** ** RSA Data Security, Inc. makes no representations concerning ** ** either the merchantability of this software or the suitability ** ** of this software for any particular purpose. It is provided "as ** ** is" without express or implied warranty of any kind. ** ** ** ** These notices must be retained in any copies of any part of this ** ** documentation and/or software. 
** ********************************************************************** */ /* F, G and H are basic MD5 functions: selection, majority, parity */ #define F(x, y, z) (((x) & (y)) | ((~x) & (z))) #define G(x, y, z) (((x) & (z)) | ((y) & (~z))) #define H(x, y, z) ((x) ^ (y) ^ (z)) #define I(x, y, z) ((y) ^ ((x) | (~z))) /* ROTATE_LEFT rotates x left n bits */ #define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) /* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4 */ /* Rotation is separate from addition to prevent recomputation */ #define FF(a, b, c, d, x, s, ac) \ {(a) += F ((b), (c), (d)) + (x) + (ac); \ (a) = ROTATE_LEFT ((a), (s)); \ (a) += (b); \ } #define GG(a, b, c, d, x, s, ac) \ {(a) += G ((b), (c), (d)) + (x) + (ac); \ (a) = ROTATE_LEFT ((a), (s)); \ (a) += (b); \ } #define HH(a, b, c, d, x, s, ac) \ {(a) += H ((b), (c), (d)) + (x) + (ac); \ (a) = ROTATE_LEFT ((a), (s)); \ (a) += (b); \ } #define II(a, b, c, d, x, s, ac) \ {(a) += I ((b), (c), (d)) + (x) + (ac); \ (a) = ROTATE_LEFT ((a), (s)); \ (a) += (b); \ } #define X0 threadIdx.x #define X1 threadIdx.y #define X2 threadIdx.z #define X3 blockIdx.x #define X4 blockIdx.y #define X5 blockIdx.z #define X6 seed #define X7 i #define X8 n #define X9 blockDim.x #define X10 blockDim.y #define X11 blockDim.z #define X12 gridDim.x #define X13 gridDim.y #define X14 gridDim.z #define X15 0 unsigned int a = 0x67452301; unsigned int b = 0xefcdab89; unsigned int c = 0x98badcfe; unsigned int d = 0x10325476; /* Round 1 */ #define S11 7 #define S12 12 #define S13 17 #define S14 22 FF ( a, b, c, d, X0 , S11, 3614090360); /* 1 */ FF ( d, a, b, c, X1 , S12, 3905402710); /* 2 */ FF ( c, d, a, b, X2 , S13, 606105819); /* 3 */ FF ( b, c, d, a, X3 , S14, 3250441966); /* 4 */ FF ( a, b, c, d, X4 , S11, 4118548399); /* 5 */ FF ( d, a, b, c, X5 , S12, 1200080426); /* 6 */ FF ( c, d, a, b, X6 , S13, 2821735955); /* 7 */ FF ( b, c, d, a, X7 , S14, 4249261313); /* 8 */ FF ( a, b, c, d, X8 , S11, 1770035416); /* 9 */ FF ( d, a, b, c, X9 , S12, 2336552879); /* 10 */ FF ( c, d, a, b, X10, S13, 4294925233); /* 11 */ FF ( b, c, d, a, X11, S14, 2304563134); /* 12 */ FF ( a, b, c, d, X12, S11, 1804603682); /* 13 */ FF ( d, a, b, c, X13, S12, 4254626195); /* 14 */ FF ( c, d, a, b, X14, S13, 2792965006); /* 15 */ FF ( b, c, d, a, X15, S14, 1236535329); /* 16 */ /* Round 2 */ #define S21 5 #define S22 9 #define S23 14 #define S24 20 GG ( a, b, c, d, X1 , S21, 4129170786); /* 17 */ GG ( d, a, b, c, X6 , S22, 3225465664); /* 18 */ GG ( c, d, a, b, X11, S23, 643717713); /* 19 */ GG ( b, c, d, a, X0 , S24, 3921069994); /* 20 */ GG ( a, b, c, d, X5 , S21, 3593408605); /* 21 */ GG ( d, a, b, c, X10, S22, 38016083); /* 22 */ GG ( c, d, a, b, X15, S23, 3634488961); /* 23 */ GG ( b, c, d, a, X4 , S24, 3889429448); /* 24 */ GG ( a, b, c, d, X9 , S21, 568446438); /* 25 */ GG ( d, a, b, c, X14, S22, 3275163606); /* 26 */ GG ( c, d, a, b, X3 , S23, 4107603335); /* 27 */ GG ( b, c, d, a, X8 , S24, 1163531501); /* 28 */ GG ( a, b, c, d, X13, S21, 2850285829); /* 29 */ GG ( d, a, b, c, X2 , S22, 4243563512); /* 30 */ GG ( c, d, a, b, X7 , S23, 1735328473); /* 31 */ GG ( b, c, d, a, X12, S24, 2368359562); /* 32 */ /* Round 3 */ #define S31 4 #define S32 11 #define S33 16 #define S34 23 HH ( a, b, c, d, X5 , S31, 4294588738); /* 33 */ HH ( d, a, b, c, X8 , S32, 2272392833); /* 34 */ HH ( c, d, a, b, X11, S33, 1839030562); /* 35 */ HH ( b, c, d, a, X14, S34, 4259657740); /* 36 */ HH ( a, b, c, d, X1 , S31, 2763975236); /* 37 */ HH ( d, a, b, c, X4 , S32, 1272893353); /* 
38 */ HH ( c, d, a, b, X7 , S33, 4139469664); /* 39 */ HH ( b, c, d, a, X10, S34, 3200236656); /* 40 */ HH ( a, b, c, d, X13, S31, 681279174); /* 41 */ HH ( d, a, b, c, X0 , S32, 3936430074); /* 42 */ HH ( c, d, a, b, X3 , S33, 3572445317); /* 43 */ HH ( b, c, d, a, X6 , S34, 76029189); /* 44 */ HH ( a, b, c, d, X9 , S31, 3654602809); /* 45 */ HH ( d, a, b, c, X12, S32, 3873151461); /* 46 */ HH ( c, d, a, b, X15, S33, 530742520); /* 47 */ HH ( b, c, d, a, X2 , S34, 3299628645); /* 48 */ /* Round 4 */ #define S41 6 #define S42 10 #define S43 15 #define S44 21 II ( a, b, c, d, X0 , S41, 4096336452); /* 49 */ II ( d, a, b, c, X7 , S42, 1126891415); /* 50 */ II ( c, d, a, b, X14, S43, 2878612391); /* 51 */ II ( b, c, d, a, X5 , S44, 4237533241); /* 52 */ II ( a, b, c, d, X12, S41, 1700485571); /* 53 */ II ( d, a, b, c, X3 , S42, 2399980690); /* 54 */ II ( c, d, a, b, X10, S43, 4293915773); /* 55 */ II ( b, c, d, a, X1 , S44, 2240044497); /* 56 */ II ( a, b, c, d, X8 , S41, 1873313359); /* 57 */ II ( d, a, b, c, X15, S42, 4264355552); /* 58 */ II ( c, d, a, b, X6 , S43, 2734768916); /* 59 */ II ( b, c, d, a, X13, S44, 1309151649); /* 60 */ II ( a, b, c, d, X4 , S41, 4149444226); /* 61 */ II ( d, a, b, c, X11, S42, 3174756917); /* 62 */ II ( c, d, a, b, X2 , S43, 718787259); /* 63 */ II ( b, c, d, a, X9 , S44, 3951481745); /* 64 */ a += 0x67452301; b += 0xefcdab89; c += 0x98badcfe; d += 0x10325476; """ def rand(shape, dtype=np.float32, stream=None): from pycuda.gpuarray import GPUArray from pycuda.elementwise import get_elwise_kernel result = GPUArray(shape, dtype) if dtype == np.float32: func = get_elwise_kernel( "float *dest, unsigned int seed", md5_code + """ #define POW_2_M32 (1/4294967296.0f) dest[i] = a*POW_2_M32; if ((i += total_threads) < n) dest[i] = b*POW_2_M32; if ((i += total_threads) < n) dest[i] = c*POW_2_M32; if ((i += total_threads) < n) dest[i] = d*POW_2_M32; """, "md5_rng_float") elif dtype == np.float64: func = get_elwise_kernel( "double *dest, unsigned int seed", md5_code + """ #define POW_2_M32 (1/4294967296.0) #define POW_2_M64 (1/18446744073709551616.) 
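            /* Each double output combines two 32-bit MD5 words: a scaled by
               2^-32 supplies the leading bits and b scaled by 2^-64 the
               remainder, so a single digest (a, b, c, d) yields two doubles
               per thread. */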
dest[i] = a*POW_2_M32 + b*POW_2_M64; if ((i += total_threads) < n) { dest[i] = c*POW_2_M32 + d*POW_2_M64; } """, "md5_rng_float") elif dtype in [np.int32, np.uint32]: func = get_elwise_kernel( "unsigned int *dest, unsigned int seed", md5_code + """ dest[i] = a; if ((i += total_threads) < n) dest[i] = b; if ((i += total_threads) < n) dest[i] = c; if ((i += total_threads) < n) dest[i] = d; """, "md5_rng_int") else: raise NotImplementedError; func.prepared_async_call(result._grid, result._block, stream, result.gpudata, np.random.randint(2**31-1), result.size) return result # }}} # {{{ CURAND wrapper try: import pycuda._driver as _curand # used to be separate module except ImportError: def get_curand_version(): return None else: get_curand_version = _curand.get_curand_version if get_curand_version() >= (3, 2, 0): direction_vector_set = _curand.direction_vector_set _get_direction_vectors = _curand._get_direction_vectors if get_curand_version() >= (4, 0, 0): _get_scramble_constants32 = _curand._get_scramble_constants32 _get_scramble_constants64 = _curand._get_scramble_constants64 # {{{ Base class gen_template = """ __global__ void %(name)s(%(state_type)s *s, %(out_type)s *d, const int n) { const int tidx = blockIdx.x*blockDim.x+threadIdx.x; const int delta = blockDim.x*gridDim.x; for (int idx = tidx; idx < n; idx += delta) d[idx] = curand%(suffix)s(&s[tidx]); } """ gen_log_template = """ __global__ void %(name)s(%(state_type)s *s, %(out_type)s *d, %(in_type)s mean, %(in_type)s stddev, const int n) { const int tidx = blockIdx.x*blockDim.x+threadIdx.x; const int delta = blockDim.x*gridDim.x; for (int idx = tidx; idx < n; idx += delta) d[idx] = curand_log%(suffix)s(&s[tidx], mean, stddev); } """ gen_poisson_template = """ __global__ void %(name)s(%(state_type)s *s, %(out_type)s *d, double lambda, const int n) { const int tidx = blockIdx.x*blockDim.x+threadIdx.x; const int delta = blockDim.x*gridDim.x; for (int idx = tidx; idx < n; idx += delta) d[idx] = curand_poisson%(suffix)s(&s[tidx], lambda); } """ random_source = """ // Uses C++ features (templates); do not surround with extern C #include extern "C" { %(generators)s } """ random_skip_ahead32_source = """ extern "C" { __global__ void skip_ahead(%(state_type)s *s, const int n, const unsigned int skip) { const int idx = blockIdx.x*blockDim.x+threadIdx.x; if (idx < n) skipahead(skip, &s[idx]); } __global__ void skip_ahead_array(%(state_type)s *s, const int n, const unsigned int *skip) { const int idx = blockIdx.x*blockDim.x+threadIdx.x; if (idx < n) skipahead(skip[idx], &s[idx]); } } """ random_skip_ahead64_source = """ extern "C" { __global__ void skip_ahead(%(state_type)s *s, const int n, const unsigned long long skip) { const int idx = blockIdx.x*blockDim.x+threadIdx.x; if (idx < n) skipahead(skip, &s[idx]); } __global__ void skip_ahead_array(%(state_type)s *s, const int n, const unsigned long long *skip) { const int idx = blockIdx.x*blockDim.x+threadIdx.x; if (idx < n) skipahead(skip[idx], &s[idx]); } } """ class _RandomNumberGeneratorBase(object): """ Class surrounding CURAND kernels from CUDA 3.2. It allows for generating random numbers with uniform and normal probability function of various types. 
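    Subclasses supply the concrete CURAND state type; the usual entry
    points are :meth:`gen_uniform`, :meth:`gen_normal` and their
    ``fill_*`` counterparts. A minimal usage sketch (illustrative only;
    assumes an active CUDA context, e.g. via ``import pycuda.autoinit``,
    and a CURAND-enabled build)::

        import numpy as np
        import pycuda.autoinit  # noqa: creates a context
        from pycuda.curandom import XORWOWRandomNumberGenerator

        rng = XORWOWRandomNumberGenerator()
        samples = rng.gen_uniform((1000,), np.float32)  # GPUArray of uniforms
        rng.fill_normal(samples)  # overwrite in place with normal variates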
""" gen_info = [ ("uniform_int", "unsigned int", ""), ("uniform_long", "unsigned long long", ""), ("uniform_float", "float", "_uniform"), ("uniform_double", "double", "_uniform_double"), ("normal_float", "float", "_normal"), ("normal_double", "double", "_normal_double"), ("normal_float2", "float2", "_normal2"), ("normal_double2", "double2", "_normal2_double"), ] gen_log_info = [ ("normal_log_float", "float", "float", "_normal"), ("normal_log_double", "double", "double", "_normal_double"), ("normal_log_float2", "float", "float2", "_normal2"), ("normal_log_double2", "double", "double2", "_normal2_double"), ] gen_poisson_info = [ ("poisson_int", "unsigned int", ""), ] def __init__(self, state_type, vector_type, generator_bits, additional_source, scramble_type=None): if get_curand_version() < (3, 2, 0): raise EnvironmentError("Need at least CUDA 3.2") dev = drv.Context.get_device() self.block_count = dev.get_attribute( pycuda.driver.device_attribute.MULTIPROCESSOR_COUNT) from pycuda.characterize import has_double_support def do_generate(out_type): result = True if "double" in out_type: result = result and has_double_support() if "2" in out_type: result = result and self.has_box_muller return result my_generators = [ (name, out_type, suffix) for name, out_type, suffix in self.gen_info if do_generate(out_type)] if get_curand_version() >= (4, 0, 0): my_log_generators = [ (name, in_type, out_type, suffix) for name, in_type, out_type, suffix in self.gen_log_info if do_generate(out_type)] if get_curand_version() >= (5, 0, 0): my_poisson_generators = [ (name, out_type, suffix) for name, out_type, suffix in self.gen_poisson_info if do_generate(out_type)] generator_sources = [ gen_template % { "name": name, "out_type": out_type, "suffix": suffix, "state_type": state_type, } for name, out_type, suffix in my_generators] if get_curand_version() >= (4, 0, 0): generator_sources.extend([ gen_log_template % { "name": name, "in_type": in_type, "out_type": out_type, "suffix": suffix, "state_type": state_type, } for name, in_type, out_type, suffix in my_log_generators]) if get_curand_version() >= (5, 0, 0): generator_sources.extend([ gen_poisson_template % { "name": name, "out_type": out_type, "suffix": suffix, "state_type": state_type, } for name, out_type, suffix in my_poisson_generators]) source = (random_source + additional_source) % { "state_type": state_type, "vector_type": vector_type, "scramble_type": scramble_type, "generators": "\n".join(generator_sources)} # store in instance to let subclass constructors get to it. 
self.module = module = pycuda.compiler.SourceModule(source, no_extern_c=True) self.generators = {} for name, out_type, suffix in my_generators: gen_func = module.get_function(name) gen_func.prepare("PPi") self.generators[name] = gen_func if get_curand_version() >= (4, 0, 0): for name, in_type, out_type, suffix in my_log_generators: gen_func = module.get_function(name) if in_type == "float": gen_func.prepare("PPffi") if in_type == "double": gen_func.prepare("PPddi") self.generators[name] = gen_func if get_curand_version() >= (5, 0, 0): for name, out_type, suffix in my_poisson_generators: gen_func = module.get_function(name) gen_func.prepare("PPdi") self.generators[name] = gen_func self.generator_bits = generator_bits self._prepare_skipahead() self.state_type = state_type self._state = None def _prepare_skipahead(self): self.skip_ahead = self.module.get_function("skip_ahead") if self.generator_bits == 32: self.skip_ahead.prepare("PiI") if self.generator_bits == 64: self.skip_ahead.prepare("PiQ") self.skip_ahead_array = self.module.get_function("skip_ahead_array") self.skip_ahead_array.prepare("PiP") def _kernels(self): return ( list(self.generators.itervalues()) + [self.skip_ahead, self.skip_ahead_array]) @property @memoize_method def generators_per_block(self): return min(kernel.max_threads_per_block for kernel in self._kernels()) @property def state(self): if self._state is None: from pycuda.characterize import sizeof data_type_size = sizeof(self.state_type, "#include ") self._state = drv.mem_alloc( self.block_count * self.generators_per_block * data_type_size) return self._state def fill_uniform(self, data, stream=None): if data.dtype == np.float32: func = self.generators["uniform_float"] elif data.dtype == np.float64: func = self.generators["uniform_double"] elif data.dtype in [np.int, np.int32, np.uint32]: func = self.generators["uniform_int"] elif data.dtype in [np.int64, np.uint64] and self.generator_bits >= 64: func = self.generators["uniform_long"] else: raise NotImplementedError func.prepared_async_call( (self.block_count, 1), (self.generators_per_block, 1, 1), stream, self.state, data.gpudata, data.size) def fill_normal(self, data, stream=None): if data.dtype == np.float32: func_name = "normal_float" elif data.dtype == np.float64: func_name = "normal_double" else: raise NotImplementedError data_size = data.size if self.has_box_muller and data_size % 2 == 0: func_name += "2" data_size //= 2 func = self.generators[func_name] func.prepared_async_call( (self.block_count, 1), (self.generators_per_block, 1, 1), stream, self.state, data.gpudata, int(data_size)) def gen_uniform(self, shape, dtype, stream=None): result = array.empty(shape, dtype) self.fill_uniform(result, stream) return result def gen_normal(self, shape, dtype, stream=None): result = array.empty(shape, dtype) self.fill_normal(result, stream) return result if get_curand_version() >= (4, 0, 0): def fill_log_normal(self, data, mean, stddev, stream=None): if data.dtype == np.float32: func_name = "normal_log_float" elif data.dtype == np.float64: func_name = "normal_log_double" else: raise NotImplementedError data_size = data.size if self.has_box_muller and data_size % 2 == 0: func_name += "2" data_size //= 2 func = self.generators[func_name] func.prepared_async_call( (self.block_count, 1), (self.generators_per_block, 1, 1), stream, self.state, data.gpudata, mean, stddev, int(data_size)) def gen_log_normal(self, shape, dtype, mean, stddev, stream=None): result = array.empty(shape, dtype) self.fill_log_normal(result, mean, 
stddev, stream) return result if get_curand_version() >= (5, 0, 0): def fill_poisson(self, data, lambda_value, stream=None): if data.dtype == np.uint32: func_name = "poisson_int" else: raise NotImplementedError func = self.generators[func_name] func.prepared_async_call( (self.block_count, 1), (self.generators_per_block, 1, 1), stream, self.state, data.gpudata, lambda_value, data.size) def gen_poisson(self, shape, dtype, lambda_value, stream=None): result = array.empty(shape, dtype) self.fill_poisson(result, lambda_value, stream) return result def call_skip_ahead(self, i, stream=None): self.skip_ahead.prepared_async_call( (self.block_count, 1), (self.generators_per_block, 1, 1), stream, self.state, self.generators_per_block, i) def call_skip_ahead_array(self, i, stream=None): self.skip_ahead_array.prepared_async_call( (self.block_count, 1), (self.generators_per_block, 1, 1), stream, self.state, self.generators_per_block, i.gpudata) # }}} # {{{ XORWOW RNG class _PseudoRandomNumberGeneratorBase(_RandomNumberGeneratorBase): def __init__(self, seed_getter, offset, state_type, vector_type, generator_bits, additional_source, scramble_type=None): super(_PseudoRandomNumberGeneratorBase, self).__init__( state_type, vector_type, generator_bits, additional_source) generator_count = self.generators_per_block * self.block_count if seed_getter is None: seed = array.to_gpu( np.asarray( np.random.random_integers( 0, (1 << 31) - 2, generator_count), dtype=np.int32)) else: seed = seed_getter(generator_count) if not (isinstance(seed, pycuda.gpuarray.GPUArray) and seed.dtype == np.int32 and seed.size == generator_count): raise TypeError("seed must be GPUArray of integers of right length") p = self.module.get_function("prepare") p.prepare("PiPi") from pycuda.characterize import has_stack has_stack = has_stack() if has_stack: prev_stack_size = drv.Context.get_limit(drv.limit.STACK_SIZE) try: if has_stack: drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k try: p.prepared_call( (self.block_count, 1), (self.generators_per_block, 1, 1), self.state, generator_count, seed.gpudata, offset) except drv.LaunchError: raise ValueError("Initialisation failed. 
Decrease number of threads.") finally: if has_stack: drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size) def _prepare_skipahead(self): self.skip_ahead = self.module.get_function("skip_ahead") self.skip_ahead.prepare("PiQ") self.skip_ahead_array = self.module.get_function("skip_ahead_array") self.skip_ahead_array.prepare("PiP") self.skip_ahead_sequence = self.module.get_function("skip_ahead_sequence") self.skip_ahead_sequence.prepare("PiQ") self.skip_ahead_sequence_array = self.module.get_function("skip_ahead_sequence_array") self.skip_ahead_sequence_array.prepare("PiP") def call_skip_ahead_sequence(self, i, stream=None): self.skip_ahead_sequence.prepared_async_call( (self.block_count, 1), (self.generators_per_block, 1, 1), stream, self.state, self.generators_per_block * self.block_count, i) def call_skip_ahead_sequence_array(self, i, stream=None): self.skip_ahead_sequence_array.prepared_async_call( (self.block_count, 1), (self.generators_per_block, 1, 1), stream, self.state, self.generators_per_block * self.block_count, i.gpudata) def _kernels(self): return (_RandomNumberGeneratorBase._kernels(self) + [self.module.get_function("prepare")] + [self.module.get_function("skip_ahead_sequence"), self.module.get_function("skip_ahead_sequence_array")]) def seed_getter_uniform(N): result = pycuda.gpuarray.empty([N], np.int32) import random value = random.randint(0, 2**31-1) return result.fill(value) def seed_getter_unique(N): result = np.random.randint(0, 2**31-1, N).astype(np.int32) return pycuda.gpuarray.to_gpu(result) xorwow_random_source = """ extern "C" { __global__ void prepare(%(state_type)s *s, const int n, %(vector_type)s *v, const unsigned int o) { const int id = blockIdx.x*blockDim.x+threadIdx.x; if (id < n) curand_init(v[id], id, o, &s[id]); } } """ xorwow_skip_ahead_sequence_source = """ extern "C" { __global__ void skip_ahead_sequence(%(state_type)s *s, const int n, const unsigned long long skip) { const int idx = blockIdx.x*blockDim.x+threadIdx.x; if (idx < n) skipahead_sequence(skip, &s[idx]); } __global__ void skip_ahead_sequence_array(%(state_type)s *s, const int n, const unsigned long long *skip) { const int idx = blockIdx.x*blockDim.x+threadIdx.x; if (idx < n) skipahead_sequence(skip[idx], &s[idx]); } } """ if get_curand_version() >= (3, 2, 0): class XORWOWRandomNumberGenerator(_PseudoRandomNumberGeneratorBase): has_box_muller = True def __init__(self, seed_getter=None, offset=0): """ :arg seed_getter: a function that, given an integer count, will yield an `int32` :class:`GPUArray` of seeds. 
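
            The module-level helpers :func:`seed_getter_uniform` and
            :func:`seed_getter_unique` satisfy this interface. A sketch of a
            custom getter (``fixed_seed_getter`` is a hypothetical name, shown
            only to illustrate the expected signature and dtype)::

                import numpy as np
                import pycuda.gpuarray as gpuarray

                def fixed_seed_getter(count):
                    # hypothetical helper: must return an int32 GPUArray
                    # with exactly `count' entries
                    return gpuarray.to_gpu(
                        np.ones(count, dtype=np.int32) * 42)

                rng = XORWOWRandomNumberGenerator(
                    seed_getter=fixed_seed_getter)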
""" super(XORWOWRandomNumberGenerator, self).__init__( seed_getter, offset, 'curandStateXORWOW', 'unsigned int', 32, xorwow_random_source+ xorwow_skip_ahead_sequence_source+random_skip_ahead64_source) # }}} # {{{ MRG32k3a RNG mrg32k3a_random_source = """ extern "C" { __global__ void prepare(%(state_type)s *s, const int n, %(vector_type)s *v, const unsigned int o) { const int id = blockIdx.x*blockDim.x+threadIdx.x; if (id < n) curand_init(v[id], id, o, &s[id]); } } """ mrg32k3a_skip_ahead_sequence_source = """ extern "C" { __global__ void skip_ahead_sequence(%(state_type)s *s, const int n, const unsigned long long skip) { const int idx = blockIdx.x*blockDim.x+threadIdx.x; if (idx < n) skipahead_sequence(skip, &s[idx]); } __global__ void skip_ahead_sequence_array(%(state_type)s *s, const int n, const unsigned long long *skip) { const int idx = blockIdx.x*blockDim.x+threadIdx.x; if (idx < n) skipahead_sequence(skip[idx], &s[idx]); } __global__ void skip_ahead_subsequence(%(state_type)s *s, const int n, const unsigned long long skip) { const int idx = blockIdx.x*blockDim.x+threadIdx.x; if (idx < n) skipahead_subsequence(skip, &s[idx]); } __global__ void skip_ahead_subsequence_array(%(state_type)s *s, const int n, const unsigned long long *skip) { const int idx = blockIdx.x*blockDim.x+threadIdx.x; if (idx < n) skipahead_subsequence(skip[idx], &s[idx]); } } """ if get_curand_version() >= (4, 1, 0): class MRG32k3aRandomNumberGenerator(_PseudoRandomNumberGeneratorBase): has_box_muller = True def __init__(self, seed_getter=None, offset=0): """ :arg seed_getter: a function that, given an integer count, will yield an `int32` :class:`GPUArray` of seeds. """ super(MRG32k3aRandomNumberGenerator, self).__init__( seed_getter, offset, 'curandStateMRG32k3a', 'unsigned int', 32, mrg32k3a_random_source+ mrg32k3a_skip_ahead_sequence_source+random_skip_ahead64_source) def _prepare_skipahead(self): super(MRG32k3aRandomNumberGenerator, self)._prepare_skipahead() self.skip_ahead_subsequence = self.module.get_function("skip_ahead_subsequence") self.skip_ahead_subsequence.prepare("PiQ") self.skip_ahead_subsequence_array = self.module.get_function("skip_ahead_subsequence_array") self.skip_ahead_subsequence_array.prepare("PiP") def call_skip_ahead_subsequence(self, i, stream=None): self.skip_ahead_subsequence.prepared_async_call( (self.block_count, 1), (self.generators_per_block, 1, 1), stream, self.state, self.generators_per_block * self.block_count, i) def call_skip_ahead_subsequence_array(self, i, stream=None): self.skip_ahead_subsequence_array.prepared_async_call( (self.block_count, 1), (self.generators_per_block, 1, 1), stream, self.state, self.generators_per_block * self.block_count, i.gpudata) def _kernels(self): return (_PseudoRandomNumberGeneratorBase._kernels(self) + [self.module.get_function("skip_ahead_subsequence"), self.module.get_function("skip_ahead_subsequence_array")]) # }}} # {{{ Sobol RNG def generate_direction_vectors(count, direction=None): if get_curand_version() >= (4, 0, 0): if direction == direction_vector_set.VECTOR_64 or \ direction == direction_vector_set.SCRAMBLED_VECTOR_64: result = np.empty((count, 64), dtype=np.uint64) else: result = np.empty((count, 32), dtype=np.uint32) else: result = np.empty((count, 32), dtype=np.uint32) _get_direction_vectors(direction, result, count) return pycuda.gpuarray.to_gpu(result) if get_curand_version() >= (4, 0, 0): def generate_scramble_constants32(count): result = np.empty((count, ), dtype=np.uint32) _get_scramble_constants32(result, count) return 
pycuda.gpuarray.to_gpu(result) def generate_scramble_constants64(count): result = np.empty((count, ), dtype=np.uint64) _get_scramble_constants64(result, count) return pycuda.gpuarray.to_gpu(result) sobol_random_source = """ extern "C" { __global__ void prepare(%(state_type)s *s, const int n, %(vector_type)s *v, const unsigned int o) { const int id = blockIdx.x*blockDim.x+threadIdx.x; if (id < n) curand_init(v[id], o, &s[id]); } } """ class _SobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): """ Class surrounding CURAND kernels from CUDA 3.2. It allows for generating quasi-random numbers with uniform and normal probability function of type int, float, and double. """ has_box_muller = False def __init__(self, dir_vector, dir_vector_dtype, dir_vector_size, dir_vector_set, offset, state_type, vector_type, generator_bits, sobol_random_source): super(_SobolRandomNumberGeneratorBase, self).__init__(state_type, vector_type, generator_bits, sobol_random_source) if dir_vector is None: dir_vector = generate_direction_vectors( self.block_count * self.generators_per_block, dir_vector_set) if not (isinstance(dir_vector, pycuda.gpuarray.GPUArray) and dir_vector.dtype == dir_vector_dtype and dir_vector.shape == (self.block_count * self.generators_per_block, dir_vector_size)): raise TypeError("seed must be GPUArray of integers of right length") p = self.module.get_function("prepare") p.prepare("PiPi") from pycuda.characterize import has_stack has_stack = has_stack() if has_stack: prev_stack_size = drv.Context.get_limit(drv.limit.STACK_SIZE) try: if has_stack: drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k try: p.prepared_call((self.block_count, 1), (self.generators_per_block, 1, 1), self.state, self.block_count * self.generators_per_block, dir_vector.gpudata, offset) except drv.LaunchError: raise ValueError("Initialisation failed. Decrease number of threads.") finally: if has_stack: drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size) def _kernels(self): return (_RandomNumberGeneratorBase._kernels(self) + [self.module.get_function("prepare")]) scrambledsobol_random_source = """ extern "C" { __global__ void prepare( %(state_type)s *s, const int n, %(vector_type)s *v, %(scramble_type)s *scramble, const unsigned int o) { const int id = blockIdx.x*blockDim.x+threadIdx.x; if (id < n) curand_init(v[id], scramble[id], o, &s[id]); } } """ class _ScrambledSobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): """ Class surrounding CURAND kernels from CUDA 4.0. It allows for generating quasi-random numbers with uniform and normal probability function of type int, float, and double. 
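    In addition to the direction vectors used by the plain Sobol
    generators, each generator state here is initialized with a
    per-generator scramble constant (see
    :func:`generate_scramble_constants32` /
    :func:`generate_scramble_constants64`).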
""" has_box_muller = False def __init__(self, dir_vector, dir_vector_dtype, dir_vector_size, dir_vector_set, scramble_vector, scramble_vector_function, offset, state_type, vector_type, generator_bits, scramble_type, sobol_random_source): super(_ScrambledSobolRandomNumberGeneratorBase, self).__init__(state_type, vector_type, generator_bits, sobol_random_source, scramble_type) if dir_vector is None: dir_vector = generate_direction_vectors( self.block_count * self.generators_per_block, dir_vector_set) if scramble_vector is None: scramble_vector = scramble_vector_function( self.block_count * self.generators_per_block) if not (isinstance(dir_vector, pycuda.gpuarray.GPUArray) and dir_vector.dtype == dir_vector_dtype and dir_vector.shape == (self.block_count * self.generators_per_block, dir_vector_size)): raise TypeError("seed must be GPUArray of integers of right length") if not (isinstance(scramble_vector, pycuda.gpuarray.GPUArray) and scramble_vector.dtype == dir_vector_dtype and scramble_vector.shape == (self.block_count * self.generators_per_block, )): raise TypeError("scramble must be GPUArray of integers of right length") p = self.module.get_function("prepare") p.prepare("PiPPi") from pycuda.characterize import has_stack has_stack = has_stack() if has_stack: prev_stack_size = drv.Context.get_limit(drv.limit.STACK_SIZE) try: if has_stack: drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k try: p.prepared_call((self.block_count, 1), (self.generators_per_block, 1, 1), self.state, self.block_count * self.generators_per_block, dir_vector.gpudata, scramble_vector.gpudata, offset) except drv.LaunchError: raise ValueError("Initialisation failed. Decrease number of threads.") finally: if has_stack: drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size) def _kernels(self): return (_RandomNumberGeneratorBase._kernels(self) + [self.module.get_function("prepare")]) if get_curand_version() >= (3, 2, 0): class Sobol32RandomNumberGenerator(_SobolRandomNumberGeneratorBase): """ Class surrounding CURAND kernels from CUDA 3.2. It allows for generating quasi-random numbers with uniform and normal probability function of type int, float, and double. """ def __init__(self, dir_vector=None, offset=0): super(Sobol32RandomNumberGenerator, self).__init__(dir_vector, np.uint32, 32, direction_vector_set.VECTOR_32, offset, 'curandStateSobol32', 'curandDirectionVectors32_t', 32, sobol_random_source+random_skip_ahead32_source) if get_curand_version() >= (4, 0, 0): class ScrambledSobol32RandomNumberGenerator(_ScrambledSobolRandomNumberGeneratorBase): """ Class surrounding CURAND kernels from CUDA 4.0. It allows for generating quasi-random numbers with uniform and normal probability function of type int, float, and double. """ def __init__(self, dir_vector=None, scramble_vector=None, offset=0): super(ScrambledSobol32RandomNumberGenerator, self).__init__(dir_vector, np.uint32, 32, direction_vector_set.SCRAMBLED_VECTOR_32, scramble_vector, generate_scramble_constants32, offset, 'curandStateScrambledSobol32', 'curandDirectionVectors32_t', 32, 'unsigned int', scrambledsobol_random_source+random_skip_ahead32_source) if get_curand_version() >= (4, 0, 0): class Sobol64RandomNumberGenerator(_SobolRandomNumberGeneratorBase): """ Class surrounding CURAND kernels from CUDA 4.0. It allows for generating quasi-random numbers with uniform and normal probability function of type int, float, and double. 
""" def __init__(self, dir_vector=None, offset=0): super(Sobol64RandomNumberGenerator, self).__init__(dir_vector, np.uint64, 64, direction_vector_set.VECTOR_64, offset, 'curandStateSobol64', 'curandDirectionVectors64_t', 64, sobol_random_source+random_skip_ahead64_source) if get_curand_version() >= (4, 0, 0): class ScrambledSobol64RandomNumberGenerator(_ScrambledSobolRandomNumberGeneratorBase): """ Class surrounding CURAND kernels from CUDA 4.0. It allows for generating quasi-random numbers with uniform and normal probability function of type int, float, and double. """ def __init__(self, dir_vector=None, scramble_vector=None, offset=0): super(ScrambledSobol64RandomNumberGenerator, self).__init__(dir_vector, np.uint64, 64, direction_vector_set.SCRAMBLED_VECTOR_64, scramble_vector, generate_scramble_constants64, offset, 'curandStateScrambledSobol64', 'curandDirectionVectors64_t', 64, 'unsigned long long', scrambledsobol_random_source+random_skip_ahead64_source) # }}} # }}} # vim: foldmethod=marker pycuda-2013.1.1+git20140310/pycuda/cuda/0002755000175000000500000000000012313360364015532 5ustar tomussrcpycuda-2013.1.1+git20140310/pycuda/cuda/pycuda-helpers.hpp0000644000175000000500000000503512313360364021171 0ustar tomussrc#include #ifndef _AFJKDASLFSADHF_HEADER_SEEN_PYCUDA_HELPERS_HPP #define _AFJKDASLFSADHF_HEADER_SEEN_PYCUDA_HELPERS_HPP extern "C++" { // "double-precision" textures ------------------------------------------------ /* Thanks to Nathan Bell for help in figuring this out. */ typedef float fp_tex_float; typedef int2 fp_tex_double; typedef uint2 fp_tex_cfloat; typedef int4 fp_tex_cdouble; template __device__ pycuda::complex fp_tex1Dfetch(texture tex, int i) { fp_tex_cfloat v = tex1Dfetch(tex, i); pycuda::complex out; return pycuda::complex(__int_as_float(v.x), __int_as_float(v.y)); } template __device__ pycuda::complex fp_tex1Dfetch(texture tex, int i) { fp_tex_cdouble v = tex1Dfetch(tex, i); return pycuda::complex(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z)); } template __device__ double fp_tex1Dfetch(texture tex, int i) { fp_tex_double v = tex1Dfetch(tex, i); return __hiloint2double(v.y, v.x); } template __device__ double fp_tex2D(texture tex, int i, int j) { fp_tex_double v = tex2D(tex, i, j); return __hiloint2double(v.y, v.x); } template __device__ double fp_tex3D(texture tex, int i, int j, int k) { fp_tex_double v = tex3D(tex, i, j, k); return __hiloint2double(v.y, v.x); } #define PYCUDA_GENERATE_FP_TEX_FUNCS(TYPE) \ template \ __device__ TYPE fp_tex1Dfetch(texture tex, int i) \ { \ return tex1Dfetch(tex, i); \ } \ \ template \ __device__ TYPE fp_tex2D(texture tex, int i, int j) \ { \ return tex2D(tex, i, j); \ } \ \ template \ __device__ TYPE fp_tex3D(texture tex, int i, int j, int k) \ { \ return tex3D(tex, i, j, k); \ } PYCUDA_GENERATE_FP_TEX_FUNCS(float) PYCUDA_GENERATE_FP_TEX_FUNCS(int) PYCUDA_GENERATE_FP_TEX_FUNCS(unsigned int) PYCUDA_GENERATE_FP_TEX_FUNCS(short int) PYCUDA_GENERATE_FP_TEX_FUNCS(unsigned short int) PYCUDA_GENERATE_FP_TEX_FUNCS(char) PYCUDA_GENERATE_FP_TEX_FUNCS(unsigned char) } #endif pycuda-2013.1.1+git20140310/pycuda/cuda/pycuda-complex-impl.hpp0000644000175000000500000002642612313360364022144 0ustar tomussrc/* * Copyright (c) 1999 * Silicon Graphics Computer Systems, Inc. * * Copyright (c) 1999 * Boris Fomitchev * * This material is provided "as is", with absolutely no warranty expressed * or implied. Any use is at your own risk. 
* * Permission to use or copy this software for any purpose is hereby granted * without fee, provided the above notices are retained on all copies. * Permission to modify the code and to distribute modified code is granted, * provided the above notices are retained, and a notice that the code was * modified is included with the above copyright notice. * */ #ifndef PYCUDA_COMPLEX_IMPL_HPP_SEEN #define PYCUDA_COMPLEX_IMPL_HPP_SEEN extern "C++" { namespace pycuda { // Complex division and square roots. // Absolute value _STLP_TEMPLATE_NULL __device__ float abs(const complex& __z) { return ::hypot(__z._M_re, __z._M_im); } _STLP_TEMPLATE_NULL __device__ double abs(const complex& __z) { return ::hypot(__z._M_re, __z._M_im); } // Phase _STLP_TEMPLATE_NULL __device__ float arg(const complex& __z) { return ::atan2(__z._M_im, __z._M_re); } _STLP_TEMPLATE_NULL __device__ double arg(const complex& __z) { return ::atan2(__z._M_im, __z._M_re); } // Construct a complex number from polar representation _STLP_TEMPLATE_NULL __device__ complex polar(const float& __rho, const float& __phi) { return complex(__rho * ::cos(__phi), __rho * ::sin(__phi)); } _STLP_TEMPLATE_NULL __device__ complex polar(const double& __rho, const double& __phi) { return complex(__rho * ::cos(__phi), __rho * ::sin(__phi)); } // Division template __device__ static void _divT(const _Tp& __z1_r, const _Tp& __z1_i, const _Tp& __z2_r, const _Tp& __z2_i, _Tp& __res_r, _Tp& __res_i) { _Tp __ar = __z2_r >= 0 ? __z2_r : -__z2_r; _Tp __ai = __z2_i >= 0 ? __z2_i : -__z2_i; if (__ar <= __ai) { _Tp __ratio = __z2_r / __z2_i; _Tp __denom = __z2_i * (1 + __ratio * __ratio); __res_r = (__z1_r * __ratio + __z1_i) / __denom; __res_i = (__z1_i * __ratio - __z1_r) / __denom; } else { _Tp __ratio = __z2_i / __z2_r; _Tp __denom = __z2_r * (1 + __ratio * __ratio); __res_r = (__z1_r + __z1_i * __ratio) / __denom; __res_i = (__z1_i - __z1_r * __ratio) / __denom; } } template __device__ static void _divT(const _Tp& __z1_r, const _Tp& __z2_r, const _Tp& __z2_i, _Tp& __res_r, _Tp& __res_i) { _Tp __ar = __z2_r >= 0 ? __z2_r : -__z2_r; _Tp __ai = __z2_i >= 0 ? 
__z2_i : -__z2_i; if (__ar <= __ai) { _Tp __ratio = __z2_r / __z2_i; _Tp __denom = __z2_i * (1 + __ratio * __ratio); __res_r = (__z1_r * __ratio) / __denom; __res_i = - __z1_r / __denom; } else { _Tp __ratio = __z2_i / __z2_r; _Tp __denom = __z2_r * (1 + __ratio * __ratio); __res_r = __z1_r / __denom; __res_i = - (__z1_r * __ratio) / __denom; } } __device__ void complex::_div(const float& __z1_r, const float& __z1_i, const float& __z2_r, const float& __z2_i, float& __res_r, float& __res_i) { _divT(__z1_r, __z1_i, __z2_r, __z2_i, __res_r, __res_i); } __device__ void complex::_div(const float& __z1_r, const float& __z2_r, const float& __z2_i, float& __res_r, float& __res_i) { _divT(__z1_r, __z2_r, __z2_i, __res_r, __res_i); } __device__ void complex::_div(const double& __z1_r, const double& __z1_i, const double& __z2_r, const double& __z2_i, double& __res_r, double& __res_i) { _divT(__z1_r, __z1_i, __z2_r, __z2_i, __res_r, __res_i); } __device__ void complex::_div(const double& __z1_r, const double& __z2_r, const double& __z2_i, double& __res_r, double& __res_i) { _divT(__z1_r, __z2_r, __z2_i, __res_r, __res_i); } //---------------------------------------------------------------------- // Square root template __device__ static complex<_Tp> sqrtT(const complex<_Tp>& z) { _Tp re = z._M_re; _Tp im = z._M_im; _Tp mag = ::hypot(re, im); complex<_Tp> result; if (mag == 0.f) { result._M_re = result._M_im = 0.f; } else if (re > 0.f) { result._M_re = ::sqrt(0.5f * (mag + re)); result._M_im = im/result._M_re/2.f; } else { result._M_im = ::sqrt(0.5f * (mag - re)); if (im < 0.f) result._M_im = - result._M_im; result._M_re = im/result._M_im/2.f; } return result; } __device__ complex sqrt(const complex& z) { return sqrtT(z); } __device__ complex sqrt(const complex& z) { return sqrtT(z); } // exp, log, pow for complex, complex, and complex //---------------------------------------------------------------------- // exp template __device__ static complex<_Tp> expT(const complex<_Tp>& z) { _Tp expx = ::exp(z._M_re); _Tp s, c; ::sincos(z._M_im, &s, &c); return complex<_Tp>(expx * c, expx * s); } __device__ complex exp(const complex& z) { return expT(z); } __device__ complex exp(const complex& z) { return expT(z); } #if 0 //---------------------------------------------------------------------- // log10 template static __device__ complex<_Tp> log10T(const complex<_Tp>& z, const _Tp& ln10_inv) { complex<_Tp> r; r._M_im = ::atan2(z._M_im, z._M_re) * ln10_inv; r._M_re = ::log10(::hypot(z._M_re, z._M_im)); return r; } static const float LN10_INVF = 1.f / ::log(10.f); __device__ complex log10(const complex& z) { return log10T(z, LN10_INVF); } static const double LN10_INV = 1. 
/ ::log10(10.); __device__ complex log10(const complex& z) { return log10T(z, LN10_INV); } #endif //---------------------------------------------------------------------- // log template static __device__ complex<_Tp> logT(const complex<_Tp>& z) { complex<_Tp> r; r._M_im = ::atan2(z._M_im, z._M_re); r._M_re = ::log(::hypot(z._M_re, z._M_im)); return r; } __device__ complex log(const complex& z) { return logT(z); } __device__ complex log(const complex& z) { return logT(z); } //---------------------------------------------------------------------- // pow template __device__ static complex<_Tp> powT(const _Tp& a, const complex<_Tp>& b) { _Tp logr = ::log(a); _Tp x = ::exp(logr * b._M_re); _Tp y = logr * b._M_im; return complex<_Tp>(x * ::cos(y), x * ::sin(y)); } #if 0 template __device__ static complex<_Tp> powT(const complex<_Tp>& z_in, int n) { complex<_Tp> z = z_in; z = _STLP_PRIV __power(z, (n < 0 ? -n : n), multiplies< complex<_Tp> >()); if (n < 0) return _Tp(1.0) / z; else return z; } #endif template __device__ static complex<_Tp> powT(const complex<_Tp>& a, const _Tp& b) { _Tp logr = ::log(::hypot(a._M_re,a._M_im)); _Tp logi = ::atan2(a._M_im, a._M_re); _Tp x = ::exp(logr * b); _Tp y = logi * b; return complex<_Tp>(x * ::cos(y), x * ::sin(y)); } template __device__ static complex<_Tp> powT(const complex<_Tp>& a, const complex<_Tp>& b) { _Tp logr = ::log(::hypot(a._M_re,a._M_im)); _Tp logi = ::atan2(a._M_im, a._M_re); _Tp x = ::exp(logr * b._M_re - logi * b._M_im); _Tp y = logr * b._M_im + logi * b._M_re; return complex<_Tp>(x * ::cos(y), x * ::sin(y)); } __device__ complex pow(const float& a, const complex& b) { return powT(a, b); } /* __device__ complex pow(const complex& z_in, int n) { return powT(z_in, n); } */ __device__ complex pow(const complex& a, const float& b) { return powT(a, b); } __device__ complex pow(const complex& a, const complex& b) { return powT(a, b); } __device__ complex pow(const double& a, const complex& b) { return powT(a, b); } /* __device__ complex pow(const complex& z_in, int n) { return powT(z_in, n); } */ __device__ complex pow(const complex& a, const double& b) { return powT(a, b); } __device__ complex pow(const complex& a, const complex& b) { return powT(a, b); } // ---------------------------------------------------------------------------- // trig helpers #ifndef FLT_MAX #define FLT_MAX 3.402823466E+38F #endif #ifndef DBL_MAX #define DBL_MAX 1.7976931348623158e+308 #endif #define float_limit ::log(FLT_MAX) #define double_limit ::log(DBL_MAX) //---------------------------------------------------------------------- // sin template __device__ complex<_Tp> sinT(const complex<_Tp>& z) { return complex<_Tp>(::sin(z._M_re) * ::cosh(z._M_im), ::cos(z._M_re) * ::sinh(z._M_im)); } __device__ complex sin(const complex& z) { return sinT(z); } __device__ complex sin(const complex& z) { return sinT(z); } //---------------------------------------------------------------------- // cos template __device__ complex<_Tp> cosT(const complex<_Tp>& z) { return complex<_Tp>(::cos(z._M_re) * ::cosh(z._M_im), -::sin(z._M_re) * ::sinh(z._M_im)); } __device__ complex cos(const complex& z) { return cosT(z); } __device__ complex cos(const complex& z) { return cosT(z); } //---------------------------------------------------------------------- // tan template __device__ complex<_Tp> tanT(const complex<_Tp>& z, const _Tp& Tp_limit) { _Tp re2 = 2.f * z._M_re; _Tp im2 = 2.f * z._M_im; if (::abs(im2) > Tp_limit) return complex<_Tp>(0.f, (im2 > 0 ? 
1.f : -1.f)); else { _Tp den = ::cos(re2) + ::cosh(im2); return complex<_Tp>(::sin(re2) / den, ::sinh(im2) / den); } } __device__ complex tan(const complex& z) { return tanT(z, float_limit); } __device__ complex tan(const complex& z) { return tanT(z, double_limit); } //---------------------------------------------------------------------- // sinh template __device__ complex<_Tp> sinhT(const complex<_Tp>& z) { return complex<_Tp>(::sinh(z._M_re) * ::cos(z._M_im), ::cosh(z._M_re) * ::sin(z._M_im)); } __device__ complex sinh(const complex& z) { return sinhT(z); } __device__ complex sinh(const complex& z) { return sinhT(z); } //---------------------------------------------------------------------- // cosh template __device__ complex<_Tp> coshT(const complex<_Tp>& z) { return complex<_Tp>(::cosh(z._M_re) * ::cos(z._M_im), ::sinh(z._M_re) * ::sin(z._M_im)); } __device__ complex cosh(const complex& z) { return coshT(z); } __device__ complex cosh(const complex& z) { return coshT(z); } //---------------------------------------------------------------------- // tanh template __device__ complex<_Tp> tanhT(const complex<_Tp>& z, const _Tp& Tp_limit) { _Tp re2 = 2.f * z._M_re; _Tp im2 = 2.f * z._M_im; if (::abs(re2) > Tp_limit) return complex<_Tp>((re2 > 0 ? 1.f : -1.f), 0.f); else { _Tp den = ::cosh(re2) + ::cos(im2); return complex<_Tp>(::sinh(re2) / den, ::sin(im2) / den); } } __device__ complex tanh(const complex& z) { return tanhT(z, float_limit); } __device__ complex tanh(const complex& z) { return tanhT(z, double_limit); } } } #endif pycuda-2013.1.1+git20140310/pycuda/cuda/pycuda-complex.hpp0000644000175000000500000005257512313360364021211 0ustar tomussrc/* * Copyright (c) 1999 * Silicon Graphics Computer Systems, Inc. * * Copyright (c) 1999 * Boris Fomitchev * * This material is provided "as is", with absolutely no warranty expressed * or implied. Any use is at your own risk. * * Permission to use or copy this software for any purpose is hereby granted * without fee, provided the above notices are retained on all copies. * Permission to modify the code and to distribute modified code is granted, * provided the above notices are retained, and a notice that the code was * modified is included with the above copyright notice. * * Adapted for PyCUDA by Andreas Kloeckner 2009. */ #ifndef PYCUDA_COMPLEX_HPP_SEEN #define PYCUDA_COMPLEX_HPP_SEEN extern "C++" { namespace pycuda { #define _STLP_USE_NO_IOSTREAMS #define _STLP_DECLSPEC /* empty */ #define _STLP_CLASS_DECLSPEC /* empty */ #define _STLP_FUNCTION_TMPL_PARTIAL_ORDER #define _STLP_TEMPLATE_NULL template<> template struct complex { typedef _Tp value_type; typedef complex<_Tp> _Self; // Constructors, destructor, assignment operator. 
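  // The generic template stores the value as a plain real/imaginary pair
  // (_M_re, _M_im); the default constructor zero-initializes both parts,
  // and volatile overloads of operator= below keep instances assignable
  // when they are placed in volatile shared memory.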
__device__ complex() : _M_re(0), _M_im(0) {} __device__ complex(const value_type& __x) : _M_re(__x), _M_im(0) {} __device__ complex(const value_type& __x, const value_type& __y) : _M_re(__x), _M_im(__y) {} __device__ complex(const _Self& __z) : _M_re(__z._M_re), _M_im(__z._M_im) {} __device__ _Self& operator=(const _Self& __z) { _M_re = __z._M_re; _M_im = __z._M_im; return *this; } __device__ volatile _Self& operator=(const _Self& __z) volatile { _M_re = __z._M_re; _M_im = __z._M_im; return *this; } template __device__ explicit complex(const complex<_Tp2>& __z) : _M_re(__z._M_re), _M_im(__z._M_im) {} template __device__ _Self& operator=(const complex<_Tp2>& __z) { _M_re = __z._M_re; _M_im = __z._M_im; return *this; } template __device__ volatile _Self& operator=(const complex<_Tp2>& __z) volatile { _M_re = __z._M_re; _M_im = __z._M_im; return *this; } // Element access. __device__ value_type real() const { return _M_re; } __device__ value_type imag() const { return _M_im; } // Arithmetic op= operations involving one real argument. __device__ _Self& operator= (const value_type& __x) { _M_re = __x; _M_im = 0; return *this; } __device__ volatile _Self& operator= (const value_type& __x) volatile { _M_re = __x; _M_im = 0; return *this; } __device__ _Self& operator+= (const value_type& __x) { _M_re += __x; return *this; } __device__ _Self& operator-= (const value_type& __x) { _M_re -= __x; return *this; } __device__ _Self& operator*= (const value_type& __x) { _M_re *= __x; _M_im *= __x; return *this; } __device__ _Self& operator/= (const value_type& __x) { _M_re /= __x; _M_im /= __x; return *this; } // Arithmetic op= operations involving two complex arguments. static void __device__ _div(const value_type& __z1_r, const value_type& __z1_i, const value_type& __z2_r, const value_type& __z2_i, value_type& __res_r, value_type& __res_i); static void __device__ _div(const value_type& __z1_r, const value_type& __z2_r, const value_type& __z2_i, value_type& __res_r, value_type& __res_i); template __device__ _Self& operator+= (const complex<_Tp2>& __z) { _M_re += __z._M_re; _M_im += __z._M_im; return *this; } template __device__ _Self& operator-= (const complex<_Tp2>& __z) { _M_re -= __z._M_re; _M_im -= __z._M_im; return *this; } template __device__ _Self& operator*= (const complex<_Tp2>& __z) { value_type __r = _M_re * __z._M_re - _M_im * __z._M_im; value_type __i = _M_re * __z._M_im + _M_im * __z._M_re; _M_re = __r; _M_im = __i; return *this; } template __device__ _Self& operator/= (const complex<_Tp2>& __z) { value_type __r; value_type __i; _div(_M_re, _M_im, __z._M_re, __z._M_im, __r, __i); _M_re = __r; _M_im = __i; return *this; } __device__ _Self& operator+= (const _Self& __z) { _M_re += __z._M_re; _M_im += __z._M_im; return *this; } __device__ _Self& operator-= (const _Self& __z) { _M_re -= __z._M_re; _M_im -= __z._M_im; return *this; } __device__ _Self& operator*= (const _Self& __z) { value_type __r = _M_re * __z._M_re - _M_im * __z._M_im; value_type __i = _M_re * __z._M_im + _M_im * __z._M_re; _M_re = __r; _M_im = __i; return *this; } __device__ _Self& operator/= (const _Self& __z) { value_type __r; value_type __i; _div(_M_re, _M_im, __z._M_re, __z._M_im, __r, __i); _M_re = __r; _M_im = __i; return *this; } // Data members. value_type _M_re; value_type _M_im; }; // Explicit specializations for float, double, long double. The only // reason for these specializations is to enable automatic conversions // from complex to complex, and complex to // complex. 
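// Illustrative only -- a minimal sketch of how these specializations are
// typically consumed from kernel code compiled through PyCUDA (the include
// form, kernel name and launch details are assumptions for illustration,
// not part of this header):
//
//   #include <pycuda-complex.hpp>
//
//   __global__ void scale(pycuda::complex<float> *a, float s, unsigned n)
//   {
//     unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
//     if (i < n)
//       a[i] *= s;   // complex<float>::operator*=(value_type) below
//   }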
_STLP_TEMPLATE_NULL struct _STLP_CLASS_DECLSPEC complex { typedef float value_type; typedef complex _Self; // Constructors, destructor, assignment operator. __device__ complex(value_type __x = 0.0f, value_type __y = 0.0f) : _M_re(__x), _M_im(__y) {} __device__ complex(const complex& __z) : _M_re(__z._M_re), _M_im(__z._M_im) {} inline explicit __device__ complex(const complex& __z); // Element access. value_type __device__ real() const { return _M_re; } value_type __device__ imag() const { return _M_im; } // Arithmetic op= operations involving one real argument. __device__ _Self& operator= (value_type __x) { _M_re = __x; _M_im = 0.0f; return *this; } volatile __device__ _Self& operator= (value_type __x) volatile { _M_re = __x; _M_im = 0.0f; return *this; } __device__ _Self& operator+= (value_type __x) { _M_re += __x; return *this; } __device__ _Self& operator-= (value_type __x) { _M_re -= __x; return *this; } __device__ _Self& operator*= (value_type __x) { _M_re *= __x; _M_im *= __x; return *this; } __device__ _Self& operator/= (value_type __x) { _M_re /= __x; _M_im /= __x; return *this; } // Arithmetic op= operations involving two complex arguments. static __device__ void _div(const float& __z1_r, const float& __z1_i, const float& __z2_r, const float& __z2_i, float& __res_r, float& __res_i); static __device__ void _div(const float& __z1_r, const float& __z2_r, const float& __z2_i, float& __res_r, float& __res_i); template __device__ complex& operator=(const complex<_Tp2>& __z) { _M_re = __z._M_re; _M_im = __z._M_im; return *this; } template __device__ volatile complex& operator=(const complex<_Tp2>& __z) volatile { _M_re = __z._M_re; _M_im = __z._M_im; return *this; } template __device__ complex& operator+= (const complex<_Tp2>& __z) { _M_re += __z._M_re; _M_im += __z._M_im; return *this; } template __device__ complex& operator-= (const complex<_Tp2>& __z) { _M_re -= __z._M_re; _M_im -= __z._M_im; return *this; } template __device__ complex& operator*= (const complex<_Tp2>& __z) { float __r = _M_re * __z._M_re - _M_im * __z._M_im; float __i = _M_re * __z._M_im + _M_im * __z._M_re; _M_re = __r; _M_im = __i; return *this; } template __device__ complex& operator/= (const complex<_Tp2>& __z) { float __r; float __i; _div(_M_re, _M_im, __z._M_re, __z._M_im, __r, __i); _M_re = __r; _M_im = __i; return *this; } __device__ _Self& operator=(const _Self& __z) { _M_re = __z._M_re; _M_im = __z._M_im; return *this; } __device__ volatile _Self& operator=(const _Self& __z) volatile { _M_re = __z._M_re; _M_im = __z._M_im; return *this; } __device__ _Self& operator+= (const _Self& __z) { _M_re += __z._M_re; _M_im += __z._M_im; return *this; } __device__ _Self& operator-= (const _Self& __z) { _M_re -= __z._M_re; _M_im -= __z._M_im; return *this; } __device__ _Self& operator*= (const _Self& __z) { value_type __r = _M_re * __z._M_re - _M_im * __z._M_im; value_type __i = _M_re * __z._M_im + _M_im * __z._M_re; _M_re = __r; _M_im = __i; return *this; } __device__ _Self& operator/= (const _Self& __z) { value_type __r; value_type __i; _div(_M_re, _M_im, __z._M_re, __z._M_im, __r, __i); _M_re = __r; _M_im = __i; return *this; } // Data members. value_type _M_re; value_type _M_im; }; template<> struct _STLP_CLASS_DECLSPEC complex { typedef double value_type; typedef complex _Self; // Constructors, destructor, assignment operator. 
__device__ complex(value_type __x = 0.0, value_type __y = 0.0) : _M_re(__x), _M_im(__y) {} __device__ complex(const complex& __z) : _M_re(__z._M_re), _M_im(__z._M_im) {} __device__ inline complex(const complex& __z); // Element access. __device__ value_type real() const { return _M_re; } __device__ value_type imag() const { return _M_im; } // Arithmetic op= operations involving one real argument. __device__ _Self& operator= (value_type __x) { _M_re = __x; _M_im = 0.0; return *this; } __device__ volatile _Self& operator= (value_type __x) volatile { _M_re = __x; _M_im = 0.0; return *this; } __device__ _Self& operator+= (value_type __x) { _M_re += __x; return *this; } __device__ _Self& operator-= (value_type __x) { _M_re -= __x; return *this; } __device__ _Self& operator*= (value_type __x) { _M_re *= __x; _M_im *= __x; return *this; } __device__ _Self& operator/= (value_type __x) { _M_re /= __x; _M_im /= __x; return *this; } // Arithmetic op= operations involving two complex arguments. static __device__ void _div(const double& __z1_r, const double& __z1_i, const double& __z2_r, const double& __z2_i, double& __res_r, double& __res_i); static __device__ void _div(const double& __z1_r, const double& __z2_r, const double& __z2_i, double& __res_r, double& __res_i); #if defined (_STLP_FUNCTION_TMPL_PARTIAL_ORDER) template __device__ complex& operator=(const complex<_Tp2>& __z) { _M_re = __z._M_re; _M_im = __z._M_im; return *this; } template __device__ volatile complex& operator=(const volatile complex<_Tp2>& __z) { _M_re = __z._M_re; _M_im = __z._M_im; return *this; } template __device__ complex& operator+= (const complex<_Tp2>& __z) { _M_re += __z._M_re; _M_im += __z._M_im; return *this; } template __device__ complex& operator-= (const complex<_Tp2>& __z) { _M_re -= __z._M_re; _M_im -= __z._M_im; return *this; } template __device__ complex& operator*= (const complex<_Tp2>& __z) { double __r = _M_re * __z._M_re - _M_im * __z._M_im; double __i = _M_re * __z._M_im + _M_im * __z._M_re; _M_re = __r; _M_im = __i; return *this; } template __device__ complex& operator/= (const complex<_Tp2>& __z) { double __r; double __i; _div(_M_re, _M_im, __z._M_re, __z._M_im, __r, __i); _M_re = __r; _M_im = __i; return *this; } #endif /* _STLP_FUNCTION_TMPL_PARTIAL_ORDER */ __device__ _Self& operator=(const _Self& __z) { _M_re = __z._M_re; _M_im = __z._M_im; return *this; } __device__ volatile _Self& operator=(const _Self& __z) volatile { _M_re = __z._M_re; _M_im = __z._M_im; return *this; } __device__ _Self& operator+= (const _Self& __z) { _M_re += __z._M_re; _M_im += __z._M_im; return *this; } __device__ _Self& operator-= (const _Self& __z) { _M_re -= __z._M_re; _M_im -= __z._M_im; return *this; } __device__ _Self& operator*= (const _Self& __z) { value_type __r = _M_re * __z._M_re - _M_im * __z._M_im; value_type __i = _M_re * __z._M_im + _M_im * __z._M_re; _M_re = __r; _M_im = __i; return *this; } __device__ _Self& operator/= (const _Self& __z) { value_type __r; value_type __i; _div(_M_re, _M_im, __z._M_re, __z._M_im, __r, __i); _M_re = __r; _M_im = __i; return *this; } // Data members. value_type _M_re; value_type _M_im; }; // Converting constructors from one of these three specialized types // to another. inline __device__ complex::complex(const complex& __z) : _M_re((float)__z._M_re), _M_im((float)__z._M_im) {} inline __device__ complex::complex(const complex& __z) : _M_re(__z._M_re), _M_im(__z._M_im) {} // Unary non-member arithmetic operators. 
template inline complex<_Tp> __device__ operator+(const complex<_Tp>& __z) { return __z; } template inline complex<_Tp> __device__ operator-(const complex<_Tp>& __z) { return complex<_Tp>(-__z._M_re, -__z._M_im); } // Non-member arithmetic operations involving one real argument. template __device__ inline complex<_Tp> operator+(const _Tp& __x, const complex<_Tp>& __z) { return complex<_Tp>(__x + __z._M_re, __z._M_im); } template __device__ inline complex<_Tp> operator+(const complex<_Tp>& __z, const _Tp& __x) { return complex<_Tp>(__z._M_re + __x, __z._M_im); } template __device__ inline complex<_Tp> operator-(const _Tp& __x, const complex<_Tp>& __z) { return complex<_Tp>(__x - __z._M_re, -__z._M_im); } template __device__ inline complex<_Tp> operator-(const complex<_Tp>& __z, const _Tp& __x) { return complex<_Tp>(__z._M_re - __x, __z._M_im); } template __device__ inline complex<_Tp> operator*(const _Tp& __x, const complex<_Tp>& __z) { return complex<_Tp>(__x * __z._M_re, __x * __z._M_im); } template __device__ inline complex<_Tp> operator*(const complex<_Tp>& __z, const _Tp& __x) { return complex<_Tp>(__z._M_re * __x, __z._M_im * __x); } template __device__ inline complex<_Tp> operator/(const _Tp& __x, const complex<_Tp>& __z) { complex<_Tp> __result; complex<_Tp>::_div(__x, __z._M_re, __z._M_im, __result._M_re, __result._M_im); return __result; } template __device__ inline complex<_Tp> operator/(const complex<_Tp>& __z, const _Tp& __x) { return complex<_Tp>(__z._M_re / __x, __z._M_im / __x); } // Non-member arithmetic operations involving two complex arguments template __device__ inline complex<_Tp> operator+(const complex<_Tp>& __z1, const complex<_Tp>& __z2) { return complex<_Tp>(__z1._M_re + __z2._M_re, __z1._M_im + __z2._M_im); } template __device__ inline complex<_Tp> operator+(const volatile complex<_Tp>& __z1, const volatile complex<_Tp>& __z2) { return complex<_Tp>(__z1._M_re + __z2._M_re, __z1._M_im + __z2._M_im); } template __device__ inline complex<_Tp> __device__ operator-(const complex<_Tp>& __z1, const complex<_Tp>& __z2) { return complex<_Tp>(__z1._M_re - __z2._M_re, __z1._M_im - __z2._M_im); } template __device__ inline complex<_Tp> __device__ operator*(const complex<_Tp>& __z1, const complex<_Tp>& __z2) { return complex<_Tp>(__z1._M_re * __z2._M_re - __z1._M_im * __z2._M_im, __z1._M_re * __z2._M_im + __z1._M_im * __z2._M_re); } template __device__ inline complex<_Tp> __device__ operator*(const volatile complex<_Tp>& __z1, const volatile complex<_Tp>& __z2) { return complex<_Tp>(__z1._M_re * __z2._M_re - __z1._M_im * __z2._M_im, __z1._M_re * __z2._M_im + __z1._M_im * __z2._M_re); } template __device__ inline complex<_Tp> __device__ operator/(const complex<_Tp>& __z1, const complex<_Tp>& __z2) { complex<_Tp> __result; complex<_Tp>::_div(__z1._M_re, __z1._M_im, __z2._M_re, __z2._M_im, __result._M_re, __result._M_im); return __result; } // Comparison operators. 
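// Equality is exact, component-wise comparison; the mixed complex/real
// overloads treat the real argument as having a zero imaginary part.
// No ordering operators are defined, since complex numbers have no
// natural total order.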
template __device__ inline bool operator==(const complex<_Tp>& __z1, const complex<_Tp>& __z2) { return __z1._M_re == __z2._M_re && __z1._M_im == __z2._M_im; } template __device__ inline bool operator==(const complex<_Tp>& __z, const _Tp& __x) { return __z._M_re == __x && __z._M_im == 0; } template __device__ inline bool operator==(const _Tp& __x, const complex<_Tp>& __z) { return __x == __z._M_re && 0 == __z._M_im; } //04/27/04 dums: removal of this check, if it is restablish //please explain why the other operators are not macro guarded //#ifdef _STLP_FUNCTION_TMPL_PARTIAL_ORDER template __device__ inline bool operator!=(const complex<_Tp>& __z1, const complex<_Tp>& __z2) { return __z1._M_re != __z2._M_re || __z1._M_im != __z2._M_im; } //#endif /* _STLP_FUNCTION_TMPL_PARTIAL_ORDER */ template __device__ inline bool operator!=(const complex<_Tp>& __z, const _Tp& __x) { return __z._M_re != __x || __z._M_im != 0; } template __device__ inline bool operator!=(const _Tp& __x, const complex<_Tp>& __z) { return __x != __z._M_re || 0 != __z._M_im; } // Other basic arithmetic operations template __device__ inline _Tp real(const complex<_Tp>& __z) { return __z._M_re; } template __device__ inline _Tp imag(const complex<_Tp>& __z) { return __z._M_im; } template __device__ _Tp abs(const complex<_Tp>& __z); template __device__ _Tp arg(const complex<_Tp>& __z); template __device__ inline _Tp norm(const complex<_Tp>& __z) { return __z._M_re * __z._M_re + __z._M_im * __z._M_im; } template __device__ inline complex<_Tp> conj(const complex<_Tp>& __z) { return complex<_Tp>(__z._M_re, -__z._M_im); } template __device__ complex<_Tp> polar(const _Tp& __rho) { return complex<_Tp>(__rho, 0); } template __device__ complex<_Tp> polar(const _Tp& __rho, const _Tp& __phi); _STLP_TEMPLATE_NULL __device__ float abs(const complex&); _STLP_TEMPLATE_NULL __device__ double abs(const complex&); _STLP_TEMPLATE_NULL __device__ float arg(const complex&); _STLP_TEMPLATE_NULL __device__ double arg(const complex&); _STLP_TEMPLATE_NULL __device__ complex polar(const float& __rho, const float& __phi); _STLP_TEMPLATE_NULL __device__ complex polar(const double& __rho, const double& __phi); template __device__ _Tp abs(const complex<_Tp>& __z) { return _Tp(abs(complex(double(__z.real()), double(__z.imag())))); } template __device__ _Tp arg(const complex<_Tp>& __z) { return _Tp(arg(complex(double(__z.real()), double(__z.imag())))); } template __device__ complex<_Tp> polar(const _Tp& __rho, const _Tp& __phi) { complex __tmp = polar(double(__rho), double(__phi)); return complex<_Tp>(_Tp(__tmp.real()), _Tp(__tmp.imag())); } // Transcendental functions. These are defined only for float, // double, and long double. (Sqrt isn't transcendental, of course, // but it's included in this section anyway.) 
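// Only float and double overloads are declared below; their device-side
// definitions live in pycuda-complex-impl.hpp elsewhere in this tree
// (note that the log10 implementation there is currently compiled out
// behind #if 0).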
__device__ complex sqrt(const complex&); __device__ complex exp(const complex&); __device__ complex log(const complex&); __device__ complex log10(const complex&); // uses some stlport-private power thing // __device__ complex pow(const complex&, int); __device__ complex pow(const complex&, const float&); __device__ complex pow(const float&, const complex&); __device__ complex pow(const complex&, const complex&); __device__ complex sin(const complex&); __device__ complex cos(const complex&); __device__ complex tan(const complex&); __device__ complex sinh(const complex&); __device__ complex cosh(const complex&); __device__ complex tanh(const complex&); __device__ complex sqrt(const complex&); __device__ complex exp(const complex&); __device__ complex log(const complex&); __device__ complex log10(const complex&); // uses some stlport-private power thing // __device__ complex pow(const complex&, int); __device__ complex pow(const complex&, const double&); __device__ complex pow(const double&, const complex&); __device__ complex pow(const complex&, const complex&); __device__ complex sin(const complex&); __device__ complex cos(const complex&); __device__ complex tan(const complex&); __device__ complex sinh(const complex&); __device__ complex cosh(const complex&); __device__ complex tanh(const complex&); } } #if 0 #ifndef _STLP_LINK_TIME_INSTANTIATION # include #endif #endif #include #endif pycuda-2013.1.1+git20140310/pycuda/__init__.py0000644000175000000500000000015512313360364016726 0ustar tomussrcVERSION = (2013, 1, 1) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS pycuda-2013.1.1+git20140310/pycuda/scan.py0000644000175000000500000003143712313360364016122 0ustar tomussrc"""Scan primitive.""" from __future__ import division __copyright__ = """ Copyright 2011 Andreas Kloeckner Copyright 2008-2011 NVIDIA Corporation """ __license__ = """ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
Derived from thrust/detail/backend/cuda/detail/fast_scan.inl within the Thrust project, https://code.google.com/p/thrust/ Direct browse link: https://code.google.com/p/thrust/source/browse/thrust/detail/backend/cuda/detail/fast_scan.inl """ import numpy as np import pycuda.driver as driver import pycuda.gpuarray as gpuarray from pycuda.compiler import SourceModule from pycuda.tools import dtype_to_ctype import pycuda._mymako as mako from pycuda._cluda import CLUDA_PREAMBLE SHARED_PREAMBLE = CLUDA_PREAMBLE + """ #define WG_SIZE ${wg_size} #define SCAN_EXPR(a, b) ${scan_expr} ${preamble} typedef ${scan_type} scan_type; """ SCAN_INTERVALS_SOURCE = mako.template.Template(SHARED_PREAMBLE + """//CL// #define K ${wg_seq_batches} <%def name="make_group_scan(name, with_bounds_check)"> WITHIN_KERNEL void ${name}(LOCAL_MEM_ARG scan_type *array % if with_bounds_check: , const unsigned n % endif ) { scan_type val = array[LID_0]; <% offset = 1 %> % while offset <= wg_size: if (LID_0 >= ${offset} % if with_bounds_check: && LID_0 < n % endif ) { scan_type tmp = array[LID_0 - ${offset}]; val = SCAN_EXPR(tmp, val); } local_barrier(); array[LID_0] = val; local_barrier(); <% offset *= 2 %> % endwhile } ${make_group_scan("scan_group", False)} ${make_group_scan("scan_group_n", True)} KERNEL REQD_WG_SIZE(WG_SIZE, 1, 1) void ${name_prefix}_scan_intervals( GLOBAL_MEM scan_type *input, const unsigned int N, const unsigned int interval_size, GLOBAL_MEM scan_type *output, GLOBAL_MEM scan_type *group_results) { // padded in WG_SIZE to avoid bank conflicts // index K in first dimension used for carry storage LOCAL_MEM scan_type ldata[K + 1][WG_SIZE + 1]; const unsigned int interval_begin = interval_size * GID_0; const unsigned int interval_end = min(interval_begin + interval_size, N); const unsigned int unit_size = K * WG_SIZE; unsigned int unit_base = interval_begin; %for is_tail in [False, True]: %if not is_tail: for(; unit_base + unit_size <= interval_end; unit_base += unit_size) %else: if (unit_base < interval_end) %endif { // Algorithm: Each work group is responsible for one contiguous // 'interval', of which there are just enough to fill all compute // units. Intervals are split into 'units'. A unit is what gets // worked on in parallel by one work group. // Each unit has two axes--the local-id axis and the k axis. // // * * * * * * * * * * ----> lid // * * * * * * * * * * // * * * * * * * * * * // * * * * * * * * * * // * * * * * * * * * * // | // v k // This is a three-phase algorithm, in which first each interval // does its local scan, then a scan across intervals exchanges data // globally, and the final update adds the exchanged sums to each // interval. // Exclusive scan is realized by performing a right-shift inside // the final update. // read a unit's worth of data from global for(unsigned int k = 0; k < K; k++) { const unsigned int offset = k*WG_SIZE + LID_0; %if is_tail: if (unit_base + offset < interval_end) %endif { ldata[offset % K][offset / K] = input[unit_base + offset]; } } // carry in from previous unit, if applicable. 
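            // (ldata[K][WG_SIZE - 1] still holds the scanned total of the
            // previous unit at this point -- the load loop above only writes
            // rows 0..K-1 -- so folding it into element [0][0] lets the scan
            // continue seamlessly across unit boundaries.)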
if (LID_0 == 0 && unit_base != interval_begin) ldata[0][0] = SCAN_EXPR(ldata[K][WG_SIZE - 1], ldata[0][0]); local_barrier(); // scan along k (sequentially in each work item) scan_type sum = ldata[0][LID_0]; %if is_tail: const unsigned int offset_end = interval_end - unit_base; %endif for(unsigned int k = 1; k < K; k++) { %if is_tail: if (K * LID_0 + k < offset_end) %endif { scan_type tmp = ldata[k][LID_0]; sum = SCAN_EXPR(sum, tmp); ldata[k][LID_0] = sum; } } // store carry in out-of-bounds (padding) array entry in the K direction ldata[K][LID_0] = sum; local_barrier(); // tree-based parallel scan along local id %if not is_tail: scan_group(&ldata[K][0]); %else: scan_group_n(&ldata[K][0], offset_end / K); %endif // update local values if (LID_0 > 0) { sum = ldata[K][LID_0 - 1]; for(unsigned int k = 0; k < K; k++) { %if is_tail: if (K * LID_0 + k < offset_end) %endif { scan_type tmp = ldata[k][LID_0]; ldata[k][LID_0] = SCAN_EXPR(sum, tmp); } } } local_barrier(); // write data for(unsigned int k = 0; k < K; k++) { const unsigned int offset = k*WG_SIZE + LID_0; %if is_tail: if (unit_base + offset < interval_end) %endif { output[unit_base + offset] = ldata[offset % K][offset / K]; } } local_barrier(); } % endfor // write interval sum if (LID_0 == 0) { group_results[GID_0] = output[interval_end - 1]; } } """) INCLUSIVE_UPDATE_SOURCE = mako.template.Template(SHARED_PREAMBLE + """//CL// KERNEL REQD_WG_SIZE(WG_SIZE, 1, 1) void ${name_prefix}_final_update( GLOBAL_MEM scan_type *output, const unsigned int N, const unsigned int interval_size, GLOBAL_MEM scan_type *group_results) { const unsigned int interval_begin = interval_size * GID_0; const unsigned int interval_end = min(interval_begin + interval_size, N); if (GID_0 == 0) return; // value to add to this segment scan_type prev_group_sum = group_results[GID_0 - 1]; // advance result pointer output += interval_begin + LID_0; for(unsigned int unit_base = interval_begin; unit_base < interval_end; unit_base += WG_SIZE, output += WG_SIZE) { const unsigned int i = unit_base + LID_0; if(i < interval_end) { *output = SCAN_EXPR(prev_group_sum, *output); } } } """) EXCLUSIVE_UPDATE_SOURCE = mako.template.Template(SHARED_PREAMBLE + """//CL// KERNEL REQD_WG_SIZE(WG_SIZE, 1, 1) void ${name_prefix}_final_update( GLOBAL_MEM scan_type *output, const unsigned int N, const unsigned int interval_size, GLOBAL_MEM scan_type *group_results) { LOCAL_MEM scan_type ldata[WG_SIZE]; const unsigned int interval_begin = interval_size * GID_0; const unsigned int interval_end = min(interval_begin + interval_size, N); // value to add to this segment scan_type carry = ${neutral}; if(GID_0 != 0) { scan_type tmp = group_results[GID_0 - 1]; carry = SCAN_EXPR(carry, tmp); } scan_type val = carry; // advance result pointer output += interval_begin + LID_0; for (unsigned int unit_base = interval_begin; unit_base < interval_end; unit_base += WG_SIZE, output += WG_SIZE) { const unsigned int i = unit_base + LID_0; if(i < interval_end) { scan_type tmp = *output; ldata[LID_0] = SCAN_EXPR(carry, tmp); } local_barrier(); if (LID_0 != 0) val = ldata[LID_0 - 1]; /* else (see above) val = carry OR last tail; */ if (i < interval_end) *output = val; if(LID_0 == 0) val = ldata[WG_SIZE - 1]; local_barrier(); } } """) class _ScanKernelBase(object): def __init__(self, dtype, scan_expr, neutral=None, name_prefix="scan", options=[], preamble="", devices=None): if isinstance(self, ExclusiveScanKernel) and neutral is None: raise ValueError("neutral element is required for exclusive scan") dtype = 
self.dtype = np.dtype(dtype) self.neutral = neutral # Thrust says these are good for GT200 self.scan_wg_size = 128 self.update_wg_size = 256 self.scan_wg_seq_batches = 6 kw_values = dict( preamble=preamble, name_prefix=name_prefix, scan_type=dtype_to_ctype(dtype), scan_expr=scan_expr, neutral=neutral) scan_intervals_src = str(SCAN_INTERVALS_SOURCE.render( wg_size=self.scan_wg_size, wg_seq_batches=self.scan_wg_seq_batches, **kw_values)) scan_intervals_prg = SourceModule( scan_intervals_src, options=options, no_extern_c=True) self.scan_intervals_knl = scan_intervals_prg.get_function( name_prefix+"_scan_intervals") self.scan_intervals_knl.prepare("PIIPP") final_update_src = str(self.final_update_tp.render( wg_size=self.update_wg_size, **kw_values)) final_update_prg = SourceModule( final_update_src, options=options, no_extern_c=True) self.final_update_knl = final_update_prg.get_function( name_prefix+"_final_update") self.final_update_knl.prepare("PIIP") def __call__(self, input_ary, output_ary=None, allocator=None, stream=None): allocator = allocator or input_ary.allocator if output_ary is None: output_ary = input_ary if isinstance(output_ary, (str, unicode)) and output_ary == "new": output_ary = gpuarray.empty_like(input_ary, allocator=allocator) if input_ary.shape != output_ary.shape: raise ValueError("input and output must have the same shape") if not input_ary.flags.forc: raise RuntimeError("ScanKernel cannot " "deal with non-contiguous arrays") n, = input_ary.shape if not n: return output_ary unit_size = self.scan_wg_size * self.scan_wg_seq_batches dev = driver.Context.get_device() max_groups = 3*dev.get_attribute( driver.device_attribute.MULTIPROCESSOR_COUNT) from pytools import uniform_interval_splitting interval_size, num_groups = uniform_interval_splitting( n, unit_size, max_groups); block_results = allocator(self.dtype.itemsize*num_groups) dummy_results = allocator(self.dtype.itemsize) # first level scan of interval (one interval per block) self.scan_intervals_knl.prepared_async_call( (num_groups, 1), (self.scan_wg_size, 1, 1), stream, input_ary.gpudata, n, interval_size, output_ary.gpudata, block_results) # second level inclusive scan of per-block results self.scan_intervals_knl.prepared_async_call( (1,1), (self.scan_wg_size, 1, 1), stream, block_results, num_groups, interval_size, block_results, dummy_results) # update intervals with result of second level scan self.final_update_knl.prepared_async_call( (num_groups, 1,), (self.update_wg_size, 1, 1), stream, output_ary.gpudata, n, interval_size, block_results) return output_ary class InclusiveScanKernel(_ScanKernelBase): final_update_tp = INCLUSIVE_UPDATE_SOURCE class ExclusiveScanKernel(_ScanKernelBase): final_update_tp = EXCLUSIVE_UPDATE_SOURCE pycuda-2013.1.1+git20140310/pycuda/characterize.py0000644000175000000500000000160112313360364017630 0ustar tomussrcfrom __future__ import division from pycuda.tools import context_dependent_memoize import numpy as np def platform_bits(): return tuple.__itemsize__ * 8 def has_stack(): from pycuda.driver import Context return Context.get_device().compute_capability() >= (2, 0) def has_double_support(): from pycuda.driver import Context return Context.get_device().compute_capability() >= (1, 3) @context_dependent_memoize def sizeof(type_name, preamble=""): from pycuda.compiler import SourceModule mod = SourceModule(""" %s extern "C" __global__ void write_size(size_t *output) { *output = sizeof(%s); } """ % (preamble, type_name), no_extern_c=True) import pycuda.gpuarray as gpuarray output = 
gpuarray.empty((), dtype=np.uintp) mod.get_function("write_size")(output, block=(1, 1, 1), grid=(1, 1)) return int(output.get()) pycuda-2013.1.1+git20140310/pycuda/compyte/0002755000175000000500000000000012313360366016300 5ustar tomussrcpycuda-2013.1.1+git20140310/pycuda/compyte/array.py0000644000175000000500000001247012313360366017772 0ustar tomussrcfrom __future__ import division __copyright__ = "Copyright (C) 2011 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import numpy as np def f_contiguous_strides(itemsize, shape): if shape: strides = [itemsize] for s in shape[:-1]: strides.append(strides[-1]*s) return tuple(strides) else: return () def c_contiguous_strides(itemsize, shape): if shape: strides = [itemsize] for s in shape[:0:-1]: strides.append(strides[-1]*s) return tuple(strides[::-1]) else: return () class ArrayFlags: def __init__(self, ary): self.f_contiguous = ( ary.strides == f_contiguous_strides( ary.dtype.itemsize, ary.shape)) self.c_contiguous = ( ary.strides == c_contiguous_strides( ary.dtype.itemsize, ary.shape)) self.forc = self.f_contiguous or self.c_contiguous def get_common_dtype(obj1, obj2, allow_double): # Yes, numpy behaves differently depending on whether # we're dealing with arrays or scalars. zero1 = np.zeros(1, dtype=obj1.dtype) try: zero2 = np.zeros(1, dtype=obj2.dtype) except AttributeError: zero2 = obj2 result = (zero1 + zero2).dtype if not allow_double: if result == np.float64: result = np.dtype(np.float32) elif result == np.complex128: result = np.dtype(np.complex64) return result def bound(a): high = a.bytes low = a.bytes for stri, shp in zip(a.strides, a.shape): if stri < 0: low += (stri)*(shp-1) else: high += (stri)*(shp-1) return low, high def may_share_memory(a, b): # When this is called with a an ndarray and b # a sparse matrix, numpy.may_share_memory fails. if a is b: return True if a.__class__ is b.__class__: a_l, a_h = bound(a) b_l, b_h = bound(b) if b_l >= a_h or a_l >= b_h: return False return True else: return False # {{{ as_strided implementation # stolen from numpy to be compatible with older versions of numpy class _DummyArray(object): """ Dummy object that just exists to hang __array_interface__ dictionaries and possibly keep alive a reference to a base array. """ def __init__(self, interface, base=None): self.__array_interface__ = interface self.base = base def as_strided(x, shape=None, strides=None): """ Make an ndarray from the given array with the given shape and strides. 
""" # work around Numpy bug 1873 (reported by Irwin Zaid) # Since this is stolen from numpy, this implementation has the same bug. # http://projects.scipy.org/numpy/ticket/1873 # == https://github.com/numpy/numpy/issues/2466 if not x.dtype.isbuiltin: if (shape is None or x.shape == shape) and \ (strides is None or x.strides == strides): return x if shape is None: shape = x.shape strides = tuple(strides) from pytools import product if strides is not None and shape is not None \ and product(shape) == product(x.shape) \ and x.flags.forc: # Workaround: If we're being asked to do what amounts to a # contiguous reshape, at least do that. if strides == f_contiguous_strides(x.dtype.itemsize, shape): # **dict is a workaround for Python 2.5 syntax. result = x.reshape(-1).reshape(*shape, **dict(order="F")) assert result.strides == strides return result elif strides == c_contiguous_strides(x.dtype.itemsize, shape): # **dict is a workaround for Python 2.5 syntax. result = x.reshape(-1).reshape(*shape, **dict(order="C")) assert result.strides == strides return result raise NotImplementedError( "as_strided won't work on non-builtin arrays for now. " "See https://github.com/numpy/numpy/issues/2466") interface = dict(x.__array_interface__) if shape is not None: interface['shape'] = tuple(shape) if strides is not None: interface['strides'] = tuple(strides) return np.asarray(_DummyArray(interface, base=x)) # }}} pycuda-2013.1.1+git20140310/pycuda/compyte/__init__.py0000644000175000000500000000000012313360366020375 0ustar tomussrcpycuda-2013.1.1+git20140310/pycuda/compyte/ndarray/0002755000175000000500000000000012313360366017740 5ustar tomussrcpycuda-2013.1.1+git20140310/pycuda/compyte/ndarray/pygpu_ndarray.h0000644000175000000500000000320112313360366022767 0ustar tomussrc#ifndef _PYGPU_NDARRAY_H #define _PYGPU_NDARRAY_H #ifndef OFFSET #define OFFSET 0 #endif //#include //#include #include #include #include "pygpu_ndarray_object.h" #include "gpu_ndarray.h" #include "pygpu_language.h" /* * Return a PyGpuNdArray whose 'nd' dimensions are all 0. * if nd==-1, it is not initialized. */ PyObject * PyGpuNdArray_New(int nd=-1); /** * Return 1 for a PyGpuNdArrayObject otw 0 */ int PyGpuNdArray_Check(const PyObject * ob); /** * Return 1 for a PyGpuNdArrayObject otw 0 */ int PyGpuNdArray_CheckExact(const PyObject * ob); /** * Transfer the contents of numpy array `obj` to `self`. * * self is reallocated to have the correct dimensions if necessary. */ int PyGpuNdArray_CopyFromArray(PyGpuNdArrayObject * self, PyArrayObject*obj); static int PyGpuNdArray_add_offset(PyGpuNdArrayObject * self, int offset); static int PyGpuNdArray_set_data(PyGpuNdArrayObject * self, char * data, PyObject * base, int offset=0); static PyObject * PyGpuNdArray_Subscript(PyObject * py_self, PyObject * key); static PyObject * PyGpuNdArray_Copy(PyGpuNdArrayObject * self, NPY_ORDER order=NPY_CORDER); static PyObject * PyGpuNdArray_Zeros(int nd, npy_intp* dims, PyArray_Descr* dtype, int fortran); static PyObject * PyGpuNdArray_Empty(int nd, npy_intp* dims, PyArray_Descr* dtype, int fortran); #endif /* Local Variables: mode:c++ c-basic-offset:4 c-file-style:"stroustrup" c-file-offsets:((innamespace . 0)(inline-open . 
0)) indent-tabs-mode:nil fill-column:79 End: */ // vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 : pycuda-2013.1.1+git20140310/pycuda/compyte/ndarray/__init__.py0000644000175000000500000000000012313360366022035 0ustar tomussrcpycuda-2013.1.1+git20140310/pycuda/compyte/ndarray/pygpu_language_cuda.cu0000644000175000000500000007121312313360366024276 0ustar tomussrc#include #include #include #ifdef __DEVICE_EMULATION__ #define NUM_VECTOR_OP_BLOCKS 4096 #define NUM_VECTOR_OP_THREADS_PER_BLOCK 1 //This prevents printf from getting tangled up #else #define NUM_VECTOR_OP_BLOCKS 4096 //Max number of blocks to launch. Should be read from device properties. (#10) #define NUM_VECTOR_OP_THREADS_PER_BLOCK 256 //Should be read from device properties. (#10) #endif #if 0 // Do not wait after every kernel & transfer. #define CNDA_THREAD_SYNC #else // This is useful for using normal profiling tools #define CNDA_THREAD_SYNC cudaThreadSynchronize(); #endif #ifndef SHARED_SIZE #define SHARED_SIZE (16*1024) #endif char * cublasGetErrorString(cublasStatus err) { if (err == CUBLAS_STATUS_NOT_INITIALIZED) { return "CUBLAS_STATUS_NOT_INITIALIZED"; } else if (err == CUBLAS_STATUS_ALLOC_FAILED){ return "CUBLAS_STATUS_ALLOC_FAILED"; } else if (err == CUBLAS_STATUS_INVALID_VALUE){ return "CUBLAS_STATUS_INVALID_VALUE"; } else if (err == CUBLAS_STATUS_MAPPING_ERROR){ return "CUBLAS_STATUS_MAPPING_ERROR"; } else if (err == CUBLAS_STATUS_EXECUTION_FAILED){ return "CUBLAS_STATUS_EXECUTION_FAILED"; } else if (err == CUBLAS_STATUS_INTERNAL_ERROR){ return "CUBLAS_STATUS_INTERNAL_ERROR"; } else { return "UNKNOW ERROR"; } } ///////////////////////// // Alloc and Free ///////////////////////// void * device_malloc(size_t size) { void * rval=NULL; cudaError_t err = cudaMalloc(&rval, size); if (cudaSuccess != err){ #if COMPUTE_GPU_MEM_USED fprintf(stderr, "Error allocating %li bytes of device memory (%s). 
%d already allocated\n", (long)size, cudaGetErrorString(err),_allocated_size); #else fprintf(stderr, "Error allocating %li bytes of device memory (%s).\n", (long)size, cudaGetErrorString(err)); #endif PyErr_Format(PyExc_MemoryError, "Error allocating %li bytes of device memory (%s).", (long)size, cudaGetErrorString(err)); return NULL; } _outstanding_mallocs[0] += (rval != NULL); #if COMPUTE_GPU_MEM_USED for(int i=0;i __device__ T unary_copy(T a) { return a; } decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_float, unary_copy, float) decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_double, unary_copy, double) decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_uint8, unary_copy, uint8_t) decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_int8, unary_copy, int8_t) decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_uint16, unary_copy, uint16_t) decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_int16, unary_copy, int16_t) decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_uint32, unary_copy, uint32_t) decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_int32, unary_copy, int32_t) decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_uint64, unary_copy, uint64_t) decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_int64, unary_copy, int64_t) decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_complex64, unary_copy, npy_complex64) decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_complex128, unary_copy, npy_complex128) //template __device__ T unary_exp(T a) { return exp(a); } //decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_exp, unary_exp) template static __global__ void k_copy_1d(const int N, const T * x, const ssize_t sx, T * y, const ssize_t sy) { for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += gridDim.x*blockDim.x) { y[i*sy] = x[i*sx]; } } //copy from other into self //don't allocated memory int PyGpuNdArray_CopyFromPyGpuNdArray(PyGpuNdArrayObject * self, PyGpuNdArrayObject * other, bool unbroadcast) { DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray start nd=%d\n", PyGpuNdArray_NDIM(self)); assert(PyGpuNdArray_TYPE(self) == PyGpuNdArray_TYPE(other)); assert(PyGpuNdArray_ISWRITEABLE(self)); //standard elemwise size checks if (PyGpuNdArray_NDIM(self) == -1) { PyErr_SetString(PyExc_TypeError, "can't copy into un-initialized PyGpuNdArrayObject"); return -1; } if (PyGpuNdArray_NDIM(self) != PyGpuNdArray_NDIM(other)) { PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: need same number of dims. destination nd=%d, source nd=%d. No broadcasting implemented.", PyGpuNdArray_NDIM(self), PyGpuNdArray_NDIM(other)); return -1; } //standard elemwise dim checks (also compute total size) unsigned int size = 1; unsigned int size_source = 1; for (int i = 0; i< PyGpuNdArray_NDIM(self); ++i) { if ((PyGpuNdArray_DIMS(self)[i] != PyGpuNdArray_DIMS(other)[i]) && (1!=PyGpuNdArray_DIMS(other)[i] || !unbroadcast) ) { PyErr_Format(PyExc_ValueError, "need same dimensions for dim %d, destination=%ld, source=%ld", i, PyGpuNdArray_DIMS(self)[i], PyGpuNdArray_DIMS(other)[i]); return -1; } size *= (unsigned int) PyGpuNdArray_DIMS(self)[i]; size_source *= (unsigned int) PyGpuNdArray_DIMS(other)[i]; } if (0 == size) { return 0; //nothing to copy, we're done. 
} //cublas don't support negative stride bool pos_stride = true; for (int i = 0; i < PyGpuNdArray_NDIM(other); ++i) if (PyGpuNdArray_STRIDE(other,i)<0) pos_stride = false; void * other_data = PyGpuNdArray_DATA(other) + PyGpuNdArray_OFFSET(other); void * self_data = PyGpuNdArray_DATA(self) + PyGpuNdArray_OFFSET(self); //Try to transfer with cublas(we suppose it is faster) if (PyGpuNdArray_ISCONTIGUOUS(self) && PyGpuNdArray_ISCONTIGUOUS(other) && size == size_source && PyGpuNdArray_TYPE(self) == NPY_FLOAT32 && pos_stride ) { cublasScopy(size, (float*) other_data, 1, (float*) self_data, 1); CNDA_THREAD_SYNC; if (CUBLAS_STATUS_SUCCESS != cublasGetError()) { PyErr_SetString(PyExc_RuntimeError, "Error copying memory"); return -1; } DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray: cublasScopy end\n"); return 0; } if (PyGpuNdArray_ISCONTIGUOUS(self) && PyGpuNdArray_ISCONTIGUOUS(other) && size == size_source && PyGpuNdArray_TYPE(self) == NPY_FLOAT64 && pos_stride) { cublasDcopy(size, (double*) other_data, 1, (double*) self_data, 1); CNDA_THREAD_SYNC; if (CUBLAS_STATUS_SUCCESS != cublasGetError()) { PyErr_SetString(PyExc_RuntimeError, "Error copying memory"); return -1; } DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray cublasDcopy end\n"); return 0; } //TODO: rewrite these copy operations to be more efficient // See, for example the transpose example in the cuda_sdk. switch (PyGpuNdArray_NDIM(self)) { case 0: // scalar { // THIS CASE SHOULD NEVER HAPPEN BECAUSE SCALARS ARE ALWAYS C CONTIGUOUS assert(0); }; break; case 1: // vector { assert(PyGpuNdArray_ISALIGNED(self)); assert(PyGpuNdArray_ISALIGNED(other)); DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray: Copying non-contiguous vector\n"); unsigned int n_blocks = min(size, (unsigned int)NUM_VECTOR_OP_BLOCKS); unsigned int n_threads = min(ceil_intdiv(size, n_blocks), (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); if (PyGpuNdArray_TYPE(self) == NPY_FLOAT32) { const int elsize = sizeof(float); k_copy_1d<<>>(size, (float*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (float*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else if (PyGpuNdArray_TYPE(self) == NPY_FLOAT64) { const int elsize = sizeof(double); k_copy_1d<<>>(size, (double*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (double*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else if (PyGpuNdArray_TYPE(self) == NPY_INT8) { const int elsize = sizeof(int8_t); k_copy_1d<<>>(size, (int8_t*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (int8_t*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else if (PyGpuNdArray_TYPE(self) == NPY_INT16) { const int elsize = sizeof(int16_t); k_copy_1d<<>>(size, (int16_t*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (int16_t*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else if (PyGpuNdArray_TYPE(self) == NPY_INT32) { const int elsize = sizeof(int32_t); k_copy_1d<<>>(size, (int32_t*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (int32_t*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else if (PyGpuNdArray_TYPE(self) == NPY_INT64) { const int elsize = sizeof(int64_t); k_copy_1d<<>>(size, (int64_t*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (int64_t*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else if (PyGpuNdArray_TYPE(self) == NPY_UINT8) { const int elsize = sizeof(uint8_t); k_copy_1d<<>>(size, (uint8_t*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (uint8_t*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else if (PyGpuNdArray_TYPE(self) == NPY_UINT16) { const int elsize = sizeof(uint16_t); 
k_copy_1d<<>>(size, (uint16_t*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (uint16_t*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else if (PyGpuNdArray_TYPE(self) == NPY_UINT32) { const int elsize = sizeof(uint32_t); k_copy_1d<<>>(size, (uint32_t*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (uint32_t*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else if (PyGpuNdArray_TYPE(self) == NPY_UINT64) { const int elsize = sizeof(uint64_t); k_copy_1d<<>>(size, (uint64_t*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (uint64_t*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else if (PyGpuNdArray_TYPE(self) == NPY_COMPLEX64) { const int elsize = sizeof(npy_complex64); k_copy_1d<<>>(size, (npy_complex64*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (npy_complex64*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else if (PyGpuNdArray_TYPE(self) == NPY_COMPLEX128) { const int elsize = sizeof(npy_complex128); k_copy_1d<<>>(size, (npy_complex128*)other_data, PyGpuNdArray_STRIDES(other)[0]/elsize, (npy_complex128*)self_data, PyGpuNdArray_STRIDES(self)[0]/elsize); } else { PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: Don't implement copy for this dtype\n"); return -1; } CNDA_THREAD_SYNC; cudaError_t err = cudaGetLastError(); if( cudaSuccess != err) { PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s. (n_blocks=%i, n_threads_per_block=%i)\n", "k_copy_1d", cudaGetErrorString(err), n_blocks, n_threads); return -1; } }; break; default: { assert (cudaSuccess == cudaGetLastError()); assert(PyGpuNdArray_ISALIGNED(self)); assert(PyGpuNdArray_ISALIGNED(other)); DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray: Copying with default version unbroadcast=%d\n", unbroadcast); // Identigy the dim of the output memory. 
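            // General n-d fall-back: pack the output dims plus the source and
            // destination strides into one device buffer of 3*ndim ssize_t
            // entries, then launch the per-dtype element-wise row-major copy
            // kernel over that metadata.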
            PyGpuNdArrayObject * cuda_dims = other;
            if(unbroadcast)
                cuda_dims = self;
            // Move the dims and strides information to GPU memory
            int ndim = PyGpuNdArray_NDIM(other);
            void * strides_dev = device_malloc(sizeof(ssize_t)*ndim*3);
            ssize_t * strides_dev_p = (ssize_t *) strides_dev;
            cudaError_t err = cudaMemcpy(strides_dev, PyGpuNdArray_DIMS(cuda_dims), ndim*sizeof(ssize_t),cudaMemcpyHostToDevice);
            if (err != cudaSuccess){
                PyErr_Format(PyExc_RuntimeError, "Cuda error when copying memory1: %s", cudaGetErrorString(err));
                return -1;
            }
            err = cudaMemcpy((void*)(strides_dev_p+ndim), PyGpuNdArray_STRIDES(other), ndim*sizeof(ssize_t),cudaMemcpyHostToDevice);
            if (err != cudaSuccess){
                PyErr_Format(PyExc_RuntimeError, "Cuda error when copying memory2: %s", cudaGetErrorString(err));
                return -1;
            }
            err = cudaMemcpy((void*)(strides_dev_p+(ndim*2)), PyGpuNdArray_STRIDES(self), ndim*sizeof(ssize_t), cudaMemcpyHostToDevice);
            if (err != cudaSuccess){
                PyErr_Format(PyExc_RuntimeError, "Cuda error when copying memory3: %s", cudaGetErrorString(err));
                return -1;
            }
            void * strides_host = malloc(sizeof(ssize_t)*ndim*3);
            err = cudaMemcpy(strides_host, strides_dev, ndim*3*sizeof(ssize_t),cudaMemcpyDeviceToHost);
            if (err != cudaSuccess){
                PyErr_Format(PyExc_RuntimeError, "Cuda error when copying memory4: %s", cudaGetErrorString(err));
                return -1;
            }
#ifdef DEBUG
            for(int i=0;i<3*ndim;i++)
                DPRINTF(" %ld", ((ssize_t *)strides_host)[i]);
            DPRINTF("\n");
#endif
            CNDA_THREAD_SYNC;
            if(cudaSuccess != cudaGetLastError()){
                PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: error before copy\n");
                return -1;
            }

            // call worker routine
            unsigned int n_blocks = min(size, (unsigned int)NUM_VECTOR_OP_BLOCKS);
            unsigned int threads_per_block = min(ceil_intdiv(size, n_blocks), (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
            if ( PyGpuNdArray_TYPE(self) == NPY_FLOAT32) {
                k_elemwise_unary_rowmajor_copy_float<<<n_blocks, threads_per_block>>>(
                    size, (unsigned int)ndim, strides_dev_p,
                    (const float*)other_data, strides_dev_p+ndim,
                    (float*) self_data, strides_dev_p+(ndim*2));
            } else if ( PyGpuNdArray_TYPE(self) == NPY_FLOAT64) {
                k_elemwise_unary_rowmajor_copy_double<<<n_blocks, threads_per_block>>>(
                    size, (unsigned int)ndim, strides_dev_p,
                    (const double*)other_data, strides_dev_p+ndim,
                    (double*) self_data, strides_dev_p+(ndim*2));
            } else if ( PyGpuNdArray_TYPE(self) == NPY_INT8) {
                k_elemwise_unary_rowmajor_copy_int8<<<n_blocks, threads_per_block>>>(
                    size, (unsigned int)ndim, strides_dev_p,
                    (const int8_t*)other_data, strides_dev_p+ndim,
                    (int8_t*) self_data, strides_dev_p+(ndim*2));
            } else if ( PyGpuNdArray_TYPE(self) == NPY_INT16) {
                k_elemwise_unary_rowmajor_copy_int16<<<n_blocks, threads_per_block>>>(
                    size, (unsigned int)ndim, strides_dev_p,
                    (const int16_t*)other_data, strides_dev_p+ndim,
                    (int16_t*) self_data, strides_dev_p+(ndim*2));
            } else if ( PyGpuNdArray_TYPE(self) == NPY_INT32) {
                k_elemwise_unary_rowmajor_copy_int32<<<n_blocks, threads_per_block>>>(
                    size, (unsigned int)ndim, strides_dev_p,
                    (const int32_t*)other_data, strides_dev_p+ndim,
                    (int32_t*) self_data, strides_dev_p+(ndim*2));
            } else if ( PyGpuNdArray_TYPE(self) == NPY_INT64) {
                k_elemwise_unary_rowmajor_copy_int64<<<n_blocks, threads_per_block>>>(
                    size, (unsigned int)ndim, strides_dev_p,
                    (const int64_t*)other_data, strides_dev_p+ndim,
                    (int64_t*) self_data, strides_dev_p+(ndim*2));
            } else if ( PyGpuNdArray_TYPE(self) == NPY_UINT8) {
                k_elemwise_unary_rowmajor_copy_uint8<<<n_blocks, threads_per_block>>>(
                    size, (unsigned int)ndim, strides_dev_p,
                    (const uint8_t*)other_data, strides_dev_p+ndim,
                    (uint8_t*) self_data, strides_dev_p+(ndim*2));
            } else if ( PyGpuNdArray_TYPE(self) == NPY_UINT16) {
                k_elemwise_unary_rowmajor_copy_uint16<<<n_blocks, threads_per_block>>>(
                    size, (unsigned int)ndim, strides_dev_p,
                    (const uint16_t*)other_data, strides_dev_p+ndim,
                    (uint16_t*) self_data, strides_dev_p+(ndim*2));
            } else if ( PyGpuNdArray_TYPE(self) == NPY_UINT32) {
                k_elemwise_unary_rowmajor_copy_uint32<<<n_blocks, threads_per_block>>>(
                    size, (unsigned int)ndim, strides_dev_p,
                    (const uint32_t*)other_data, strides_dev_p+ndim,
                    (uint32_t*) self_data, strides_dev_p+(ndim*2));
            } else if ( PyGpuNdArray_TYPE(self) == NPY_UINT64) {
                k_elemwise_unary_rowmajor_copy_uint64<<<n_blocks, threads_per_block>>>(
                    size, (unsigned int)ndim, strides_dev_p,
                    (const uint64_t*)other_data, strides_dev_p+ndim,
                    (uint64_t*) self_data, strides_dev_p+(ndim*2));
            } else if ( PyGpuNdArray_TYPE(self) == NPY_COMPLEX64) {
                k_elemwise_unary_rowmajor_copy_complex64<<<n_blocks, threads_per_block>>>(
                    size, (unsigned int)ndim, strides_dev_p,
                    (const npy_complex64*)other_data, strides_dev_p+ndim,
                    (npy_complex64*) self_data, strides_dev_p+(ndim*2));
            } else if ( PyGpuNdArray_TYPE(self) == NPY_COMPLEX128) {
                k_elemwise_unary_rowmajor_copy_complex128<<<n_blocks, threads_per_block>>>(
                    size, (unsigned int)ndim, strides_dev_p,
                    (const npy_complex128*)other_data, strides_dev_p+ndim,
                    (npy_complex128*) self_data, strides_dev_p+(ndim*2));
            } else {
                PyErr_Format(PyExc_NotImplementedError,
                             "PyGpuNdArray_CopyFromPyGpuNdArray: copy not implemented for this dtype\n");
                return -1;
            }
            CNDA_THREAD_SYNC;
            err = cudaGetLastError();
            if( cudaSuccess != err) {
                PyErr_Format(PyExc_RuntimeError,
                             "Cuda error: %s: %s. (n_blocks=%i, n_threads_per_block=%i)\n",
                             "k_elemwise_unary_rowmajor_copy",
                             cudaGetErrorString(err), n_blocks, threads_per_block);
                return -1;
            }
            device_free(strides_dev);
            free(strides_host);
        }
    };

    // Set flags
    if (false && PyGpuNdArray_NDIM(self) == 0) {
        //Numpy 1.4.1 is not consistent here
        //When we create a new numpy ndarray of 0 dim, it is not f contiguous
        //But when we take a subtensor that is of 0 dim, it is f contiguous!
        //We match numpy's behaviour for now...
        PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS;
        PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS;
    } else {
        if (PyGpuNdArray_is_c_contiguous(self)) {
            PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS;
        } else {
            PyGpuNdArray_FLAGS(self) &= ~NPY_C_CONTIGUOUS;
        }
        if (PyGpuNdArray_is_f_contiguous(self)) {
            PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS;
        } else {
            PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS;
        }
    }

    DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray end\n");
    return 0;
}

int PyGpuMemcpy(void * dst, const void * src, int dev_offset, size_t bytes,
                PyGpuTransfert direction){
    DPRINTF("PyGpuMemcpy: start\n");
    cudaMemcpyKind dir;
    const char * ssrc;
    const char * ddst;
    if (direction == PyGpuDeviceToHost){
        dir = cudaMemcpyDeviceToHost;
        ssrc = (char*)src+dev_offset;
        ddst = (char*)dst;
    } else if (direction == PyGpuHostToDevice) {
        dir = cudaMemcpyHostToDevice;
        ssrc = (char*)src;
        ddst = (char*)dst + dev_offset;
    } else {
        PyErr_Format(PyExc_ValueError,
                     "PyGpuMemcpy: Received wrong direction %d!\n",
                     direction);
        return -1;
    }
    cudaError_t err = cudaMemcpy((void*)ddst, (void*)ssrc, bytes, dir);
    CNDA_THREAD_SYNC;
    if (cudaSuccess != err) {
        PyErr_Format(PyExc_RuntimeError,
                     "PyGpuMemcpy: cudaMemcpy: error copying data (%s)",
                     cudaGetErrorString(err));
        return -1;
    }
    DPRINTF("PyGpuMemcpy: end\n");
    return 0;
}

int PyGpuMemset(void * dst, int data, size_t bytes){
    DPRINTF("PyGpuMemset: start\n");
    cudaError_t err = cudaMemset(dst, data, bytes);
    CNDA_THREAD_SYNC;
    if (cudaSuccess != err) {
        PyErr_Format(PyExc_MemoryError,
                     "PyGpuMemset: Error memsetting %ld bytes of device memory(%s).
%p", bytes, cudaGetErrorString(err), PyGpuNdArray_DATA(dst)); DPRINTF("PyGpuMemset: end error\n"); return -1; } DPRINTF("PyGpuMemset: end\n"); return 0; } /* Local Variables: mode:c++ c-basic-offset:4 c-file-style:"stroustrup" c-file-offsets:((innamespace . 0)(inline-open . 0)) indent-tabs-mode:nil fill-column:79 End: */ // vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 : pycuda-2013.1.1+git20140310/pycuda/compyte/ndarray/test_gpu_ndarray.py0000644000175000000500000004255312313360366023673 0ustar tomussrcimport copy import numpy import pygpu_ndarray as gpu_ndarray enable_double = True enable_double = False dtypes_all = ["float32", "int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64", "complex64", ] dtypes_no_complex = ["float32", "int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64", ] if enable_double: dtypes_all += ["float64", "complex128"] dtypes_no_complex += ["float64"] def check_flags(x, y): assert x.flags["C_CONTIGUOUS"] == y.flags["C_CONTIGUOUS"] assert x.flags["F_CONTIGUOUS"] == y.flags["F_CONTIGUOUS"] assert x.flags["WRITEABLE"] == y.flags["WRITEABLE"] assert x.flags["OWNDATA"] == y.flags["OWNDATA"] assert x.flags["ALIGNED"] == y.flags["ALIGNED"] assert x.flags["UPDATEIFCOPY"] == y.flags["UPDATEIFCOPY"] def check_meta(x, y): assert x.shape == y.shape assert x.dtype == y.dtype assert x.strides == y.strides check_flags(x, y) def check_all(x, y): check_meta(x, y) assert numpy.allclose(numpy.asarray(x), numpy.asarray(y)) def gen_gpu_nd_array(shape_orig, dtype='float32', offseted_outer=False, offseted_inner=False, sliced=1, order='c'): if sliced is True: sliced = 2 elif sliced is False: sliced = 1 shape = numpy.asarray(shape_orig).copy() if sliced != 1 and len(shape) > 0: shape[0] *= numpy.absolute(sliced) if offseted_outer and len(shape) > 0: shape[0] += 1 if offseted_inner and len(shape) > 0: shape[-1] += 1 a = numpy.random.rand(*shape) * 10 if dtype.startswith("u"): a = numpy.absolute(a) a = numpy.asarray(a, dtype=dtype) assert order in ['c', 'f'] if order == 'f' and len(shape) > 0: a = numpy.asfortranarray(a) b = gpu_ndarray.GpuNdArrayObject(a) if order == 'f' and len(shape) > 0 and b.size > 1: assert b.flags['F_CONTIGUOUS'] if offseted_outer and len(shape) > 0: b = b[1:] a = a[1:] assert b.offset != 0 if offseted_inner and len(shape) > 0: # The b[..., 1:] act as the test for this subtensor case. 
b = b[..., 1:] a = a[..., 1:] assert b.offset != 0 if sliced != 1 and len(shape) > 0: a = a[::sliced] b = b[::sliced] if False and shape_orig == (): assert a.shape == (1,) assert b.shape == (1,) else: assert a.shape == shape_orig, (a.shape, shape_orig) assert b.shape == shape_orig, (b.shape, shape_orig) assert numpy.allclose(a, numpy.asarray(b)) return a, b def product(*args, **kwds): # product('ABCD', 'xy') --> Ax Ay Bx By Cx Cy Dx Dy # product(range(2), repeat=3) --> 000 001 010 011 100 101 110 111 pools = map(tuple, args) * kwds.get('repeat', 1) result = [[]] for pool in pools: result = [x + [y] for x in result for y in pool] for prod in result: yield tuple(prod) def test_transfer(): for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: for offseted in [True, False]: a, b = gen_gpu_nd_array(shp, dtype, offseted) c = numpy.asarray(b) assert numpy.allclose(c, a) assert a.shape == b.shape == c.shape assert a.strides == b.strides == c.strides assert a.dtype == b.dtype == c.dtype == dtype assert c.flags.c_contiguous def test_transfer_not_contiguous(): """ Test transfer when the input on the CPU is not contiguous TODO: test when the input on the gpu is not contiguous """ for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: a = numpy.random.rand(*shp) * 10 a = a[::-1] b = gpu_ndarray.GpuNdArrayObject(a) c = numpy.asarray(b) assert numpy.allclose(c, a) assert a.shape == b.shape == c.shape # We copy a to a c contiguous array before the transfer assert (-a.strides[0],) + a.strides[1:] == b.strides == c.strides assert a.dtype == b.dtype == c.dtype assert c.flags.c_contiguous def test_transfer_fortran(): for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: a = numpy.random.rand(*shp) * 10 a_ = numpy.asfortranarray(a) if len(shp) > 1: assert a_.strides != a.strides a = a_ b = gpu_ndarray.GpuNdArrayObject(a) c = numpy.asarray(b) assert a.shape == b.shape == c.shape assert a.dtype == b.dtype == c.dtype assert a.flags.f_contiguous assert c.flags.f_contiguous assert a.strides == b.strides == c.strides assert numpy.allclose(c, a) def test_ascontiguousarray(): for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: for offseted_o in [True, False]: for offseted_i in [True, True]: for sliced in [1, 2, -1, -2]: for order in ['f', 'c']: #print shp, dtype, offseted_o, offseted_i, #print sliced, order cpu, gpu = gen_gpu_nd_array(shp, dtype, offseted_o, offseted_i, sliced, order) a = numpy.ascontiguousarray(cpu) b = gpu_ndarray.ascontiguousarray(gpu) # numpy upcast with a view to 1d scalar. if (sliced != 1 or shp == () or (offseted_i and len(shp) > 1)): assert b is not gpu if sliced == 1 and not offseted_i: assert ((a.data is cpu.data) == (b.bytes is gpu.bytes)) else: assert b is gpu assert a.shape == b.shape assert a.dtype == b.dtype assert a.flags.c_contiguous assert b.flags['C_CONTIGUOUS'] assert a.strides == b.strides assert numpy.allclose(cpu, a) assert numpy.allclose(cpu, b) def test_asfortranarray(): for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: for offseted_outer in [True, False]: for offseted_inner in [True, False]: for sliced in [1, 2, -1, -2]: for order in ['f', 'c']: #print shp, dtype, offseted_outer, offseted_inner, sliced, order cpu, gpu = gen_gpu_nd_array(shp, dtype, offseted_outer, offseted_inner, sliced, order) a = numpy.asfortranarray(cpu) b = gpu_ndarray.asfortranarray(gpu) # numpy upcast with a view to 1d scalar. 
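                            # gpu_ndarray.asfortranarray is expected to return a new
                            # object (not `gpu` itself) whenever the input cannot simply
                            # be reused as-is: sliced or outer-offset inputs, 0-d inputs,
                            # and multi-dimensional inputs that are not already in
                            # Fortran order.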
if (sliced != 1 or shp == () or (offseted_outer and len(shp) > 1) or (order != 'f' and len(shp) > 1)): assert b is not gpu if (sliced == 1 and not offseted_outer and order != 'c'): assert ((a.data is cpu.data) == (b.bytes is gpu.bytes)) else: assert b is gpu pass assert a.shape == b.shape assert a.dtype == b.dtype assert a.flags.f_contiguous if shp != (): assert b.flags['F_CONTIGUOUS'] assert a.strides == b.strides assert numpy.allclose(cpu, a) assert numpy.allclose(cpu, b) def test_zeros(): for shp in [(), (0,), (5,), (0, 0), (1, 0), (0, 1), (6, 7), (0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1), (4, 8, 9), (1, 8, 9)]: for order in ["C", "F"]: for dtype in dtypes_all: x = numpy.zeros(shp, dtype, order) y = gpu_ndarray.zeros(shp, dtype, order) check_all(x, y) x = gpu_ndarray.zeros(()) # no dtype and order param y = numpy.zeros(()) check_meta(x, y) try: gpu_ndarray.zeros() assert False except TypeError: pass def test_empty(): for shp in [(), (0,), (5,), (0, 0), (1, 0), (0, 1), (6, 7), (0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1), (4, 8, 9), (1, 8, 9)]: for order in ["C", "F"]: for dtype in dtypes_all: x = numpy.empty(shp, dtype, order) y = gpu_ndarray.empty(shp, dtype, order) check_meta(x, y) x = gpu_ndarray.empty(()) # no dtype and order param y = numpy.empty(()) check_meta(x, y) try: gpu_ndarray.empty() assert False except TypeError: pass def test_mapping_getitem_ellipsis(): for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: for offseted in [True, False]: a, a_gpu = gen_gpu_nd_array(shp, dtype, offseted) b = a_gpu[...] assert b.bytes == a_gpu.bytes assert b.strides == a.strides assert b.shape == a.shape b_cpu = numpy.asarray(b) assert numpy.allclose(a, b_cpu) def test_copy_view(): from ..array import may_share_memory def check_memory_region(a, a_op, b, b_op): assert numpy.may_share_memory(a, a_op) == may_share_memory(b, b_op) if a_op.base is None: assert b_op.base is None else: assert a_op.base is a if b.base: # We avoid having a series of object connected by base. # This is to don't bloc the garbage collection. assert b_op.base is b.base else: assert b_op.base is b for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: for offseted in [False, True]: # order1 is the order of the original data for order1 in ['c', 'f']: # order2 is the order wanted after copy for order2 in ['c', 'f']: print shp, dtype, offseted, order1, order2 #TODO test copy unbroadcast! 
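                    # For each combination we check that .copy(order), copy.copy,
                    # .view() and copy.deepcopy on the GPU array mirror numpy both in
                    # flags and in whether the result shares memory with its base
                    # (via check_memory_region below).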
a, b = gen_gpu_nd_array(shp, dtype, offseted, order=order1) assert numpy.allclose(a, numpy.asarray(b)) check_flags(a, b) c = b.copy(order2) assert numpy.allclose(a, numpy.asarray(c)) check_flags(c, a.copy(order2)) check_memory_region(a, a.copy(order2), b, c) d = copy.copy(b) assert numpy.allclose(a, numpy.asarray(d)) check_flags(d, copy.copy(a)) check_memory_region(a, copy.copy(a), b, d) e = b.view() assert numpy.allclose(a, numpy.asarray(e)) check_flags(e, a.view()) check_memory_region(a, a.view(), b, e) f = copy.deepcopy(b) assert numpy.allclose(a, numpy.asarray(f)) check_flags(f, copy.deepcopy(a)) check_memory_region(a, copy.deepcopy(a), b, f) g = copy.copy(b.view()) assert numpy.allclose(a, numpy.asarray(g)) check_memory_region(a, copy.copy(a.view()), b, g) check_flags(g, copy.copy(a.view())) def test_len(): for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: for dtype in dtypes_all: for offseted in [True, False]: a, a_gpu = gen_gpu_nd_array(shp, dtype, offseted) assert len(a_gpu) == shp[0] def test_mapping_getitem_w_int(): def _cmp(x, y): assert x.shape == y.shape assert x.dtype == y.dtype assert x.strides == y.strides assert x.flags["C_CONTIGUOUS"] == y.flags["C_CONTIGUOUS"] assert x.flags["F_CONTIGUOUS"] == y.flags["F_CONTIGUOUS"] if x.flags["WRITEABLE"] != y.flags["WRITEABLE"]: assert x.ndim == 0 assert not x.flags["OWNDATA"] assert y.flags["OWNDATA"] else: assert x.flags["WRITEABLE"] == y.flags["WRITEABLE"] assert x.flags["OWNDATA"] == y.flags["OWNDATA"] assert x.flags["ALIGNED"] == y.flags["ALIGNED"] assert x.flags["UPDATEIFCOPY"] == y.flags["UPDATEIFCOPY"] x = numpy.asarray(x) assert x.shape == y.shape assert x.dtype == y.dtype assert x.strides == y.strides if not numpy.all(x == y): print x print y assert numpy.all(x == y), (x, y) def _cmpNs(x, y): """ Don't compare the stride after the transfer There is a copy that have been made on the gpu before the transfer """ assert x.shape == y.shape assert x.dtype == y.dtype assert x.strides == y.strides assert x.flags["C_CONTIGUOUS"] == y.flags["C_CONTIGUOUS"] assert x.flags["F_CONTIGUOUS"] == y.flags["F_CONTIGUOUS"] assert x.flags["WRITEABLE"] == y.flags["WRITEABLE"] assert x.flags["ALIGNED"] == y.flags["ALIGNED"] assert x.flags["OWNDATA"] == y.flags["OWNDATA"] assert x.flags["UPDATEIFCOPY"] == y.flags["UPDATEIFCOPY"] x_ = numpy.asarray(x) assert x_.shape == y.shape assert x_.dtype == y.dtype if not numpy.all(x_ == y): print x_ print y assert numpy.all(x_ == y), (x_, y) pass def _cmpf(x, *y): try: x.__getitem__(y) except IndexError: pass else: raise Exception("Did not generate out or bound error") def _cmpfV(x, *y): try: if len(y) == 1: x.__getitem__(*y) else: x.__getitem__(y) except ValueError: pass else: raise Exception("Did not generate value error") for dtype in dtypes_all: for offseted in [True, False]: # test vector dim = (2,) a, _a = gen_gpu_nd_array(dim, dtype, offseted) import sys init_ref_count = sys.getrefcount(_a) _cmp(_a[...], a[...]) _cmp(_a[...], a[...]) _cmp(_a[...], a[...]) _cmp(_a[...], a[...]) _cmp(_a[...], a[...]) _cmp(_a[-1], a[-1]) _cmp(_a[1], a[1]) _cmp(_a[0], a[0]) _cmp(_a[::1], a[::1]) _cmpNs(_a[::-1], a[::-1]) _cmp(_a[...], a[...]) _cmpf(_a, 2) # test scalar dim = () a, _a = gen_gpu_nd_array(dim, dtype, offseted) _cmp(_a[...], a[...]) _cmpf(_a, 0) _cmpfV(_a, slice(1)) # test 4d-tensor dim = (5, 4, 3, 2) a, _a = gen_gpu_nd_array(dim, dtype, offseted) _cmpf(_a, slice(-1), slice(-1), 10, -10) _cmpf(_a, slice(-1), slice(-1), -10, slice(-1)) _cmpf(_a, 0, slice(0, -1, -20), -10) _cmpf(_a, 10) _cmpf(_a, (10, 0, 0, 0)) 
_cmpf(_a, -10) #test with integer _cmp(_a[1], a[1]) _cmp(_a[-1], a[-1]) _cmp(_a[numpy.int64(1)], a[numpy.int64(1)]) _cmp(_a[numpy.int64(-1)], a[numpy.int64(-1)]) #test with slice _cmp(_a[1:], a[1:]) _cmp(_a[1:2], a[1:2]) _cmp(_a[-1:1], a[-1:1]) #test with tuple (mix slice, integer, numpy.int64) _cmpNs(_a[0, 0, ::numpy.int64(-1), ::-1], a[0, 0, ::-1, ::-1]) _cmpNs(_a[:, :, ::numpy.int64(-1), ::-1], a[:, :, ::-1, ::-1]) _cmpNs(_a[:, :, numpy.int64(1), -1], a[:, :, 1, -1]) _cmpNs(_a[:, :, ::-1, ::-1], a[:, :, ::-1, ::-1]) _cmpNs(_a[:, :, ::-10, ::-10], a[:, :, ::-10, ::-10]) _cmpNs(_a[:, :, 1, -1], a[:, :, 1, -1]) _cmpNs(_a[:, :, -1, :], a[:, :, -1, :]) _cmpNs(_a[:, ::-2, -1, :], a[:, ::-2, -1, :]) _cmpNs(_a[:, ::-20, -1, :], a[:, ::-20, -1, :]) _cmpNs(_a[:, ::-2, -1], a[:, ::-2, -1]) _cmpNs(_a[0, ::-2, -1], a[0, ::-2, -1]) _cmp(_a[-1, -1, -1, -2], a[-1, -1, -1, -2]) #test ellipse _cmp(_a[...], a[...]) pycuda-2013.1.1+git20140310/pycuda/compyte/ndarray/gpu_ndarray.h0000644000175000000500000000175512313360366022432 0ustar tomussrc#ifndef _GPU_NDARRAY_H #define _GPU_NDARRAY_H typedef struct GpuNdArray{ char* data; //pointer to data element [0,..,0]. int offset; int nd; //the number of dimensions of the tensor /** * base: * either NULL or a pointer to a fellow CudaNdarray into which this one is viewing. * This pointer is never followed, except during Py_DECREF when we do not need it any longer. */ void * base; ssize_t * dimensions; //dim0, dim1, ... dim nd ssize_t * strides; //stride0, stride1, ... stride nd int flags; // Flags, see numpy flags //DTYPE dtype; // fine for numeric types //DtypeMeta * dtype_meta; // reserved for future use. //PyArray_Descr *descr; /* Pointer to type structure */ } GpuNdArray; #endif /* Local Variables: mode:c++ c-basic-offset:4 c-file-style:"stroustrup" c-file-offsets:((innamespace . 0)(inline-open . 0)) indent-tabs-mode:nil fill-column:79 End: */ // vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 : pycuda-2013.1.1+git20140310/pycuda/compyte/ndarray/pygpu_ndarray.cpp0000644000175000000500000014450512313360366023337 0ustar tomussrc#include #include #include #include #include "pygpu_ndarray.h" #include "pygpu_language.h" ///////////////////////// // Static helper methods ///////////////////////// static void PyGpuNdArray_null_init(PyGpuNdArrayObject *self) { DPRINTF("PyGpuNdArrayObject_null_init\n"); PyGpuNdArray_DATA(self) = NULL; PyGpuNdArray_OFFSET(self) = 0; PyGpuNdArray_NDIM(self) = -1; self->base = NULL; PyGpuNdArray_DIMS(self) = NULL; PyGpuNdArray_STRIDES(self) = NULL; PyGpuNdArray_FLAGS(self) = NPY_DEFAULT; self->descr = NULL; self->data_allocated = 0; } ///////////////////////////// // Satisfying reqs to be Type ///////////////////////////// //DON'T use directly(if their is other PyGpuNdArrayObject that point to it, it will cause problem)! use Py_DECREF() instead static void PyGpuNdArrayObject_dealloc(PyGpuNdArrayObject* self) { DPRINTF("PyGpuNdArrayObject_dealloc\n"); DPRINTF("PyGpuNdArrayObject dealloc %p %d %p\n", self, self->data_allocated, PyGpuNdArray_DATA(self)); if(self->ob_refcnt>1) printf("WARNING:PyGpuNdArrayObject_dealloc called when their is still active reference to it.\n"); if (self->data_allocated){ assert(PyGpuNdArray_DATA(self)); if (PyGpuNdArray_DATA(self)){ if (device_free(PyGpuNdArray_DATA(self))){ fprintf(stderr, "!!!! 
error freeing device memory %p (self=%p)\n", PyGpuNdArray_DATA(self), self); } PyGpuNdArray_DATA(self) = NULL; } } PyGpuNdArray_OFFSET(self) = 0; PyGpuNdArray_NDIM(self) = -1; Py_XDECREF(self->base); self->base = NULL; if (PyGpuNdArray_DIMS(self)){ free(PyGpuNdArray_DIMS(self)); PyGpuNdArray_DIMS(self) = NULL; } if (PyGpuNdArray_STRIDES(self)){ free(PyGpuNdArray_STRIDES(self)); PyGpuNdArray_STRIDES(self) = NULL; } PyGpuNdArray_FLAGS(self) = NPY_DEFAULT; //Py_XDECREF(self->descr);//TODO: How to handle the refcont on this object? self->descr = NULL; self->data_allocated = 0; self->ob_type->tp_free((PyObject*)self); --_outstanding_mallocs[1]; DPRINTF("device_malloc_counts: (device) %i (obj) %i\n", _outstanding_mallocs[0], _outstanding_mallocs[1]); DPRINTF("PyGpuNdArrayObject_dealloc end\n"); } static PyObject * PyGpuNdArray_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { DPRINTF("PyGpuNdArray_new\n"); PyGpuNdArrayObject *self; self = (PyGpuNdArrayObject *)type->tp_alloc(type, 0); if (self != NULL){ PyGpuNdArray_null_init(self); ++_outstanding_mallocs[1]; } DPRINTF("PyGpuNdArray_new end %p\n", self); return (PyObject *)self; } static int PyGpuNdArray_init(PyGpuNdArrayObject *self, PyObject *args, PyObject *kwds) { DPRINTF("PyGpuNdArray_init\n"); PyObject *arr=NULL; if (! PyArg_ParseTuple(args, "O", &arr)) return -1; if (! PyArray_Check(arr)){ PyErr_SetString(PyExc_TypeError, "PyGpuNdArrayObject_init: PyArray or PyGpuNdArrayObject arg required"); return -1; } // TODO: We must create a new copy of the PyArray_Descr(or this only increment the refcount?) or still the reference? PyArray_Descr * type = PyArray_DescrFromType(PyArray_TYPE(arr)); self->descr = type; Py_XINCREF(self->descr);//TODO: How to handle the refcont on this object? int rval = PyGpuNdArray_CopyFromArray(self, (PyArrayObject*)arr); DPRINTF("PyGpuNdArray_init: end %p type=%p\n", self, self->descr); return rval; } int PyGpuNdArray_CopyFromArray(PyGpuNdArrayObject * self, PyArrayObject*obj) { DPRINTF("PyGpuNdArray_CopyFromArray: start descr=%p\n", self->descr); //modif done to the new array won't be updated! assert(!PyGpuNdArray_CHKFLAGS(self, NPY_UPDATEIFCOPY)); //Aligned are not tested, so don't allow it for now assert(PyGpuNdArray_CHKFLAGS(self, NPY_ALIGNED)); int typenum = PyArray_TYPE(obj); PyObject * py_src = NULL; if (PyArray_ISONESEGMENT(obj)) { Py_INCREF(obj); py_src = (PyObject *) obj; }else{ py_src = PyArray_ContiguousFromAny((PyObject*)obj, typenum, PyArray_NDIM(obj), PyArray_NDIM(obj)); } DPRINTF("PyGpuNdArray_CopyFromArray: contiguous!\n"); if (!py_src) { return -1; } int err; if(PyArray_ISFORTRAN(obj) && ! PyArray_ISCONTIGUOUS(obj)){ DPRINTF("PyGpuNdArray_CopyFromArray: fortran!\n"); err = PyGpuNdArray_alloc_contiguous(self, obj->nd, obj->dimensions, NPY_FORTRANORDER); }else{ err = PyGpuNdArray_alloc_contiguous(self, obj->nd, obj->dimensions); } if (err) { return err; } //check that the flag are the same if (PyArray_ISCONTIGUOUS(py_src) != PyGpuNdArray_ISCONTIGUOUS(self) && PyArray_ISFORTRAN(obj) && 0) { PyErr_Format(PyExc_RuntimeError, "ISCONTIGUOUS %d %d\n", PyArray_ISCONTIGUOUS(py_src), PyGpuNdArray_ISCONTIGUOUS(self)); return -1; } assert(PyArray_ISCONTIGUOUS(py_src) == PyGpuNdArray_ISCONTIGUOUS(self) || PyArray_ISFORTRAN(obj)); assert(PyArray_ISFORTRAN(py_src) == PyGpuNdArray_ISFORTRAN(self)); assert(PyArray_ISALIGNED(py_src) == PyGpuNdArray_ISALIGNED(self)); // New memory, so we should own it. 
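    // The freshly allocated device buffer is contiguous, so the whole ndarray can
    // be pushed to the GPU with a single host-to-device transfer of
    // PyArray_SIZE(py_src) * PyArray_ITEMSIZE(py_src) bytes (PyGpuMemcpy below).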
assert(PyGpuNdArray_CHKFLAGS(self, NPY_OWNDATA)); // New memory, so it should be writable assert(PyGpuNdArray_ISWRITEABLE(self)); err = PyGpuMemcpy(PyGpuNdArray_DATA(self), PyArray_DATA(py_src), PyGpuNdArray_OFFSET(self), PyArray_SIZE(py_src) * PyArray_ITEMSIZE(py_src), PyGpuHostToDevice); if (err) { Py_DECREF(py_src); return -1; } Py_DECREF(py_src); DPRINTF("PyGpuNdArray_CopyFromArray: end\n"); return 0; } static PyObject * PyGpuNdArray_copy(PyObject * self, PyObject *args, PyObject *kargs) { DPRINTF("PyGpuNdArray_copy start\n"); static const char *kwlist[] = {"order", NULL}; NPY_ORDER order = PyArray_CORDER; if(!PyGpuNdArray_Check(self)){ PyErr_SetString(PyExc_ValueError, "PyGpuNdArray_copy: expected a PyGpuNdArrayObject."); return NULL; } DPRINTF("PyGpuNdArray_copy before parse inputs\n"); if (!PyArg_ParseTupleAndKeywords(args, kargs, "|O&", (char**)kwlist, PyArray_OrderConverter, &order)) { DPRINTF("PyGpuNdArray_copy start1.2\n"); return NULL; } DPRINTF("PyGpuNdArray_copy after parse inputs\n"); DPRINTF("PyGpuNdArray_copy before copy\n"); PyObject *ret = PyGpuNdArray_Copy((PyGpuNdArrayObject*)self, order); DPRINTF("PyGpuNdArray_copy end\n"); return ret; } static PyObject * PyGpuNdArray_Copy(PyGpuNdArrayObject * self, NPY_ORDER order) { DPRINTF("PyGpuNdArray_Copy start\n"); PyObject * rval = PyGpuNdArray_New(); //TODO find how to refcount descr. PyGpuNdArray_DESCR(rval) = PyGpuNdArray_DESCR(self); if ((!rval) || (-1 == PyGpuNdArray_NDIM(self))) { return rval; } if (PyGpuNdArray_alloc_contiguous((PyGpuNdArrayObject*)rval, PyGpuNdArray_NDIM(self), PyGpuNdArray_DIMS(self), order)) { Py_DECREF(rval); return NULL; } if (PyGpuNdArray_CopyFromPyGpuNdArray((PyGpuNdArrayObject*)rval, self)) { Py_DECREF(rval); return NULL; } if (order == NPY_F_CONTIGUOUS) PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; #ifdef DEBUG PyGpuNdArray_fprint(stderr, self); PyGpuNdArray_fprint(stderr, (PyGpuNdArrayObject *)rval); #endif DPRINTF("PyGpuNdArray_Copy end\n"); return rval; } PyObject * PyGpuNdArray_DeepCopy(PyGpuNdArrayObject * self, PyObject * memo) { assert(PyDict_Check(memo)); PyObject * selfkey = PyInt_FromLong((long)self); assert(selfkey); if (PyDict_Contains(memo, selfkey)) { PyObject * rval = PyDict_GetItem(memo, selfkey); Py_DECREF(selfkey); Py_XINCREF(rval); return rval; } else { DPRINTF("PyGpuNdArray_DeepCopy: startd deepcopy\n"); PyObject * rval = PyGpuNdArray_Copy(self); if (NULL == rval) { Py_DECREF(selfkey); return NULL; } DPRINTF("DeepCopy created %p\n", rval); DPRINTF("DeepCopy created %p %p\n", PyGpuNdArray_DESCR(rval), PyGpuNdArray_DATA(rval)); if (PyDict_SetItem(memo, selfkey, rval)) { Py_DECREF(rval); Py_DECREF(selfkey); return NULL; } Py_DECREF(selfkey); DPRINTF("PyGpuNdArray_DeepCopy: startd end\n"); return rval; } } PyObject * PyGpuNdArray_View(PyGpuNdArrayObject * self) { PyGpuNdArrayObject * rval = (PyGpuNdArrayObject*)PyGpuNdArray_New(PyGpuNdArray_NDIM(self)); if (!rval || PyGpuNdArray_set_data(rval, PyGpuNdArray_DATA(self), (PyObject *)self, PyGpuNdArray_OFFSET(self))) { Py_XDECREF(rval); DPRINTF("PyGpuNdArray_View: no rval or PyGpuNdArray_set_data " "failed: self=%p, rval=%p rval_base=%p\n", self, rval, rval->base); return NULL; } else { for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) { PyGpuNdArray_DIM(rval, i) = PyGpuNdArray_DIMS(self)[i]; PyGpuNdArray_STRIDE(rval, i) = PyGpuNdArray_STRIDES(self)[i]; } } DPRINTF("PyGpuNdArray_View: self=%p, self->base=%p" " rval=%p rval->base=%p\n", self, self->base, rval, rval->base); //TODO: find how to refcount on the descr! 
//Py_INCREF(PyGpuNdArray_DESCR(self)); PyGpuNdArray_DESCR(rval) = PyGpuNdArray_DESCR(self); PyGpuNdArray_FLAGS(rval) = PyGpuNdArray_FLAGS(self); PyGpuNdArray_FLAGS(rval) &= ~NPY_OWNDATA; return (PyObject*)rval; } //updated for offset PyObject * PyGpuNdArray_CreateArrayObj(PyGpuNdArrayObject * self) { DPRINTF("PyGpuNdArray_CreateArrayObj\n"); if(PyGpuNdArray_NDIM(self)>=0 && PyGpuNdArray_SIZE(self)==0){ npy_intp * npydims = (npy_intp*)malloc(PyGpuNdArray_NDIM(self) * sizeof(npy_intp)); assert (npydims); for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) npydims[i] = (npy_intp)(PyGpuNdArray_DIMS(self)[i]); // Numpy will do a decref on the description. Py_INCREF(PyGpuNdArray_DESCR(self)); // We can't use PyArray_{Empty,EMPTY} as they segfault when size == 0 PyObject * rval = PyArray_NewFromDescr(&PyArray_Type, PyGpuNdArray_DESCR(self), PyGpuNdArray_NDIM(self), npydims, NULL, NULL, 0, NULL); free(npydims); if (!rval){ return NULL; } assert (PyArray_ITEMSIZE(rval) == PyGpuNdArray_ITEMSIZE(self)); return rval; } if ((PyGpuNdArray_NDIM(self) < 0) || (PyGpuNdArray_DATA(self) == 0)) { PyErr_SetString(PyExc_ValueError, "can't copy from un-initialized PyGpuNdArray"); return NULL; } PyGpuNdArrayObject * contiguous_self = NULL; bool pos_stride = true; for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) if (PyGpuNdArray_STRIDE(self,i)<0) pos_stride = false; if (PyGpuNdArray_ISONESEGMENT(self) && pos_stride) { contiguous_self = self; Py_INCREF(contiguous_self); DPRINTF("PyGpuNdArray_CreateArrayObj: gpu array already contiguous %p\n", contiguous_self); //}else if(PyGpuNdArray_ISONESEGMENT(self)){ //TODO implement special object handling to speed up transfer // DPRINTF("CreateArrayObj one segment, with special handling %p\n", contiguous_self); //PyErr_SetString(PyExc_ValueError, "PyGpuNdArray_CreateArrayObj: Need PyGpuNdArray_Copy or some other nd array mandling to transfer contiguous bloc with negative stride."); //return NULL; } else { contiguous_self = (PyGpuNdArrayObject*)PyGpuNdArray_Copy(self); DPRINTF("CreateArrayObj created contiguous %p\n", contiguous_self); } if (!contiguous_self) { return NULL; } npy_intp * npydims = (npy_intp*)malloc(PyGpuNdArray_NDIM(self) * sizeof(npy_intp)); assert (npydims); for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) npydims[i] = (npy_intp)(PyGpuNdArray_DIMS(self)[i]); Py_INCREF(PyGpuNdArray_DESCR(self)); PyObject * rval = PyArray_Empty(PyGpuNdArray_NDIM(self), npydims, PyGpuNdArray_DESCR(self), PyGpuNdArray_ISFORTRAN(self)); free(npydims); if (!rval) { Py_DECREF(contiguous_self); return NULL; } int err = PyGpuMemcpy(PyArray_DATA(rval), PyGpuNdArray_DATA(contiguous_self), PyGpuNdArray_OFFSET(contiguous_self), PyArray_SIZE(rval) * PyArray_ITEMSIZE(rval), PyGpuDeviceToHost); if (err) { Py_DECREF(contiguous_self); Py_DECREF(rval); rval = NULL; } Py_DECREF(contiguous_self); return rval; } static PyObject * PyGpuNdArray_Empty(int nd, npy_intp* dims, PyArray_Descr* dtype, int fortran) { DPRINTF("PyGpuNdArray_Empty: start!\n"); PyGpuNdArrayObject* rval = (PyGpuNdArrayObject*)PyGpuNdArray_New(); PyGpuNdArray_DESCR(rval) = dtype; if (!rval) { DPRINTF("PyGpuNdArray_Empty: fail!\n"); return NULL; } NPY_ORDER order = NPY_CORDER; if (fortran!=0) order = NPY_FORTRANORDER; if (PyGpuNdArray_alloc_contiguous(rval, nd, dims, order)) { Py_DECREF(rval); return NULL; } DPRINTF("PyGpuNdArray_Empty: end!\n"); return (PyObject*) rval; } //DONE: dtype, offset not needed, flags static PyObject * PyGpuNdArray_Zeros(int nd, npy_intp* dims, PyArray_Descr* dtype, int fortran) { 
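    // zeros == empty + memset: allocate an uninitialized array of the requested
    // shape/dtype/order, then clear total_elements * dtype->elsize bytes on the
    // device with PyGpuMemset.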
DPRINTF("PyGpuNdArray_Zeros: start!\n"); PyObject * rval = PyGpuNdArray_Empty(nd, dims, dtype, fortran); if (!rval) { return rval; } int total_elements = 1; for(int i=0;ielsize; // Fill with zeros int err = PyGpuMemset(PyGpuNdArray_DATA(rval), 0, total_size); if (err) { Py_DECREF(rval); return NULL; } DPRINTF("PyGpuNdArray_Zeros: end!\n"); return (PyObject*) rval; } // declared as a static method (hence "dummy" is not used) // numpy.zeros(shape, dtype=float, order='C') static PyObject * PyGpuNdArray_zeros(PyObject* dummy, PyObject* args, PyObject *kargs) { static const char *kwlist[] = {"shape","dtype","order",NULL}; /* XXX ? */ PyArray_Descr *typecode = NULL; PyObject * shape = NULL; NPY_ORDER order = PyArray_CORDER; bool fortran = false; PyObject *ret = NULL; if (!PyArg_ParseTupleAndKeywords(args, kargs, "O|O&O&", (char**)kwlist, &shape, PyArray_DescrConverter, &typecode, PyArray_OrderConverter, &order)) { Py_XDECREF(typecode); Py_XDECREF(shape); return ret; } if (order == PyArray_FORTRANORDER) { fortran = true; } else { fortran = false; } if(!PySequence_Check(shape)) { PyErr_SetString(PyExc_TypeError, "shape argument must be a sequence"); return NULL; } if (!typecode) typecode = PyArray_DescrFromType(NPY_FLOAT64); int shplen = PySequence_Length(shape); if (shplen == 0) { return PyGpuNdArray_Zeros(0, NULL, typecode, fortran); } npy_intp* newdims = (npy_intp *)malloc(sizeof(npy_intp) * shplen); if (!newdims) { PyErr_SetString(PyExc_MemoryError, "PyGpuNdArray_Zeros: Failed to allocate temporary space"); return NULL; } // start from the end to compute strides for (int i = shplen-1; i >= 0; --i) { PyObject* shp_el_obj = PySequence_GetItem(shape, i); if(shp_el_obj == NULL) { // shouldn't happen since we checked length before... PyErr_SetString(PyExc_RuntimeError, "PyGpuNdArray_Zeros: Index out of bound in sequence"); free(newdims); return NULL; } int shp_el = PyInt_AsLong(shp_el_obj); Py_DECREF(shp_el_obj); newdims[i] = shp_el; } PyObject* rval = PyGpuNdArray_Zeros(shplen, newdims, typecode, fortran); free(newdims); return (PyObject*)rval; } // declared as a static method (hence "dummy" is not used) // numpy.empty(shape, dtype=float, order='C') static PyObject * PyGpuNdArray_empty(PyObject* dummy, PyObject* args, PyObject *kargs) { static const char *kwlist[] = {"shape","dtype","order",NULL}; /* XXX ? */ PyArray_Descr *typecode = NULL; PyObject * shape = NULL; NPY_ORDER order = PyArray_CORDER; bool fortran = false; PyObject *ret = NULL; if (!PyArg_ParseTupleAndKeywords(args, kargs, "O|O&O&", (char **)kwlist, &shape, PyArray_DescrConverter, &typecode, PyArray_OrderConverter, &order)) { Py_XDECREF(typecode); Py_XDECREF(shape); return ret; } if (order == PyArray_FORTRANORDER) { fortran = true; } else { fortran = false; } if(!PySequence_Check(shape)) { PyErr_SetString(PyExc_TypeError, "shape argument must be a sequence"); return NULL; } if (!typecode) typecode = PyArray_DescrFromType(NPY_FLOAT64); int shplen = PySequence_Length(shape); if (shplen == 0) { return PyGpuNdArray_Empty(0, NULL, typecode, fortran); } npy_intp* newdims = (npy_intp *)malloc(sizeof(npy_intp) * shplen); if (!newdims) { PyErr_SetString(PyExc_MemoryError, "PyGpuNdArray_empty: Failed to allocate temporary space"); return NULL; } // start from the end to compute strides for (int i = shplen-1; i >= 0; --i) { PyObject* shp_el_obj = PySequence_GetItem(shape, i); if(shp_el_obj == NULL) { // shouldn't happen since we checked length before... 
PyErr_SetString(PyExc_RuntimeError, "PyGpuNdArray_empty: Index out of bound in sequence"); free(newdims); return NULL; } int shp_el = PyInt_AsLong(shp_el_obj); Py_DECREF(shp_el_obj); newdims[i] = shp_el; } PyObject* rval = PyGpuNdArray_Empty(shplen, newdims, typecode, fortran); free(newdims); return (PyObject*)rval; } static PyMethodDef PyGpuNdArray_methods[] = { {"__array__", (PyCFunction)PyGpuNdArray_CreateArrayObj, METH_NOARGS, "Copy from the device to a numpy ndarray"}, {"copy", (PyCFunction)PyGpuNdArray_copy, METH_VARARGS|METH_KEYWORDS, "Create a deep copy of this object."}, {"view", (PyCFunction)PyGpuNdArray_View, METH_NOARGS, "Create a view of this object."}, {"__copy__", (PyCFunction)PyGpuNdArray_Copy, METH_NOARGS, "Create a copy of this object as numpy does. Why numpy do a copy of the data when the object is a view?"}, {"__deepcopy__", (PyCFunction)PyGpuNdArray_DeepCopy, METH_O, "Create a copy of this object"}, /* {"reduce_sum", (PyCFunction)PyGpuNdArray_ReduceSum, METH_O, "Reduce over the given dimensions by summation"}, {"exp", (PyCFunction)PyGpuNdArray_Exp, METH_NOARGS, "Return the exponential of all elements"}, {"reshape", (PyCFunction)PyGpuNdArray_Reshape, METH_O, "Return a reshaped view (or copy) of this ndarray\n\ The required argument is a tuple of integers specifying the shape of the new ndarray."}, {"_set_stride", (PyCFunction)PyGpuNdArray_SetStride, METH_VARARGS, "For integer arguments (i, s), set the 'i'th stride to 's'"}, {"_set_shape_i", (PyCFunction)PyGpuNdArray_SetShapeI, METH_VARARGS, "For integer arguments (i, s), set the 'i'th shape to 's'"}, */ {NULL, NULL, NULL, NULL} /* Sentinel */ }; //PyArray_CopyInto(PyArrayObject* dest, PyArrayObject* src)¶ //PyObject* PyArray_NewCopy(PyArrayObject* old, NPY_ORDER order)¶ static PyObject * PyGpuNdArray_get_shape(PyGpuNdArrayObject *self, void *closure) { DPRINTF("PyGpuNdArray_get_shape\n"); if (PyGpuNdArray_NDIM(self) < 0) { PyErr_SetString(PyExc_ValueError, "PyGpuNdArray not initialized"); return NULL; } PyObject * rval = PyTuple_New(PyGpuNdArray_NDIM(self)); for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) { if (!rval || PyTuple_SetItem(rval, i, PyInt_FromLong(PyGpuNdArray_DIMS(self)[i]))) { Py_XDECREF(rval); return NULL; } } return rval; } static int PyGpuNdArray_set_shape(PyGpuNdArrayObject *self, PyObject *value, void *closure) { PyErr_SetString(PyExc_NotImplementedError, "TODO: call reshape"); return -1; } static PyObject * PyGpuNdArray_get_strides(PyGpuNdArrayObject *self, void *closure) { if ( PyGpuNdArray_NDIM(self) < 0){ PyErr_SetString(PyExc_ValueError, "PyGpuNdArrayObject not initialized"); return NULL; } PyObject * rval = PyTuple_New( PyGpuNdArray_NDIM(self)); for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i){ if (!rval || PyTuple_SetItem(rval, i, PyInt_FromLong(PyGpuNdArray_STRIDES(self)[i]))){ Py_XDECREF(rval); return NULL; } } return rval; } static PyObject * PyGpuNdArray_get_data(PyGpuNdArrayObject *self, void *closure) { return PyInt_FromLong((long int) PyGpuNdArray_DATA(self)); } static PyObject * PyGpuNdArray_get_flags(PyGpuNdArrayObject *self, void *closure) { PyObject * dict = PyDict_New(); PyObject * str= PyString_FromString("C_CONTIGUOUS"); PyObject * i = PyBool_FromLong(PyGpuNdArray_ISCONTIGUOUS(self)); PyDict_SetItem(dict, str, i); Py_DECREF(str); Py_DECREF(i); str= PyString_FromString("F_CONTIGUOUS"); i = PyBool_FromLong(PyGpuNdArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)); PyDict_SetItem(dict, str, i); Py_DECREF(str); Py_DECREF(i); str= PyString_FromString("WRITEABLE"); i = 
PyBool_FromLong(PyGpuNdArray_ISWRITEABLE(self)); PyDict_SetItem(dict, str, i); Py_DECREF(str); Py_DECREF(i); str= PyString_FromString("ALIGNED"); i = PyBool_FromLong(PyGpuNdArray_ISALIGNED(self)); PyDict_SetItem(dict, str, i); Py_DECREF(str); Py_DECREF(i); str= PyString_FromString("UPDATEIFCOPY"); i = PyBool_FromLong(PyGpuNdArray_CHKFLAGS(self, NPY_UPDATEIFCOPY)); PyDict_SetItem(dict, str, i); Py_DECREF(str); Py_DECREF(i); str= PyString_FromString("OWNDATA"); i = PyBool_FromLong(PyGpuNdArray_CHKFLAGS(self, NPY_OWNDATA)); PyDict_SetItem(dict, str, i); Py_DECREF(str); Py_DECREF(i); return dict; } static PyObject * PyGpuNdArray_get_ndim(PyGpuNdArrayObject *self, void *closure) { return PyInt_FromLong((long int) PyGpuNdArray_NDIM(self)); } static PyObject * PyGpuNdArray_get_offset(PyGpuNdArrayObject *self, void *closure) { return PyInt_FromLong((long int) PyGpuNdArray_OFFSET(self)); } static PyObject * PyGpuNdArray_get_data_allocated(PyGpuNdArrayObject *self, void *closure) { return PyInt_FromLong((long int) self->data_allocated); } static PyObject * PyGpuNdArray_get_size(PyGpuNdArrayObject *self, void *closure) { return PyInt_FromLong((long int) PyGpuNdArray_SIZE(self)); } static PyObject * PyGpuNdArray_get_base(PyGpuNdArrayObject *self, void *closure) { if (!PyGpuNdArray_BASE(self)){ Py_INCREF(Py_None); return Py_None; } PyObject * ret = PyGpuNdArray_BASE(self); Py_INCREF(ret); return ret; } static PyObject * PyGpuNdArray_get_dtype(PyArrayObject *self) { Py_INCREF(PyGpuNdArray_DESCR(self)); PyObject * ret = (PyObject *)PyGpuNdArray_DESCR(self); return ret; } static PyObject * PyGpuNdArray_get_itemsize(PyArrayObject *self) { return (PyObject *)PyInt_FromLong(PyGpuNdArray_ITEMSIZE(self)); } static PyGetSetDef PyGpuNdArray_getset[] = { {(char*)"base", (getter)PyGpuNdArray_get_base, NULL, (char*)"Return the object stored in the base attribute", NULL}, {(char*)"bytes", (getter)PyGpuNdArray_get_data, NULL, (char*)"device data pointer", NULL}, {(char*)"shape", (getter)PyGpuNdArray_get_shape, (setter)PyGpuNdArray_set_shape, (char*)"shape of this ndarray (tuple)", NULL}, {(char*)"strides", (getter)PyGpuNdArray_get_strides, NULL,//(setter)PyGpuNdArray_set_strides, (char*)"data pointer strides (in elements)", NULL}, {(char*)"ndim", (getter)PyGpuNdArray_get_ndim, NULL, (char*)"The number of dimensions in this object", NULL}, {(char*)"offset", (getter)PyGpuNdArray_get_offset, NULL, (char*)"Return the offset value", NULL}, {(char*)"size", (getter)PyGpuNdArray_get_size, NULL, (char*)"The number of elements in this object.", NULL}, {(char*)"data_allocated", (getter)PyGpuNdArray_get_data_allocated, NULL, (char*)"The size of the allocated memory on the device.", NULL}, {(char*)"itemsize", (getter)PyGpuNdArray_get_itemsize, NULL, (char*)"The size of the base element.", NULL}, {(char*)"dtype", (getter)PyGpuNdArray_get_dtype, NULL, (char*)"The dtype of the element", NULL}, {(char*)"flags", (getter)PyGpuNdArray_get_flags, NULL, (char*)"Return the flags as a dictionary", NULL}, {NULL, NULL, NULL, NULL} /* Sentinel */ }; // Will by called by __len__ in Python static Py_ssize_t PyGpuNdArray_len(PyObject * py_self) { PyGpuNdArrayObject * self = (PyGpuNdArrayObject*) py_self; if (PyGpuNdArray_NDIM(self) <= 0) { return (Py_ssize_t) 0; } else { return (Py_ssize_t) PyGpuNdArray_DIMS(self)[0]; } } static int PyGpuNdArray_add_offset(PyGpuNdArrayObject * self, int offset) { DPRINTF("PyGpuNdArray_add_offset: %p %d\n", self, offset); #if OFFSET PyGpuNdArray_OFFSET(self) += offset; #else PyGpuNdArray_DATA(self) += offset; 
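    // Without OFFSET support the offset is folded directly into the data
    // pointer instead of being tracked in a separate field.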
#endif return 0; } static int PyGpuNdArray_set_data(PyGpuNdArrayObject * self, char * data, PyObject * base, int offset) { DPRINTF("PyGpuNdArray_set_data: %p %p %p %d\n", self, data, base, offset); if (self->data_allocated) { assert(PyGpuNdArray_DATA(self)); if (device_free(PyGpuNdArray_DATA(self))) { PyGpuNdArray_DATA(self) = NULL; self->data_allocated = 0; DPRINTF("PyGpuNdArray_set_data: device_free failed!\n"); PyErr_SetString(PyExc_ValueError, "PyGpuNdArray_set_data: device_free failed"); return -1; } } // Get the original base object (base.base.base...) // TODO: check that base is indeed a CudaNdarray? PyObject * orig_base = base; // base is not always a PyGpuNdArrayObject. It can be a GpuArray from pycuda, ... while (orig_base && PyGpuNdArray_Check(orig_base) && ((PyGpuNdArrayObject*) orig_base)->base) { // base_base is itself a view orig_base = ((PyGpuNdArrayObject*) orig_base)->base; } //N.B. XDECREF and XINCREF are no-ops for NULL pointers if (PyGpuNdArray_BASE(self) != orig_base) { Py_XDECREF(PyGpuNdArray_BASE(self)); PyGpuNdArray_BASE(self) = orig_base; Py_XINCREF(PyGpuNdArray_BASE(self)); } self->data_allocated = 0; #if OFFSET PyGpuNdArray_DATA(self) = data; PyGpuNdArray_OFFSET(self) = offset; #else PyGpuNdArray_DATA(self) = data + offset; #endif return 0; } // Will by called by __getitem__ in Python static PyObject * PyGpuNdArray_Subscript(PyObject * py_self, PyObject * key) { DPRINTF("Subscript start\n"); PyGpuNdArrayObject * self = (PyGpuNdArrayObject*) py_self; PyObject * py_rval = NULL; PyGpuNdArrayObject * rval = NULL; PyObject * intobj = NULL; //PyObject_Print(key, stderr, 0); if (key == Py_Ellipsis) { DPRINTF("Subscript with ellipse \n"); Py_INCREF(py_self); DPRINTF("Subscript with ellipse end\n"); return py_self; } if ((intobj=PyNumber_Int(key))) //INDEXING BY INTEGER { #ifdef DEBUG PyGpuNdArray_fprint(stderr, self); #endif DPRINTF("Subscript with int \n"); int d_idx = PyInt_AsLong(intobj); Py_DECREF(intobj); intobj=NULL; DPRINTF("Subscript with int 1\n"); if (PyGpuNdArray_NDIM(self) == 0) { PyErr_SetString(PyExc_IndexError, "0-d arrays can't be indexed"); return NULL; }else if (PyGpuNdArray_NDIM(self)< 0){ PyErr_SetString(PyExc_IndexError, "nd arrays must have a number of dim > 0!"); return NULL; } int d_dim = PyGpuNdArray_DIMS(self)[0]; int offset = 0; DPRINTF("Subscript with int 2\n"); if ((d_idx >= 0) && (d_idx < d_dim)) { //normal indexing offset += d_idx * PyGpuNdArray_STRIDES(self)[0]; } else if ((d_idx < 0) && (d_idx >= -d_dim)) { //end-based indexing // d_idx is negative offset += (d_dim + d_idx) * PyGpuNdArray_STRIDES(self)[0]; } else { PyErr_SetString(PyExc_IndexError, "index out of bounds"); return NULL; } DPRINTF("Subscript with int 3\n"); //Add the original offset offset += PyGpuNdArray_OFFSET(self); //allocate our subtensor view py_rval = PyGpuNdArray_New(PyGpuNdArray_NDIM(self) - 1); rval = (PyGpuNdArrayObject*) py_rval; if (!rval) return NULL; //TODO: find how to refcount on the descr! PyGpuNdArray_DESCR(py_rval) = PyGpuNdArray_DESCR(self); DPRINTF("Subscript with int 4\n"); //initialize the view's data pointer to our own. 
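            // The result is a view: it shares the parent's device data and only
            // records the byte offset d_idx * strides[0] (plus the parent's own
            // offset) computed above; no device memory is copied here.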
assert (0 == rval->data_allocated); if (PyGpuNdArray_set_data(rval, PyGpuNdArray_DATA(self), (PyObject *) self, offset)){ Py_DECREF(rval); return NULL; } DPRINTF("Subscript with int 5\n"); for (int d = 1; d < PyGpuNdArray_NDIM(self); ++d) { PyGpuNdArray_STRIDE(rval, d-1) = PyGpuNdArray_STRIDES(self)[d]; PyGpuNdArray_DIM(rval, d-1) = PyGpuNdArray_DIMS(self)[d]; } } else { PyErr_Clear(); } if (PySlice_Check(key)) //INDEXING BY SLICE { DPRINTF("Subscript with slice \n"); if (PyGpuNdArray_NDIM(self) == 0) { PyErr_SetString(PyExc_ValueError, "cannot slice a 0-d array"); return NULL; } int d_dim = PyGpuNdArray_DIMS(self)[0]; Py_ssize_t start, stop, step, slen; if (PySlice_GetIndicesEx((PySliceObject*)key, d_dim, &start, &stop, &step, &slen)) { return NULL; } DPRINTF("start %zd\nstop %zd\n step %zd\n slen %zd\n", start, stop, step, slen); //allocate our subtensor view py_rval = PyGpuNdArray_New(PyGpuNdArray_NDIM(self)); rval = (PyGpuNdArrayObject*) py_rval; if (!rval) return NULL; //TODO: find how to refcount on the descr! PyGpuNdArray_DESCR(py_rval) = PyGpuNdArray_DESCR(self); assert (0 == rval->data_allocated); if (PyGpuNdArray_set_data(rval, PyGpuNdArray_DATA(self), py_self, start * PyGpuNdArray_STRIDE(self, 0) + PyGpuNdArray_OFFSET(self))) { Py_DECREF(rval); return NULL; } //initialize dimension 0 of rval PyGpuNdArray_STRIDE(rval, 0) = step * PyGpuNdArray_STRIDES(self)[0]; PyGpuNdArray_DIM(rval, 0) = slen; DPRINTF("rval stride %zd\n", PyGpuNdArray_STRIDES(rval)[0]); // initialize dimensions > 0 of rval for (int d = 1; d < PyGpuNdArray_NDIM(self); ++d) { PyGpuNdArray_STRIDE(rval, d) = PyGpuNdArray_STRIDES(self)[d]; PyGpuNdArray_DIM(rval, d) = PyGpuNdArray_DIMS(self)[d]; } } if (PyTuple_Check(key)) //INDEXING BY TUPLE { DPRINTF("Subscript with tuple \n"); //elements of the tuple can be either integers or slices //the dimensionality of the view we will return is diminished for each slice in the tuple int tuple_start_index = 0; if (PyTuple_Size(key) > PyGpuNdArray_NDIM(self)) { if (PyTuple_GetItem(key, 0) == Py_Ellipsis && PyTuple_Size(key) == PyGpuNdArray_NDIM(self) + 1) { tuple_start_index = 1; DPRINTF("Subscript with tuple staring with an extra ellipse" " at the start.\n"); } else{ PyErr_SetString(PyExc_IndexError, "index error, specified more dimensions then" " the number of existing dimensions"); return NULL; } } //calculate the number of dimensions in the return value int rval_nd = PyGpuNdArray_NDIM(self); for (int tuple_d = tuple_start_index; tuple_d < PyTuple_Size(key); ++tuple_d) { //On some paltform PyInt_Check() return true, other it return false. //So we use PyArray_IsAnyScalar that should covert everything. rval_nd -= PyArray_IsAnyScalar(PyTuple_GetItem(key, tuple_d)); } //allocate our subtensor view py_rval = PyGpuNdArray_New(rval_nd); rval = (PyGpuNdArrayObject*) py_rval; if (!rval) return NULL; assert (0 == rval->data_allocated); //TODO: find how to refcount on the descr! PyGpuNdArray_DESCR(py_rval) = PyGpuNdArray_DESCR(self); //initialize the view's data pointer to our own. if (PyGpuNdArray_set_data(rval, PyGpuNdArray_DATA(self), py_self, PyGpuNdArray_OFFSET(self))) { Py_DECREF(rval); return NULL; } // rval_d will refer to the current dimension in the rval. // It will not be incremented for integer keys, but will be incremented for slice // keys int rval_d = 0; for (int self_d = 0, tuple_d = tuple_start_index; self_d < PyGpuNdArray_NDIM(self); ++self_d, ++tuple_d) { // keys can be shorter than PyGpuNdArray_NDIM(self). 
// when that happens, it means that the remaining dimensions are "full slices" if (tuple_d >= PyTuple_Size(key)) { PyGpuNdArray_STRIDE(rval, rval_d) = PyGpuNdArray_STRIDES(self)[tuple_d]; PyGpuNdArray_DIM(rval, rval_d) = PyGpuNdArray_DIMS(self)[tuple_d]; ++rval_d; DPRINTF("Subscript extra dims to append %zd %zd\n", PyGpuNdArray_STRIDE(rval, rval_d), PyGpuNdArray_DIM(rval, rval_d)); } else { PyObject * key_d = PyTuple_GetItem(key, tuple_d); if (PySlice_Check(key_d)) { Py_ssize_t start, stop, step, slen; if (PySlice_GetIndicesEx((PySliceObject*)key_d, PyGpuNdArray_DIMS(self)[self_d], &start, &stop, &step, &slen)) { Py_DECREF(rval); return NULL; } PyGpuNdArray_add_offset(rval, start * PyGpuNdArray_STRIDES(self)[self_d]); PyGpuNdArray_STRIDE(rval, rval_d) = step * PyGpuNdArray_STRIDES(self)[self_d]; PyGpuNdArray_DIM(rval, rval_d) = slen; DPRINTF("rval_d %d self_d %d\n start %zd\nstop %zd\n step %zd\n slen %zd\n", rval_d, self_d, start, stop, step, slen); ++rval_d; } else if ((intobj=PyNumber_Int(key_d))) { assert(PyArray_IsAnyScalar(key_d)); int d_idx = PyInt_AsLong(intobj); Py_DECREF(intobj); intobj = NULL; int d_dim = PyGpuNdArray_DIMS(self)[self_d]; if ((d_idx >= 0) && (d_idx < d_dim)) { //normal indexing PyGpuNdArray_add_offset(rval, d_idx * PyGpuNdArray_STRIDES(self)[self_d]); } else if ((d_idx < 0) && (d_idx >= -d_dim)) { //end-based indexing PyGpuNdArray_add_offset(rval, (d_dim + d_idx) * PyGpuNdArray_STRIDES(self)[self_d]); } else { PyErr_SetString(PyExc_IndexError, "index out of bounds"); Py_DECREF(rval); return NULL; } } else if (key_d == Py_Ellipsis) { if (self_d != 0){ PyErr_Format(PyExc_IndexError, "Ellipsis supported only at the start of" " the tuple"); Py_DECREF(rval); return NULL; } DPRINTF("Substript with tuple with the first element an ellipse\n"); for( ; self_d < (rval_nd - PyTuple_Size(key) + 1); self_d++) { PyGpuNdArray_STRIDE(rval, rval_d) = PyGpuNdArray_STRIDES(self)[self_d]; PyGpuNdArray_DIM(rval, rval_d) = PyGpuNdArray_DIMS(self)[self_d]; DPRINTF("Ellipse append dimensions self_%d with %zd %zd\n", self_d, PyGpuNdArray_STRIDE(rval, rval_d), PyGpuNdArray_DIM(rval, rval_d)); ++rval_d; } tuple_start_index = 1; self_d--; } else { PyErr_Clear(); // clear the error set by PyNumber_Int PyErr_Format(PyExc_IndexError, "index must be either int or slice. Got %s", PyString_AsString(PyObject_Str(key_d))); Py_DECREF(rval); return NULL; } } } } if (py_rval) { #ifdef DEBUG PyGpuNdArray_fprint(stderr, self); PyGpuNdArray_fprint(stderr, rval); #endif } else { PyErr_SetString(PyExc_NotImplementedError, "Unknown key type"); return NULL; } // Set flags if (PyGpuNdArray_ISWRITEABLE(self)) { PyGpuNdArray_FLAGS(rval) |= NPY_WRITEABLE; } else { PyGpuNdArray_FLAGS(rval) &= ~NPY_WRITEABLE; } PyGpuNdArray_FLAGS(rval) &= ~NPY_OWNDATA; if (PyGpuNdArray_ISALIGNED(self)) { PyGpuNdArray_FLAGS(rval) |= NPY_ALIGNED; } else { PyGpuNdArray_FLAGS(rval) &= ~NPY_ALIGNED; } PyGpuNdArray_FLAGS(rval) &= ~NPY_UPDATEIFCOPY; if (false && PyGpuNdArray_NDIM(rval) == 0) { //Numpy is not consistent here //When we create a new numpy ndarray of 0 dim, it is not f contiguous //But when we take a subtensor that is of 0 dim, it is f contiguous! //We make as them for now... 
PyGpuNdArray_FLAGS(rval) &= ~NPY_F_CONTIGUOUS; PyGpuNdArray_FLAGS(rval) |= NPY_C_CONTIGUOUS; } else { if (PyGpuNdArray_is_c_contiguous(rval)) { PyGpuNdArray_FLAGS(rval) |= NPY_C_CONTIGUOUS; } else { PyGpuNdArray_FLAGS(rval) &= ~NPY_C_CONTIGUOUS; } if (PyGpuNdArray_is_f_contiguous(rval)) { PyGpuNdArray_FLAGS(rval) |= NPY_F_CONTIGUOUS; } else { PyGpuNdArray_FLAGS(rval) &= ~NPY_F_CONTIGUOUS; } } DPRINTF("Subscript end\n"); return py_rval; } PyMappingMethods PyGpuNdArrayMappingMethods = { PyGpuNdArray_len, //lenfunc mp_length; __len__ PyGpuNdArray_Subscript, //binaryfunc mp_subscript; __getitem__ 0 //PyGpuNdArray_setitem //objobjargproc mp_ass_subscript; __setitem__ }; static PyTypeObject PyGpuNdArrayType = { PyObject_HEAD_INIT(NULL) 0, /*ob_size*/ "GpuNdArray", /*tp_name*/ sizeof(PyGpuNdArrayObject), /*tp_basicsize*/ 0, /*tp_itemsize*/ (destructor)PyGpuNdArrayObject_dealloc, /*tp_dealloc*/ 0, /*tp_print*/ 0, /*tp_getattr*/ 0, /*tp_setattr*/ 0, /*tp_compare*/ 0, /*tp_repr*/ 0, //&PyGpuNdArrayObjectNumberMethods, /*tp_as_number*/ 0, /*tp_as_sequence*/ &PyGpuNdArrayMappingMethods,/*tp_as_mapping*/ 0, /*tp_hash */ 0, /*tp_call*/ 0, /*tp_str*/ 0, /*tp_getattro*/ 0, /*tp_setattro*/ 0, /*tp_as_buffer*/ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_CHECKTYPES, /*tp_flags*/ "PyGpuNdArrayObject objects", /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ PyGpuNdArray_methods, /* tp_methods */ 0, //PyGpuNdArray_members, /* tp_members */ //TODO PyGpuNdArray_getset, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ (initproc)PyGpuNdArray_init,/* tp_init */ 0, /* tp_alloc */ PyGpuNdArray_new, /* tp_new */ }; ////////////////////////////////////// // // C API FOR PyGpuNdArrayObject // ////////////////////////////////////// PyObject * PyGpuNdArray_New(int nd) { DPRINTF("PyGpuNdArray_New start\n"); PyGpuNdArrayObject *self = (PyGpuNdArrayObject *)PyGpuNdArrayType.tp_alloc(&PyGpuNdArrayType, 0); if (self == NULL) { PyErr_SetString(PyExc_RuntimeError, "PyGpuNdArray_New failed to allocate self"); return NULL; } PyGpuNdArray_null_init(self); if (nd == 0) { PyGpuNdArray_NDIM(self) = 0; } else if (nd > 0) { if (PyGpuNdArray_set_nd(self, nd)) { Py_DECREF(self); return NULL; } } ++_outstanding_mallocs[1]; DPRINTF("PyGpuNdArray_New end\n"); return (PyObject *)self; } int PyGpuNdArray_Check(const PyObject * ob) { DPRINTF("PyGpuNdArray_Check\n"); //TODO: doesn't work with inheritance return PyGpuNdArray_CheckExact(ob); } int PyGpuNdArray_CheckExact(const PyObject * ob) { DPRINTF("PyGpuNdArray_CheckExact\n"); return ((ob->ob_type == &PyGpuNdArrayType) ? 1 : 0); } static PyObject * PyGpuNdArray_as_c_contiguous(PyObject* dummy, PyObject* args, PyObject *kargs) { DPRINTF("PyGpuNdArray_as_c_contiguous:start\n"); static const char *kwlist[] = {"a", "dtype", NULL}; PyArray_Descr *typecode = NULL; PyObject *self_ = NULL; if (!PyArg_ParseTupleAndKeywords(args, kargs, "O|O&", (char **)kwlist, &self_, PyArray_DescrConverter, &typecode)) { Py_XDECREF(typecode); Py_XDECREF(self_); return NULL; } assert(typecode == NULL); if (!PyGpuNdArray_Check(self_)){ PyErr_SetString(PyExc_TypeError, "PyGpuNdArray_as_c_contiguous:" " PyGpuNdArrayObject required"); return NULL; } PyGpuNdArrayObject *self = (PyGpuNdArrayObject*)self_; if (PyGpuNdArray_is_c_contiguous(self)){ Py_INCREF(self); if (PyGpuNdArray_NDIM(self) == 0){ //numpy.ascontiguous() always return object with 1d. 
DPRINTF("PyGpuNdArray_as_c_contiguous: upcast to 1d tensor end\n"); PyObject * rval = PyGpuNdArray_View(self); if (!rval) return NULL; PyGpuNdArray_set_nd((PyGpuNdArrayObject*)rval, 1); PyGpuNdArray_DIM(rval, 0) = 1; PyGpuNdArray_STRIDE(rval, 0) = PyGpuNdArray_ITEMSIZE(rval); return rval; } DPRINTF("PyGpuNdArray_as_c_contiguous: no copy end\n"); return (PyObject*)self; } PyObject * ret = PyGpuNdArray_Copy(self); DPRINTF("PyGpuNdArray_as_c_contiguous: copy end\n"); return ret; } static PyObject * PyGpuNdArray_as_f_contiguous(PyObject* dummy, PyObject* args, PyObject *kargs) { DPRINTF("PyGpuNdArray_as_f_contiguous:start\n"); static const char *kwlist[] = {"a", "dtype", NULL}; PyArray_Descr *typecode = NULL; PyObject *self_ = NULL; if (!PyArg_ParseTupleAndKeywords(args, kargs, "O|O&", (char **)kwlist, &self_, PyArray_DescrConverter, &typecode)) { Py_XDECREF(typecode); Py_XDECREF(self_); return NULL; } assert(typecode == NULL); if (!PyGpuNdArray_Check(self_)){ PyErr_SetString(PyExc_TypeError, "PyGpuNdArray_as_f_contiguous:" " PyGpuNdArrayObject required"); return NULL; } PyGpuNdArrayObject *self = (PyGpuNdArrayObject*)self_; if (PyGpuNdArray_is_f_contiguous(self)){ Py_INCREF(self); if (PyGpuNdArray_NDIM(self) == 0){ //numpy.ascontiguous() always return object with 1d. PyObject * rval = PyGpuNdArray_View(self); if (!rval) return NULL; PyGpuNdArray_set_nd((PyGpuNdArrayObject*)rval, 1); PyGpuNdArray_DIM(rval, 0) = 1; PyGpuNdArray_STRIDE(rval, 0) = PyGpuNdArray_ITEMSIZE(rval); DPRINTF("PyGpuNdArray_as_f_contiguous: upcast to 1d tensor end\n"); return rval; } DPRINTF("PyGpuNdArray_as_f_contiguous: no copy end\n"); return (PyObject*)self; } PyObject * ret = PyGpuNdArray_Copy(self, NPY_FORTRANORDER); DPRINTF("PyGpuNdArray_as_f_contiguous: copy end\n"); return ret; } #ifdef WITH_OPENCL #ifdef __APPLE__ #include #else #include #endif extern void setup_context(cl_context c); PyObject * PyGpuNdArray_set_opencl_context(PyObject *mod, PyObject *ctx) { Py_ssize_t v; v = PyInt_AsSsize_t(ctx); if (v == -1 && PyErr_Occurred()) return NULL; setup_context((cl_context)v); Py_INCREF(Py_None); return Py_None; } #endif static PyMethodDef module_methods[] = { //{"dimshuffle", PyGpuNdArray_Dimshuffle, METH_VARARGS, "Returns the dimshuffle of a PyGpuNdArray."}, {"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"}, {"zeros", (PyCFunction)PyGpuNdArray_zeros, METH_VARARGS|METH_KEYWORDS, "Create a new PyGpuNdArray with specified shape, filled with zeros."}, {"empty", (PyCFunction)PyGpuNdArray_empty, METH_VARARGS|METH_KEYWORDS, "Create a new PyGpuNdArray with specified shape, filled with zeros."}, {"ascontiguousarray", (PyCFunction)PyGpuNdArray_as_c_contiguous, METH_VARARGS|METH_KEYWORDS, "If the array is not c contiguous, copy it to a new c contiguous region."}, {"asfortranarray", (PyCFunction)PyGpuNdArray_as_f_contiguous, METH_VARARGS|METH_KEYWORDS, "If the array is not f contiguous, copy it to a new c contiguous region."}, #ifdef WITH_OPENCL {"set_opencl_context", PyGpuNdArray_set_opencl_context, METH_O, "Set the OpenCL context to use for allocations and work."}, #endif {NULL, NULL, NULL, NULL} /* Sentinel */ }; #ifndef PyMODINIT_FUNC /* declarations for DLL import/export */ #define PyMODINIT_FUNC void #endif PyMODINIT_FUNC initpygpu_ndarray(void) { import_array(); PyObject* m; if (PyType_Ready(&PyGpuNdArrayType) < 0) return; m = Py_InitModule3("pygpu_ndarray", module_methods, "Example module that creates an extension type."); if (m == NULL) return; 
Py_INCREF(&PyGpuNdArrayType); PyModule_AddObject(m, "GpuNdArrayObject", (PyObject *)&PyGpuNdArrayType); #if COMPUTE_GPU_MEM_USED for(int i=0;i> sio, """ if (%(x)s->nd != %(nd_in)s) { PyErr_Format(PyExc_TypeError, "required nd=%(nd_in)s, got nd=%%i", %(x)s->nd); %(fail)s; } """ % locals() # # alloc an output if we need one # # check the basics of out output print >> sio, """ if ( !%(z)s || (%(z)s->nd != %(nd_out)s) """ % locals() #ensure that the output has the right non-reduced dimensions j = 0 for i in xrange(nd_in): if not self.reduce_mask[i]: print >> sio, (" || (CudaNdarray_HOST_DIMS(%(z)s)[%(j)s] !=" "CudaNdarray_HOST_DIMS(%(x)s)[%(i)s]) " % locals()) j += 1 print >> sio, """ ) { """ % locals() print >> sio, "int new_dims[%(nd_out)s]; " % locals() j = 0 for i in xrange(nd_in): if not self.reduce_mask[i]: print >> sio, ('new_dims[%(j)s] = CudaNdarray_HOST_DIMS' '(%(x)s)[%(i)s];' % locals()) j += 1 print >> sio, """ Py_XDECREF(%(z)s); %(z)s = (CudaNdarray*) CudaNdarray_NewDims(%(nd_out)s, new_dims); if (NULL == %(z)s) { PyErr_Format(PyExc_RuntimeError, "Failed to allocate output"); %(fail)s; } } """ % locals() # \begin bracket the reduction in a check that there is # actually work to do print >> sio, """ if (CudaNdarray_SIZE(%(z)s)) { """ % locals() # # Now perform the reduction # if all(i == 1 for i in self.reduce_mask): #check if the tensor is ccontiguous, if true, use the #c_c0de_reduce_ccontig code. #TODO: check if we are ccontiguous when we un-dimshuffle #TODO: if only some dims are ccontiguous, call version # with less dims. print >> sio, 'if(CudaNdarray_is_c_contiguous(%(x)s)){' % locals() self.c_code_reduce_ccontig(sio, node, name, x, z, fail) print >> sio, "}else{" getattr(self, 'c_code_reduce_%s' % (''.join( str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail) print >> sio, "}" else: getattr(self, 'c_code_reduce_%s' % (''.join( str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail) # \end bracket the reduction ... print >> sio, """ } """ % locals() return sio.getvalue() def _makecall(self, node, name, x, z, fail, pattern=None): """Return a string for making a kernel call. The return value looks something like: .. code-block:: c if (verbose) printf("running kernel_reduce_sum_10_%(name)s\\n"); int n_shared = sizeof(%(dtype)s) * n_threads.x; kernel_reduce_sum_10_%(name)s<<>>( CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[1], CudaNdarray_DEV_DATA(%(z)s), CudaNdarray_HOST_STRIDES(%(z)s)[0] ); CNDA_THREAD_SYNC; if (cudaSuccess != cudaGetLastError()) { PyErr_Format(PyExc_RuntimeError, "Cuda error: ... 
); %(fail)s; } """ sio = StringIO.StringIO() if pattern is None: pattern = ''.join(str(c) for c in self.reduce_mask) ndim = len(self.reduce_mask) nd_out = ndim - sum(self.reduce_mask) print >> sio, """ if (verbose) printf("running kernel_reduce_sum_%(pattern)s_%(name)s\\n"); int n_shared = sizeof(%(dtype)s) * n_threads.x * n_threads.y * n_threads.z; if (verbose>1) printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d," " nb_threads=%%d, n_blocks.x=%%d, n_blocks.y=%%d," " nb_block=%%d, n_shared=%%d\\n", n_threads.x,n_threads.y,n_threads.z, n_threads.x*n_threads.y*n_threads.z, n_blocks.x,n_blocks.y, n_blocks.x*n_blocks.y, n_shared); kernel_reduce_sum_%(pattern)s_%(name)s<<>>( """ % locals() for i in xrange(ndim): print >> sio, """ CudaNdarray_HOST_DIMS(%(x)s)[%(i)s], """ % locals() print >> sio, """ CudaNdarray_DEV_DATA(%(x)s) """ % locals() for i in xrange(ndim): print >> sio, """ ,CudaNdarray_HOST_STRIDES(%(x)s)[%(i)s] """ % locals() print >> sio, """ ,CudaNdarray_DEV_DATA(%(z)s) """ % locals() for i in xrange(nd_out): print >> sio, """ ,CudaNdarray_HOST_STRIDES(%(z)s)[%(i)s] """ % locals() print >> sio, """ ); CNDA_THREAD_SYNC; cudaError_t sts = cudaGetLastError(); if (cudaSuccess != sts) { PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n", "kernel_reduce_sum_%(pattern)s_%(name)s", cudaGetErrorString(sts), n_blocks.x, n_blocks.y, n_threads.x, n_threads.y, n_threads.z); %(fail)s; } """ % locals() return sio.getvalue() def _k_decl(self, nodename, pattern=None, ndim=None, reduce_mask=None): """Return a string to declare a kernel function .. code-block:: c __global__ void kernel_reduce_sum_110_%(nodename)s( const int d0, const int d1, const int d2, const %(dtype)s *A, const int sA0, const int sA1, const int sA2, %(dtype)s * Z, const int sZ0) """ dtype = self.dtype if reduce_mask is None: reduce_mask = self.reduce_mask if ndim is None: ndim = len(reduce_mask) if pattern is None: pattern = ''.join(str(i) for i in reduce_mask) sio = StringIO.StringIO() print >> sio, """ __global__ void kernel_reduce_sum_%(pattern)s_%(nodename)s( """ % locals() for i in xrange(ndim): print >> sio, """const int d%(i)s,""" % locals() print >> sio, """const %(dtype)s *A,""" % locals() for i in xrange(ndim): print >> sio, """const int sA%(i)s,""" % locals() print >> sio, """%(dtype)s * Z""" % locals() for i in xrange(ndim - sum(reduce_mask)): print >> sio, """, const int sZ%(i)s""" % locals() print >> sio, ")" return sio.getvalue() def _k_init(self, *args): dtype = self.dtype return """ const int threadCount = blockDim.x * blockDim.y * blockDim.z; const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; extern __shared__ %(dtype)s buf[]; %(dtype)s mysum = 0.0f; if (warpSize != 32){ //TODO: set error code Z[0] = 666; return; } """ % locals() def _k_reduce_buf(self, z_pos): return """ __syncthreads(); // some kernel do multiple reduction. 
buf[threadNum] = mysum; __syncthreads(); // rest of function is handled by one warp if (threadNum < warpSize) { //round up all the partial sums into the first `warpSize` elements for (int i = threadNum + warpSize; i < threadCount; i += warpSize) { mysum += buf[i]; } buf[threadNum] = mysum; if (threadNum < 16) { //reduce so that threadNum 0 has the sum of everything if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16]; if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8]; if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4]; if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2]; if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1]; if (threadNum == 0) { %(z_pos)s = buf[0]; } } } """ % locals() return """ __syncthreads(); // some kernel do multiple reduction. buf[threadNum] = mysum; __syncthreads(); // rest of function is handled by one warp if (threadNum < warpSize) { //round up all the partial sums into the first `warpSize` elements for (int i = threadNum + warpSize; i < threadCount; i += warpSize) { mysum += buf[i]; } buf[threadNum] = mysum; /*Comment this optimization as it don't work on Fermi GPU. TODO: find why it don't work or put the GPU compute capability into the version // no sync because only one warp is running if(threadCount >32) { buf[threadNum] += buf[threadNum+16]; buf[threadNum] += buf[threadNum+8]; buf[threadNum] += buf[threadNum+4]; buf[threadNum] += buf[threadNum+2]; buf[threadNum] += buf[threadNum+1]; if (threadNum == 0) { %(z_pos)s = buf[0]; } } else */ if (threadNum < 16) { //reduce so that threadNum 0 has the sum of everything if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16]; if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8]; if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4]; if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2]; if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1]; if (threadNum == 0) { %(z_pos)s = buf[0]; } } } """ % locals() # Threads must be organized as: threadNum%nb_reduce correspond to # the same sum # nb_reduce<=warpSize def _k_reduce_buf_multiple(self, z_pos, nb_reduce): return """ __syncthreads(); // some kernel do multiple reduction. buf[threadNum] = mysum; __syncthreads(); // rest of function is handled by one warp if (threadNum < %(nb_reduce)s) { //round up all the partial sums into the first `nb_reduce` elements for (int i = threadNum + %(nb_reduce)s; i < threadCount; i += %(nb_reduce)s) { mysum += buf[i]; } %(z_pos)s = mysum; } """ % locals() def c_code_reduce_ccontig(self, sio, node, name, x, z, fail): print >> sio, """ { if(CudaNdarray_SIZE(%(x)s)==0){ cudaMemset(CudaNdarray_DEV_DATA(%(z)s),0,sizeof(%(dtype)s)); }else{ int verbose = 0; dim3 n_threads( std::min(CudaNdarray_SIZE(%(x)s), NUM_VECTOR_OP_THREADS_PER_BLOCK)); dim3 n_blocks(1); if (verbose) printf("running kernel_reduce_sum_ccontig_%(name)s" " n_threads.x=%%d, size=%%d, ndim=%%d\\n", n_threads.x,CudaNdarray_SIZE(%(x)s),%(x)s->nd); int n_shared = sizeof(%(dtype)s) * n_threads.x; kernel_reduce_sum_ccontig_%(name)s<<>>( CudaNdarray_SIZE(%(x)s), CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_DEV_DATA(%(z)s)); CNDA_THREAD_SYNC; cudaError_t sts = cudaGetLastError(); if (cudaSuccess != sts) { PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. 
(grid: %%i x %%i;" " block: %%i x %%i x %%i)\\n", "kernel_reduce_sum_ccontig_%(name)s", cudaGetErrorString(sts), n_blocks.x, n_blocks.y, n_threads.x, n_threads.y, n_threads.z); %(fail)s; } } } """ % locals() def c_code_reduce_1(self, sio, node, name, x, z, fail): makecall = self._makecall(node, name, x, z, fail) print >> sio, """ { int verbose = 0; dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], NUM_VECTOR_OP_THREADS_PER_BLOCK)); dim3 n_blocks(1); %(makecall)s } """ % locals() def c_code_reduce_11(self, sio, node, name, x, z, fail): makecall = self._makecall(node, name, x, z, fail) print >> sio, """ { int verbose = 0; dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], NUM_VECTOR_OP_THREADS_PER_BLOCK)); while (n_threads.y * n_threads.x <= NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.y; n_threads.y -= 1; if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[0]) n_threads.y = CudaNdarray_HOST_DIMS(%(x)s)[0]; dim3 n_blocks(1); %(makecall)s } """ % locals() def c_code_reduce_01X(self, sio, node, name, x, z, fail, N): """ :param N: the number of 1 in the pattern N=1 -> 01, N=2 -> 011, N=3 ->0111 Work for N=1,2,3 """ assert N in [1, 2, 3] makecall = self._makecall(node, name, x, z, fail) N_pattern = ''.join(['1'] * N) param_dim = ",".join(["CudaNdarray_HOST_DIMS(%(x)s)[%(i)s]" % locals() for i in xrange(N + 1)]) strides_dim = ",".join( ["CudaNdarray_HOST_STRIDES(%(x)s)[%(i)s]" % locals() for i in xrange(N + 1)]) threads_y = """ //get as many y threads as we can fit while (n_threads.x * (n_threads.y+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK) { if (n_threads.y < CudaNdarray_HOST_DIMS(%(x)s)[%(N)s-1]) n_threads.y += 1; else break; } """ % locals() threads_z = """ //get as many z threads as we can fit while (n_threads.x * n_threads.y * (n_threads.z+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK) { if (n_threads.z < CudaNdarray_HOST_DIMS(%(x)s)[%(N)s-2]) n_threads.z += 1; else break; } """ % locals() if len(self.reduce_mask) == 2: threads_y = '' threads_z = '' if len(self.reduce_mask) == 3: threads_z = '' print >> sio, """ { int verbose = 0; dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[%(N)s], NUM_VECTOR_OP_THREADS_PER_BLOCK)); %(threads_y)s %(threads_z)s dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], NUM_VECTOR_OP_BLOCKS)); %(makecall)s } """ % locals() def c_code_reduce_01(self, sio, node, name, x, z, fail): self.c_code_reduce_01X(sio, node, name, x, z, fail, 1) def c_code_reduce_011(self, sio, node, name, x, z, fail): self.c_code_reduce_01X(sio, node, name, x, z, fail, 2) def c_code_reduce_0111(self, sio, node, name, x, z, fail): self.c_code_reduce_01X(sio, node, name, x, z, fail, 3) def c_code_reduce_10(self, sio, node, name, x, z, fail): print >> sio, """ { int verbose = 0; dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], NUM_VECTOR_OP_THREADS_PER_BLOCK)); dim3 n_blocks(1, std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], NUM_VECTOR_OP_BLOCKS)); if (verbose) { fprintf(stderr, "running kernel_reduce_sum_10_%(name)s n_blocks=(%%i,%%i)\\n", n_blocks.x, n_blocks.y); } assert(CudaNdarray_HOST_DIMS(%(x)s)[1] == CudaNdarray_HOST_DIMS(%(z)s)[0]); int n_shared = sizeof(%(dtype)s) * n_threads.x; kernel_reduce_sum_010_%(name)s<<>>( 1, CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_DEV_DATA(%(x)s), 1, CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[1], CudaNdarray_DEV_DATA(%(z)s), 1, CudaNdarray_HOST_STRIDES(%(z)s)[0] ); CNDA_THREAD_SYNC; cudaError_t sts = cudaGetLastError(); if (cudaSuccess != sts) { PyErr_Format(PyExc_RuntimeError, "Cuda 
error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n", "kernel_reduce_sum_010_%(name)s", cudaGetErrorString(sts), n_blocks.x, n_blocks.y, n_threads.x, n_threads.y, n_threads.z); %(fail)s; } } """ % locals() def c_code_reduce_010(self, sio, node, name, x, z, fail): makecall = self._makecall(node, name, x, z, fail) makecall_inner = self._makecall(node, name, x, z, fail, pattern="010_inner") pattern = ''.join(str(i) for i in self.reduce_mask) print >> sio, """ { // if the alternative is less buggy, consider not using this branch if (1) { // If there are a lot of summations to do, then we can use // simple parallelization - use each thread to do one sum. // we might as well launch blocks of 32 threads because that's // the warp size. we could schedule more threads if we were // maxing out the gridsize below, but the gridsize is way more // than the physical hardware and I think 32 threads // on a huge grid is enough to fully use the hardware. dim3 n_threads(32,1,1); // We kindof reshape the input implicitly to something 4D: // the shape A,B,C -> A, B, D, E // where C <= D*E < C+32 // where E==32 int A = CudaNdarray_HOST_DIMS(%(x)s)[0]; int B = CudaNdarray_HOST_DIMS(%(x)s)[1]; int C = CudaNdarray_HOST_DIMS(%(x)s)[2]; int D = C/32; if (32*D < C) D+= 1; assert ((C <= 32*D) && (32*D < C+32)); // The gridsize would ideally be (A, D). But we do the // following logic to make sure we don't ask for a grid that // is too big. dim3 n_blocks(A,D); if (n_blocks.x > NUM_VECTOR_OP_BLOCKS) n_blocks.x = NUM_VECTOR_OP_BLOCKS; if (n_blocks.x*n_blocks.y > NUM_VECTOR_OP_BLOCKS) n_blocks.y = NUM_VECTOR_OP_BLOCKS/n_blocks.x; int n_shared = 0; kernel_reduce_sum_010_AD_%(name)s<<>>( A,B,C,D, CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[1], CudaNdarray_HOST_STRIDES(%(x)s)[2], CudaNdarray_DEV_DATA(%(z)s), CudaNdarray_HOST_STRIDES(%(z)s)[0], CudaNdarray_HOST_STRIDES(%(z)s)[1] ); CNDA_THREAD_SYNC; cudaError_t sts = cudaGetLastError(); if (cudaSuccess != sts) { PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n", "kernel_reduce_sum_010_%(name)s", cudaGetErrorString(sts), n_blocks.x, n_blocks.y, n_threads.x, n_threads.y, n_threads.z); %(fail)s; } } else { int verbose = 2; dim3 n_threads(std::min(32,CudaNdarray_HOST_DIMS(%(x)s)[2])); while((n_threads.x*(n_threads.y+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK) && (n_threads.y1) printf("n_block.x.1=%%d, n_block.x.2=%%d," " n_block.y.1=%%d, n_block.y.2=%%d,\\n", CudaNdarray_HOST_DIMS(%(x)s)[0], NUM_VECTOR_OP_BLOCKS, ceil_intdiv(CudaNdarray_HOST_DIMS(%(x)s)[2], (int)n_threads.x), (int)(NUM_VECTOR_OP_BLOCKS / n_blocks.x)); assert(n_threads.x<=32); %(makecall_inner)s }else{ n_threads.x = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], (int)NUM_VECTOR_OP_THREADS_PER_BLOCK); n_blocks.x = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], (int)NUM_VECTOR_OP_BLOCKS); n_blocks.y = std::min( CudaNdarray_HOST_DIMS(%(x)s)[2], (int)(NUM_VECTOR_OP_BLOCKS / n_blocks.x) ); %(makecall)s } CNDA_THREAD_SYNC; cudaError_t sts = cudaGetLastError(); if (cudaSuccess != sts) { PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. 
(grid: %%i x %%i; block: %%i x %%i x %%i)\\n", "kernel_reduce_sum_%(pattern)s_%(name)s", cudaGetErrorString(sts), n_blocks.x, n_blocks.y, n_threads.x, n_threads.y, n_threads.z); %(fail)s; } } } """ % locals() def c_code_reduce_0101(self, sio, node, name, x, z, fail): makecall = self._makecall(node, name, x, z, fail) print >> sio, """ { int verbose = 0; dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[3], NUM_VECTOR_OP_THREADS_PER_BLOCK)); while (n_threads.x * n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK) { if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[1]) break; n_threads.y += 1; } n_threads.y -= 1; dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[2]); %(makecall)s } """ % locals() def c_code_reduce_100(self, sio, node, name, x, z, fail): makecall = self._makecall(node, name, x, z, fail) # use threadIdx.x for i0 # use blockIdx.x for i1 # use blockIdx.y for i2 print >> sio, """ { int verbose = 0; dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], NUM_VECTOR_OP_THREADS_PER_BLOCK)); dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]); while (n_blocks.x * (n_blocks.y+1) <= NUM_VECTOR_OP_BLOCKS && n_blocks.y <= CudaNdarray_HOST_DIMS(%(x)s)[2]) { n_blocks.y += 1; } %(makecall)s } """ % locals() def c_code_reduce_110(self, sio, node, name, x, z, fail): makecall = self._makecall(node, name, x, z, fail) print >> sio, """ { int verbose = 0; dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], NUM_VECTOR_OP_THREADS_PER_BLOCK)); while (n_threads.x*n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK) { if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[0]) break; n_threads.y += 1; } n_threads.y -= 1; dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[2]); %(makecall)s } """ % locals() def c_code_reduce_001(self, sio, node, name, x, z, fail): makecall = self._makecall(node, name, x, z, fail) print >> sio, """ { int verbose = 0; dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[2], NUM_VECTOR_OP_THREADS_PER_BLOCK)); dim3 n_blocks( std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], NUM_VECTOR_OP_BLOCKS)); while (n_blocks.x * n_blocks.y <= NUM_VECTOR_OP_BLOCKS) { if (n_blocks.y > CudaNdarray_HOST_DIMS(%(x)s)[1]) break; n_blocks.y += 1; } n_blocks.y -= 1; %(makecall)s } """ % locals() def c_code_reduce_111(self, sio, node, name, x, z, fail): makecall = self._makecall(node, name, x, z, fail) print >> sio, """ { int verbose = 0; dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[2], NUM_VECTOR_OP_THREADS_PER_BLOCK)); //get as many y threads as we can fit while (n_threads.x * n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK) { if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[1]) break; n_threads.y += 1; } n_threads.y -= 1; //get as many z threads as we can fit while (n_threads.x * n_threads.y * n_threads.z <= NUM_VECTOR_OP_THREADS_PER_BLOCK) { if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0]) break; n_threads.z += 1; } n_threads.z -= 1; dim3 n_blocks(1,1,1); %(makecall)s } """ % locals() def c_code_reduce_0011(self, sio, node, name, x, z, fail): makecall = self._makecall(node, name, x, z, fail) print >> sio, """ { int verbose = 0; dim3 n_blocks( std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], NUM_VECTOR_OP_BLOCKS)); while (n_blocks.x * n_blocks.y <= NUM_VECTOR_OP_BLOCKS && n_blocks.y < CudaNdarray_HOST_DIMS(%(x)s)[1]) { n_blocks.y += 1; } dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[3], NUM_VECTOR_OP_THREADS_PER_BLOCK)); while (n_threads.x * n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK && n_threads.y < CudaNdarray_HOST_DIMS(%(x)s)[2] && n_threads.x * n_threads.y * sizeof(%(dtype)s) <= 
(15 * 1024 - 200)) { n_threads.y += 1; } %(makecall)s } """ % locals() def c_code_reduce_1111(self, sio, node, name, x, z, fail): makecall = self._makecall(node, name, x, z, fail) print >> sio, """ { int verbose = 0; dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[2], NUM_VECTOR_OP_THREADS_PER_BLOCK)); //get as many y threads as we can fit while (n_threads.x * n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK) { if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[1]) break; n_threads.y += 1; } n_threads.y -= 1; //get as many z threads as we can fit while (n_threads.x * n_threads.y * n_threads.z <= NUM_VECTOR_OP_THREADS_PER_BLOCK) { if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0]) break; n_threads.z += 1; } n_threads.z -= 1; dim3 n_blocks(1,1,1); %(makecall)s } """ % locals() def c_code_reduce_1011(self, sio, node, name, x, z, fail): makecall = self._makecall(node, name, x, z, fail) print >> sio, """ { int verbose = 0; dim3 n_threads( std::min(CudaNdarray_HOST_DIMS(%(x)s)[3], NUM_VECTOR_OP_THREADS_PER_BLOCK)); while (n_threads.x * (n_threads.y+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.y; if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[2]) n_threads.y = CudaNdarray_HOST_DIMS(%(x)s)[2]; while (n_threads.x * n_threads.y * (n_threads.z+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.z; if (n_threads.z > 64) n_threads.z = 64; if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0]) n_threads.z = CudaNdarray_HOST_DIMS(%(x)s)[0]; dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]); %(makecall)s } """ % locals() def c_code_cache_version(self): return (21,) def c_support_code_apply(self, nodename, contig=False): sio = StringIO.StringIO() nd_in = len(self.reduce_mask) dtype = self.dtype if contig: # all(i == 1 for i in self.reduce_mask): #this kernel is ok for up to a few thousand elements, but # it only runs on ONE multiprocessor reducebuf = self._k_reduce_buf('Z[0]') print >> sio, """ __global__ void kernel_reduce_sum_ccontig_%(nodename)s( const int d0, const %(dtype)s *A, %(dtype)s * Z) { const int threadCount = blockDim.x; const int threadNum = threadIdx.x; extern __shared__ %(dtype)s buf[]; %(dtype)s mysum = 0.0f; if (warpSize != 32) { return; //TODO: set error code } for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x) { mysum += A[i0]; } %(reducebuf)s } """ % locals() if self.reduce_mask == (1,): #this kernel is ok for up to a few thousand elements, but # it only runs on ONE multiprocessor reducebuf = self._k_reduce_buf('Z[0]') decl = self._k_decl(nodename) print >> sio, """ %(decl)s { const int threadCount = blockDim.x; const int threadNum = threadIdx.x; extern __shared__ %(dtype)s buf[]; %(dtype)s mysum = 0.0f; if (warpSize != 32) { return; //TODO: set error code } for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x) { %(dtype)s Ai = A[i0 * sA0]; mysum += Ai; } %(reducebuf)s } """ % locals() if self.reduce_mask == (1, 1): #this kernel is ok for up to a few thousand elements, but # it only runs on ONE multiprocessor reducebuf = self._k_reduce_buf('Z[0]') decl = self._k_decl(nodename) init = self._k_init(nodename) print >> sio, decl print >> sio, " { " print >> sio, init print >> sio, """ for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y) { for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x) { %(dtype)s Ai = A[i0 * sA0 + i1 * sA1]; mysum += Ai; } } """ % locals() print >> sio, reducebuf print >> sio, " } " #01, 011, 0111 if (0 == self.reduce_mask[0] and all(self.reduce_mask[1:]) and nd_in in[2, 3, 4]): # this kernel uses one block for each row. 
# threads per block for each element per row. N_pattern = ''.join(['1'] * (nd_in - 1)) if nd_in == 2: for_i1 = "for(int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)" for_i2 = "int i2=0, sA2=0;" for_i3 = "int i3=0, sA3=0;" if nd_in == 3: for_i1 = "for(int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)" for_i2 = "for(int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)" for_i3 = "int i3=0, sA3=0;" if nd_in == 4: for_i1 = "for(int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z)" for_i2 = "for(int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)" for_i3 = "for(int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)" reducebuf = self._k_reduce_buf('Z[i0 * sZ0]') param_dim = ",".join(["const int d%(i)s" % locals() for i in xrange(nd_in)]) param_strides = ",".join(["const int sA%(i)s" % locals() for i in xrange(nd_in)]) decl = self._k_decl(nodename) init = self._k_init(nodename) print >> sio, """ %(decl)s{ %(init)s for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){ mysum = 0; %(for_i1)s{ %(for_i2)s{ %(for_i3)s{ %(dtype)s Ai = A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0]; mysum += Ai; } } } %(reducebuf)s } } """ % locals() if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0): # this kernel uses one block for each column, # threads per block for each element per column. #TODO: This kernel is pretty inefficient in terms of # reading, because if A is c_contiguous (typical # case) then each warp is accessing non-contigous # memory (a segment of a column). reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2*sZ1]') print >> sio, """ __global__ void kernel_reduce_sum_010_%(nodename)s( const int d0, const int d1, const int d2, const %(dtype)s *A, const int sA0, const int sA1, const int sA2, %(dtype)s * Z, const int sZ0, const int sZ1) { const int threadCount = blockDim.x; const int threadNum = threadIdx.x; extern __shared__ %(dtype)s buf[]; if (warpSize != 32) { return; //TODO: set error code } for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x) { for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y) { %(dtype)s mysum = 0.0f; for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x) { mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2]; } %(reducebuf)s } } } """ % locals() if self.reduce_mask == (0, 1, 0): print >> sio, """ __global__ void kernel_reduce_sum_010_AD_%(nodename)s( const int A, const int B, const int C, const int D, //const int E, // THIS is 32 const %(dtype)s *X, const int sX0, const int sX1, const int sX2, %(dtype)s * Z, const int sZ0, const int sZ1) { const int threadCount = blockDim.x; const int threadNum = threadIdx.x; %(dtype)s mysum = 0.0f; if (warpSize != 32) { return; //TODO: set error code } for (int a = blockIdx.x; a < A; a += gridDim.x) { for (int i2_D = blockIdx.y; i2_D < D; i2_D += gridDim.y) { int c = i2_D * 32 + threadIdx.x; if (c < C) { mysum = 0; for (int b = 0; b < B; ++b) { mysum += X[a * sX0 + b * sX1 + c * sX2]; } Z[a * sZ0 + c * sZ1] = mysum; } } } } """ % locals() if self.reduce_mask == (0, 1, 0): # # This kernel is optimized when the inner most dimensions # have the smallest stride. # this kernel uses one block for multiple column(up to 32TODO), # threads per block for each element per column. 
#thread.x = dim 2 contiguous #thread.y = dim 1 #block.x = dim 0 #block.y = dim 1 rest init = self._k_init(nodename) decl = self._k_decl(nodename, pattern="010_inner") reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]', 'blockDim.x') reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]', 'blockDim.x') print >> sio, """ %(decl)s { if(warpSize> sio, """ __global__ void kernel_reduce_sum_110_%(nodename)s( const int d0, const int d1, const int d2, const %(dtype)s *A, const int sA0, const int sA1, const int sA2, %(dtype)s * Z, const int sZ0) { const int threadCount = blockDim.x * blockDim.y; const int threadNum = threadIdx.y * blockDim.x + threadIdx.x; extern __shared__ %(dtype)s buf[]; %(dtype)s mysum = 0.0f; if (warpSize != 32) { //TODO: set error code Z[blockIdx.x * sZ0] = 666; return; } for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y) { for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x) { %(dtype)s Ai = A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2]; mysum += Ai; } } %(reducebuf)s } """ % locals() if self.reduce_mask == (1, 0, 0): reducebuf = self._k_reduce_buf('Z[i1 * sZ0 + i2 * sZ1]') decl = self._k_decl(nodename) init = self._k_init(nodename) print >> sio, """ %(decl)s { %(init)s for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y) { for (int i1 = blockIdx.x; i1 < d1; i1 += gridDim.x) { mysum = 0; for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x) { mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2]; } %(reducebuf)s } } } """ % locals() if self.reduce_mask == (1, 1, 1): reducebuf = self._k_reduce_buf('Z[0]') decl = self._k_decl(nodename) init = self._k_init(nodename) print >> sio, """ %(decl)s { %(init)s for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z) { for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y) { for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x) { mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2]; } } } """ % locals() print >> sio, reducebuf, "}" if self.reduce_mask == (0, 0, 1): # this kernel uses one block for each row, # threads per block for each element per row. reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]') print >> sio, """ __global__ void kernel_reduce_sum_001_%(nodename)s( const int d0, const int d1, const int d2, const %(dtype)s *A, const int sA0, const int sA1, const int sA2, %(dtype)s * Z, const int sZ0, const int sZ1) { const int threadCount = blockDim.x; const int threadNum = threadIdx.x; extern __shared__ %(dtype)s buf[]; if (warpSize != 32) { return; //TODO: set error code } for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x) { for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y) { %(dtype)s mysum = 0.0f; for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x) { mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2]; } %(reducebuf)s } } } """ % locals() if self.reduce_mask == (0, 0, 1, 1): # this kernel uses one block for each row, # threads per block for each element per row. reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]') decl = self._k_decl(nodename) init = self._k_init(nodename) print >> sio, """ %(decl)s { %(init)s for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x) { for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y) { %(dtype)s mysum = 0.0f; for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y) { for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x) { mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]; } } %(reducebuf)s } } } """ % locals() if self.reduce_mask == (0, 1, 0, 1): # this kernel uses one block for each row, # threads per block for each element per row. 
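            # Mapping used below: blockIdx.x walks the kept dimension 0 and
            # blockIdx.y walks the kept dimension 2, while threadIdx.y and
            # threadIdx.x walk the reduced dimensions 1 and 3; each block
            # therefore reduces into a single output element Z[i0, i2].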
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2 * sZ1]') decl = self._k_decl(nodename) init = self._k_init(nodename) print >> sio, """ %(decl)s { %(init)s for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x) { for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y) { %(dtype)s mysum = 0.0f; for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y) { for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x) { mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]; } } %(reducebuf)s } } } """ % locals() if self.reduce_mask == (1, 1, 1, 1): reducebuf = self._k_reduce_buf('Z[0]') decl = self._k_decl(nodename) init = self._k_init(nodename) print >> sio, """ %(decl)s { %(init)s mysum = 0; for (int i0 = 0; i0 < d0; i0++) for (int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z) { for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y) { for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x) { mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]; } } } %(reducebuf)s } """ % locals() if self.reduce_mask == (1, 0, 1, 1): reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]') print >> sio, """ __global__ void kernel_reduce_sum_1011_%(nodename)s( const int d0, const int d1, const int d2, const int d3, const %(dtype)s *A, const int sA0, const int sA1, const int sA2, const int sA3, %(dtype)s * Z, const int sZ0) { const int threadCount = blockDim.x * blockDim.y * blockDim.z; const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; extern __shared__ %(dtype)s buf[]; %(dtype)s mysum = 0.0f; if (warpSize != 32) { return; //TODO: set error code } for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z) { for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y) { for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x) { %(dtype)sy Ai = A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3]; mysum += Ai; } } } %(reducebuf)s } """ % locals() return sio.getvalue() pycuda-2013.1.1+git20140310/pycuda/compyte/ndarray/test_gpu_elemwise.py0000644000175000000500000004374512313360366024051 0ustar tomussrc# TODO: test other dtype import numpy import theano import pygpu_ndarray as gpu_ndarray from gen_elemwise import MyGpuNdArray, elemwise_collapses from test_gpu_ndarray import (dtypes_all, enable_double, gen_gpu_nd_array, product) def rand(shape, dtype): r = numpy.random.randn(*shape) * 10 if dtype.startswith("u"): r = numpy.absolute(r) return r.astype(dtype) # numpy.allclose seam to have problem with int8... 
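# The helper below therefore falls back to an exact comparison: if allclose
# reports a mismatch (which it apparently can for small integer dtypes, likely
# because the differences wrap around), a maximum absolute difference of zero
# is still accepted as equal.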
def all_close(x, y): return (numpy.allclose(x, y) or numpy.absolute(x - y).max() == 0) def test_elemwise_collapse(): """ Test collapsing under many broadcast and strided pattern """ for dtype1 in ["int16", "float32", "int8"]: for dtype2 in ["int16", "float32", "int8"]: for shape1_, shape2_, expected in [ # 1d to test this special case ((40,), (40,), 0), ((40,), (1,), 1), # No broadcastable dimensions ((4, 5, 6, 9), (4, 5, 6, 9), 0), # All inputs have one(and the same) broadcastable dimension ((1, 4, 5, 9), (1, 4, 5, 9), 0), ((4, 1, 5, 9), (4, 1, 5, 9), 0), ((4, 5, 1, 9), (4, 5, 1, 9), 0), ((4, 5, 9, 1), (4, 5, 9, 1), 0), # One inputs have one broadcastable dimension ((1, 5, 6, 9), (4, 5, 6, 9), 2), ((4, 1, 6, 9), (4, 5, 6, 9), 3), ((4, 5, 1, 9), (4, 5, 6, 9), 3), ((4, 5, 6, 1), (4, 5, 6, 9), 2), # One inputs have two broadcastable dimension ((1, 1, 6, 9), (4, 5, 6, 9), 2), ((1, 5, 1, 9), (4, 5, 6, 9), 4), ((1, 5, 6, 1), (4, 5, 6, 9), 3), ((4, 1, 1, 9), (4, 5, 6, 9), 3), ((4, 1, 6, 1), (4, 5, 6, 9), 4), ((4, 5, 1, 1), (4, 5, 6, 9), 2), # One inputs have tree broadcastable dimension ((1, 1, 1, 9), (4, 5, 6, 9), 2), ((1, 1, 6, 1), (4, 5, 6, 9), 3), ((1, 5, 1, 1), (4, 5, 6, 9), 3), ((4, 1, 1, 1), (4, 5, 6, 9), 2), # One scalar ((1, 1, 1, 1), (4, 5, 6, 9), 1), # One scalar, the other 1 broadcast dims ((1, 1, 1, 1), (4, 5, 6, 1), 1), ]: scalar_cpu = rand((1,) * len(shape1_), dtype=dtype1) scalar_gpu = gpu_ndarray.GpuNdArrayObject(scalar_cpu) scalar_gpu1 = MyGpuNdArray(scalar_gpu) for shape1, shape2 in [(shape1_, shape2_), (shape2_, shape1_)]: a_cpu = rand(shape1, dtype=dtype1) a = gpu_ndarray.GpuNdArrayObject(a_cpu) a1 = MyGpuNdArray(a) b_cpu = rand(shape2, dtype=dtype2) b = gpu_ndarray.GpuNdArrayObject(b_cpu) b1 = MyGpuNdArray(b) assert len(shape1) == len(shape2) o_shape = [] for i in range(len(shape1)): o_shape.append(max(shape1[i], shape2[i])) o = gpu_ndarray.empty(o_shape, dtype=(a_cpu + b_cpu).dtype) # 1.1 Check direct collapse nd_collaps, info = elemwise_collapses([a, b], [o]) assert nd_collaps == expected, (shape1, shape2, nd_collaps, expected, info) # 1.2 Check computation are still valid f = MyGpuNdArray.gen_fct(theano.tensor.add, [a1, b1], len(shape1)) out = f([a1, b1]) out2 = f([a1, b1], out=out) assert out is out2 assert numpy.allclose(numpy.asarray(f([a1, b1])), a_cpu + b_cpu) assert numpy.allclose(numpy.asarray( MyGpuNdArray.adds(a1, b1)), a_cpu + b_cpu) assert numpy.allclose(numpy.asarray( MyGpuNdArray.add(a1, b1)), a_cpu + b_cpu) assert MyGpuNdArray.add(a1, b1, out=out2) is out2 # 1.3 Check work without collaping f = MyGpuNdArray.gen_fct(theano.tensor.add, [a1, b1], len(shape1), collapse=False) out = f([a1, b1]) out2 = f([a1, b1], out=out) assert out is out2 assert numpy.allclose(numpy.asarray(f([a1, b1])), a_cpu + b_cpu) assert numpy.allclose(numpy.asarray(MyGpuNdArray.adds( a1, b1)), a_cpu + b_cpu) assert numpy.allclose(numpy.asarray(MyGpuNdArray.add( a1, b1)), a_cpu + b_cpu) assert MyGpuNdArray.add(a1, b1, out=out2) is out2 # 2.1 What if we add a scalar? nd_collaps, info = elemwise_collapses( [a, b, scalar_gpu], [o]) if expected == 0: expected2 = 1 else: expected2 = expected assert nd_collaps == expected2, (shape1, shape2, nd_collaps, expected, info) # 2.2 Check computation assert numpy.allclose(numpy.asarray(MyGpuNdArray.adds( a1, b1, scalar_gpu1)), a_cpu + b_cpu + scalar_cpu) # 3.1 What if one of the dimensions is strided? 
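                    # A strided input is built by doubling one axis of shape1
                    # and slicing it with step 2, which breaks contiguity along
                    # that axis.  Without broadcasting the collapse result is
                    # expected to change to expected2; with broadcasting only a
                    # lower bound is asserted below.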
broadcast = any([True for i in a.shape + b.shape if i == 1]) if expected == 0: expected2 = 2 else: expected2 = expected if len(shape1_) != 4: continue if a.shape[0] != 1: shape = list(shape1) shape[0] *= 2 c_cpu = rand(shape, dtype='float32') c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::2] c1 = MyGpuNdArray(c) err = ("strided", c.shape, shape2, nd_collaps, expected, info) nd_collaps, info = elemwise_collapses([c, b], [o]) if broadcast: assert nd_collaps >= expected, err else: assert nd_collaps == expected2, err assert numpy.allclose(numpy.asarray( MyGpuNdArray.adds(c1, b1)), numpy.asarray(c) + b_cpu) if a.shape[1] != 1: shape = list(shape1) shape[1] *= 2 c_cpu = rand(shape, dtype='float32') c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::, ::2] c1 = MyGpuNdArray(c) err = ("strided", c.shape, shape2, nd_collaps, expected, info) nd_collaps, info = elemwise_collapses([c, b], [o]) if broadcast: assert nd_collaps >= expected, err else: assert nd_collaps == expected2, err pass assert numpy.allclose(numpy.asarray( MyGpuNdArray.adds(c1, b1)), numpy.asarray(c) + b_cpu) if a.shape[2] != 1: shape = list(shape1) shape[2] *= 2 c_cpu = rand(shape, dtype='float32') c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::, ::, ::2] c1 = MyGpuNdArray(c) err = ("strided", c.shape, shape2, nd_collaps, expected, info) nd_collaps, info = elemwise_collapses([c, b], [o]) if broadcast: assert nd_collaps >= expected, err else: assert nd_collaps == expected2, err pass assert numpy.allclose(numpy.asarray( MyGpuNdArray.adds(c1, b1)), numpy.asarray(c) + b_cpu) if a.shape[3] != 1: shape = list(shape1) shape[3] *= 2 c_cpu = rand(shape, dtype='float32') c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::, ::, ::, ::2] c1 = MyGpuNdArray(c) err = ("strided", c.shape, shape2, nd_collaps, expected, info) nd_collaps, info = elemwise_collapses([c, b], [o]) if broadcast: assert nd_collaps >= expected, err else: assert nd_collaps == 1, err pass assert numpy.allclose(numpy.asarray( MyGpuNdArray.adds(c1, b1)), numpy.asarray(c) + b_cpu) def test_elemwise_mixed_dtype(): to_cpu = numpy.asarray for dtype1 in ["int16", "float32", "int8"]: for dtype2 in ["int16", "float32", "int8"]: dtypeo = str((numpy.zeros(1, dtype=dtype1) + numpy.zeros(1, dtype=dtype2)).dtype) #print "dtypes", dtype1, dtype2, "o dtype", dtypeo #print " Test inside a wrapping python object 2 inputs" for shape in [(500,), (50, 5), (5, 6, 7)]: input_vals = [rand(shape, dtype) for dtype in [dtype1, dtype2]] del dtype gpu_vals = [gpu_ndarray.GpuNdArrayObject(i) for i in input_vals] assert all([numpy.allclose(to_cpu(ig), i) for ig, i in zip(gpu_vals, input_vals)]) gpu_vals = [MyGpuNdArray(x) for x in gpu_vals] out = gpu_vals[0] + gpu_vals[1] assert numpy.allclose(to_cpu(out), input_vals[0] + input_vals[1]) out = gpu_vals[0] - gpu_vals[1] assert numpy.allclose(to_cpu(out), input_vals[0] - input_vals[1]) out = gpu_vals[0] * gpu_vals[1] assert all_close(to_cpu(out), input_vals[0] * input_vals[1]) if dtypeo.startswith("float"): # TODO: execute for all dtype out = gpu_vals[0] / gpu_vals[1] assert numpy.allclose(to_cpu(out), input_vals[0] / input_vals[1]) nb_in = 4 #print " Test inside a wrapping python object %d inputs"%nb_in for shape in [(500,), (50, 5), (5, 6, 7)]: input_vals = [rand(shape, dtype) for dtype in [dtype1, dtype2, dtype1, dtype2]] gpu_vals = [gpu_ndarray.GpuNdArrayObject(i) for i in input_vals] assert all([numpy.allclose(to_cpu(ig), i) for ig, i in zip(gpu_vals, input_vals)]) gpu_vals = [MyGpuNdArray(x) for x in gpu_vals] out = MyGpuNdArray.adds(*gpu_vals) assert numpy.allclose(to_cpu(out), 
reduce(numpy.add, input_vals)) out = MyGpuNdArray.multiplys(*gpu_vals) assert all_close(to_cpu(out), reduce(numpy.multiply, input_vals)) #print " Test broadcasting" for shapes in [((1, 5), (4, 5)), ((33, 10), (33, 1)), ((33, 1, 5), (33, 10, 1)), ((33, 1, 5), (33, 10, 1), ((1, 10, 5))), ]: input_vals = [rand(shape, dtype) for shape, dtype in zip(shapes, [dtype1, dtype2])] gpu_vals = [gpu_ndarray.GpuNdArrayObject(i) for i in input_vals] assert all([numpy.allclose(to_cpu(ig), i) for ig, i in zip(gpu_vals, input_vals)]) gpu_vals = [MyGpuNdArray(x) for x in gpu_vals] out = MyGpuNdArray.adds(*gpu_vals) assert numpy.allclose(to_cpu(out), reduce(numpy.add, input_vals)) out = MyGpuNdArray.multiplys(*gpu_vals) assert all_close(to_cpu(out), reduce(numpy.multiply, input_vals)) def test_sum(): to_cpu = numpy.asarray dtypes = list(dtypes_all) # I remove *int8 as currently the output have the same dtype # And this cause overflow dtypes.remove("int8") dtypes.remove("uint8") # I need to find how pycuda handle complexe in c. # I probably just need to add an header. dtypes.remove("complex64") if enable_double: dtypes.remove("complex128") for shape in [ # need something bigger then 32, 1024 or 4096. # Those are corner case. # 1d, take only a few seconds on a GTX470 (0,), (5,), (31,), (32,), (33,), (1023,), (1024,), (1025,), (4095,), (4096,), (4097,), (32 * 1024 - 1,), (32 * 1024,), (32 * 1024 + 1,), # 2d, take 2 minutes on a GTX 470 (0, 0), (1, 0), (0, 1,), (5, 4), (31, 31), (31, 32), (31, 33), (32, 31), (32, 32), (32, 33), (33, 31), (33, 32), (33, 33), (1024, 32), (1025, 32), (1024, 33), (1025, 33), (4096, 32), (32, 4096), (4096, 33), (33, 4096), (4097, 32), (32, 4097), (4097, 33), (33, 4097), # 3d, take 2 minutes on a GTX 470 (0, 0, 0), (0, 1, 0), (0, 0, 1), (5, 4, 3), (5, 4, 3), (5, 4, 3), (4096, 2, 33), (2, 4096, 33), (33, 2, 4096), (4097, 2, 33), (2, 4097, 33), (33, 2, 4097), (4096, 33, 2), (33, 4096, 2), (2, 33, 4096), (4097, 33, 2), (33, 4097, 2), (2, 33, 4097), # 4d, take 1 minutes on a GTX 470 (0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1), (5, 4, 3, 2), (1024, 32, 2, 3), (3, 1024, 32, 2), (2, 3, 1024, 32), (1024, 2, 32, 3), (3, 1024, 2, 32), (1024, 3, 2, 32), (1025, 33, 2, 3), (3, 1025, 33, 2), (2, 3, 1025, 33), (1025, 2, 33, 3), (3, 1025, 2, 33), (1025, 3, 2, 33), (4100, 4, 3, 2), (4, 4100, 3, 2), (4, 3, 4100, 2), (4, 3, 2, 4100), # 5d, work only if c contiguous (5, 4, 3, 10, 11), ]: for dtype, off_o, off_i, sliced, order in product( *([dtypes] + [[False, True]] + [[False, True]] + [[-1, 2, -2, 1]] + [['f', 'c']])): cpu_val, gpu_val = gen_gpu_nd_array(shape, dtype, off_o, off_i, sliced, order) if len(shape) > 4 and not (gpu_val.flags["C_CONTIGUOUS"] or gpu_val.flags["F_CONTIGUOUS"]): continue gpu_val = MyGpuNdArray(gpu_val) cpu_sum = cpu_val.sum() # print dtype, shape, off_o, off_i, sliced, order # print (cpu_val.strides, # cpu_val.flags["C_CONTIGUOUS"], # cpu_val.flags["F_CONTIGUOUS"]) # print (gpu_val.strides, # gpu_val.flags["C_CONTIGUOUS"], # gpu_val.flags["F_CONTIGUOUS"]) gpu_sum = to_cpu(gpu_val.sum()) def get_rtol(orig, after_reduction): if after_reduction.size == 0: return 0 if orig.size // after_reduction.size > 500000: rtols = {"float32": 4.3e-5} elif orig.size // after_reduction.size > 100000: rtols = {"float32": 3e-5} elif orig.size // after_reduction.size > 50000: rtols = {"float32": 2e-5} else: rtols = {"float32": 1e-5} if dtype in rtols: rtol = rtols[dtype] else: rtol = 1e-8 return rtol rtol = get_rtol(gpu_val, gpu_sum) cpu_sum = cpu_sum.astype(dtype) if not 
(dtype.endswith("int16") and numpy.prod(shape) > 20000): assert (numpy.allclose(cpu_sum, gpu_sum, rtol=rtol) or cpu_sum == gpu_sum), ( dtype, shape, cpu_sum, gpu_sum, (cpu_sum - gpu_sum) / cpu_sum) # Test pattern 10 and 01 # Test pattern 100, 010 and 001 if len(shape) in [2, 3]: for axis in range(len(shape)): gpu_sum = to_cpu(gpu_val.sum(axis=[axis])) cpu_sum = cpu_val.sum(axis=axis) rtol = get_rtol(gpu_val, gpu_sum) if cpu_sum.size > 0: argmax = numpy.absolute(cpu_sum - gpu_sum).argmax() cpu_max = cpu_sum.flatten()[argmax] gpu_max = gpu_sum.flatten()[argmax] assert numpy.allclose(cpu_sum, gpu_sum), ( "axis=%d" % axis, dtype, shape, cpu_sum.shape, cpu_sum, gpu_sum, cpu_max, gpu_max, (cpu_max - gpu_max) / cpu_max) pycuda-2013.1.1+git20140310/pycuda/compyte/ndarray/pygpu_language.h0000644000175000000500000001561112313360366023122 0ustar tomussrc/** * This file contain the header for ALL code that depend on cuda or opencl. */ #ifndef _PYGPU_LANGUAGE_H #define _PYGPU_LANGUAGE_H #include //#include #include "pygpu_ndarray_object.h" ///////////////////////// // Alloc and Free ///////////////////////// //If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device. #define COMPUTE_GPU_MEM_USED 0 #define VERBOSE_ALLOC_FREE 0 //If true, we fill with NAN allocated device memory. #define ALLOC_MEMSET 0 static int _outstanding_mallocs[] = {0,0}; #ifdef DEBUG #define DPRINTF(args...) fprintf(stderr, args) #else #define DPRINTF(...) #endif #if COMPUTE_GPU_MEM_USED int _allocated_size = 0; const int TABLE_SIZE = 10000; struct table_struct{ void* ptr; int size; }; table_struct _alloc_size_table[TABLE_SIZE]; #endif /** * Allocation and freeing of device memory should go through these functions so that the lib can track memory usage. * * device_malloc will set the Python error message before returning None. * device_free will return nonzero on failure (after setting the python error message) */ void * device_malloc(size_t size); int device_free(void * ptr); static PyObject * outstanding_mallocs(PyObject* self, PyObject * args) { return PyInt_FromLong(_outstanding_mallocs[0]); } int PyGpuNdArray_CopyFromPyGpuNdArray(PyGpuNdArrayObject * self, PyGpuNdArrayObject * other, bool unbroadcast = false); /** * PyGpuNdArray_alloc_contiguous * * Allocate storage space for a tensor of rank 'nd' and given dimensions. * * Note: PyGpuNdArray_alloc_contiguous is templated to work for both int dimensions and npy_intp dimensions */ template int PyGpuNdArray_alloc_contiguous(PyGpuNdArrayObject *self, const int nd, const inttype * dim, NPY_ORDER order=NPY_CORDER) { DPRINTF("PyGpuNdArray_alloc_contiguous: start nd=%i descr=%p\n", nd, self); if (!PyGpuNdArray_DESCR(self)){ PyErr_SetString(PyExc_ValueError, "PyGpuNdArray_alloc_contiguous: The array don't have a type! We can't allocate it!\n"); return -1; } // allocate an empty ndarray with c_contiguous access // return 0 on success int size = 1; //set up the strides for contiguous tensor assert (nd >= 0); if (PyGpuNdArray_set_nd(self, nd)) { return -1; } //TODO: check if by any chance our current dims are correct, // and strides already contiguous // in that case we can return right here. DPRINTF("PyGpuNdArray_alloc_contiguous: before itemsize descr=%p elsize=%i\n", self->descr, self->descr->elsize); int elsize = PyGpuNdArray_ITEMSIZE((PyObject*)self); DPRINTF("PyGpuNdArray_alloc_contiguous: set_nd %d! 
elsize=%i\n", nd, elsize); if(order != NPY_FORTRANORDER){ DPRINTF("PyGpuNdArray_alloc_contiguous: NPY_CORDER\n"); for (int i = nd-1; i >= 0; --i){ if (size == 0) PyGpuNdArray_STRIDE(self, i) = elsize; else PyGpuNdArray_STRIDE(self,i) = size * elsize; PyGpuNdArray_DIM(self,i) = dim[i]; size = size * dim[i]; } }else if (nd>0){ DPRINTF("PyGpuNdArray_alloc_contiguous: NPY_FORTRANORDER\n"); size = dim[0]; PyGpuNdArray_STRIDE(self, 0) = elsize; PyGpuNdArray_DIM(self, nd-1) = dim[nd-1]; for (int i = 1; i < nd; ++i){ if (size == 0) PyGpuNdArray_STRIDE(self, i) = elsize; else PyGpuNdArray_STRIDE(self, i) = PyGpuNdArray_STRIDE(self, i-1) * dim[i-1]; PyGpuNdArray_DIM(self, nd-i-1) = dim[nd-i-1]; size = size * dim[i]; } } if (self->data_allocated != size) { // If self is a view, do not try to free its memory if (self->data_allocated && device_free(PyGpuNdArray_DATA(self))) { // Does this ever happen?? Do we need to set data_allocated or devdata to 0? PyGpuNdArray_DATA(self) = NULL; self->data_allocated = 0; return -1; } assert(size>0); DPRINTF("PyGpuNdArray_alloc_contiguous: will allocate for size=%d elements\n", size); PyGpuNdArray_DATA(self) = (char*)device_malloc(size * PyGpuNdArray_ITEMSIZE((PyObject *)self)); if (!PyGpuNdArray_DATA(self)) { PyGpuNdArray_set_nd(self,-1); self->data_allocated = 0; PyGpuNdArray_DATA(self) = 0; return -1; } // The structure of self will be reused with newly allocated memory. // If self was a view, we should remove the reference to its base. // (If base was already NULL, the following has no effect.) Py_XDECREF(self->base); self->base = NULL; self->data_allocated = size; self->gpu_ndarray.flags = NPY_DEFAULT; PyGpuNdArray_FLAGS(self) |= NPY_WRITEABLE; PyGpuNdArray_FLAGS(self) |= NPY_OWNDATA; if (nd == 0) { PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; if (order != NPY_FORTRANORDER) { PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS; } else { PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; } }else if(nd == 1){//set c and f contiguous PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; }else if(order != NPY_FORTRANORDER){//set c contiguous PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS; PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; }else{//set f contiguous PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; PyGpuNdArray_FLAGS(self) &= ~NPY_C_CONTIGUOUS; } PyGpuNdArray_FLAGS(self) &= ~NPY_UPDATEIFCOPY; }else if(size == 0){ PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; PyGpuNdArray_FLAGS(self) |= NPY_OWNDATA; if (nd == 0) { PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; if (order != NPY_FORTRANORDER) { PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS; } else { PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; } }else if(nd == 1){//set c and f contiguous PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; }else if(order != NPY_FORTRANORDER){//set c contiguous PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS; PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; }else{//set f contiguous PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; PyGpuNdArray_FLAGS(self) &= ~NPY_C_CONTIGUOUS; } PyGpuNdArray_FLAGS(self) &= ~NPY_UPDATEIFCOPY; return 0; }else{ // How to check for the flags? Need to check if already contiguous. 
PyErr_Format(PyExc_RuntimeError, "PyGpuNdArray_alloc_contiguous: self->data_allocated=%d, size=%d, cmp=%d", self->data_allocated, size, self->data_allocated != size ); return -1; } if (order != NPY_FORTRANORDER) { assert(PyGpuNdArray_is_c_contiguous(self)); } else { assert(PyGpuNdArray_is_f_contiguous(self)); } DPRINTF("PyGpuNdArray_alloc_contiguous: end\n"); return 0; } enum PyGpuTransfert { PyGpuHostToDevice, PyGpuDeviceToHost }; int PyGpuMemcpy(void * dst, const void * src, int dev_offset, size_t bytes, PyGpuTransfert direction); int PyGpuMemset(void * dst, int data, size_t bytes); #endif pycuda-2013.1.1+git20140310/pycuda/compyte/ndarray/pygpu_ndarray_object.h0000644000175000000500000002154312313360366024326 0ustar tomussrc/** * struct PyGPUArrayObject * * This is a Python type. * */ #ifndef _PYGPU_NDARRAY_OBJECT_H #define _PYGPU_NDARRAY_OBJECT_H #include #include #include "gpu_ndarray.h" typedef struct PyGpuNdArrayObject{ PyObject_HEAD GpuNdArray gpu_ndarray; //no pointer, just inlined. PyObject * base; PyArray_Descr * descr; // for numpy-like desc int data_allocated; //the number of bytes allocated for devdata } PyGpuNdArrayObject; #define PyGpuNdArray_NDIM(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.nd) #define PyGpuNdArray_DATA(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.data) #define PyGpuNdArray_BYTES(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.data) #define PyGpuNdArray_OFFSET(obj) (((PyGpuNdArrayObject *)(obj))->gpu_ndarray.offset) #define PyGpuNdArray_DIMS(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.dimensions) #define PyGpuNdArray_STRIDES(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.strides) #define PyGpuNdArray_DIM(obj,n) (PyGpuNdArray_DIMS(obj)[n]) #define PyGpuNdArray_STRIDE(obj,n) (PyGpuNdArray_STRIDES(obj)[n]) #define PyGpuNdArray_BASE(obj) (((PyGpuNdArrayObject *)obj)->base) #define PyGpuNdArray_DESCR(obj) (((PyGpuNdArrayObject *)obj)->descr) #define PyGpuNdArray_FLAGS(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.flags) #define PyGpuNdArray_ITEMSIZE(obj) (((PyGpuNdArrayObject *)obj)->descr->elsize) #define PyGpuNdArray_TYPE(obj) (((PyGpuNdArrayObject *)(obj))->descr->type_num) #define PyGpuNdArray_SIZE(obj) PyArray_MultiplyList(PyGpuNdArray_DIMS(obj),PyGpuNdArray_NDIM(obj)) //npy_intp PyGpuNdArray_Size(PyObject* obj); //npy_intp PyGpuNdArray_NBYTES(PyObject* arr); /* Flags accessor */ #define PyGpuNdArray_CHKFLAGS(m, FLAGS) \ ((((PyGpuNdArrayObject *)(m))->gpu_ndarray.flags & (FLAGS)) == (FLAGS)) #define PyGpuNdArray_ISCONTIGUOUS(m) PyGpuNdArray_CHKFLAGS(m, NPY_CONTIGUOUS) #define PyGpuNdArray_ISFORTRAN(m) (PyGpuNdArray_CHKFLAGS(m, NPY_F_CONTIGUOUS) && \ PyGpuNdArray_NDIM(m) > 1) #define PyGpuNdArray_FORTRAN_IF(m) (PyGpuNdArray_CHKFLAGS(m, NPY_F_CONTIGUOUS)? \ NPY_F_CONTIGUOUS : 0) #define PyGpuNdArray_ISONESEGMENT(m) (PyGpuNdArray_NDIM(m) == 0 || \ PyGpuNdArray_ISCONTIGUOUS(m) || \ PyGpuNdArray_ISFORTRAN(m)) #define PyGpuNdArray_ISWRITEABLE(m) PyGpuNdArray_CHKFLAGS(m, NPY_WRITEABLE) #define PyGpuNdArray_ISALIGNED(m) PyGpuNdArray_CHKFLAGS(m, NPY_ALIGNED) #define PyGpuNdArray_ISNBO(arg) ((arg) != NPY_OPPBYTE) // THE NEXT ONE SEEM BAD... 
#define PyGpuNdArray_IsNativeByteOrder PyArray_ISNBO #define PyGpuNdArray_ISNOTSWAPPED(m) PyArray_ISNBO(PyArray_DESCR(m)->byteorder) #define PyGpuNdArray_FLAGSWAP(m, flags) (PyGpuNdArray_CHKFLAGS(m, flags) && PyGpuNdArray_ISNOTSWAPPED(m)) #define PyGpuNdArray_ISCARRAY(m) PyGpuNdArray_FLAGSWAP(m, NPY_CARRAY) #define PyGpuNdArray_ISCARRAY_RO(m) PyGpuNdArray_FLAGSWAP(m, NPY_CARRAY_RO) #define PyGpuNdArray_ISFARRAY(m) PyGpuNdArray_FLAGSWAP(m, NPY_FARRAY) #define PyGpuNdArray_ISFARRAY_RO(m) PyGpuNdArray_FLAGSWAP(m, NPY_FARRAY_RO) #define PyGpuNdArray_ISBEHAVED(m) PyGpuNdArray_FLAGSWAP(m, NPY_BEHAVED) #define PyGpuNdArray_ISBEHAVED_RO(m) PyGpuNdArray_FLAGSWAP(m, NPY_ALIGNED) static void PyGpuNdArray_fprint(FILE * fd, const PyGpuNdArrayObject *self) { fprintf(fd, "PyGpuNdArrayObject <%p, %p> nd=%i data_allocated=%d\n", self, PyGpuNdArray_DATA(self), PyGpuNdArray_NDIM(self), self->data_allocated); fprintf(fd, "\tITEMSIZE: %d\n", PyGpuNdArray_ITEMSIZE(self)); fprintf(fd, "\tTYPENUM: %d\n", PyGpuNdArray_TYPE(self)); fprintf(fd, "\tRefcount: %ld\n", (long int)self->ob_refcnt); fprintf(fd, "\tBASE: %p\n", PyGpuNdArray_BASE(self)); fprintf(fd, "\tHOST_DIMS: "); for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) { fprintf(fd, "%ld\t", PyGpuNdArray_DIMS(self)[i]); } fprintf(fd, "\n\tHOST_STRIDES: "); for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) { fprintf(fd, "%ld\t", PyGpuNdArray_STRIDES(self)[i]); } fprintf(fd, "\n\tFLAGS: "); fprintf(fd, "\n\t\tC_CONTIGUOUS: %d", PyGpuNdArray_ISCONTIGUOUS(self)); fprintf(fd, "\n\t\tPyGpuNdArray_ISFORTRAN: %d PyGpuNdArray_FORTRAN_IF:%d F_CONTIGUOUS: %d", PyGpuNdArray_ISFORTRAN(self), PyGpuNdArray_FORTRAN_IF(self), PyGpuNdArray_CHKFLAGS(self, NPY_FORTRAN)); fprintf(fd, "\n\t\tOWNDATA: %d", PyGpuNdArray_CHKFLAGS(self, NPY_OWNDATA)); fprintf(fd, "\n\t\tWRITEABLE: %d", PyGpuNdArray_ISWRITEABLE(self)); fprintf(fd, "\n\t\tALIGNED: %d", PyGpuNdArray_ISALIGNED(self)); fprintf(fd, "\n\t\tUPDATEIFCOPY: %d", PyGpuNdArray_CHKFLAGS(self, NPY_UPDATEIFCOPY)); fprintf(fd, "\n"); } static void PyArray_fprint(FILE * fd, const PyArrayObject *self) { fprintf(fd, "PyArrayObject <%p, %p> nd=%i\n", self, PyArray_DATA(self), PyArray_NDIM(self)); fprintf(fd, "\tITEMSIZE: %d\n", PyArray_ITEMSIZE(self)); fprintf(fd, "\tTYPENUM: %d\n", PyArray_TYPE(self)); fprintf(fd, "\tHOST_DIMS: "); for (int i = 0; i < PyArray_NDIM(self); ++i) { fprintf(fd, "%ld\t", PyArray_DIMS(self)[i]); } fprintf(fd, "\n\tHOST_STRIDES: "); for (int i = 0; i < PyArray_NDIM(self); ++i) { fprintf(fd, "%ld\t", PyArray_STRIDES(self)[i]); } fprintf(fd, "\n\tFLAGS: "); fprintf(fd, "\n\t\tC_CONTIGUOUS: %d", PyArray_ISCONTIGUOUS(self)); fprintf(fd, "\n\t\tPyArray_ISFORTRAN: %d PyArray_FORTRAN_IF:%d F_CONTIGUOUS: %d", PyArray_ISFORTRAN(self), PyArray_FORTRAN_IF(self), PyArray_CHKFLAGS(self, NPY_FORTRAN)); fprintf(fd, "\n\t\tOWNDATA: %d", PyArray_CHKFLAGS(self, NPY_OWNDATA)); fprintf(fd, "\n\t\tWRITEABLE: %d", PyArray_ISWRITEABLE(self)); fprintf(fd, "\n\t\tALIGNED: %d", PyArray_ISALIGNED(self)); fprintf(fd, "\n\t\tUPDATEIFCOPY: %d", PyArray_CHKFLAGS(self, NPY_UPDATEIFCOPY)); fprintf(fd, "\n"); } template static T ceil_intdiv(T a, T b) { return (a/b) + ((a % b) ? 
1: 0); } //Compute if the resulting array is c contiguous static bool PyGpuNdArray_is_c_contiguous(const PyGpuNdArrayObject * self) { bool c_contiguous = true; int size = PyGpuNdArray_ITEMSIZE(self); for (int i = PyGpuNdArray_NDIM(self)-1; (i >= 0) && c_contiguous; --i) { if (PyGpuNdArray_STRIDE(self, i) != size) { c_contiguous = false; } size = size * PyGpuNdArray_DIM(self, i); } return c_contiguous; } //Compute if the resulting array is f contiguous static bool PyGpuNdArray_is_f_contiguous(const PyGpuNdArrayObject * self) { bool f_contiguous = true; int size = PyGpuNdArray_ITEMSIZE(self); for (int i = 0; i < PyGpuNdArray_NDIM(self) && f_contiguous; ++i) { if (PyGpuNdArray_STRIDE(self, i) != size) { f_contiguous = false; } size = size * PyGpuNdArray_DIM(self, i); } return f_contiguous; } static PyObject * PyGpuNdArray_as_c_contiguous(PyObject* dummy, PyObject* args, PyObject *kargs); static PyObject * PyGpuNdArray_as_f_contiguous(PyObject* dummy, PyObject* args, PyObject *kargs); /** * [Re]allocate a PyGpuNdArrayObject with access to 'nd' dimensions. * * Note: This does not allocate storage for data. */ static int PyGpuNdArray_set_nd(PyGpuNdArrayObject * self, const int nd) { if (nd != PyGpuNdArray_NDIM(self)) { if(0) fprintf(stderr, "PyGpuNdArray_set_nd: modif nd=%i to nd=%i\n", PyGpuNdArray_NDIM(self), nd); if (PyGpuNdArray_DIMS(self)){ free(PyGpuNdArray_DIMS(self)); PyGpuNdArray_DIMS(self) = NULL; PyGpuNdArray_NDIM(self) = -1; } if (PyGpuNdArray_STRIDES(self)){ free(PyGpuNdArray_STRIDES(self)); PyGpuNdArray_STRIDES(self) = NULL; PyGpuNdArray_NDIM(self) = -1; } if (nd == -1) return 0; PyGpuNdArray_DIMS(self) = (npy_intp*)malloc(nd*sizeof(npy_intp)); if (NULL == PyGpuNdArray_DIMS(self)) { PyErr_SetString(PyExc_MemoryError, "PyGpuNdArray_set_nd: Failed to allocate dimensions"); return -1; } PyGpuNdArray_STRIDES(self) = (npy_intp*)malloc(nd*sizeof(npy_intp)); if (NULL == PyGpuNdArray_STRIDES(self)) { PyErr_SetString(PyExc_MemoryError, "PyGpuNdArray_set_nd: Failed to allocate str"); return -1; } //initialize all dimensions and strides to 0 for (int i = 0; i < nd; ++i) { PyGpuNdArray_DIM(self, i) = 0; PyGpuNdArray_STRIDES(self)[i] = 0; } PyGpuNdArray_NDIM(self) = nd; if(0) fprintf(stderr, "PyGpuNdArray_set_nd: end\n"); } return 0; } #endif /* Local Variables: mode:c++ c-basic-offset:4 c-file-style:"stroustrup" c-file-offsets:((innamespace . 0)(inline-open . 
0)) indent-tabs-mode:nil fill-column:79 End: */ // vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 : pycuda-2013.1.1+git20140310/pycuda/compyte/ndarray/setup_opencl.py0000644000175000000500000000734012313360366023014 0ustar tomussrcimport os from distutils.core import setup, Extension from distutils.command.build_ext import build_ext from distutils.dep_util import newer import numpy as np class build_ext_nvcc(build_ext): user_options = build_ext.user_options user_options.extend([ ('cuda-root=', None, "The cuda root directory")]) def initialize_options(self): build_ext.initialize_options(self) self.cuda_root = None def finalize_options(self): build_ext.finalize_options(self) if self.cuda_root is None: self.cuda_root = os.getenv('CUDA_ROOT', None) if self.cuda_root is not None: self._nvcc_bin = os.path.join(self.cuda_root, 'bin', 'nvcc') else: self._nvcc_bin = 'nvcc' def cuda_process(self, source, include_args): target = source + '.cpp' if newer(source, target): self.spawn([self._nvcc_bin, '--cuda', source, '-o', target] + \ include_args) return target def cuda_extension(self, ext): includes = self.distribution.include_dirs + ext.include_dirs include_args = ['-I' + i for i in includes] new_sources = [] anycuda = False for src in ext.sources: if src.endswith('.cu'): new_sources.append(self.cuda_process(src, include_args)) anycuda = True else: new_sources.append(src) if anycuda: ext.sources = new_sources if self.cuda_root is not None: lib = os.path.join(self.cuda_root, 'lib') lib64 = os.path.join(self.cuda_root, 'lib64') if os.path.isdir(lib): ext.library_dirs.append(lib) ext.extra_link_args.append('-Xlinker') ext.extra_link_args.append('-rpath') ext.extra_link_args.append('-Xlinker') ext.extra_link_args.append(lib) if os.path.isdir(lib64): ext.library_dirs.append(lib64) # ext.extra_link_args.append('-rpath') # ext.extra_link_args.append(lib64) if 'cudart' not in ext.libraries: ext.libraries.append('cudart') if self.cuda_root: include = os.path.join(self.cuda_root, 'include') if os.path.isdir(include): ext.extra_compile_args.append('-I' + include) if os.path.isfile('/usr/lib/nvidia-current/libOpenCL.so'): ext.extra_link_args.append('-L/usr/lib/nvidia-current') ext.extra_link_args.append('-Xlinker') ext.extra_link_args.append('-rpath') ext.extra_link_args.append('-Xlinker') ext.extra_link_args.append('/usr/lib/nvidia-current') def build_extensions(self): self.check_extensions_list(self.extensions) for ext in self.extensions: self.cuda_extension(ext) # uncomment this + inherit from the cython version of build_ext # work with cuda and cython sources #ext.sources = self.cython_sources(ext.sources, ext) self.build_extension(ext) import sys if sys.platform == 'darwin': libcl_args = {'extra_link_args': ['-framework', 'OpenCL']} else: libcl_args = {'libraries': ['OpenCL']} setup(name='compyte', cmdclass={'build_ext': build_ext_nvcc}, include_dirs=[np.get_include(), '.'], ext_modules=[Extension('pygpu_ndarray', define_macros=[('OFFSET', '1'), ('WITH_OPENCL', '')], sources=['pygpu_language_opencl.cpp', 'pygpu_ndarray.cpp'], **libcl_args) ] ) pycuda-2013.1.1+git20140310/pycuda/compyte/ndarray/Makefile0000644000175000000500000000232412313360366021377 0ustar tomussrcall: pygpu_ndarray.so PYTHONVERSION ?= $(shell python -c "import sys; print '%d.%d'%(sys.version_info[0], sys.version_info[1]")) CUDA_ROOT ?= /opt/lisa/os/cuda THEANO_ROOT ?= /u/bastienf/repos/Theano CFLAGS=-g -DDEBUG -DOFFSET # By default enable the OFFSET usage. Otherwise some test fail. 
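# Note (added comment): make keeps only the last CFLAGS assignment, so the
# effective default below is "-g -DOFFSET"; swap or comment out the line
# below if you want the -DDEBUG variant defined above.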
CFLAGS=-g -DOFFSET #BINDIR=--compiler-bindir ${HOME}/.theano.nvcc-bindir #NPY_PATH!=python -c "import numpy;print numpy.__path__" #NPY_INCLUDE=-I${NPY_PATH}/core/include CUDA_INCLUDE=-I${CUDA_ROOT}/include PYTHON_INCLUDE=-I$(shell python -c "import distutils.sysconfig;print distutils.sysconfig.get_python_inc()") INCLUDES=${CUDA_INCLUDE} ${PYTHON_INCLUDE} CUDA_FLAGS=-Xlinker -rpath,${CUDA_ROOT}/lib64 -Xlinker -rpath,${CUDA_ROOT}/lib pygpu_language_cuda.o: pygpu_language_cuda.cu pygpu_language.h nvcc -c ${CFLAGS} -m64 -Xcompiler -fPIC,-m64 ${CUDA_FLAGS} ${INCLUDES} ${BINDIR} -o $@ $< pygpu_ndarray.so: pygpu_ndarray.cpp pygpu_ndarray.h pygpu_language_cuda.o pygpu_ndarray_object.h nvcc -shared ${CFLAGS} -m64 -Xcompiler -fPIC,-m64 ${CUDA_FLAGS} ${INCLUDES} ${BINDIR} -o $@ pygpu_language_cuda.o $< -lpython${PYTHONVERSION} -lcublas -lcudart clean: rm -f pygpu_ndarray.so core.* *.o *~ rm -rf build cleantmp: rm -f core.* *.o *~pycuda-2013.1.1+git20140310/pycuda/compyte/ndarray/gen_elemwise.py0000644000175000000500000022514612313360366022765 0ustar tomussrc""" This file implements one version of the elemwise op on the GPU. The elemwise functions are also used with scalar operations, so it can happen that ndim is 0, as with all scalar types. """ import numpy import StringIO import pygpu_ndarray as gpu_ndarray _CL_MODE = hasattr(gpu_ndarray, "set_opencl_context") if _CL_MODE: # THIS IS NOT FINISHED import pyopencl as cl import pyopencl.array as cl_array from pyopencl.tools import dtype_to_ctype # import pyopencl._mymako as mako from pyopencl._cluda import CLUDA_PREAMBLE # TODO: use mako to get rid of the %if CLUDA_PREAMBLE = CLUDA_PREAMBLE[:455] CLUDA_PREAMBLE += """ #define LDIM_0 get_local_size(0) #define LDIM_1 get_local_size(1) #define LDIM_2 get_local_size(2) #define GDIM_0 get_num_groups(0) #define GDIM_1 get_num_groups(1) #define GDIM_2 get_num_groups(2) """ # TODO: reuse the same context as the user used to create the memory.
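    # Note (added comment): the defines appended to CLUDA_PREAMBLE above give
    # the generated kernels the same LDIM_*/GDIM_* names that the CUDA branch
    # below maps to blockDim/gridDim, so a single code generator can serve
    # both back-ends.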
ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) else: import pycuda.autoinit import pycuda.driver as driver from pycuda.compiler import SourceModule from pycuda.tools import dtype_to_ctype # import pycuda._mymako as mako from pycuda._cluda import CLUDA_PREAMBLE CLUDA_PREAMBLE += """ #define LDIM_0 blockDim.x #define LDIM_1 blockDim.y #define LDIM_2 blockDim.z #define GDIM_0 gridDim.x #define GDIM_1 gridDim.y #define GDIM_2 gridDim.z """ from theano import Apply from theano import scalar from theano.tensor import TensorType import theano import logging _logger_name = 'compyte.gen_elemwise' _logger = logging.getLogger(_logger_name) _logger.setLevel(logging.INFO) _logger.addHandler(logging.StreamHandler()) # TO REMOVE def warning(*msg): _logger.warning(_logger_name + 'WARNING: ' + ' '.join(str(m) for m in msg)) def info(*msg): _logger.info(_logger_name + 'INFO: ' + ' '.join(str(m) for m in msg)) def debug(*msg): _logger.debug(_logger_name + 'DEBUG: ' + ' '.join(str(m) for m in msg)) if _CL_MODE: gpu_ndarray.set_opencl_context(ctx.obj_ptr) cast_int = numpy.intc cast_uint = numpy.uintc def _logical_scalar(x): return numpy.all(x.type.broadcastable) def get_str_list_logical_scalar(inputs, value_str='ii_i%i_value', data_str='ii_i%i_data[0]'): l = [] for ipos, i in enumerate(inputs): if _logical_scalar(i): l += [value_str % ipos] else: l += [data_str % ipos] return l class WrapOpenCLFunction(object): def __init__(self, fct): self.fct = fct def _param_wrap(self, p): if isinstance(p, MyGpuNdArray): p = p.gpu_nd_array if isinstance(p, gpu_ndarray.GpuNdArrayObject): p = cl.MemoryObject.from_cl_mem_as_int(p.bytes) return p def set_block_shape(self, *shape): self.local_size = shape def param_set(self, *param): self.param = [self._param_wrap(p) for p in param] def launch_grid(self, *global_shape): global_size = global_shape + (1,) d = {"g_times_l": True} return self.fct(queue, global_size, self.local_size, *self.param, **d) def compile_gpu_code(code, fct_name): if _CL_MODE: # Compile the gpu function with pyopencl prg = cl.Program(ctx, code).build() fct2 = getattr(prg, fct_name) fct = WrapOpenCLFunction(fct2) else: # Compile the gpu function with pycuda mod = SourceModule(code) fct = mod.get_function(fct_name) return fct class ElemwiseAlgo(object): verbose = 0 # 1, 2 or 3 for more verbose output. cache_version = () cache_version = ('debug', 14, verbose) def __init__(self, scalar_op, inplace_pattern={}): """ :param scalar_op: the scalar operation to execute on each element. """ self.scalar_op = scalar_op self.inplace_pattern = inplace_pattern def task_code(self, inputs, outputs, sio, nodename, iname=None, oname=None): if iname == None: iname = get_str_list_logical_scalar(inputs) if oname == None: oname = ['ii_o%i_data[0]' % ipos for ipos, i in enumerate(outputs)] print >> sio, self.scalar_op.c_code( Apply(self.scalar_op, [scalar.Scalar(dtype=input.type.dtype)() for input in inputs], [scalar.Scalar(dtype=output.type.dtype)() for output in outputs]), nodename + '_scalar_', iname, oname, sub=dict(fail='return;')) # TODO: set a failure code somehow!!! 
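    # Illustrative note (added, not in the original): with the defaults above,
    # task_code asks scalar_op.c_code to read/write names such as
    #     iname = ["ii_i0_value", "ii_i1_data[0]"]   # input 0 broadcast scalar
    #     oname = ["ii_o0_data[0]"]
    # so for a hypothetical add op the emitted C is roughly
    #     ii_o0_data[0] = ii_i0_value + ii_i1_data[0];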
def c_src_kernel(self, inputs, outputs, nodename, nd, static="static"): sio = StringIO.StringIO() #print 'C_SRC_KERNEL', sio.getvalue() for ipos, i in enumerate(inputs): print >> sio, "// Input ", ipos, str(i.type) for ipos, i in enumerate(outputs): print >> sio, "// Output ", ipos, str(i.type) print >> sio, static, ( "KERNEL void kernel_%s_%s(unsigned int numEls" % (nodename, nd)) if (nd): print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd)) #declare inputs for ipos, i in enumerate(inputs): s = ", ".join(["GLOBAL_MEM const %s * i%i_data" % ( dtype_to_ctype(i.dtype), ipos)] + list("int i%i_str_%i" % (ipos, d) for d in xrange(nd))) print >> sio, "\t,", s #declare outputs for ipos, i in enumerate(outputs): s = ", ".join(["GLOBAL_MEM %s * o%i_data" % ( dtype_to_ctype(i.dtype), ipos)] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd))) print >> sio, "\t,", s #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) # for d in xrange(nd)) #print >> sio, "\t,", "float * o%i_data" % ipos print >> sio, "\t)\n{" print >> sio, " const int idx = GID_0 * LDIM_0 + LID_0;" print >> sio, " const int numThreads = LDIM_0 * GDIM_0;" # For each input that is a scalar which has been broadcasted # to a tensor, load it into a local variable for ipos, i in enumerate(inputs): if _logical_scalar(i): print >> sio, " const %s ii_i%i_value = i%i_data[0];" % ( dtype_to_ctype(i.dtype), ipos, ipos) #loop over the elements to be treated by this kernel call print >> sio, " for (int i = idx; i < numEls; i += numThreads) {" # calculate the data pointers for all arguments print >> sio, " int ii = i;" for ipos, i in enumerate(inputs): if not _logical_scalar(i): print >> sio, (" GLOBAL_MEM const " "%s * ii_i%i_data = i%i_data;" % ( dtype_to_ctype(i.dtype), ipos, ipos)) for ipos, i in enumerate(outputs): print >> sio, " GLOBAL_MEM %s * ii_o%i_data = o%i_data;" % ( dtype_to_ctype(i.dtype), ipos, ipos) for d in xrange(nd - 1, -1, -1): if d > 0: print >> sio, " int pos%i = ii %% dim%i;" % (d, d) print >> sio, " ii = ii / dim%i;" % d else: print >> sio, " int pos%i = ii;" % d for ipos, i in enumerate(inputs): if not _logical_scalar(i): print >> sio, (" ii_i" "%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d)) for ipos, i in enumerate(outputs): print >> sio, " ii_o%i_data += pos%i * o%i_str_%i;" % ( ipos, d, ipos, d) # perform the scalar operation on the input and output references #TODO: What if the scalar_op needs support_code?? 
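            # Added comment: the loop generated above decomposes the flat index
            # ii in C order, innermost dimension first.  For example, for
            # dims (3, 4) and ii = 7 the emitted code computes
            # pos1 = 7 % 4 = 3, ii = 7 / 4 = 1, pos0 = 1, and advances each
            # non-scalar pointer by pos_d * stride_d.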
self.task_code(inputs, outputs, sio, nodename) print >> sio, " }" #indent = " "*(4*d+7) #for ipos, i in enumerate(inputs): #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', '' print >> sio, "}" #print sio.getvalue() return sio.getvalue() def c_src_kernel_Ccontiguous(self, inputs, outputs, nodename, static="static"): nd = outputs[0].type.ndim sio = StringIO.StringIO() #print 'C_SRC_KERNEL', sio.getvalue() for ipos, i in enumerate(inputs): print >> sio, "// Input ", ipos, str(i.type) for ipos, i in enumerate(outputs): print >> sio, "// Output ", ipos, str(i.type) print >> sio, static, ("KERNEL void kernel_%s_Ccontiguous" " (unsigned int numEls" % (nodename)) #declare inputs for ipos, i in enumerate(inputs): print >> sio, "\t,", "GLOBAL_MEM const %s * i%i_data" % ( dtype_to_ctype(i.dtype), ipos) #declare outputs for ipos, i in enumerate(outputs): print >> sio, "\t,", "GLOBAL_MEM %s * o%i_data" % ( dtype_to_ctype(i.dtype), ipos) print >> sio, "\t)\n{" print >> sio, " const int idx = GID_0 * LDIM_0 + LID_0;" print >> sio, " const int numThreads = LDIM_0 * GDIM_0;" # For each input that is a scalar which has been broadcasted # to a tensor, load it into a local variable for ipos, i in enumerate(inputs): if _logical_scalar(i): print >> sio, " const %s ii_i%i_value = i%i_data[0];" % ( dtype_to_ctype(i.dtype), ipos, ipos) #loop over the elements to be treated by this kernel call print >> sio, " for (int i = idx; i < numEls; i += numThreads) {" # perform the scalar operation on the input and output references #TODO: What if the scalar_op needs support_code?? self.task_code(inputs, outputs, sio, nodename, iname=get_str_list_logical_scalar( inputs, data_str='i%i_data[i]'), oname=['o%i_data[i]' % ipos for ipos, i in enumerate(outputs)]) print >> sio, " }" print >> sio, "}" #print sio.getvalue() return sio.getvalue() def c_src_callkernel(self, inputs, outputs, nodename): # # This function serves three main goals: # # The first is stride unpacking: # it accepts input and output arguments as # float * , int* # pairs, and it constructs a kernel function call where inputs # and arguments are named like # float *, int, int, int ... # # The second is to recognize when any dimensions can be collapsed as # being contiguous. That mean that we can merge that dimensions with # another one for all inputs/outputs and have the same retusuls # (confusing... read code) # # The thrid is to make a special case for scalar element. We allow # the collapsing of them. In the ccontiguous and not contiguous case, # we use registers to lower the number of memory access. # TODO: make a special case for broadcasting, to store the # data in shared memory. nd = outputs[0].type.ndim nb_inputs = len(inputs) nb_outputs = len(outputs) d = dict() # input_params and output_params go into the function # declaration/definition input_params = ", ".join("const %s * i%i_data, const int * i%i_str" % ( dtype_to_ctype(inputs[i].dtype), ipos, ipos) for ipos in xrange(len(inputs))) output_params = ", ".join("%s * o%i_data, const int * o%i_str" % ( dtype_to_ctype(outputs[i].dtype), ipos, ipos) for ipos in xrange(len(outputs))) #input_args and output_args go into the recursive call. 
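        # Added note on the "collapse" step mentioned above (illustrative): a
        # dimension i is merged into i-1 whenever strides[i] * dims[i] ==
        # strides[i-1] holds for every input and output.  E.g. a C-contiguous
        # float32 array of shape (100, 200) with element strides (200, 1)
        # collapses to a flat array of 20000 elements, so the cheap
        # contiguous kernel can be used.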
input_args = ", ".join("i%i_data, i%i_str" % (ipos, ipos) for ipos in xrange(len(inputs))) output_args = ", ".join("o%i_data, o%i_str" % (ipos, ipos) for ipos in xrange(len(outputs))) prod_dims = '*'.join(["dims[%i]" % di for di in xrange(nd)] + ['1']) sio = StringIO.StringIO() print >> sio, """ static void can_collapse_%(nodename)s(int nd, const int * dims, const int * strides, int collapse[]) { //can we collapse dims[i] and dims[i-1] for(int i=nd-1;i>0;i--){ if(strides[i]*dims[i]==strides[i-1]){ //the dims nd-1 are not strided again dimension nd collapse[i]=1; }else collapse[i]=0; } } """ % locals() print >> sio, """ static int callkernel_%(nodename)s(unsigned int numEls, const int d, const int * dims, %(input_params)s, %(output_params)s) { numEls = %(prod_dims)s; """ % locals() if self.verbose: print >> sio, """ std::cerr << "calling kernel_%(nodename)s w numEls" << numEls << " dims"<< d << "\\n"; """ % locals() print >> sio, 'std::cerr << ' + " << ' ' << ".join(['" "']+list("dims[%i]"%di for di in xrange(nd)) + ["'\\n';"]) if self.verbose > 1: for ipos in xrange(len(inputs)): print >> sio, """ std::cerr << " %(ipos)s data strides" << """ % locals() + " << ' ' << ".join(["i%s_data" % ipos] + list("i%s_str[%i]" % (ipos, di) for di in xrange(nd))) + ''' << "\\n"; ''' for ipos in xrange(len(outputs)): print >> sio, """ std::cerr << " %(ipos)s data strides" << """ % locals() + " << ' ' << ".join(["o%s_data" % ipos] + list("o%s_str[%i]" % (ipos, di) for di in xrange(nd))) + ''' << "\\n"; ''' # collapse dimension that are broadcast in all inputs. # need to be done before contiguous collapse as it will break it. # do the dimensions and the strides print >> sio, """ int local_dims[%(nd)s]; int local_str[%(nb_inputs)s][%(nd)s]; int local_ostr[%(nb_inputs)s][%(nd)s]; int nd_collapse = %(nd)s; for(int i=0;i<%(nd)s;i++){//init new dim local_dims[i]=dims[i]; } """ % locals() for ipos in xrange(len(inputs)): print >> sio, """ for(int i=0;i<%(nd)s;i++){//init new strides local_str[%(ipos)s][i]=i%(ipos)s_str[i]; } """ % locals() for ipos in xrange(len(outputs)): print >> sio, """ for(int i=0;i<%(nd)s;i++){//init new strides local_ostr[%(ipos)s][i]=o%(ipos)s_str[i]; } """ % locals() if self.verbose > 2: print >>sio, 'std::cerr <<"before broadcast collapse\\n";' print >>sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; ' print >> sio, 'std::cerr << "local_dims";' for d in xrange(nd): print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals() print >> sio, 'std::cerr << "\\n";' for ipos in xrange(len(inputs)): print >> sio, 'std::cerr << " local_str inputs %(ipos)s: " <<' % locals()+' << " " << '.join(["local_str[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";' for ipos in xrange(len(outputs)): print >> sio, 'std::cerr << " local_ostr inputs %(ipos)s: " <<' % locals()+' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";' print >> sio, """ for(int id=0;id 2: print >>sio, 'std::cerr <<"after broadcast collapse\\n";' print >>sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; ' print >> sio, 'std::cerr << "local_dims";' for d in xrange(nd): print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals() print >> sio, 'std::cerr << "\\n";' for ipos in xrange(len(inputs)): print >> sio, 'std::cerr << " local_str %(ipos)s: " <<' % locals()+' << " " << '.join(["local_str[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";' for ipos in xrange(len(outputs)): print >> sio, 'std::cerr << " local_ostr %(ipos)s: " <<' % locals()+' << " " << 
'.join(["local_ostr[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";' # collapse contiguous dimensions (ignoring scalars, generic version(collapse any dimensions, right, left, middle)) # this is a good idea because we make less index calculation in the gpu. print >> sio, "int nd_collapse_[%(nd)s] = {" % locals() +','.join(['1' for x in range(nd)]) +"};" for ipos in xrange(len(inputs)): if not _logical_scalar(inputs[ipos]): print >> sio, """ int nd_collapse_%(ipos)s[%(nd)s] = {""" % locals() +','.join(['1' for x in range(nd)]) +"};" print >> sio, """ can_collapse_%(nodename)s(nd_collapse, local_dims, local_str[%(ipos)s], nd_collapse_%(ipos)s); for(int i=0;i 1: print >>sio, """ std::cerr<< "nd_collapse_%(ipos)s "<< """ % locals() print >>sio, ' << " " << '.join( ["nd_collapse_%(ipos)s[" % locals() + str(i) + "]" for i in range(nd)]) print >>sio, '<< "\\n";' print >>sio, """ std::cerr<< "nd_collapse_ "<< """ % locals() print >>sio, ' << " " << '.join( ["nd_collapse_[" % locals() + str(i) + "]" for i in range(nd)]) print >>sio, '<< "\\n";' # update the local stride. for ipos in xrange(len(inputs)): print >> sio, """ for(int i=nd_collapse-1;i>0;i--){ if(nd_collapse_[i]==1){ local_str[%(ipos)s][i-1]=local_str[%(ipos)s][i];//set new strides for(int j=i+1;j> sio, """ for(int i=nd_collapse-1;i>0;i--){ if(nd_collapse_[i]==1){ local_ostr[%(ipos)s][i-1]=local_ostr[%(ipos)s][i];//set new strides for(int j=i+1;j> sio, """ for(int i=nd_collapse-1;i>0;i--){ if(nd_collapse_[i]==1){ local_dims[i-1]*=local_dims[i];//set new dims for(int j=i+1;j> sio, """ for(int i=1, end=nd_collapse;i 0: print >> sio, " && ", " && ".join(l) print >> sio, """){nd_collapse=0;} """ if self.verbose: print >> sio, 'std::cerr <<"after can_collapse\\n";' print >> sio, """std::cerr << "nd_collapse " << nd_collapse << "\\n"; """ % locals() if self.verbose > 1: for d in xrange(nd): print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals() print >> sio, 'std::cerr << "\\n";' for ipos in xrange(len(inputs)): print >> sio, ('std::cerr << " local_str %(ipos)s: " <<' % locals() + ' << " " << '.join( ["local_str[%(ipos)s][%(x)s]" % locals() for x in range(nd)]) + '<<"\\n";') for ipos in xrange(len(outputs)): print >> sio, ('std::cerr << " local_ostr %(ipos)s: " <<' % locals() + ' << " " << '.join( ["local_ostr[%(ipos)s][%(x)s]" % locals() for x in range(nd)]) + '<<"\\n";') def launch_Ccontiguous(nodename, scalar_op): kernel_call_args = ["numEls"] for ipos in xrange(len(inputs)): kernel_call_args.append("i%i_data" % ipos) for ipos in xrange(len(outputs)): kernel_call_args.append("o%i_data" % ipos) kernel_call_args = ", ".join(kernel_call_args) verb = "" if self.verbose: verb = 'std::cerr << " Running ccontiguous version\\n";' print >> sio, """ //first use at least a full warp int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE //next start adding multiprocessors int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS // next start adding more warps per multiprocessor if (threads_per_block * n_blocks < numEls) threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); kernel_%(nodename)s_Ccontiguous<<>>(%(kernel_call_args)s); //std::cerr << "calling callkernel returned\\n"; """ % locals() print >> sio, """ CNDA_THREAD_SYNC; cudaError_t err = cudaGetLastError(); if( cudaSuccess != err) { PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: 
%%s\\n", "GpuElemwise %(nodename)s", cudaGetErrorString(err), n_blocks, threads_per_block, "kernel_%(nodename)s_Ccontiguous<<>>(%(kernel_call_args)s)"); return -1; } %(verb)s return 0; """ % locals() def launch_General(nodename, scalar_op, force_nd): # kernel_call_args are used to invoke the cuda kernel local = "local_" kernel_call_args = ["numEls"] kernel_call_args.extend(local + "dims[%i]" % di for di in xrange(force_nd)) for ipos in xrange(len(inputs)): kernel_call_args += ["i%i_data" % ipos] + list( local + "str[%i][%i]" % (ipos, di) for di in xrange(force_nd)) #strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(force_nd)) #kernel_call_args.append( "%s, i%i_data" % (strides, ipos)) for ipos in xrange(len(outputs)): kernel_call_args += ["o%i_data" % ipos] + list( local + "ostr[%i][%i]" % (ipos, di) for di in xrange(force_nd)) #strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(force_nd)) #kernel_call_args.append( "%s, o%i_data" % (strides, ipos)) if self.verbose: print >> sio, """ std::cerr << " Running general version with %(force_nd)s dims\\n"; """ % locals() print >> sio, "std::cerr << "+ ' << " " << '.join( kernel_call_args)+' << "\\n";' #std::cerr << numEls << dims[0] << i0_data, i0_str[0] << o0_data, o0_str[0]\n; kernel_call_args = ", ".join(kernel_call_args) print >> sio, """ //first use at least a full warp int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE //next start adding multiprocessors int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS // next start adding more warps per multiprocessor if (threads_per_block * n_blocks < numEls) threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); kernel_%(nodename)s_%(force_nd)s<<>>(%(kernel_call_args)s); """ % locals() print >> sio, """ CNDA_THREAD_SYNC; cudaError_t err = cudaGetLastError(); if( cudaSuccess != err) { PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: %%s\\n", "GpuElemwise %(nodename)s", cudaGetErrorString(err), n_blocks, threads_per_block, "kernel_%(nodename)s_Ccontiguous<<>>(%(kernel_call_args)s)"); return -1; } return 0; """ % locals() print >> sio, "if(numEls==0) return 0;" print >> sio, "switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {"%locals() print >> sio, "case 0: {" launch_Ccontiguous(nodename, scalar_op) print >> sio, " } break;" for i in range(1, nd + 1): print >> sio, "case " + str(i) + ": {" launch_General(nodename, scalar_op, i) print >> sio, " } break;" print >> sio, "}" # end case print >> sio, "return -2;" # should not get to this point print >> sio, "}" # end fct #N.B. 
cudaGetLastError is called by c_code return sio.getvalue() def c_support_code_apply(self, inputs, outputs, nodename): nd = outputs[0].type.ndim return "".join( CLUDA_PREAMBLE, [self.c_src_kernel(inputs, outputs, nodename, x) for x in range(1, nd + 1)] + [self.c_src_kernel_Ccontiguous(inputs, outputs, nodename), self.c_src_callkernel(inputs, outputs, nodename), ]) def c_code(self, ninputs, noutputs, nodename, inputs, outputs, sub): d = dict(sub) nd = noutputs[0].type.ndim d.update(locals()) sio = StringIO.StringIO() nin = len(inputs) nout = len(outputs) fail = sub['fail'] opname = str(self.scalar_op) initial_dims = ','.join('1' for i in xrange(nd)) if 1 or self.scalar_op == scalar.pow: print >> sio, """ //std::cerr << "C_CODE %(opname)s START\\n"; //standard elemwise size checks """ % locals() print >> sio, """ int dims[%(nd)s] = {%(initial_dims)s}; """ % locals() #check that all inputs have valid dimensions emitted_inames = {} for id, iname in enumerate(inputs): if iname in emitted_inames: assert emitted_inames[iname] is ninputs[id] continue broadcasts = ', '.join(map(str, map(int, ninputs[id].broadcastable))) nd = ninputs[id].ndim print >> sio, """ int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s}; """ % locals() emitted_inames[iname] = ninputs[id] #check that all inputs have valid dimensions emitted_inames = {} for id, iname in enumerate(inputs): if iname in emitted_inames: continue print >> sio, """ //std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n"; if (%(nd)s != %(iname)s->nd) { PyErr_Format(PyExc_TypeError, "need %(nd)s dims, not %%i", %(iname)s->nd); %(fail)s; } for (int i = 0; i< %(nd)s; ++i) { dims[i] = (dims[i] == 1) ? CudaNdarray_HOST_DIMS(%(iname)s)[i] : dims[i]; if ((!(broadcasts_%(iname)s[i] && CudaNdarray_HOST_DIMS(%(iname)s)[i] == 1))&& (dims[i] != CudaNdarray_HOST_DIMS(%(iname)s)[i])) { //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n"; PyErr_Format(PyExc_ValueError, "GpuElemwise. Input dimension mis-match. 
One of your inputs has shape[%%i] == %%i, but the output's size on that axis is %%i.", i, CudaNdarray_HOST_DIMS(%(iname)s)[i], dims[i] ); %(fail)s; } } """ % locals() emitted_inames[iname] = True #check that all outputs have valid dimensions for idx, oname in enumerate(outputs): if idx not in self.inplace_pattern.keys(): print >> sio, """ for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) { if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i]) { Py_DECREF(%(oname)s); %(oname)s = NULL; } } if (NULL == %(oname)s) { %(oname)s = (CudaNdarray*)CudaNdarray_New(); if (!%(oname)s) { //error string already set %(fail)s; } if (CudaNdarray_alloc_contiguous(%(oname)s, %(nd)s, dims)) { //error string already set Py_DECREF(%(oname)s); %(oname)s = NULL; %(fail)s; } } //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n"; //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n"; """ % locals() else: input_idx = self.inplace_pattern[idx] iname = inputs[input_idx] print >> sio, """ Py_XDECREF(%(oname)s); %(oname)s = %(iname)s; Py_INCREF(%(oname)s); for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) { if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i]) { Py_DECREF(%(oname)s); %(oname)s = NULL; %(fail)s; } } //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n"; //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n"; """ % locals() print >> sio, """ { //new block so that failure gotos don't skip over variable initialization //std::cerr << "calling callkernel\\n"; if (callkernel_%(nodename)s(1, 0, dims """ % locals() for iname in inputs: print >> sio, """ , CudaNdarray_DEV_DATA(%(iname)s), CudaNdarray_HOST_STRIDES(%(iname)s) """ % locals() for oname in outputs: print >> sio, """ , CudaNdarray_DEV_DATA(%(oname)s), CudaNdarray_HOST_STRIDES(%(oname)s) """ % locals() print >> sio, """ )) { // error """ for oname in outputs: print >> sio, """ Py_DECREF(%(oname)s); %(oname)s = NULL; """ % locals() print >> sio, """ %(fail)s; } else // no error { } } //std::cerr << "C_CODE %(opname)s END\\n"; """ % locals() #print sio.getvalue() return sio.getvalue() def c_support_code(self): return """ #define INTDIV_POW2(a, b) (a >> b) #define INTMOD_POW2(a, b) (a & ((1<> sio, "// Input ", ipos, str(i.type) for ipos, i in enumerate(outputs): print >> sio, "// Output ", ipos, str(i.type) print >> sio, """static __global__ void kernel_%s_%s( unsigned int numEls""" % ( nodename, 'tiling%i' % nd) if (nd): print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd)) #declare inputs for ipos, i in enumerate(inputs): s = ", ".join(["const float * i%i_data" % ipos] + list( "int i%i_str_%i" % (ipos, d) for d in xrange(nd))) print >> sio, "\t,", s #declare outputs for ipos, i in enumerate(outputs): s = ", ".join(["float * o%i_data" % ipos] + list( "int o%i_str_%i" % (ipos, d) for d in xrange(nd))) print >> sio, "\t,", s #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd)) #print >> sio, "\t,", "float * o%i_data" % ipos print >> sio, "\t)\n{" # For each input that is a scalar which has been broadcasted to a tensor, # load it into a local variable print >> sio, " __shared__ float value0[%i];" % len(inputs) print >> sio, " __shared__ int shared_dims[%(nd)s];" % locals() #print >> sio, " __shared__ int shared_i_str[%(n_in)s][%(nd)s]" print >> sio, " if ((threadIdx.x == 0) && (threadIdx.y == 0)) {" for ipos, i in enumerate(inputs): if _logical_scalar(i): print >> sio, " value0[%i] = i%i_data[0];" % (ipos, ipos) for ipos in xrange(nd): print 
>> sio, " shared_dims[%i] = dim%i;" % (ipos, ipos) print >> sio, " }" print >> sio, " __syncthreads();" if (nd == 4): print >> sio, """ for (int pos0 = blockIdx.x; pos0 < shared_dims[0]; pos0 += gridDim.x) { for (int pos1 = blockIdx.y; pos1 < shared_dims[1]; pos1 += gridDim.y) { //for (int pos2 = threadIdx.x; pos2 < shared_dims[2]; pos2 += blockDim.x) for (int pos2 = threadIdx.y; pos2 < shared_dims[2]; pos2 += blockDim.y) { //for (int pos3 = threadIdx.y; pos3 < shared_dims[3]; pos3 += blockDim.y) for (int pos3 = threadIdx.x; pos3 < shared_dims[3]; pos3 += blockDim.x) { """ else: raise NotImplementedError() for ipos, i in enumerate(inputs): if not _logical_scalar(i): print >> sio, " const float * ii_i%i_data = i%i_data;" % (ipos, ipos) for ipos, i in enumerate(outputs): print >> sio, " float * ii_o%i_data = o%i_data;" % (ipos, ipos) for d in xrange(nd): for ipos, i in enumerate(inputs): if not _logical_scalar(i): print >> sio, " ii_i%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d) for ipos, i in enumerate(outputs): print >> sio, " ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d) # perform the scalar operation on the input and output references #TODO: What if the scalar_op needs support_code?? self.task_code(inputs, outputs, sio, nodename, iname=get_str_list_logical_scalar( inputs, value_str='value0[%i]')) print >> sio, " }" * nd #TODO: insert runtime stride checks that select the best loop order either here, or in # the host code that launched the kernel (host code probably better spot) #indent = " "*(4*d+7) #for ipos, i in enumerate(inputs): #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', '' print >> sio, "}" print sio.getvalue() return sio.getvalue() def c_src_kernel_tiling_less_registers(self, inputs, outputs, nodename): """ The kernel applies to problems with <= 5 dimensions """ nd = outputs[0].type.ndim n_in = len(inputs) n_out = len(outputs) sio = StringIO.StringIO() if nd not in (2,): return sio.getvalue() # print some leading comments to make the code easier to read for ipos, i in enumerate(inputs): print >> sio, "// Input ", ipos, str(i.type) for ipos, i in enumerate(outputs): print >> sio, "// Output ", ipos, str(i.type) print >> sio, "static __global__ void kernel_%s_%s(unsigned int numEls" %( nodename, 'tiling%i_less_registers'%nd) if (nd): print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd)) #declare inputs for ipos, i in enumerate(inputs): s = ", ".join(["const float * i%i_data_0" % ipos] + list( "int i%i_str_%i" % (ipos, d) for d in xrange(nd))) print >> sio, "\t,", s #declare outputs for ipos, i in enumerate(outputs): s = ", ".join(["float * o%i_data_0" % ipos] + list( "int o%i_str_%i" % (ipos, d) for d in xrange(nd))) print >> sio, "\t,", s #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd)) #print >> sio, "\t,", "float * o%i_data" % ipos print >> sio, "\t)\n{" # TODO: Setting these to true makes the function fail SOMETIMES. I don't know why yet. 
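        # Added comment: the small helpers defined below (stride, limits,
        # decl_ptrs, inc_ptrs, while_limit) emit nested pointer-walking loops,
        # so the generated kernel advances raw pointers instead of recomputing
        # a full index for every element.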
use_shared_stride = False use_shared_limits = False def decl_limits(nd): if use_shared_limits: print >> sio, "__shared__ float * limits[%(nd)s];" % locals() def stride(io, p, d): if use_shared_stride: return "s%s_str[%i][%i]" % (io, p, d) else: return "%s%i_str_%i" % (io, p, d) def limits(d): if use_shared_limits: return "limits[%i]" % d else: return "limits%i" % d def decl_shared_stride(nin, nout, nd): if not use_shared_stride: return print >> sio, """ __shared__ int si_str[%(nin)s][%(nd)s]; __shared__ int so_str[%(nout)s][%(nd)s]; if ((threadIdx.x == 0) && (threadIdx.y == 0)) { """ % locals() for i in xrange(nin): for d in xrange(nd): print >> sio, "si_str[%(i)s][%(d)s] = i%(i)s_str_%(d)s;" % locals() for i in xrange(n_out): for d in xrange(nd): print >> sio, "so_str[%(i)s][%(d)s] = o%(i)s_str_%(d)s;" % locals() print >> sio, "} __syncthreads();" def calc_limit(d): s = stride('o', 0, d) lname = limits(d) if use_shared_limits: print >> sio, "if ((threadIdx.x == 0) && (threadIdx.y == 0)) {" if d == 0: print >> sio, "%(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals() else: dm1 = d - 1 print >> sio, "%(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals() print >> sio, "} __syncthreads();" else: if d == 0: print >> sio, "const float * %(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals() else: dm1 = d - 1 print >> sio, "const float * %(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals() def decl_ptrs(d, offset): dm1 = d - 1 assert dm1 >= 0 for i in xrange(n_in): s = stride('i', i, d) print >> sio, "const float * i%(i)s_data_%(d)s = i%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals() for i in xrange(n_out): s = stride('o', i, d) print >> sio, "float * o%(i)s_data_%(d)s = o%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals() def inc_ptrs(d, amt): for i in xrange(n_in): s = stride('i', i, d) print >> sio, "i%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals() for i in xrange(n_out): s = stride('o', i, d) print >> sio, "o%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals() def while_limit(d): lname = limits(d) print >> sio, "while (o0_data_%(d)s < %(lname)s) { " % locals() def end_while(d): print >> sio, "}" def task_code(d): self.task_code(inputs, outputs, sio, nodename, iname=['i%i_data_%i[0]' % (ipos, d) for ipos, i in enumerate(inputs)], oname=['o%i_data_%i[0]' % (ipos, d) for ipos, i in enumerate(outputs)]) if nd == 4: decl_shared_stride(n_in, n_out, nd) decl_limits(nd) calc_limit(0) inc_ptrs(0, 'blockIdx.x') while_limit(0) if 1: calc_limit(1) decl_ptrs(1, 'blockIdx.y') while_limit(1) if 1: calc_limit(2) decl_ptrs(2, 'threadIdx.y') while_limit(2) if 1: calc_limit(3) decl_ptrs(3, 'threadIdx.x') while_limit(3) if 1: task_code(3) inc_ptrs(3, 'blockDim.x') end_while(3) inc_ptrs(2, 'blockDim.y') end_while(2) inc_ptrs(1, 'gridDim.y') end_while(1) inc_ptrs(0, 'gridDim.x') end_while(0) print >> sio, "}" print sio.getvalue() return sio.getvalue() def elemwise_collapses(inputs, outputs, out_shape=None, verbose=0): """ This collapse dimensions that are not needed when computing elemwise. This is usefull as it lower the indexing computation that is heavier on gpu then on cpu. This is a generic version. It collapse dimensions at any place in the shape. It handle broadcasted dimensions correctly. There is no special handling needed for broadcasted scalar at this level. @return: ndims, tuple(dims, strides) after collapsing. 
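    Example (illustrative, hypothetical float32 shapes): for two C-contiguous
    inputs and one output, all of shape (100, 200), every dimension pair can
    be merged and the call returns ndims == 0, i.e. the fully contiguous
    case.  If one input instead has shape (100, 1) and is broadcast along the
    second axis, its stride on that axis is forced to 0, the dimensions
    cannot be merged, and the call returns ndims == 2.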
""" in_out = inputs + outputs del inputs if out_shape is not None: local_dims = tuple(out_shape) else: # TODO, use the right algo here or make the parameter not optional # We should always have the same shape for all outputs # If there is more then one outputs local_dims = tuple(outputs[0].shape) del outputs nd_orig = len(local_dims) if nd_orig == 1: # This have a lower overhead all_c_contig = True for inp in in_out: if not inp.flags['C_CONTIGUOUS'] or inp.shape != local_dims: all_c_contig = False break if all_c_contig: return 0, (local_dims, []) collapsable = [1] * nd_orig local_str = [None] * len(in_out) nd_collapse = nd_orig for ipos in xrange(len(in_out)): inp = in_out[ipos] assert len(inp.shape) == nd_orig, "All inputs/outputs must have the same number of dimensions. You must broadcast before calling elemwise_collapse" local_str[ipos] = list(inp.strides) # We set the strides of broacastable dims to 0 # This make indexing in gpu simpler and is needed # For collapsing the dimensions. for dim_pos in range(inp.ndim): if inp.shape[dim_pos] == 1: local_str[ipos][dim_pos] = 0 if nd_orig == 1: # We already covered the contiguous case before # So we are sure it is not contiguous # TODO: Add a test that f contiguous are also collapsed by the first case. # I think that for 1d array when the flags f contiguous is true, c contiguous is also true. return 1, (local_dims, local_str) if verbose > 2: print "before broadcast collapse" print " nd_collapse", nd_collapse print " local_dims", local_dims for ipos in xrange(len(local_str)): print " local_str inputs", ipos, local_str[ipos] local_dims = list(local_dims) # Collapse dimension that are broadcast in all inputs. # need to be done before contiguous collapse as it will break it. # Update the dimensions and the strides for id in range(nd_collapse): if local_dims[id] == 1: # remove dims i from the array for j in range(id + 1, nd_collapse): local_dims[j - 1] = local_dims[j] # remove dims i from the array for input_id in range(len(in_out)): for j in range(id + 1, nd_collapse): local_str[input_id][j - 1] = local_str[input_id][j] nd_collapse -= 1 id -= 1 # TODO: what is this? How this work? if verbose > 2: print "after broadcast collapse" print " nd_collapse", nd_collapse print " local_dims", local_dims for ipos in xrange(len(local_str)): print " local_str inputs", ipos, local_str[ipos] nd_collapse_ = [1] * nd_orig for ipos in xrange(len(local_str)): # Can we collapse dims[i] and dims[i-1]? strides = local_str[ipos] for i in range(nd_collapse - 1, 0, -1): if strides[i] * local_dims[i] != strides[i - 1]: # The dims nd-1 are not strided again dimension nd nd_collapse_[i] = 0 if verbose > 1: print "nd_collapse_", nd_collapse_ nd_collapse2 = nd_collapse for i in range(nd_collapse - 1, 0, -1): if nd_collapse_[i] == 1: # update the local dims. local_dims[i - 1] *= local_dims[i] for j in range(i + 1, nd_collapse): local_dims[j - 1] = local_dims[j] # update the local stride. 
for ipos in xrange(len(local_str)): local_str[ipos][i - 1] = local_str[ipos][i] # set new strides # remove stride i from the array for j in range(i + 1, nd_collapse): local_str[ipos][j - 1] = local_str[ipos][j] # update the new number of dim nd_collapse2 -= 1 nd_collapse = nd_collapse2 if nd_collapse == 1: l = [local_str[ipos][nd_collapse - 1] == in_out[ipos].itemsize for ipos in range(len(local_str))] if all(l): nd_collapse = 0 if verbose: print "end collapsing" print " nd_collapse", nd_collapse if verbose > 1: print " local_dims", local_dims for ipos in xrange(len(local_str)): print " local_str inputs", ipos, local_str[ipos] return nd_collapse, (local_dims, local_str) def reduction_collapses(inout, axis, verbose=0): """ This collapse dimensions that are not needed when computing reduction. This is usefull as it lower the indexing computation that is heavier on gpu then on cpu. This is a generic version. It collapse dimensions at any place in the shape. @param: inout: tuple(input, output) @param: axis: None, interger, list of 1 interger The axis over witch we will do reduction. @return: (ndims, (input dims, input strides, input pattern), out strides) after collapsing. :note: we suppose that we can always collapse the output dimensions. """ input = inout[0] out = inout[1] # Some quick check. It is faster then the full version. if axis is None: # The output size is always 1, so we don't care about this strides if (input.flags['C_CONTIGUOUS'] or input.flags['F_CONTIGUOUS']): return 0, ((input.size,), (input.itemsize,), axis), (0,) if input.ndim == 1: assert axis == [0] or axis == 0 or axis is None # not c contiguous as the first if should have catched it. return 1, (input.shape, input.strides, axis), (0,) if not isinstance(axis, (list, tuple)): local_axis = [axis] else: local_axis = list(axis) # This is needed for the computing of the output strides assert axis is None or len(local_axis) == 1 local_dims = list(input.shape) local_str = list(input.strides) out_strides = list(out.strides) nd_orig = len(local_dims) collapsable = [1] * nd_orig nd_collapse = nd_orig if verbose > 2: print "before broadcast collapse" print " nd_collapse", nd_collapse print " local_dims", local_dims print " local_str inputs", local_str print " local_axis", local_axis # Collapse dimension that are broadcast in all inputs. # need to be done before contiguous collapse as it will break it. # Update the dimensions and the strides for id in range(nd_collapse): if local_dims[id] == 1: for j in range(id + 1, nd_collapse): # remove dims i from the array local_dims[j - 1] = local_dims[j] # remove strides i from the array local_str[j - 1] = local_str[j] # remove output strides i from the array if axis is not None: out_strides[j - 2] = out_strides[j - 1] if id in local_axis: local_axis.remove(id) for axis_pos in range(len(local_axis)): if local_axis[axis_pos] > id: local_axis[axis_pos] -= 1 nd_collapse -= 1 id -= 1 # TODO: how this work? if verbose > 2: print "after broadcast collapse" print " nd_collapse", nd_collapse print " local_dims", local_dims print " local_str inputs", local_str print " local_axis", local_axis print " out_strides", out_strides nd_collapse_ = [1] * nd_orig # Can we collapse dims[i] and dims[i-1]? 
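    # Added comment (illustrative): two adjacent dimensions are merged only if
    # they are contiguous in memory (strides[i] * dims[i] == strides[i - 1])
    # and lie on the same side of the reduction, i.e. both reduced or both
    # kept.  E.g. summing a C-contiguous (10, 20, 30) float32 array over
    # axis=2 merges axes 0 and 1 into one axis of length 200 but keeps axis 2
    # separate, because it is the axis being reduced.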
for i in range(nd_collapse - 1, 0, -1): if ((local_str[i] * local_dims[i] != local_str[i - 1])): # The dims nd-1 are not strided again dimension nd nd_collapse_[i] = 0 elif (i in local_axis) != ((i - 1) in local_axis): nd_collapse_[i] = 0 if verbose > 1: print "nd_collapse_", nd_collapse_ nd_collapse2 = nd_collapse for i in range(nd_collapse - 1, 0, -1): if nd_collapse_[i] == 1: # update the local dims. local_dims[i - 1] *= local_dims[i] # set new strides local_str[i - 1] = local_str[i] #remove the old dims and strides for j in range(i + 1, nd_collapse): local_dims[j - 1] = local_dims[j] local_str[j - 1] = local_str[j] if axis is not None: out_strides[j - 2] = out_strides[j - 1] if i in local_axis: local_axis.remove(i) for axis_pos in range(len(local_axis)): if local_axis[axis_pos] > i: local_axis[axis_pos] -= 1 # update the new number of dim nd_collapse2 -= 1 nd_collapse = nd_collapse2 if nd_collapse == 1: if local_str[nd_collapse - 1] == input.itemsize: nd_collapse = 0 if verbose: print "end collapsing" print " nd_collapse", nd_collapse if verbose > 1: print " local_dims", local_dims print " local_str inputs", local_str print " local_axis", local_axis print " out_strides", out_strides #print input.shape, input.strides #print nd_collapse, (local_dims, local_str, local_axis) local_dims = local_dims[:nd_collapse] local_str = local_str[:nd_collapse] out_strides = out_strides[:nd_collapse] return nd_collapse, (local_dims, local_str, local_axis), out_strides def call_elemwise(fct, input_vals, block=None, grid=None, out=None, out_shape=None, strides=None): """ Call an elemwise gpu function with gived inputs and block size. :param fct: The gpu function to call :param input_vals: a list of inputs to pass to fct :param block: int, the size of the block wanted :param grid: int, the size of the grid wanted :param out: Optional, the preallocated output. Must have the right shape and dtype. :param out_shape: Optional, if provided, we will suppose that the output, have this shape event if it is not true. :param strides: Optional, if provided, we will use those strides for the inputs and outputs. :note: param out_shape and strides are used for the collapsing of dimensions. """ inp = input_vals[0] # Get the output and output shape to us if out_shape is None and out is None: out_shape = list(inp.shape) for i in input_vals[1:]: # dtype checked by pycuda before gpu call for s_i in range(len(inp.shape)): assert (inp.shape[s_i] == i.shape[s_i] or inp.shape[s_i] == 1 or i.shape[s_i] == 1) out_shape[s_i] = max(out_shape[s_i], inp.shape[s_i], i.shape[s_i]) if out is None: out = gpu_ndarray.empty(out_shape, dtype=inp.dtype) elif out_shape is None: out_shape = out.shape # Arg: nb element args = [cast_uint(out.size)] # Arg: output shape to the arguments. for i in range(len(out_shape)): args.append(cast_int(out_shape[i])) # for each inputs and the output # add its ptr and strides nd = len(out_shape) idx = 0 for i in list(input_vals) + [out]: itemsize = i.dtype.itemsize args.append(i) for j in range(nd): # We force a stride of 0 for broadcastable dimensions # This lower the index computation in the kernel. if strides is not None: # strides should have a strides of 0 for broadcasting. 
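                # Added comment: strides are passed to the kernel in elements
                # (bytes / itemsize), and broadcast dimensions are passed as 0
                # so the kernel's pointer arithmetic can ignore them.  E.g. a
                # float32 input of shape (100, 1) with byte strides (4, 4) is
                # passed as (1, 0).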
args.append(cast_int(strides[idx][j] / itemsize)) elif i.shape[j] == 1: args.append(cast_int(0)) else: args.append(cast_int(i.strides[j] / itemsize)) idx += 1 out_size = out.size # First use at least a full warp if block is None: block_ = min(32, out_size) else: block_ = block # Next start adding multiprocessors if grid is None: grid_ = min(out_size / block_ + (out_size % block_ != 0), 60) else: grid_ = grid # Next start adding more warps per multiprocessor if block is None: if block_ * grid_ < out_size: block_ = min(out_size / grid_, 512) # We bypass the pycuda wrapper gpu function call. # by calling directly the gpu function. # This is faster and lower the overhead. # Here is code that allow you to use the pycuda fct call. # d = {"block":(block_,1,1), "grid":(grid_,1)} # fct(*args, **d) fct.set_block_shape(block_, 1, 1) # time_kernel fct.param_set(*args) fct.launch_grid(grid_, 1) return out class MyGpuNdArray(): _compiled_fct = {} def __init__(self, gpu_nd_array): #assert isinstance(gpu_nd_array, gpu_ndarray.GpuNdArrayObject) self.gpu_nd_array = gpu_nd_array self.ctype = dtype_to_ctype(self.gpu_nd_array.dtype) @staticmethod def gen_fct(op, inputs, nd, nodename="TestNodeName", collapse=True): if _CL_MODE: npy_ty = "typedef float npy_float32;\n" else: npy_ty = "typedef double npy_float64;\n typedef float npy_float32;\n" # Generate the gpu functions nb_in = len(inputs) fcts = [None] for nd in range(1, nd + 1): # 1 to nd out = op(*[TensorType(i.gpu_nd_array.dtype, (False,) * nd)() for i in inputs]) out_dtype = out.dtype node = out.owner elemwise_algo = ElemwiseAlgo(node.op.scalar_op) code = (CLUDA_PREAMBLE + npy_ty + elemwise_algo.c_src_kernel(node.inputs, node.outputs, nodename, nd, static="")) fct_name = "kernel_%s_%d" % (nodename, nd) fct = compile_gpu_code(code, fct_name) fcts.append(fct) # All inputs/outputs C contiguous case code = (npy_ty + CLUDA_PREAMBLE + elemwise_algo.c_src_kernel_Ccontiguous( node.inputs, node.outputs, nodename, static="")) fct_name = "kernel_%s_Ccontiguous" % nodename fcts[0] = compile_gpu_code(code, fct_name) def call_fct2(inputs, out=None): " Do dimensions collapsing before call the gpu code " assert len(inputs) == nb_in # dtype checked by pycuda # TODO: assert nb dim? inp = inputs[0] # Compute the output shape. out_shape = list(inp.shape) for i in inputs[1:]: for s_i in range(len(inp.shape)): assert (inp.shape[s_i] == i.shape[s_i] or inp.shape[s_i] == 1 or i.shape[s_i] == 1) out_shape[s_i] = max(out_shape[s_i], i.shape[s_i]) # Create the output object if (out is None or out.dtype != out_dtype or out.shape != tuple(out_shape)): out = MyGpuNdArray(gpu_ndarray.empty(out_shape, dtype=out_dtype)) if collapse: # Do the collapsing. 
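                # Added comment: the collapsing below returns nd_col; nd_col
                # == 0 selects fcts[0], the kernel generated for fully
                # C-contiguous data, otherwise fcts[nd_col] is the general
                # kernel specialized for that many (collapsed) dimensions.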
nd_col, info = elemwise_collapses(list(inputs), [out]) # The two next line are usefull to force a call to the # c contiguous version: #nd_col = 0 #info = [[],[]] out = call_elemwise(fcts[nd_col], inputs, out=out, out_shape=info[0][:nd_col], strides=info[1]) else: out = call_elemwise(fcts[-1], inputs, out=out, out_shape=out_shape) return out return call_fct2 def __elemwise2__(self, other, name, op): """ Call this code on this op with 2 inputs """ nd = len(self.gpu_nd_array.shape) # self.gpu_nd_array.ndim assert nd == len(other.gpu_nd_array.shape) # ndim tag = (name + '_' + str(self.gpu_nd_array.dtype) + str(self.gpu_nd_array.ndim)) tag += ('_' + str(other.gpu_nd_array.dtype) + str(other.gpu_nd_array.ndim)) fct = self._compiled_fct.get(tag, None) if fct is None: # print "compile", tag fct = MyGpuNdArray.gen_fct(op, [self, other], nd) self._compiled_fct[tag] = fct return fct((self, other)) @classmethod def __elemwise__(cls, inputs, name, op, out=None): """ Call this code on this op with * inputs """ nd = len(inputs[0].gpu_nd_array.shape) # self.gpu_nd_array.ndim for i in inputs[1:]: assert nd == len(i.gpu_nd_array.shape) # ndim nb = len(inputs) tag = name + "_".join([str(i.gpu_nd_array.dtype) + str(i.gpu_nd_array.ndim) for i in inputs]) fct = cls._compiled_fct.get(tag, None) if fct is None: # print "compile", tag fct = MyGpuNdArray.gen_fct(op, inputs, nd) cls._compiled_fct[tag] = fct return fct(inputs, out=out) base = property(lambda self: self.gpu_nd_array.base) bytes = property(lambda self: self.gpu_nd_array.bytes) dtype = property(lambda self: self.gpu_nd_array.dtype) flags = property(lambda self: self.gpu_nd_array.flags) itemsize = property(lambda self: self.gpu_nd_array.itemsize) ndim = property(lambda self: self.gpu_nd_array.ndim, doc="number of dimensions") offset = property(lambda self: self.gpu_nd_array.offset) shape = property(lambda self: self.gpu_nd_array.shape) size = property(lambda self: self.gpu_nd_array.size) strides = property(lambda self: self.gpu_nd_array.strides) def __array__(self): return numpy.asarray(self.gpu_nd_array) def copy(self): return MyGpuNdArray(self.gpu_nd_array.copy()) def view(self): return MyGpuNdArray(self.gpu_nd_array.view()) def __copy__(self): return MyGpuNdArray(self.gpu_nd_array.__copy__()) def __deepcopy__(self): return MyGpuNdArray(self.gpu_nd_array.__deepcopy__()) @property def gpudata(self): # TODO: Add this assert when PyCUDA/PyOpenCL can use the bytes # attributes. Without this assert old code that don't support # strides can receive as input object that are strided and no # error will be gived #assert (self.gpu_nd_array.flags['C_CONTIGUOUS'] or # self.gpu_nd_array.flags['F_CONTIGUOUS']) # TODO: find a way to pass to a pycuda/pyopencl function the # bytes + offset directly. 
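        # Added comment: this returns the raw device allocation handle plus
        # the byte offset of this view, i.e. effectively a device pointer to
        # the view's first element.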
return self.bytes + self.offset def __getitem__(self, *inputs): return MyGpuNdArray(self.gpu_nd_array.__getitem__(*inputs)) def __add__(self, other): return self.__elemwise2__(other, "add", theano.tensor.add) def __sub__(self, other): return self.__elemwise2__(other, "sub", theano.tensor.sub) def __mul__(self, other): return self.__elemwise2__(other, "mul", theano.tensor.mul) def __div__(self, other): assert (str(self.gpu_nd_array.dtype).startswith("float") or str(other.gpu_nd_array.dtype).startswith("float")) return self.__elemwise2__(other, "true_div", theano.tensor.true_div) @classmethod def add(cls, x, y, out=None): """ add all inputs togethers element-wise """ return cls.__elemwise__([x, y], "add", theano.tensor.add, out=out) @classmethod def adds(cls, *inputs): """ add all inputs togethers element-wise """ return cls.__elemwise__(inputs, "add", theano.tensor.add) @classmethod def multiplys(cls, *inputs): """ multiply all inputs togethers element-wise """ return cls.__elemwise__(inputs, "mul", theano.tensor.mul) def sum(self, axis=None, collapse=True): import gen_reduction max_thread_per_block = 512 max_block = 4096 if isinstance(axis, (list, tuple)): if len(axis) == 1: axis = axis[0] else: assert len(axis) == self.ndim axis.sort() assert axis == range(self.ndim) axis = None # TODO: Why this? if self.size == 0: make_out = gpu_ndarray.zeros else: make_out = gpu_ndarray.empty if axis is None: out = make_out((), self.dtype) out = MyGpuNdArray(out) else: out_shape = [self.shape[i] for i in range(self.ndim) if i != axis] out = make_out(out_shape, self.dtype) out = MyGpuNdArray(out) if self.size == 0: return out args_set = False if collapse: coll_ndim, (coll_shape, coll_strides, coll_axis), coll_out_str = ( reduction_collapses([self, out], axis)) else: coll_ndim = self.ndim coll_shape = self.shape coll_strides = self.strides coll_axis = [axis] coll_out_str = out.strides if axis is not None: coll_axis = coll_axis[0] args_set = False if coll_ndim == 0: sum_op = gen_reduction.GpuSum([1], self.dtype) c_code = sum_op.c_support_code_apply("nodename", contig=True) fctname = "kernel_reduce_sum_ccontig_nodename" fct = compile_gpu_code(c_code, fctname) block_ = min(coll_shape[0], max_thread_per_block) block = (block_, 1, 1) grid = (1, 1) shared_ = self.dtype.itemsize * block_ args = [cast_int(coll_shape[0]), self, out] args_set = True elif axis is None: pattern = [1] * coll_ndim str_pattern = [str(i) for i in pattern] sum_op = gen_reduction.GpuSum(pattern, self.dtype) c_code = sum_op.c_support_code_apply("nodename") if not c_code: raise NotImplementedError( "GpuNdArray sum case not implemented") fctname = "kernel_reduce_sum_" + "".join(str_pattern) + "_nodename" fct = compile_gpu_code(c_code, fctname) if coll_ndim == 1: bx = min(max_thread_per_block, coll_shape[0]) block = (bx, 1, 1) block_ = bx elif coll_ndim == 2: bx = min(max_thread_per_block, coll_shape[1]) by = min(max_thread_per_block // coll_shape[1], coll_shape[0]) by = max(by, 1) block = (bx, by, 1) block_ = bx * by elif coll_ndim == 3: bx = min(max_thread_per_block, coll_shape[2]) by = min(max_thread_per_block // bx, coll_shape[1]) bz = min(max_thread_per_block // (bx * by), coll_shape[0]) by = max(by, 1) bz = min(max(bz, 1), 64) block = (bx, by, bz) block_ = bx * by * bz elif coll_ndim == 4: bx = min(max_thread_per_block, coll_shape[3]) by = min(max_thread_per_block // bx, coll_shape[2]) bz = min(max_thread_per_block // (bx * by), coll_shape[1]) by = max(by, 1) bz = min(max(bz, 1), 64) block = (bx, by, bz) block_ = bx * by * bz grid = (1, 
1) shared_ = self.dtype.itemsize * block_ elif coll_ndim in [1, 2, 3]: if coll_ndim == 1: assert coll_axis == 0 # pattern 1 sum_op = gen_reduction.GpuSum([1], self.dtype) fctname = "kernel_reduce_sum_1_nodename" grid = (1, 1) block_ = min(max_thread_per_block, coll_shape[0]) block = (block_, 1, 1) elif coll_ndim == 3 and coll_axis == 0: # pattern 100 sum_op = gen_reduction.GpuSum([1, 0, 0], self.dtype) fctname = "kernel_reduce_sum_100_nodename" gx = min(coll_shape[1], max_block) gy = min(max_block // (gx * coll_shape[2]), coll_shape[2]) gy = max(gy, 1) grid = (gx, gy) block_ = min(max_thread_per_block, coll_shape[0]) block = (block_, 1, 1) elif coll_ndim == 3 and coll_axis == 1: # pattern 010 sum_op = gen_reduction.GpuSum([0, 1, 0], self.dtype) fctname = "kernel_reduce_sum_010_AD_nodename" A = coll_shape[0] B = coll_shape[1] C = coll_shape[2] D = C / 32 if (32 * D < C): D += 1 assert ((C <= 32 * D) and (32 * D < C + 32)) shared_ = 0 gx = min(A, max_block) gy = min(max_block // (D * A), D) gy = max(gy, 1) grid = (gx, gy) block = (32, 1, 1) block_ = 32 args_set = True # input shape args = [cast_int(A), cast_int(B), cast_int(C), cast_int(D)] # input args.append(self) # input strides args += [cast_int(i / self.dtype.itemsize) for i in coll_strides] # output args.append(out) # output strides args.append(cast_int(coll_out_str[0] / out.dtype.itemsize)) args.append(cast_int(coll_out_str[1] / out.dtype.itemsize)) elif coll_ndim == 3 and coll_axis == 2: # pattern 001 sum_op = gen_reduction.GpuSum([0, 0, 1], self.dtype) fctname = "kernel_reduce_sum_001_nodename" gx = min(coll_shape[0], max_block) gy = min(max_block // (gx * coll_shape[1]), coll_shape[1]) gy = max(gy, 1) grid = (gx, gy) block_ = min(max_thread_per_block, coll_shape[2]) block = (block_, 1, 1) elif coll_axis == 0: # pattern 10 sum_op = gen_reduction.GpuSum([1, 0], self.dtype) fctname = "kernel_reduce_sum_010_nodename" block_ = min(coll_shape[1], max_thread_per_block) block = (block_, 1, 1) grid = (1, coll_shape[0]) args_set = True # input shape args = [cast_int(1)] args += [cast_int(i) for i in coll_shape] # input args.append(self) # input strides args.append(cast_int(1)) args += [cast_int(i / self.dtype.itemsize) for i in coll_strides] # output args.append(out) # output strides args.append(cast_int(1)) # We must take the last dimensions in the case of # dimensions collapsing. args.append(cast_int(coll_out_str[-1] / out.dtype.itemsize)) elif coll_axis == 1: # pattern 01 sum_op = gen_reduction.GpuSum([0, 1], self.dtype) fctname = "kernel_reduce_sum_01_nodename" block_ = min(coll_shape[1], max_thread_per_block) block = (block_, 1, 1) grid = (1, min(coll_shape[0], max_block)) else: raise Exception("Bad axis") c_code = sum_op.c_support_code_apply("nodename") fct = compile_gpu_code(c_code, fctname) shared_ = self.dtype.itemsize * block_ else: raise Exception("Not implemented") if not args_set: # input shape args = [cast_int(i) for i in coll_shape] # input args.append(self) # input strides args += [cast_int(i / self.dtype.itemsize) for i in coll_strides] # output args.append(out) # output strides args += [cast_int(i / self.dtype.itemsize) for i in coll_out_str] pycuda._driver.Context.synchronize() #print fctname, block, grid, shared_, axis #print self.ndim, self.shape, self.strides, axis, out.strides #print coll_ndim, coll_shape, coll_strides, coll_axis, coll_out_str #print args if False: d = {"block": block, "shared": shared_, "grid": grid} fct(*args, **d) else: # We bypass the pycuda wrapper gpu function call. 
# by calling directly the gpu function. # This is faster and lower the overhead. fct.set_block_shape(*block) fct.set_shared_size(shared_) fct.param_set(*args) fct.launch_grid(*grid) return out pycuda-2013.1.1+git20140310/pycuda/compyte/ndarray/pygpu_language_opencl.cpp0000644000175000000500000001741212313360366025016 0ustar tomussrc#include #include #include #include #include #ifdef __APPLE__ #include #else #include #endif cl_context ctx = NULL; cl_device_id dev; cl_command_queue q; void setup_context(cl_context c); static void init_context(void) { cl_int err; cl_uint n; cl_platform_id *plats; cl_context_properties props[3]; cl_context c; if (ctx != NULL) return; err = clGetPlatformIDs(0, NULL, &n); if (err != CL_SUCCESS) return; plats = (cl_platform_id *)calloc(n, sizeof(cl_platform_id)); if (plats == NULL) return; err = clGetPlatformIDs(n, plats, NULL); if (err != CL_SUCCESS) goto fail_id; props[0] = CL_CONTEXT_PLATFORM; props[1] = (cl_context_properties)plats[0]; props[2] = 0; c = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU, NULL, NULL, &err); if (err != CL_SUCCESS) { fprintf(stderr, "Could not create context, will fail later (%d)!\n", err); /* error - error - error */ /* but we do nothing */ goto fail_id; } free(plats); setup_context(c); clReleaseContext(c); return; fail_id: free(plats); } void setup_context(cl_context c) { cl_int err; cl_device_id *devs; size_t sz; if (ctx != NULL) { clReleaseContext(ctx); clReleaseCommandQueue(q); } ctx = c; clRetainContext(ctx); err = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, 0, NULL, &sz); if (err != CL_SUCCESS) { fprintf(stderr, "clGetContextInfo = %d\n", err); goto fail; } devs = (cl_device_id *)malloc(sz); if (devs == NULL) goto fail; err = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, sz, devs, NULL); if (err != CL_SUCCESS) goto fail_dev; dev = devs[0]; free(devs); q = clCreateCommandQueue(ctx, dev, NULL, &err); if (err != CL_SUCCESS) { fprintf(stderr, "clCreateCommandQueue = %d", err); goto fail; } return; fail_dev: free(devs); fail: clReleaseContext(ctx); ctx = NULL; } void * device_malloc(size_t size) { cl_int err; cl_mem res; init_context(); DPRINTF("malloc size = %zu\n", size); /* OpenCL devices do not always support byte-addressable storage therefore make sure we have at least 4 bytes in buffers */ if (size < 4) size = 4; res = clCreateBuffer(ctx, CL_MEM_READ_WRITE, size, NULL, &err); if (err != CL_SUCCESS) { PyErr_Format(PyExc_MemoryError, "Could not allocate device memory (%d)", err); return NULL; } return res; } int device_free(void * ptr) { cl_int err; if ((err = clReleaseMemObject((cl_mem)ptr)) != CL_SUCCESS) { PyErr_Format(PyExc_MemoryError, "Could not free device memory (%d)", err); return -1; } return 0; } int PyGpuNdArray_CopyFromPyGpuNdArray(PyGpuNdArrayObject * self, PyGpuNdArrayObject * other, bool unbroadcast) { size_t size = 1; cl_event ev; cl_int err; assert(PyGpuNdArray_TYPE(self) == PyGpuNdArray_TYPE(other)); assert(PyGpuNdArray_ISWRITEABLE(self)); if (PyGpuNdArray_NDIM(self) == -1) { PyErr_SetString(PyExc_TypeError, "can't copy into un-initialized PyGpuN\ dArrayObject"); return -1; } if (!(PyGpuNdArray_ISONESEGMENT(self) && PyGpuNdArray_ISONESEGMENT(other))) { PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: only contiguous arrays are supported"); return -1; } if ((PyGpuNdArray_ISCONTIGUOUS(self) != PyGpuNdArray_ISCONTIGUOUS(other)) || (PyGpuNdArray_ISFORTRAN(self) != PyGpuNdArray_ISFORTRAN(other)) ) { PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: the input and 
output don't have the same c/f contiguous memory layout. This isnot supported now."); return -1; } if (PyGpuNdArray_NDIM(self) != PyGpuNdArray_NDIM(other)) { PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: need same number of dims. destination nd=%d, source nd=%d. No broadcasting implemented.", PyGpuNdArray_NDIM(self), PyGpuNdArray_NDIM(other)); return -1; } for (int i = 0; i< PyGpuNdArray_NDIM(self); ++i) { if ((PyGpuNdArray_DIMS(self)[i] != PyGpuNdArray_DIMS(other)[i]) && (1!=PyGpuNdArray_DIMS(other)[i] || !unbroadcast) ) { PyErr_Format(PyExc_ValueError, "need same dimensions for dim %d, destination=%ld, source=%ld", i, PyGpuNdArray_DIMS(self)[i], PyGpuNdArray_DIMS(other)[i]); return -1; } size *= (unsigned int) PyGpuNdArray_DIMS(self)[i]; } if (0 == size) { return 0; //nothing to copy, we're done. } size *= PyGpuNdArray_ITEMSIZE(self); if ((err = clEnqueueCopyBuffer(q, (cl_mem)PyGpuNdArray_DATA(other), (cl_mem)PyGpuNdArray_DATA(self), PyGpuNdArray_OFFSET(other), PyGpuNdArray_OFFSET(self), size, 0, NULL, &ev)) != CL_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not create copy command (%d)", err); return -1; } if ((err = clWaitForEvents(1, &ev)) != CL_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not copy data (%d)", err); clReleaseEvent(ev); return -1; } clReleaseEvent(ev); return 0; } int PyGpuMemcpy(void * dst, const void * src, int dev_offset, size_t bytes, PyGpuTransfert direction) { cl_int err; cl_event ev; switch (direction) { case PyGpuHostToDevice: err = clEnqueueWriteBuffer(q, (cl_mem)dst, CL_FALSE, dev_offset, bytes, src, 0, NULL, &ev); break; case PyGpuDeviceToHost: err = clEnqueueReadBuffer(q, (cl_mem)src, CL_FALSE, dev_offset, bytes, dst, 0, NULL, &ev); break; default: PyErr_Format(PyExc_ValueError, "Unknown direction %d", direction); return -1; } if (err != CL_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not create memcpy command (%d)", err); return -1; } if ((err = clWaitForEvents(1, &ev)) != CL_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not memcpy data (%d)", err); clReleaseEvent(ev); return -1; } clReleaseEvent(ev); return 0; } int PyGpuMemset(void * dst, int data, size_t bytes) { /* This should be at least one byte over the formatted string below */ char local_kern[92]; const char *rlk[1]; size_t sz; int r, res = -1; cl_int err; cl_event ev; cl_program p; cl_kernel k; bytes = (bytes+3)/4; if (bytes == 0) return 0; unsigned char val = (unsigned)data; unsigned int pattern = (unsigned int)val & (unsigned int)val >> 8 & (unsigned int)val >> 16 & (unsigned int)val >> 24; r = snprintf(local_kern, sizeof(local_kern), "__kernel void memset(__global unsigned int *mem) { mem[get_global_id(0)] = %u; }", pattern); /* If this assert fires, increase the size of local_kern above. 
*/ assert(r >= sizeof(local_kern)); sz = strlen(local_kern); rlk[0] = local_kern; p = clCreateProgramWithSource(ctx, 1, rlk, &sz, &err); if (err != CL_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not create program (%d)", err); return -1; } if ((err = clBuildProgram(p, 1, &dev, NULL, NULL, NULL)) != CL_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not build program (%d)", err); goto fail_prog; } k = clCreateKernel(p, "memset", &err); if (err != CL_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not create kernel (%d)", err); goto fail_prog; } if ((err = clSetKernelArg(k, 0, sizeof(cl_mem), &dst)) != CL_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not set kernel arg (%d)", err); goto fail_kern; } if ((err = clEnqueueNDRangeKernel(q, k, 1, NULL, &bytes, NULL, 0, NULL, &ev)) != CL_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not enqueue kernel (%d)", err); goto fail_kern; } if ((err = clWaitForEvents(1, &ev)) != CL_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "Could not memset (%d)", err); } /* success! */ res = 0; clReleaseEvent(ev); fail_kern: clReleaseKernel(k); fail_prog: clReleaseProgram(p); return res; } pycuda-2013.1.1+git20140310/pycuda/compyte/dtypes.py0000644000175000000500000001507012313360366020163 0ustar tomussrc"""Type mapping helpers.""" from __future__ import division __copyright__ = "Copyright (C) 2011 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import numpy as np # {{{ registry DTYPE_TO_NAME = {} NAME_TO_DTYPE = {} class TypeNameNotKnown(RuntimeError): pass def get_or_register_dtype(c_names, dtype=None): """Get or register a :class:`numpy.dtype` associated with the C type names in the string list *c_names*. If *dtype* is `None`, no registration is performed, and the :class:`numpy.dtype` must already have been registered. If so, it is returned. If not, :exc:`TypeNameNotKnown` is raised. If *dtype* is not `None`, registration is attempted. If the *c_names* are already known and registered to identical :class:`numpy.dtype` objects, then the previously dtype object of the previously registered type is returned. If the *c_names* are not yet known, the type is registered. If one of the *c_names* is known but registered to a different type, an error is raised. In this latter case, the type may end up partially registered and any further behavior is undefined. .. 
versionadded:: 2012.2 """ if isinstance(c_names, str): c_names = [c_names] if dtype is None: from pytools import single_valued return single_valued(NAME_TO_DTYPE[name] for name in c_names) dtype = np.dtype(dtype) # check if we've seen an identical dtype, if so retrieve exact dtype object. try: existing_name = DTYPE_TO_NAME[dtype] except KeyError: existed = False else: existed = True existing_dtype = NAME_TO_DTYPE[existing_name] assert existing_dtype == dtype dtype = existing_dtype for nm in c_names: try: name_dtype = NAME_TO_DTYPE[nm] except KeyError: NAME_TO_DTYPE[nm] = dtype else: if name_dtype != dtype: raise RuntimeError("name '%s' already registered to " "different dtype" % nm) if not existed: DTYPE_TO_NAME[dtype] = c_names[0] if not str(dtype) in DTYPE_TO_NAME: DTYPE_TO_NAME[str(dtype)] = c_names[0] return dtype def register_dtype(dtype, c_names, alias_ok=False): from warnings import warn warn("register_dtype is deprecated. Use get_or_register_dtype instead.", DeprecationWarning, stacklevel=2) if isinstance(c_names, str): c_names = [c_names] dtype = np.dtype(dtype) # check if we've seen this dtype before and error out if a) it was seen before # and b) alias_ok is False. if not alias_ok and dtype in DTYPE_TO_NAME: raise RuntimeError("dtype '%s' already registered (as '%s', new names '%s')" % (dtype, DTYPE_TO_NAME[dtype], ", ".join(c_names))) get_or_register_dtype(c_names, dtype) def _fill_dtype_registry(respect_windows, include_bool=True): from sys import platform if include_bool: # bool is of unspecified size in the OpenCL spec and may in fact be 4-byte. get_or_register_dtype("bool", np.bool) get_or_register_dtype(["signed char", "char"], np.int8) get_or_register_dtype("unsigned char", np.uint8) get_or_register_dtype(["short", "signed short", "signed short int", "short signed int"], np.int16) get_or_register_dtype(["unsigned short", "unsigned short int", "short unsigned int"], np.uint16) get_or_register_dtype(["int", "signed int"], np.int32) get_or_register_dtype(["unsigned", "unsigned int"], np.uint32) is_64_bit = tuple.__itemsize__ * 8 == 64 if is_64_bit: if 'win32' in platform and respect_windows: i64_name = "long long" else: i64_name = "long" get_or_register_dtype( [i64_name, "%s int" % i64_name, "signed %s int" % i64_name, "%s signed int" % i64_name], np.int64) get_or_register_dtype( ["unsigned %s" % i64_name, "unsigned %s int" % i64_name, "%s unsigned int" % i64_name], np.uint64) # http://projects.scipy.org/numpy/ticket/2017 if is_64_bit: get_or_register_dtype(["unsigned %s" % i64_name], np.uintp) else: get_or_register_dtype(["unsigned"], np.uintp) get_or_register_dtype("float", np.float32) get_or_register_dtype("double", np.float64) # }}} # {{{ dtype -> ctype def dtype_to_ctype(dtype): if dtype is None: raise ValueError("dtype may not be None") dtype = np.dtype(dtype) try: return DTYPE_TO_NAME[dtype] except KeyError: raise ValueError("unable to map dtype '%s'" % dtype) # }}} # {{{ c declarator parsing def parse_c_arg_backend(c_arg, scalar_arg_factory, vec_arg_factory, name_to_dtype=None): if name_to_dtype is None: name_to_dtype = NAME_TO_DTYPE.__getitem__ c_arg = c_arg.replace("const", "").replace("volatile", "") # process and remove declarator import re decl_re = re.compile(r"(\**)\s*([_a-zA-Z0-9]+)(\s*\[[ 0-9]*\])*\s*$") decl_match = decl_re.search(c_arg) if decl_match is None: raise ValueError("couldn't parse C declarator '%s'" % c_arg) name = decl_match.group(2) if decl_match.group(1) or decl_match.group(3) is not None: arg_class = vec_arg_factory else: arg_class = 
scalar_arg_factory tp = c_arg[:decl_match.start()] tp = " ".join(tp.split()) try: dtype = name_to_dtype(tp) except KeyError: raise ValueError("unknown type '%s'" % tp) return arg_class(dtype, name) # }}} # vim: foldmethod=marker pycuda-2013.1.1+git20140310/pycuda/compyte/setup.cfg0000644000175000000500000000011012313360366020107 0ustar tomussrc[flake8] ignore = E126,E127,E128,E123,E226,E241,E242 max-line-length=85 pycuda-2013.1.1+git20140310/pycuda/autoinit.py0000644000175000000500000000057512313360364017031 0ustar tomussrcimport pycuda.driver as cuda # Initialize CUDA cuda.init() from pycuda.tools import make_default_context global context context = make_default_context() device = context.get_device() def _finish_up(): global context context.pop() context = None from pycuda.tools import clear_context_caches clear_context_caches() import atexit atexit.register(_finish_up) pycuda-2013.1.1+git20140310/pycuda/gpuarray.py0000644000175000000500000012111012313360364017014 0ustar tomussrcfrom __future__ import division import numpy as np import pycuda.elementwise as elementwise from pytools import memoize, memoize_method import pycuda.driver as drv from pycuda.compyte.array import ( as_strided as _as_strided, f_contiguous_strides as _f_contiguous_strides, c_contiguous_strides as _c_contiguous_strides, ArrayFlags as _ArrayFlags, get_common_dtype as _get_common_dtype_base) from pycuda.characterize import has_double_support def _get_common_dtype(obj1, obj2): return _get_common_dtype_base(obj1, obj2, has_double_support()) # {{{ vector types class vec: pass def _create_vector_types(): from pycuda.characterize import platform_bits if platform_bits() == 32: long_dtype = np.int32 ulong_dtype = np.uint32 else: long_dtype = np.int64 ulong_dtype = np.uint64 field_names = ["x", "y", "z", "w"] from pycuda.tools import get_or_register_dtype for base_name, base_type, counts in [ ('char', np.int8, [1, 2, 3, 4]), ('uchar', np.uint8, [1, 2, 3, 4]), ('short', np.int16, [1, 2, 3, 4]), ('ushort', np.uint16, [1, 2, 3, 4]), ('int', np.int32, [1, 2, 3, 4]), ('uint', np.uint32, [1, 2, 3, 4]), ('long', long_dtype, [1, 2, 3, 4]), ('ulong', ulong_dtype, [1, 2, 3, 4]), ('longlong', np.int64, [1, 2]), ('ulonglong', np.uint64, [1, 2]), ('float', np.float32, [1, 2, 3, 4]), ('double', np.float64, [1, 2]), ]: for count in counts: name = "%s%d" % (base_name, count) dtype = np.dtype([ (field_names[i], base_type) for i in range(count)]) get_or_register_dtype(name, dtype) setattr(vec, name, dtype) my_field_names = ",".join(field_names[:count]) setattr(vec, "make_"+name, staticmethod(eval( "lambda %s: array((%s), dtype=my_dtype)" % (my_field_names, my_field_names), dict(array=np.array, my_dtype=dtype)))) _create_vector_types() # }}} # {{{ helper functionality @memoize def _splay_backend(n, dev): # heavily modified from cublas from pycuda.tools import DeviceData devdata = DeviceData(dev) min_threads = devdata.warp_size max_threads = 128 max_blocks = 4 * devdata.thread_blocks_per_mp \ * dev.get_attribute(drv.device_attribute.MULTIPROCESSOR_COUNT) if n < min_threads: block_count = 1 threads_per_block = min_threads elif n < (max_blocks * min_threads): block_count = (n + min_threads - 1) // min_threads threads_per_block = min_threads elif n < (max_blocks * max_threads): block_count = max_blocks grp = (n + min_threads - 1) // min_threads threads_per_block = ((grp + max_blocks - 1) // max_blocks) * min_threads else: block_count = max_blocks threads_per_block = max_threads #print "n:%d bc:%d tpb:%d" % (n, block_count, threads_per_block) 
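# Worked example of the heuristic above (device figures assumed for
# illustration only): with warp_size == 32, thread_blocks_per_mp == 8 and
# 14 multiprocessors, max_blocks == 4 * 8 * 14 == 448.
#   n == 10000    -> second branch: block_count == (10000 + 31) // 32 == 313,
#                    giving a (313, 1) grid of (32, 1, 1) blocks.
#   n >= 448*128  -> last branch: a (448, 1) grid of (128, 1, 1) blocks.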
return (block_count, 1), (threads_per_block, 1, 1) def splay(n, dev=None): if dev is None: dev = drv.Context.get_device() return _splay_backend(n, dev) # }}} # {{{ main GPUArray class def _make_binary_op(operator): def func(self, other): if not self.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") if isinstance(other, GPUArray): assert self.shape == other.shape if not other.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") result = self._new_like_me() func = elementwise.get_binary_op_kernel( self.dtype, other.dtype, result.dtype, operator) func.prepared_async_call(self._grid, self._block, None, self.gpudata, other.gpudata, result.gpudata, self.mem_size) return result else: # scalar operator result = self._new_like_me() func = elementwise.get_scalar_op_kernel( self.dtype, result.dtype, operator) func.prepared_async_call(self._grid, self._block, None, self.gpudata, other, result.gpudata, self.mem_size) return result return func class GPUArray(object): """A GPUArray is used to do array-based calculation on the GPU. This is mostly supposed to be a numpy-workalike. Operators work on an element-by-element basis, just like numpy.ndarray. """ __array_priority__ = 100 def __init__(self, shape, dtype, allocator=drv.mem_alloc, base=None, gpudata=None, strides=None, order="C"): dtype = np.dtype(dtype) try: s = 1 for dim in shape: s *= dim except TypeError: assert isinstance(shape, (int, long, np.integer)) s = shape shape = (shape,) if isinstance(s, np.integer): # bombs if s is a Python integer s = np.asscalar(s) if strides is None: if order == "F": strides = _f_contiguous_strides( dtype.itemsize, shape) elif order == "C": strides = _c_contiguous_strides( dtype.itemsize, shape) else: raise ValueError("invalid order: %s" % order) else: # FIXME: We should possibly perform some plausibility # checking on 'strides' here. strides = tuple(strides) self.shape = shape self.dtype = dtype self.strides = strides self.mem_size = self.size = s self.nbytes = self.dtype.itemsize * self.size self.allocator = allocator if gpudata is None: if self.size: self.gpudata = self.allocator(self.size * self.dtype.itemsize) else: self.gpudata = None assert base is None else: self.gpudata = gpudata self.base = base self._grid, self._block = splay(self.mem_size) @property @memoize_method def flags(self): return _ArrayFlags(self) def set(self, ary): assert ary.size == self.size assert ary.dtype == self.dtype if ary.strides != self.strides: from warnings import warn warn("Setting array from one with different strides/storage order. " "This will cease to work in 2013.x.", stacklevel=2) assert self.flags.forc if self.size: drv.memcpy_htod(self.gpudata, ary) def set_async(self, ary, stream=None): assert ary.size == self.size assert ary.dtype == self.dtype if ary.strides != self.strides: from warnings import warn warn("Setting array from one with different strides/storage order. 
" "This will cease to work in 2013.x.", stacklevel=2) assert self.flags.forc if not ary.flags.forc: raise RuntimeError("cannot asynchronously set from " "non-contiguous array") if self.size: drv.memcpy_htod_async(self.gpudata, ary, stream) def get(self, ary=None, pagelocked=False): if ary is None: if pagelocked: ary = drv.pagelocked_empty(self.shape, self.dtype) else: ary = np.empty(self.shape, self.dtype) ary = _as_strided(ary, strides=self.strides) else: assert ary.size == self.size assert ary.dtype == self.dtype assert ary.flags.forc assert self.flags.forc, "Array in get() must be contiguous" if self.size: drv.memcpy_dtoh(ary, self.gpudata) return ary def get_async(self, stream=None, ary=None): if ary is None: ary = drv.pagelocked_empty(self.shape, self.dtype) ary = _as_strided(ary, strides=self.strides) else: assert ary.size == self.size assert ary.dtype == self.dtype assert ary.flags.forc assert self.flags.forc, "Array in get() must be contiguous" if self.size: drv.memcpy_dtoh_async(ary, self.gpudata, stream) return ary def copy(self): if not self.flags.forc: raise RuntimeError("only contiguous arrays may copied.") new = GPUArray(self.shape, self.dtype) drv.memcpy_dtod(new.gpudata, self.gpudata, self.nbytes) return new def __str__(self): return str(self.get()) def __repr__(self): return repr(self.get()) def __hash__(self): raise TypeError("GPUArrays are not hashable.") @property def ptr(self): return self.gpudata.__int__() # kernel invocation wrappers ---------------------------------------------- def _axpbyz(self, selffac, other, otherfac, out, add_timer=None, stream=None): """Compute ``out = selffac * self + otherfac*other``, where `other` is a vector..""" assert self.shape == other.shape if not self.flags.forc or not other.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype) if add_timer is not None: add_timer(3*self.size, func.prepared_timed_call(self._grid, selffac, self.gpudata, otherfac, other.gpudata, out.gpudata, self.mem_size)) else: func.prepared_async_call(self._grid, self._block, stream, selffac, self.gpudata, otherfac, other.gpudata, out.gpudata, self.mem_size) return out def _axpbz(self, selffac, other, out, stream=None): """Compute ``out = selffac * self + other``, where `other` is a scalar.""" if not self.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") func = elementwise.get_axpbz_kernel(self.dtype, out.dtype) func.prepared_async_call(self._grid, self._block, stream, selffac, self.gpudata, other, out.gpudata, self.mem_size) return out def _elwise_multiply(self, other, out, stream=None): if not self.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, out.dtype, "*") func.prepared_async_call(self._grid, self._block, stream, self.gpudata, other.gpudata, out.gpudata, self.mem_size) return out def _rdiv_scalar(self, other, out, stream=None): """Divides an array by a scalar:: y = n / self """ if not self.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") func = elementwise.get_rdivide_elwise_kernel(self.dtype, out.dtype) func.prepared_async_call(self._grid, self._block, stream, self.gpudata, other, out.gpudata, self.mem_size) return out def _div(self, other, out, stream=None): """Divides an array by another array.""" if not 
self.flags.forc or not other.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") assert self.shape == other.shape func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, out.dtype, "/") func.prepared_async_call(self._grid, self._block, stream, self.gpudata, other.gpudata, out.gpudata, self.mem_size) return out def _new_like_me(self, dtype=None): strides = None if dtype is None: dtype = self.dtype else: if dtype == self.dtype: strides = self.strides return self.__class__(self.shape, dtype, allocator=self.allocator, strides=strides) # operators --------------------------------------------------------------- def mul_add(self, selffac, other, otherfac, add_timer=None, stream=None): """Return `selffac * self + otherfac*other`. """ result = self._new_like_me(_get_common_dtype(self, other)) return self._axpbyz(selffac, other, otherfac, result, add_timer) def __add__(self, other): """Add an array with an array or an array with a scalar.""" if isinstance(other, GPUArray): # add another vector result = self._new_like_me(_get_common_dtype(self, other)) return self._axpbyz(1, other, 1, result) else: # add a scalar if other == 0: return self.copy() else: result = self._new_like_me(_get_common_dtype(self, other)) return self._axpbz(1, other, result) __radd__ = __add__ def __sub__(self, other): """Substract an array from an array or a scalar from an array.""" if isinstance(other, GPUArray): result = self._new_like_me(_get_common_dtype(self, other)) return self._axpbyz(1, other, -1, result) else: if other == 0: return self.copy() else: # create a new array for the result result = self._new_like_me(_get_common_dtype(self, other)) return self._axpbz(1, -other, result) def __rsub__(self, other): """Substracts an array by a scalar or an array:: x = n - self """ # other must be a scalar result = self._new_like_me(_get_common_dtype(self, other)) return self._axpbz(-1, other, result) def __iadd__(self, other): if isinstance(other, GPUArray): return self._axpbyz(1, other, 1, self) else: return self._axpbz(1, other, self) def __isub__(self, other): if isinstance(other, GPUArray): return self._axpbyz(1, other, -1, self) else: return self._axpbz(1, -other, self) def __neg__(self): result = self._new_like_me() return self._axpbz(-1, 0, result) def __mul__(self, other): if isinstance(other, GPUArray): result = self._new_like_me(_get_common_dtype(self, other)) return self._elwise_multiply(other, result) else: result = self._new_like_me(_get_common_dtype(self, other)) return self._axpbz(other, 0, result) def __rmul__(self, scalar): result = self._new_like_me(_get_common_dtype(self, scalar)) return self._axpbz(scalar, 0, result) def __imul__(self, other): if isinstance(other, GPUArray): return self._elwise_multiply(other, self) else: return self._axpbz(other, 0, self) def __div__(self, other): """Divides an array by an array or a scalar:: x = self / n """ if isinstance(other, GPUArray): result = self._new_like_me(_get_common_dtype(self, other)) return self._div(other, result) else: if other == 1: return self.copy() else: # create a new array for the result result = self._new_like_me(_get_common_dtype(self, other)) return self._axpbz(1/other, 0, result) __truediv__ = __div__ def __rdiv__(self, other): """Divides an array by a scalar or an array:: x = n / self """ # create a new array for the result result = self._new_like_me(_get_common_dtype(self, other)) return self._rdiv_scalar(other, result) __rtruediv__ = __rdiv__ def __idiv__(self, other): """Divides an 
array by an array or a scalar:: x /= n """ if isinstance(other, GPUArray): return self._div(other, self) else: if other == 1: return self else: return self._axpbz(1/other, 0, self) __itruediv__ = __idiv__ def fill(self, value, stream=None): """fills the array with the specified value""" func = elementwise.get_fill_kernel(self.dtype) func.prepared_async_call(self._grid, self._block, stream, value, self.gpudata, self.mem_size) return self def bind_to_texref(self, texref, allow_offset=False): return texref.set_address(self.gpudata, self.nbytes, allow_offset=allow_offset) / self.dtype.itemsize def bind_to_texref_ext(self, texref, channels=1, allow_double_hack=False, allow_complex_hack=False, allow_offset=False): if not self.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") if self.dtype == np.float64 and allow_double_hack: if channels != 1: raise ValueError( "'fake' double precision textures can " "only have one channel") channels = 2 fmt = drv.array_format.SIGNED_INT32 read_as_int = True elif self.dtype == np.complex64 and allow_complex_hack: if channels != 1: raise ValueError( "'fake' complex64 textures can " "only have one channel") channels = 2 fmt = drv.array_format.UNSIGNED_INT32 read_as_int = True elif self.dtype == np.complex128 and allow_complex_hack: if channels != 1: raise ValueError( "'fake' complex128 textures can " "only have one channel") channels = 4 fmt = drv.array_format.SIGNED_INT32 read_as_int = True else: fmt = drv.dtype_to_array_format(self.dtype) read_as_int = np.integer in self.dtype.type.__mro__ offset = texref.set_address(self.gpudata, self.nbytes, allow_offset=allow_offset) texref.set_format(fmt, channels) if read_as_int: texref.set_flags(texref.get_flags() | drv.TRSF_READ_AS_INTEGER) return offset/self.dtype.itemsize def __len__(self): """Return the size of the leading dimension of self.""" if len(self.shape): return self.shape[0] else: return TypeError("scalar has no len()") def __abs__(self): """Return a `GPUArray` of the absolute values of the elements of `self`. 
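        A minimal usage sketch (illustrative values; requires an initialized
        context, e.g. via `import pycuda.autoinit`)::

            import numpy as np
            import pycuda.gpuarray as gpuarray

            a_gpu = gpuarray.to_gpu(np.array([-1.5, 2.0, -3.0], dtype=np.float32))
            print(abs(a_gpu))       # [ 1.5  2.   3. ]

        For complex input, the result uses the matching real dtype.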
""" result = self._new_like_me() if self.dtype == np.float32: fname = "fabsf" elif self.dtype == np.float64: fname = "fabs" else: fname = "abs" if issubclass(self.dtype.type, np.complexfloating): from pytools import match_precision out_dtype = match_precision(np.dtype(np.float64), self.dtype) result = self._new_like_me(out_dtype) else: out_dtype = self.dtype func = elementwise.get_unary_func_kernel(fname, self.dtype, out_dtype=out_dtype) func.prepared_async_call(self._grid, self._block, None, self.gpudata, result.gpudata, self.mem_size) return result def __pow__(self, other): """pow function:: example: array = pow(array) array = pow(array,4) array = pow(array,array) """ if isinstance(other, GPUArray): if not self.flags.forc or not other.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") assert self.shape == other.shape result = self._new_like_me(_get_common_dtype(self, other)) func = elementwise.get_pow_array_kernel( self.dtype, other.dtype, result.dtype) func.prepared_async_call(self._grid, self._block, None, self.gpudata, other.gpudata, result.gpudata, self.mem_size) return result else: if not self.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") result = self._new_like_me() func = elementwise.get_pow_kernel(self.dtype) func.prepared_async_call(self._grid, self._block, None, other, self.gpudata, result.gpudata, self.mem_size) return result def reverse(self, stream=None): """Return this array in reversed order. The array is treated as one-dimensional. """ if not self.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") result = self._new_like_me() func = elementwise.get_reverse_kernel(self.dtype) func.prepared_async_call(self._grid, self._block, stream, self.gpudata, result.gpudata, self.mem_size) return result def astype(self, dtype, stream=None): if not self.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") if dtype == self.dtype: return self.copy() result = self._new_like_me(dtype=dtype) func = elementwise.get_copy_kernel(dtype, self.dtype) func.prepared_async_call(self._grid, self._block, stream, result.gpudata, self.gpudata, self.mem_size) return result def reshape(self, *shape): # TODO: add more error-checking, perhaps if isinstance(shape[0], tuple) or isinstance(shape[0], list): shape = tuple(shape[0]) if shape == self.shape: return self size = reduce(lambda x, y: x * y, shape, 1) if size != self.size: raise ValueError("total size of new array must be unchanged") return GPUArray( shape=shape, dtype=self.dtype, allocator=self.allocator, base=self, gpudata=int(self.gpudata)) def ravel(self): return self.reshape(self.size) def view(self, dtype=None): if dtype is None: dtype = self.dtype old_itemsize = self.dtype.itemsize itemsize = np.dtype(dtype).itemsize from pytools import argmin2 min_stride_axis = argmin2( (axis, abs(stride)) for axis, stride in enumerate(self.strides)) if self.shape[min_stride_axis] * old_itemsize % itemsize != 0: raise ValueError("new type not compatible with array") new_shape = ( self.shape[:min_stride_axis] + (self.shape[min_stride_axis] * old_itemsize // itemsize,) + self.shape[min_stride_axis+1:]) new_strides = ( self.strides[:min_stride_axis] + (self.strides[min_stride_axis] * itemsize // old_itemsize,) + self.strides[min_stride_axis+1:]) return GPUArray( shape=new_shape, dtype=dtype, allocator=self.allocator, strides=new_strides, base=self, 
gpudata=int(self.gpudata)) # {{{ slicing def __getitem__(self, index): """ .. versionadded:: 2013.1 """ if not isinstance(index, tuple): index = (index,) new_shape = [] new_offset = 0 new_strides = [] seen_ellipsis = False index_axis = 0 array_axis = 0 while index_axis < len(index): index_entry = index[index_axis] if array_axis > len(self.shape): raise IndexError("too many axes in index") if isinstance(index_entry, slice): start, stop, idx_stride = index_entry.indices( self.shape[array_axis]) array_stride = self.strides[array_axis] new_shape.append((stop-start)//idx_stride) new_strides.append(idx_stride*array_stride) new_offset += array_stride*start index_axis += 1 array_axis += 1 elif isinstance(index_entry, (int, np.integer)): array_shape = self.shape[array_axis] if index_entry < 0: index_entry += array_shape if not (0 <= index_entry < array_shape): raise IndexError( "subindex in axis %d out of range" % index_axis) new_offset += self.strides[array_axis]*index_entry index_axis += 1 array_axis += 1 elif index_entry is Ellipsis: index_axis += 1 remaining_index_count = len(index) - index_axis new_array_axis = len(self.shape) - remaining_index_count if new_array_axis < array_axis: raise IndexError("invalid use of ellipsis in index") while array_axis < new_array_axis: new_shape.append(self.shape[array_axis]) new_strides.append(self.strides[array_axis]) array_axis += 1 if seen_ellipsis: raise IndexError( "more than one ellipsis not allowed in index") seen_ellipsis = True else: raise IndexError("invalid subindex in axis %d" % index_axis) while array_axis < len(self.shape): new_shape.append(self.shape[array_axis]) new_strides.append(self.strides[array_axis]) array_axis += 1 return GPUArray( shape=tuple(new_shape), dtype=self.dtype, allocator=self.allocator, base=self, gpudata=int(self.gpudata)+new_offset, strides=tuple(new_strides)) # }}} # {{{ complex-valued business @property def real(self): dtype = self.dtype if issubclass(dtype.type, np.complexfloating): from pytools import match_precision real_dtype = match_precision(np.dtype(np.float64), dtype) result = self._new_like_me(dtype=real_dtype) func = elementwise.get_real_kernel(dtype, real_dtype) func.prepared_async_call(self._grid, self._block, None, self.gpudata, result.gpudata, self.mem_size) return result else: return self @property def imag(self): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): if not self.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") from pytools import match_precision real_dtype = match_precision(np.dtype(np.float64), dtype) result = self._new_like_me(dtype=real_dtype) func = elementwise.get_imag_kernel(dtype, real_dtype) func.prepared_async_call(self._grid, self._block, None, self.gpudata, result.gpudata, self.mem_size) return result else: return zeros_like(self) def conj(self): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): if not self.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") result = self._new_like_me() func = elementwise.get_conj_kernel(dtype) func.prepared_async_call(self._grid, self._block, None, self.gpudata, result.gpudata, self.mem_size) return result else: return self # }}} # {{{ rich comparisons __eq__ = _make_binary_op("==") __ne__ = _make_binary_op("!=") __le__ = _make_binary_op("<=") __ge__ = _make_binary_op(">=") __lt__ = _make_binary_op("<") __gt__ = _make_binary_op(">") # }}} # }}} # {{{ creation helpers def to_gpu(ary, 
allocator=drv.mem_alloc): """converts a numpy array to a GPUArray""" result = GPUArray(ary.shape, ary.dtype, allocator, strides=ary.strides) result.set(ary) return result def to_gpu_async(ary, allocator=drv.mem_alloc, stream=None): """converts a numpy array to a GPUArray""" result = GPUArray(ary.shape, ary.dtype, allocator, strides=ary.strides) result.set_async(ary, stream) return result empty = GPUArray def zeros(shape, dtype, allocator=drv.mem_alloc, order="C"): """Returns an array of the given shape and dtype filled with 0's.""" result = GPUArray(shape, dtype, allocator, order=order) zero = np.zeros((), dtype) result.fill(zero) return result def empty_like(other_ary): result = GPUArray( other_ary.shape, other_ary.dtype, other_ary.allocator) return result def zeros_like(other_ary): result = GPUArray( other_ary.shape, other_ary.dtype, other_ary.allocator) zero = np.zeros((), result.dtype) result.fill(zero) return result def arange(*args, **kwargs): """Create an array filled with numbers spaced `step` apart, starting from `start` and ending at `stop`. For floating point arguments, the length of the result is `ceil((stop - start)/step)`. This rule may result in the last element of the result being greater than stop. """ # argument processing ----------------------------------------------------- # Yuck. Thanks, numpy developers. ;) from pytools import Record class Info(Record): pass explicit_dtype = False inf = Info() inf.start = None inf.stop = None inf.step = None inf.dtype = None if isinstance(args[-1], np.dtype): inf.dtype = args[-1] args = args[:-1] explicit_dtype = True argc = len(args) if argc == 0: raise ValueError("stop argument required") elif argc == 1: inf.stop = args[0] elif argc == 2: inf.start = args[0] inf.stop = args[1] elif argc == 3: inf.start = args[0] inf.stop = args[1] inf.step = args[2] else: raise ValueError("too many arguments") admissible_names = ["start", "stop", "step", "dtype"] for k, v in kwargs.iteritems(): if k in admissible_names: if getattr(inf, k) is None: setattr(inf, k, v) if k == "dtype": explicit_dtype = True else: raise ValueError("may not specify '%s' by position and keyword" % k) else: raise ValueError("unexpected keyword argument '%s'" % k) if inf.start is None: inf.start = 0 if inf.step is None: inf.step = 1 if inf.dtype is None: inf.dtype = np.array([inf.start, inf.stop, inf.step]).dtype # actual functionality ---------------------------------------------------- dtype = np.dtype(inf.dtype) start = dtype.type(inf.start) step = dtype.type(inf.step) stop = dtype.type(inf.stop) if not explicit_dtype and dtype != np.float32: from warnings import warn warn("behavior change: arange guessed dtype other than float32. 
" "suggest specifying explicit dtype.") from math import ceil size = int(ceil((stop-start)/step)) result = GPUArray((size,), dtype) func = elementwise.get_arange_kernel(dtype) func.prepared_async_call(result._grid, result._block, kwargs.get("stream"), result.gpudata, start, step, size) return result # }}} # {{{ pickle support import copy_reg copy_reg.pickle(GPUArray, lambda data: (to_gpu, (data.get(),)), to_gpu) # }}} # {{{ take/put def take(a, indices, out=None, stream=None): if out is None: out = GPUArray(indices.shape, a.dtype, a.allocator) assert len(indices.shape) == 1 func, tex_src = elementwise.get_take_kernel(a.dtype, indices.dtype) a.bind_to_texref_ext(tex_src[0], allow_double_hack=True, allow_complex_hack=True) func.prepared_async_call(out._grid, out._block, stream, indices.gpudata, out.gpudata, indices.size) return out def multi_take(arrays, indices, out=None, stream=None): if not len(arrays): return [] assert len(indices.shape) == 1 from pytools import single_valued a_dtype = single_valued(a.dtype for a in arrays) a_allocator = arrays[0].dtype vec_count = len(arrays) if out is None: out = [GPUArray(indices.shape, a_dtype, a_allocator) for i in range(vec_count)] else: if len(out) != len(arrays): raise ValueError("out and arrays must have the same length") chunk_size = _builtin_min(vec_count, 20) def make_func_for_chunk_size(chunk_size): return elementwise.get_take_kernel(a_dtype, indices.dtype, vec_count=chunk_size) func, tex_src = make_func_for_chunk_size(chunk_size) for start_i in range(0, len(arrays), chunk_size): chunk_slice = slice(start_i, start_i+chunk_size) if start_i + chunk_size > vec_count: func, tex_src = make_func_for_chunk_size(vec_count-start_i) for i, a in enumerate(arrays[chunk_slice]): a.bind_to_texref_ext(tex_src[i], allow_double_hack=True) func.prepared_async_call(indices._grid, indices._block, stream, indices.gpudata, *([o.gpudata for o in out[chunk_slice]] + [indices.size])) return out def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None, out=None, stream=None, src_offsets=None): if not len(arrays): return [] from pytools import single_valued a_dtype = single_valued(a.dtype for a in arrays) a_allocator = arrays[0].allocator vec_count = len(arrays) if out is None: out = [GPUArray(dest_shape, a_dtype, a_allocator) for i in range(vec_count)] else: if a_dtype != single_valued(o.dtype for o in out): raise TypeError("arrays and out must have the same dtype") if len(out) != vec_count: raise ValueError("out and arrays must have the same length") if src_indices.dtype != dest_indices.dtype: raise TypeError("src_indices and dest_indices must have the same dtype") if len(src_indices.shape) != 1: raise ValueError("src_indices must be 1D") if src_indices.shape != dest_indices.shape: raise ValueError("src_indices and dest_indices must have the same shape") if src_offsets is None: src_offsets_list = [] max_chunk_size = 20 else: src_offsets_list = src_offsets if len(src_offsets) != vec_count: raise ValueError("src_indices and src_offsets must have the same length") max_chunk_size = 10 chunk_size = _builtin_min(vec_count, max_chunk_size) def make_func_for_chunk_size(chunk_size): return elementwise.get_take_put_kernel( a_dtype, src_indices.dtype, with_offsets=src_offsets is not None, vec_count=chunk_size) func, tex_src = make_func_for_chunk_size(chunk_size) for start_i in range(0, len(arrays), chunk_size): chunk_slice = slice(start_i, start_i+chunk_size) if start_i + chunk_size > vec_count: func, tex_src = make_func_for_chunk_size(vec_count-start_i) for 
src_tr, a in zip(tex_src, arrays[chunk_slice]): a.bind_to_texref_ext(src_tr, allow_double_hack=True) func.prepared_async_call(src_indices._grid, src_indices._block, stream, dest_indices.gpudata, src_indices.gpudata, *([o.gpudata for o in out[chunk_slice]] + src_offsets_list[chunk_slice] + [src_indices.size])) return out def multi_put(arrays, dest_indices, dest_shape=None, out=None, stream=None): if not len(arrays): return [] from pytools import single_valued a_dtype = single_valued(a.dtype for a in arrays) a_allocator = arrays[0].allocator vec_count = len(arrays) if out is None: out = [GPUArray(dest_shape, a_dtype, a_allocator) for i in range(vec_count)] else: if a_dtype != single_valued(o.dtype for o in out): raise TypeError("arrays and out must have the same dtype") if len(out) != vec_count: raise ValueError("out and arrays must have the same length") if len(dest_indices.shape) != 1: raise ValueError("src_indices must be 1D") chunk_size = _builtin_min(vec_count, 10) def make_func_for_chunk_size(chunk_size): return elementwise.get_put_kernel( a_dtype, dest_indices.dtype, vec_count=chunk_size) func = make_func_for_chunk_size(chunk_size) for start_i in range(0, len(arrays), chunk_size): chunk_slice = slice(start_i, start_i+chunk_size) if start_i + chunk_size > vec_count: func = make_func_for_chunk_size(vec_count-start_i) func.prepared_async_call(dest_indices._grid, dest_indices._block, stream, dest_indices.gpudata, *([o.gpudata for o in out[chunk_slice]] + [i.gpudata for i in arrays[chunk_slice]] + [dest_indices.size])) return out # }}} # {{{ conditionals def if_positive(criterion, then_, else_, out=None, stream=None): if not (criterion.shape == then_.shape == else_.shape): raise ValueError("shapes do not match") if not (then_.dtype == else_.dtype): raise ValueError("dtypes do not match") func = elementwise.get_if_positive_kernel( criterion.dtype, then_.dtype) if out is None: out = empty_like(then_) func.prepared_async_call(criterion._grid, criterion._block, stream, criterion.gpudata, then_.gpudata, else_.gpudata, out.gpudata, criterion.size) return out def _make_binary_minmax_func(which): def f(a, b, out=None, stream=None): if out is None: out = empty_like(a) func = elementwise.get_binary_minmax_kernel(which, a.dtype, b.dtype, out.dtype) func.prepared_async_call(a._grid, a._block, stream, a.gpudata, b.gpudata, out.gpudata, a.size) return out return f minimum = _make_binary_minmax_func("min") maximum = _make_binary_minmax_func("max") # }}} # {{{ reductions def sum(a, dtype=None, stream=None): from pycuda.reduction import get_sum_kernel krnl = get_sum_kernel(dtype, a.dtype) return krnl(a, stream=stream) def subset_sum(subset, a, dtype=None, stream=None): from pycuda.reduction import get_subset_sum_kernel krnl = get_subset_sum_kernel(dtype, subset.dtype, a.dtype) return krnl(subset, a, stream=stream) def dot(a, b, dtype=None, stream=None): from pycuda.reduction import get_dot_kernel if dtype is None: dtype = _get_common_dtype(a, b) krnl = get_dot_kernel(dtype, a.dtype, b.dtype) return krnl(a, b, stream=stream) def subset_dot(subset, a, b, dtype=None, stream=None): from pycuda.reduction import get_subset_dot_kernel krnl = get_subset_dot_kernel(dtype, subset.dtype, a.dtype, b.dtype) return krnl(subset, a, b, stream=stream) def _make_minmax_kernel(what): def f(a, stream=None): from pycuda.reduction import get_minmax_kernel krnl = get_minmax_kernel(what, a.dtype) return krnl(a, stream=stream) return f _builtin_min = min _builtin_max = max min = _make_minmax_kernel("min") max = 
_make_minmax_kernel("max") def _make_subset_minmax_kernel(what): def f(subset, a, stream=None): from pycuda.reduction import get_subset_minmax_kernel krnl = get_subset_minmax_kernel(what, a.dtype, subset.dtype) return krnl(subset, a, stream=stream) return f subset_min = _make_subset_minmax_kernel("min") subset_max = _make_subset_minmax_kernel("max") # }}} # vim: foldmethod=marker pycuda-2013.1.1+git20140310/pycuda/gl/0002755000175000000500000000000012313360364015220 5ustar tomussrcpycuda-2013.1.1+git20140310/pycuda/gl/__init__.py0000644000175000000500000000066612313360364017337 0ustar tomussrcimport pycuda._driver as _drv if not _drv.have_gl_ext(): raise ImportError("PyCUDA was compiled without GL extension support") init = _drv.gl_init make_context = _drv.make_gl_context graphics_map_flags = _drv.graphics_map_flags BufferObject = _drv.BufferObject BufferObjectMapping = _drv.BufferObjectMapping RegisteredBuffer = _drv.RegisteredBuffer RegisteredImage = _drv.RegisteredImage RegisteredMapping = _drv.RegisteredMapping pycuda-2013.1.1+git20140310/pycuda/gl/autoinit.py0000644000175000000500000000044312313360364017425 0ustar tomussrcimport pycuda.driver as cuda import pycuda.gl as cudagl cuda.init() assert cuda.Device.count() >= 1 from pycuda.tools import make_default_context context = make_default_context(lambda dev: cudagl.make_context(dev)) device = context.get_device() import atexit atexit.register(context.pop) pycuda-2013.1.1+git20140310/pycuda/elementwise.py0000644000175000000500000004704612313360364017522 0ustar tomussrc"""Elementwise functionality.""" from __future__ import division __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" from pycuda.tools import context_dependent_memoize import numpy as np from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg from pytools import memoize_method def get_elwise_module(arguments, operation, name="kernel", keep=False, options=None, preamble="", loop_prep="", after_loop=""): from pycuda.compiler import SourceModule return SourceModule(""" #include %(preamble)s __global__ void %(name)s(%(arguments)s) { unsigned tid = threadIdx.x; unsigned total_threads = gridDim.x*blockDim.x; unsigned cta_start = blockDim.x*blockIdx.x; unsigned i; %(loop_prep)s; for (i = cta_start + tid; i < n; i += total_threads) { %(operation)s; } %(after_loop)s; } """ % { "arguments": ", ".join(arg.declarator() for arg in arguments), "operation": operation, "name": name, "preamble": preamble, "loop_prep": loop_prep, "after_loop": after_loop, }, options=options, keep=keep) def get_elwise_range_module(arguments, operation, name="kernel", keep=False, options=None, preamble="", loop_prep="", after_loop=""): from pycuda.compiler import SourceModule return SourceModule(""" #include %(preamble)s __global__ void %(name)s(%(arguments)s) { unsigned tid = threadIdx.x; unsigned total_threads = gridDim.x*blockDim.x; unsigned cta_start = blockDim.x*blockIdx.x; long i; %(loop_prep)s; if (step < 0) { for (i = start + (cta_start + tid)*step; i > stop; i += total_threads*step) { %(operation)s; } } else { for (i = start + (cta_start + tid)*step; i < stop; i += total_threads*step) { %(operation)s; } } %(after_loop)s; } """ % { "arguments": ", ".join(arg.declarator() for arg in arguments), "operation": operation, "name": name, "preamble": preamble, "loop_prep": loop_prep, "after_loop": after_loop, }, options=options, keep=keep) def get_elwise_kernel_and_types(arguments, operation, name="kernel", keep=False, options=None, use_range=False, **kwargs): if isinstance(arguments, str): from pycuda.tools import parse_c_arg arguments = [parse_c_arg(arg) for arg in arguments.split(",")] if use_range: arguments.extend([ ScalarArg(np.intp, "start"), ScalarArg(np.intp, "stop"), ScalarArg(np.intp, "step"), ]) else: arguments.append(ScalarArg(np.uintp, "n")) if use_range: module_builder = get_elwise_range_module else: module_builder = get_elwise_module mod = module_builder(arguments, operation, name, keep, options, **kwargs) func = mod.get_function(name) func.prepare("".join(arg.struct_char for arg in arguments)) return func, arguments def get_elwise_kernel(arguments, operation, name="kernel", keep=False, options=None, **kwargs): """Return a L{pycuda.driver.Function} that performs the same scalar operation on one or several vectors. 
""" func, arguments = get_elwise_kernel_and_types( arguments, operation, name, keep, options, **kwargs) return func class ElementwiseKernel: def __init__(self, arguments, operation, name="kernel", keep=False, options=None, **kwargs): self.gen_kwargs = kwargs.copy() self.gen_kwargs.update(dict(keep=keep, options=options, name=name, operation=operation, arguments=arguments)) @memoize_method def generate_stride_kernel_and_types(self, use_range): knl, arguments = get_elwise_kernel_and_types(use_range=use_range, **self.gen_kwargs) assert [i for i, arg in enumerate(arguments) if isinstance(arg, VectorArg)], \ "ElementwiseKernel can only be used with functions that " \ "have at least one vector argument" return knl, arguments def __call__(self, *args, **kwargs): vectors = [] range_ = kwargs.pop("range", None) slice_ = kwargs.pop("slice", None) stream = kwargs.pop("stream", None) if kwargs: raise TypeError("invalid keyword arguments specified: " + ", ".join(kwargs.iterkeys())) invocation_args = [] func, arguments = self.generate_stride_kernel_and_types( range_ is not None or slice_ is not None) for arg, arg_descr in zip(args, arguments): if isinstance(arg_descr, VectorArg): if not arg.flags.forc: raise RuntimeError("elementwise kernel cannot " "deal with non-contiguous arrays") vectors.append(arg) invocation_args.append(arg.gpudata) else: invocation_args.append(arg) repr_vec = vectors[0] if slice_ is not None: if range_ is not None: raise TypeError("may not specify both range and slice " "keyword arguments") range_ = slice(*slice_.indices(repr_vec.size)) if range_ is not None: invocation_args.append(range_.start) invocation_args.append(range_.stop) if range_.step is None: invocation_args.append(1) else: invocation_args.append(range_.step) from pycuda.gpuarray import splay grid, block = splay(abs(range_.stop - range_.start)//range_.step) else: block = repr_vec._block grid = repr_vec._grid invocation_args.append(repr_vec.mem_size) func.prepared_async_call(grid, block, stream, *invocation_args) @context_dependent_memoize def get_take_kernel(dtype, idx_dtype, vec_count=1): ctx = { "idx_tp": dtype_to_ctype(idx_dtype), "tp": dtype_to_ctype(dtype), "tex_tp": dtype_to_ctype(dtype, with_fp_tex_hack=True), } args = [VectorArg(idx_dtype, "idx")] + [ VectorArg(dtype, "dest"+str(i))for i in range(vec_count)] + [ ScalarArg(np.intp, "n") ] preamble = "#include \n\n" + "\n".join( "texture <%s, 1, cudaReadModeElementType> tex_src%d;" % (ctx["tex_tp"], i) for i in range(vec_count)) body = ( ("%(idx_tp)s src_idx = idx[i];\n" % ctx) + "\n".join( "dest%d[i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i) for i in range(vec_count))) mod = get_elwise_module(args, body, "take", preamble=preamble) func = mod.get_function("take") tex_src = [mod.get_texref("tex_src%d" % i) for i in range(vec_count)] func.prepare("P"+(vec_count*"P")+np.dtype(np.uintp).char, texrefs=tex_src) return func, tex_src @context_dependent_memoize def get_take_put_kernel(dtype, idx_dtype, with_offsets, vec_count=1): ctx = { "idx_tp": dtype_to_ctype(idx_dtype), "tp": dtype_to_ctype(dtype), "tex_tp": dtype_to_ctype(dtype, with_fp_tex_hack=True), } args = [ VectorArg(idx_dtype, "gmem_dest_idx"), VectorArg(idx_dtype, "gmem_src_idx"), ] + [ VectorArg(dtype, "dest%d" % i) for i in range(vec_count) ] + [ ScalarArg(idx_dtype, "offset%d" % i) for i in range(vec_count) if with_offsets ] + [ScalarArg(np.intp, "n")] preamble = "#include \n\n" + "\n".join( "texture <%s, 1, cudaReadModeElementType> tex_src%d;" % (ctx["tex_tp"], i) for i in range(vec_count)) if 
with_offsets: def get_copy_insn(i): return ("dest%d[dest_idx] = " "fp_tex1Dfetch(tex_src%d, src_idx+offset%d);" % (i, i, i)) else: def get_copy_insn(i): return ("dest%d[dest_idx] = " "fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i)) body = (("%(idx_tp)s src_idx = gmem_src_idx[i];\n" "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx) + "\n".join(get_copy_insn(i) for i in range(vec_count))) mod = get_elwise_module(args, body, "take_put", preamble=preamble) func = mod.get_function("take_put") tex_src = [mod.get_texref("tex_src%d" % i) for i in range(vec_count)] func.prepare( "PP"+(vec_count*"P") + (bool(with_offsets)*vec_count*idx_dtype.char) + np.dtype(np.uintp).char, texrefs=tex_src) return func, tex_src @context_dependent_memoize def get_put_kernel(dtype, idx_dtype, vec_count=1): ctx = { "idx_tp": dtype_to_ctype(idx_dtype), "tp": dtype_to_ctype(dtype), } args = [ VectorArg(idx_dtype, "gmem_dest_idx"), ] + [ VectorArg(dtype, "dest%d" % i) for i in range(vec_count) ] + [ VectorArg(dtype, "src%d" % i) for i in range(vec_count) ] + [ScalarArg(np.intp, "n")] body = ( "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx + "\n".join("dest%d[dest_idx] = src%d[i];" % (i, i) for i in range(vec_count))) func = get_elwise_module(args, body, "put").get_function("put") func.prepare("P"+(2*vec_count*"P")+np.dtype(np.uintp).char) return func @context_dependent_memoize def get_copy_kernel(dtype_dest, dtype_src): return get_elwise_kernel( "%(tp_dest)s *dest, %(tp_src)s *src" % { "tp_dest": dtype_to_ctype(dtype_dest), "tp_src": dtype_to_ctype(dtype_src), }, "dest[i] = src[i]", "copy") @context_dependent_memoize def get_linear_combination_kernel(summand_descriptors, dtype_z): from pycuda.tools import dtype_to_ctype from pycuda.elementwise import \ VectorArg, ScalarArg, get_elwise_module args = [] preamble = ["#include \n\n"] loop_prep = [] summands = [] tex_names = [] for i, (is_gpu_scalar, scalar_dtype, vector_dtype) in \ enumerate(summand_descriptors): if is_gpu_scalar: preamble.append( "texture <%s, 1, cudaReadModeElementType> tex_a%d;" % (dtype_to_ctype(scalar_dtype, with_fp_tex_hack=True), i)) args.append(VectorArg(vector_dtype, "x%d" % i)) tex_names.append("tex_a%d" % i) loop_prep.append( "%s a%d = fp_tex1Dfetch(tex_a%d, 0)" % (dtype_to_ctype(scalar_dtype), i, i)) else: args.append(ScalarArg(scalar_dtype, "a%d" % i)) args.append(VectorArg(vector_dtype, "x%d" % i)) summands.append("a%d*x%d[i]" % (i, i)) args.append(VectorArg(dtype_z, "z")) args.append(ScalarArg(np.uintp, "n")) mod = get_elwise_module(args, "z[i] = " + " + ".join(summands), "linear_combination", preamble="\n".join(preamble), loop_prep=";\n".join(loop_prep)) func = mod.get_function("linear_combination") tex_src = [mod.get_texref(tn) for tn in tex_names] func.prepare("".join(arg.struct_char for arg in args), texrefs=tex_src) return func, tex_src @context_dependent_memoize def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z): return get_elwise_kernel( "%(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = a*x[i] + b*y[i]", "axpbyz") @context_dependent_memoize def get_axpbz_kernel(dtype_x, dtype_z): return get_elwise_kernel( "%(tp_z)s a, %(tp_x)s *x,%(tp_z)s b, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_z) }, "z[i] = a * x[i] + b", "axpb") @context_dependent_memoize def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator): return get_elwise_kernel( "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { 
"tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = x[i] %s y[i]" % operator, "multiply") @context_dependent_memoize def get_rdivide_elwise_kernel(dtype_x, dtype_z): return get_elwise_kernel( "%(tp_x)s *x, %(tp_z)s y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = y / x[i]", "divide_r") @context_dependent_memoize def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z): return get_elwise_kernel( "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s(x[i], y[i])" % func, func+"_kernel") def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z): if not np.float64 in [dtype_x, dtype_y]: func = func + "f" from pytools import any if any(dt.kind == "f" for dt in [dtype_x, dtype_y, dtype_z]): func = "f"+func return get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z) @context_dependent_memoize def get_fill_kernel(dtype): return get_elwise_kernel( "%(tp)s a, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, "z[i] = a", "fill") @context_dependent_memoize def get_reverse_kernel(dtype): return get_elwise_kernel( "%(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, "z[i] = y[n-1-i]", "reverse") @context_dependent_memoize def get_real_kernel(dtype, real_dtype): return get_elwise_kernel( "%(tp)s *y, %(real_tp)s *z" % { "tp": dtype_to_ctype(dtype), "real_tp": dtype_to_ctype(real_dtype), }, "z[i] = real(y[i])", "real") @context_dependent_memoize def get_imag_kernel(dtype, real_dtype): return get_elwise_kernel( "%(tp)s *y, %(real_tp)s *z" % { "tp": dtype_to_ctype(dtype), "real_tp": dtype_to_ctype(real_dtype), }, "z[i] = imag(y[i])", "imag") @context_dependent_memoize def get_conj_kernel(dtype): return get_elwise_kernel( "%(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, "z[i] = pycuda::conj(y[i])", "conj") @context_dependent_memoize def get_arange_kernel(dtype): return get_elwise_kernel( "%(tp)s *z, %(tp)s start, %(tp)s step" % { "tp": dtype_to_ctype(dtype), }, "z[i] = start + i*step", "arange") @context_dependent_memoize def get_pow_kernel(dtype): if dtype == np.float32: func = "powf" else: func = "pow" return get_elwise_kernel( "%(tp)s value, %(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, "z[i] = %s(y[i], value)" % func, "pow_method") @context_dependent_memoize def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): if np.float64 in [dtype_x, dtype_y]: func = "pow" else: func = "powf" return get_elwise_kernel( "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s(x[i], y[i])" % func, "pow_method") @context_dependent_memoize def get_fmod_kernel(): return get_elwise_kernel( "float *arg, float *mod, float *z", "z[i] = fmod(arg[i], mod[i])", "fmod_kernel") @context_dependent_memoize def get_modf_kernel(): return get_elwise_kernel( "float *x, float *intpart ,float *fracpart", "fracpart[i] = modf(x[i], &intpart[i])", "modf_kernel") @context_dependent_memoize def get_frexp_kernel(): return get_elwise_kernel( "float *x, float *significand, float *exponent", """ int expt = 0; significand[i] = frexp(x[i], &expt); exponent[i] = expt; """, "frexp_kernel") @context_dependent_memoize def get_ldexp_kernel(): return get_elwise_kernel( "float *sig, float *expt, float *z", "z[i] = ldexp(sig[i], int(expt[i]))", "ldexp_kernel") @context_dependent_memoize def get_unary_func_kernel(func_name, 
in_dtype, out_dtype=None): if out_dtype is None: out_dtype = in_dtype return get_elwise_kernel( "%(tp_in)s *y, %(tp_out)s *z" % { "tp_in": dtype_to_ctype(in_dtype), "tp_out": dtype_to_ctype(out_dtype), }, "z[i] = %s(y[i])" % func_name, "%s_kernel" % func_name) @context_dependent_memoize def get_if_positive_kernel(crit_dtype, dtype): return get_elwise_kernel([ VectorArg(crit_dtype, "crit"), VectorArg(dtype, "then_"), VectorArg(dtype, "else_"), VectorArg(dtype, "result"), ], "result[i] = crit[i] > 0 ? then_[i] : else_[i]", "if_positive") @context_dependent_memoize def get_scalar_op_kernel(dtype_x, dtype_y, operator): return get_elwise_kernel( "%(tp_x)s *x, %(tp_a)s a, %(tp_y)s *y" % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_a": dtype_to_ctype(dtype_x), }, "y[i] = x[i] %s a" % operator, "scalarop_kernel") pycuda-2013.1.1+git20140310/pycuda/reduction.py0000644000175000000500000003312112313360364017162 0ustar tomussrc"""Computation of reductions on vectors.""" from __future__ import division __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Based on code/ideas by Mark Harris . Original License: Copyright 1993-2007 NVIDIA Corporation. All rights reserved. NOTICE TO USER: This source code is subject to NVIDIA ownership rights under U.S. and international Copyright laws. NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOURCE CODE. U.S. Government End Users. This source code is a "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of "commercial computer software" and "commercial computer software documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government only as a commercial end item. Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the source code with only those rights set forth herein. 
""" from pycuda.tools import context_dependent_memoize from pycuda.tools import dtype_to_ctype import numpy as np def get_reduction_module(out_type, block_size, neutral, reduce_expr, map_expr, arguments, name="reduce_kernel", keep=False, options=None, preamble=""): from pycuda.compiler import SourceModule src = """ #include #define BLOCK_SIZE %(block_size)d #define READ_AND_MAP(i) (%(map_expr)s) #define REDUCE(a, b) (%(reduce_expr)s) %(preamble)s typedef %(out_type)s out_type; extern "C" __global__ void %(name)s(out_type *out, %(arguments)s, unsigned int seq_count, unsigned int n) { // Needs to be variable-size to prevent the braindead CUDA compiler from // running constructors on this array. Grrrr. extern __shared__ out_type sdata[]; unsigned int tid = threadIdx.x; unsigned int i = blockIdx.x*BLOCK_SIZE*seq_count + tid; out_type acc = %(neutral)s; for (unsigned s = 0; s < seq_count; ++s) { if (i >= n) break; acc = REDUCE(acc, READ_AND_MAP(i)); i += BLOCK_SIZE; } sdata[tid] = acc; __syncthreads(); #if (BLOCK_SIZE >= 512) if (tid < 256) { sdata[tid] = REDUCE(sdata[tid], sdata[tid + 256]); } __syncthreads(); #endif #if (BLOCK_SIZE >= 256) if (tid < 128) { sdata[tid] = REDUCE(sdata[tid], sdata[tid + 128]); } __syncthreads(); #endif #if (BLOCK_SIZE >= 128) if (tid < 64) { sdata[tid] = REDUCE(sdata[tid], sdata[tid + 64]); } __syncthreads(); #endif if (tid < 32) { // 'volatile' required according to Fermi compatibility guide 1.2.2 volatile out_type *smem = sdata; if (BLOCK_SIZE >= 64) smem[tid] = REDUCE(smem[tid], smem[tid + 32]); if (BLOCK_SIZE >= 32) smem[tid] = REDUCE(smem[tid], smem[tid + 16]); if (BLOCK_SIZE >= 16) smem[tid] = REDUCE(smem[tid], smem[tid + 8]); if (BLOCK_SIZE >= 8) smem[tid] = REDUCE(smem[tid], smem[tid + 4]); if (BLOCK_SIZE >= 4) smem[tid] = REDUCE(smem[tid], smem[tid + 2]); if (BLOCK_SIZE >= 2) smem[tid] = REDUCE(smem[tid], smem[tid + 1]); } if (tid == 0) out[blockIdx.x] = sdata[0]; } """ % { "out_type": out_type, "arguments": arguments, "block_size": block_size, "neutral": neutral, "reduce_expr": reduce_expr, "map_expr": map_expr, "name": name, "preamble": preamble } return SourceModule(src, options=options, keep=keep, no_extern_c=True) def get_reduction_kernel_and_types(stage, out_type, block_size, neutral, reduce_expr, map_expr=None, arguments=None, name="reduce_kernel", keep=False, options=None, preamble=""): if stage == 1: if map_expr is None: map_expr = "in[i]" elif stage == 2: if map_expr is None: map_expr = "pycuda_reduction_inp[i]" in_arg = "const %s *pycuda_reduction_inp" % out_type if arguments: arguments = in_arg + ", " + arguments else: arguments = in_arg else: assert False mod = get_reduction_module(out_type, block_size, neutral, reduce_expr, map_expr, arguments, name, keep, options, preamble) from pycuda.tools import get_arg_type func = mod.get_function(name) arg_types = [get_arg_type(arg) for arg in arguments.split(",")] func.prepare("P%sII" % "".join(arg_types)) return func, arg_types class ReductionKernel: def __init__(self, dtype_out, neutral, reduce_expr, map_expr=None, arguments=None, name="reduce_kernel", keep=False, options=None, preamble=""): self.dtype_out = np.dtype(dtype_out) self.block_size = 512 s1_func, self.stage1_arg_types = get_reduction_kernel_and_types( 1, dtype_to_ctype(dtype_out), self.block_size, neutral, reduce_expr, map_expr, arguments, name=name+"_stage1", keep=keep, options=options, preamble=preamble) self.stage1_func = s1_func.prepared_async_call # stage 2 has only one input and no map expression s2_func, self.stage2_arg_types = 
get_reduction_kernel_and_types( 2, dtype_to_ctype(dtype_out), self.block_size, neutral, reduce_expr, arguments=arguments, name=name+"_stage2", keep=keep, options=options, preamble=preamble) self.stage2_func = s2_func.prepared_async_call assert [i for i, arg_tp in enumerate(self.stage1_arg_types) if arg_tp == "P"], \ "ReductionKernel can only be used with functions that have at least one " \ "vector argument" def __call__(self, *args, **kwargs): MAX_BLOCK_COUNT = 1024 SMALL_SEQ_COUNT = 4 s1_func = self.stage1_func s2_func = self.stage2_func kernel_wrapper = kwargs.get("kernel_wrapper") if kernel_wrapper is not None: s1_func = kernel_wrapper(s1_func) s2_func = kernel_wrapper(s2_func) stream = kwargs.get("stream") from gpuarray import empty f = s1_func arg_types = self.stage1_arg_types stage1_args = args while True: invocation_args = [] vectors = [] for arg, arg_tp in zip(args, arg_types): if arg_tp == "P": if not arg.flags.forc: raise RuntimeError("ReductionKernel cannot " "deal with non-contiguous arrays") vectors.append(arg) invocation_args.append(arg.gpudata) else: invocation_args.append(arg) repr_vec = vectors[0] sz = repr_vec.size if sz <= self.block_size*SMALL_SEQ_COUNT*MAX_BLOCK_COUNT: total_block_size = SMALL_SEQ_COUNT*self.block_size block_count = (sz + total_block_size - 1) // total_block_size seq_count = SMALL_SEQ_COUNT else: block_count = MAX_BLOCK_COUNT macroblock_size = block_count*self.block_size seq_count = (sz + macroblock_size - 1) // macroblock_size if block_count == 1: result = empty((), self.dtype_out, repr_vec.allocator) else: result = empty((block_count,), self.dtype_out, repr_vec.allocator) kwargs = dict(shared_size=self.block_size*self.dtype_out.itemsize) #print block_count, seq_count, self.block_size, sz f((block_count, 1), (self.block_size, 1, 1), stream, *([result.gpudata]+invocation_args+[seq_count, sz]), **kwargs) if block_count == 1: return result else: f = s2_func arg_types = self.stage2_arg_types args = (result,) + stage1_args @context_dependent_memoize def get_sum_kernel(dtype_out, dtype_in): if dtype_out is None: dtype_out = dtype_in return ReductionKernel(dtype_out, "0", "a+b", arguments="const %(tp)s *in" % {"tp": dtype_to_ctype(dtype_in)}) @context_dependent_memoize def get_subset_sum_kernel(dtype_out, dtype_subset, dtype_in): if dtype_out is None: dtype_out = dtype_in return ReductionKernel(dtype_out, "0", "a+b", map_expr="in[lookup_tbl[i]]", arguments="const %(tp_lut)s *lookup_tbl, const %(tp)s *in" % { "tp": dtype_to_ctype(dtype_in), "tp_lut": dtype_to_ctype(dtype_subset), }) @context_dependent_memoize def get_dot_kernel(dtype_out, dtype_a, dtype_b): return ReductionKernel(dtype_out, neutral="0", reduce_expr="a+b", map_expr="a[i]*b[i]", arguments="const %(tp_a)s *a, const %(tp_b)s *b" % { "tp_a": dtype_to_ctype(dtype_a), "tp_b": dtype_to_ctype(dtype_b), }, keep=True) @context_dependent_memoize def get_subset_dot_kernel(dtype_out, dtype_subset, dtype_a=None, dtype_b=None): if dtype_out is None: dtype_out = dtype_a if dtype_b is None: if dtype_a is None: dtype_b = dtype_out else: dtype_b = dtype_a if dtype_a is None: dtype_a = dtype_out # important: lookup_tbl must be first--it controls the length return ReductionKernel(dtype_out, neutral="0", reduce_expr="a+b", map_expr="a[lookup_tbl[i]]*b[lookup_tbl[i]]", arguments="const %(tp_lut)s *lookup_tbl, " "const %(tp_a)s *a, const %(tp_b)s *b" % { "tp_a": dtype_to_ctype(dtype_a), "tp_b": dtype_to_ctype(dtype_b), "tp_lut": dtype_to_ctype(dtype_subset), }) def get_minmax_neutral(what, dtype): dtype = 
np.dtype(dtype) if issubclass(dtype.type, np.inexact): if what == "min": return "MY_INFINITY" elif what == "max": return "-MY_INFINITY" else: raise ValueError("what is not min or max.") else: if what == "min": return str(np.iinfo(dtype).max) elif what == "max": return str(np.iinfo(dtype).min) else: raise ValueError("what is not min or max.") @context_dependent_memoize def get_minmax_kernel(what, dtype): if dtype == np.float64: reduce_expr = "f%s(a,b)" % what elif dtype == np.float32: reduce_expr = "f%sf(a,b)" % what elif dtype.kind in "iu": reduce_expr = "%s(a,b)" % what else: raise TypeError("unsupported dtype specified") return ReductionKernel(dtype, neutral=get_minmax_neutral(what, dtype), reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr}, arguments="const %(tp)s *in" % { "tp": dtype_to_ctype(dtype), }, preamble="#define MY_INFINITY (1./0)") @context_dependent_memoize def get_subset_minmax_kernel(what, dtype, dtype_subset): if dtype == np.float64: reduce_expr = "f%s(a,b)" % what elif dtype == np.float32: reduce_expr = "f%sf(a,b)" % what elif dtype.kind in "iu": reduce_expr = "%s(a,b)" % what else: raise TypeError("unsupported dtype specified") return ReductionKernel(dtype, neutral=get_minmax_neutral(what, dtype), reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr}, map_expr="in[lookup_tbl[i]]", arguments="const %(tp_lut)s *lookup_tbl, " "const %(tp)s *in" % { "tp": dtype_to_ctype(dtype), "tp_lut": dtype_to_ctype(dtype_subset), }, preamble="#define MY_INFINITY (1./0)") pycuda-2013.1.1+git20140310/pycuda/_mymako.py0000644000175000000500000000107412313360364016624 0ustar tomussrctry: import mako.template except ImportError: raise ImportError( "Some of PyCUDA's facilities require the Mako templating engine.\n" "You or a piece of software you have used has tried to call such a\n" "part of PyCUDA, but there was a problem importing Mako.\n\n" "You may install mako now by typing one of:\n" "- easy_install Mako\n" "- pip install Mako\n" "- aptitude install python-mako\n" "\nor whatever else is appropriate for your system.") from mako import * pycuda-2013.1.1+git20140310/pycuda/debug.py0000644000175000000500000000077212313360364016262 0ustar tomussrcimport pycuda.driver pycuda.driver.set_debugging() import sys from optparse import OptionParser parser = OptionParser( usage="usage: %prog [options] SCRIPT-TO-RUN [SCRIPT-ARGUMENTS]") parser.disable_interspersed_args() options, args = parser.parse_args() if len(args) < 1: parser.print_help() sys.exit(2) mainpyfile = args[0] from os.path import exists if not exists(mainpyfile): print 'Error:', mainpyfile, 'does not exist' sys.exit(1) sys.argv = args execfile(mainpyfile) pycuda-2013.1.1+git20140310/pycuda/compiler.py0000644000175000000500000002012212313360364016775 0ustar tomussrcfrom pytools import memoize # don't import pycuda.driver here--you'll create an import loop import sys from tempfile import mkstemp from os import unlink from pytools.prefork import call_capture_output @memoize def get_nvcc_version(nvcc): cmdline = [nvcc, "--version"] result, stdout, stderr = call_capture_output(cmdline) if result != 0 or not stdout: from warnings import warn warn("NVCC version could not be determined.") stdout = "nvcc unknown version" return stdout.decode("utf-8", "replace") def _new_md5(): try: import hashlib return hashlib.md5() except ImportError: # for Python << 2.5 import md5 return md5.new() def preprocess_source(source, options, nvcc): handle, source_path = mkstemp(suffix='.cu') outf = open(source_path, 'w') outf.write(source) 
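    # The raw kernel source is dumped to a temporary .cu file so that
    # "nvcc --preprocess" can expand its #include directives.  compile_plain()
    # further down hashes this preprocessed text (rather than the raw source)
    # whenever the source contains an #include, so that a change in an
    # included header correctly invalidates the on-disk compiler cache.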
outf.close() os.close(handle) cmdline = [nvcc, '--preprocess'] + options + [source_path] if 'win32' in sys.platform: cmdline.extend(['--compiler-options', '-EP']) else: cmdline.extend(['--compiler-options', '-P']) result, stdout, stderr = call_capture_output(cmdline, error_on_nonzero=False) if result != 0: from pycuda.driver import CompileError raise CompileError("nvcc preprocessing of %s failed" % source_path, cmdline, stderr=stderr) # sanity check if len(stdout) < 0.5*len(source): from pycuda.driver import CompileError raise CompileError("nvcc preprocessing of %s failed with ridiculously " "small code output - likely unsupported compiler." % source_path, cmdline, stderr=stderr.decode("utf-8", "replace")) unlink(source_path) return stdout.decode("utf-8", "replace") def compile_plain(source, options, keep, nvcc, cache_dir): from os.path import join if cache_dir: checksum = _new_md5() if '#include' in source: checksum.update(preprocess_source(source, options, nvcc).encode("utf-8")) else: checksum.update(source.encode("utf-8")) for option in options: checksum.update(option.encode("utf-8")) checksum.update(get_nvcc_version(nvcc).encode("utf-8")) from pycuda.characterize import platform_bits checksum.update(str(platform_bits()).encode("utf-8")) cache_file = checksum.hexdigest() cache_path = join(cache_dir, cache_file + ".cubin") try: cache_file = open(cache_path, "rb") try: return cache_file.read() finally: cache_file.close() except: pass from tempfile import mkdtemp file_dir = mkdtemp() file_root = "kernel" cu_file_name = file_root + ".cu" cu_file_path = join(file_dir, cu_file_name) outf = open(cu_file_path, "w") outf.write(str(source)) outf.close() if keep: options = options[:] options.append("--keep") print "*** compiler output in %s" % file_dir cmdline = [nvcc, "--cubin"] + options + [cu_file_name] result, stdout, stderr = call_capture_output(cmdline, cwd=file_dir, error_on_nonzero=False) try: cubin_f = open(join(file_dir, file_root + ".cubin"), "rb") except IOError: no_output = True else: no_output = False if result != 0 or (no_output and (stdout or stderr)): if result == 0: from warnings import warn warn("PyCUDA: nvcc exited with status 0, but appears to have " "encountered an error") from pycuda.driver import CompileError raise CompileError("nvcc compilation of %s failed" % cu_file_path, cmdline, stdout=stdout.decode("utf-8", "replace"), stderr=stderr.decode("utf-8", "replace")) if stdout or stderr: lcase_err_text = (stdout+stderr).decode("utf-8", "replace").lower() from warnings import warn if "demoted" in lcase_err_text or "demoting" in lcase_err_text: warn("nvcc said it demoted types in source code it " "compiled--this is likely not what you want.", stacklevel=4) warn("The CUDA compiler succeeded, but said the following:\n" + (stdout+stderr).decode("utf-8", "replace"), stacklevel=4) cubin = cubin_f.read() cubin_f.close() if cache_dir: outf = open(cache_path, "wb") outf.write(cubin) outf.close() if not keep: from os import listdir, unlink, rmdir for name in listdir(file_dir): unlink(join(file_dir, name)) rmdir(file_dir) return cubin def _get_per_user_string(): try: from os import getuid except ImportError: checksum = _new_md5() from os import environ checksum.update(environ["USERNAME"].encode("utf-8")) return checksum.hexdigest() else: return "uid%d" % getuid() def _find_pycuda_include_path(): from pkg_resources import Requirement, resource_filename return resource_filename(Requirement.parse("pycuda"), "pycuda/cuda") import os DEFAULT_NVCC_FLAGS = [ _flag.strip() for _flag in 
os.environ.get("PYCUDA_DEFAULT_NVCC_FLAGS", "").split() if _flag.strip()] def compile(source, nvcc="nvcc", options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[]): if not no_extern_c: source = 'extern "C" {\n%s\n}\n' % source if options is None: options = DEFAULT_NVCC_FLAGS options = options[:] if arch is None: try: from pycuda.driver import Context arch = "sm_%d%d" % Context.get_device().compute_capability() except RuntimeError: pass from pycuda.driver import CUDA_DEBUGGING if CUDA_DEBUGGING: cache_dir = False keep = True options.extend(["-g", "-G"]) if cache_dir is None: from os.path import join from tempfile import gettempdir cache_dir = join(gettempdir(), "pycuda-compiler-cache-v1-%s" % _get_per_user_string()) from os import mkdir try: mkdir(cache_dir) except OSError, e: from errno import EEXIST if e.errno != EEXIST: raise if arch is not None: options.extend(["-arch", arch]) if code is not None: options.extend(["-code", code]) if 'darwin' in sys.platform and sys.maxint == 9223372036854775807: options.append('-m64') elif 'win32' in sys.platform and sys.maxsize == 9223372036854775807: options.append('-m64') elif 'win32' in sys.platform and sys.maxsize == 2147483647: options.append('-m32') include_dirs = include_dirs + [_find_pycuda_include_path()] for i in include_dirs: options.append("-I"+i) return compile_plain(source, options, keep, nvcc, cache_dir) class SourceModule(object): def __init__(self, source, nvcc="nvcc", options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[]): self._check_arch(arch) cubin = compile(source, nvcc, options, keep, no_extern_c, arch, code, cache_dir, include_dirs) from pycuda.driver import module_from_buffer self.module = module_from_buffer(cubin) self.get_global = self.module.get_global self.get_texref = self.module.get_texref if hasattr(self.module, "get_surfref"): self.get_surfref = self.module.get_surfref def _check_arch(self, arch): if arch is None: return try: from pycuda.driver import Context capability = Context.get_device().compute_capability() if tuple(map(int, tuple(arch.split("_")[1]))) > capability: from warnings import warn warn("trying to compile for a compute capability " "higher than selected GPU") except: pass def get_function(self, name): return self.module.get_function(name) pycuda-2013.1.1+git20140310/pycuda/cumath.py0000644000175000000500000001064612313360364016456 0ustar tomussrcimport pycuda.gpuarray as gpuarray import pycuda.elementwise as elementwise import numpy as np import warnings from pycuda.driver import Stream def _make_unary_array_func(name): def f(array, stream_or_out=None, **kwargs): if stream_or_out is not None: warnings.warn("please use 'out' or 'stream' keyword arguments", DeprecationWarning) if isinstance(stream_or_out, Stream): stream = stream_or_out out = None else: stream = None out = stream_or_out out, stream = None, None if 'out' in kwargs: out = kwargs['out'] if 'stream' in kwargs: stream = kwargs['stream'] if array.dtype == np.float32: func_name = name + "f" else: func_name = name if not array.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") if out is None: out = array._new_like_me() else: assert out.dtype == array.dtype assert out.strides == array.strides assert out.shape == array.shape func = elementwise.get_unary_func_kernel(func_name, array.dtype) func.prepared_async_call(array._grid, array._block, stream, array.gpudata, out.gpudata, array.mem_size) return out return f 
fabs = _make_unary_array_func("fabs") ceil = _make_unary_array_func("ceil") floor = _make_unary_array_func("floor") exp = _make_unary_array_func("exp") log = _make_unary_array_func("log") log10 = _make_unary_array_func("log10") sqrt = _make_unary_array_func("sqrt") sin = _make_unary_array_func("sin") cos = _make_unary_array_func("cos") tan = _make_unary_array_func("tan") asin = _make_unary_array_func("asin") acos = _make_unary_array_func("acos") atan = _make_unary_array_func("atan") sinh = _make_unary_array_func("sinh") cosh = _make_unary_array_func("cosh") tanh = _make_unary_array_func("tanh") def fmod(arg, mod, stream=None): """Return the floating point remainder of the division `arg/mod`, for each element in `arg` and `mod`.""" result = gpuarray.GPUArray(arg.shape, arg.dtype) if not arg.flags.forc or not mod.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") func = elementwise.get_fmod_kernel() func.prepared_async_call(arg._grid, arg._block, stream, arg.gpudata, mod.gpudata, result.gpudata, arg.mem_size) return result def frexp(arg, stream=None): """Return a tuple `(significands, exponents)` such that `arg == significand * 2**exponent`. """ if not arg.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") sig = gpuarray.GPUArray(arg.shape, arg.dtype) expt = gpuarray.GPUArray(arg.shape, arg.dtype) func = elementwise.get_frexp_kernel() func.prepared_async_call(arg._grid, arg._block, stream, arg.gpudata, sig.gpudata, expt.gpudata, arg.mem_size) return sig, expt def ldexp(significand, exponent, stream=None): """Return a new array of floating point values composed from the entries of `significand` and `exponent`, paired together as `result = significand * 2**exponent`. """ if not significand.flags.forc or not exponent.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") result = gpuarray.GPUArray(significand.shape, significand.dtype) func = elementwise.get_ldexp_kernel() func.prepared_async_call(significand._grid, significand._block, stream, significand.gpudata, exponent.gpudata, result.gpudata, significand.mem_size) return result def modf(arg, stream=None): """Return a tuple `(fracpart, intpart)` of arrays containing the integer and fractional parts of `arg`. 
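
    Illustrative sketch (assumes `x_gpu` is a contiguous float32 GPUArray on
    an initialized context)::

        fracpart, intpart = modf(x_gpu)
        # elementwise, fracpart + intpart reconstructs x_gpu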
""" if not arg.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") intpart = gpuarray.GPUArray(arg.shape, arg.dtype) fracpart = gpuarray.GPUArray(arg.shape, arg.dtype) func = elementwise.get_modf_kernel() func.prepared_async_call(arg._grid, arg._block, stream, arg.gpudata, intpart.gpudata, fracpart.gpudata, arg.mem_size) return fracpart, intpart pycuda-2013.1.1+git20140310/pycuda/tools.py0000644000175000000500000003344012313360364016332 0ustar tomussrc"""Miscallenous helper functionality.""" from __future__ import division __copyright__ = "Copyright (C) 2008 Andreas Kloeckner" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import pycuda.driver as cuda from decorator import decorator import pycuda._driver as _drv import numpy as np bitlog2 = _drv.bitlog2 DeviceMemoryPool = _drv.DeviceMemoryPool PageLockedMemoryPool = _drv.PageLockedMemoryPool from pycuda.compyte.dtypes import ( register_dtype, get_or_register_dtype, _fill_dtype_registry, dtype_to_ctype as base_dtype_to_ctype) _fill_dtype_registry(respect_windows=True) get_or_register_dtype("pycuda::complex", np.complex64) get_or_register_dtype("pycuda::complex", np.complex128) # {{{ debug memory pool class DebugMemoryPool(DeviceMemoryPool): def __init__(self, interactive=True, logfile=None): DeviceMemoryPool.__init__(self) self.last_free, _ = cuda.mem_get_info() self.interactive = interactive if logfile is None: import sys logfile = sys.stdout self.logfile = logfile from weakref import WeakKeyDictionary self.blocks = WeakKeyDictionary() if interactive: from pytools.diskdict import DiskDict self.stacktrace_mnemonics = DiskDict("pycuda-stacktrace-mnemonics") def allocate(self, size): from traceback import extract_stack stack = tuple(frm[2] for frm in extract_stack()) description = self.describe(stack, size) histogram = {} for bsize, descr in self.blocks.itervalues(): histogram[bsize, descr] = histogram.get((bsize, descr), 0) + 1 from pytools import common_prefix cpfx = common_prefix(descr for bsize, descr in histogram) print >> self.logfile, \ "\n Allocation of size %d occurring " \ "(mem: last_free:%d, free: %d, total:%d) (pool: held:%d, active:%d):" \ "\n at: %s" % ( (size, self.last_free) + cuda.mem_get_info() + (self.held_blocks, self.active_blocks, description)) hist_items = sorted(list(histogram.iteritems())) for (bsize, descr), count in hist_items: print >> self.logfile, \ " %s (%d bytes): %dx" % (descr[len(cpfx):], bsize, count) if self.interactive: raw_input(" [Enter]") result = 
DeviceMemoryPool.allocate(self, size) self.blocks[result] = size, description self.last_free, _ = cuda.mem_get_info() return result def describe(self, stack, size): if not self.interactive: return "|".join(stack) else: try: return self.stacktrace_mnemonics[stack, size] except KeyError: print size, stack while True: mnemonic = raw_input("Enter mnemonic or [Enter] for more info:") if mnemonic == '': from traceback import print_stack print_stack() else: break self.stacktrace_mnemonics[stack, size] = mnemonic return mnemonic # }}} # {{{ default device/context def get_default_device(default=0): from warnings import warn warn("get_default_device() is deprecated; " "use make_default_context() instead", DeprecationWarning) from pycuda.driver import Device import os dev = os.environ.get("CUDA_DEVICE") if dev is None: try: dev = (open(os.path.join(os.path.expanduser("~"), ".cuda_device")) .read().strip()) except: pass if dev is None: dev = default try: dev = int(dev) except TypeError: raise TypeError("CUDA device number (CUDA_DEVICE or ~/.cuda-device) must be an integer") return Device(dev) def make_default_context(ctx_maker=None): if ctx_maker is None: def ctx_maker(dev): return dev.make_context() ndevices = cuda.Device.count() if ndevices == 0: raise RuntimeError("No CUDA enabled device found. " "Please check your installation.") # Is CUDA_DEVICE set? import os devn = os.environ.get("CUDA_DEVICE") # Is $HOME/.cuda_device set ? if devn is None: try: homedir = os.environ.get("HOME") assert homedir is not None devn = (open(os.path.join(homedir, ".cuda_device")) .read().strip()) except: pass # If either CUDA_DEVICE or $HOME/.cuda_device is set, try to use it if devn is not None: try: devn = int(devn) except TypeError: raise TypeError("CUDA device number (CUDA_DEVICE or ~/.cuda_device)" " must be an integer") dev = cuda.Device(devn) return ctx_maker(dev) # Otherwise, try to use any available device else: for devn in xrange(ndevices): dev = cuda.Device(devn) try: return ctx_maker(dev) except cuda.Error: pass raise RuntimeError("make_default_context() wasn't able to create a context " "on any of the %d detected devices" % ndevices) # }}} # {{{ rounding helpers def _exact_div(dividend, divisor): quot, rem = divmod(dividend, divisor) assert rem == 0 return quot def _int_ceiling(value, multiple_of=1): """Round C{value} up to be a C{multiple_of} something.""" # Mimicks the Excel "floor" function (for code stolen from occupancy calculator) from math import ceil return int(ceil(value/multiple_of))*multiple_of def _int_floor(value, multiple_of=1): """Round C{value} down to be a C{multiple_of} something.""" # Mimicks the Excel "floor" function (for code stolen from occupancy calculator) from math import floor return int(floor(value/multiple_of))*multiple_of # }}} # {{{ device data class DeviceData: def __init__(self, dev=None): import pycuda.driver as drv if dev is None: dev = cuda.Context.get_device() self.max_threads = dev.get_attribute(drv.device_attribute.MAX_THREADS_PER_BLOCK) self.warp_size = dev.get_attribute(drv.device_attribute.WARP_SIZE) if dev.compute_capability() >= (2,0): self.warps_per_mp = 48 elif dev.compute_capability() >= (1,2): self.warps_per_mp = 32 else: self.warps_per_mp = 24 self.thread_blocks_per_mp = 8 self.registers = dev.get_attribute(drv.device_attribute.MAX_REGISTERS_PER_BLOCK) self.shared_memory = dev.get_attribute(drv.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK) if dev.compute_capability() >= (2,0): self.smem_alloc_granularity = 128 self.smem_granularity = 32 else: 
self.smem_alloc_granularity = 512 self.smem_granularity = 16 if dev.compute_capability() >= (2,0): self.register_allocation_unit = "warp" else: self.register_allocation_unit = "block" def align(self, bytes, word_size=4): return _int_ceiling(bytes, self.align_bytes(word_size)) def align_dtype(self, elements, dtype_size): return _int_ceiling(elements, self.align_words(dtype_size)) def align_words(self, word_size): return _exact_div(self.align_bytes(word_size), word_size) def align_bytes(self, word_size=4): if word_size == 4: return 64 elif word_size == 8: return 128 elif word_size == 16: return 128 else: raise ValueError, "no alignment possible for fetches of size %d" % word_size def coalesce(self, thread_count): return _int_ceiling(thread_count, 16) @staticmethod def make_valid_tex_channel_count(size): valid_sizes = [1,2,4] for vs in valid_sizes: if size <= vs: return vs raise ValueError, "could not enlarge argument to valid channel count" # }}} # {{{ occupancy class OccupancyRecord: def __init__(self, devdata, threads, shared_mem=0, registers=0): if threads > devdata.max_threads: raise ValueError("too many threads") # copied literally from occupancy calculator alloc_warps = _int_ceiling(threads/devdata.warp_size) alloc_smem = _int_ceiling(shared_mem, devdata.smem_alloc_granularity) if devdata.register_allocation_unit == "warp": alloc_regs = alloc_warps*32*registers elif devdata.register_allocation_unit == "block": alloc_regs = _int_ceiling(alloc_warps*2, 4)*16*registers else: raise ValueError("Improper register allocation unit:"+devdata.register_allocation_unit) if alloc_regs > devdata.registers: raise ValueError("too many registers") if alloc_smem > devdata.shared_memory: raise ValueError("too much smem") self.tb_per_mp_limits = [(devdata.thread_blocks_per_mp, "device"), (_int_floor(devdata.warps_per_mp/alloc_warps), "warps") ] if registers > 0: self.tb_per_mp_limits.append((_int_floor(devdata.registers/alloc_regs), "regs")) if shared_mem > 0: self.tb_per_mp_limits.append((_int_floor(devdata.shared_memory/alloc_smem), "smem")) self.tb_per_mp, self.limited_by = min(self.tb_per_mp_limits) self.warps_per_mp = self.tb_per_mp * alloc_warps self.occupancy = self.warps_per_mp / devdata.warps_per_mp # }}} # {{{ C types <-> dtypes class Argument: def __init__(self, dtype, name): self.dtype = np.dtype(dtype) self.name = name def __repr__(self): return "%s(%r, %s)" % ( self.__class__.__name__, self.name, self.dtype) def dtype_to_ctype(dtype, with_fp_tex_hack=False): if dtype is None: raise ValueError("dtype may not be None") dtype = np.dtype(dtype) if with_fp_tex_hack: if dtype == np.float32: return "fp_tex_float" elif dtype == np.float64: return "fp_tex_double" elif dtype == np.complex64: return "fp_tex_cfloat" elif dtype == np.complex128: return "fp_tex_cdouble" return base_dtype_to_ctype(dtype) class VectorArg(Argument): def declarator(self): return "%s *%s" % (dtype_to_ctype(self.dtype), self.name) struct_char = "P" class ScalarArg(Argument): def declarator(self): return "%s %s" % (dtype_to_ctype(self.dtype), self.name) @property def struct_char(self): result = self.dtype.char if result == "V": result = "%ds" % self.dtype.itemsize return result def parse_c_arg(c_arg): from pycuda.compyte.dtypes import parse_c_arg_backend return parse_c_arg_backend(c_arg, ScalarArg, VectorArg) def get_arg_type(c_arg): return parse_c_arg(c_arg).struct_char # }}} # {{{ context-dep memoization context_dependent_memoized_functions = [] @decorator def context_dependent_memoize(func, *args): try: ctx_dict = 
func._pycuda_ctx_dep_memoize_dic except AttributeError: # FIXME: This may keep contexts alive longer than desired. # But I guess since the memory in them is freed, who cares. ctx_dict = func._pycuda_ctx_dep_memoize_dic = {} cur_ctx = cuda.Context.get_current() try: return ctx_dict[cur_ctx][args] except KeyError: context_dependent_memoized_functions.append(func) arg_dict = ctx_dict.setdefault(cur_ctx, {}) result = func(*args) arg_dict[args] = result return result def clear_context_caches(): for func in context_dependent_memoized_functions: try: ctx_dict = func._pycuda_ctx_dep_memoize_dic except AttributeError: pass else: ctx_dict.clear() # }}} # {{{ py.test interaction def mark_cuda_test(inner_f): def f(*args, **kwargs): import pycuda.driver # appears to be idempotent, i.e. no harm in calling it more than once pycuda.driver.init() ctx = make_default_context() try: assert isinstance(ctx.get_device().name(), str) assert isinstance(ctx.get_device().compute_capability(), tuple) assert isinstance(ctx.get_device().get_attributes(), dict) inner_f(*args, **kwargs) finally: ctx.pop() from pycuda.tools import clear_context_caches clear_context_caches() from gc import collect collect() try: from py.test import mark as mark_test except ImportError: return f return mark_test.cuda(f) # }}} # vim: foldmethod=marker pycuda-2013.1.1+git20140310/pycuda/sparse/0002755000175000000500000000000012313360364016113 5ustar tomussrcpycuda-2013.1.1+git20140310/pycuda/sparse/coordinate.py0000644000175000000500000002112012313360364020606 0ustar tomussrcfrom __future__ import division from pytools import memoize_method import pycuda.driver as drv import pycuda.gpuarray as gpuarray from pycuda.compiler import SourceModule import numpy as np COO_FLAT_KERNEL_TEMPLATE = """ #include #define BLOCK_SIZE %(block_size)d #define WARP_SIZE %(warp_size)d typedef %(value_type)s value_type; typedef %(index_type)s index_type; texture<%(tex_value_type)s, 1, cudaReadModeElementType> tex_x; static __inline__ __device__ float atomicAdd(float *addr, float val) { float old=*addr, assumed; do { assumed = old; old = int_as_float( atomicCAS((int*)addr, float_as_int(assumed), float_as_int(val+assumed))); } while( assumed!=old ); return old; } #ifndef CUDA_NO_SM_13_DOUBLE_INTRINSICS static __attribute__ ((unused)) __inline__ __device__ double atomicAdd(double *addr, double val) { double old=*addr, assumed; do { assumed = old; old = __longlong_as_double( atomicCAS((unsigned long long int*)addr, __double_as_longlong(assumed), __double_as_longlong(val+assumed))); } while( assumed!=old ); return old; } #endif __global__ void spmv_coo_flat_kernel(const index_type num_nonzeros, const index_type interval_size, const index_type *I, const index_type *J, const value_type *V, value_type *y) { __shared__ index_type idx[BLOCK_SIZE]; __shared__ value_type val[BLOCK_SIZE]; __shared__ index_type carry_idx[BLOCK_SIZE / 32]; __shared__ value_type carry_val[BLOCK_SIZE / 32]; const index_type thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index const index_type thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp const index_type warp_id = thread_id / WARP_SIZE; // global warp index const index_type warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA const index_type begin = warp_id * interval_size + thread_lane; // thread's offset into I,J,V const index_type end = min(begin + interval_size, num_nonzeros); // end of thread's work if(begin >= end) return; // warp has no work to do const index_type first_idx = 
I[warp_id * interval_size]; // first row of this warp's interval if (thread_lane == 0) { carry_idx[warp_lane] = first_idx; carry_val[warp_lane] = 0; } for(index_type n = begin; n < end; n += WARP_SIZE) { idx[threadIdx.x] = I[n]; // row index val[threadIdx.x] = V[n] * fp_tex1Dfetch(tex_x, J[n]); // val = A[row,col] * x[col] if (thread_lane == 0){ if(idx[threadIdx.x] == carry_idx[warp_lane]) val[threadIdx.x] += carry_val[warp_lane]; // row continues into this warp's span else if(carry_idx[warp_lane] != first_idx) y[carry_idx[warp_lane]] += carry_val[warp_lane]; // row terminated, does not span boundary else atomicAdd(y + carry_idx[warp_lane], carry_val[warp_lane]); // row terminated, but spans iter-warp boundary } // segmented reduction in shared memory if( thread_lane >= 1 && idx[threadIdx.x] == idx[threadIdx.x - 1] ) { val[threadIdx.x] += val[threadIdx.x - 1]; } if( thread_lane >= 2 && idx[threadIdx.x] == idx[threadIdx.x - 2] ) { val[threadIdx.x] += val[threadIdx.x - 2]; } if( thread_lane >= 4 && idx[threadIdx.x] == idx[threadIdx.x - 4] ) { val[threadIdx.x] += val[threadIdx.x - 4]; } if( thread_lane >= 8 && idx[threadIdx.x] == idx[threadIdx.x - 8] ) { val[threadIdx.x] += val[threadIdx.x - 8]; } if( thread_lane >= 16 && idx[threadIdx.x] == idx[threadIdx.x -16] ) { val[threadIdx.x] += val[threadIdx.x - 16]; } if( thread_lane == 31 ) { carry_idx[warp_lane] = idx[threadIdx.x]; // last thread in warp saves its results carry_val[warp_lane] = val[threadIdx.x]; } else if ( idx[threadIdx.x] != idx[threadIdx.x+1] ) { // row terminates here if(idx[threadIdx.x] != first_idx) y[idx[threadIdx.x]] += val[threadIdx.x]; // row terminated, does not span inter-warp boundary else atomicAdd(y + idx[threadIdx.x], val[threadIdx.x]); // row terminated, but spans iter-warp boundary } } // final carry if(thread_lane == 31){ atomicAdd(y + carry_idx[warp_lane], carry_val[warp_lane]); } } """ COO_SERIAL_KERNEL_TEMPLATE = """ typedef %(value_type)s value_type; typedef %(index_type)s index_type; __global__ void spmv_coo_serial_kernel(const index_type num_nonzeros, const index_type *I, const index_type *J, const value_type *V, const value_type *x, value_type *y) { for (index_type n = 0; n < num_nonzeros; n++) y[I[n]] += V[n] * x[J[n]]; } """ class CoordinateSpMV: def __init__(self, mat, dtype): self.dtype = np.dtype(dtype) self.index_dtype = np.dtype(np.int32) self.shape = mat.shape self.block_size = 128 from scipy.sparse import coo_matrix coo_mat = coo_matrix(mat, dtype=self.dtype) self.row_gpu = gpuarray.to_gpu(coo_mat.row.astype(self.index_dtype)) self.col_gpu = gpuarray.to_gpu(coo_mat.col.astype(self.index_dtype)) self.data_gpu = gpuarray.to_gpu(coo_mat.data) self.nnz = coo_mat.nnz from pycuda.tools import DeviceData dev = drv.Context.get_device() devdata = DeviceData() max_threads = (devdata.warps_per_mp*devdata.warp_size* dev.multiprocessor_count) max_blocks = 4*max_threads // self.block_size warps_per_block = self.block_size // dev.warp_size if self.nnz: def divide_into(x, y): return (x+y-1)//y num_units = self.nnz // dev.warp_size num_warps = min(num_units, warps_per_block * max_blocks) self.num_blocks = divide_into(num_warps, warps_per_block) num_iters = divide_into(num_units, num_warps) self.interval_size = dev.warp_size * num_iters self.tail = num_units * dev.warp_size @memoize_method def get_flat_kernel(self): from pycuda.tools import dtype_to_ctype mod = SourceModule( COO_FLAT_KERNEL_TEMPLATE % { "value_type": dtype_to_ctype(self.dtype), "tex_value_type": dtype_to_ctype( self.dtype, with_fp_tex_hack=True), 
"index_type": dtype_to_ctype(self.index_dtype), "block_size": self.block_size, "warp_size": drv.Context.get_device().warp_size, }) func = mod.get_function("spmv_coo_flat_kernel") x_texref = mod.get_texref("tex_x") func.prepare(self.index_dtype.char*2 + "PPPP", (self.block_size, 1, 1), texrefs=[x_texref]) return func, x_texref @memoize_method def get_serial_kernel(self): from pycuda.tools import dtype_to_ctype mod = SourceModule( COO_SERIAL_KERNEL_TEMPLATE % { "value_type": dtype_to_ctype(self.dtype), "index_type": dtype_to_ctype(self.index_dtype), }) func = mod.get_function("spmv_coo_serial_kernel") func.prepare(self.index_dtype.char + "PPPPP", (1, 1, 1)) return func def __call__(self, x, y=None): if y is None: y = gpuarray.zeros(self.shape[0], dtype=self.dtype, allocator=x.allocator) if self.nnz == 0: return y flat_func, x_texref = self.get_flat_kernel() x.bind_to_texref_ext(x_texref, allow_double_hack=True) flat_func.prepared_call((self.num_blocks, 1), self.tail, self.interval_size, self.row_gpu.gpudata, self.col_gpu.gpudata, self.data_gpu.gpudata, y.gpudata) self.get_serial_kernel().prepared_call( (1, 1), self.nnz - self.tail, self.row_gpu[self.tail:].gpudata, self.col_gpu[self.tail:].gpudata, self.data_gpu[self.tail:].gpudata, x.gpudata, y.gpudata) return y pycuda-2013.1.1+git20140310/pycuda/sparse/__init__.py0000644000175000000500000000020712313360364020221 0ustar tomussrcfrom warnings import warn warn("pycuda.sparse is deprecated. and will be removed in 2015.x", DeprecationWarning, stacklevel=2) pycuda-2013.1.1+git20140310/pycuda/sparse/packeted.py0000644000175000000500000002670212313360364020252 0ustar tomussrcfrom __future__ import division from pytools import memoize_method import pycuda.driver as drv import pycuda.gpuarray as gpuarray from pycuda.compiler import SourceModule import numpy as np PKT_KERNEL_TEMPLATE = """ typedef %(index_type)s index_type; typedef %(value_type)s value_type; typedef %(packed_index_type)s packed_index_type; #define ROWS_PER_PACKET %(rows_per_packet)d #define THREADS_PER_PACKET %(threads_per_packet)d template __device__ void memcpy_device( ValueType *dest, const ValueType *src, const IndexType num_values) { for(unsigned int i = threadIdx.x; i < num_values; i += blockDim.x) { dest[i] = src[i]; } } #define pkt_unpack_row_index(packed_index) ( packed_index >> 16 ) #define pkt_unpack_col_index(packed_index) (packed_index & 0xFFFF) extern "C" { __global__ void spmv_pkt_kernel(const index_type *row_ptr, const index_type *pos_start, const index_type *pos_end, const packed_index_type *index_array, const value_type *data_array, const value_type *x, value_type *y) { __shared__ value_type s_x[ROWS_PER_PACKET]; // input x-values __shared__ value_type s_y[ROWS_PER_PACKET]; // output y-values const index_type thread_id = __umul24(THREADS_PER_PACKET, blockIdx.x) + threadIdx.x; // base index of the submatrix corresponding to this packet const index_type packet_base_row = row_ptr[blockIdx.x]; const index_type packet_num_rows = row_ptr[blockIdx.x+1] - packet_base_row; // copy local x and y values from global memory into shared memory memcpy_device(s_x, x + packet_base_row, packet_num_rows); memcpy_device(s_y, y + packet_base_row, packet_num_rows); __syncthreads(); // process packet const index_type packet_start = pos_start[thread_id]; const index_type packet_end = pos_end[thread_id]; for(index_type pos = packet_start; pos != packet_end; pos += THREADS_PER_PACKET) { // row and column indices are stored in the same 32-bit word const index_type packed_index = 
index_array[pos]; const index_type row = pkt_unpack_row_index(packed_index); const index_type col = pkt_unpack_col_index(packed_index); const value_type val = data_array[pos]; s_y[row] += val * s_x[col]; } __syncthreads(); // copy y-values from shared memory to global array memcpy_device(y + packet_base_row, s_y, packet_num_rows); } } """ class PacketedSpMV: def __init__(self, mat, is_symmetric, dtype): from pycuda.tools import DeviceData devdata = DeviceData() # all row indices in the data structure generation code are # "unpermuted" unless otherwise specified self.dtype = np.dtype(dtype) self.index_dtype = np.int32 self.packed_index_dtype = np.uint32 self.threads_per_packet = devdata.max_threads h, w = self.shape = mat.shape if h != w: raise ValueError("only square matrices are supported") self.rows_per_packet = (devdata.shared_memory - 100) \ // (2*self.dtype.itemsize) self.block_count = \ (h + self.rows_per_packet - 1) // self.rows_per_packet # get metis partition ------------------------------------------------- from scipy.sparse import csr_matrix csr_mat = csr_matrix(mat, dtype=self.dtype) from pymetis import part_graph if not is_symmetric: # make sure adjacency graph is undirected adj_mat = csr_mat + csr_mat.T else: adj_mat = csr_mat while True: cut_count, dof_to_packet_nr = part_graph(int(self.block_count), xadj=adj_mat.indptr, adjncy=adj_mat.indices) # build packet_nr_to_dofs packet_nr_to_dofs = {} for i, packet_nr in enumerate(dof_to_packet_nr): try: dof_packet = packet_nr_to_dofs[packet_nr] except KeyError: packet_nr_to_dofs[packet_nr] = dof_packet = [] dof_packet.append(i) packet_nr_to_dofs = [packet_nr_to_dofs.get(i) for i in range(len(packet_nr_to_dofs))] too_big = False for packet_dofs in packet_nr_to_dofs: if len(packet_dofs) >= self.rows_per_packet: too_big = True break if too_big: old_block_count = self.block_count self.block_count = int(2+1.05*self.block_count) print ("Metis produced a big block at block count " "%d--retrying with %d" % (old_block_count, self.block_count)) continue break assert len(packet_nr_to_dofs) == self.block_count # permutations, base rows --------------------------------------------- new2old_fetch_indices, \ old2new_fetch_indices, \ packet_base_rows = self.find_simple_index_stuff( packet_nr_to_dofs) # find local row cost and remaining_coo ------------------------------- local_row_costs, remaining_coo = \ self.find_local_row_costs_and_remaining_coo( csr_mat, dof_to_packet_nr, old2new_fetch_indices) local_nnz = np.sum(local_row_costs) assert remaining_coo.nnz == csr_mat.nnz - local_nnz # find thread assignment for each block ------------------------------- thread_count = len(packet_nr_to_dofs)*self.threads_per_packet thread_assignments, thread_costs = self.find_thread_assignment( packet_nr_to_dofs, local_row_costs, thread_count) max_thread_costs = np.max(thread_costs) # build data structure ------------------------------------------------ from pkt_build import build_pkt_data_structure build_pkt_data_structure(self, packet_nr_to_dofs, max_thread_costs, old2new_fetch_indices, csr_mat, thread_count, thread_assignments, local_row_costs) self.packet_base_rows = gpuarray.to_gpu(packet_base_rows) self.new2old_fetch_indices = gpuarray.to_gpu( new2old_fetch_indices) self.old2new_fetch_indices = gpuarray.to_gpu( old2new_fetch_indices) from coordinate import CoordinateSpMV self.remaining_coo_gpu = CoordinateSpMV( remaining_coo, dtype) def find_simple_index_stuff(self, packet_nr_to_dofs): new2old_fetch_indices = np.zeros( self.shape[0], dtype=self.index_dtype) 
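        # find_simple_index_stuff builds the permutation between the original
        # row ordering and the packet-contiguous ordering implied by the METIS
        # partition: new2old_fetch_indices[k] is the original row number of
        # GPU row k, old2new_fetch_indices is its inverse, and
        # packet_base_rows[p] is the first GPU row of packet p (with a final
        # sentinel entry).  permute() gathers a vector through
        # new2old_fetch_indices; unpermute() maps it back through
        # old2new_fetch_indices.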
old2new_fetch_indices = np.zeros( self.shape[0], dtype=self.index_dtype) packet_base_rows = np.zeros( self.block_count+1, dtype=self.index_dtype) row_start = 0 for packet_nr, packet in enumerate(packet_nr_to_dofs): packet_base_rows[packet_nr] = row_start row_end = row_start + len(packet) pkt_indices = np.array(packet, dtype=self.index_dtype) new2old_fetch_indices[row_start:row_end] = \ pkt_indices old2new_fetch_indices[pkt_indices] = \ np.arange(row_start, row_end, dtype=self.index_dtype) row_start += len(packet) packet_base_rows[self.block_count] = row_start return (new2old_fetch_indices, old2new_fetch_indices, packet_base_rows) def find_local_row_costs_and_remaining_coo(self, csr_mat, dof_to_packet_nr, old2new_fetch_indices): h, w = self.shape local_row_costs = [0]*h rem_coo_values = [] rem_coo_i = [] rem_coo_j = [] iptr = csr_mat.indptr indices = csr_mat.indices data = csr_mat.data for i in xrange(h): for idx in xrange(iptr[i], iptr[i+1]): j = indices[idx] if dof_to_packet_nr[i] == dof_to_packet_nr[j]: local_row_costs[i] += 1 else: rem_coo_values.append(data[idx]) rem_coo_i.append(old2new_fetch_indices[i]) rem_coo_j.append(old2new_fetch_indices[j]) from scipy.sparse import coo_matrix remaining_coo = coo_matrix( (rem_coo_values, (rem_coo_i, rem_coo_j)), self.shape, dtype=self.dtype) return local_row_costs, remaining_coo def find_thread_assignment(self, packet_nr_to_dofs, local_row_cost, thread_count): thread_assignments = [[] for i in range(thread_count)] thread_costs = np.zeros(thread_count) for packet_nr, packet_dofs in enumerate(packet_nr_to_dofs): row_costs_and_numbers = sorted( [(local_row_cost[i], i) for i in packet_dofs], reverse=True) base_thread_nr = packet_nr*self.threads_per_packet thread_offset = 0 # zigzag assignment step = 1 for row_cost, row_number in row_costs_and_numbers: ti = base_thread_nr+thread_offset thread_assignments[ti].append(row_number) thread_costs[ti] += row_cost if thread_offset + step >= self.threads_per_packet: step = -1 elif thread_offset + step < 0: step = 1 else: thread_offset += step return thread_assignments, thread_costs def build_gpu_data_structure(self, packet_nr_to_dofs, max_thread_costs, old2new_fetch_indices, csr_mat, thread_count, thread_assignments, local_row_costs): # these arrays will likely be too long, but that's ok from pkt_build import build_pkt_structure build_pkt_structure(self, packet_nr_to_dofs, thread_assignments, thread_starts, thread_ends, index_array, data_array) # copy data to the gpu ------------------------------------------------ # execution --------------------------------------------------------------- @memoize_method def get_kernel(self): from pycuda.tools import dtype_to_ctype mod = SourceModule( PKT_KERNEL_TEMPLATE % { "value_type": dtype_to_ctype(self.dtype), "index_type": dtype_to_ctype(self.index_dtype), "packed_index_type": dtype_to_ctype(self.packed_index_dtype), "threads_per_packet": self.threads_per_packet, "rows_per_packet": self.rows_per_packet, }, no_extern_c=True) func = mod.get_function("spmv_pkt_kernel") func.prepare("PPPPPPP") return func def permute(self, x): return gpuarray.take(x, self.new2old_fetch_indices) def unpermute(self, x): return gpuarray.take(x, self.old2new_fetch_indices) def __call__(self, x, y=None): if y is None: y = gpuarray.zeros(self.shape[0], dtype=self.dtype, allocator=x.allocator) self.get_kernel().prepared_call( (self.block_count, 1), (self.threads_per_packet, 1, 1), self.packet_base_rows.gpudata, self.thread_starts.gpudata, self.thread_ends.gpudata, self.index_array.gpudata, 
self.data_array.gpudata, x.gpudata, y.gpudata) self.remaining_coo_gpu(x, y) return y pycuda-2013.1.1+git20140310/pycuda/sparse/pkt_build.py0000644000175000000500000000517412313360364020447 0ustar tomussrcimport numpy as np import pycuda.gpuarray as gpuarray def build_pkt_data_structure(spmv, packet_nr_to_dofs, max_thread_costs, old2new_fetch_indices, csr_mat, thread_count, thread_assignments, local_row_costs): packet_start = 0 base_dof_nr = 0 index_array = np.zeros( max_thread_costs*thread_count, dtype=spmv.packed_index_dtype) data_array = np.zeros( max_thread_costs*thread_count, dtype=spmv.dtype) thread_starts = np.zeros( thread_count, dtype=spmv.index_dtype) thread_ends = np.zeros( thread_count, dtype=spmv.index_dtype) for packet_nr, packet_dofs in enumerate(packet_nr_to_dofs): base_thread_nr = packet_nr*spmv.threads_per_packet max_packet_items = 0 for thread_offset in range(spmv.threads_per_packet): thread_write_idx = packet_start+thread_offset thread_start = packet_start+thread_offset thread_starts[base_thread_nr+thread_offset] = thread_write_idx for row_nr in thread_assignments[base_thread_nr+thread_offset]: perm_row_nr = old2new_fetch_indices[row_nr] rel_row_nr = perm_row_nr - base_dof_nr assert 0 <= rel_row_nr < len(packet_dofs) row_entries = 0 for idx in range(csr_mat.indptr[row_nr], csr_mat.indptr[row_nr+1]): col_nr = csr_mat.indices[idx] perm_col_nr = old2new_fetch_indices[col_nr] rel_col_nr = perm_col_nr - base_dof_nr if 0 <= rel_col_nr < len(packet_dofs): index_array[thread_write_idx] = (rel_row_nr << 16) + rel_col_nr data_array[thread_write_idx] = csr_mat.data[idx] thread_write_idx += spmv.threads_per_packet row_entries += 1 assert row_entries == local_row_costs[row_nr] thread_ends[base_thread_nr+thread_offset] = thread_write_idx thread_items = (thread_write_idx - thread_start)//spmv.threads_per_packet max_packet_items = max( max_packet_items, thread_items) base_dof_nr += len(packet_dofs) packet_start += max_packet_items*spmv.threads_per_packet spmv.thread_starts = gpuarray.to_gpu(thread_starts) spmv.thread_ends = gpuarray.to_gpu(thread_ends) spmv.index_array = gpuarray.to_gpu(index_array) spmv.data_array = gpuarray.to_gpu(data_array) try: import pyximport except ImportError: pass else: pyximport.install() from pycuda.sparse.pkt_build_cython import build_pkt_data_structure pycuda-2013.1.1+git20140310/pycuda/sparse/pkt_build_cython.pyx0000644000175000000500000000536212313360364022222 0ustar tomussrcimport numpy import pycuda.gpuarray as gpuarray def build_pkt_data_structure(spmv, packet_nr_to_dofs, max_thread_costs, old2new_fetch_indices, csr_mat, thread_count, thread_assignments, local_row_costs): cdef int packet_start, base_dof_nr cdef int packet_nr cdef int max_packet_items cdef int thread_offset cdef int thread_write_idx, thread_start cdef int row_nr, rel_row_nr, perm_row_nr cdef int col_nr, rel_col_nr, perm_col_nr cdef int idx packet_start = 0 base_dof_nr = 0 index_array = numpy.zeros( max_thread_costs*thread_count, dtype=spmv.packed_index_dtype) data_array = numpy.zeros( max_thread_costs*thread_count, dtype=spmv.dtype) thread_starts = numpy.zeros( thread_count, dtype=spmv.index_dtype) thread_ends = numpy.zeros( thread_count, dtype=spmv.index_dtype) for packet_nr, packet_dofs in enumerate(packet_nr_to_dofs): base_thread_nr = packet_nr*spmv.threads_per_packet max_packet_items = 0 for thread_offset in range(spmv.threads_per_packet): thread_write_idx = packet_start+thread_offset thread_start = packet_start+thread_offset thread_starts[base_thread_nr+thread_offset] = 
thread_write_idx for row_nr in thread_assignments[base_thread_nr+thread_offset]: perm_row_nr = old2new_fetch_indices[row_nr] rel_row_nr = perm_row_nr - base_dof_nr assert 0 <= rel_row_nr < len(packet_dofs) row_entries = 0 for idx in range(csr_mat.indptr[row_nr], csr_mat.indptr[row_nr+1]): col_nr = csr_mat.indices[idx] perm_col_nr = old2new_fetch_indices[col_nr] rel_col_nr = perm_col_nr - base_dof_nr if 0 <= rel_col_nr < len(packet_dofs): index_array[thread_write_idx] = (rel_row_nr << 16) + rel_col_nr data_array[thread_write_idx] = csr_mat.data[idx] thread_write_idx += spmv.threads_per_packet row_entries += 1 assert row_entries == local_row_costs[row_nr] thread_ends[base_thread_nr+thread_offset] = thread_write_idx thread_items = (thread_write_idx - thread_start)//spmv.threads_per_packet max_packet_items = max( max_packet_items, thread_items) base_dof_nr += len(packet_dofs) packet_start += max_packet_items*spmv.threads_per_packet spmv.thread_starts = gpuarray.to_gpu(thread_starts) spmv.thread_ends = gpuarray.to_gpu(thread_ends) spmv.index_array = gpuarray.to_gpu(index_array) spmv.data_array = gpuarray.to_gpu(data_array) pycuda-2013.1.1+git20140310/pycuda/sparse/cg.py0000644000175000000500000001643312313360364017063 0ustar tomussrcfrom __future__ import division from pycuda.sparse.inner import AsyncInnerProduct from pytools import memoize_method import pycuda.gpuarray as gpuarray import numpy as np class ConvergenceError(RuntimeError): pass class CGStateContainer: def __init__(self, operator, precon=None, pagelocked_allocator=None): if precon is None: from pycuda.sparse.operator import IdentityOperator precon = IdentityOperator(operator.dtype, operator.shape[0]) self.operator = operator self.precon = precon self.pagelocked_allocator = pagelocked_allocator @memoize_method def make_lc2_kernel(self, dtype, a_is_gpu, b_is_gpu): from pycuda.elementwise import get_linear_combination_kernel return get_linear_combination_kernel(( (a_is_gpu, dtype, dtype), (b_is_gpu, dtype, dtype) ), dtype) def lc2(self, a, x, b, y, out=None): if out is None: out = gpuarray.empty(x.shape, dtype=x.dtype, allocator=x.allocator) assert x.dtype == y.dtype == out.dtype a_is_gpu = isinstance(a, gpuarray.GPUArray) b_is_gpu = isinstance(b, gpuarray.GPUArray) assert x.shape == y.shape == out.shape kernel, texrefs = self.make_lc2_kernel( x.dtype, a_is_gpu, b_is_gpu) texrefs = texrefs[:] args = [] if a_is_gpu: assert a.dtype == x.dtype assert a.shape == () a.bind_to_texref_ext(texrefs.pop(0), allow_double_hack=True) else: args.append(a) args.append(x.gpudata) if b_is_gpu: assert b.dtype == y.dtype assert b.shape == () b.bind_to_texref_ext(texrefs.pop(0), allow_double_hack=True) else: args.append(b) args.append(y.gpudata) args.append(out.gpudata) args.append(x.mem_size) kernel.prepared_call(x._grid, x._block, *args) return out @memoize_method def guarded_div_kernel(self, dtype_x, dtype_y, dtype_z): from pycuda.elementwise import get_elwise_kernel from pycuda.tools import dtype_to_ctype return get_elwise_kernel( "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = y[i] == 0 ? 
0 : (x[i] / y[i])", "divide") def guarded_div(self, a, b): from pycuda.gpuarray import _get_common_dtype result = a._new_like_me(_get_common_dtype(a, b)) assert a.shape == b.shape func = self.guarded_div_kernel(a.dtype, b.dtype, result.dtype) func.prepared_async_call(a._grid, a._block, None, a.gpudata, b.gpudata, result.gpudata, a.mem_size) return result def reset(self, rhs, x=None): self.rhs = rhs if x is None: x = np.zeros((self.operator.shape[0],)) self.x = x self.residual = rhs - self.operator(x) self.d = self.precon(self.residual) # grows at the end delta = AsyncInnerProduct(self.residual, self.d, self.pagelocked_allocator) self.real_delta_queue = [delta] self.delta = delta.gpu_result def one_iteration(self, compute_real_residual=False): # typed up from J.R. Shewchuk, # An Introduction to the Conjugate Gradient Method # Without the Agonizing Pain, Edition 1 1/4 [8/1994] # Appendix B3 q = self.operator(self.d) myip = gpuarray.dot(self.d, q) alpha = self.guarded_div(self.delta, myip) self.lc2(1, self.x, alpha, self.d, out=self.x) if compute_real_residual: self.residual = self.lc2( 1, self.rhs, -1, self.operator(self.x)) else: self.lc2(1, self.residual, -alpha, q, out=self.residual) s = self.precon(self.residual) delta_old = self.delta delta = AsyncInnerProduct(self.residual, s, self.pagelocked_allocator) self.delta = delta.gpu_result beta = self.guarded_div(self.delta, delta_old) self.lc2(1, s, beta, self.d, out=self.d) if compute_real_residual: self.real_delta_queue.append(delta) def run(self, max_iterations=None, tol=1e-7, debug_callback=None): check_interval = 20 if max_iterations is None: max_iterations = max( 3*check_interval+1, 10 * self.operator.shape[0]) real_resid_interval = min(self.operator.shape[0], 50) iterations = 0 delta_0 = None while iterations < max_iterations: compute_real_residual = \ iterations % real_resid_interval == 0 self.one_iteration( compute_real_residual=compute_real_residual) if debug_callback is not None: if compute_real_residual: what = "it+residual" else: what = "it" debug_callback(what, iterations, self.x, self.residual, self.d, self.delta) # do often enough to allow AsyncInnerProduct # to progress through (polled) event chain rdq = self.real_delta_queue if iterations % check_interval == 0: if delta_0 is None: delta_0 = rdq[0].get_host_result() if delta_0 is not None: rdq.pop(0) if delta_0 is not None: i = 0 while i < len(rdq): delta = rdq[i].get_host_result() if delta is not None: if abs(delta) < tol*tol * abs(delta_0): if debug_callback is not None: debug_callback("end", iterations, self.x, self.residual, self.d, self.delta) return self.x rdq.pop(i) else: i += 1 iterations += 1 raise ConvergenceError("cg failed to converge") def solve_pkt_with_cg(pkt_spmv, b, precon=None, x=None, tol=1e-7, max_iterations=None, debug=False, pagelocked_allocator=None): if x is None: x = gpuarray.zeros(pkt_spmv.shape[0], dtype=pkt_spmv.dtype, allocator=b.allocator) else: x = pkt_spmv.permute(x) if pagelocked_allocator is None: pagelocked_allocator = drv.pagelocked_empty cg = CGStateContainer(pkt_spmv, precon, pagelocked_allocator=pagelocked_allocator) cg.reset(pkt_spmv.permute(b), x) it_count = [0] res_count = [0] def debug_callback(what, it_number, x, resid, d, delta): if what == "it": it_count[0] += 1 elif what == "it+residual": res_count[0] += 1 it_count[0] += 1 result = cg.run(max_iterations, tol, debug_callback=debug_callback) return pkt_spmv.unpermute(result), it_count[0], res_count[0] 
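A minimal end-to-end sketch of how the conjugate-gradient driver above might be exercised, under stated assumptions: the packeted SpMV wrapper is assumed to live in pycuda.sparse.packeted as PacketedSpMV with a (matrix, is_symmetric, dtype) constructor (its packet partitioning may additionally require pymetis); the 1-D Poisson test matrix is an illustrative choice; and the allocator is wrapped explicitly because inner.py invokes it as allocator(shape, dtype, stream) while the default in solve_pkt_with_cg refers to drv.pagelocked_empty without pycuda.driver being imported in cg.py as shown.

# Hypothetical driver script -- a sketch, not part of the original sources.
import numpy as np
import scipy.sparse as sp

import pycuda.autoinit  # noqa: establishes a CUDA context
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

from pycuda.sparse.packeted import PacketedSpMV  # assumed module path
from pycuda.sparse.cg import solve_pkt_with_cg

# Symmetric positive definite test system: 1-D Poisson matrix in CSR format.
n = 1024
main_diag = 2.0 * np.ones(n, dtype=np.float32)
off_diag = -1.0 * np.ones(n - 1, dtype=np.float32)
csr_mat = sp.diags([off_diag, main_diag, off_diag], [-1, 0, 1],
                   format="csr", dtype=np.float32)
b_host = np.random.randn(n).astype(np.float32)

# Constructor signature assumed to be (matrix, is_symmetric, dtype).
spmv = PacketedSpMV(csr_mat, True, csr_mat.dtype)
b = gpuarray.to_gpu(b_host)

# inner.py calls the allocator as allocator(shape, dtype, stream); wrap
# pagelocked_empty accordingly rather than relying on cg.py's default.
x, n_it, n_resid = solve_pkt_with_cg(
    spmv, b, tol=1e-6,
    pagelocked_allocator=lambda shape, dtype, stream: drv.pagelocked_empty(shape, dtype))

x_host = x.get()
print("converged in %d iterations (%d residual recomputations)" % (n_it, n_resid))
print("max abs residual: %g" % np.max(np.abs(csr_mat * x_host - b_host)))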
pycuda-2013.1.1+git20140310/pycuda/sparse/operator.py0000644000175000000500000000157112313360364020322 0ustar tomussrcclass OperatorBase(object): @property def dtype(self): raise NotImplementedError @property def shape(self): raise NotImplementedError def __neg__(self): return NegOperator(self) class IdentityOperator(OperatorBase): def __init__(self, dtype, n): self.my_dtype = dtype self.n = n @property def dtype(self): return self.my_dtype @property def shape(self): return self.n, self.n def __call__(self, operand): return operand class DiagonalPreconditioner(OperatorBase): def __init__(self, diagonal): self.diagonal = diagonal @property def dtype(self): return self.diagonal.dtype @property def shape(self): n = self.diagonal.shape[0] return n, n def __call__(self, operand): return self.diagonal*operand pycuda-2013.1.1+git20140310/pycuda/sparse/inner.py0000644000175000000500000000255412313360364017604 0ustar tomussrcfrom __future__ import division import pycuda.driver as drv import pycuda.gpuarray as gpuarray STREAM_POOL = [] def get_stream(): if STREAM_POOL: return STREAM_POOL.pop() else: return drv.Stream() class AsyncInnerProduct: def __init__(self, a, b, pagelocked_allocator): self.gpu_result = gpuarray.dot(a, b) self.gpu_finished_evt = drv.Event() self.gpu_finished_evt.record() self.gpu_finished = False self.pagelocked_allocator = pagelocked_allocator def get_host_result(self): if not self.gpu_finished: if self.gpu_finished_evt.query(): self.gpu_finished = True self.copy_stream = get_stream() self.host_dest = self.pagelocked_allocator( self.gpu_result.shape, self.gpu_result.dtype, self.copy_stream) drv.memcpy_dtoh_async(self.host_dest, self.gpu_result.gpudata, self.copy_stream) self.copy_finished_evt = drv.Event() self.copy_finished_evt.record() else: if self.copy_finished_evt.query(): STREAM_POOL.append(self.copy_stream) return self.host_dest def _at_exit(): STREAM_POOL[:] = [] import atexit atexit.register(_at_exit) pycuda-2013.1.1+git20140310/pycuda/driver.py0000644000175000000500000005532412313360364016472 0ustar tomussrctry: from pycuda._driver import * # noqa except ImportError, e: if "_v2" in str(e): from warnings import warn warn("Failed to import the CUDA driver interface, with an error " "message indicating that the version of your CUDA header " "does not match the version of your CUDA driver.") raise import numpy as np try: ManagedAllocationOrStub = ManagedAllocation except NameError: # Provide ManagedAllocationOrStub if not on CUDA 6. # This avoids having to do a version check in a high-traffic code path below. 
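    # _build_arg_buf below checks isinstance(arg.base, ManagedAllocationOrStub)
    # to pass managed (unified-memory) ndarrays by pointer; with this stub in
    # place that check simply never matches on pre-CUDA-6 installations.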
class ManagedAllocationOrStub(object): pass CUDA_DEBUGGING = False def set_debugging(flag=True): global CUDA_DEBUGGING CUDA_DEBUGGING = flag class CompileError(Error): def __init__(self, msg, command_line, stdout=None, stderr=None): self.msg = msg self.command_line = command_line self.stdout = stdout self.stderr = stderr def __str__(self): result = self.msg if self.command_line: try: result += "\n[command: %s]" % (" ".join(self.command_line)) except Exception, e: print e if self.stdout: result += "\n[stdout:\n%s]" % self.stdout if self.stderr: result += "\n[stderr:\n%s]" % self.stderr return result class ArgumentHandler(object): def __init__(self, ary): self.array = ary self.dev_alloc = None def get_device_alloc(self): if self.dev_alloc is None: self.dev_alloc = mem_alloc_like(self.array) return self.dev_alloc def pre_call(self, stream): pass class In(ArgumentHandler): def pre_call(self, stream): if stream is not None: memcpy_htod(self.get_device_alloc(), self.array) else: memcpy_htod(self.get_device_alloc(), self.array) class Out(ArgumentHandler): def post_call(self, stream): if stream is not None: memcpy_dtoh(self.array, self.get_device_alloc()) else: memcpy_dtoh(self.array, self.get_device_alloc()) class InOut(In, Out): pass def _add_functionality(): def device_get_attributes(dev): result = {} for att_name in dir(device_attribute): if not att_name[0].isupper(): continue att_id = getattr(device_attribute, att_name) try: att_value = dev.get_attribute(att_id) except LogicError, e: from warnings import warn warn("CUDA driver raised '%s' when querying '%s' on '%s'" % (e, att_name, dev)) else: result[att_id] = att_value return result def device___getattr__(dev, name): return dev.get_attribute(getattr(device_attribute, name.upper())) def _build_arg_buf(args): handlers = [] arg_data = [] format = "" for i, arg in enumerate(args): if isinstance(arg, np.number): arg_data.append(arg) format += arg.dtype.char elif isinstance(arg, (DeviceAllocation, PooledDeviceAllocation)): arg_data.append(int(arg)) format += "P" elif isinstance(arg, ArgumentHandler): handlers.append(arg) arg_data.append(int(arg.get_device_alloc())) format += "P" elif isinstance(arg, np.ndarray): if isinstance(arg.base, ManagedAllocationOrStub): arg_data.append(int(arg.base)) format += "P" else: arg_data.append(arg) format += "%ds" % arg.nbytes else: try: gpudata = np.intp(arg.gpudata) except AttributeError: raise TypeError("invalid type on parameter #%d (0-based)" % i) else: # for gpuarrays arg_data.append(int(gpudata)) format += "P" from pycuda._pvt_struct import pack return handlers, pack(format, *arg_data) # {{{ pre-CUDA 4 call interface (stateful) def function_param_set_pre_v4(func, *args): handlers = [] handlers, buf = _build_arg_buf(args) func._param_setv(0, buf) func._param_set_size(len(buf)) return handlers def function_call_pre_v4(func, *args, **kwargs): grid = kwargs.pop("grid", (1, 1)) stream = kwargs.pop("stream", None) block = kwargs.pop("block", None) shared = kwargs.pop("shared", None) texrefs = kwargs.pop("texrefs", []) time_kernel = kwargs.pop("time_kernel", False) if kwargs: raise ValueError( "extra keyword arguments: %s" % (",".join(kwargs.iterkeys()))) if block is None: raise ValueError("must specify block size") func._set_block_shape(*block) handlers = func._param_set(*args) if shared is not None: func._set_shared_size(shared) for handler in handlers: handler.pre_call(stream) for texref in texrefs: func.param_set_texref(texref) post_handlers = [handler for handler in handlers if hasattr(handler, "post_call")] 
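    # Out/InOut argument handlers define post_call, which copies results back
    # from the device after the launch; in the synchronous path below the
    # context is synchronized first whenever any post handlers are present.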
if stream is None: if time_kernel: Context.synchronize() from time import time start_time = time() func._launch_grid(*grid) if post_handlers or time_kernel: Context.synchronize() if time_kernel: run_time = time()-start_time for handler in post_handlers: handler.post_call(stream) if time_kernel: return run_time else: assert not time_kernel, \ "Can't time the kernel on an asynchronous invocation" func._launch_grid_async(grid[0], grid[1], stream) if post_handlers: for handler in post_handlers: handler.post_call(stream) def function_prepare_pre_v4(func, arg_types, block=None, shared=None, texrefs=[]): from warnings import warn if block is not None: warn("setting the block size in Function.prepare is deprecated", DeprecationWarning, stacklevel=2) func._set_block_shape(*block) if shared is not None: warn("setting the shared memory size in Function.prepare is deprecated", DeprecationWarning, stacklevel=2) func._set_shared_size(shared) func.texrefs = texrefs func.arg_format = "" for i, arg_type in enumerate(arg_types): if (isinstance(arg_type, type) and np is not None and np.number in arg_type.__mro__): func.arg_format += np.dtype(arg_type).char elif isinstance(arg_type, str): func.arg_format += arg_type else: func.arg_format += np.dtype(np.intp).char from pycuda._pvt_struct import calcsize func._param_set_size(calcsize(func.arg_format)) return func def function_prepared_call_pre_v4(func, grid, block, *args, **kwargs): if isinstance(block, tuple): func._set_block_shape(*block) else: from warnings import warn warn("Not passing the block size to prepared_call is deprecated as of " "version 2011.1.", DeprecationWarning, stacklevel=2) args = (block,) + args shared_size = kwargs.pop("shared_size", None) if shared_size is not None: func._set_shared_size(shared_size) if kwargs: raise TypeError("unknown keyword arguments: " + ", ".join(kwargs.iterkeys())) from pycuda._pvt_struct import pack func._param_setv(0, pack(func.arg_format, *args)) for texref in func.texrefs: func.param_set_texref(texref) func._launch_grid(*grid) def function_prepared_timed_call_pre_v4(func, grid, block, *args, **kwargs): if isinstance(block, tuple): func._set_block_shape(*block) else: from warnings import warn warn("Not passing the block size to prepared_timed_call is " "deprecated as of version 2011.1.", DeprecationWarning, stacklevel=2) args = (block,) + args shared_size = kwargs.pop("shared_size", None) if shared_size is not None: func._set_shared_size(shared_size) if kwargs: raise TypeError("unknown keyword arguments: " + ", ".join(kwargs.iterkeys())) from pycuda._pvt_struct import pack func._param_setv(0, pack(func.arg_format, *args)) for texref in func.texrefs: func.param_set_texref(texref) start = Event() end = Event() start.record() func._launch_grid(*grid) end.record() def get_call_time(): end.synchronize() return end.time_since(start)*1e-3 return get_call_time def function_prepared_async_call_pre_v4(func, grid, block, stream, *args, **kwargs): if isinstance(block, tuple): func._set_block_shape(*block) else: from warnings import warn warn("Not passing the block size to prepared_async_call is " "deprecated as of version 2011.1.", DeprecationWarning, stacklevel=2) args = (stream,) + args stream = block shared_size = kwargs.pop("shared_size", None) if shared_size is not None: func._set_shared_size(shared_size) if kwargs: raise TypeError("unknown keyword arguments: " + ", ".join(kwargs.iterkeys())) from pycuda._pvt_struct import pack func._param_setv(0, pack(func.arg_format, *args)) for texref in func.texrefs: 
func.param_set_texref(texref) if stream is None: func._launch_grid(*grid) else: grid_x, grid_y = grid func._launch_grid_async(grid_x, grid_y, stream) # }}} # {{{ CUDA 4+ call interface (stateless) def function_call(func, *args, **kwargs): grid = kwargs.pop("grid", (1, 1)) stream = kwargs.pop("stream", None) block = kwargs.pop("block", None) shared = kwargs.pop("shared", 0) texrefs = kwargs.pop("texrefs", []) time_kernel = kwargs.pop("time_kernel", False) if kwargs: raise ValueError( "extra keyword arguments: %s" % (",".join(kwargs.iterkeys()))) if block is None: raise ValueError("must specify block size") func._set_block_shape(*block) handlers, arg_buf = _build_arg_buf(args) for handler in handlers: handler.pre_call(stream) for texref in texrefs: func.param_set_texref(texref) post_handlers = [handler for handler in handlers if hasattr(handler, "post_call")] if stream is None: if time_kernel: Context.synchronize() from time import time start_time = time() func._launch_kernel(grid, block, arg_buf, shared, None) if post_handlers or time_kernel: Context.synchronize() if time_kernel: run_time = time()-start_time for handler in post_handlers: handler.post_call(stream) if time_kernel: return run_time else: assert not time_kernel, \ "Can't time the kernel on an asynchronous invocation" func._launch_kernel(grid, block, arg_buf, shared, stream) if post_handlers: for handler in post_handlers: handler.post_call(stream) def function_prepare(func, arg_types, texrefs=[]): func.texrefs = texrefs func.arg_format = "" for i, arg_type in enumerate(arg_types): if (isinstance(arg_type, type) and np.number in arg_type.__mro__): func.arg_format += np.dtype(arg_type).char elif isinstance(arg_type, np.dtype): if arg_type.char == "V": func.arg_format += "%ds" % arg_type.itemsize else: func.arg_format += arg_type.char elif isinstance(arg_type, str): func.arg_format += arg_type else: func.arg_format += np.dtype(np.intp).char return func def function_prepared_call(func, grid, block, *args, **kwargs): if isinstance(block, tuple): func._set_block_shape(*block) else: from warnings import warn warn("Not passing the block size to prepared_call is deprecated as of " "version 2011.1.", DeprecationWarning, stacklevel=2) args = (block,) + args shared_size = kwargs.pop("shared_size", 0) if kwargs: raise TypeError("unknown keyword arguments: " + ", ".join(kwargs.iterkeys())) from pycuda._pvt_struct import pack arg_buf = pack(func.arg_format, *args) for texref in func.texrefs: func.param_set_texref(texref) func._launch_kernel(grid, block, arg_buf, shared_size, None) def function_prepared_timed_call(func, grid, block, *args, **kwargs): shared_size = kwargs.pop("shared_size", 0) if kwargs: raise TypeError("unknown keyword arguments: " + ", ".join(kwargs.iterkeys())) from pycuda._pvt_struct import pack arg_buf = pack(func.arg_format, *args) for texref in func.texrefs: func.param_set_texref(texref) start = Event() end = Event() start.record() func._launch_kernel(grid, block, arg_buf, shared_size, None) end.record() def get_call_time(): end.synchronize() return end.time_since(start)*1e-3 return get_call_time def function_prepared_async_call(func, grid, block, stream, *args, **kwargs): if isinstance(block, tuple): func._set_block_shape(*block) else: from warnings import warn warn("Not passing the block size to prepared_async_call is " "deprecated as of version 2011.1.", DeprecationWarning, stacklevel=2) args = (stream,) + args stream = block shared_size = kwargs.pop("shared_size", 0) if kwargs: raise TypeError("unknown keyword 
arguments: " + ", ".join(kwargs.iterkeys())) from pycuda._pvt_struct import pack arg_buf = pack(func.arg_format, *args) for texref in func.texrefs: func.param_set_texref(texref) func._launch_kernel(grid, block, arg_buf, shared_size, stream) # }}} def function___getattr__(self, name): if get_version() >= (2, 2): return self.get_attribute(getattr(function_attribute, name.upper())) else: if name == "num_regs": return self._hacky_registers elif name == "shared_size_bytes": return self._hacky_smem elif name == "local_size_bytes": return self._hacky_lmem else: raise AttributeError("no attribute '%s' in Function" % name) def mark_func_method_deprecated(func): def new_func(*args, **kwargs): from warnings import warn warn("'%s' has been deprecated in version 2011.1. Please use " "the stateless launch interface instead." % func.__name__[1:], DeprecationWarning, stacklevel=2) return func(*args, **kwargs) try: from functools import update_wrapper except ImportError: pass else: try: update_wrapper(new_func, func) except: # User won't see true signature. Oh well. pass return new_func Device.get_attributes = device_get_attributes Device.__getattr__ = device___getattr__ if get_version() >= (4,): Function.__call__ = function_call Function.prepare = function_prepare Function.prepared_call = function_prepared_call Function.prepared_timed_call = function_prepared_timed_call Function.prepared_async_call = function_prepared_async_call else: Function._param_set = function_param_set_pre_v4 Function.__call__ = function_call_pre_v4 Function.prepare = function_prepare_pre_v4 Function.prepared_call = function_prepared_call_pre_v4 Function.prepared_timed_call = function_prepared_timed_call_pre_v4 Function.prepared_async_call = function_prepared_async_call_pre_v4 for meth_name in ["set_block_shape", "set_shared_size", "param_set_size", "param_set", "param_seti", "param_setf", "param_setv", "launch", "launch_grid", "launch_grid_async"]: setattr(Function, meth_name, mark_func_method_deprecated( getattr(Function, "_"+meth_name))) Function.__getattr__ = function___getattr__ _add_functionality() # {{{ pagelocked numpy arrays def pagelocked_zeros(shape, dtype, order="C", mem_flags=0): result = pagelocked_empty(shape, dtype, order, mem_flags) result.fill(0) return result def pagelocked_empty_like(array, mem_flags=0): if array.flags.c_contiguous: order = "C" elif array.flags.f_contiguous: order = "F" else: raise ValueError("could not detect array order") return pagelocked_empty(array.shape, array.dtype, order, mem_flags) def pagelocked_zeros_like(array, mem_flags=0): result = pagelocked_empty_like(array, mem_flags) result.fill(0) return result # }}} # {{{ aligned numpy arrays def aligned_zeros(shape, dtype, order="C", alignment=4096): result = aligned_empty(shape, dtype, order, alignment) result.fill(0) return result def aligned_empty_like(array, alignment=4096): if array.flags.c_contiguous: order = "C" elif array.flags.f_contiguous: order = "F" else: raise ValueError("could not detect array order") return aligned_empty(array.shape, array.dtype, order, alignment) def aligned_zeros_like(array, alignment=4096): result = aligned_empty_like(array, alignment) result.fill(0) return result # }}} # {{{ managed numpy arrays (CUDA Unified Memory) def managed_zeros(shape, dtype, order="C", mem_flags=0): result = managed_empty(shape, dtype, order, mem_flags) result.fill(0) return result def managed_empty_like(array, mem_flags=0): if array.flags.c_contiguous: order = "C" elif array.flags.f_contiguous: order = "F" else: raise 
ValueError("could not detect array order") return managed_empty(array.shape, array.dtype, order, mem_flags) def managed_zeros_like(array, mem_flags=0): result = pagelocked_empty_like(array, mem_flags) result.fill(0) return result # }}} def mem_alloc_like(ary): return mem_alloc(ary.nbytes) # {{{ array handling def dtype_to_array_format(dtype): if dtype == np.uint8: return array_format.UNSIGNED_INT8 elif dtype == np.uint16: return array_format.UNSIGNED_INT16 elif dtype == np.uint32: return array_format.UNSIGNED_INT32 elif dtype == np.int8: return array_format.SIGNED_INT8 elif dtype == np.int16: return array_format.SIGNED_INT16 elif dtype == np.int32: return array_format.SIGNED_INT32 elif dtype == np.float32: return array_format.FLOAT else: raise TypeError( "cannot convert dtype '%s' to array format" % dtype) def matrix_to_array(matrix, order, allow_double_hack=False): if order.upper() == "C": h, w = matrix.shape stride = 0 elif order.upper() == "F": w, h = matrix.shape stride = -1 else: raise LogicError("order must be either F or C") matrix = np.asarray(matrix, order=order) descr = ArrayDescriptor() descr.width = w descr.height = h if matrix.dtype == np.float64 and allow_double_hack: descr.format = array_format.SIGNED_INT32 descr.num_channels = 2 else: descr.format = dtype_to_array_format(matrix.dtype) descr.num_channels = 1 ary = Array(descr) copy = Memcpy2D() copy.set_src_host(matrix) copy.set_dst_array(ary) copy.width_in_bytes = copy.src_pitch = copy.dst_pitch = \ matrix.strides[stride] copy.height = h copy(aligned=True) return ary def make_multichannel_2d_array(ndarray, order): """Channel count has to be the first dimension of the C{ndarray}.""" descr = ArrayDescriptor() if order.upper() == "C": h, w, num_channels = ndarray.shape stride = 0 elif order.upper() == "F": num_channels, w, h = ndarray.shape stride = 2 else: raise LogicError("order must be either F or C") descr.width = w descr.height = h descr.format = dtype_to_array_format(ndarray.dtype) descr.num_channels = num_channels ary = Array(descr) copy = Memcpy2D() copy.set_src_host(ndarray) copy.set_dst_array(ary) copy.width_in_bytes = copy.src_pitch = copy.dst_pitch = \ ndarray.strides[stride] copy.height = h copy(aligned=True) return ary def bind_array_to_texref(ary, texref): texref.set_array(ary) texref.set_address_mode(0, address_mode.CLAMP) texref.set_address_mode(1, address_mode.CLAMP) texref.set_filter_mode(filter_mode.POINT) # }}} def matrix_to_texref(matrix, texref, order): bind_array_to_texref(matrix_to_array(matrix, order), texref) # {{{ device copies def to_device(bf_obj): import sys if sys.version_info >= (2, 7): bf = memoryview(bf_obj).tobytes() else: bf = buffer(bf_obj) result = mem_alloc(len(bf)) memcpy_htod(result, bf) return result def from_device(devptr, shape, dtype, order="C"): result = np.empty(shape, dtype, order) memcpy_dtoh(result, devptr) return result def from_device_like(devptr, other_ary): result = np.empty_like(other_ary) memcpy_dtoh(result, devptr) return result # }}} # vim: fdm=marker pycuda-2013.1.1+git20140310/pycuda/_cluda.py0000644000175000000500000000113112313360364016411 0ustar tomussrcCLUDA_PREAMBLE = """ #define local_barrier() __syncthreads(); #define WITHIN_KERNEL __device__ #define KERNEL extern "C" __global__ #define GLOBAL_MEM /* empty */ #define LOCAL_MEM __shared__ #define LOCAL_MEM_ARG /* empty */ #define REQD_WG_SIZE(X,Y,Z) __launch_bounds__(X*Y*Z, 1) #define LID_0 threadIdx.x #define LID_1 threadIdx.y #define LID_2 threadIdx.z #define GID_0 blockIdx.x #define GID_1 blockIdx.y #define 
GID_2 blockIdx.z #define LDIM_0 blockDim.x #define LDIM_1 blockDim.y #define LDIM_2 blockDim.z #define GDIM_0 gridDim.x #define GDIM_1 gridDim.y #define GDIM_2 gridDim.z """ pycuda-2013.1.1+git20140310/examples/0002755000175000000500000000000012313360366015151 5ustar tomussrcpycuda-2013.1.1+git20140310/examples/demo.py0000644000175000000500000000221512313360364016443 0ustar tomussrc# Sample source code from the Tutorial Introduction in the documentation. import pycuda.driver as cuda import pycuda.autoinit # noqa from pycuda.compiler import SourceModule import numpy a = numpy.random.randn(4, 4) a = a.astype(numpy.float32) a_gpu = cuda.mem_alloc(a.size * a.dtype.itemsize) cuda.memcpy_htod(a_gpu, a) mod = SourceModule(""" __global__ void doublify(float *a) { int idx = threadIdx.x + threadIdx.y*4; a[idx] *= 2; } """) func = mod.get_function("doublify") func(a_gpu, block=(4, 4, 1), grid=(1, 1), shared=0) a_doubled = numpy.empty_like(a) cuda.memcpy_dtoh(a_doubled, a_gpu) print "original array:" print a print "doubled with kernel:" print a_doubled # alternate kernel invocation ------------------------------------------------- func(cuda.InOut(a), block=(4, 4, 1)) print "doubled with InOut:" print a # part 2 ---------------------------------------------------------------------- import pycuda.gpuarray as gpuarray a_gpu = gpuarray.to_gpu(numpy.random.randn(4, 4).astype(numpy.float32)) a_doubled = (2*a_gpu).get() print "original array:" print a_gpu print "doubled with gpuarray:" print a_doubled pycuda-2013.1.1+git20140310/examples/fill_gpu_with_nans.py0000644000175000000500000000106012313360364021367 0ustar tomussrcimport pycuda.autoinit import pycuda.gpuarray as gpuarray import pycuda.driver as cuda import numpy free_bytes, total_bytes = cuda.mem_get_info() exp = 10 while True: fill_floats = free_bytes / 4 - (1<datalen; idx += blockDim.x) { float *a_ptr = a->ptr; a_ptr[idx] *= 2; } } """) func = mod.get_function("double_array") func(struct_arr, block = (32, 1, 1), grid=(2, 1)) print "doubled arrays" print array1 print array2 func(numpy.intp(do2_ptr), block = (32, 1, 1), grid=(1, 1)) print "doubled second only" print array1 print array2 func.prepare("P", block=(32, 1, 1)) func.prepared_call((2, 1), struct_arr) print "doubled again" print array1 print array2 func.prepared_call((1, 1), do2_ptr) print "doubled second only again" print array1 print array2 pycuda-2013.1.1+git20140310/README.rst0000644000175000000500000000271112313360364015017 0ustar tomussrcPyCUDA lets you access `Nvidia `_'s `CUDA `_ parallel computation API from Python. Several wrappers of the CUDA API already exist-so what's so special about PyCUDA? .. image:: https://badge.fury.io/py/pycuda.png :target: http://pypi.python.org/pypi/pycuda * Object cleanup tied to lifetime of objects. This idiom, often called `RAII `_ in C++, makes it much easier to write correct, leak- and crash-free code. PyCUDA knows about dependencies, too, so (for example) it won't detach from a context before all memory allocated in it is also freed. * Convenience. Abstractions like pycuda.driver.SourceModule and pycuda.gpuarray.GPUArray make CUDA programming even more convenient than with Nvidia's C-based runtime. * Completeness. PyCUDA puts the full power of CUDA's driver API at your disposal, if you wish. It also includes code for interoperability with OpenGL. * Automatic Error Checking. All CUDA errors are automatically translated into Python exceptions. * Speed. PyCUDA's base layer is written in C++, so all the niceties above are virtually free. 
* Helpful `Documentation `_ and a `Wiki `_. Relatedly, like-minded computing goodness for `OpenCL `_ is provided by PyCUDA's sister project `PyOpenCL `_. pycuda-2013.1.1+git20140310/test/0002755000175000000500000000000012313360364014310 5ustar tomussrcpycuda-2013.1.1+git20140310/test/test_cumath.py0000644000175000000500000002127212313360364017204 0ustar tomussrcfrom __future__ import division import math import numpy as np from pycuda.tools import mark_cuda_test def have_pycuda(): try: import pycuda # noqa return True except: return False if have_pycuda(): import pycuda.gpuarray as gpuarray import pycuda.driver as drv # noqa import pycuda.cumath as cumath sizes = [10, 128, 1024, 1 << 10, 1 << 13] dtypes = [np.float32, np.float64] complex_dtypes = [np.complex64, np.complex128] numpy_func_names = { "asin": "arcsin", "acos": "arccos", "atan": "arctan", } def make_unary_function_test(name, a=0, b=1, threshold=0, complex=False): def test(): gpu_func = getattr(cumath, name) cpu_func = getattr(np, numpy_func_names.get(name, name)) if complex: _dtypes = complex_dtypes else: _dtypes = dtypes for s in sizes: for dtype in _dtypes: np.random.seed(1) A = (np.random.random(s)*(b-a) + a).astype(dtype) if complex: A += (np.random.random(s)*(b-a) + a)*1j args = gpuarray.to_gpu(A) gpu_results = gpu_func(args).get() cpu_results = cpu_func(A) max_err = np.max(np.abs(cpu_results - gpu_results)) assert (max_err <= threshold).all(), \ (max_err, name, dtype) gpu_results2 = gpuarray.empty_like(args) gr2 = gpu_func(args, out=gpu_results2) assert gpu_results2 is gr2 gr2 = gr2.get() max_err = np.max(np.abs(cpu_results - gr2)) assert (max_err <= threshold).all(), \ (max_err, name, dtype) return mark_cuda_test(test) if have_pycuda(): test_ceil = make_unary_function_test("ceil", -10, 10) test_floor = make_unary_function_test("ceil", -10, 10) test_fabs = make_unary_function_test("fabs", -10, 10) test_exp = make_unary_function_test("exp", -3, 3, 1e-5) test_exp_c = make_unary_function_test("exp", -3, 3, 1e-5, complex=True) test_log = make_unary_function_test("log", 1e-5, 1, 5e-7) test_log10 = make_unary_function_test("log10", 1e-5, 1, 3e-7) test_sqrt = make_unary_function_test("sqrt", 1e-5, 1, 2e-7) test_sin = make_unary_function_test("sin", -10, 10, 1e-7) test_sin_c = make_unary_function_test("sin", -3, 3, 2e-6, complex=True) test_cos = make_unary_function_test("cos", -10, 10, 1e-7) test_cos_c = make_unary_function_test("cos", -3, 3, 2e-6, complex=True) test_asin = make_unary_function_test("asin", -0.9, 0.9, 5e-7) #test_sin_c = make_unary_function_test("sin", -0.9, 0.9, 2e-6, complex=True) test_acos = make_unary_function_test("acos", -0.9, 0.9, 5e-7) #test_acos_c = make_unary_function_test("acos", -0.9, 0.9, 2e-6, complex=True) test_tan = make_unary_function_test("tan", -math.pi/2 + 0.1, math.pi/2 - 0.1, 1e-5) test_tan_c = make_unary_function_test("tan", -math.pi/2 + 0.1, math.pi/2 - 0.1, 3e-5, complex=True) test_atan = make_unary_function_test("atan", -10, 10, 2e-7) test_sinh = make_unary_function_test("sinh", -3, 3, 2e-6) test_sinh_c = make_unary_function_test("sinh", -3, 3, 2e-6, complex=True) test_cosh = make_unary_function_test("cosh", -3, 3, 2e-6) test_cosh_c = make_unary_function_test("cosh", -3, 3, 2e-6, complex=True) test_tanh = make_unary_function_test("tanh", -3, 3, 2e-6) test_tanh_c = make_unary_function_test("tanh", -math.pi/2 + 0.1, math.pi/2 - 0.1, 3e-5, complex=True) class TestMath: disabled = not have_pycuda() @mark_cuda_test def test_fmod(self): """tests if the fmod function works""" for s in sizes: a = 
gpuarray.arange(s, dtype=np.float32)/10 a2 = gpuarray.arange(s, dtype=np.float32)/45.2 + 0.1 b = cumath.fmod(a, a2) a = a.get() a2 = a2.get() b = b.get() for i in range(s): assert math.fmod(a[i], a2[i]) == b[i] @mark_cuda_test def test_ldexp(self): """tests if the ldexp function works""" for s in sizes: a = gpuarray.arange(s, dtype=np.float32) a2 = gpuarray.arange(s, dtype=np.float32)*1e-3 b = cumath.ldexp(a, a2) a = a.get() a2 = a2.get() b = b.get() for i in range(s): assert math.ldexp(a[i], int(a2[i])) == b[i] @mark_cuda_test def test_modf(self): """tests if the modf function works""" for s in sizes: a = gpuarray.arange(s, dtype=np.float32)/10 fracpart, intpart = cumath.modf(a) a = a.get() intpart = intpart.get() fracpart = fracpart.get() for i in range(s): fracpart_true, intpart_true = math.modf(a[i]) assert intpart_true == intpart[i] assert abs(fracpart_true - fracpart[i]) < 1e-4 @mark_cuda_test def test_frexp(self): """tests if the frexp function works""" for s in sizes: a = gpuarray.arange(s, dtype=np.float32)/10 significands, exponents = cumath.frexp(a) a = a.get() significands = significands.get() exponents = exponents.get() for i in range(s): sig_true, ex_true = math.frexp(a[i]) assert sig_true == significands[i] assert ex_true == exponents[i] @mark_cuda_test def test_unary_func_kwargs(self): """tests if the kwargs to the unary functions work""" from pycuda.driver import Stream name, a, b, threshold = ("exp", -3, 3, 1e-5) gpu_func = getattr(cumath, name) cpu_func = getattr(np, numpy_func_names.get(name, name)) for s in sizes: for dtype in dtypes: np.random.seed(1) A = (np.random.random(s)*(b-a) + a).astype(dtype) if complex: A += (np.random.random(s)*(b-a) + a)*1j np.random.seed(1) A = (np.random.random(s)*(b-a) + a).astype(dtype) args = gpuarray.to_gpu(A) # 'out' kw gpu_results = gpuarray.empty_like(args) gpu_results = gpu_func(args, out=gpu_results).get() cpu_results = cpu_func(A) max_err = np.max(np.abs(cpu_results - gpu_results)) assert (max_err <= threshold).all(), (max_err, name, dtype) # 'out' position gpu_results = gpuarray.empty_like(args) gpu_results = gpu_func(args, gpu_results).get() cpu_results = cpu_func(A) max_err = np.max(np.abs(cpu_results - gpu_results)) assert (max_err <= threshold).all(), (max_err, name, dtype) # 'stream' kw mystream = Stream() np.random.seed(1) A = (np.random.random(s)*(b-a) + a).astype(dtype) args = gpuarray.to_gpu(A) gpu_results = gpuarray.empty_like(args) gpu_results = gpu_func(args, stream=mystream).get() cpu_results = cpu_func(A) max_err = np.max(np.abs(cpu_results - gpu_results)) assert (max_err <= threshold).all(), (max_err, name, dtype) # 'stream' position mystream = Stream() np.random.seed(1) A = (np.random.random(s)*(b-a) + a).astype(dtype) args = gpuarray.to_gpu(A) gpu_results = gpuarray.empty_like(args) gpu_results = gpu_func(args, mystream).get() cpu_results = cpu_func(A) max_err = np.max(np.abs(cpu_results - gpu_results)) assert (max_err <= threshold).all(), (max_err, name, dtype) # 'out' and 'stream' kw mystream = Stream() np.random.seed(1) A = (np.random.random(s)*(b-a) + a).astype(dtype) args = gpuarray.to_gpu(A) gpu_results = gpuarray.empty_like(args) gpu_results = gpu_func(args, stream=mystream, out=gpu_results).get() cpu_results = cpu_func(A) max_err = np.max(np.abs(cpu_results - gpu_results)) assert (max_err <= threshold).all(), (max_err, name, dtype) if __name__ == "__main__": # make sure that import failures get reported, instead of skipping the tests. 
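    # A single expression passed as the first command-line argument is exec'd
    # directly; with no argument, the whole file is handed to py.test's runner.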
import pycuda.autoinit # noqa import sys if len(sys.argv) > 1: exec (sys.argv[1]) else: from py.test.cmdline import main main([__file__]) pycuda-2013.1.1+git20140310/test/test_driver.py0000644000175000000500000004172312313360364017221 0ustar tomussrcfrom __future__ import division import numpy as np import numpy.linalg as la from pycuda.tools import mark_cuda_test def have_pycuda(): try: import pycuda return True except: return False if have_pycuda(): import pycuda.gpuarray as gpuarray import pycuda.driver as drv from pycuda.compiler import SourceModule class TestDriver: disabled = not have_pycuda() @mark_cuda_test def test_memory(self): z = np.random.randn(400).astype(np.float32) new_z = drv.from_device_like(drv.to_device(z), z) assert la.norm(new_z-z) == 0 @mark_cuda_test def test_simple_kernel(self): mod = SourceModule(""" __global__ void multiply_them(float *dest, float *a, float *b) { const int i = threadIdx.x; dest[i] = a[i] * b[i]; } """) multiply_them = mod.get_function("multiply_them") a = np.random.randn(400).astype(np.float32) b = np.random.randn(400).astype(np.float32) dest = np.zeros_like(a) multiply_them( drv.Out(dest), drv.In(a), drv.In(b), block=(400,1,1)) assert la.norm(dest-a*b) == 0 @mark_cuda_test def test_simple_kernel_2(self): mod = SourceModule(""" __global__ void multiply_them(float *dest, float *a, float *b) { const int i = threadIdx.x; dest[i] = a[i] * b[i]; } """) multiply_them = mod.get_function("multiply_them") a = np.random.randn(400).astype(np.float32) b = np.random.randn(400).astype(np.float32) a_gpu = drv.to_device(a) b_gpu = drv.to_device(b) dest = np.zeros_like(a) multiply_them( drv.Out(dest), a_gpu, b_gpu, block=(400,1,1)) assert la.norm(dest-a*b) == 0 drv.Context.synchronize() # now try with offsets dest = np.zeros_like(a) multiply_them( drv.Out(dest), np.intp(a_gpu)+a.itemsize, b_gpu, block=(399,1,1)) assert la.norm((dest[:-1]-a[1:]*b[:-1])) == 0 @mark_cuda_test def test_vector_types(self): mod = SourceModule(""" __global__ void set_them(float3 *dest, float3 x) { const int i = threadIdx.x; dest[i] = x; } """) set_them = mod.get_function("set_them") a = gpuarray.vec.make_float3(1, 2, 3) dest = np.empty((400), gpuarray.vec.float3) set_them(drv.Out(dest), a, block=(400,1,1)) assert (dest == a).all() from py.test import mark as mark_test @mark_cuda_test def test_streamed_kernel(self): # this differs from the "simple_kernel" case in that *all* computation # and data copying is asynchronous. Observe how this necessitates the # use of page-locked memory. 
mod = SourceModule(""" __global__ void multiply_them(float *dest, float *a, float *b) { const int i = threadIdx.x*blockDim.y + threadIdx.y; dest[i] = a[i] * b[i]; } """) multiply_them = mod.get_function("multiply_them") shape = (32,8) a = drv.pagelocked_zeros(shape, dtype=np.float32) b = drv.pagelocked_zeros(shape, dtype=np.float32) a[:] = np.random.randn(*shape) b[:] = np.random.randn(*shape) a_gpu = drv.mem_alloc(a.nbytes) b_gpu = drv.mem_alloc(b.nbytes) strm = drv.Stream() drv.memcpy_htod_async(a_gpu, a, strm) drv.memcpy_htod_async(b_gpu, b, strm) strm.synchronize() dest = drv.pagelocked_empty_like(a) multiply_them( drv.Out(dest), a_gpu, b_gpu, block=shape+(1,), stream=strm) strm.synchronize() drv.memcpy_dtoh_async(a, a_gpu, strm) drv.memcpy_dtoh_async(b, b_gpu, strm) strm.synchronize() assert la.norm(dest-a*b) == 0 @mark_cuda_test def test_gpuarray(self): a = np.arange(200000, dtype=np.float32) b = a + 17 import pycuda.gpuarray as gpuarray a_g = gpuarray.to_gpu(a) b_g = gpuarray.to_gpu(b) diff = (a_g-3*b_g+(-a_g)).get() - (a-3*b+(-a)) assert la.norm(diff) == 0 diff = ((a_g*b_g).get()-a*b) assert la.norm(diff) == 0 @mark_cuda_test def donottest_cublas_mixing(): test_streamed_kernel() import pycuda.blas as blas shape = (10,) a = blas.ones(shape, dtype=np.float32) b = 33*blas.ones(shape, dtype=np.float32) assert ((-a+b).from_gpu() == 32).all() test_streamed_kernel() @mark_cuda_test def test_2d_texture(self): mod = SourceModule(""" texture mtx_tex; __global__ void copy_texture(float *dest) { int row = threadIdx.x; int col = threadIdx.y; int w = blockDim.y; dest[row*w+col] = tex2D(mtx_tex, row, col); } """) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") shape = (3,4) a = np.random.randn(*shape).astype(np.float32) drv.matrix_to_texref(a, mtx_tex, order="F") dest = np.zeros(shape, dtype=np.float32) copy_texture(drv.Out(dest), block=shape+(1,), texrefs=[mtx_tex] ) assert la.norm(dest-a) == 0 @mark_cuda_test def test_multiple_2d_textures(self): mod = SourceModule(""" texture mtx_tex; texture mtx2_tex; __global__ void copy_texture(float *dest) { int row = threadIdx.x; int col = threadIdx.y; int w = blockDim.y; dest[row*w+col] = tex2D(mtx_tex, row, col) + tex2D(mtx2_tex, row, col); } """) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") mtx2_tex = mod.get_texref("mtx2_tex") shape = (3,4) a = np.random.randn(*shape).astype(np.float32) b = np.random.randn(*shape).astype(np.float32) drv.matrix_to_texref(a, mtx_tex, order="F") drv.matrix_to_texref(b, mtx2_tex, order="F") dest = np.zeros(shape, dtype=np.float32) copy_texture(drv.Out(dest), block=shape+(1,), texrefs=[mtx_tex, mtx2_tex] ) assert la.norm(dest-a-b) < 1e-6 @mark_cuda_test def test_multichannel_2d_texture(self): mod = SourceModule(""" #define CHANNELS 4 texture mtx_tex; __global__ void copy_texture(float *dest) { int row = threadIdx.x; int col = threadIdx.y; int w = blockDim.y; float4 texval = tex2D(mtx_tex, row, col); dest[(row*w+col)*CHANNELS + 0] = texval.x; dest[(row*w+col)*CHANNELS + 1] = texval.y; dest[(row*w+col)*CHANNELS + 2] = texval.z; dest[(row*w+col)*CHANNELS + 3] = texval.w; } """) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") shape = (5,6) channels = 4 a = np.asarray( np.random.randn(*((channels,)+shape)), dtype=np.float32, order="F") drv.bind_array_to_texref( drv.make_multichannel_2d_array(a, order="F"), mtx_tex) dest = np.zeros(shape+(channels,), dtype=np.float32) copy_texture(drv.Out(dest), block=shape+(1,), 
texrefs=[mtx_tex] ) reshaped_a = a.transpose(1,2,0) #print reshaped_a #print dest assert la.norm(dest-reshaped_a) == 0 @mark_cuda_test def test_multichannel_linear_texture(self): mod = SourceModule(""" #define CHANNELS 4 texture mtx_tex; __global__ void copy_texture(float *dest) { int i = threadIdx.x+blockDim.x*threadIdx.y; float4 texval = tex1Dfetch(mtx_tex, i); dest[i*CHANNELS + 0] = texval.x; dest[i*CHANNELS + 1] = texval.y; dest[i*CHANNELS + 2] = texval.z; dest[i*CHANNELS + 3] = texval.w; } """) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") shape = (16, 16) channels = 4 a = np.random.randn(*(shape+(channels,))).astype(np.float32) a_gpu = drv.to_device(a) mtx_tex.set_address(a_gpu, a.nbytes) mtx_tex.set_format(drv.array_format.FLOAT, 4) dest = np.zeros(shape+(channels,), dtype=np.float32) copy_texture(drv.Out(dest), block=shape+(1,), texrefs=[mtx_tex] ) #print a #print dest assert la.norm(dest-a) == 0 @mark_cuda_test def test_large_smem(self): n = 4000 mod = SourceModule(""" #include __global__ void kernel(int *d_data) { __shared__ int sdata[%d]; sdata[threadIdx.x] = threadIdx.x; d_data[threadIdx.x] = sdata[threadIdx.x]; } """ % n) kernel = mod.get_function("kernel") import pycuda.gpuarray as gpuarray arg = gpuarray.zeros((n,), dtype=np.float32) kernel(arg, block=(1,1,1,), ) @mark_cuda_test def test_bitlog(self): from pycuda.tools import bitlog2 assert bitlog2(17) == 4 assert bitlog2(0xaffe) == 15 assert bitlog2(0x3affe) == 17 assert bitlog2(0xcc3affe) == 27 @mark_cuda_test def test_mempool_2(self): from pycuda.tools import DeviceMemoryPool as DMP from random import randrange for i in range(2000): s = randrange(1<<31) >> randrange(32) bin_nr = DMP.bin_number(s) asize = DMP.alloc_size(bin_nr) assert asize >= s, s assert DMP.bin_number(asize) == bin_nr, s assert asize < asize*(1+1/8) @mark_cuda_test def test_mempool(self): from pycuda.tools import bitlog2 from pycuda.tools import DeviceMemoryPool pool = DeviceMemoryPool() maxlen = 10 queue = [] free, total = drv.mem_get_info() e0 = bitlog2(free) for e in range(e0-6, e0-4): for i in range(100): queue.append(pool.allocate(1< 10: queue.pop(0) del queue pool.stop_holding() @mark_cuda_test def test_multi_context(self): if drv.get_version() < (2,0,0): return if drv.get_version() >= (2,2,0): if drv.Context.get_device().compute_mode == drv.compute_mode.EXCLUSIVE: return mem_a = drv.mem_alloc(50) ctx2 = drv.Context.get_device().make_context() mem_b = drv.mem_alloc(60) del mem_a del mem_b ctx2.detach() @mark_cuda_test def test_3d_texture(self): # adapted from code by Nicolas Pinto w = 2 h = 4 d = 8 shape = (w, h, d) a = np.asarray( np.random.randn(*shape), dtype=np.float32, order="F") descr = drv.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = drv.dtype_to_array_format(a.dtype) descr.num_channels = 1 descr.flags = 0 ary = drv.Array(descr) copy = drv.Memcpy3D() copy.set_src_host(a) copy.set_dst_array(ary) copy.width_in_bytes = copy.src_pitch = a.strides[1] copy.src_height = copy.height = h copy.depth = d copy() mod = SourceModule(""" texture mtx_tex; __global__ void copy_texture(float *dest) { int x = threadIdx.x; int y = threadIdx.y; int z = threadIdx.z; int dx = blockDim.x; int dy = blockDim.y; int i = (z*dy + y)*dx + x; dest[i] = tex3D(mtx_tex, x, y, z); //dest[i] = x; } """) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") mtx_tex.set_array(ary) dest = np.zeros(shape, dtype=np.float32, order="F") copy_texture(drv.Out(dest), block=shape, 
texrefs=[mtx_tex]) assert la.norm(dest-a) == 0 @mark_cuda_test def test_prepared_invocation(self): a = np.random.randn(4,4).astype(np.float32) a_gpu = drv.mem_alloc(a.size * a.dtype.itemsize) drv.memcpy_htod(a_gpu, a) mod = SourceModule(""" __global__ void doublify(float *a) { int idx = threadIdx.x + threadIdx.y*blockDim.x; a[idx] *= 2; } """) func = mod.get_function("doublify") func.prepare("P") func.prepared_call((1, 1), (4,4,1), a_gpu, shared_size=20) a_doubled = np.empty_like(a) drv.memcpy_dtoh(a_doubled, a_gpu) print (a) print (a_doubled) assert la.norm(a_doubled-2*a) == 0 # now with offsets func.prepare("P") a_quadrupled = np.empty_like(a) func.prepared_call((1, 1), (15,1,1), int(a_gpu)+a.dtype.itemsize) drv.memcpy_dtoh(a_quadrupled, a_gpu) assert la.norm(a_quadrupled[1:]-4*a[1:]) == 0 @mark_cuda_test def test_prepared_with_vector(self): cuda_source = r''' __global__ void cuda_function(float3 input) { float3 result = make_float3(input.x, input.y, input.z); } ''' mod = SourceModule(cuda_source, cache_dir=False, keep=False) kernel = mod.get_function("cuda_function") arg_types = [gpuarray.vec.float3] kernel.prepare(arg_types) kernel.prepared_call((1, 1, 1), (1, 1, 1), gpuarray.vec.make_float3(0.0, 1.0, 2.0)) @mark_cuda_test def test_fp_textures(self): if drv.Context.get_device().compute_capability() < (1, 3): return for tp in [np.float32, np.float64]: from pycuda.tools import dtype_to_ctype tp_cstr = dtype_to_ctype(tp) mod = SourceModule(""" #include texture my_tex; __global__ void copy_texture(%(tp)s *dest) { int i = threadIdx.x; dest[i] = fp_tex1Dfetch(my_tex, i); } """ % {"tp": tp_cstr}) copy_texture = mod.get_function("copy_texture") my_tex = mod.get_texref("my_tex") import pycuda.gpuarray as gpuarray shape = (384,) a = np.random.randn(*shape).astype(tp) a_gpu = gpuarray.to_gpu(a) a_gpu.bind_to_texref_ext(my_tex, allow_double_hack=True) dest = np.zeros(shape, dtype=tp) copy_texture(drv.Out(dest), block=shape+(1,1,), texrefs=[my_tex]) assert la.norm(dest-a) == 0 @mark_cuda_test def test_constant_memory(self): # contributed by Andrew Wagner module = SourceModule(""" __constant__ float const_array[32]; __global__ void copy_constant_into_global(float* global_result_array) { global_result_array[threadIdx.x] = const_array[threadIdx.x]; } """) copy_constant_into_global = module.get_function("copy_constant_into_global") const_array, _ = module.get_global('const_array') host_array = np.random.randint(0,255,(32,)).astype(np.float32) global_result_array = drv.mem_alloc_like(host_array) drv.memcpy_htod(const_array, host_array) copy_constant_into_global( global_result_array, grid=(1, 1), block=(32, 1, 1)) host_result_array = np.zeros_like(host_array) drv.memcpy_dtoh(host_result_array, global_result_array) assert (host_result_array == host_array).all @mark_cuda_test def test_register_host_memory(self): if drv.get_version() < (4,): from py.test import skip skip("register_host_memory only exists on CUDA 4.0 and later") import sys if sys.platform == "darwin": from py.test import skip skip("register_host_memory is not supported on OS X") a = drv.aligned_empty((2**20,), np.float64, alignment=4096) drv.register_host_memory(a) def test_import_pyopencl_before_pycuda(): try: import pyopencl except ImportError: return import pycuda.driver if __name__ == "__main__": # make sure that import failures get reported, instead of skipping the tests. 
import pycuda.autoinit import sys if len(sys.argv) > 1: exec (sys.argv[1]) else: from py.test.cmdline import main main([__file__]) pycuda-2013.1.1+git20140310/test/test_gpuarray.py0000644000175000000500000006434012313360364017560 0ustar tomussrc#! /usr/bin/env python import numpy as np import numpy.linalg as la import sys from pycuda.tools import mark_cuda_test from pycuda.characterize import has_double_support def have_pycuda(): try: import pycuda # noqa return True except: return False if have_pycuda(): import pycuda.gpuarray as gpuarray import pycuda.driver as drv from pycuda.compiler import SourceModule class TestGPUArray: disabled = not have_pycuda() @mark_cuda_test def test_pow_array(self): a = np.array([1, 2, 3, 4, 5]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) result = pow(a_gpu, a_gpu).get() assert (np.abs(a**a - result) < 1e-3).all() result = (a_gpu**a_gpu).get() assert (np.abs(pow(a, a) - result) < 1e-3).all() @mark_cuda_test def test_pow_number(self): a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) result = pow(a_gpu, 2).get() assert (np.abs(a**2 - result) < 1e-3).all() @mark_cuda_test def test_numpy_integer_shape(self): gpuarray.empty(np.int32(17), np.float32) gpuarray.empty((np.int32(17), np.int32(17)), np.float32) @mark_cuda_test def test_abs(self): a = -gpuarray.arange(111, dtype=np.float32) res = a.get() for i in range(111): assert res[i] <= 0 a = abs(a) res = a.get() for i in range(111): assert abs(res[i]) >= 0 assert res[i] == i @mark_cuda_test def test_len(self): a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_cpu = gpuarray.to_gpu(a) assert len(a_cpu) == 10 @mark_cuda_test def test_multiply(self): """Test the muliplication of an array with a scalar. """ for sz in [10, 50000]: for dtype, scalars in [ (np.float32, [2]), (np.complex64, [2, 2j]) ]: for scalar in scalars: a = np.arange(sz).astype(dtype) a_gpu = gpuarray.to_gpu(a) a_doubled = (scalar * a_gpu).get() assert (a * scalar == a_doubled).all() @mark_cuda_test def test_rmul_yields_right_type(self): a = np.array([1, 2, 3, 4, 5]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) two_a = 2*a_gpu assert isinstance(two_a, gpuarray.GPUArray) two_a = np.float32(2)*a_gpu assert isinstance(two_a, gpuarray.GPUArray) @mark_cuda_test def test_multiply_array(self): """Test the multiplication of two arrays.""" a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(a) a_squared = (b_gpu*a_gpu).get() assert (a*a == a_squared).all() @mark_cuda_test def test_addition_array(self): """Test the addition of two arrays.""" a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) a_added = (a_gpu+a_gpu).get() assert (a+a == a_added).all() @mark_cuda_test def test_iaddition_array(self): """Test the inplace addition of two arrays.""" a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) a_gpu += a_gpu a_added = a_gpu.get() assert (a+a == a_added).all() @mark_cuda_test def test_addition_scalar(self): """Test the addition of an array and a scalar.""" a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) a_added = (7+a_gpu).get() assert (7+a == a_added).all() @mark_cuda_test def test_iaddition_scalar(self): """Test the inplace addition of an array and a scalar.""" a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) a_gpu += 7 a_added = a_gpu.get() assert (7+a == 
a_added).all() @mark_cuda_test def test_substract_array(self): """Test the substraction of two arrays.""" #test data a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) b = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) result = (a_gpu-b_gpu).get() assert (a-b == result).all() result = (b_gpu-a_gpu).get() assert (b-a == result).all() @mark_cuda_test def test_substract_scalar(self): """Test the substraction of an array and a scalar.""" #test data a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) #convert a to a gpu object a_gpu = gpuarray.to_gpu(a) result = (a_gpu-7).get() assert (a-7 == result).all() result = (7-a_gpu).get() assert (7-a == result).all() @mark_cuda_test def test_divide_scalar(self): """Test the division of an array and a scalar.""" a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) result = (a_gpu/2).get() assert (a/2 == result).all() result = (2/a_gpu).get() assert (2/a == result).all() @mark_cuda_test def test_divide_array(self): """Test the division of an array and a scalar. """ #test data a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32) b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) a_divide = (a_gpu/b_gpu).get() assert (np.abs(a/b - a_divide) < 1e-3).all() a_divide = (b_gpu/a_gpu).get() assert (np.abs(b/a - a_divide) < 1e-3).all() @mark_cuda_test def test_random(self): from pycuda.curandom import rand as curand if has_double_support(): dtypes = [np.float32, np.float64] else: dtypes = [np.float32] for dtype in dtypes: a = curand((10, 100), dtype=dtype).get() assert (0 <= a).all() assert (a < 1).all() @mark_cuda_test def test_curand_wrappers(self): from pycuda.curandom import get_curand_version if get_curand_version() is None: from pytest import skip skip("curand not installed") generator_types = [] if get_curand_version() >= (3, 2, 0): from pycuda.curandom import ( XORWOWRandomNumberGenerator, Sobol32RandomNumberGenerator) generator_types.extend([ XORWOWRandomNumberGenerator, Sobol32RandomNumberGenerator]) if get_curand_version() >= (4, 0, 0): from pycuda.curandom import ( ScrambledSobol32RandomNumberGenerator, Sobol64RandomNumberGenerator, ScrambledSobol64RandomNumberGenerator) generator_types.extend([ ScrambledSobol32RandomNumberGenerator, Sobol64RandomNumberGenerator, ScrambledSobol64RandomNumberGenerator]) if get_curand_version() >= (4, 1, 0): from pycuda.curandom import MRG32k3aRandomNumberGenerator generator_types.extend([MRG32k3aRandomNumberGenerator]) if has_double_support(): dtypes = [np.float32, np.float64] else: dtypes = [np.float32] for gen_type in generator_types: gen = gen_type() for dtype in dtypes: gen.gen_normal(10000, dtype) # test non-Box-Muller version, if available gen.gen_normal(10001, dtype) if get_curand_version() >= (4, 0, 0): gen.gen_log_normal(10000, dtype, 10.0, 3.0) # test non-Box-Muller version, if available gen.gen_log_normal(10001, dtype, 10.0, 3.0) x = gen.gen_uniform(10000, dtype) x_host = x.get() assert (-1 <= x_host).all() assert (x_host <= 1).all() gen.gen_uniform(10000, np.uint32) if get_curand_version() >= (5, 0, 0): gen.gen_poisson(10000, np.uint32, 13.0) @mark_cuda_test def test_array_gt(self): """Test whether array contents are > the other array's contents""" a = np.array([5, 10]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) b = np.array([2, 10]).astype(np.float32) b_gpu = 
gpuarray.to_gpu(b) result = (a_gpu > b_gpu).get() assert result[0] assert not result[1] @mark_cuda_test def test_array_lt(self): """Test whether array contents are < the other array's contents""" a = np.array([5, 10]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) b = np.array([2, 10]).astype(np.float32) b_gpu = gpuarray.to_gpu(b) result = (b_gpu < a_gpu).get() assert result[0] assert not result[1] @mark_cuda_test def test_array_le(self): """Test whether array contents are <= the other array's contents""" a = np.array([5, 10, 1]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) b = np.array([2, 10, 2]).astype(np.float32) b_gpu = gpuarray.to_gpu(b) result = (b_gpu <= a_gpu).get() assert result[0] assert result[1] assert not result[2] @mark_cuda_test def test_array_ge(self): """Test whether array contents are >= the other array's contents""" a = np.array([5, 10, 1]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) b = np.array([2, 10, 2]).astype(np.float32) b_gpu = gpuarray.to_gpu(b) result = (a_gpu >= b_gpu).get() assert result[0] assert result[1] assert not result[2] @mark_cuda_test def test_array_eq(self): """Test whether array contents are == the other array's contents""" a = np.array([5, 10]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) b = np.array([2, 10]).astype(np.float32) b_gpu = gpuarray.to_gpu(b) result = (a_gpu == b_gpu).get() assert not result[0] assert result[1] @mark_cuda_test def test_array_ne(self): """Test whether array contents are != the other array's contents""" a = np.array([5, 10]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) b = np.array([2, 10]).astype(np.float32) b_gpu = gpuarray.to_gpu(b) result = (a_gpu != b_gpu).get() assert result[0] assert not result[1] @mark_cuda_test def test_nan_arithmetic(self): def make_nan_contaminated_vector(size): shape = (size,) a = np.random.randn(*shape).astype(np.float32) #for i in range(0, shape[0], 3): #a[i] = float('nan') from random import randrange for i in range(size//10): a[randrange(0, size)] = float('nan') return a size = 1 << 20 a = make_nan_contaminated_vector(size) a_gpu = gpuarray.to_gpu(a) b = make_nan_contaminated_vector(size) b_gpu = gpuarray.to_gpu(b) ab = a*b ab_gpu = (a_gpu*b_gpu).get() assert (np.isnan(ab) == np.isnan(ab_gpu)).all() @mark_cuda_test def test_elwise_kernel(self): from pycuda.curandom import rand as curand a_gpu = curand((50,)) b_gpu = curand((50,)) from pycuda.elementwise import ElementwiseKernel lin_comb = ElementwiseKernel( "float a, float *x, float b, float *y, float *z", "z[i] = a*x[i] + b*y[i]", "linear_combination") c_gpu = gpuarray.empty_like(a_gpu) lin_comb(5, a_gpu, 6, b_gpu, c_gpu) assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5 @mark_cuda_test def test_ranged_elwise_kernel(self): from pycuda.elementwise import ElementwiseKernel set_to_seven = ElementwiseKernel( "float *z", "z[i] = 7", "set_to_seven") for i, slc in enumerate([ slice(5, 20000), slice(5, 20000, 17), slice(3000, 5, -1), slice(1000, -1), ]): a_gpu = gpuarray.zeros((50000,), dtype=np.float32) a_cpu = np.zeros(a_gpu.shape, a_gpu.dtype) a_cpu[slc] = 7 set_to_seven(a_gpu, slice=slc) drv.Context.synchronize() assert la.norm(a_cpu - a_gpu.get()) == 0, i @mark_cuda_test def test_take(self): idx = gpuarray.arange(0, 10000, 2, dtype=np.uint32) for dtype in [np.float32, np.complex64]: a = gpuarray.arange(0, 600000, dtype=np.uint32).astype(dtype) a_host = a.get() result = gpuarray.take(a, idx) assert (a_host[idx.get()] == result.get()).all() @mark_cuda_test def test_arange(self): a = gpuarray.arange(12, dtype=np.float32) assert 
(np.arange(12, dtype=np.float32) == a.get()).all() @mark_cuda_test def test_reverse(self): a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_cpu = gpuarray.to_gpu(a) a_cpu = a_cpu.reverse() b = a_cpu.get() for i in range(0, 10): assert a[len(a)-1-i] == b[i] @mark_cuda_test def test_sum(self): from pycuda.curandom import rand as curand a_gpu = curand((200000,)) a = a_gpu.get() sum_a = np.sum(a) sum_a_gpu = gpuarray.sum(a_gpu).get() assert abs(sum_a_gpu-sum_a)/abs(sum_a) < 1e-4 @mark_cuda_test def test_minmax(self): from pycuda.curandom import rand as curand if has_double_support(): dtypes = [np.float64, np.float32, np.int32] else: dtypes = [np.float32, np.int32] for what in ["min", "max"]: for dtype in dtypes: a_gpu = curand((200000,), dtype) a = a_gpu.get() op_a = getattr(np, what)(a) op_a_gpu = getattr(gpuarray, what)(a_gpu).get() assert op_a_gpu == op_a, (op_a_gpu, op_a, dtype, what) @mark_cuda_test def test_subset_minmax(self): from pycuda.curandom import rand as curand l_a = 200000 gran = 5 l_m = l_a - l_a // gran + 1 if has_double_support(): dtypes = [np.float64, np.float32, np.int32] else: dtypes = [np.float32, np.int32] for dtype in dtypes: a_gpu = curand((l_a,), dtype) a = a_gpu.get() meaningful_indices_gpu = gpuarray.zeros(l_m, dtype=np.int32) meaningful_indices = meaningful_indices_gpu.get() j = 0 for i in range(len(meaningful_indices)): meaningful_indices[i] = j j = j + 1 if j % gran == 0: j = j + 1 meaningful_indices_gpu = gpuarray.to_gpu(meaningful_indices) b = a[meaningful_indices] min_a = np.min(b) min_a_gpu = gpuarray.subset_min(meaningful_indices_gpu, a_gpu).get() assert min_a_gpu == min_a @mark_cuda_test def test_dot(self): from pycuda.curandom import rand as curand for l in [2, 3, 4, 5, 6, 7, 31, 32, 33, 127, 128, 129, 255, 256, 257, 16384 - 993, 20000]: a_gpu = curand((l,)) a = a_gpu.get() b_gpu = curand((l,)) b = b_gpu.get() dot_ab = np.dot(a, b) dot_ab_gpu = gpuarray.dot(a_gpu, b_gpu).get() assert abs(dot_ab_gpu-dot_ab)/abs(dot_ab) < 1e-4 @mark_cuda_test def test_slice(self): from pycuda.curandom import rand as curand l = 20000 a_gpu = curand((l,)) a = a_gpu.get() from random import randrange for i in range(200): start = randrange(l) end = randrange(start, l) a_gpu_slice = a_gpu[start:end] a_slice = a[start:end] assert la.norm(a_gpu_slice.get()-a_slice) == 0 @mark_cuda_test def test_2d_slice_c(self): from pycuda.curandom import rand as curand n = 1000 m = 300 a_gpu = curand((n, m)) a = a_gpu.get() from random import randrange for i in range(200): start = randrange(n) end = randrange(start, n) a_gpu_slice = a_gpu[start:end] a_slice = a[start:end] assert la.norm(a_gpu_slice.get()-a_slice) == 0 @mark_cuda_test def test_2d_slice_f(self): from pycuda.curandom import rand as curand import pycuda.gpuarray as gpuarray n = 1000 m = 300 a_gpu = curand((n, m)) a_gpu_f = gpuarray.GPUArray((m, n), np.float32, gpudata=a_gpu.gpudata, order="F") a = a_gpu_f.get() from random import randrange for i in range(200): start = randrange(n) end = randrange(start, n) a_gpu_slice = a_gpu_f[:, start:end] a_slice = a[:, start:end] assert la.norm(a_gpu_slice.get()-a_slice) == 0 @mark_cuda_test def test_if_positive(self): from pycuda.curandom import rand as curand l = 20 a_gpu = curand((l,)) b_gpu = curand((l,)) a = a_gpu.get() b = b_gpu.get() import pycuda.gpuarray as gpuarray max_a_b_gpu = gpuarray.maximum(a_gpu, b_gpu) min_a_b_gpu = gpuarray.minimum(a_gpu, b_gpu) print (max_a_b_gpu) print (np.maximum(a, b)) assert la.norm(max_a_b_gpu.get() - np.maximum(a, b)) == 0 assert 
la.norm(min_a_b_gpu.get() - np.minimum(a, b)) == 0 @mark_cuda_test def test_take_put(self): for n in [5, 17, 333]: one_field_size = 8 buf_gpu = gpuarray.zeros(n*one_field_size, dtype=np.float32) dest_indices = gpuarray.to_gpu(np.array( [0, 1, 2, 3, 32, 33, 34, 35], dtype=np.uint32)) read_map = gpuarray.to_gpu( np.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=np.uint32)) gpuarray.multi_take_put( arrays=[buf_gpu for i in range(n)], dest_indices=dest_indices, src_indices=read_map, src_offsets=[i*one_field_size for i in range(n)], dest_shape=(96,)) drv.Context.synchronize() @mark_cuda_test def test_astype(self): from pycuda.curandom import rand as curand if not has_double_support(): return a_gpu = curand((2000,), dtype=np.float32) a = a_gpu.get().astype(np.float64) a2 = a_gpu.astype(np.float64).get() assert a2.dtype == np.float64 assert la.norm(a - a2) == 0, (a, a2) a_gpu = curand((2000,), dtype=np.float64) a = a_gpu.get().astype(np.float32) a2 = a_gpu.astype(np.float32).get() assert a2.dtype == np.float32 assert la.norm(a - a2)/la.norm(a) < 1e-7 @mark_cuda_test def test_complex_bits(self): from pycuda.curandom import rand as curand if has_double_support(): dtypes = [np.complex64, np.complex128] else: dtypes = [np.complex64] n = 20 for tp in dtypes: dtype = np.dtype(tp) from pytools import match_precision real_dtype = match_precision(np.dtype(np.float64), dtype) z = (curand((n,), real_dtype).astype(dtype) + 1j*curand((n,), real_dtype).astype(dtype)) assert la.norm(z.get().real - z.real.get()) == 0 assert la.norm(z.get().imag - z.imag.get()) == 0 assert la.norm(z.get().conj() - z.conj().get()) == 0 @mark_cuda_test def test_pass_slice_to_kernel(self): mod = SourceModule(""" __global__ void twice(float *a) { const int i = threadIdx.x + blockIdx.x * blockDim.x; a[i] *= 2; } """) multiply_them = mod.get_function("twice") a = np.ones(256**2, np.float32) a_gpu = gpuarray.to_gpu(a) multiply_them(a_gpu[256:-256], block=(256, 1, 1), grid=(254, 1)) a = a_gpu.get() assert (a[255:257] == np.array([1, 2], np.float32)).all() assert (a[255*256-1:255*256+1] == np.array([2, 1], np.float32)).all() @mark_cuda_test def test_scan(self): from pycuda.scan import ExclusiveScanKernel, InclusiveScanKernel for cls in [ExclusiveScanKernel, InclusiveScanKernel]: scan_kern = cls(np.int32, "a+b", "0") for n in [ 10, 2**10-5, 2**10, 2**20-2**18, 2**20-2**18+5, 2**10+5, 2**20+5, 2**20, 2**24 ]: host_data = np.random.randint(0, 10, n).astype(np.int32) gpu_data = gpuarray.to_gpu(host_data) scan_kern(gpu_data) desired_result = np.cumsum(host_data, axis=0) if cls is ExclusiveScanKernel: desired_result -= host_data assert (gpu_data.get() == desired_result).all() @mark_cuda_test def test_stride_preservation(self): A = np.random.rand(3, 3) AT = A.T print (AT.flags.f_contiguous, AT.flags.c_contiguous) AT_GPU = gpuarray.to_gpu(AT) print (AT_GPU.flags.f_contiguous, AT_GPU.flags.c_contiguous) assert np.allclose(AT_GPU.get(), AT) @mark_cuda_test def test_vector_fill(self): a_gpu = gpuarray.GPUArray(100, dtype=gpuarray.vec.float3) a_gpu.fill(gpuarray.vec.make_float3(0.0, 0.0, 0.0)) a = a_gpu.get() assert a.dtype is gpuarray.vec.float3 @mark_cuda_test def test_create_complex_zeros(self): gpuarray.zeros(3, np.complex64) @mark_cuda_test def test_reshape(self): a = np.arange(128).reshape(8, 16).astype(np.float32) a_gpu = gpuarray.to_gpu(a) # different ways to specify the shape a_gpu.reshape(4, 32) a_gpu.reshape((4, 32)) a_gpu.reshape([4, 32]) @mark_cuda_test def test_view(self): a = np.arange(128).reshape(8, 16).astype(np.float32) a_gpu = 
gpuarray.to_gpu(a) # same dtype view = a_gpu.view() assert view.shape == a_gpu.shape and view.dtype == a_gpu.dtype # larger dtype view = a_gpu.view(np.complex64) assert view.shape == (8, 8) and view.dtype == np.complex64 # smaller dtype view = a_gpu.view(np.int16) assert view.shape == (8, 32) and view.dtype == np.int16 @mark_cuda_test def test_struct_reduce(self): preamble = """ struct minmax_collector { float cur_min; float cur_max; __device__ minmax_collector() { } __device__ minmax_collector(float cmin, float cmax) : cur_min(cmin), cur_max(cmax) { } __device__ minmax_collector(minmax_collector const &src) : cur_min(src.cur_min), cur_max(src.cur_max) { } __device__ minmax_collector(minmax_collector const volatile &src) : cur_min(src.cur_min), cur_max(src.cur_max) { } __device__ minmax_collector volatile &operator=( minmax_collector const &src) volatile { cur_min = src.cur_min; cur_max = src.cur_max; return *this; } }; __device__ minmax_collector agg_mmc(minmax_collector a, minmax_collector b) { return minmax_collector( fminf(a.cur_min, b.cur_min), fmaxf(a.cur_max, b.cur_max)); } """ mmc_dtype = np.dtype([("cur_min", np.float32), ("cur_max", np.float32)]) from pycuda.curandom import rand as curand a_gpu = curand((20000,), dtype=np.float32) a = a_gpu.get() from pycuda.tools import register_dtype register_dtype(mmc_dtype, "minmax_collector") from pycuda.reduction import ReductionKernel red = ReductionKernel(mmc_dtype, neutral="minmax_collector(10000, -10000)", # FIXME: needs infinity literal in real use, ok here reduce_expr="agg_mmc(a, b)", map_expr="minmax_collector(x[i], x[i])", arguments="float *x", preamble=preamble) minmax = red(a_gpu).get() #print minmax["cur_min"], minmax["cur_max"] #print np.min(a), np.max(a) assert minmax["cur_min"] == np.min(a) assert minmax["cur_max"] == np.max(a) @mark_cuda_test def test_view_and_strides(self): from pycuda.curandom import rand as curand X = curand((5, 10), dtype=np.float32) Y = X[:3, :5] y = Y.view() assert y.shape == Y.shape assert y.strides == Y.strides import pytest with pytest.raises(AssertionError): assert (y.get() == X.get()[:3, :5]).all() @mark_cuda_test def test_scalar_comparisons(self): a = np.array([1.0, 0.25, 0.1, -0.1, 0.0]) a_gpu = gpuarray.to_gpu(a) x_gpu = a_gpu > 0.25 x = (a > 0.25).astype(a.dtype) assert (x == x_gpu.get()).all() x_gpu = a_gpu <= 0.25 x = (a <= 0.25).astype(a.dtype) assert (x == x_gpu.get()).all() x_gpu = a_gpu == 0.25 x = (a == 0.25).astype(a.dtype) assert (x == x_gpu.get()).all() x_gpu = a_gpu == 1 # using an integer scalar x = (a == 1).astype(a.dtype) assert (x == x_gpu.get()).all() if __name__ == "__main__": # make sure that import failures get reported, instead of skipping the tests. import pycuda.autoinit # noqa if len(sys.argv) > 1: exec (sys.argv[1]) else: from py.test.cmdline import main main([__file__]) pycuda-2013.1.1+git20140310/test/undistributed/0002755000175000000500000000000012313360364017175 5ustar tomussrcpycuda-2013.1.1+git20140310/test/undistributed/elwise-perf.py0000644000175000000500000000173212313360364021772 0ustar tomussrc#! /usr/bin/env python import pycuda.driver as drv import pycuda.autoinit import numpy import numpy.linalg as la def main(): from pytools import Table tbl = Table() tbl.add_row(("size [MiB]", "time [s]", "mem.bw [GB/s]")) import pycuda.gpuarray as gpuarray # they're floats, i.e. 
4 bytes each for power in range(10, 28): size = 1< 20: count = 10 else: count = 100 elapsed = [0] def add_timer(_, time): elapsed[0] += time() for i in range(count): a.mul_add(1, b, 2, add_timer) bytes = a.nbytes*count*3 bytes = a.nbytes*count*3 tbl.add_row((a.nbytes/(1<<20), elapsed[0]/count, bytes/elapsed[0]/1e9)) print tbl if __name__ == "__main__": main() pycuda-2013.1.1+git20140310/test/undistributed/measure_gpuarray_speed.py0000755000175000000500000000401212313360364024300 0ustar tomussrc#! /usr/bin/env python import pycuda.driver as drv import pycuda.autoinit import numpy import numpy.linalg as la def main(): import pycuda.gpuarray as gpuarray sizes = [] times_gpu = [] flops_gpu = [] flops_cpu = [] times_cpu = [] from pycuda.tools import bitlog2 max_power = bitlog2(drv.mem_get_info()[0]) - 2 # they're floats, i.e. 4 bytes each for power in range(10, max_power): size = 1< 20: count = 100 else: count = 1000 # gpu ----------------------------------------------------------------- start = drv.Event() end = drv.Event() start.record() for i in range(count): a+b end.record() end.synchronize() secs = start.time_till(end)*1e-3 times_gpu.append(secs/count) flops_gpu.append(size) del a del b # cpu ----------------------------------------------------------------- a_cpu = numpy.random.randn(size).astype(numpy.float32) b_cpu = numpy.random.randn(size).astype(numpy.float32) #start timer from time import time start = time() for i in range(count): a_cpu + b_cpu secs = time() - start times_cpu.append(secs/count) flops_cpu.append(size) # calculate pseudo flops flops_gpu = [f/t for f, t in zip(flops_gpu,times_gpu)] flops_cpu = [f/t for f, t in zip(flops_cpu,times_cpu)] from pytools import Table tbl = Table() tbl.add_row(("Size", "Time GPU", "Size/Time GPU", "Time CPU","Size/Time CPU","GPU vs CPU speedup")) for s, t, f, t_cpu, f_cpu in zip(sizes, times_gpu, flops_gpu, times_cpu, flops_cpu): tbl.add_row((s, t, f, t_cpu, f_cpu, f/f_cpu)) print tbl if __name__ == "__main__": main() pycuda-2013.1.1+git20140310/test/undistributed/reduction-perf.py0000644000175000000500000000322112313360364022471 0ustar tomussrcfrom __future__ import division import pycuda.autoinit import pycuda.gpuarray as gpuarray import pycuda.driver as cuda import numpy def main(): from pytools import Table tbl = Table() tbl.add_row(("type", "size [MiB]", "time [ms]", "mem.bw [GB/s]")) from random import shuffle for dtype_out in [numpy.float32, numpy.float64]: for ex in range(15,27): sz = 1 << ex print sz from pycuda.curandom import rand as curand a_gpu = curand((sz,)) b_gpu = curand((sz,)) assert sz == a_gpu.shape[0] assert len(a_gpu.shape) == 1 from pycuda.reduction import get_sum_kernel, get_dot_kernel krnl = get_dot_kernel(dtype_out, a_gpu.dtype) elapsed = [0] def wrap_with_timer(f): def result(*args, **kwargs): start = cuda.Event() stop = cuda.Event() start.record() f(*args, **kwargs) stop.record() stop.synchronize() elapsed[0] += stop.time_since(start) return result # warm-up for i in range(3): krnl(a_gpu, b_gpu) cnt = 10 for i in range(cnt): krnl(a_gpu, b_gpu, #krnl(a_gpu, kernel_wrapper=wrap_with_timer) bytes = a_gpu.nbytes*2*cnt secs = elapsed[0]*1e-3 tbl.add_row((str(dtype_out), a_gpu.nbytes/(1<<20), elapsed[0]/cnt, bytes/secs/1e9)) print tbl if __name__ == "__main__": main() pycuda-2013.1.1+git20140310/setup.cfg0000644000175000000500000000011012313360364015140 0ustar tomussrc[flake8] ignore = E126,E127,E128,E123,E226,E241,E242 max-line-length=85 
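# --- Illustrative sketch, not part of the distribution ----------------------
# The performance scripts above (elwise-perf.py, measure_gpuarray_speed.py,
# reduction-perf.py) all time device work with pairs of pycuda.driver.Event
# objects.  This is a minimal, self-contained version of that pattern; the
# array size and repetition count below are arbitrary illustrative choices,
# not values taken from those scripts.
import numpy
import pycuda.autoinit  # noqa: creates a context on import
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

a = gpuarray.to_gpu(numpy.random.randn(1 << 20).astype(numpy.float32))
b = gpuarray.to_gpu(numpy.random.randn(1 << 20).astype(numpy.float32))

count = 100
start, end = drv.Event(), drv.Event()

start.record()
for i in range(count):
    a + b  # the operation being timed; the result is discarded
end.record()
end.synchronize()

elapsed_s = start.time_till(end) * 1e-3  # time_till() returns milliseconds
print("%g s per a+b of %d floats" % (elapsed_s / count, 1 << 20))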
pycuda-2013.1.1+git20140310/MANIFEST.in0000644000175000000500000000116212313360364015065 0ustar tomussrcinclude pycuda/cuda/*.hpp include src/cpp/*.hpp include src/cpp/*.cpp include src/wrapper/*.hpp include src/wrapper/*.cpp include test/*.py include examples/*.py include examples/MORE* include doc/source/*.rst include doc/Makefile include doc/source/conf.py include doc/source/_static/*.css include doc/source/_templates/*.html include configure.py include Makefile.in include aksetup_helper.py include README_SETUP.txt include README.rst recursive-include bpl-subset/bpl_subset/boost *.h *.hpp *.cpp *.html *.inl *.ipp *.pl *.txt recursive-include bpl-subset/bpl_subset/libs *.h *.hpp *.cpp *.html *.inl *.ipp *.pl *.txt pycuda-2013.1.1+git20140310/doc/0002755000175000000500000000000012313360364014076 5ustar tomussrcpycuda-2013.1.1+git20140310/doc/upload-docs.sh0000755000175000000500000000013512313360364016644 0ustar tomussrc#! /bin/sh rsync --progress --verbose --archive --delete build/html/* doc-upload:doc/pycuda pycuda-2013.1.1+git20140310/doc/source/0002755000175000000500000000000012313360364015376 5ustar tomussrcpycuda-2013.1.1+git20140310/doc/source/array.rst0000644000175000000500000012023112313360364017243 0ustar tomussrcGPU Arrays ========== .. module:: pycuda.gpuarray Vector Types ------------ .. class :: vec All of CUDA's supported vector types, such as `float3` and `long4` are available as :mod:`numpy` data types within this class. These :class:`numpy.dtype` instances have field names of `x`, `y`, `z`, and `w` just like their CUDA counterparts. They will work both for parameter passing to kernels as well as for passing data back and forth between kernels and Python code. For each type, a `make_type` function is also provided (e.g. `make_float3(x,y,z)`). The :class:`GPUArray` Array Class --------------------------------- .. class:: GPUArray(shape, dtype, *, allocator=None, order="C") A :class:`numpy.ndarray` work-alike that stores its data and performs its computations on the compute device. *shape* and *dtype* work exactly as in :mod:`numpy`. Arithmetic methods in :class:`GPUArray` support the broadcasting of scalars. (e.g. `array+5`) If the *allocator* is a callable that, upon being called with an argument of the number of bytes to be allocated, returns an object that can be cast to an :class:`int` representing the address of the newly allocated memory. Observe that both :func:`pycuda.driver.mem_alloc` and :meth:`pycuda.tools.DeviceMemoryPool.alloc` are a model of this interface. All arguments beyond *allocator* should be considered keyword-only. .. attribute :: gpudata The :class:`pycuda.driver.DeviceAllocation` instance created for the memory that backs this :class:`GPUArray`. .. attribute :: shape The tuple of lengths of each dimension in the array. .. attribute :: dtype The :class:`numpy.dtype` of the items in the GPU array. .. attribute :: size The number of meaningful entries in the array. Can also be computed by multiplying up the numbers in :attr:`shape`. .. attribute :: mem_size The total number of entries, including padding, that are present in the array. Padding may arise for example because of pitch adjustment by :func:`pycuda.driver.mem_alloc_pitch`. .. attribute :: nbytes The size of the entire array in bytes. Computed as :attr:`size` times ``dtype.itemsize``. .. attribute :: strides Tuple of bytes to step in each dimension when traversing an array. .. 
attribute :: flags Return an object with attributes `c_contiguous`, `f_contiguous` and `forc`, which may be used to query contiguity properties in analogy to :attr:`numpy.ndarray.flags`. .. attribute :: ptr Return an :class:`int` reflecting the address in device memory where this array resides. .. versionadded: 2011.1 .. method :: __len__() Returns the size of the leading dimension of *self*. .. warning :: This method existed in version 0.93 and below, but it returned the value of :attr:`size` instead of its current value. The change was made in order to match :mod:`numpy`. .. method :: reshape(shape) Returns an array containing the same data with a new shape. .. method :: ravel() Returns flattened array containing the same data. .. method :: view(dtype=None) Returns view of array with the same data. If *dtype* is different from current dtype, the actual bytes of memory will be reinterpreted. .. method :: set(ary) Transfer the contents the :class:`numpy.ndarray` object *ary* onto the device. *ary* must have the same dtype and size (not necessarily shape) as *self*. .. method :: set_async(ary, stream=None) Asynchronously transfer the contents the :class:`numpy.ndarray` object *ary* onto the device, optionally sequenced on *stream*. *ary* must have the same dtype and size (not necessarily shape) as *self*. .. method :: get(ary=None, stream=None, pagelocked=False) Transfer the contents of *self* into *ary* or a newly allocated :mod:`numpy.ndarray`. If *ary* is given, it must have the right size (not necessarily shape) and dtype. If it is not given, a *pagelocked* specifies whether the new array is allocated page-locked. .. method :: get_async(ary=None, stream=None) Transfer the contents of *self* into *ary* or a newly allocated :mod:`numpy.ndarray`. If *ary* is given, it must have the right size (not necessarily shape) and dtype. If it is not given, a page-locked* array is newly allocated. .. method :: copy() .. versionadded :: 2013.1 .. method :: mul_add(self, selffac, other, otherfac, add_timer=None, stream=None): Return `selffac*self + otherfac*other`. *add_timer*, if given, is invoked with the result from :meth:`pycuda.driver.Function.prepared_timed_call`. .. method :: __add__(other) .. method :: __sub__(other) .. method :: __iadd__(other) .. method :: __isub__(other) .. method :: __neg__(other) .. method :: __mul__(other) .. method :: __div__(other) .. method :: __rdiv__(other) .. method :: __pow__(other) .. method :: __abs__() Return a :class:`GPUArray` containing the absolute value of each element of *self*. .. UNDOC reverse() .. method :: fill(scalar, stream=None) Fill the array with *scalar*. .. method :: astype(dtype, stream=None) Return *self*, cast to *dtype*. .. attribute :: real Return the real part of *self*, or *self* if it is real. .. versionadded:: 0.94 .. attribute :: imag Return the imaginary part of *self*, or *zeros_like(self)* if it is real. .. versionadded: 0.94 .. method :: conj() Return the complex conjugate of *self*, or *self* if it is real. .. versionadded: 0.94 .. method:: bind_to_texref(texref, allow_offset=False) Bind *self* to the :class:`pycuda.driver.TextureReference` *texref*. Due to alignment requirements, the effective texture bind address may be different from the requested one by an offset. This method returns this offset in units of *self*'s data type. If *allow_offset* is ``False``, a nonzero value of this offset will cause an exception to be raised. .. note:: It is recommended to use :meth:`bind_to_texref_ext` instead of this method. .. 
method:: bind_to_texref_ext(texref, channels=1, allow_double_hack=False, allow_offset=False) Bind *self* to the :class:`pycuda.driver.TextureReference` *texref*. In addition, set the texture reference's format to match :attr:`dtype` and its channel count to *channels*. This routine also sets the texture reference's :data:`pycuda.driver.TRSF_READ_AS_INTEGER` flag, if necessary. Due to alignment requirements, the effective texture bind address may be different from the requested one by an offset. This method returns this offset in units of *self*'s data type. If *allow_offset* is ``False``, a nonzero value of this offset will cause an exception to be raised. .. versionadded:: 0.93 .. highlight:: c As of this writing, CUDA textures do not natively support double-precision floating point data. To remedy this deficiency, PyCUDA contains a workaround, which can be enabled by passing *True* for allow_double_hack. In this case, use the following code for texture access in your kernel code:: #include texture my_tex; __global__ void f() { ... fp_tex1Dfetch(my_tex, threadIdx.x); ... } .. highlight:: python (This workaround was added in version 0.94.) Constructing :class:`GPUArray` Instances ---------------------------------------- .. function:: to_gpu(ary, allocator=None) Return a :class:`GPUArray` that is an exact copy of the :class:`numpy.ndarray` instance *ary*. See :class:`GPUArray` for the meaning of *allocator*. .. function:: to_gpu_async(ary, allocator=None, stream=None) Return a :class:`GPUArray` that is an exact copy of the :class:`numpy.ndarray` instance *ary*. The copy is done asynchronously, optionally sequenced into *stream*. See :class:`GPUArray` for the meaning of *allocator*. .. function:: empty(shape, dtype, *, allocator=None, order="C") A synonym for the :class:`GPUArray` constructor. .. function:: zeros(shape, dtype, *, allocator=None, order="C") Same as :func:`empty`, but the :class:`GPUArray` is zero-initialized before being returned. .. function:: empty_like(other_ary) Make a new, uninitialized :class:`GPUArray` having the same properties as *other_ary*. .. function:: zeros_like(other_ary) Make a new, zero-initialized :class:`GPUArray` having the same properties as *other_ary*. .. function:: arange(start, stop, step, dtype=None, stream=None) Create a :class:`GPUArray` filled with numbers spaced `step` apart, starting from `start` and ending at `stop`. For floating point arguments, the length of the result is `ceil((stop - start)/step)`. This rule may result in the last element of the result being greater than `stop`. *dtype*, if not specified, is taken as the largest common type of *start*, *stop* and *step*. .. function:: take(a, indices, stream=None) Return the :class:`GPUArray` ``[a[indices[0]], ..., a[indices[n]]]``. For the moment, *a* must be a type that can be bound to a texture. Conditionals ^^^^^^^^^^^^ .. function:: if_positive(criterion, then_, else_, out=None, stream=None) Return an array like *then_*, which, for the element at index *i*, contains *then_[i]* if *criterion[i]>0*, else *else_[i]*. (added in 0.94) .. function:: maximum(a, b, out=None, stream=None) Return the elementwise maximum of *a* and *b*. (added in 0.94) .. function:: minimum(a, b, out=None, stream=None) Return the elementwise minimum of *a* and *b*. (added in 0.94) Reductions ^^^^^^^^^^ .. function:: sum(a, dtype=None, stream=None) .. function:: subset_sum(subset, a, dtype=None, stream=None) .. versionadded:: 2013.1 .. function:: dot(a, b, dtype=None, stream=None) .. 
function:: subset_dot(subset, a, b, dtype=None, stream=None) .. function:: max(a, stream=None) .. function:: min(a, stream=None) .. function:: subset_max(subset, a, stream=None) .. function:: subset_min(subset, a, stream=None) Elementwise Functions on :class:`GPUArrray` Instances ----------------------------------------------------- .. module:: pycuda.cumath The :mod:`pycuda.cumath` module contains elementwise workalikes for the functions contained in :mod:`math`. Rounding and Absolute Value ^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. function:: fabs(array, *, out=None, stream=None) .. function:: ceil(array, *, out=None, stream=None) .. function:: floor(array, *, out=None, stream=None) Exponentials, Logarithms and Roots ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. function:: exp(array, *, out=None, stream=None) .. function:: log(array, *, out=None, stream=None) .. function:: log10(array, *, out=None, stream=None) .. function:: sqrt(array, *, out=None, stream=None) Trigonometric Functions ^^^^^^^^^^^^^^^^^^^^^^^ .. function:: sin(array, *, out=None, stream=None) .. function:: cos(array, *, out=None, stream=None) .. function:: tan(array, *, out=None, stream=None) .. function:: asin(array, *, out=None, stream=None) .. function:: acos(array, *, out=None, stream=None) .. function:: atan(array, *, out=None, stream=None) Hyperbolic Functions ^^^^^^^^^^^^^^^^^^^^ .. function:: sinh(array, *, out=None, stream=None) .. function:: cosh(array, *, out=None, stream=None) .. function:: tanh(array, *, out=None, stream=None) Floating Point Decomposition and Assembly ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. function:: fmod(arg, mod, stream=None) Return the floating point remainder of the division `arg/mod`, for each element in `arg` and `mod`. .. function:: frexp(arg, stream=None) Return a tuple `(significands, exponents)` such that `arg == significand * 2**exponent`. .. function:: ldexp(significand, exponent, stream=None) Return a new array of floating point values composed from the entries of `significand` and `exponent`, paired together as `result = significand * 2**exponent`. .. function:: modf(arg, stream=None) Return a tuple `(fracpart, intpart)` of arrays containing the integer and fractional parts of `arg`. Generating Arrays of Random Numbers ----------------------------------- .. module:: pycuda.curandom .. function:: rand(shape, dtype=numpy.float32, stream=None) Return an array of `shape` filled with random values of `dtype` in the range [0,1). .. note:: The use case for this function is "I need some random numbers. I don't care how good they are or how fast I get them." It uses a pretty terrible MD5-based generator and doesn't even attempt to cache generated code. If you're interested in a non-toy random number generator, use the CURAND-based functionality below. .. warning:: The following classes are using random number generators that run on the GPU. Each thread uses its own generator. Creation of those generators requires more resources than subsequent generation of random numbers. After experiments it looks like maximum number of active generators on Tesla devices (with compute capabilities 1.x) is 256. Fermi devices allow for creating 1024 generators without any problems. If there are troubles with creating objects of class PseudoRandomNumberGenerator or QuasiRandomNumberGenerator decrease number of created generators (and therefore number of active threads). 
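As a rough sketch of how the CURAND-based generator classes documented below
are typically used (this snippet is not part of the PyCUDA examples; it assumes
CUDA 3.2 or newer so that the XORWOW generator is available, and uses
:mod:`pycuda.autoinit` for brevity)::

    import numpy as np
    import pycuda.autoinit  # noqa: creates a context
    from pycuda.curandom import XORWOWRandomNumberGenerator

    gen = XORWOWRandomNumberGenerator()

    # the gen_* methods allocate and return a new GPUArray ...
    u = gen.gen_uniform(10000, np.float32)

    # ... while the corresponding fill_* methods overwrite an existing one
    gen.fill_normal(u)

    print(u.get()[:10])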
A pseudorandom sequence of numbers satisfies most of the statistical properties of a truly random sequence but is generated by a deterministic algorithm. A quasirandom sequence of n-dimensional points is generated by a deterministic algorithm designed to fill an n-dimensional space evenly. Quasirandom numbers are more expensive to generate. .. function:: get_curand_version() Obtain the version of CURAND against which PyCUDA was compiled. Returns a 3-tuple of integers as *(major, minor, revision)*. .. function:: seed_getter_uniform(N) Return an :class:`GPUArray` filled with one random `int32` repeated `N` times which can be used as a seed for XORWOW generator. .. function:: seed_getter_unique(N) Return an :class:`GPUArray` filled with `N` random `int32` which can be used as a seed for XORWOW generator. .. class:: XORWOWRandomNumberGenerator(seed_getter=None, offset=0) :arg seed_getter: a function that, given an integer count, will yield an `int32` :class:`GPUArray` of seeds. :arg offset: Starting index into the XORWOW sequence, given seed. Provides pseudorandom numbers. Generates sequences with period at least :math:`2^190`. CUDA 3.2 and above. .. versionadded:: 2011.1 .. method:: fill_uniform(data, stream=None) Fills in :class:`GPUArray` *data* with uniformly distributed pseudorandom values. .. method:: gen_uniform(shape, dtype, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with uniformly distributed pseudorandom values, and returns newly created object. .. method:: fill_normal(data, stream=None) Fills in :class:`GPUArray` *data* with normally distributed pseudorandom values. .. method:: gen_normal(shape, dtype, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with normally distributed pseudorandom values, and returns newly created object. .. method:: fill_log_normal(data, mean, stddev, stream=None) Fills in :class:`GPUArray` *data* with log-normally distributed pseudorandom values with mean *mean* and standard deviation *stddev*. CUDA 4.0 and above. .. versionadded:: 2012.2 .. method:: gen_log_normal(shape, dtype, mean, stddev, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with log-normally distributed pseudorandom values with mean *mean* and standard deviation *stddev*, and returns newly created object. CUDA 4.0 and above. .. versionadded:: 2012.2 .. method:: fill_poisson(data, lambda_value, stream=None) Fills in :class:`GPUArray` *data* with Poisson distributed pseudorandom values with lambda *lambda_value*. *data* must be of type 32-bit unsigned int. CUDA 5.0 and above. .. versionadded:: 2013.1 .. method:: gen_poisson(shape, dtype, lambda_value, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with Poisson distributed pseudorandom values with lambda *lambda_value*, and returns newly created object. *dtype* must be 32-bit unsigned int. CUDA 5.0 and above. .. versionadded:: 2013.1 .. method:: call_skip_ahead(i, stream=None) Forces all generators to skip i values. Is equivalent to generating i values and discarding results, but is much faster. .. method:: call_skip_ahead_array(i, stream=None) Accepts array i of integer values, telling each generator how many values to skip. .. method:: call_skip_ahead_sequence(i, stream=None) Forces all generators to skip i subsequences. Is equivalent to generating i * :math:`2^67` values and discarding results, but is much faster. .. 
method:: call_skip_ahead_sequence_array(i, stream=None) Accepts array i of integer values, telling each generator how many subsequences to skip. .. class:: MRG32k3aRandomNumberGenerator(seed_getter=None, offset=0) :arg seed_getter: a function that, given an integer count, will yield an `int32` :class:`GPUArray` of seeds. :arg offset: Starting index into the XORWOW sequence, given seed. Provides pseudorandom numbers. Generates sequences with period at least :math:`2^190`. CUDA 4.1 and above. .. versionadded:: 2013.1 .. method:: fill_uniform(data, stream=None) Fills in :class:`GPUArray` *data* with uniformly distributed pseudorandom values. .. method:: gen_uniform(shape, dtype, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with uniformly distributed pseudorandom values, and returns newly created object. .. method:: fill_normal(data, stream=None) Fills in :class:`GPUArray` *data* with normally distributed pseudorandom values. .. method:: gen_normal(shape, dtype, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with normally distributed pseudorandom values, and returns newly created object. .. method:: fill_log_normal(data, mean, stddev, stream=None) Fills in :class:`GPUArray` *data* with log-normally distributed pseudorandom values with mean *mean* and standard deviation *stddev*. .. method:: gen_log_normal(shape, dtype, mean, stddev, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with log-normally distributed pseudorandom values with mean *mean* and standard deviation *stddev*, and returns newly created object. .. method:: fill_poisson(data, lambda_value, stream=None) Fills in :class:`GPUArray` *data* with Poisson distributed pseudorandom values with lambda *lambda_value*. *data* must be of type 32-bit unsigned int. CUDA 5.0 and above. .. versionadded:: 2013.1 .. method:: gen_poisson(shape, dtype, lambda_value, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with Poisson distributed pseudorandom values with lambda *lambda_value*, and returns newly created object. *dtype* must be 32-bit unsigned int. CUDA 5.0 and above. .. versionadded:: 2013.1 .. method:: call_skip_ahead(i, stream=None) Forces all generators to skip i values. Is equivalent to generating i values and discarding results, but is much faster. .. method:: call_skip_ahead_array(i, stream=None) Accepts array i of integer values, telling each generator how many values to skip. .. method:: call_skip_ahead_sequence(i, stream=None) Forces all generators to skip i subsequences. Is equivalent to generating i * :math:`2^67` values and discarding results, but is much faster. .. method:: call_skip_ahead_sequence_array(i, stream=None) Accepts array i of integer values, telling each generator how many subsequences to skip. .. function:: generate_direction_vectors(count, direction=direction_vector_set.VECTOR_32) Return an :class:`GPUArray` `count` filled with direction vectors used to initialize Sobol generators. .. function:: generate_scramble_constants32(count) Return a :class:`GPUArray` filled with `count' 32-bit unsigned integer numbers used to initialize :class:`ScrambledSobol32RandomNumberGenerator` .. function:: generate_scramble_constants64(count) Return a :class:`GPUArray` filled with `count' 64-bit unsigned integer numbers used to initialize :class:`ScrambledSobol64RandomNumberGenerator` .. 
class:: Sobol32RandomNumberGenerator(dir_vector=None, offset=0) :arg dir_vector: a :class:`GPUArray` of 32-element `int32` vectors which are used to initialize quasirandom generator; it must contain one vector for each initialized generator :arg offset: Starting index into the Sobol32 sequence, given direction vector. Provides quasirandom numbers. Generates sequences with period of :math:`2^32`. CUDA 3.2 and above. .. versionadded:: 2011.1 .. method:: fill_uniform(data, stream=None) Fills in :class:`GPUArray` *data* with uniformly distributed quasirandom values. .. method:: gen_uniform(shape, dtype, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with uniformly distributed pseudorandom values, and returns newly created object. .. method:: fill_normal(data, stream=None) Fills in :class:`GPUArray` *data* with normally distributed quasirandom values. .. method:: gen_normal(shape, dtype, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with normally distributed pseudorandom values, and returns newly created object. .. method:: fill_log_normal(data, mean, stddev, stream=None) Fills in :class:`GPUArray` *data* with log-normally distributed pseudorandom values with mean *mean* and standard deviation *stddev*. CUDA 4.0 and above. .. versionadded:: 2012.2 .. method:: gen_log_normal(shape, dtype, mean, stddev, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with log-normally distributed pseudorandom values with mean *mean* and standard deviation *stddev*, and returns newly created object. CUDA 4.0 and above. .. versionadded:: 2012.2 .. method:: fill_poisson(data, lambda_value, stream=None) Fills in :class:`GPUArray` *data* with Poisson distributed pseudorandom values with lambda *lambda_value*. *data* must be of type 32-bit unsigned int. CUDA 5.0 and above. .. versionadded:: 2013.1 .. method:: gen_poisson(shape, dtype, lambda_value, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with Poisson distributed pseudorandom values with lambda *lambda_value*, and returns newly created object. *dtype* must be 32-bit unsigned int. CUDA 5.0 and above. .. versionadded:: 2013.1 .. method:: call_skip_ahead(i, stream=None) Forces all generators to skip i values. Is equivalent to generating i values and discarding results, but is much faster. .. method:: call_skip_ahead_array(i, stream=None) Accepts array i of integer values, telling each generator how many values to skip. .. class:: ScrambledSobol32RandomNumberGenerator(dir_vector=None, scramble_vector=None, offset=0) :arg dir_vector: a :class:`GPUArray` of 32-element `uint32` vectors which are used to initialize quasirandom generator; it must contain one vector for each initialized generator :arg scramble_vector: a :class:`GPUArray` of `uint32` elements which are used to initialize quasirandom generator; it must contain one number for each initialized generator :arg offset: Starting index into the Sobol32 sequence, given direction vector. Provides quasirandom numbers. Generates sequences with period of :math:`2^32`. CUDA 4.0 and above. .. versionadded:: 2011.1 .. method:: fill_uniform(data, stream=None) Fills in :class:`GPUArray` *data* with uniformly distributed quasirandom values. .. 
method:: gen_uniform(shape, dtype, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with uniformly distributed pseudorandom values, and returns newly created object. .. method:: fill_normal(data, stream=None) Fills in :class:`GPUArray` *data* with normally distributed quasirandom values. .. method:: gen_normal(shape, dtype, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with normally distributed pseudorandom values, and returns newly created object. .. method:: fill_log_normal(data, mean, stddev, stream=None) Fills in :class:`GPUArray` *data* with log-normally distributed pseudorandom values with mean *mean* and standard deviation *stddev*. CUDA 4.0 and above. .. versionadded:: 2012.2 .. method:: gen_log_normal(shape, dtype, mean, stddev, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with log-normally distributed pseudorandom values with mean *mean* and standard deviation *stddev*, and returns newly created object. CUDA 4.0 and above. .. versionadded:: 2012.2 .. method:: fill_poisson(data, lambda_value, stream=None) Fills in :class:`GPUArray` *data* with Poisson distributed pseudorandom values with lambda *lambda_value*. *data* must be of type 32-bit unsigned int. CUDA 5.0 and above. .. versionadded:: 2013.1 .. method:: gen_poisson(shape, dtype, lambda_value, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with Poisson distributed pseudorandom values with lambda *lambda_value*, and returns newly created object. *dtype* must be 32-bit unsigned int. CUDA 5.0 and above. .. versionadded:: 2013.1 .. method:: call_skip_ahead(i, stream=None) Forces all generators to skip i values. Is equivalent to generating i values and discarding results, but is much faster. .. method:: call_skip_ahead_array(i, stream=None) Accepts array i of integer values, telling each generator how many values to skip. .. class:: Sobol64RandomNumberGenerator(dir_vector=None, offset=0) :arg dir_vector: a :class:`GPUArray` of 64-element `uint64` vectors which are used to initialize quasirandom generator; it must contain one vector for each initialized generator :arg offset: Starting index into the Sobol64 sequence, given direction vector. Provides quasirandom numbers. Generates sequences with period of :math:`2^64`. CUDA 4.0 and above. .. versionadded:: 2011.1 .. method:: fill_uniform(data, stream=None) Fills in :class:`GPUArray` *data* with uniformly distributed quasirandom values. .. method:: gen_uniform(shape, dtype, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with uniformly distributed pseudorandom values, and returns newly created object. .. method:: fill_normal(data, stream=None) Fills in :class:`GPUArray` *data* with normally distributed quasirandom values. .. method:: gen_normal(shape, dtype, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with normally distributed pseudorandom values, and returns newly created object. .. method:: fill_log_normal(data, mean, stddev, stream=None) Fills in :class:`GPUArray` *data* with log-normally distributed pseudorandom values with mean *mean* and standard deviation *stddev*. CUDA 4.0 and above. .. versionadded:: 2012.2 .. 
method:: gen_log_normal(shape, dtype, mean, stddev, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with log-normally distributed pseudorandom values with mean *mean* and standard deviation *stddev*, and returns newly created object. CUDA 4.0 and above. .. versionadded:: 2012.2 .. method:: fill_poisson(data, lambda_value, stream=None) Fills in :class:`GPUArray` *data* with Poisson distributed pseudorandom values with lambda *lambda_value*. *data* must be of type 32-bit unsigned int. CUDA 5.0 and above. .. versionadded:: 2013.1 .. method:: gen_poisson(shape, dtype, lambda_value, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with Poisson distributed pseudorandom values with lambda *lambda_value*, and returns newly created object. *dtype* must be 32-bit unsigned int. CUDA 5.0 and above. .. versionadded:: 2013.1 .. method:: call_skip_ahead(i, stream=None) Forces all generators to skip i values. Is equivalent to generating i values and discarding results, but is much faster. .. method:: call_skip_ahead_array(i, stream=None) Accepts array i of integer values, telling each generator how many values to skip. .. class:: ScrambledSobol64RandomNumberGenerator(dir_vector=None, scramble_vector=None, offset=0) :arg dir_vector: a :class:`GPUArray` of 64-element `uint64` vectors which are used to initialize quasirandom generator; it must contain one vector for each initialized generator :arg scramble_vector: a :class:`GPUArray` of `uint64` vectors which are used to initialize quasirandom generator; it must contain one vector for each initialized generator :arg offset: Starting index into the ScrambledSobol64 sequence, given direction vector. Provides quasirandom numbers. Generates sequences with period of :math:`2^64`. CUDA 4.0 and above. .. versionadded:: 2011.1 .. method:: fill_uniform(data, stream=None) Fills in :class:`GPUArray` *data* with uniformly distributed quasirandom values. .. method:: gen_uniform(shape, dtype, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with uniformly distributed pseudorandom values, and returns newly created object. .. method:: fill_normal(data, stream=None) Fills in :class:`GPUArray` *data* with normally distributed quasirandom values. .. method:: gen_normal(shape, dtype, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with normally distributed pseudorandom values, and returns newly created object. .. method:: fill_log_normal(data, mean, stddev, stream=None) Fills in :class:`GPUArray` *data* with log-normally distributed pseudorandom values with mean *mean* and standard deviation *stddev*. CUDA 4.0 and above. .. versionadded:: 2012.2 .. method:: gen_log_normal(shape, dtype, mean, stddev, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with log-normally distributed pseudorandom values with mean *mean* and standard deviation *stddev*, and returns newly created object. CUDA 4.0 and above. .. versionadded:: 2012.2 .. method:: fill_poisson(data, lambda_value, stream=None) Fills in :class:`GPUArray` *data* with Poisson distributed pseudorandom values with lambda *lambda_value*. *data* must be of type 32-bit unsigned int. CUDA 5.0 and above. .. versionadded:: 2013.1 .. 
method:: gen_poisson(shape, dtype, lambda_value, stream=None) Creates object of :class:`GPUArray` with given *shape* and *dtype*, fills it in with Poisson distributed pseudorandom values with lambda *lambda_value*, and returns newly created object. *dtype* must be 32-bit unsigned int. CUDA 5.0 and above. .. versionadded:: 2013.1 .. method:: call_skip_ahead(i, stream=None) Forces all generators to skip i values. Is equivalent to generating i values and discarding results, but is much faster. .. method:: call_skip_ahead_array(i, stream=None) Accepts array i of integer values, telling each generator how many values to skip. Single-pass Custom Expression Evaluation ---------------------------------------- .. module:: pycuda.elementwise Evaluating involved expressions on :class:`GPUArray` instances can be somewhat inefficient, because a new temporary is created for each intermediate result. The functionality in the module :mod:`pycuda.elementwise` contains tools to help generate kernels that evaluate multi-stage expressions on one or several operands in a single pass. .. class:: ElementwiseKernel(arguments, operation, name="kernel", keep=False, options=[], preamble="") Generate a kernel that takes a number of scalar or vector *arguments* and performs the scalar *operation* on each entry of its arguments, if that argument is a vector. *arguments* is specified as a string formatted as a C argument list. *operation* is specified as a C assignment statement, without a semicolon. Vectors in *operation* should be indexed by the variable *i*. *name* specifies the name as which the kernel is compiled, *keep* and *options* are passed unmodified to :class:`pycuda.compiler.SourceModule`. *preamble* specifies some source code that is included before the elementwise kernel specification. You may use this to include other files and/or define functions that are used by *operation*. .. method:: __call__(*args, range=None, slice=None) Invoke the generated scalar kernel. The arguments may either be scalars or :class:`GPUArray` instances. If *range* is given, it must be a :class:`slice` object and specifies the range of indices *i* for which the *operation* is carried out. If *slice* is given, it must be a :class:`slice` object and specifies the range of indices *i* for which the *operation* is carried out, truncated to the container. Also, *slice* may contain negative indices to index relative to the end of the array. If *stream* is given, it must be a :class:`pycuda.driver.Stream` object, where the execution will be serialized. Here's a usage example:: import pycuda.gpuarray as gpuarray import pycuda.driver as cuda import pycuda.autoinit import numpy from pycuda.curandom import rand as curand a_gpu = curand((50,)) b_gpu = curand((50,)) from pycuda.elementwise import ElementwiseKernel lin_comb = ElementwiseKernel( "float a, float *x, float b, float *y, float *z", "z[i] = a*x[i] + b*y[i]", "linear_combination") c_gpu = gpuarray.empty_like(a_gpu) lin_comb(5, a_gpu, 6, b_gpu, c_gpu) import numpy.linalg as la assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5 (You can find this example as :file:`examples/demo_elementwise.py` in the PyCuda distribution.) Custom Reductions ----------------- .. module:: pycuda.reduction .. 
class:: ReductionKernel(dtype_out, neutral, reduce_expr, map_expr=None, arguments=None, name="reduce_kernel", keep=False, options=[], preamble="") Generate a kernel that takes a number of scalar or vector *arguments* (at least one vector argument), performs the *map_expr* on each entry of the vector argument and then the *reduce_expr* on the outcome of that. *neutral* serves as an initial value. *preamble* offers the possibility to add preprocessor directives and other code (such as helper functions) to be added before the actual reduction kernel code. Vectors in *map_expr* should be indexed by the variable *i*. *reduce_expr* uses the formal values "a" and "b" to indicate two operands of a binary reduction operation. If you do not specify a *map_expr*, "in[i]" -- and therefore the presence of only one input argument -- is automatically assumed. *dtype_out* specifies the :class:`numpy.dtype` in which the reduction is performed and in which the result is returned. *neutral* is specified as float or integer formatted as string. *reduce_expr* and *map_expr* are specified as string formatted operations and *arguments* is specified as a string formatted as a C argument list. *name* specifies the name as which the kernel is compiled, *keep* and *options* are passed unmodified to :class:`pycuda.compiler.SourceModule`. *preamble* is specified as a string of code. .. method __call__(*args, stream=None) Here's a usage example:: a = gpuarray.arange(400, dtype=numpy.float32) b = gpuarray.arange(400, dtype=numpy.float32) krnl = ReductionKernel(numpy.float32, neutral="0", reduce_expr="a+b", map_expr="x[i]*y[i]", arguments="float *x, float *y") my_dot_prod = krnl(a, b).get() Parallel Scan / Prefix Sum -------------------------- .. module:: pycuda.scan .. class:: ExclusiveScanKernel(dtype, scan_expr, neutral, name_prefix="scan", options=[], preamble="") Generates a kernel that can compute a `prefix sum `_ using any associative operation given as *scan_expr*. *scan_expr* uses the formal values "a" and "b" to indicate two operands of an associative binary operation. *neutral* is the neutral element of *scan_expr*, obeying *scan_expr(a, neutral) == a*. *dtype* specifies the type of the arrays being operated on. *name_prefix* is used for kernel names to ensure recognizability in profiles and logs. *options* is a list of compiler options to use when building. *preamble* specifies a string of code that is inserted before the actual kernels. .. method:: __call__(self, input_ary, output_ary=None, allocator=None, queue=None) .. class:: InclusiveScanKernel(dtype, scan_expr, neutral=None, name_prefix="scan", options=[], preamble="", devices=None) Works like :class:`ExclusiveScanKernel`. Unlike the exclusive case, *neutral* is not required. Here's a usage example:: knl = InclusiveScanKernel(np.int32, "a+b") n = 2**20-2**18+5 host_data = np.random.randint(0, 10, n).astype(np.int32) dev_data = gpuarray.to_gpu(queue, host_data) knl(dev_data) assert (dev_data.get() == np.cumsum(host_data, axis=0)).all() Custom data types in Reduction and Scan --------------------------------------- If you would like to use your own (struct/union/whatever) data types in scan and reduction, define those types in the *preamble* and let PyCUDA know about them using this function: .. function:: pycuda.tools.register_dtype(dtype, name) *dtype* is a :func:`numpy.dtype`. .. 
versionadded: 2011.2 GPGPU Algorithms ---------------- Bogdan Opanchuk's `reikna `_ offers a variety of GPU-based algorithms (FFT, RNG, matrix multiplication) designed to work with :class:`pycuda.gpuarray.GPUArray` objects. pycuda-2013.1.1+git20140310/doc/source/metaprog.rst0000644000175000000500000001405212313360364017746 0ustar tomussrc.. _metaprog: Metaprogramming =============== In 'conventional' programming, one writes a program that accomplishes a task. In *metaprogramming*, one writes a program *that writes a program* that accomplishes a task. That sounds pretty complicated--so first of all, we'll look at why it may be a good idea nonetheless. Why Metaprogramming? -------------------- Automated Tuning ^^^^^^^^^^^^^^^^ A sizable part of a CUDA programmer's time is typically spent tuning code. This tuning answers questions like: * What's the optimal number of threads per block? * How much data should I work on at once? * What data should be loaded into shared memory, and how big should the corresponding blocks be? If you are lucky, you'll be able to find a pattern in the execution time of your code and come up with a heuristic that will allow you to reliably pick the fastest version. Unfortunately, this heuristic may become unreliable or even fail entirely with new hardware generations. The solution to this problem that PyCUDA tries to promote is: Forget heuristics. Benchmark at run time and use whatever works fastest. This is an important advantage of PyCUDA over the CUDA runtime API: It lets you make these decisions *while your code is running*. A number of prominent computing packages make use of a similar technique, among them ATLAS and FFTW. And while those require rather complicated optimization driver routines, you can drive PyCUDA from the comfort of Python. Data Types ^^^^^^^^^^ Your code may have to deal with different data types at run time. It may, for example, have to work on both single and double precision floating point numbers. You could just precompile versions for both, but why? Just generate whatever code is needed right *when* it is needed. Specialize Code for the Given Problem ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ If you are writing a library, then your users will ask your library to perform a number of tasks. Imagine how liberating it would be if you could generate code purposely for the problem you're being asked to solve, instead of having to keep code unnecessarily generic and thereby slow. PyCUDA makes this a reality. Constants are Faster than Variables ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ If your problem sizes vary from run to run, but you perform a larger number of kernel invocations on data of identical size, you may want to consider compiling data size into your code as a constant. This can have significant performance benefits, resulting mainly from decreased fetch times and less register pressure. In particular, multiplications by constants are much more efficiently carried out than general variable-variable multiplications. Loop Unrolling ^^^^^^^^^^^^^^ The CUDA programming guide says great things about :command:`nvcc` and how it will unroll loops for you. As of Version 2.1, that's simply not true, and ``#pragma unroll`` is simply a no-op, at least according to my experience. With metaprogramming, you can dynamically unroll your loops to the needed size in Python. 
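The following sketch (not one of the shipped examples; the kernel and the sizes
are made up for the occasion) bakes both the per-block data size and the unroll
factor into the kernel source with nothing more than Python string
formatting::

    import numpy as np
    import pycuda.autoinit  # noqa: creates a context
    import pycuda.gpuarray as gpuarray
    from pycuda.compiler import SourceModule

    block_size = 256   # threads per block
    unroll = 4         # elements per thread, fixed at code-generation time

    body = "\n        ".join(
        "dest[idx + %d] = 2*src[idx + %d];" % (i*block_size, i*block_size)
        for i in range(unroll))

    mod = SourceModule("""
    __global__ void double_them(float *dest, float *src)
    {
        int idx = threadIdx.x + %(per_block)d * blockIdx.x;
        %(body)s
    }
    """ % {"per_block": block_size*unroll, "body": body})

    double_them = mod.get_function("double_them")

    n_blocks = 64
    src = gpuarray.arange(block_size*unroll*n_blocks, dtype=np.float32)
    dest = gpuarray.empty_like(src)
    double_them(dest, src, block=(block_size, 1, 1), grid=(n_blocks, 1))

    assert (dest.get() == 2*src.get()).all()

Because ``%(per_block)d`` ends up as a literal constant in the generated
source, the multiplication by it is exactly the kind of constant multiply
described above, and the four generated assignments form a loop that has been
unrolled in Python rather than by :command:`nvcc`.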
Metaprogramming using a Templating Engine ----------------------------------------- If your metaprogramming needs are rather simple, perhaps the easiest way to generate code at run time is through a templating engine. Many templating engines for Python exist, two of the most prominent ones are `Jinja 2 `_ and `Cheetah `_. The following is a simple metaprogram that performs vector addition on configurable block sizes. It illustrates the templating-based metaprogramming technique:: from jinja2 import Template tpl = Template(""" __global__ void add( {{ type_name }} *tgt, {{ type_name }} *op1, {{ type_name }} *op2) { int idx = threadIdx.x + {{ thread_block_size }} * {{block_size}} * blockIdx.x; {% for i in range(block_size) %} {% set offset = i*thread_block_size %} tgt[idx + {{ offset }}] = op1[idx + {{ offset }}] + op2[idx + {{ offset }}]; {% endfor %} }""") rendered_tpl = tpl.render( type_name="float", block_size=block_size, thread_block_size=thread_block_size) mod = SourceModule(rendered_tpl) This snippet in a working context can be found in :file:`examples/demo_meta_template.py`. You can also find an example of matrix multiplication optimization using template metaprogramming with Cheetah in :file:`demo_meta_matrixmul_cheetah.py` and :file:`demo_meta_matrixmul_cheetah.template.cu`. Metaprogramming using :mod:`codepy` ----------------------------------- For more complicated metaprograms, it may be desirable to have more programmatic control over the assembly of the source code than a templating engine can provide. The :mod:`codepy` package provides a means of generating CUDA source code from a Python data structure. The following example demonstrates the use of :mod:`codepy` for metaprogramming. It accomplishes exactly the same as the above program:: from codepy.cgen import FunctionBody, \ FunctionDeclaration, Typedef, POD, Value, \ Pointer, Module, Block, Initializer, Assign from codepy.cgen.cuda import CudaGlobal mod = Module([ FunctionBody( CudaGlobal(FunctionDeclaration( Value("void", "add"), arg_decls=[Pointer(POD(dtype, name)) for name in ["tgt", "op1", "op2"]])), Block([ Initializer( POD(numpy.int32, "idx"), "threadIdx.x + %d*blockIdx.x" % (thread_block_size*block_size)), ]+[ Assign( "tgt[idx+%d]" % (o*thread_block_size), "op1[idx+%d] + op2[idx+%d]" % ( o*thread_block_size, o*thread_block_size)) for o in range(block_size)]))]) mod = SourceModule(mod) This snippet in a working context can be found in :file:`examples/demo_meta_codepy.py`. pycuda-2013.1.1+git20140310/doc/source/tutorial.rst0000644000175000000500000001541112313360364017773 0ustar tomussrcTutorial ======== Getting started --------------- Before you can use PyCuda, you have to import and initialize it:: import pycuda.driver as cuda import pycuda.autoinit from pycuda.compiler import SourceModule Note that you do not *have* to use :mod:`pycuda.autoinit`-- initialization, context creation, and cleanup can also be performed manually, if desired. Transferring Data ----------------- The next step in most programs is to transfer data onto the device. In PyCuda, you will mostly transfer data from :mod:`numpy` arrays on the host. (But indeed, everything that satisfies the Python buffer interface will work, even a :class:`str`.) 
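For instance (an illustrative aside, not part of the original tutorial), a
plain :class:`bytearray` can be shipped to the device just as well, since it
exposes the buffer interface::

    import numpy
    import pycuda.autoinit
    import pycuda.driver as cuda

    payload = bytearray(b"\x01\x02\x03\x04" * 4)   # any buffer-interface object
    dev_buf = cuda.mem_alloc(len(payload))
    cuda.memcpy_htod(dev_buf, payload)

    # copy it back into a numpy array to verify the transfer
    back = numpy.empty(len(payload), dtype=numpy.uint8)
    cuda.memcpy_dtoh(back, dev_buf)
    assert (back == numpy.frombuffer(payload, dtype=numpy.uint8)).all()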
Let's make a 4x4 array of random numbers:: import numpy a = numpy.random.randn(4,4) But wait--*a* consists of double precision numbers, but most nVidia devices only support single precision:: a = a.astype(numpy.float32) Finally, we need somewhere to transfer data to, so we need to allocate memory on the device:: a_gpu = cuda.mem_alloc(a.nbytes) As a last step, we need to transfer the data to the GPU:: cuda.memcpy_htod(a_gpu, a) Executing a Kernel ------------------ For this tutorial, we'll stick to something simple: We will write code to double each entry in *a_gpu*. To this end, we write the corresponding CUDA C code, and feed it into the constructor of a :class:`pycuda.compiler.SourceModule`:: mod = SourceModule(""" __global__ void doublify(float *a) { int idx = threadIdx.x + threadIdx.y*4; a[idx] *= 2; } """) If there aren't any errors, the code is now compiled and loaded onto the device. We find a reference to our :class:`pycuda.driver.Function` and call it, specifying *a_gpu* as the argument, and a block size of 4x4:: func = mod.get_function("doublify") func(a_gpu, block=(4,4,1)) Finally, we fetch the data back from the GPU and display it, together with the original *a*:: a_doubled = numpy.empty_like(a) cuda.memcpy_dtoh(a_doubled, a_gpu) print a_doubled print a This will print something like this:: [[ 0.51360393 1.40589952 2.25009012 3.02563429] [-0.75841576 -1.18757617 2.72269917 3.12156057] [ 0.28826082 -2.92448163 1.21624792 2.86353827] [ 1.57651746 0.63500965 2.21570683 -0.44537592]] [[ 0.25680196 0.70294976 1.12504506 1.51281714] [-0.37920788 -0.59378809 1.36134958 1.56078029] [ 0.14413041 -1.46224082 0.60812396 1.43176913] [ 0.78825873 0.31750482 1.10785341 -0.22268796]] It worked! That completes our walkthrough. Thankfully, PyCuda takes over from here and does all the cleanup for you, so you're done. Stick around for some bonus material in the next section, though. (You can find the code for this demo as :file:`examples/demo.py` in the PyCuda source distribution.) Shortcuts for Explicit Memory Copies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The :class:`pycuda.driver.In`, :class:`pycuda.driver.Out`, and :class:`pycuda.driver.InOut` argument handlers can simplify some of the memory transfers. For example, instead of creating *a_gpu*, if replacing *a* is fine, the following code can be used:: func(cuda.InOut(a), block=(4, 4, 1)) Prepared Invocations ^^^^^^^^^^^^^^^^^^^^ Function invocation using the built-in :meth:`pycuda.driver.Function.__call__` method incurs overhead for type identification (see :ref:`reference-doc`). To achieve the same effect as above without this overhead, the function is bound to argument types (as designated by Python's standard library :mod:`struct` module), and then called. 
This also avoids having to assign explicit argument sizes using the `numpy.number` classes:: func.prepare("P", block=(4,4,1)) func.prepared_call((1, 1), a_gpu) Bonus: Abstracting Away the Complications ----------------------------------------- Using a :class:`pycuda.gpuarray.GPUArray`, the same effect can be achieved with much less writing:: import pycuda.gpuarray as gpuarray import pycuda.driver as cuda import pycuda.autoinit import numpy a_gpu = gpuarray.to_gpu(numpy.random.randn(4,4).astype(numpy.float32)) a_doubled = (2*a_gpu).get() print a_doubled print a_gpu Advanced Topics --------------- Structures ^^^^^^^^^^ (contributed by Nicholas Tung, find the code in :file:`examples/demo_struct.py`) Suppose we have the following structure, for doubling a number of variable length arrays:: mod = SourceModule(""" struct DoubleOperation { int datalen, __padding; // so 64-bit ptrs can be aligned float *ptr; }; __global__ void double_array(DoubleOperation *a) { a = &a[blockIdx.x]; for (int idx = threadIdx.x; idx < a->datalen; idx += blockDim.x) { a->ptr[idx] *= 2; } } """) Each block in the grid (see CUDA documentation) will double one of the arrays. The `for` loop allows for more data elements than threads to be doubled, though is not efficient if one can guarantee that there will be a sufficient number of threads. Next, a wrapper class for the structure is created, and two arrays are instantiated:: class DoubleOpStruct: mem_size = 8 + numpy.intp(0).nbytes def __init__(self, array, struct_arr_ptr): self.data = cuda.to_device(array) self.shape, self.dtype = array.shape, array.dtype cuda.memcpy_htod(int(struct_arr_ptr), numpy.int32(array.size)) cuda.memcpy_htod(int(struct_arr_ptr) + 8, numpy.intp(int(self.data))) def __str__(self): return str(cuda.from_device(self.data, self.shape, self.dtype)) struct_arr = cuda.mem_alloc(2 * DoubleOpStruct.mem_size) do2_ptr = int(struct_arr) + DoubleOpStruct.mem_size array1 = DoubleOpStruct(numpy.array([1, 2, 3], dtype=numpy.float32), struct_arr) array2 = DoubleOpStruct(numpy.array([0, 4], dtype=numpy.float32), do2_ptr) print("original arrays", array1, array2) This code uses the :func:`pycuda.driver.to_device` and :func:`pycuda.driver.from_device` functions to allocate and copy values, and demonstrates how offsets to an allocated block of memory can be used. Finally, the code can be executed; the following demonstrates doubling both arrays, then only the second:: func = mod.get_function("double_array") func(struct_arr, block = (32, 1, 1), grid=(2, 1)) print("doubled arrays", array1, array2) func(numpy.intp(do2_ptr), block = (32, 1, 1), grid=(1, 1)) print("doubled second only", array1, array2, "\n") Where to go from here --------------------- Once you feel sufficiently familiar with the basics, feel free to dig into the :ref:`reference-doc`. For more examples, check the in the :file:`examples/` subdirectory of the distribution. This folder also contains several benchmarks to see the difference between GPU and CPU based calculations. As a reference for how stuff is done, PyCuda's test suite in the :file:`test/` subdirectory of the distribution may also be of help. pycuda-2013.1.1+git20140310/doc/source/gl.rst0000644000175000000500000000702312313360364016532 0ustar tomussrc.. _gl-interop: OpenGL ====== .. module:: pycuda.gl .. function :: make_context(dev, flags=0) Create and return a :class:`pycuda.driver.Context` that has GL interoperability enabled. .. 
warning :: This will fail with a rather unhelpful error message if you don't already have a GL context created and active. .. class :: graphics_map_flags Usage of OpenGL object from CUDA. .. attribute :: NONE Read and write access to mapped OpenGL object from CUDA code. .. attribute :: READ_ONLY Read only access to mapped OpenGL object from CUDA code. .. attribute :: WRITE_DISCARD Write only access to mapped OpenGL object from CUDA code. Reading is prohibited. .. class :: RegisteredBuffer(bufobj, flags = CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE) Object managing mapping of OpenGL buffers to CUDA. Cannot be used to map images. .. method :: gl_handle() .. method :: unregister() .. method :: map(stream=None) Return a :class:`RegisteredMapping`. .. class :: RegisteredImage(bufobj, target, flags = CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE) Object managing mapping of OpenGL textures and render buffers to CUDA. *target* must be be one of: * `GL_TEXTURE_2D` * `GL_TEXTURE_RECTANGLE` * `GL_TEXTURE_CUBE_MAP` * `GL_TEXTURE_3D` * `GL_TEXTURE_2D_ARRAY` * `GL_RENDERBUFFER` (see PyOpenGL docs) .. method :: gl_handle() .. method :: unregister() .. method :: map(stream=None) Return a :class:`RegisteredMapping`. .. class :: RegisteredMapping .. method :: unmap(stream=None) If no stream is specified, the unmap will use the same stream as the original mapping. .. method :: device_ptr_and_size() Return a tuple *(dev_pointer, size)*. .. versionadded: 2011.1 .. method :: array(index, level) Return an array for mapped image object for given array index and MIP level. Automatic Initialization ------------------------ .. module:: pycuda.gl.autoinit .. warning :: Importing :mod:`pycuda.gl.autoinit` will fail with a rather unhelpful error message if you don't already have a GL context created and active. .. data:: device .. data:: context Old-style (pre-CUDA 3.0) API ---------------------------- .. function :: init() Enable GL interoperability for the already-created (so far non-GL) and currently active :class:`pycuda.driver.Context`. According to the forum post referenced in the note below, this will succeed on Windows XP and Linux, but it will not work on Windows Vista. There you *have* to create the GL-enabled context using :func:`make_context`. .. warning :: This function is deprecated since CUDA 3.0 and PyCUDA 2011.1. .. warning :: This will fail with a rather unhelpful error message if you don't already have a GL context created and active. .. note :: See this `post `_ on the Nvidia forums for a discussion of problems and solutions with the GL interop interface. .. class :: BufferObject(bufobj) .. warning :: This class is deprecated since CUDA 3.0 and PyCUDA 2011.1. .. method :: unregister() .. attribute :: handle() .. method :: map() .. class :: BufferObjectMapping .. warning :: This class is deprecated since CUDA 3.0 and PyCUDA 2011.1. It will be removed in PyCUDA 0.96. .. method :: unmap() .. method :: device_ptr() .. method :: size() pycuda-2013.1.1+git20140310/doc/source/misc.rst0000644000175000000500000004043512313360364017067 0ustar tomussrcChanges ======= Version 2014.1 -------------- .. note:: This version is the current development version. You can get it from `PyCUDA's version control repository `_. * Add :meth:`PointerHolderBase.as_buffer` and :meth:`DeviceAllocation.as_buffer`. * Support for :class:`device_attribute` values added in CUDA 5.0, 5.5, and 6.0. * Support for :ref:`managed_memory`. 
(contributed by Stan Seibert) Version 2013.1.1 ---------------- * Windows fix for PyCUDA on Python 3 (Thanks, Christoph Gohlke) Version 2013.1 -------------- * Python 3 support (large parts contributed by Tomasz Rybak) * Add :meth:`pycuda.gpuarray.GPUArray.__getitem__`, supporting generic slicing. It is *possible* to create non-contiguous arrays using this functionality. Most operations (elementwise etc.) will not work on such arrays. * More generators in :mod:`pycuda.curandom`. (contributed by Tomasz Rybak) * Many bug fixes .. note:: The addition of :meth:`pyopencl.array.Array.__getitem__` has an unintended consequence due to `numpy bug 3375 `_. For instance, this expression:: numpy.float32(5) * some_gpu_array may take a very long time to execute. This is because :mod:`numpy` first builds an object array of (compute-device) scalars (!) before it decides that that's probably not such a bright idea and finally calls :meth:`pycuda.gpuarray.GPUArray.__rmul__`. Note that only left arithmetic operations of :class:`pycuda.gpuarray.GPUArray` by :mod:`numpy` scalars are affected. Python's number types (:class:`float` etc.) are unaffected, as are right multiplications. If a program that used to run fast suddenly runs extremely slowly, it is likely that this bug is to blame. Here's what you can do: * Use Python scalars instead of :mod:`numpy` scalars. * Switch to right multiplications if possible. * Use a patched :mod:`numpy`. See the bug report linked above for a pull request with a fix. * Switch to a fixed version of :mod:`numpy` when available. Version 2012.1 -------------- * Numerous bug fixes. (including shipped-boost compilation on gcc 4.7) Version 2011.2 -------------- * Fix a memory leak when using pagelocked memory. (reported by Paul Cazeaux) * Fix complex scalar argument passing. * Fix :func:`pycuda.gpuarray.zeros` when used on complex arrays. * Add :func:`pycuda.tools.register_dtype` to enable scan/reduction on struct types. * More improvements to CURAND. * Add support for CUDA 4.1. Version 2011.1.2 ---------------- * Various fixes. Version 2011.1.1 ---------------- * Various fixes. Version 2011.1 -------------- When you update code to run on this version of PyCUDA, please make sure to have deprecation warnings enabled, so that you know when your code needs updating. (See `the Python docs `_. Caution: As of Python 2.7, deprecation warnings are disabled by default.) * Add support for CUDA 3.0-style OpenGL interop. (thanks to Tomasz Rybak) * Add :meth:`pycuda.driver.Stream.wait_for_event`. * Add *range* and *slice* keyword argument to :meth:`pycuda.elementwise.ElementwiseKernel.__call__`. * Document *preamble* constructor keyword argument to :class:`pycuda.elementwise.ElementwiseKernel`. * Add vector types, see :class:`pycuda.gpuarray.vec`. * Add :mod:`pycuda.scan`. * Add support for new features in CUDA 4.0. * Add :attr:`pycuda.gpuarray.GPUArray.strides`, :attr:`pycuda.gpuarray.GPUArray.flags`. Allow the creation of arrys in C and Fortran order. * Adopt stateless launch interface from CUDA, deprecate old one. * Add CURAND wrapper. (with work by Tomasz Rybak) * Add :data:`pycuda.compiler.DEFAULT_NVCC_FLAGS`. Version 0.94.2 -------------- * Fix the pesky Fermi reduction bug. (thanks to Tomasz Rybak) Version 0.94.1 -------------- * Support for CUDA debugging. (see `FAQ `_ for details.) Version 0.94 ------------ * Support for CUDA 3.0. (but not CUDA 3.0 beta!) Search for "CUDA 3.0" in :ref:`reference-doc` to see what's new. * Support for CUDA 3.1 beta. 
Search for "CUDA 3.1" in :ref:`reference-doc` to see what's new. * Support for CUDA 3.2 RC. Search for "CUDA 3.2" in :ref:`reference-doc` to see what's new. * Add sparse matrix-vector multiplication and linear system solving code, in :mod:`pycuda.sparse`. * Add :func:`pycuda.gpuarray.if_positive`, :func:`pycuda.gpuarray.maximum`, :func:`pycuda.gpuarray.minimum`. * Deprecate :func:`pycuda.tools.get_default_device` * Add :func:`pycuda.tools.make_default_context`. * Use :func:`pycuda.tools.make_default_context` in :mod:`pycuda.autoinit`, which changes its behavior. * Remove previously deprecated features: * :attr:`pycuda.driver.Function.registers`, :attr:`pycuda.driver.Function.lmem`, and :attr:`pycuda.driver.Function.smem` have been deprecated in favor of the mechanism above. See :attr:`pycuda.driver.Function.num_regs` for more. * the three-argument forms (i.e. with streams) of :func:`pycuda.driver.memcpy_dtoh` and :func:`pycuda.driver.memcpy_htod`. Use :func:`pycuda.driver.memcpy_dtoh_async` and :func:`pycuda.driver.memcpy_htod_async` instead. * :class:`pycuda.driver.SourceModule`. * Add :func:`pycuda.tools.context_dependent_memoize`, use it for context-dependent caching of PyCUDA's canned kernels. * Add :func:`pycuda.tools.mark_cuda_test`. * Add attributes of :exc:`pycuda.driver.CompileError`. (requested by Dan Lepage) * Add preliminary support for complex numbers. (initial discussion with Daniel Fan) * Add :attr:`pycuda.gpuarray.GPUArray.real`, :attr:`pycuda.gpuarray.GPUArray.imag`, :meth:`pycuda.gpuarray.GPUArray.conj`. * Add :class:`pycuda.driver.PointerHolderBase`. Version 0.93 ------------ .. warning:: Version 0.93 makes some changes to the PyCUDA programming interface. In all cases where documented features were changed, the old usage continues to work, but results in a warning. It is recommended that you update your code to remove the warning. * OpenGL interoperability in :mod:`pycuda.gl`. * Document :meth:`pycuda.gpuarray.GPUArray.__len__`. Change its definition to match :mod:`numpy`. * Add :meth:`pycuda.gpuarray.GPUArray.bind_to_texref_ext`. * Let :class:`pycuda.gpuarray.GPUArray` operators deal with generic data types, including type promotion. * Add :func:`pycuda.gpuarray.take`. * Fix thread handling by making internal context stack thread-local. * Add :class:`pycuda.reduction.ReductionKernel`. * Add :func:`pycuda.gpuarray.sum`, :func:`pycuda.gpuarray.dot`, :func:`pycuda.gpuarray.subset_dot`. * Synchronous and asynchronous memory transfers are now separate from each other, the latter having an ``_async`` suffix. The now-synchronous forms still take a :class:`pycuda.driver.Stream` argument, but this practice is deprecated and prints a warning. * :class:`pycuda.gpuarray.GPUArray` no longer has an associated :class:`pycuda.driver.Stream`. Asynchronous GPUArray transfers are now separate from synchronous ones and have an ``_async`` suffix. * Support for features added in CUDA 2.2. * :class:`pycuda.driver.SourceModule` has been moved to :class:`pycuda.compiler.SourceModule`. It is still available by the old name, but will print a warning about the impending deprecation. * :meth:`pycuda.driver.Device.get_attribute` with a :class:`pycuda.driver.device_attribute` `attr` can now be spelled `dev.attr`, with no further namespace detours. 
(Suggested by Ian Cullinan) Likewise for :meth:`pycuda.driver.Function.get_attribute` * :attr:`pycuda.driver.Function.registers`, :attr:`pycuda.driver.Function.lmem`, and :attr:`pycuda.driver.Function.smem` have been deprecated in favor of the mechanism above. See :attr:`pycuda.driver.Function.num_regs` for more. * Add PyCUDA version query mechanism, see :data:`pycuda.VERSION`. Version 0.92 ------------ .. note:: If you're upgrading from prior versions, you may delete the directory :file:`$HOME/.pycuda-compiler-cache` to recover now-unused disk space. .. note:: During this release time frame, I had the honor of giving a talk on PyCUDA for a `class `_ that a group around Nicolas Pinto was teaching at MIT. If you're interested, the slides for it are `available `_. * Make :class:`pycuda.tools.DeviceMemoryPool` official functionality, after numerous improvements. Add :class:`pycuda.tools.PageLockedMemoryPool` for pagelocked memory, too. * Properly deal with automatic cleanup in the face of several contexts. * Fix compilation on Python 2.4. * Fix 3D arrays. (Nicolas Pinto) * Improve error message when :command:`nvcc` is not found. * Automatically run Python GC before throwing out-of-memory errors. * Allow explicit release of memory using :meth:`pycuda.driver.DeviceAllocation.free`, :meth:`pycuda.driver.HostAllocation.free`, :meth:`pycuda.driver.Array.free`, :meth:`pycuda.tools.PooledDeviceAllocation.free`, :meth:`pycuda.tools.PooledHostAllocation.free`. * Make configure switch ``./configure.py --cuda-trace`` to enable API tracing. * Add a documentation chapter and examples on :ref:`metaprog`. * Add :func:`pycuda.gpuarray.empty_like` and :func:`pycuda.gpuarray.zeros_like`. * Add and document :attr:`pycuda.gpuarray.GPUArray.mem_size` in anticipation of stride/pitch support in :class:`pycuda.gpuarray.GPUArray`. * Merge Jozef Vesely's MD5-based RNG. * Document :func:`pycuda.driver.from_device` and :func:`pycuda.driver.from_device_like`. * Add :class:`pycuda.elementwise.ElementwiseKernel`. * Various documentation improvements. (many of them from Nicholas Tung) * Move PyCUDA's compiler cache to the system temporary directory, rather than the users home directory. Version 0.91 ------------ * Add support for compiling on CUDA 1.1. Added version query :func:`pycuda.driver.get_version`. Updated documentation to show 2.0-only functionality. * Support for Windows and MacOS X, in addition to Linux. (Gert Wohlgemuth, Cosmin Stejerean, Znah on the Nvidia forums, and David Gadling) * Support more arithmetic operators on :class:`pycuda.gpuarray.GPUArray`. (Gert Wohlgemuth) * Add :func:`pycuda.gpuarray.arange`. (Gert Wohlgemuth) * Add :mod:`pycuda.curandom`. (Gert Wohlgemuth) * Add :mod:`pycuda.cumath`. (Gert Wohlgemuth) * Add :mod:`pycuda.autoinit`. * Add :mod:`pycuda.tools`. * Add :class:`pycuda.tools.DeviceData` and :class:`pycuda.tools.OccupancyRecord`. * :class:`pycuda.gpuarray.GPUArray` parallelizes properly on GTX200-generation devices. * Make :class:`pycuda.driver.Function` resource usage available to the program. (See, e.g. :attr:`pycuda.driver.Function.registers`.) * Cache kernels compiled by :class:`pycuda.compiler.SourceModule`. (Tom Annau) * Allow for faster, prepared kernel invocation. See :meth:`pycuda.driver.Function.prepare`. * Added memory pools, at :class:`pycuda.tools.DeviceMemoryPool` as experimental, undocumented functionality. For some workloads, this can cure the slowness of :func:`pycuda.driver.mem_alloc`. * Fix the :ref:`memset ` family of functions. * Improve :ref:`errors`. 
* Add `order` parameter to :func:`pycuda.driver.matrix_to_array` and :func:`pycuda.driver.make_multichannel_2d_array`. Acknowledgments ================ * Gert Wohlgemuth ported PyCUDA to MacOS X and contributed large parts of :class:`pycuda.gpuarray.GPUArray`. * Alexander Mordvintsev contributed fixes for Windows XP. * Cosmin Stejerean provided multiple patches for PyCUDA's build system. * Tom Annau contributed an alternative SourceModule compiler cache as well as Windows build insight. * Nicholas Tung improved PyCUDA's documentation. * Jozef Vesely contributed a massively improved random number generator derived from the RSA Data Security, Inc. MD5 Message Digest Algorithm. * Chris Heuser provided a test cases for multi-threaded PyCUDA. * The reduction templating is based on code by Mark Harris at Nvidia. * Andrew Wagner provided a test case and contributed the port of the convolution example. The original convolution code is based on an example provided by Nvidia. * Hendrik Riedmann contributed the matrix transpose and list selection examples. * Peter Berrington contributed a working example for CUDA-OpenGL interoperability. * Maarten Breddels provided a patch for 'flat-egg' support. * Nicolas Pinto refactored :mod:`pycuda.autoinit` for automatic device finding. * Ian Ozsvald and Fabrizio Milo provided patches. * Min Ragan-Kelley solved the long-standing puzzle of why PyCUDA did not work on 64-bit CUDA on OS X (and provided a patch). * Tomasz Rybak solved another long-standing puzzle of why reduction failed to work on some Fermi chips. In addition, he provided a patch that updated PyCUDA's :ref:`gl-interop` to the state of CUDA 3.0. * Martin Bergtholdt of Philips Research provided a patch that made PyCUDA work on 64-bit Windows 7. Licensing ========= PyCUDA is licensed to you under the MIT/X Consortium license: Copyright (c) 2009,10 Andreas Klöckner and Contributors. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. PyCUDA includes derivatives of parts of the `Thrust `_ computing package (in particular the scan implementation). These parts are licensed as follows: Copyright 2008-2011 NVIDIA Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. .. note:: If you use Apache-licensed parts, be aware that these may be incompatible with software licensed exclusively under GPL2. (Most software is licensed as GPL2 or later, in which case this is not an issue.) Frequently Asked Questions ========================== The FAQ is now maintained collaboratively in the `PyCUDA Wiki `_. Citing PyCUDA =============== We are not asking you to gratuitously cite PyCUDA in work that is otherwise unrelated to software. That said, if you do discuss some of the development aspects of your code and would like to highlight a few of the ideas behind PyCUDA, feel free to cite this article: Andreas Klöckner, Nicolas Pinto, Yunsup Lee, Bryan Catanzaro, Paul Ivanov, Ahmed Fasih, PyCUDA and PyOpenCL: A scripting-based approach to GPU run-time code generation, Parallel Computing, Volume 38, Issue 3, March 2012, Pages 157-174. Here's a Bibtex entry for your convenience:: @article{kloeckner_pycuda_2012, author = {{Kl{\"o}ckner}, Andreas and {Pinto}, Nicolas and {Lee}, Yunsup and {Catanzaro}, B. and {Ivanov}, Paul and {Fasih}, Ahmed }, title = "{PyCUDA and PyOpenCL: A Scripting-Based Approach to GPU Run-Time Code Generation}", journal = "Parallel Computing", volume = "38", number = "3", pages = "157--174", year = "2012", issn = "0167-8191", doi = "10.1016/j.parco.2011.09.001", } pycuda-2013.1.1+git20140310/doc/source/_static/0002755000175000000500000000000012313360364017024 5ustar tomussrcpycuda-2013.1.1+git20140310/doc/source/_static/akdoc.css0000644000175000000500000000125112313360364020614 0ustar tomussrcpre { line-height: 110%; } .footer { background-color: #eee; } body > div.container { margin-top:10px; } dd { margin-left: 40px; } tt.descname { font-size: 100%; } code { color: rgb(51,51,51); } h1 { padding-bottom:7px; border-bottom: 1px solid #ccc; } h2 { padding-bottom:5px; border-bottom: 1px solid #ccc; } h3 { padding-bottom:5px; border-bottom: 1px solid #ccc; } .rubric { font-size: 120%; padding-bottom:1px; border-bottom: 1px solid #ccc; } .headerlink { padding-left: 1ex; padding-right: 1ex; } a.headerlink:hover { text-decoration: none; } blockquote p { font-size: 100%; font-weight: normal; line-height: normal; }; pycuda-2013.1.1+git20140310/doc/source/install.rst0000644000175000000500000000025512313360364017576 0ustar tomussrc.. highlight:: sh Installation ============ Installation information is now maintained collaboratively in the `PyCUDA Wiki `_. pycuda-2013.1.1+git20140310/doc/source/util.rst0000644000175000000500000002166712313360364017117 0ustar tomussrcBuilt-in Utilities ================== Automatic Initialization ------------------------ .. module:: pycuda.autoinit The module :mod:`pycuda.autoinit`, when imported, automatically performs all the steps necessary to get CUDA ready for submission of compute kernels. It uses :func:`pycuda.tools.make_default_context` to create a compute context. .. data:: device An instance of :class:`pycuda.driver.Device` that was used for automatic initialization. .. data:: context A default-constructed instance of :class:`pycuda.driver.Context` on :data:`device`. This context is created by calling :func:`pycuda.tools.make_default_context`. Choice of Device ---------------- .. module:: pycuda.tools .. 
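note::

    For illustration (this snippet is not part of the original documentation),
    manual initialization with :func:`make_default_context` mirrors what
    :mod:`pycuda.autoinit` does::

        import pycuda.driver as cuda
        from pycuda.tools import make_default_context

        cuda.init()                       # must precede all other CUDA calls
        context = make_default_context()  # picks a device by the rules below
        device = context.get_device()
        # ... compile and launch kernels here ...
        context.pop()                     # deactivate the context when done

..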
function:: make_default_context() Return a :class:`pycuda.driver.Context` instance chosen according to the following rules: * If the environment variable :envvar:`CUDA_DEVICE` is set, its integer value is used as the device number. * If the file :file:`.cuda-device` is present in the user's home directory, the integer value of its contents is used as the device number. * Otherwise, all available CUDA devices are tried in a round-robin fashion. An error is raised if this does not lead to a usable context. .. versionadded: 0.94 .. function:: get_default_device(default=0) Deprecated. Use :func:`make_default_context`. Return a :class:`pycuda.driver.Device` instance chosen according to the following rules: * If the environment variable :envvar:`CUDA_DEVICE` is set, its integer value is used as the device number. * If the file :file:`.cuda-device` is present in the user's home directory, the integer value of its contents is used as the device number. * Otherwise, `default` is used as the device number. .. versionchanged: 0.94 Deprecated. Kernel Caching -------------- .. function:: context_dependent_memoize(func) This decorator caches the result of the decorated function, *if* a subsequent occurs in the same :class:`pycuda.driver.Context`. This is useful for caching of kernels. .. function:: clear_context_caches() Empties all context-dependent memoization caches. Also releases all held reference contexts. If it is important to you that the program detaches from its context, you might need to call this function to free all remaining references to your context. Testing ------- .. function:: mark_cuda_test(func) This function, meant for use with :mod:`py.test`, will mark *func* with a "cuda" tag and make sure it has a CUDA context available when invoked. Device Metadata and Occupancy ----------------------------- .. class:: DeviceData(dev=None) Gives access to more information on a device than is available through :meth:`pycuda.driver.Device.get_attribute`. If `dev` is `None`, it defaults to the device returned by :meth:`pycuda.driver.Context.get_device`. .. attribute:: max_threads .. attribute:: warp_size .. attribute:: warps_per_mp .. attribute:: thread_blocks_per_mp .. attribute:: registers .. attribute:: shared_memory .. attribute:: smem_granularity The number of threads that participate in banked, simultaneous access to shared memory. .. attribute:: smem_alloc_granularity The size of the smallest possible (non-empty) shared memory allocation. .. method:: align_bytes(word_size=4) The distance between global memory base addresses that allow accesses of word-size `word_size` bytes to get coalesced. .. method:: align(bytes, word_size=4) Round up `bytes` to the next alignment boundary as given by :meth:`align_bytes`. .. method:: align_words(word_size) Return `self.align_bytes(word_size)/word_size`, while checking that the division did not yield a remainder. .. method:: align_dtype(elements, dtype_size) Round up `elements` to the next alignment boundary as given by :meth:`align_bytes`, where each element is assumed to be `dtype_size` bytes large. .. UNDOC coalesce .. staticmethod:: make_valid_tex_channel_count(size) Round up `size` to a valid texture channel count. .. class:: OccupancyRecord(devdata, threads, shared_mem=0, registers=0) Calculate occupancy for a given kernel workload characterized by * thread count of `threads` * shared memory use of `shared_mem` bytes * register use of `registers` 32-bit registers .. attribute:: tb_per_mp How many thread blocks execute on each multiprocessor. .. 
attribute:: limited_by What :attr:`tb_per_mp` is limited by. One of `"device"`, `"warps"`, `"regs"`, `"smem"`. .. attribute:: warps_per_mp How many warps execute on each multiprocessor. .. attribute:: occupancy A `float` value between 0 and 1 indicating how much of each multiprocessor's scheduling capability is occupied by the kernel. .. _mempool: Memory Pools ------------ The functions :func:`pycuda.driver.mem_alloc` and :func:`pycuda.driver.pagelocked_empty` can consume a fairly large amount of processing time if they are invoked very frequently. For example, code based on :class:`pycuda.gpuarray.GPUArray` can easily run into this issue because a fresh memory area is allocated for each intermediate result. Memory pools are a remedy for this problem based on the observation that often many of the block allocations are of the same sizes as previously used ones. Then, instead of fully returning the memory to the system and incurring the associated reallocation overhead, the pool holds on to the memory and uses it to satisfy future allocations of similarly-sized blocks. The pool reacts appropriately to out-of-memory conditions as long as all memory allocations are made through it. Allocations performed from outside of the pool may run into spurious out-of-memory conditions due to the pool owning much or all of the available memory. Device-based Memory Pool ^^^^^^^^^^^^^^^^^^^^^^^^ .. class:: PooledDeviceAllocation An object representing a :class:`DeviceMemoryPool`-based allocation of linear device memory. Once this object is deleted, its associated device memory is freed. :class:`PooledDeviceAllocation` instances can be cast to :class:`int` (and :class:`long`), yielding the starting address of the device memory allocated. .. method:: free Explicitly return the memory held by *self* to the associated memory pool. .. method:: __len__ Return the size of the allocated memory in bytes. .. class:: DeviceMemoryPool A memory pool for linear device memory as allocated using :func:`pycuda.driver.mem_alloc`. (see :ref:`mempool`) .. attribute:: held_blocks The number of unused blocks being held by this pool. .. attribute:: active_blocks The number of blocks in active use that have been allocated through this pool. .. method:: allocate(size) Return a :class:`PooledDeviceAllocation` of *size* bytes. .. method:: free_held Free all unused memory that the pool is currently holding. .. method:: stop_holding Instruct the memory to start immediately freeing memory returned to it, instead of holding it for future allocations. Implicitly calls :meth:`free_held`. This is useful as a cleanup action when a memory pool falls out of use. Memory Pool for pagelocked memory ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. class:: PooledHostAllocation An object representing a :class:`PageLockedMemoryPool`-based allocation of linear device memory. Once this object is deleted, its associated device memory is freed. .. method:: free Explicitly return the memory held by *self* to the associated memory pool. .. method:: __len__ Return the size of the allocated memory in bytes. .. class:: PageLockedAllocator(flags=0) Specifies the set of :class:`pycuda.driver.host_alloc_flags` used in its associated :class:`PageLockedMemoryPool`. .. class:: PageLockedMemoryPool(allocator=PageLockedAllocator()) A memory pool for pagelocked host memory as allocated using :func:`pycuda.driver.pagelocked_empty`. (see :ref:`mempool`) .. attribute:: held_blocks The number of unused blocks being held by this pool. .. 
attribute:: active_blocks The number of blocks in active use that have been allocated through this pool. .. method:: allocate(shape, dtype, order="C") Return an uninitialized ("empty") :class:`numpy.ndarray` with the given *shape*, *dtype*, and *order*. This array will be backed by a :class:`PooledHostAllocation`, which can be found as the ``.base`` attribute of the array. .. method:: free_held Free all unused memory that the pool is currently holding. .. method:: stop_holding Instruct the memory to start immediately freeing memory returned to it, instead of holding it for future allocations. Implicitly calls :meth:`free_held`. This is useful as a cleanup action when a memory pool falls out of use. pycuda-2013.1.1+git20140310/doc/source/conf.py0000644000175000000500000001401712313360364016676 0ustar tomussrc# -*- coding: utf-8 -*- # # PyCUDA documentation build configuration file, created by # sphinx-quickstart on Fri Jun 13 00:51:19 2008. # # This file is execfile()d with the current directory set to its containing dir. # # The contents of this file are pickled, so don't put values in the namespace # that aren't pickleable (module imports are okay, they're removed automatically). # # All configuration values have a default value; values that are commented out # serve to show the default value. #import sys, os # If your extensions are in another directory, add it here. If the directory # is relative to the documentation root, use os.path.abspath to make it # absolute, like shown here. #sys.path.append(os.path.abspath('some/directory')) # General configuration # --------------------- # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = ['sphinx.ext.intersphinx'] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix of source filenames. source_suffix = '.rst' # The master toctree document. master_doc = 'index' # General substitutions. project = 'PyCUDA' copyright = '2008, Andreas Kloeckner' # The default replacements for |version| and |release|, also used in various # other places throughout the built documents. # # The short X.Y version. ver_dic = {} execfile("../../pycuda/__init__.py", ver_dic) version = ".".join(str(x) for x in ver_dic["VERSION"]) # The full version, including alpha/beta/rc tags. release = ver_dic["VERSION_TEXT"] # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: #today = '' # Else, today_fmt is used as the format for a strftime call. today_fmt = '%B %d, %Y' # List of documents that shouldn't be included in the build. #unused_docs = [] # List of directories, relative to source directories, that shouldn't be searched # for source files. #exclude_dirs = [] # If true, '()' will be appended to :func: etc. cross-reference text. #add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). #add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. #show_authors = False # The name of the Pygments (syntax highlighting) style to use. 
pygments_style = 'sphinx' # Options for HTML output # ----------------------- try: import sphinx_bootstrap_theme except: from warnings import warn warn("I would like to use the sphinx bootstrap theme, but can't find it.\n" "'pip install sphinx_bootstrap_theme' to fix.") else: # Activate the theme. html_theme = 'bootstrap' html_theme_path = sphinx_bootstrap_theme.get_html_theme_path() # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. html_theme_options = { "navbar_fixed_top": "true", "navbar_site_name": "Contents", 'bootstrap_version': '3', 'source_link_position': 'footer', } # The style sheet to use for HTML and HTML Help pages. A file of that name # must exist either in Sphinx' static/ path, or in one of the custom paths # given in html_static_path. #html_style = 'default.css' # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". #html_title = None # The name of an image file (within the static path) to place at the top of # the sidebar. #html_logo = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. #html_use_smartypants = True # Custom sidebar templates, maps document names to template names. #html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. #html_additional_pages = {} # If false, no module index is generated. #html_use_modindex = True # If true, the reST sources are included in the HTML build as _sources/. #html_copy_source = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. #html_use_opensearch = '' # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). #html_file_suffix = '' # Output file base name for HTML help builder. htmlhelp_basename = 'PyCudadoc' # Options for LaTeX output # ------------------------ # The paper size ('letter' or 'a4'). #latex_paper_size = 'letter' # The font size ('10pt', '11pt' or '12pt'). #latex_font_size = '10pt' # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, document class [howto/manual]). latex_documents = [ ('index', 'pycdua.tex', 'PyCUDA Documentation', 'Andreas Kloeckner', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of # the title page. #latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. #latex_use_parts = False # Additional stuff for the LaTeX preamble. #latex_preamble = '' # Documents to append as an appendix to all manuals. #latex_appendices = [] # If false, no module index is generated. 
#latex_use_modindex = True intersphinx_mapping = { 'http://docs.python.org/dev': None, 'http://docs.scipy.org/doc/numpy/': None, 'http://documen.tician.de/codepy/': None, } pycuda-2013.1.1+git20140310/doc/source/driver.rst0000644000175000000500000015727012313360364017435 0ustar tomussrc.. _reference-doc: Device Interface ================ .. module:: pycuda .. moduleauthor:: Andreas Kloeckner Version Queries --------------- .. data:: VERSION Gives the numeric version of PyCUDA as a variable-length tuple of integers. Enables easy version checks such as *VERSION >= (0, 93)*. Added in PyCUDA 0.93. .. data:: VERSION_STATUS A text string such as `"rc4"` or `"beta"` qualifying the status of the release. .. versionadded:: 0.93 .. data:: VERSION_TEXT The full release name (such as `"0.93rc4"`) in string form. .. versionadded:: 0.93 .. module:: pycuda.driver :synopsis: Use CUDA devices from Python .. _errors: Error Reporting --------------- .. exception:: Error Base class of all PyCuda errors. .. exception:: CompileError Thrown when :class:`pycuda.compiler.SourceModule` compilation fails. .. attribute:: msg .. versionadded:: 0.94 .. attribute:: stdout .. versionadded:: 0.94 .. attribute:: stderr .. versionadded:: 0.94 .. attribute:: command_line .. versionadded:: 0.94 .. exception:: MemoryError Thrown when :func:`mem_alloc` or related functionality fails. .. exception:: LogicError Thrown when PyCuda was confronted with a situation where it is likely that the programmer has made a mistake. :exc:`LogicErrors` do not depend on outer circumstances defined by the run-time environment. Example: CUDA was used before it was initialized. .. exception:: LaunchError Thrown when kernel invocation has failed. (Note that this will often be reported by the next call after the actual kernel invocation.) .. exception:: RuntimeError Thrown when a unforeseen run-time failure is encountered that is not likely due to programmer error. Example: A file was not found. Constants --------- .. class:: ctx_flags Flags for :meth:`Device.make_context`. CUDA 2.0 and above only. .. attribute:: SCHED_AUTO If there are more contexts than processors, yield, otherwise spin while waiting for CUDA calls to complete. .. attribute:: SCHED_SPIN Spin while waiting for CUDA calls to complete. .. attribute:: SCHED_YIELD Yield to other threads while waiting for CUDA calls to complete. .. attribute:: SCHED_MASK Mask of valid scheduling flags in this bitfield. .. attribute:: SCHED_BLOCKING_SYNC Use blocking synchronization. CUDA 2.2 and newer. .. attribute:: MAP_HOST Support mapped pinned allocations. CUDA 2.2 and newer. .. attribute:: LMEM_RESIZE_TO_MAX Keep local memory allocation after launch. CUDA 3.2 and newer. Rumored to decrease Fermi launch overhead? .. versionadded:: 2011.1 .. attribute:: FLAGS_MASK Mask of valid flags in this bitfield. .. class:: event_flags Flags for :class:`Event`. CUDA 2.2 and newer. .. attribute:: DEFAULT .. attribute:: BLOCKING_SYNC .. attribute:: DISABLE_TIMING CUDA 3.2 and newer. .. versionadded:: 0.94 .. attribute:: INTERPROCESS CUDA 4.1 and newer. .. versionadded:: 2011.2 .. class:: device_attribute .. attribute:: MAX_THREADS_PER_BLOCK .. attribute:: MAX_BLOCK_DIM_X .. attribute:: MAX_BLOCK_DIM_Y .. attribute:: MAX_BLOCK_DIM_Z .. attribute:: MAX_GRID_DIM_X .. attribute:: MAX_GRID_DIM_Y .. attribute:: MAX_GRID_DIM_Z .. attribute:: TOTAL_CONSTANT_MEMORY .. attribute:: WARP_SIZE .. attribute:: MAX_PITCH .. attribute:: CLOCK_RATE .. attribute:: TEXTURE_ALIGNMENT .. attribute:: GPU_OVERLAP .. 
attribute:: MULTIPROCESSOR_COUNT CUDA 2.0 and above only. .. attribute:: SHARED_MEMORY_PER_BLOCK Deprecated as of CUDA 2.0. See below for replacement. .. attribute:: MAX_SHARED_MEMORY_PER_BLOCK CUDA 2.0 and above only. .. attribute:: REGISTERS_PER_BLOCK Deprecated as of CUDA 2.0. See below for replacement. .. attribute:: MAX_REGISTERS_PER_BLOCK CUDA 2.0 and above. .. attribute:: KERNEL_EXEC_TIMEOUT CUDA 2.2 and above. .. attribute:: INTEGRATED CUDA 2.2 and above. .. attribute:: CAN_MAP_HOST_MEMORY CUDA 2.2 and above. .. attribute:: COMPUTE_MODE CUDA 2.2 and above. See :class:`compute_mode`. .. attribute:: MAXIMUM_TEXTURE1D_WIDTH MAXIMUM_TEXTURE2D_WIDTH MAXIMUM_TEXTURE2D_HEIGHT MAXIMUM_TEXTURE3D_WIDTH MAXIMUM_TEXTURE3D_HEIGHT MAXIMUM_TEXTURE3D_DEPTH MAXIMUM_TEXTURE2D_ARRAY_WIDTH MAXIMUM_TEXTURE2D_ARRAY_HEIGHT MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES CUDA 3.0 and above. .. versionadded:: 0.94 .. attribute:: MAXIMUM_TEXTURE2D_LAYERED_WIDTH MAXIMUM_TEXTURE2D_LAYERED_HEIGHT MAXIMUM_TEXTURE2D_LAYERED_LAYERS MAXIMUM_TEXTURE1D_LAYERED_WIDTH MAXIMUM_TEXTURE1D_LAYERED_LAYERS CUDA 4.0 and above. .. versionadded:: 2011.1 .. attribute:: SURFACE_ALIGNMENT CUDA 3.0 (post-beta) and above. .. versionadded:: 0.94 .. attribute:: CONCURRENT_KERNELS CUDA 3.0 (post-beta) and above. .. versionadded:: 0.94 .. attribute:: ECC_ENABLED CUDA 3.0 (post-beta) and above. .. versionadded:: 0.94 .. attribute:: PCI_BUS_ID CUDA 3.2 and above. .. versionadded:: 0.94 .. attribute:: PCI_DEVICE_ID CUDA 3.2 and above. .. versionadded:: 0.94 .. attribute:: TCC_DRIVER CUDA 3.2 and above. .. versionadded:: 0.94 .. attribute:: MEMORY_CLOCK_RATE GLOBAL_MEMORY_BUS_WIDTH L2_CACHE_SIZE MAX_THREADS_PER_MULTIPROCESSOR ASYNC_ENGINE_COUNT UNIFIED_ADDRESSING CUDA 4.0 and above. .. versionadded:: 2011.1 .. attribute :: MAXIMUM_TEXTURE2D_GATHER_WIDTH MAXIMUM_TEXTURE2D_GATHER_HEIGHT MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE PCI_DOMAIN_ID TEXTURE_PITCH_ALIGNMENT MAXIMUM_TEXTURECUBEMAP_WIDTH MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS MAXIMUM_SURFACE1D_WIDTH MAXIMUM_SURFACE2D_WIDTH MAXIMUM_SURFACE2D_HEIGHT MAXIMUM_SURFACE3D_WIDTH MAXIMUM_SURFACE3D_HEIGHT MAXIMUM_SURFACE3D_DEPTH MAXIMUM_SURFACE1D_LAYERED_WIDTH MAXIMUM_SURFACE1D_LAYERED_LAYERS MAXIMUM_SURFACE2D_LAYERED_WIDTH MAXIMUM_SURFACE2D_LAYERED_HEIGHT MAXIMUM_SURFACE2D_LAYERED_LAYERS MAXIMUM_SURFACECUBEMAP_WIDTH MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS MAXIMUM_TEXTURE1D_LINEAR_WIDTH MAXIMUM_TEXTURE2D_LINEAR_WIDTH MAXIMUM_TEXTURE2D_LINEAR_HEIGHT MAXIMUM_TEXTURE2D_LINEAR_PITCH CUDA 4.1 and above. .. versionadded:: 2011.2 .. attribute :: MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT COMPUTE_CAPABILITY_MAJOR COMPUTE_CAPABILITY_MINOR MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH CUDA 5.0 and above. .. versionadded:: 2014.1 .. attribute :: STREAM_PRIORITIES_SUPPORTED CUDA 5.5 and above. .. versionadded:: 2014.1 .. attribute :: GLOBAL_L1_CACHE_SUPPORTED LOCAL_L1_CACHE_SUPPORTED MAX_SHARED_MEMORY_PER_MULTIPROCESSOR MAX_REGISTERS_PER_MULTIPROCESSOR MANAGED_MEMORY MULTI_GPU_BOARD MULTI_GPU_BOARD_GROUP_ID CUDA 6.0 and above. .. versionadded:: 2014.1 .. class:: pointer_attribute .. attribute:: CONTEXT MEMORY_TYPE DEVICE_POINTER HOST_POINTER CUDA 4.0 and above. .. versionadded:: 2011.1 .. class:: profiler_output_mode .. attribute:: KEY_VALUE_PAIR CSV CUDA 4.0 and above. .. versionadded:: 2011.1 .. class:: function_attribute Flags for :meth:`Function.get_attribute`. 
CUDA 2.2 and newer. .. attribute:: MAX_THREADS_PER_BLOCK .. attribute:: SHARED_SIZE_BYTES .. attribute:: CONST_SIZE_BYTES .. attribute:: LOCAL_SIZE_BYTES .. attribute:: NUM_REGS .. attribute:: PTX_VERSION CUDA 3.0 (post-beta) and above. .. versionadded:: 0.94 .. attribute:: BINARY_VERSION CUDA 3.0 (post-beta) and above. .. versionadded:: 0.94 .. attribute:: MAX .. class:: func_cache See :meth:`Function.set_cache_config`. CUDA 3.0 (post-beta) and above. .. versionadded:: 0.94 .. attribute:: PREFER_NONE .. attribute:: PREFER_SHARED .. attribute:: PREFER_L1 .. attribute:: PREFER_EQUAL CUDA 4.1 and above. .. versionadded:: 2011.2 .. class:: shared_config See :meth:`Function.set_shared_config`. CUDA 4.2 and above. .. attribute:: DEFAULT_BANK_SIZE .. attribute:: FOUR_BYTE_BANK_SIZE .. attribute:: EIGHT_BYTE_BANK_SIZE .. class:: array_format .. attribute:: UNSIGNED_INT8 .. attribute:: UNSIGNED_INT16 .. attribute:: UNSIGNED_INT32 .. attribute:: SIGNED_INT8 .. attribute:: SIGNED_INT16 .. attribute:: SIGNED_INT32 .. attribute:: HALF .. attribute:: FLOAT .. class:: array3d_flags .. attribute :: 2DARRAY CUDA 3.0 and above. Deprecated--use :attr:`LAYERED`. .. versionadded:: 0.94 .. attribute :: LAYERED CUDA 4.0 and above. .. versionadded:: 2011.1 .. attribute :: SURFACE_LDST CUDA 3.1 and above. .. versionadded:: 0.94 .. attribute :: CUBEMAP TEXTURE_GATHER CUDA 4.1 and above. .. versionadded:: 2011.2 .. class:: address_mode .. attribute:: WRAP .. attribute:: CLAMP .. attribute:: MIRROR .. attribute:: BORDER CUDA 3.2 and above. .. versionadded:: 0.94 .. class:: filter_mode .. attribute:: POINT .. attribute:: LINEAR .. class:: memory_type .. attribute:: HOST .. attribute:: DEVICE .. attribute:: ARRAY .. class:: compute_mode CUDA 2.2 and newer. .. attribute:: DEFAULT .. attribute:: EXCLUSIVE .. attribute:: PROHIBITED .. attribute:: EXCLUSIVE_PROCESS CUDA 4.0 and above. .. versionadded:: 2011.1 .. class:: jit_option CUDA 2.1 and newer. .. attribute:: MAX_REGISTERS .. attribute:: THREADS_PER_BLOCK .. attribute:: WALL_TIME .. attribute:: INFO_LOG_BUFFER .. attribute:: INFO_LOG_BUFFER_SIZE_BYTES .. attribute:: ERROR_LOG_BUFFER .. attribute:: ERROR_LOG_BUFFER_SIZE_BYTES .. attribute:: OPTIMIZATION_LEVEL .. attribute:: TARGET_FROM_CUCONTEXT .. attribute:: TARGET .. attribute:: FALLBACK_STRATEGY .. class:: jit_target CUDA 2.1 and newer. .. attribute:: COMPUTE_10 .. attribute:: COMPUTE_11 .. attribute:: COMPUTE_12 .. attribute:: COMPUTE_13 .. attribute:: COMPUTE_20 CUDA 3.0 and above. .. versionadded:: 0.94 .. attribute:: COMPUTE_21 CUDA 3.2 and above. .. versionadded:: 0.94 .. class:: jit_fallback CUDA 2.1 and newer. .. attribute:: PREFER_PTX .. attribute:: PREFER_BINARY .. class:: host_alloc_flags Flags to be used to allocate :ref:`pagelocked_memory`. .. attribute:: PORTABLE .. attribute:: DEVICEMAP .. attribute:: WRITECOMBINED .. class:: mem_attach_flags Flags to be used to allocate :ref:`managed_memory`. ..versionadded:: 2014.1 .. attribute:: GLOBAL .. attribute:: HOST .. attribute:: SINGLE .. class:: mem_host_register_flags .. attribute:: PORTABLE .. attribute:: DEVICEMAP CUDA 4.0 and newer. .. versionadded:: 2011.1 .. class:: limit Limit values for :meth:`Context.get_limit` and :meth:`Context.set_limit`. CUDA 3.1 and newer. .. versionadded:: 0.94 .. attribute:: STACK_SIZE .. attribute:: PRINTF_FIFO_SIZE .. attribute:: MALLOC_HEAP_SIE CUDA 3.2 and above. .. class:: ipc_mem_flags .. attribute:: LAZY_ENABLE_PEER_ACCESS Graphics-related constants ^^^^^^^^^^^^^^^^^^^^^^^^^^ .. class:: graphics_register_flags .. 
versionadded:: 2011.1 CUDA 4.0 and above. .. attribute:: NONE READ_ONLY WRITE_DISCARD SURFACE_LDST .. attribute :: TEXTURE_GATHER CUDA 4.1 and above. .. versionadded:: 2011.2 .. class:: array_cubemap_face .. attribute:: POSITIVE_X NEGATIVE_X POSITIVE_Y NEGATIVE_Y POSITIVE_Z NEGATIVE_Z CUDA 3.2 and above. .. versionadded:: 2011.1 Devices and Contexts -------------------- .. function:: get_version() Obtain the version of CUDA against which PyCuda was compiled. Returns a 3-tuple of integers as *(major, minor, revision)*. .. function:: get_driver_version() Obtain the version of the CUDA driver on top of which PyCUDA is running. Returns an integer version number. .. function:: init(flags=0) Initialize CUDA. .. warning:: This must be called before any other function in this module. See also :mod:`pycuda.autoinit`. .. class:: Device(number) Device(pci_bus_id) A handle to the *number*'th CUDA device. See also :mod:`pycuda.autoinit`. .. versionchanged:: 2011.2 The *pci_bus_id* version of the constructor is new in CUDA 4.1. .. staticmethod:: count() Return the number of CUDA devices found. .. method:: name() .. method:: pci_bus_id() CUDA 4.1 and newer. .. versionadded:: 2011.2 .. method:: compute_capability() Return a 2-tuple indicating the compute capability version of this device. .. method:: total_memory() Return the total amount of memory on the device in bytes. .. method:: get_attribute(attr) Return the (numeric) value of the attribute *attr*, which may be one of the :class:`device_attribute` values. All :class:`device_attribute` values may also be directly read as (lower-case) attributes on the :class:`Device` object itself, e.g. `dev.clock_rate`. .. method:: get_attributes() Return all device attributes in a :class:`dict`, with keys from :class:`device_attribute`. .. method:: make_context(flags=ctx_flags.SCHED_AUTO) Create a :class:`Context` on this device, with flags taken from the :class:`ctx_flags` values. Also make the newly-created context the current context. .. method:: can_access_peer(dev) CUDA 4.0 and newer. .. versionadded:: 2011.1 .. method:: __hash__() .. method:: __eq__() .. method:: __ne__() .. class:: Context An equivalent of a UNIX process on the compute device. Create instances of this class using :meth:`Device.make_context`. See also :mod:`pycuda.autoinit`. .. method:: detach() Decrease the reference count on this context. If the reference count hits zero, the context is deleted. .. method:: push() Make *self* the active context, pushing it on top of the context stack. CUDA 2.0 and above only. .. staticmethod:: pop() Remove any context from the top of the context stack, deactivating it. CUDA 2.0 and above only. .. staticmethod:: get_device() Return the device that the current context is working on. .. staticmethod:: synchronize() Wait for all activity in the current context to cease, then return. .. staticmethod:: set_limit(limit, value) See :class:`limit` for possible values of *limit*. CUDA 3.1 and above. .. versionadded:: 0.94 .. staticmethod:: get_limit(limit) See :class:`limit` for possible values of *limit*. CUDA 3.1 and above. .. versionadded:: 0.94 .. staticmethod:: set_cache_config(cc) See :class:`func_cache` for possible values of *cc*. CUDA 3.2 and above. .. versionadded:: 0.94 .. staticmethod:: get_cache_config() Return a value from :class:`func_cache`. CUDA 3.2 and above. .. versionadded:: 0.94 .. staticmethod:: set_shared_config(sc) See :class:`shared_config` for possible values of *sc*. CUDA 4.2 and above. .. versionadded:: 2013.1 .. 
staticmethod:: get_shared_config() Return a value from :class:`shared_config`. CUDA 4.2 and above. .. versionadded:: 2013.1 .. method:: get_api_version() Return an integer API version number. CUDA 3.2 and above. .. versionadded:: 0.94 .. method:: enable_peer_access(peer, flags=0) CUDA 4.0 and above. .. versionadded:: 2011.1 .. method:: disable_peer_access(peer, flags=0) CUDA 4.0 and above. .. versionadded:: 2011.1 Concurrency and Streams ----------------------- .. class:: Stream(flags=0) A handle for a queue of operations that will be carried out in order. .. method:: synchronize() Wait for all activity on this stream to cease, then return. .. method:: is_done() Return *True* iff all queued operations have completed. .. method:: wait_for_event(evt) Enqueues a wait for the given :class:`Event` instance. CUDA 3.2 and above. .. versionadded:: 2011.1 .. class:: Event(flags=0) An event is a temporal 'marker' in a :class:`Stream` that allows taking the time between two events--such as the time required to execute a kernel. An event's time is recorded when the :class:`Stream` has finished all tasks enqueued before the :meth:`record` call. See :class:`event_flags` for values for the *flags* parameter. .. method:: record(stream=None) Insert a recording point for *self* into the :class:`Stream` *stream*. Return *self*. .. method:: synchronize() Wait until the device execution stream reaches this event. Return *self*. .. method:: query() Return *True* if the device execution stream has reached this event. .. method:: time_since(event) Return the time in milliseconds that has passed between *self* and *event*. Use this method as `end.time_since(start)`. Note that this method will fail with an "invalid value" error if either of the events has not been reached yet. Use :meth:`synchronize` to ensure that the event has been reached. .. method:: time_till(event) Return the time in milliseconds that has passed between *event* and *self*. Use this method as `start.time_till(end)`. Note that this method will fail with an "invalid value" error if either of the events has not been reached yet. Use :meth:`synchronize` to ensure that the event has been reached. .. method:: ipc_handle() Return a :class:`bytes` object representing an IPC handle to this event. Requires Python 2.6 and CUDA 4.1. .. versionadded:: 2011.2 .. staticmethod:: from_ipc_handle(handle) Requires Python 2.6 and CUDA 4.1. .. versionadded:: 2011.2 Memory ------ Global Device Memory ^^^^^^^^^^^^^^^^^^^^ .. function:: mem_get_info() Return a tuple *(free, total)* indicating the free and total memory in the current context, in bytes. .. function:: mem_alloc(bytes) Return a :class:`DeviceAllocation` object representing a linear piece of device memory. .. function:: to_device(buffer) Allocate enough device memory for *buffer*, which adheres to the Python :class:`buffer` interface. Copy the contents of *buffer* onto the device. Return a :class:`DeviceAllocation` object representing the newly-allocated memory. .. function:: from_device(devptr, shape, dtype, order="C") Make a new :class:`numpy.ndarray` from the data at *devptr* on the GPU, interpreting them using *shape*, *dtype* and *order*. .. function:: from_device_like(devptr, other_ary) Make a new :class:`numpy.ndarray` from the data at *devptr* on the GPU, interpreting them as having the same shape, dtype and order as *other_ary*.
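For instance, the functions above can be combined with :class:`Event` objects to time a round trip through device memory. This is only a sketch; the array size and contents are arbitrary::

    import pycuda.autoinit  # creates a default context
    import pycuda.driver as cuda
    import numpy as np

    a = np.random.randn(1024, 1024).astype(np.float32)

    start = cuda.Event()
    end = cuda.Event()

    start.record()                              # mark the start in the default stream
    dev_ptr = cuda.to_device(a)                 # allocate and copy host -> device
    result = cuda.from_device_like(dev_ptr, a)  # copy device -> host
    end.record()
    end.synchronize()                           # wait until both copies are done

    print("Round trip took %g ms" % start.time_till(end))
    print("Free/total device memory: %d / %d bytes" % cuda.mem_get_info())
    assert np.array_equal(a, result)
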
.. function:: mem_alloc_pitch(width, height, access_size) Allocates a linear piece of device memory at least *width* bytes wide and *height* rows high that can be accessed using a data type of size *access_size* in a coalesced fashion. Returns a tuple *(dev_alloc, actual_pitch)* giving a :class:`DeviceAllocation` and the actual width of each row in bytes. .. class:: DeviceAllocation An object representing an allocation of linear device memory. Once this object is deleted, its associated device memory is freed. Objects of this type can be cast to :class:`int` to obtain a linear index into this :class:`Context`'s memory. .. method:: free() Release the held device memory now instead of when this object becomes unreachable. Any further use of the object is an error and will lead to undefined behavior. .. method:: as_buffer(size, offset=0) Return the pointer encapsulated by *self* as a Python buffer object, with the given *size* and, optionally, *offset*. .. versionadded:: 2014.1 .. function:: mem_get_ipc_handle(devptr) Return an opaque :class:`bytes` object representing an IPC handle to the device pointer *devptr*. .. versionadded:: 2011.2 Requires CUDA 4.1 and Python 2.6. .. class:: IPCMemoryHandle(ipc_handle, flags=ipc_mem_flags.LAZY_ENABLE_PEER_ACCESS) .. versionadded:: 2011.2 Requires CUDA 4.1 and Python 2.6. Objects of this type can be used in the same ways as a :class:`DeviceAllocation`. .. method:: close() .. class:: PointerHolderBase A base class that facilitates casting to pointers within PyCUDA. This allows the user to construct custom pointer types that may have been allocated by facilities outside of PyCUDA proper, but still need to be objects to facilitate RAII. The user needs to supply one method to facilitate the pointer cast: .. method:: get_pointer() Return the pointer encapsulated by *self*. .. method:: as_buffer(size, offset=0) Return the pointer encapsulated by *self* as a Python buffer object, with the given *size* and, optionally, *offset*. .. versionadded:: 2014.1 .. _pagelocked_memory : Pagelocked Host Memory ^^^^^^^^^^^^^^^^^^^^^^ Pagelocked Allocation ~~~~~~~~~~~~~~~~~~~~~ .. function:: pagelocked_empty(shape, dtype, order="C", mem_flags=0) Allocate a pagelocked :class:`numpy.ndarray` of *shape*, *dtype* and *order*. *mem_flags* may be one of the values in :class:`host_alloc_flags`. It may only be non-zero on CUDA 2.2 and newer. For the meaning of the other parameters, please refer to the :mod:`numpy` documentation. .. function:: pagelocked_zeros(shape, dtype, order="C", mem_flags=0) Like :func:`pagelocked_empty`, but initialized to zero. .. function:: pagelocked_empty_like(array, mem_flags=0) .. function:: pagelocked_zeros_like(array, mem_flags=0) The :class:`numpy.ndarray` instances returned by these functions have an attribute *base* that references an object of type .. class:: PagelockedHostAllocation Inherits from :class:`HostPointer`. An object representing an allocation of pagelocked host memory. Once this object is deleted, its associated host memory is freed. .. method:: free() Release the held memory now instead of when this object becomes unreachable. Any further use of the object (or its associated :mod:`numpy` array) is an error and will lead to undefined behavior. .. method:: get_flags() Return a bit field of values from :class:`host_alloc_flags`. Only available on CUDA 3.2 and newer. .. versionadded:: 0.94 .. class:: HostAllocation A deprecated name for :class:`PagelockedHostAllocation`. .. _aligned_host_memory : Aligned Host Memory ~~~~~~~~~~~~~~~~~~~ .. 
function:: aligned_empty(shape, dtype, order="C", alignment=4096) Allocate an :class:`numpy.ndarray` of *shape*, *dtype* and *order*, with data aligned to *alignment* bytes. For the meaning of the other parameters, please refer to the :mod:`numpy` documentation. .. versionadded:: 2011.1 .. function:: aligned_zeros(shape, dtype, order="C", alignment=4096) Like :func:`aligned_empty`, but with initialization to zero. .. versionadded:: 2011.1 .. function:: aligned_empty_like(array, alignment=4096) .. versionadded:: 2011.1 .. function:: aligned_zeros_like(array, alignment=4096) .. versionadded:: 2011.1 The :class:`numpy.ndarray` instances returned by these functions have an attribute *base* that references an object of type .. class:: AlignedHostAllocation Inherits from :class:`HostPointer`. An object representing an allocation of aligned host memory. .. method:: free() Release the held memory now instead of when this object becomes unreachable. Any further use of the object (or its associated :mod:`numpy` array) is an error and will lead to undefined behavior. Post-Allocation Pagelocking ~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. function:: register_host_memory(ary, flags=0) Returns a :class:`numpy.ndarray` which shares memory with *ary*. This memory will be page-locked as long as the return value of this function is alive. The returned array's *base* attribute contains a :class:`RegisteredHostMemory` instance, whose *base* attribute in turn contains *ary*. CUDA 4.0 and newer. *ary*'s data address and size must be page-aligned. One way to achieve this is to use the functions in :ref:`aligned_host_memory`. .. versionadded:: 2011.1 .. class:: RegisteredHostMemory Inherits from :class:`HostPointer`. CUDA 4.0 and newer. .. versionadded:: 2011.1 .. method:: unregister() Unregister the page-lock on the host memory held by this instance. Note that this does not free the memory, it only frees the page-lock. .. attribute:: base Contains the Python object from which this instance was constructed. .. class:: HostPointer Represents a page-locked host pointer. .. method:: get_device_pointer() Return a device pointer that indicates the address at which this memory is mapped into the device's address space. Only available on CUDA 2.2 and newer. .. _managed_memory : Managed Memory ^^^^^^^^^^^^^^ CUDA 6.0 adds support for a "Unified Memory" model, which creates a managed virtual memory space that is visible to both CPUs and GPUs. The OS will migrate the physical pages associated with managed memory between the CPU and GPU as needed. This allows a numpy array on the host to be passed to kernels without first creating a DeviceAllocation and manually copying the host data to and from the device. .. note:: Managed memory is only available for some combinations of CUDA device, operating system, and host compiler target architecture. Check the CUDA C Programming Guide and CUDA release notes for details. .. warning:: This interface to managed memory should be considered experimental. It is provided as a preview, but for now the same interface stability guarantees as for the rest of PyCUDA do not apply. Managed Memory Allocation ~~~~~~~~~~~~~~~~~~~~~~~~~ .. function:: managed_empty(shape, dtype, order="C", mem_flags=0) Allocate a managed :class:`numpy.ndarray` of *shape*, *dtype* and *order*. *mem_flags* may be one of the values in :class:`mem_attach_flags`. For the meaning of the other parameters, please refer to the :mod:`numpy` documentation. Only available on CUDA 6.0 and newer. .. versionadded:: 2014.1 .. 
function:: managed_zeros(shape, dtype, order="C", mem_flags=0) Like :func:`managed_empty`, but initialized to zero. Only available on CUDA 6.0 and newer. .. versionadded:: 2014.1 .. function:: managed_empty_like(array, mem_flags=0) Only available on CUDA 6.0 and newer. .. versionadded:: 2014.1 .. function:: managed_zeros_like(array, mem_flags=0) Only available on CUDA 6.0 and newer. .. versionadded:: 2014.1 The :class:`numpy.ndarray` instances returned by these functions have an attribute *base* that references an object of type .. class:: ManagedAllocation An object representing an allocation of managed host memory. Once this object is deleted, its associated CUDA managed memory is freed. .. method:: free() Release the held memory now instead of when this object becomes unreachable. Any further use of the object (or its associated :mod:`numpy` array) is an error and will lead to undefined behavior. .. method:: get_device_pointer() Return a device pointer that indicates the address at which this memory is mapped into the device's address space. For managed memory, this is also the host pointer. .. method:: attach(mem_flags, stream=None) Alter the visibility of the managed allocation to be one of the values in :class:`mem_attach_flags`. A managed array can be made visible to the host CPU and the entire CUDA context with *mem_attach_flags.GLOBAL*, or limited to the CPU only with *mem_attach_flags.HOST*. If *mem_attach_flags.SINGLE* is selected, then the array will only be visible to the CPU and the provided instance of :class:`Stream`. Managed Memory Usage ~~~~~~~~~~~~~~~~~~~~ A managed numpy array is constructed and used on the host in a similar manner to a pagelocked array:: from pycuda.autoinit import context import pycuda.driver as cuda import numpy as np a = cuda.managed_empty(shape=10, dtype=np.float32, mem_flags=cuda.mem_attach_flags.GLOBAL) a[:] = np.linspace(0, 9, len(a)) # Fill array on host It can be passed to a GPU kernel, and used again on the host without an explicit copy:: from pycuda.compiler import SourceModule mod = SourceModule(""" __global__ void doublify(float *a) { a[threadIdx.x] *= 2; } """) doublify = mod.get_function("doublify") doublify(a, grid=(1,1), block=(len(a),1,1)) context.synchronize() # Wait for kernel completion before host access median = np.median(a) # Computed on host! .. warning:: The CUDA Unified Memory model has very specific rules regarding concurrent access of managed memory allocations. Host access to any managed array is not allowed while the GPU is executing a kernel, regardless of whether the array is in use by the running kernel. Failure to follow the concurrency rules will generate a segmentation fault, *causing the Python interpreter to terminate immediately*. Users of managed numpy arrays should read the "Unified Memory Programming" appendix of the CUDA C Programming Guide for further details on the concurrency restrictions. If you are encountering interpreter terminations due to concurrency issues, the ``faulthandler`` module may be helpful in locating the point in your Python program where the faulty access occurs. Arrays and Textures ^^^^^^^^^^^^^^^^^^^ .. class:: ArrayDescriptor .. attribute:: width .. attribute:: height .. attribute:: format A value of type :class:`array_format`. .. attribute:: num_channels .. class:: ArrayDescriptor3D .. attribute:: width .. attribute:: height .. attribute:: depth .. attribute:: format A value of type :class:`array_format`. CUDA 2.0 and above only. .. attribute:: num_channels .. 
class:: Array(descriptor) A 2D or 3D memory block that can only be accessed via texture references. *descriptor* can be of type :class:`ArrayDescriptor` or :class:`ArrayDescriptor3D`. .. method:: free() Release the array and its device memory now instead of when this object becomes unreachable. Any further use of the object is an error and will lead to undefined behavior. .. method:: get_descriptor() Return an :class:`ArrayDescriptor` object for this 2D array, like the one that was used to create it. .. method:: get_descriptor_3d() Return an :class:`ArrayDescriptor3D` object for this 3D array, like the one that was used to create it. CUDA 2.0 and above only. .. class:: SurfaceReference() .. note:: Instances of this class can only be constructed through :meth:`Module.get_surfref`. CUDA 3.1 and above. .. versionadded:: 0.94 .. method:: set_array(array, flags=0) Bind *self* to the :class:`Array` *array*. As long as *array* remains bound to this surface reference, it will not be freed--the surface reference keeps a reference to the array. .. method:: get_array() Get back the :class:`Array` to which *self* is bound. .. note:: This will be a different object than the one passed to :meth:`set_array`, but it will compare equal. .. class:: TextureReference() A handle to a binding of either linear memory or an :class:`Array` to a texture unit. .. method:: set_array(array) Bind *self* to the :class:`Array` *array*. As long as *array* remains bound to this texture reference, it will not be freed--the texture reference keeps a reference to the array. .. method:: set_address(devptr, bytes, allow_offset=False) Bind *self* to a chunk of linear memory starting at the integer address *devptr*, encompassing a number of *bytes*. Due to alignment requirements, the effective texture bind address may be different from the requested one by an offset. This method returns this offset in bytes. If *allow_offset* is ``False``, a nonzero value of this offset will cause an exception to be raised. Unlike for :class:`Array` objects, no life support is provided for linear memory bound to texture references. .. method:: set_address_2d(devptr, descr, pitch) Bind *self* as a 2-dimensional texture to a chunk of global memory at *devptr*. The line-to-line offset in bytes is given by *pitch*. Width, height and format are given in the :class:`ArrayDescriptor` *descr*. :meth:`set_format` need not and should not be called in addition to this method. .. method:: set_format(fmt, num_components) Set the texture to have :class:`array_format` *fmt* and to have *num_components* channels. .. method:: set_address_mode(dim, am) Set the address mode of dimension *dim* to *am*, which must be one of the :class:`address_mode` values. .. method:: set_flags(flags) Set the flags to a combination of the *TRSF_XXX* values. .. method:: get_array() Get back the :class:`Array` to which *self* is bound. .. note:: This will be a different object than the one passed to :meth:`set_array`, but it will compare equal. .. method:: get_address_mode(dim) .. method:: get_filter_mode() .. method:: get_format() Return a tuple *(fmt, num_components)*, where *fmt* is of type :class:`array_format`, and *num_components* is the number of channels in this texture. (Version 2.0 and above only.) .. method:: get_flags() .. data:: TRSA_OVERRIDE_FORMAT .. data:: TRSF_READ_AS_INTEGER .. data:: TRSF_NORMALIZED_COORDINATES .. data:: TR_DEFAULT
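A sketch of how the texture binding interface above is typically used together with :func:`matrix_to_array` (documented below); the kernel source and the texture name are invented for this example::

    import pycuda.autoinit
    import pycuda.driver as cuda
    import numpy as np
    from pycuda.compiler import SourceModule

    mod = SourceModule("""
    texture<float, 2> my_tex;   // example texture; name must match get_texref() below

    __global__ void copy_from_texture(float *dest, int width)
    {
        int x = threadIdx.x;
        int y = threadIdx.y;
        dest[y*width + x] = tex2D(my_tex, x, y);
    }
    """)

    a = np.random.randn(4, 4).astype(np.float32)
    ary = cuda.matrix_to_array(a, "C")   # build a 2D Array from the matrix

    texref = mod.get_texref("my_tex")
    texref.set_array(ary)                # bind the Array to the texture reference

    dest = np.zeros_like(a)
    copy_from_texture = mod.get_function("copy_from_texture")
    copy_from_texture(cuda.Out(dest), np.int32(4),
                      block=(4, 4, 1), texrefs=[texref])
    assert np.allclose(dest, a)
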
.. function:: matrix_to_array(matrix, order) Turn the two-dimensional :class:`numpy.ndarray` object *matrix* into an :class:`Array`. The `order` argument can be either `"C"` or `"F"`. If it is `"C"`, then `tex2D(x,y)` is going to fetch `matrix[y,x]`, and vice versa for `"F"`. .. function:: make_multichannel_2d_array(matrix, order) Turn the three-dimensional :class:`numpy.ndarray` object *matrix* into a 2D :class:`Array` with multiple channels. Depending on `order`, the `matrix`'s shape is interpreted as * `height, width, num_channels` for `order == "C"`, * `num_channels, width, height` for `order == "F"`. .. note :: This function assumes that *matrix* has been created with the memory order *order*. If that is not the case, the copied data will likely not be what you expect. .. _memset: Initializing Device Memory ^^^^^^^^^^^^^^^^^^^^^^^^^^ .. function:: memset_d8(dest, data, count) .. function:: memset_d16(dest, data, count) .. function:: memset_d32(dest, data, count) .. note:: *count* is the number of elements, not bytes. .. function:: memset_d2d8(dest, pitch, data, width, height) .. function:: memset_d2d16(dest, pitch, data, width, height) .. function:: memset_d2d32(dest, pitch, data, width, height) Unstructured Memory Transfers ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. function:: memcpy_htod(dest, src) Copy from the Python buffer *src* to the device pointer *dest* (an :class:`int` or a :class:`DeviceAllocation`). The size of the copy is determined by the size of the buffer. .. function:: memcpy_htod_async(dest, src, stream=None) Copy from the Python buffer *src* to the device pointer *dest* (an :class:`int` or a :class:`DeviceAllocation`) asynchronously, optionally serialized via *stream*. The size of the copy is determined by the size of the buffer. *src* must be page-locked memory, see, e.g. :func:`pagelocked_empty`. New in 0.93. .. function:: memcpy_dtoh(dest, src) Copy from the device pointer *src* (an :class:`int` or a :class:`DeviceAllocation`) to the Python buffer *dest*. The size of the copy is determined by the size of the buffer. .. function:: memcpy_dtoh_async(dest, src, stream=None) Copy from the device pointer *src* (an :class:`int` or a :class:`DeviceAllocation`) to the Python buffer *dest* asynchronously, optionally serialized via *stream*. The size of the copy is determined by the size of the buffer. *dest* must be page-locked memory, see, e.g. :func:`pagelocked_empty`. New in 0.93. .. function:: memcpy_dtod(dest, src, size) .. function:: memcpy_dtod_async(dest, src, size, stream=None) CUDA 3.0 and above. .. versionadded:: 0.94 .. function:: memcpy_peer(dest, src, size, dest_context=None, src_context=None) .. function:: memcpy_peer_async(dest, src, size, dest_context=None, src_context=None, stream=None) CUDA 4.0 and above. .. versionadded:: 2011.1 .. function:: memcpy_dtoa(ary, index, src, len) .. function:: memcpy_atod(dest, ary, index, len) .. function:: memcpy_htoa(ary, index, src) .. function:: memcpy_atoh(dest, ary, index) .. function:: memcpy_atoa(dest, dest_index, src, src_index, len) Structured Memory Transfers ^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. class:: Memcpy2D() .. attribute:: src_x_in_bytes X offset of the origin of the copy. (initialized to 0) .. attribute:: src_y Y offset of the origin of the copy. (initialized to 0) .. attribute:: src_pitch Size of a row in bytes at the origin of the copy. .. method:: set_src_host(buffer) Set the *buffer*, which must be a Python object adhering to the buffer interface, to be the origin of the copy. .. method:: set_src_array(array) Set the :class:`Array` *array* to be the origin of the copy. .. 
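A sketch of how the asynchronous transfer functions above combine with :ref:`pagelocked_memory` and a :class:`Stream`; the array size and contents are arbitrary::

    import pycuda.autoinit
    import pycuda.driver as cuda
    import numpy as np

    stream = cuda.Stream()

    # Asynchronous copies require page-locked host memory on both ends.
    a = cuda.pagelocked_empty((1024,), np.float32)
    a[:] = np.arange(1024, dtype=np.float32)
    result = cuda.pagelocked_empty_like(a)

    dev_buf = cuda.mem_alloc(a.nbytes)
    cuda.memcpy_htod_async(dev_buf, a, stream)       # enqueue host -> device
    cuda.memcpy_dtoh_async(result, dev_buf, stream)  # enqueue device -> host

    stream.synchronize()   # wait until both copies have finished
    assert (result == a).all()
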
method:: set_src_device(devptr) Set the device address *devptr* (an :class:`int` or a :class:`DeviceAllocation`) as the origin of the copy. .. method:: set_src_unified(buffer) Same as :meth:`set_src_host`, except that *buffer* may also correspond to device memory. CUDA 4.0 and above. Requires unified addressing. .. versionadded:: 2011.1 .. attribute :: dst_x_in_bytes X offset of the destination of the copy. (initialized to 0) .. attribute :: dst_y Y offset of the destination of the copy. (initialized to 0) .. attribute :: dst_pitch Size of a row in bytes at the destination of the copy. .. method:: set_dst_host(buffer) Set the *buffer*, which must be a Python object adhering to the buffer interface, to be the destination of the copy. .. method:: set_dst_array(array) Set the :class:`Array` *array* to be the destination of the copy. .. method:: set_dst_device(devptr) Set the device address *devptr* (an :class:`int` or a :class:`DeviceAllocation`) as the destination of the copy. .. method:: set_dst_unified(buffer) Same as :meth:`set_dst_host`, except that *buffer* may also correspond to device memory. CUDA 4.0 and above. Requires unified addressing. .. versionadded:: 2011.1 .. attribute:: width_in_bytes Number of bytes to copy for each row in the transfer. .. attribute:: height Number of rows to copy. .. method:: __call__([aligned=True]) Perform the specified memory copy, waiting for it to finish. If *aligned* is *False*, tolerate device-side misalignment for device-to-device copies that may lead to loss of copy bandwidth. .. method:: __call__(stream) Perform the memory copy asynchronously, serialized via the :class:`Stream` *stream*. Any host memory involved in the transfer must be page-locked. .. class:: Memcpy3D() :class:`Memcpy3D` has the same members as :class:`Memcpy2D`, and additionally all of the following: .. attribute:: src_height Ignored when source is an :class:`Array`. May be 0 if Depth==1. .. attribute:: src_z Z offset of the origin of the copy. (initialized to 0) .. attribute:: dst_height Ignored when destination is an :class:`Array`. May be 0 if Depth==1. .. attribute:: dst_z Z offset of the destination of the copy. (initialized to 0) .. attribute:: depth :class:`Memcpy3D` is supported on CUDA 2.0 and above only. .. class:: Memcpy3DPeer() :class:`Memcpy3DPeer` has the same members as :class:`Memcpy3D`, and additionally all of the following: .. method:: set_src_context(ctx) .. method:: set_dst_context(ctx) CUDA 4.0 and newer. .. versionadded:: 2011.1 Code on the Device: Modules and Functions ----------------------------------------- .. class:: Module Handle to a CUBIN module loaded onto the device. Can be created with :func:`module_from_file` and :func:`module_from_buffer`. .. method:: get_function(name) Return the :class:`Function` *name* in this module. .. warning:: While you can obtain different handles to the same function using this method, these handles all share the same state that is set through the ``set_XXX`` methods of :class:`Function`. This means that you can't obtain two different handles to the same function and :meth:`Function.prepare` them in two different ways. .. method:: get_global(name) Return a tuple `(device_ptr, size_in_bytes)` giving the device address and size of the global *name*. The main use of this method is to find the address of pre-declared `__constant__` arrays so they can be filled from the host before kernel invocation. .. method:: get_texref(name) Return the :class:`TextureReference` *name* from this module. .. 
method:: get_surfref(name) Return the :class:`SurfaceReference` *name* from this module. CUDA 3.1 and above. .. versionadded:: 0.94 .. function:: module_from_file(filename) Create a :class:`Module` by loading the CUBIN file *filename*. .. function:: module_from_buffer(buffer, options=[], message_handler=None) Create a :class:`Module` by loading a PTX or CUBIN module from *buffer*, which must support the Python buffer interface. (For example, :class:`str` and :class:`numpy.ndarray` do.) :param options: A list of tuples (:class:`jit_option`, value). :param message_handler: A callable that is called with arguments of ``(compile_success_bool, info_str, error_str)`` which allows the user to process error and warning messages from the PTX compiler. Loading PTX modules as well as non-default values of *options* and *message_handler* are only allowed on CUDA 2.1 and newer. .. class:: Function Handle to a *__global__* function in a :class:`Module`. Create using :meth:`Module.get_function`. .. method:: __call__(arg1, ..., argn, block=block_size, [grid=(1,1), [stream=None, [shared=0, [texrefs=[], [time_kernel=False]]]]]) Launch *self*, with a thread block size of *block*. *block* must be a 3-tuple of integers. *arg1* through *argn* are the positional C arguments to the kernel. See :meth:`param_set` for details. See especially the warnings there. *grid* specifies, as a 2-tuple, the number of thread blocks to launch, as a two-dimensional grid. *stream*, if specified, is a :class:`Stream` instance serializing the copying of input arguments (if any), execution, and the copying of output arguments (again, if any). *shared* gives the number of bytes available to the kernel in *extern __shared__* arrays. *texrefs* is a :class:`list` of :class:`TextureReference` instances that the function will have access to. The function returns either *None* or the number of seconds spent executing the kernel, depending on whether *time_kernel* is *True*. This is a convenience interface that can be used instead of the :meth:`param_*` and :meth:`launch_*` methods below. For a faster (but mildly less convenient) way of invoking kernels, see :meth:`prepare` and :meth:`prepared_call`. *arg1* through *argn* are allowed to be of the following types: * Subclasses of :class:`numpy.number`. These are sized number types such as :class:`numpy.uint32` or :class:`numpy.float32`. * :class:`DeviceAllocation` instances, which will become a device pointer to the allocated memory. * Instances of :class:`ArgumentHandler` subclasses. These can be used to automatically transfer :mod:`numpy` arrays onto and off of the device. * Objects supporting the Python :class:`buffer` interface. These chunks of bytes will be copied into the parameter space verbatim. * :class:`GPUArray` instances. .. warning:: You cannot pass values of Python's native :class:`int` or :class:`float` types to param_set. Since there is no unambiguous way to guess the size of these integers or floats, it complains with a :exc:`TypeError`. .. note:: This method has to guess the types of the arguments passed to it, which can make it somewhat slow. For a kernel that is invoked often, this can be inconvenient. For a faster (but mildly less convenient) way of invoking kernels, see :meth:`prepare` and :meth:`prepared_call`. .. method:: param_set_texref(texref) Make the :class:`TextureReference` texref available to the function.
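A sketch of how :meth:`Module.get_global` and the call interface above fit together; the kernel and its ``__constant__`` array are invented for this example::

    import pycuda.autoinit
    import pycuda.driver as cuda
    import numpy as np
    from pycuda.compiler import SourceModule

    mod = SourceModule("""
    __constant__ float coefficients[4];   // example constant array

    __global__ void scale(float *data)
    {
        data[threadIdx.x] *= coefficients[threadIdx.x % 4];
    }
    """)

    # Fill the __constant__ array from the host before launching the kernel.
    const_ptr, const_size = mod.get_global("coefficients")
    cuda.memcpy_htod(const_ptr, np.array([1, 2, 3, 4], dtype=np.float32))

    data = np.ones(16, dtype=np.float32)
    scale = mod.get_function("scale")
    scale(cuda.InOut(data), block=(16, 1, 1))

    print(data)   # each element scaled by the matching coefficient
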
.. method:: prepare(arg_types, block=None, shared=None, texrefs=[]) Prepare the invocation of this function by * setting up the argument types as `arg_types`. `arg_types` is expected to be an iterable containing type characters understood by the :mod:`struct` module or :class:`numpy.dtype` objects. (In addition, PyCUDA understands *'F'* and *'D'* for single- and double precision floating point numbers.) * setting the thread block shape for this function to `block`. * registering the texture references `texrefs` for use with this function. The :class:`TextureReference` objects in `texrefs` will be retained, and whatever these references are bound to at invocation time will be available through the corresponding texture references within the kernel. Return `self`. .. warning:: Passing *block* or *shared* not equal to *None* is deprecated as of version 2011.1. .. method:: prepared_call(grid, block, *args, shared_size=0) Invoke `self` using :meth:`launch_grid`, with `args`, a grid size of `grid`, and a block size of *block*. Assumes that :meth:`prepare` was called on *self*. The texture references given to :meth:`prepare` are set up as parameters, as well. .. versionchanged:: 2012.1 *shared_size* was added. .. method:: prepared_timed_call(grid, block, *args, shared_size=0) Invoke `self` using :meth:`launch_grid`, with `args`, a grid size of `grid`, and a block size of *block*. Assumes that :meth:`prepare` was called on *self*. The texture references given to :meth:`prepare` are set up as parameters, as well. Return a 0-ary callable that can be used to query the GPU time consumed by the call, in seconds. Once called, this callable will block until completion of the invocation. .. versionchanged:: 2012.1 *shared_size* was added. .. method:: prepared_async_call(grid, block, stream, *args, shared_size=0) Invoke `self` using :meth:`launch_grid_async`, with `args`, a grid size of `grid`, and a block size of *block*, serialized into the :class:`pycuda.driver.Stream` `stream`. If `stream` is None, do the same as :meth:`prepared_call`. Assumes that :meth:`prepare` was called on *self*. The texture references given to :meth:`prepare` are set up as parameters, as well. .. versionchanged:: 2012.1 *shared_size* was added. .. method:: get_attribute(attr) Return one of the attributes given by the :class:`function_attribute` value *attr*. All :class:`function_attribute` values may also be directly read as (lower-case) attributes on the :class:`Function` object itself, e.g. `func.num_regs`. CUDA 2.2 and newer. .. versionadded:: 0.93 .. method:: set_cache_config(fc) See :class:`func_cache` for possible values of *fc*. CUDA 3.0 (post-beta) and newer. .. versionadded:: 0.94 .. method:: set_shared_config(sc) See :class:`shared_config` for possible values of *sc*. CUDA 4.2 and newer. .. versionadded:: 2013.1 .. attribute:: local_size_bytes The number of bytes of local memory used by this function. On CUDA 2.1 and below, this is only available if this function is part of a :class:`pycuda.compiler.SourceModule`. It replaces the now-deprecated attribute `lmem`. .. attribute:: shared_size_bytes The number of bytes of shared memory used by this function. On CUDA 2.1 and below, this is only available if this function is part of a :class:`pycuda.compiler.SourceModule`. It replaces the now-deprecated attribute `smem`. .. attribute:: num_regs The number of 32-bit registers used by this function. On CUDA 2.1 and below, this is only available if this function is part of a :class:`pycuda.compiler.SourceModule`. It replaces the now-deprecated attribute `registers`.
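A sketch of the prepared-invocation interface described above; the kernel is invented for this example, and the type string ``"Pfi"`` declares a pointer, a float, and an integer argument::

    import pycuda.autoinit
    import pycuda.driver as cuda
    import numpy as np
    from pycuda.compiler import SourceModule

    mod = SourceModule("""
    __global__ void scale(float *data, float factor, int n)
    {
        int i = blockIdx.x*blockDim.x + threadIdx.x;
        if (i < n)
            data[i] *= factor;
    }
    """)

    a = np.arange(256, dtype=np.float32)
    a_gpu = cuda.mem_alloc(a.nbytes)
    cuda.memcpy_htod(a_gpu, a)

    scale = mod.get_function("scale")
    scale.prepare("Pfi")                                   # pointer, float, int
    scale.prepared_call((1, 1), (256, 1, 1), a_gpu, 2.0, 256)

    cuda.memcpy_dtoh(a, a_gpu)
    assert np.allclose(a, np.arange(256, dtype=np.float32) * 2)
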
.. method:: set_shared_size(bytes) Set *shared* to be the number of bytes available to the kernel in *extern __shared__* arrays. .. warning:: Deprecated as of version 2011.1. .. method:: set_block_shape(x, y, z) Set the thread block shape for this function. .. warning:: Deprecated as of version 2011.1. .. method:: param_set(arg1, ... argn) Set the positional arguments of *self* to *arg1* through *argn*. .. warning:: Deprecated as of version 2011.1. .. method:: param_set_size(bytes) Size the parameter space to *bytes*. .. warning:: Deprecated as of version 2011.1. .. method:: param_seti(offset, value) Set the integer at *offset* in the parameter space to *value*. .. warning:: Deprecated as of version 2011.1. .. method:: param_setf(offset, value) Set the float at *offset* in the parameter space to *value*. .. warning:: Deprecated as of version 2011.1. .. method:: launch() Launch a single thread block of *self*. .. warning:: Deprecated as of version 2011.1. .. method:: launch_grid(width, height) Launch a width*height grid of thread blocks of *self*. .. warning:: Deprecated as of version 2011.1. .. method:: launch_grid_async(width, height, stream) Launch a width*height grid of thread blocks of *self*, sequenced by the :class:`Stream` *stream*. .. warning:: Deprecated as of version 2011.1. .. class:: ArgumentHandler(array) .. class:: In(array) Inherits from :class:`ArgumentHandler`. Indicates that :class:`buffer` *array* should be copied to the compute device before invoking the kernel. .. class:: Out(array) Inherits from :class:`ArgumentHandler`. Indicates that :class:`buffer` *array* should be copied off the compute device after invoking the kernel. .. class:: InOut(array) Inherits from :class:`ArgumentHandler`. Indicates that :class:`buffer` *array* should be copied both onto the compute device before invoking the kernel, and off it afterwards. Profiler Control ================ CUDA 4.0 and newer. .. function:: initialize_profiler(config_file, output_file, output_mode) *output_mode* is one of the attributes of :class:`profiler_output_mode`. .. versionadded:: 2011.1 .. function:: start_profiler() .. versionadded:: 2011.1 .. function:: stop() .. versionadded:: 2011.1 Just-in-time Compilation ======================== .. module:: pycuda.compiler .. data:: DEFAULT_NVCC_FLAGS .. versionadded:: 2011.1 If no *options* are given in the calls below, the value of this list-type variable is used instead. This may be useful for injecting necessary flags into the compilation of automatically compiled kernels, such as those used by the module :mod:`pycuda.gpuarray`. The initial value of this variable is taken from the environment variable :envvar:`PYCUDA_DEFAULT_NVCC_FLAGS`. If you modify this variable in your code, please be aware that this is a globally shared variable that may be modified by multiple packages. Please exercise caution in such modifications--you risk breaking other people's code. .. class:: SourceModule(source, nvcc="nvcc", options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[]) Create a :class:`Module` from the CUDA source code *source*. The Nvidia compiler *nvcc* is assumed to be on the :envvar:`PATH` if no path to it is specified, and is invoked with *options* to compile the code. If *keep* is *True*, the compiler output directory is kept, and a line indicating its location in the file system is printed for debugging purposes. 
Unless *no_extern_c* is *True*, the given source code is wrapped in *extern "C" { ... }* to prevent C++ name mangling. `arch` and `code` specify the values to be passed for the :option:`-arch` and :option:`-code` options on the :program:`nvcc` command line. If `arch` is `None`, it defaults to the current context's device's compute capability. If `code` is `None`, it will not be specified. `cache_dir` gives the directory used for compiler caching. It has a sensible per-user default. If it is set to `False`, caching is disabled. This class exhibits the same public interface as :class:`pycuda.driver.Module`, but does not inherit from it. *Change note:* :class:`SourceModule` was moved from :mod:`pycuda.driver` to :mod:`pycuda.compiler` in version 0.93. .. function:: compile(source, nvcc="nvcc", options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[]) Perform the same compilation as the corresponding :class:`SourceModule` constructor, but only return resulting *cubin* file as a string. In particular, do not upload the code to the GPU. pycuda-2013.1.1+git20140310/doc/source/_templates/0002755000175000000500000000000012313360364017533 5ustar tomussrcpycuda-2013.1.1+git20140310/doc/source/_templates/layout.html0000644000175000000500000000012412313360364021731 0ustar tomussrc{% extends "!layout.html" %} {% set css_files = css_files + ['_static/akdoc.css']%} pycuda-2013.1.1+git20140310/doc/source/index.rst0000644000175000000500000000646212313360364017245 0ustar tomussrcWelcome to PyCUDA's documentation! ================================== PyCUDA gives you easy, Pythonic access to `Nvidia `_'s `CUDA `_ parallel computation API. Several wrappers of the CUDA API already exist--so why the need for PyCUDA? * Object cleanup tied to lifetime of objects. This idiom, often called `RAII `_ in C++, makes it much easier to write correct, leak- and crash-free code. PyCUDA knows about dependencies, too, so (for example) it won't detach from a context before all memory allocated in it is also freed. * Convenience. Abstractions like :class:`pycuda.compiler.SourceModule` and :class:`pycuda.gpuarray.GPUArray` make CUDA programming even more convenient than with Nvidia's C-based runtime. * Completeness. PyCUDA puts the full power of CUDA's driver API at your disposal, if you wish. * Automatic Error Checking. All CUDA errors are automatically translated into Python exceptions. * Speed. PyCUDA's base layer is written in C++, so all the niceties above are virtually free. * Helpful Documentation. You're looking at it. ;) Here's an example, to given you an impression:: import pycuda.autoinit import pycuda.driver as drv import numpy from pycuda.compiler import SourceModule mod = SourceModule(""" __global__ void multiply_them(float *dest, float *a, float *b) { const int i = threadIdx.x; dest[i] = a[i] * b[i]; } """) multiply_them = mod.get_function("multiply_them") a = numpy.random.randn(400).astype(numpy.float32) b = numpy.random.randn(400).astype(numpy.float32) dest = numpy.zeros_like(a) multiply_them( drv.Out(dest), drv.In(a), drv.In(b), block=(400,1,1), grid=(1,1)) print dest-a*b (This example is :file:`examples/hello_gpu.py` in the PyCUDA source distribution.) On the surface, this program will print a screenful of zeros. Behind the scenes, a lot more interesting stuff is going on: * PyCUDA has compiled the CUDA source code and uploaded it to the card. .. note:: This code doesn't have to be a constant--you can easily have Python generate the code you want to compile. 
See :ref:`metaprog`. * PyCUDA's numpy interaction code has automatically allocated space on the device, copied the numpy arrays *a* and *b* over, launched a 400x1x1 single-block grid, and copied *dest* back. Note that you can just as well keep your data on the card between kernel invocations--no need to copy data all the time. * See how there's no cleanup code in the example? That's not because we were lazy and just skipped it. It simply isn't needed. PyCUDA will automatically infer what cleanup is necessary and do it for you. Curious? Let's get started. Contents ========= .. toctree:: :maxdepth: 2 install tutorial driver util gl array metaprog misc Note that this guide will not explain CUDA programming and technology. Please refer to Nvidia's `programming documentation `_ for that. PyCUDA also has its own `web site `_, where you can find updates, new versions, documentation, and support. Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` pycuda-2013.1.1+git20140310/doc/Makefile0000644000175000000500000000433212313360364015536 0ustar tomussrc# Makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d build/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source .PHONY: help clean html web pickle htmlhelp latex changes linkcheck help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " pickle to make pickle files (usable by e.g. sphinx-web)" @echo " htmlhelp to make HTML files and a HTML help project" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " changes to make an overview over all changed/added/deprecated items" @echo " linkcheck to check all external links for integrity" clean: -rm -rf build/* html: mkdir -p build/html build/doctrees source/.static $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) build/html @echo @echo "Build finished. The HTML pages are in build/html." pickle: mkdir -p build/pickle build/doctrees $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) build/pickle @echo @echo "Build finished; now you can process the pickle files or run" @echo " sphinx-web build/pickle" @echo "to start the sphinx-web server." web: pickle htmlhelp: mkdir -p build/htmlhelp build/doctrees $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) build/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in build/htmlhelp." latex: mkdir -p build/latex build/doctrees $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) build/latex @echo @echo "Build finished; the LaTeX files are in build/latex." @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ "run these through (pdf)latex." changes: mkdir -p build/changes build/doctrees $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) build/changes @echo @echo "The overview file is in build/changes." linkcheck: mkdir -p build/linkcheck build/doctrees $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) build/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in build/linkcheck/output.txt." pycuda-2013.1.1+git20140310/bpl-subset/0002755000175000000500000000000012313360364015411 5ustar tomussrcpycuda-2013.1.1+git20140310/README_SETUP.txt0000644000175000000500000000210212313360364016000 0ustar tomussrcHi, welcome. 
This Python package uses aksetup for installation, which means that installation should be easy and quick. If you don't want to continue reading, just try the regular ./configure.py --help ./configure.py --some-options make sudo make install That should do the trick. (By the way: If a config option says "several ok", then you may specify several values, separated by commas.) aksetup also supports regular distutils installation, without using configure: python setup.py build sudo python setup.py install In this case, configuration is obtained from files in this order: /etc/aksetup-defaults.py $HOME/.aksetup-defaults.py $PACKAGEDIR/siteconf.py Once you've run configure, you can copy options from your siteconf.py file to one of these files, and you won't ever have to configure them again manually. In fact, you may pass the options "--update-user" and "--update-global" to configure, and it will automatically update these files for you. This is particularly handy if you want to perform an unattended or automatic installation via easy_install. pycuda-2013.1.1+git20140310/configure.py0000755000175000000500000000013312313360364015662 0ustar tomussrc#! /usr/bin/env python from aksetup_helper import configure_frontend configure_frontend() pycuda-2013.1.1+git20140310/setup.py0000644000175000000500000001716112313360364015047 0ustar tomussrc#!/usr/bin/env python # -*- coding: latin-1 -*- from os.path import dirname, join, normpath def search_on_path(filenames): """Find file on system path.""" # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52224 from os.path import exists, abspath from os import pathsep, environ search_path = environ["PATH"] paths = search_path.split(pathsep) for path in paths: for filename in filenames: if exists(join(path, filename)): return abspath(join(path, filename)) def get_config_schema(): from aksetup_helper import ConfigSchema, Option, \ IncludeDir, LibraryDir, Libraries, BoostLibraries, \ Switch, StringListOption, make_boost_base_options nvcc_path = search_on_path(["nvcc", "nvcc.exe"]) if nvcc_path is None: print("*** WARNING: nvcc not in path.") cuda_root_default = None else: cuda_root_default = normpath(join(dirname(nvcc_path), "..")) return ConfigSchema(make_boost_base_options() + [ Switch("USE_SHIPPED_BOOST", True, "Use included Boost library"), BoostLibraries("python"), BoostLibraries("thread"), Switch("CUDA_TRACE", False, "Enable CUDA API tracing"), Option("CUDA_ROOT", default=cuda_root_default, help="Path to the CUDA toolkit"), Option("CUDA_PRETEND_VERSION", help="Assumed CUDA version, in the form 3010 for 3.1."), IncludeDir("CUDA", None), Switch("CUDA_ENABLE_GL", False, "Enable CUDA GL interoperability"), Switch("CUDA_ENABLE_CURAND", True, "Enable CURAND library"), LibraryDir("CUDADRV", ["${CUDA_ROOT}/lib", "${CUDA_ROOT}/lib64"]), Libraries("CUDADRV", ["cuda"]), LibraryDir("CUDART", ["${CUDA_ROOT}/lib", "${CUDA_ROOT}/lib64"]), Libraries("CUDART", ["cudart"]), LibraryDir("CURAND", ["${CUDA_ROOT}/lib", "${CUDA_ROOT}/lib64"]), Libraries("CURAND", ["curand"]), StringListOption("CXXFLAGS", [], help="Any extra C++ compiler options to include"), StringListOption("LDFLAGS", [], help="Any extra linker options to include"), ]) def main(): import sys from aksetup_helper import (hack_distutils, get_config, setup, NumpyExtension, set_up_shipped_boost_if_requested, check_git_submodules) check_git_submodules() hack_distutils() conf = get_config(get_config_schema()) EXTRA_SOURCES, EXTRA_DEFINES = set_up_shipped_boost_if_requested("pycuda", conf) 
EXTRA_DEFINES["PYGPU_PACKAGE"] = "pycuda" EXTRA_DEFINES["PYGPU_PYCUDA"] = "1" LIBRARY_DIRS = conf["BOOST_LIB_DIR"] + conf["CUDADRV_LIB_DIR"] LIBRARIES = (conf["BOOST_PYTHON_LIBNAME"] + conf["BOOST_THREAD_LIBNAME"] + conf["CUDADRV_LIBNAME"]) if not conf["CUDA_INC_DIR"]: conf["CUDA_INC_DIR"] = [join(conf["CUDA_ROOT"], "include")] if conf["CUDA_TRACE"]: EXTRA_DEFINES["CUDAPP_TRACE_CUDA"] = 1 if conf["CUDA_PRETEND_VERSION"]: EXTRA_DEFINES["CUDAPP_PRETEND_CUDA_VERSION"] = conf["CUDA_PRETEND_VERSION"] INCLUDE_DIRS = ['src/cpp'] + conf["BOOST_INC_DIR"] + conf["CUDA_INC_DIR"] conf["USE_CUDA"] = True if 'darwin' in sys.platform and sys.maxsize == 2147483647: # The Python interpreter is running in 32 bit mode on OS X if "-arch" not in conf["CXXFLAGS"]: conf["CXXFLAGS"].extend(['-arch', 'i386', '-m32']) if "-arch" not in conf["LDFLAGS"]: conf["LDFLAGS"].extend(['-arch', 'i386', '-m32']) if 'darwin' in sys.platform: # set path to Cuda dynamic libraries, # as a safe substitute for DYLD_LIBRARY_PATH for lib_dir in conf["CUDADRV_LIB_DIR"]: conf["LDFLAGS"].extend(["-Xlinker", "-rpath", "-Xlinker", lib_dir]) if conf["CUDA_ENABLE_GL"]: EXTRA_SOURCES.append("src/wrapper/wrap_cudagl.cpp") EXTRA_DEFINES["HAVE_GL"] = 1 if conf["CUDA_ENABLE_CURAND"]: EXTRA_DEFINES["HAVE_CURAND"] = 1 EXTRA_SOURCES.extend([ "src/wrapper/wrap_curand.cpp" ]) LIBRARIES.extend(conf["CURAND_LIBNAME"]) LIBRARY_DIRS.extend(conf["CURAND_LIB_DIR"]) ver_dic = {} exec(compile(open("pycuda/__init__.py").read(), "pycuda/__init__.py", 'exec'), ver_dic) try: from distutils.command.build_py import build_py_2to3 as build_py except ImportError: # 2.x from distutils.command.build_py import build_py import sys if sys.version_info >= (3,): pvt_struct_source = "src/wrapper/_pvt_struct_v3.cpp" else: pvt_struct_source = "src/wrapper/_pvt_struct_v2.cpp" setup(name="pycuda", # metadata version=ver_dic["VERSION_TEXT"], description="Python wrapper for Nvidia CUDA", long_description=open("README.rst", "rt").read(), author="Andreas Kloeckner", author_email="inform@tiker.net", license="MIT", url="http://mathema.tician.de/software/pycuda", classifiers=[ 'Environment :: Console', 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Developers', 'Intended Audience :: Other Audience', 'Intended Audience :: Science/Research', 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Programming Language :: C++', 'Programming Language :: Python', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 2.4', 'Programming Language :: Python :: 2.5', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3.2', 'Programming Language :: Python :: 3.3', 'Topic :: Scientific/Engineering', 'Topic :: Scientific/Engineering :: Mathematics', 'Topic :: Scientific/Engineering :: Physics', 'Topic :: Scientific/Engineering :: Visualization', ], # build info packages=["pycuda", "pycuda.gl", "pycuda.sparse", "pycuda.compyte"], install_requires=[ "pytools>=2011.2", "pytest>=2", "decorator>=3.2.0" ], ext_package="pycuda", ext_modules=[ NumpyExtension("_driver", [ "src/cpp/cuda.cpp", "src/cpp/bitlog.cpp", "src/wrapper/wrap_cudadrv.cpp", "src/wrapper/mempool.cpp", ]+EXTRA_SOURCES, include_dirs=INCLUDE_DIRS, library_dirs=LIBRARY_DIRS, libraries=LIBRARIES, define_macros=list(EXTRA_DEFINES.items()), extra_compile_args=conf["CXXFLAGS"], extra_link_args=conf["LDFLAGS"], ), NumpyExtension("_pvt_struct", [pvt_struct_source], extra_compile_args=conf["CXXFLAGS"], 
extra_link_args=conf["LDFLAGS"], ), ], include_package_data=True, package_data={ "pycuda": [ "cuda/*.hpp", ] }, zip_safe=False, # 2to3 invocation cmdclass={'build_py': build_py}) if __name__ == '__main__': main()