pax_global_header00006660000000000000000000000064126146550730014523gustar00rootroot0000000000000052 comment=8be1465e784101770a0afb5172f04ff283c69927 scikit-cuda-0.5.1/000077500000000000000000000000001261465507300137265ustar00rootroot00000000000000scikit-cuda-0.5.1/AUTHORS000077700000000000000000000000001261465507300214402docs/source/authors.rstustar00rootroot00000000000000scikit-cuda-0.5.1/CHANGES000077700000000000000000000000001261465507300213062docs/source/changes.rstustar00rootroot00000000000000scikit-cuda-0.5.1/INSTALL000077700000000000000000000000001261465507300214022docs/source/install.rstustar00rootroot00000000000000scikit-cuda-0.5.1/LICENSE000077700000000000000000000000001261465507300213322docs/source/license.rstustar00rootroot00000000000000scikit-cuda-0.5.1/MANIFEST.in000066400000000000000000000005261261465507300154670ustar00rootroot00000000000000include AUTHORS CHANGES INSTALL LICENSE README.rst tox.ini recursive-include skcuda *.py recursive-include skcuda *.h recursive-include demos *.py recursive-include tests *.py include docs/source/*.rst include docs/Makefile include docs/source/conf.py include docs/source/_static/* include docs/source/_templates/*.html exclude MANIFEST.in scikit-cuda-0.5.1/Makefile000066400000000000000000000006441261465507300153720ustar00rootroot00000000000000PYTHON := `which python` DESTDIR = / NAME = scikit-cuda VERSION = $(shell $(PYTHON) -c 'import setup; print setup.VERSION') .PHONY: package build docs install test clean package: $(PYTHON) setup.py sdist --formats=gztar build: $(PYTHON) setup.py build docs: $(PYTHON) setup.py build_sphinx install: $(PYTHON) setup.py install --root=$(DESTDIR) test: $(PYTHON) setup.py test clean: $(PYTHON) setup.py clean scikit-cuda-0.5.1/README.rst000066400000000000000000000034451261465507300154230ustar00rootroot00000000000000.. -*- rst -*- .. image:: https://raw.githubusercontent.com/lebedov/scikit-cuda/master/docs/source/_static/logo.png :alt: scikit-cuda Package Description ------------------- scikit-cuda provides Python interfaces to many of the functions in the CUDA device/runtime, CUBLAS, CUFFT, and CUSOLVER libraries distributed as part of NVIDIA's `CUDA Programming Toolkit `_, as well as interfaces to select functions in the free and standard versions of the `CULA Dense Toolkit `_. Both low-level wrapper functions similar to their C counterparts and high-level functions comparable to those in `NumPy and Scipy `_ are provided. .. image:: https://zenodo.org/badge/doi/10.5281/zenodo.20211.svg :target: http://dx.doi.org/10.5281/zenodo.20211 :alt: 0.5.0 .. image:: https://img.shields.io/pypi/v/scikit-cuda.svg :target: https://pypi.python.org/pypi/scikit-cuda :alt: Latest Version .. image:: https://img.shields.io/pypi/dm/scikit-cuda.svg :target: https://pypi.python.org/pypi/scikit-cuda :alt: Downloads .. image:: http://prime4commit.com/projects/102.svg :target: http://prime4commit.com/projects/102 :alt: Support the project Documentation ------------- Package documentation is available at ``_. Development ----------- The latest source code can be obtained from ``_. Authors & Acknowledgments ------------------------- See the included `AUTHORS`_ file for more information. .. _AUTHORS: docs/source/authors.rst License ------- This software is licensed under the `BSD License `_. See the included `LICENSE`_ file for more information. .. 
_LICENSE: docs/source/license.rst scikit-cuda-0.5.1/demos/000077500000000000000000000000001261465507300150355ustar00rootroot00000000000000scikit-cuda-0.5.1/demos/diag_demo.py000066400000000000000000000014251261465507300173210ustar00rootroot00000000000000#!/usr/bin/env python """ Demonstrate diagonal matrix creation on the GPU. """ import pycuda.autoinit import pycuda.gpuarray as gpuarray import pycuda.driver as drv import numpy as np import skcuda.linalg as culinalg import skcuda.misc as cumisc culinalg.init() # Double precision is only supported by devices with compute # capability >= 1.3: import string demo_types = [np.float32, np.complex64] if cumisc.get_compute_capability(pycuda.autoinit.device) >= 1.3: demo_types.extend([np.float64, np.complex128]) for t in demo_types: print 'Testing real diagonal matrix creation for type ' + str(np.dtype(t)) v = np.array([1, 2, 3, 4, 5, 6], t) v_gpu = gpuarray.to_gpu(v) d_gpu = culinalg.diag(v_gpu); print 'Success status: ', np.all(d_gpu.get() == np.diag(v)) scikit-cuda-0.5.1/demos/dot_demo.py000066400000000000000000000034401261465507300172020ustar00rootroot00000000000000#!/usr/bin/env python """ Demonstrates multiplication of two matrices on the GPU. """ import pycuda.autoinit import pycuda.gpuarray as gpuarray import pycuda.driver as drv import numpy as np import skcuda.linalg as culinalg import skcuda.misc as cumisc culinalg.init() # Double precision is only supported by devices with compute # capability >= 1.3: import string demo_types = [np.float32, np.complex64] if cumisc.get_compute_capability(pycuda.autoinit.device) >= 1.3: demo_types.extend([np.float64, np.complex128]) for t in demo_types: print 'Testing matrix multiplication for type ' + str(np.dtype(t)) if np.iscomplexobj(t()): a = np.asarray(np.random.rand(10, 5)+1j*np.random.rand(10, 5), t) b = np.asarray(np.random.rand(5, 5)+1j*np.random.rand(5, 5), t) c = np.asarray(np.random.rand(5, 5)+1j*np.random.rand(5, 5), t) else: a = np.asarray(np.random.rand(10, 5), t) b = np.asarray(np.random.rand(5, 5), t) c = np.asarray(np.random.rand(5, 5), t) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c_gpu = gpuarray.to_gpu(c) temp_gpu = culinalg.dot(a_gpu, b_gpu) d_gpu = culinalg.dot(temp_gpu, c_gpu) temp_gpu.gpudata.free() del(temp_gpu) print 'Success status: ', np.allclose(np.dot(np.dot(a, b), c) , d_gpu.get()) print 'Testing vector multiplication for type ' + str(np.dtype(t)) if np.iscomplexobj(t()): d = np.asarray(np.random.rand(5)+1j*np.random.rand(5), t) e = np.asarray(np.random.rand(5)+1j*np.random.rand(5), t) else: d = np.asarray(np.random.rand(5), t) e = np.asarray(np.random.rand(5), t) d_gpu = gpuarray.to_gpu(d) e_gpu = gpuarray.to_gpu(e) temp = culinalg.dot(d_gpu, e_gpu) print 'Success status: ', np.allclose(np.dot(d, e), temp) scikit-cuda-0.5.1/demos/fft2d_batch_demo.py000066400000000000000000000024661261465507300205710ustar00rootroot00000000000000#!/usr/bin/env python """ Demonstrates how to use the PyCUDA interface to CUFFT to compute a batch of 2D FFTs. """ import pycuda.autoinit import pycuda.gpuarray as gpuarray import numpy as np import skcuda.fft as cu_fft print 'Testing fft/ifft..' 
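# The real-to-complex FFT plans used below keep only the non-redundant
# half of the spectrum, which is why the complex output array allocated
# further down has shape (batch_size, N, N/2+1) along the last axis
# instead of (batch_size, N, N).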
N = 256 batch_size = 16 x = np.empty((batch_size, N, N), np.float32) xf = np.empty((batch_size, N, N), np.complex64) y = np.empty((batch_size, N, N), np.float32) for i in xrange(batch_size): x[i, :, :] = np.asarray(np.random.rand(N, N), np.float32) xf[i, :, :] = np.fft.fft2(x[i, :, :]) y[i, :, :] = np.real(np.fft.ifft2(xf[i, :, :])) x_gpu = gpuarray.to_gpu(x) xf_gpu = gpuarray.empty((batch_size, N, N/2+1), np.complex64) plan_forward = cu_fft.Plan((N, N), np.float32, np.complex64, batch_size) cu_fft.fft(x_gpu, xf_gpu, plan_forward) y_gpu = gpuarray.empty_like(x_gpu) plan_inverse = cu_fft.Plan((N, N), np.complex64, np.float32, batch_size) cu_fft.ifft(xf_gpu, y_gpu, plan_inverse, True) print 'Success status: ', np.allclose(y, y_gpu.get(), atol=1e-6) print 'Testing in-place fft..' x = np.empty((batch_size, N, N), np.complex64) x_gpu = gpuarray.to_gpu(x) plan = cu_fft.Plan((N, N), np.complex64, np.complex64, batch_size) cu_fft.fft(x_gpu, x_gpu, plan) cu_fft.ifft(x_gpu, x_gpu, plan, True) print 'Success status: ', np.allclose(x, x_gpu.get(), atol=1e-6) scikit-cuda-0.5.1/demos/fft2d_demo.py000066400000000000000000000021211261465507300174140ustar00rootroot00000000000000#!/usr/bin/env python """ Demonstrates how to use the PyCUDA interface to CUFFT to compute 2D FFTs. """ import pycuda.autoinit import pycuda.gpuarray as gpuarray import numpy as np import skcuda.fft as cu_fft print 'Testing fft/ifft..' N = 1024 M = N/2 x = np.asarray(np.random.rand(N, M), np.float32) xf = np.fft.fft2(x) y = np.real(np.fft.ifft2(xf)) x_gpu = gpuarray.to_gpu(x) xf_gpu = gpuarray.empty((x.shape[0], x.shape[1]/2+1), np.complex64) plan_forward = cu_fft.Plan(x_gpu.shape, np.float32, np.complex64) cu_fft.fft(x_gpu, xf_gpu, plan_forward) y_gpu = gpuarray.empty_like(x_gpu) plan_inverse = cu_fft.Plan(x_gpu.shape, np.complex64, np.float32) cu_fft.ifft(xf_gpu, y_gpu, plan_inverse, True) print 'Success status: ', np.allclose(y, y_gpu.get(), atol=1e-6) print 'Testing in-place fft..' x = np.asarray(np.random.rand(N, M)+1j*np.random.rand(N, M), np.complex64) x_gpu = gpuarray.to_gpu(x) plan = cu_fft.Plan(x_gpu.shape, np.complex64, np.complex64) cu_fft.fft(x_gpu, x_gpu, plan) cu_fft.ifft(x_gpu, x_gpu, plan, True) print 'Success status: ', np.allclose(x, x_gpu.get(), atol=1e-6) scikit-cuda-0.5.1/demos/fft_batch_demo.py000066400000000000000000000022161261465507300203340ustar00rootroot00000000000000#!/usr/bin/env python """ Demonstrates how to use the PyCUDA interface to CUFFT to compute a batch of 1D FFTs. """ import pycuda.autoinit import pycuda.gpuarray as gpuarray import numpy as np import skcuda.fft as cu_fft print 'Testing fft/ifft..' N = 4096*16 batch_size = 16 x = np.asarray(np.random.rand(batch_size, N), np.float32) xf = np.fft.fft(x) y = np.real(np.fft.ifft(xf)) x_gpu = gpuarray.to_gpu(x) xf_gpu = gpuarray.empty((batch_size, N/2+1), np.complex64) plan_forward = cu_fft.Plan(N, np.float32, np.complex64, batch_size) cu_fft.fft(x_gpu, xf_gpu, plan_forward) y_gpu = gpuarray.empty_like(x_gpu) plan_inverse = cu_fft.Plan(N, np.complex64, np.float32, batch_size) cu_fft.ifft(xf_gpu, y_gpu, plan_inverse, True) print 'Success status: ', np.allclose(y, y_gpu.get(), atol=1e-6) print 'Testing in-place fft..' 
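# The complex-to-complex test below runs the transform in place: the same
# GPU array is passed as both the input and the output of fft()/ifft().
# The trailing True argument to ifft() requests that the inverse transform
# be scaled by 1/N, so the round trip reproduces the original data.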
x = np.asarray(np.random.rand(batch_size, N)+\ 1j*np.random.rand(batch_size, N), np.complex64) x_gpu = gpuarray.to_gpu(x) plan = cu_fft.Plan(N, np.complex64, np.complex64, batch_size) cu_fft.fft(x_gpu, x_gpu, plan) cu_fft.ifft(x_gpu, x_gpu, plan, True) print 'Success status: ', np.allclose(x, x_gpu.get(), atol=1e-6) scikit-cuda-0.5.1/demos/fft_demo.py000066400000000000000000000020521261465507300171710ustar00rootroot00000000000000#!/usr/bin/env python """ Demonstrates how to use the PyCUDA interface to CUFFT to compute 1D FFTs. """ import pycuda.autoinit import pycuda.gpuarray as gpuarray import numpy as np import skcuda.fft as cu_fft print 'Testing fft/ifft..' N = 4096*16 x = np.asarray(np.random.rand(N), np.float32) xf = np.fft.fft(x) y = np.real(np.fft.ifft(xf)) x_gpu = gpuarray.to_gpu(x) xf_gpu = gpuarray.empty(N/2+1, np.complex64) plan_forward = cu_fft.Plan(x_gpu.shape, np.float32, np.complex64) cu_fft.fft(x_gpu, xf_gpu, plan_forward) y_gpu = gpuarray.empty_like(x_gpu) plan_inverse = cu_fft.Plan(x_gpu.shape, np.complex64, np.float32) cu_fft.ifft(xf_gpu, y_gpu, plan_inverse, True) print 'Success status: ', np.allclose(y, y_gpu.get(), atol=1e-6) print 'Testing in-place fft..' x = np.asarray(np.random.rand(N)+1j*np.random.rand(N), np.complex64) x_gpu = gpuarray.to_gpu(x) plan = cu_fft.Plan(x_gpu.shape, np.complex64, np.complex64) cu_fft.fft(x_gpu, x_gpu, plan) cu_fft.ifft(x_gpu, x_gpu, plan, True) print 'Success status: ', np.allclose(x, x_gpu.get(), atol=1e-6) scikit-cuda-0.5.1/demos/indexing_2d_demo.py000066400000000000000000000043511261465507300206100ustar00rootroot00000000000000#!/usr/bin/env python """ Demonstrates how to access 2D arrays within a PyCUDA kernel in a numpy-consistent manner. """ from string import Template import pycuda.autoinit import pycuda.gpuarray as gpuarray from pycuda.compiler import SourceModule import numpy as np import skcuda.misc as misc A = 3 B = 4 N = A*B # Define a 2D array: # x_orig = np.arange(0, N, 1, np.float64) x_orig = np.asarray(np.random.rand(N), np.float64) x = x_orig.reshape((A, B)) # These functions demonstrate how to convert a linear index into subscripts: a = lambda i: i/B b = lambda i: np.mod(i, B) # Check that x[subscript(i)] is equivalent to x.flat[i]: subscript = lambda i: (a(i), b(i)) for i in xrange(x.size): assert x.flat[i] == x[subscript(i)] # Check that x[i, j] is equivalent to x.flat[index(i, j)]: index = lambda i, j: i*B+j for i in xrange(A): for j in xrange(B): assert x[i, j] == x.flat[index(i, j)] func_mod_template = Template(""" // Macro for converting subscripts to linear index: #define INDEX(a, b) a*${B}+b __global__ void func(double *x, unsigned int N) { // Obtain the linear index corresponding to the current thread: unsigned int idx = blockIdx.y*${max_threads_per_block}*${max_blocks_per_grid}+ blockIdx.x*${max_threads_per_block}+threadIdx.x; // Convert the linear index to subscripts: unsigned int a = idx/${B}; unsigned int b = idx%${B}; // Use the subscripts to access the array: if (idx < N) { if (b == 0) x[INDEX(a,b)] = 100; } } """) max_threads_per_block, max_block_dim, max_grid_dim = misc.get_dev_attrs(pycuda.autoinit.device) block_dim, grid_dim = misc.select_block_grid_sizes(pycuda.autoinit.device, x.shape) max_blocks_per_grid = max(max_grid_dim) func_mod = \ SourceModule(func_mod_template.substitute(max_threads_per_block=max_threads_per_block, max_blocks_per_grid=max_blocks_per_grid, A=A, B=B)) func = func_mod.get_function('func') x_gpu = gpuarray.to_gpu(x) func(x_gpu.gpudata, np.uint32(x_gpu.size), block=block_dim, 
grid=grid_dim) x_np = x.copy() x_np[:, 0] = 100 print 'Success status: ', np.allclose(x_np, x_gpu.get()) scikit-cuda-0.5.1/demos/indexing_3d_demo.py000066400000000000000000000046461261465507300206200ustar00rootroot00000000000000#!/usr/bin/env python """ Demonstrates how to access 3D arrays within a PyCUDA kernel in a numpy-consistent manner. """ from string import Template import pycuda.autoinit import pycuda.gpuarray as gpuarray from pycuda.compiler import SourceModule import numpy as np import skcuda.misc as misc A = 3 B = 4 C = 5 N = A*B*C # Define a 3D array: # x_orig = np.arange(0, N, 1, np.float64) x_orig = np.asarray(np.random.rand(N), np.float64) x = x_orig.reshape((A, B, C)) # These functions demonstrate how to convert a linear index into subscripts: a = lambda i: i/(B*C) b = lambda i: np.mod(i, B*C)/C c = lambda i: np.mod(np.mod(i, B*C), C) # Check that x[ind(i)] is equivalent to x.flat[i]: subscript = lambda i: (a(i), b(i), c(i)) for i in xrange(x.size): assert x.flat[i] == x[subscript(i)] # Check that x[i,j,k] is equivalent to x.flat[index(i,j,k)]: index = lambda i,j,k: i*B*C+j*C+k for i in xrange(A): for j in xrange(B): for k in xrange(C): assert x[i, j, k] == x.flat[index(i, j, k)] func_mod_template = Template(""" // Macro for converting subscripts to linear index: #define INDEX(a, b, c) a*${B}*${C}+b*${C}+c __global__ void func(double *x, unsigned int N) { // Obtain the linear index corresponding to the current thread: unsigned int idx = blockIdx.y*${max_threads_per_block}*${max_blocks_per_grid}+ blockIdx.x*${max_threads_per_block}+threadIdx.x; // Convert the linear index to subscripts: unsigned int a = idx/(${B}*${C}); unsigned int b = (idx%(${B}*${C}))/${C}; unsigned int c = (idx%(${B}*${C}))%${C}; // Use the subscripts to access the array: if (idx < N) { if (b == 0) x[INDEX(a,b,c)] = 100; } } """) max_threads_per_block, max_block_dim, max_grid_dim = misc.get_dev_attrs(pycuda.autoinit.device) block_dim, grid_dim = misc.select_block_grid_sizes(pycuda.autoinit.device, x.shape) max_blocks_per_grid = max(max_grid_dim) func_mod = \ SourceModule(func_mod_template.substitute(max_threads_per_block=max_threads_per_block, max_blocks_per_grid=max_blocks_per_grid, A=A, B=B, C=C)) func = func_mod.get_function('func') x_gpu = gpuarray.to_gpu(x) func(x_gpu.gpudata, np.uint32(x_gpu.size), block=block_dim, grid=grid_dim) x_np = x.copy() x_np[:, 0, :] = 100 print 'Success status: ', np.allclose(x_np, x_gpu.get()) scikit-cuda-0.5.1/demos/indexing_4d_demo.py000066400000000000000000000053141261465507300206120ustar00rootroot00000000000000#!/usr/bin/env python """ Demonstrates how to access 4D arrays within a PyCUDA kernel in a numpy-consistent manner. 
""" from string import Template import pycuda.autoinit import pycuda.gpuarray as gpuarray from pycuda.compiler import SourceModule import numpy as np import skcuda.misc as misc A = 3 B = 4 C = 5 D = 6 N = A*B*C*D # Define a 3D array: # x_orig = np.arange(0, N, 1, np.float64) x_orig = np.asarray(np.random.rand(N), np.float64) x = x_orig.reshape((A, B, C, D)) # These functions demonstrate how to convert a linear index into subscripts: a = lambda i: i/(B*C*D) b = lambda i: np.mod(i, B*C*D)/(C*D) c = lambda i: np.mod(np.mod(i, B*C*D), C*D)/D d = lambda i: np.mod(np.mod(np.mod(i, B*C*D), C*D), D) # Check that x[subscript(i)] is equivalent to x.flat[i]: subscript = lambda i: (a(i), b(i), c(i), d(i)) for i in xrange(x.size): assert x.flat[i] == x[subscript(i)] # Check that x[i,j,k,l] is equivalent to x.flat[index(i,j,k,l)]: index = lambda i,j,k,l: i*B*C*D+j*C*D+k*D+l for i in xrange(A): for j in xrange(B): for k in xrange(C): for l in xrange(D): assert x[i, j, k, l] == x.flat[index(i, j, k, l)] func_mod_template = Template(""" // Macro for converting subscripts to linear index: #define INDEX(a, b, c, d) a*${B}*${C}*${D}+b*${C}*${D}+c*${D}+d __global__ void func(double *x, unsigned int N) { // Obtain the linear index corresponding to the current thread: unsigned int idx = blockIdx.y*${max_threads_per_block}*${max_blocks_per_grid}+ blockIdx.x*${max_threads_per_block}+threadIdx.x; // Convert the linear index to subscripts: unsigned int a = idx/(${B}*${C}*${D}); unsigned int b = (idx%(${B}*${C}*${D}))/(${C}*${D}); unsigned int c = ((idx%(${B}*${C}*${D}))%(${C}*${D}))/${D}; unsigned int d = ((idx%(${B}*${C}*${D}))%(${C}*${D}))%${D}; // Use the subscripts to access the array: if (idx < N) { if (c == 0) x[INDEX(a,b,c,d)] = 100; } } """) max_threads_per_block, max_block_dim, max_grid_dim = misc.get_dev_attrs(pycuda.autoinit.device) block_dim, grid_dim = misc.select_block_grid_sizes(pycuda.autoinit.device, x.shape) max_blocks_per_grid = max(max_grid_dim) func_mod = \ SourceModule(func_mod_template.substitute(max_threads_per_block=max_threads_per_block, max_blocks_per_grid=max_blocks_per_grid, A=A, B=B, C=C, D=D)) func = func_mod.get_function('func') x_gpu = gpuarray.to_gpu(x) func(x_gpu.gpudata, np.uint32(x_gpu.size), block=block_dim, grid=grid_dim) x_np = x.copy() x_np[:, :, 0, :] = 100 print 'Success status: ', np.allclose(x_np, x_gpu.get()) scikit-cuda-0.5.1/demos/mdot_demo.py000066400000000000000000000023351261465507300173610ustar00rootroot00000000000000#!/usr/bin/env python """ Demonstrates multiplication of several matrices on the GPU. 
""" import pycuda.gpuarray as gpuarray import pycuda.driver as drv import pycuda.autoinit import numpy as np import skcuda.linalg as linalg import skcuda.misc as cumisc linalg.init() # Double precision is only supported by devices with compute # capability >= 1.3: import string demo_types = [np.float32, np.complex64] if cumisc.get_compute_capability(pycuda.autoinit.device) >= 1.3: demo_types.extend([np.float64, np.complex128]) for t in demo_types: print 'Testing multiple matrix multiplication for type ' + str(np.dtype(t)) if np.iscomplexobj(t()): a = np.asarray(np.random.rand(8, 4)+1j*np.random.rand(8, 4), t) b = np.asarray(np.random.rand(4, 4)+1j*np.random.rand(4, 4), t) c = np.asarray(np.random.rand(4, 4)+1j*np.random.rand(4, 4), t) else: a = np.asarray(np.random.rand(8, 4), t) b = np.asarray(np.random.rand(4, 4), t) c = np.asarray(np.random.rand(4, 4), t) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c_gpu = gpuarray.to_gpu(c) d_gpu = linalg.mdot(a_gpu, b_gpu, c_gpu) print 'Success status: ', np.allclose(np.dot(a, np.dot(b, c)), d_gpu.get()) scikit-cuda-0.5.1/demos/pinv_demo.py000066400000000000000000000017661261465507300174010ustar00rootroot00000000000000#!/usr/bin/env python """ Demonstrates computation of the pseudoinverse on the GPU. """ import pycuda.autoinit import pycuda.driver as drv import pycuda.gpuarray as gpuarray import numpy as np import skcuda.linalg as culinalg import skcuda.misc as cumisc culinalg.init() # Double precision is only supported by devices with compute # capability >= 1.3: import string import scikits.cuda.cula as cula demo_types = [np.float32, np.complex64] if cula._libcula_toolkit == 'premium' and \ cumisc.get_compute_capability(pycuda.autoinit.device) >= 1.3: demo_types.extend([np.float64, np.complex128]) for t in demo_types: print 'Testing pinv for type ' + str(np.dtype(t)) a = np.asarray((np.random.rand(50, 50)-0.5)/10, t) a_gpu = gpuarray.to_gpu(a) a_inv_gpu = culinalg.pinv(a_gpu) print 'Success status: ', np.allclose(np.linalg.pinv(a), a_inv_gpu.get(), atol=1e-2) print 'Maximum error: ', np.max(np.abs(np.linalg.pinv(a)-a_inv_gpu.get())) print '' scikit-cuda-0.5.1/demos/svd_demo.py000066400000000000000000000021101261465507300172010ustar00rootroot00000000000000#!/usr/bin/env python """ Demonstrates computation of the singular value decomposition on the GPU. """ import pycuda.autoinit import pycuda.driver as drv import pycuda.gpuarray as gpuarray import numpy as np import skcuda.linalg as culinalg import skcuda.misc as cumisc culinalg.init() # Double precision is only supported by devices with compute # capability >= 1.3: import string import scikits.cuda.cula as cula demo_types = [np.float32, np.complex64] if cula._libcula_toolkit == 'premium' and \ cumisc.get_compute_capability(pycuda.autoinit.device) >= 1.3: demo_types.extend([np.float64, np.complex128]) for t in demo_types: print 'Testing svd for type ' + str(np.dtype(t)) a = np.asarray((np.random.rand(50, 50)-0.5)/10, t) a_gpu = gpuarray.to_gpu(a) u_gpu, s_gpu, vh_gpu = culinalg.svd(a_gpu) a_rec = np.dot(u_gpu.get(), np.dot(np.diag(s_gpu.get()), vh_gpu.get())) print 'Success status: ', np.allclose(a, a_rec, atol=1e-3) print 'Maximum error: ', np.max(np.abs(a-a_rec)) print '' scikit-cuda-0.5.1/demos/transpose_demo.py000066400000000000000000000020671261465507300204360ustar00rootroot00000000000000#!/usr/bin/env python """ Demonstrates how to transpose matrices on the GPU. 
""" import pycuda.autoinit import pycuda.driver as drv import pycuda.gpuarray as gpuarray import numpy as np import skcuda.linalg as culinalg import skcuda.misc as cumisc culinalg.init() # Double precision is only supported by devices with compute # capability >= 1.3: import string demo_types = [np.float32, np.complex64] if cumisc.get_compute_capability(pycuda.autoinit.device) >= 1.3: demo_types.extend([np.float64, np.complex128]) for t in demo_types: print 'Testing transpose for type ' + str(np.dtype(t)) if np.iscomplexobj(t()): b = np.array([[1j, 2j, 3j, 4j, 5j, 6j], [7j, 8j, 9j, 10j, 11j, 12j]], t) else: a = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], t) a_gpu = gpuarray.to_gpu(a) at_gpu = culinalg.transpose(a_gpu) if np.iscomplexobj(t()): print 'Success status: ', np.all(np.conj(a.T) == at_gpu.get()) else: print 'Success status: ', np.all(a.T == at_gpu.get()) scikit-cuda-0.5.1/demos/tril_demo.py000066400000000000000000000016461261465507300173740ustar00rootroot00000000000000#!/usr/bin/env python """ Demonstrates how to extract the lower triangle of a matrix. """ import pycuda.autoinit import pycuda.driver as drv import numpy as np import pycuda.gpuarray as gpuarray import skcuda.linalg as culinalg import skcuda.misc as cumisc culinalg.init() # Double precision is only supported by devices with compute # capability >= 1.3: import string demo_types = [np.float32, np.complex64] if cumisc.get_compute_capability(pycuda.autoinit.device) >= 1.3: demo_types.extend([np.float64, np.complex128]) for t in demo_types: print 'Testing lower triangle extraction for type ' + str(np.dtype(t)) N = 10 if np.iscomplexobj(t()): a = np.asarray(np.random.rand(N, N), t) else: a = np.asarray(np.random.rand(N, N)+1j*np.random.rand(N, N), t) a_gpu = gpuarray.to_gpu(a) b_gpu = culinalg.tril(a_gpu, False) print 'Success status: ', np.allclose(b_gpu.get(), np.tril(a)) scikit-cuda-0.5.1/docs/000077500000000000000000000000001261465507300146565ustar00rootroot00000000000000scikit-cuda-0.5.1/docs/Makefile000066400000000000000000000061371261465507300163250ustar00rootroot00000000000000# Makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = BUILDDIR = build SRCDIR = source # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source .PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " changes to make an overview of all changed/added/deprecated items" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" clean: -rm -rf $(BUILDDIR)/ $(SRCDIR)/generated/ html: $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." dirhtml: $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 
pickle:
	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
	@echo
	@echo "Build finished; now you can process the pickle files."

json:
	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
	@echo
	@echo "Build finished; now you can process the JSON files."

htmlhelp:
	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."

qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/TED.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/TED.qhc"

latex:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \
	      "run these through (pdf)latex."

changes:
	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
	@echo
	@echo "The overview file is in $(BUILDDIR)/changes."

linkcheck:
	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/linkcheck/output.txt."

doctest:
	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
	@echo "Testing of doctests in the sources finished, look at the " \
	      "results in $(BUILDDIR)/doctest/output.txt."
scikit-cuda-0.5.1/docs/source/000077500000000000000000000000001261465507300161565ustar00rootroot00000000000000
scikit-cuda-0.5.1/docs/source/_static/000077500000000000000000000000001261465507300176045ustar00rootroot00000000000000
scikit-cuda-0.5.1/docs/source/_static/logo.png [tar header and binary PNG image data (scikit-cuda logo) omitted]
scikit-cuda-0.5.1/docs/source/_static/logo.svg [tar header and SVG logo markup omitted; visible embedded text: "SCIKIT-CUDA"]
scikit-cuda-0.5.1/docs/source/_templates/000077500000000000000000000000001261465507300203135ustar00rootroot00000000000000
scikit-cuda-0.5.1/docs/source/_templates/relations.html000066400000000000000000000004341261465507300232020ustar00rootroot00000000000000
{%- if prev %}
{{ "«"|safe }} Previous
{%- endif %}
{%- if next %}
Next {{ "»"|safe }}
  • {%- endif %} scikit-cuda-0.5.1/docs/source/authors.rst000066400000000000000000000045001261465507300203740ustar00rootroot00000000000000.. -*- rst -*- Authors & Acknowledgments ========================= This software was written and packaged by `Lev Givon `_. Although it depends upon the excellent `PyCUDA `_ package by `Andreas Klöckner `_, scikit-cuda is developed independently of PyCUDA. Special thanks are due to the following parties for their contributions: - `Frédéric Bastien `_ - CUBLAS version detection enhancements. - `David Wei Chiang `_ - Improvements to vectorized functions, bug fixes. - `Sander Dieleman `_ - CUBLAS 5 bindings. - `Chris Capdevila `_ - MacOS X library search fix. - `Ben Erichson `_ - QR decomposition, eigenvalue/eigenvector computation, Dynamic Mode Decomposition, randomized linear algebra routines. - `Ying Wei (Daniel) Fan `_ - Kindly permitted reuse of CUBLAS wrapper code in his PARRET Python package. - `Michael M. Forbes `_ - Improved MacOSX compatibility, bug fixes. - `Jacob Frelinger `_ - Various enhancements. - Tim Klein - Additional MAGMA wrappers. - `Eric Larson `_ - Various enhancements. - `Gregory R. Lee `_ - Enhanced FFT plan creation. - `Teodor Mihai Moldovan `_ - CUBLAS 5 bindings. - `Lars Pastewka `_ - FFT tests and FFTW compatibility mode configuration. - `Luke Pfister `_ - Bug fixes. - `Alex Rubinsteyn `_ - Support for CULA Dense Free R17. - `Xing Shi `_ - Bug fixes. - `Steve Taylor `_ - Cholesky factorization/solve functions. - Rob Turetsky - Useful feedback. - `Thomas Unterthiner `_ - Additional high-level and wrapper functions. - `Stefan van der Walt `_ - Bug fixes. - `Feng Wang `_ - Bug reports. - `Yiyin Zhou `_ - Patches, bug reports, and function wrappers scikit-cuda-0.5.1/docs/source/changes.rst000066400000000000000000000147031261465507300203250ustar00rootroot00000000000000.. -*- rst -*- Change Log ========== Release 0.5.1 - (October 30, 2015) ---------------------------------- * More CUSOLVER wrappers. * Eigenvalue/eigenvector computation (eng. by N. Ben Erichson). * QR decomposition (enh. by N. Ben Erichson). * Improved Windows 10 compatibility (enh. by N. Ben Erichson). * Function for constructing Vandermonde matrix in GPU memory (enh. by N. Ben Erichson). * Unrandomized and randomized Dynamic Mode Decomposition (enh. by N. Ben Erichson). * Randomized linear algebra routines (enh. by N. Ben Erichson). * Add triu function (enh. by N. Ben Erichson). * Support Bessel correction in computation of variance and standard deviation (#143). * Fix pip installation issues. Release 0.5.0 - (July 14, 2015) ------------------------------- * Rename package to scikit-cuda. * Reductions sum, mean, var, std, max, min, argmax, argmin accept keepdims option. * The same reductions now return a GPUArray instead of ndarray if axis=None. * Switch to PEP 440 version numbering. * Replace distribute_setup.py with ez_setup.py. * Improve support for latest NVIDIA GPUs. * Direct links to online NVIDIA documentation in CUBLAS, CUFFT wrapper docstrings. * Add wrappers for CUSOLVER in CUDA 7.0. * Add skcuda namespace package that contains all modules in scikits.cuda namespace. * Add more wrappers for CUBLAS 5 functions (enh. by Teodor Moldovan, Sander Dieleman). * Add support for CULA Dense Free R17 (enh. by Alex Rubinsteyn). * Memoize elementwise kernel used by ifft scaling (#37). * Speed up misc.maxabs using reduction and kernel memoization. * Speed up misc.cumsum using scan and kernel memoization. 
* Speed up linalg.conj and misc.diff using elementwise kernel and memoization. * Speed up special.{sici,exp1,expi} using elementwise kernel and memoization. * Add wrappers for experimental multi-GPU CULA routines in CULA Dense R14+. * Use ldconfig to find library paths rather than libdl (#39). * Fix win32 platform detection. * Add Cholesky factorization/solve routines (enh. by Steve Taylor). * Fix Cholesky factorization/solve routines (fix by Thomas Unterthiner). * Enable dot() function to operate inplace (enh. by Thomas Unterthiner). * Python 3 compatibility improvements (enh. by Thomas Unterthiner). * Support for Fortran-order arrays in dot() and cho_solve() (enh. by Thomas Unterthiner) * CULA-based matrix inversion (enh. by Thomas Unterthiner). * Add add_diag() function (enh. by Thomas Unterthiner). * Use cublas*copy in diag() function (enh. by Thomas Unterthiner). * Improved MacOSX compatibility (enh. by Michael M. Forbes). * Find CUBLAS version even when it is only accessible via LD_LIBRARY_PATH (enh. by Frédéric Bastien). * Get both major and minor version numbers from CUBLAS library when determining version. * Handle unset LD_LIBRARY_PATH variable (fix by Jan Schlüter). * Fix library search on MacOS X (fix by capdevc). * Fix library search on Windows. * Add Windows support to CULA wrappers. * Enable specification of memory pool allocator to linalg functions (enh. by Thomas Unterthiner). * Improve misc.select_block_grid_sizes() logic to handle different GPU hardware. * Compute transpose using CUDA 5.0 CUBLAS functions rather than with inefficient naive kernel. * Use ReadTheDocs theme when building HTML docs locally. * Support additional cufftPlanMany() parameters when creating FFT plans (enh. by Gregory R. Lee). * Improved Python 3.4 compatibility (enh. by Eric Larson). * Avoid unnecessary import of cublas when importing fft module (enh. by Eric Larson). * Matrix trace function (enh. by Thomas Unterthiner). * Functions for computing simple axis-wise stats over matrices (enh. by Thomas Unterthiner). * Matrix add_dot, add_matvec, div_matvec, mult_matvec functions (enh. by Thomas Unterthiner). * Faster dot_diag implementation using CUBLAS matrix-matrix multiplication (enh. by Thomas Unterthiner). * Memoize SourceModule calls to speed up various high-level functions (enh. by Thomas Unterthiner). * Function for computing matrix determinant (enh. by Thomas Unterthiner). * Function for computing min/max and argmin/argmax along a matrix axis (enh. by Thomas Unterthiner). * Set default value of the parameter 'overwrite' to False in all linalg functions. * Elementwise arithmetic operations with broadcasting up to 2 dimensions (enh. David Wei Chiang) Release 0.042 - (March 10, 2013) -------------------------------- * Add complex exponential integral. * Fix typo in cublasCgbmv. * Use CUBLAS v2 API, add preliminary support for CUBLAS 5 functions. * Detect CUBLAS version without initializing the GPU. * Work around numpy bug #1898. * Fix issues with pycuda installations done via easy_install/pip. * Add support for specifying streams when creating FFT plans. * Successfully find CULA R13a libraries. * Raise exceptions when functions in the full release of CULA Dense are invoked without the library installed. * Perform post-fft scaling in-place. * Fix broken Python 2.6 compatibility (#19). * Download distribute for package installation if it isn't available. * Prevent absence of CULA from causing import errors (enh. by Jacob Frelinger) * FFT batch tests and FFTW mode configuration (enh. 
by Lars Pastewka) Release 0.041 - (May 22, 2011) ------------------------------ * Fix bug preventing installation with pip. Release 0.04 - (May 11, 2011) ----------------------------- * Fix bug in cutoff_invert kernel. * Add get_compute_capability function and other goodies to misc module. * Use pycuda-complex.hpp to improve kernel readability. * Add integrate module. * Add unit tests for high-level functions. * Automatically determine device used by current context. * Support batched and multidimensional FFT operations. * Extended dot() function to support implicit transpose/Hermitian. * Support for in-place computation of singular vectors in svd() function. * Simplify kernel launch setup. * More CULA routine wrappers. * Wrappers for CULA R11 auxiliary routines. Release 0.03 - (November 22, 2010) ---------------------------------- * Add support for some functions in the premium version of CULA toolkit. * Add wrappers for all lapack functions in basic CULA toolkit. * Fix pinv() to properly invert complex matrices. * Add Hermitian transpose. * Add tril function. * Fix missing library detection. * Include missing CUDA headers in package. Release 0.02 - (September 21, 2010) ----------------------------------- * Add documentation. * Update copyright information. Release 0.01 - (September 17, 2010) ----------------------------------- * First public release. scikit-cuda-0.5.1/docs/source/conf.py000066400000000000000000000237431261465507300174660ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # scikit-cuda documentation build configuration file, created by # sphinx-quickstart on Fri Jul 19 10:11:54 2013. # # This file is execfile()d with the current directory set to its containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. import re, sys, os # Prevent project dependencies from interfering with autodoc: class Mock(object): __all__ = [] def __init__(self, *args, **kwargs): pass def __call__(self, *args, **kwargs): return Mock() @classmethod def __getattr__(cls, name): if name in ('__file__', '__path__'): return '/dev/null' elif name == 'typecodes': return {'AllInteger': [], 'AllFloat': []} elif name[0] == name[0].upper(): mockType = type(name, (), {}) mockType.__module__ = __name__ return mockType else: return Mock() def __getitem__(self, v): return Mock() MOCK_MODULES = ['numpy', 'pycuda', 'pycuda.compiler', 'pycuda.curandom', 'pycuda.driver', 'pycuda.elementwise', 'pycuda.gpuarray', 'pycuda.reduction', 'pycuda.scan', 'pycuda.tools', 'pytools'] for mod_name in MOCK_MODULES: sys.modules[mod_name] = Mock() # Prevent attempts at finding certain shared libraries from causing module # import exceptions: import ctypes ctypes.cdll.LoadLibrary = Mock() # Prevent pkg_resources requirements checking from raising exceptions due to # missing dependencies: import pkg_resources pkg_resources.require = Mock() # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
sys.path.append(os.path.abspath('../sphinxext')) sys.path.append(os.path.abspath('../skcuda')) sys.path.append(os.path.abspath('../../')) # Prevent cublas library load from interfering with doc build: import skcuda.utils def __temp(filename): return 'libcublas.so.6.5' __temp.__doc__ = skcuda.utils.get_soname.__doc__ skcuda.utils.get_soname = __temp # -- General configuration ----------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = ['sphinx.ext.autodoc', 'sphinx.ext.autosummary', 'sphinx.ext.intersphinx', 'sphinx.ext.pngmath', 'sphinx.ext.viewcode', 'numpydoc'] # Generate autosummary stubs: autosummary_generate = True # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix of source filenames. source_suffix = '.rst' # The encoding of source files. #source_encoding = 'utf-8' # The master toctree document. master_doc = 'index' # General information about the project. project = u'scikit-cuda' copyright = u'2009-2015, Lev Givon' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # The short X.Y version. version = '0.5' # The full version, including alpha/beta/rc tags. release = '0.5.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. language = 'en' # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: #today = '' # Else, today_fmt is used as the format for a strftime call. #today_fmt = '%B %d, %Y' # List of documents that shouldn't be included in the build. #unused_docs = [] # List of directories, relative to source directory, that shouldn't be searched # for source files. exclude_trees = ['build'] # The reST default role (used for this markup: `text`) to use for all documents. #default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. #show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # A list of ignored prefixes for module index sorting. #modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. #keep_warnings = False # -- Options for HTML output --------------------------------------------------- on_rtd = os.environ.get('READTHEDOCS', None) == 'True' if on_rtd: html_theme = 'default' else: import sphinx_rtd_theme # The theme to use for HTML and HTML Help pages. Major themes that come with # Sphinx are currently 'default' and 'sphinxdoc'. html_theme = 'sphinx_rtd_theme' # Add any paths that contain custom themes here, relative to this directory. html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". #html_title = None # A shorter title for the navigation bar. Default is the same as html_title. 
#html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. #html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. #html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". #html_static_path = ['_static'] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. #html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. #html_use_smartypants = True # Custom sidebar templates, maps document names to template names. #html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. #html_additional_pages = {} # If false, no module index is generated. #html_domain_indices = True # If false, no index is generated. html_use_index = True # If true, the index is split into individual pages for each letter. #html_split_index = False # If true, links to the reST sources are added to the pages. #html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. #html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. #html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. #html_use_opensearch = '' # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). #html_file_suffix = None # Output file base name for HTML help builder. htmlhelp_basename = 'scikit-cuda-doc' # -- Options for LaTeX output -------------------------------------------------- latex_elements = { # The paper size ('letterpaper' or 'a4paper'). 'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). #'pointsize': '10pt', # Additional stuff for the LaTeX preamble. 'preamble': """ \usepackage{amsmath} \usepackage{amsfonts} \usepackage{amssymb} \usepackage{ucs} \\renewcommand{\\familydefault}{\\sfdefault} """ } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ ('index', 'scikit-cuda-doc.tex', u'scikit-cuda Documentation', u'Lev Givon', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of # the title page. latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. latex_use_parts = True # If true, show page references after internal links. #latex_show_pagerefs = False # If true, show URL addresses after external links. #latex_show_urls = False # Documents to append as an appendix to all manuals. latex_appendices = [] # If false, no module index is generated. #latex_domain_indices = True # -- Options for manual page output -------------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). 
man_pages = [ ('index', 'scikit-cuda', u'scikit-cuda Documentation', [u'Lev Givon'], 1) ] # If true, show URL addresses after external links. #man_show_urls = False # -- Options for Texinfo output ------------------------------------------------ # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ ('index', 'scikit-cuda', u'scikit-cuda Documentation', u'Lev Givon', 'scikit-cuda', 'scikit-cuda', 'Miscellaneous'), ] # Documents to append as an appendix to all manuals. #texinfo_appendices = [] # If false, no module index is generated. #texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. #texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. #texinfo_no_detailmenu = False # Generate links to other projects' online references: intersphinx_mapping = { 'http://docs.python.org/2/': None, 'http://docs.scipy.org/doc/numpy/': None, 'http://documen.tician.de/pycuda/': None, } scikit-cuda-0.5.1/docs/source/index.rst000066400000000000000000000013031261465507300200140ustar00rootroot00000000000000.. -*- rst -*- scikit-cuda =========== scikit-cuda provides Python interfaces to many of the functions in the CUDA device/runtime, CUBLAS, CUFFT, and CUSOLVER libraries distributed as part of NVIDIA's `CUDA Programming Toolkit `_, as well as interfaces to select functions in the free and standard versions of the `CULA Dense Toolkit `_. Both low-level wrapper functions similar to their C counterparts and high-level functions comparable to those in `NumPy and Scipy `_ are provided. Contents -------- .. toctree:: :maxdepth: 2 install reference authors license changes Index ----- * :ref:`genindex` scikit-cuda-0.5.1/docs/source/install.rst000066400000000000000000000072321261465507300203620ustar00rootroot00000000000000.. -*- rst -*- Installation ============ Quick Installation ------------------ If you have `pip `_ installed, you should be able to install the latest stable release of ``scikit-cuda`` by running the following:: pip install scikit-cuda All dependencies should be automatically downloaded and installed if they are not already on your system. Obtaining the Latest Software ----------------------------- The latest stable and development versions of ``scikit-cuda`` can be downloaded from `GitHub `_ Online documentation is available at ``_ Installation Dependencies ------------------------- ``scikit-cuda`` requires that the following software packages be installed: * `Python `_ 2.7 or 3.4. * `Setuptools `_ 0.6c10 or later. * `Mako `_ 1.0.1 or later. * `NumPy `_ 1.2.0 or later. * `PyCUDA `_ 2014.1 or later (some parts of ``scikit-cuda`` might not work properly with earlier versions). * `NVIDIA CUDA Toolkit `_ 5.0 or later. To run the unit tests, the following packages are also required: * `nose `_ 0.11 or later. * `SciPy `_ 0.14.0 or later. Some of the linear algebra functionality relies on the CULA Dense toolkit; the single precision release of the toolkit is free of charge, but requires registration. Depending on the version of CULA installed, some functions may not be available: * `CULA `_ R16a or later. To build the documentation, the following packages are also required: * `Docutils `_ 0.5 or later. * `Jinja2 `_ 2.2 or later. * `Pygments `_ 0.8 or later. * `Sphinx `_ 1.0.1 or later. * `Sphinx ReadTheDocs Theme `_ 0.1.6 or later. 
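
If the optional test and documentation dependencies listed above are not
already present, installing them from PyPI in one step should work (the
package names below are assumed to match their PyPI entries)::

    pip install nose scipy docutils jinja2 pygments sphinx sphinx_rtd_theme
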
Platform Support ---------------- The software has been developed and tested on Linux; it should also work on other Unix-like platforms supported by the above packages. Parts of the package may work on Windows as well, but remain untested. Building and Installation ------------------------- ``scikit-cuda`` searches for CUDA libraries in the system library search path when imported. You may have to modify this path (e.g., by adding the path to the CUDA libraries to ``/etc/ld.so.conf`` and running ``ldconfig`` as root or to the ``LD_LIBRARY_PATH`` environmental variable on Linux, or by adding the CUDA library path to the ``DYLD_LIBRARY_PATH`` on MacOSX) if the libraries are not being found. To build and install the toolbox, download and unpack the source release and run:: python setup.py install from within the main directory in the release. To rebuild the documentation, run:: python setup.py build_sphinx Running the Unit Tests ---------------------- To run all of the package unit tests, download and unpack the package source tarball and run:: python setup.py test from within the main directory in the archive. Tests for individual modules (found in the ``tests/`` subdirectory) can also be run directly. Getting Started --------------- The functions provided by ``scikit-cuda`` are grouped into several submodules in the ``skcuda`` namespace package. Sample code demonstrating how to use different parts of the toolbox is located in the ``demos/`` subdirectory of the source release. Many of the high-level functions also contain doctests that describe their usage. scikit-cuda-0.5.1/docs/source/license.rst000066400000000000000000000027511261465507300203370ustar00rootroot00000000000000.. -*- rst -*- License ======= Copyright (c) 2009-2015, Lev Givon. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Lev Givon nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. scikit-cuda-0.5.1/docs/source/reference.rst000066400000000000000000000007261261465507300206530ustar00rootroot00000000000000.. -*- rst -*- Reference ========= Library Wrapper Routines ------------------------ .. 
toctree:: :maxdepth: 2 reference_cublas reference_cufft reference_cusolver reference_cula reference_pcula High-Level Routines ------------------- .. toctree:: :maxdepth: 2 reference_fft reference_integrate reference_linalg reference_rlinalg reference_special Other Routines -------------- .. toctree:: :maxdepth: 2 reference_misc scikit-cuda-0.5.1/docs/source/reference_cublas.rst000066400000000000000000000075771261465507300222170ustar00rootroot00000000000000.. -*- rst -*- .. currentmodule:: skcuda.cublas CUBLAS Routines =============== Helper Routines --------------- .. autosummary:: :toctree: generated/ :nosignatures: cublasCheckStatus cublasCreate cublasDestroy cublasGetCurrentCtx cublasGetStream cublasGetVersion cublasSetStream Wrapper Routines ---------------- Single Precision BLAS1 Routines ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: cublasIsamax cublasIsamin cublasSasum cublasSaxpy cublasScopy cublasSdot cublasSnrm2 cublasSrot cublasSrotg cublasSrotm cublasSrotmg cublasSscal cublasSswap cublasCaxpy cublasCcopy cublasCdotc cublasCdotu cublasCrot cublasCrotg cublasCscal cublasCsrot cublasCsscal cublasCswap cublasIcamax cublasIcamin cublasScasum cublasScnrm2 Double Precision BLAS1 Routines ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: cublasIdamax cublasIdamin cublasDasum cublasDaxpy cublasDcopy cublasDdot cublasDnrm2 cublasDrot cublasDrotg cublasDrotm cublasDrotmg cublasDscal cublasDswap cublasDzasum cublasDznrm2 cublasIzamax cublasIzamin cublasZaxpy cublasZcopy cublasZdotc cublasZdotu cublasZdrot cublasZdscal cublasZrot cublasZrotg cublasZscal cublasZswap Single Precision BLAS2 Routines ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: cublasSgbmv cublasSgemv cublasSger cublasSsbmv cublasSspmv cublasSspr cublasSspr2 cublasSsymv cublasSsyr cublasSsyr2 cublasStbmv cublasStbsv cublasStpmv cublasStpsv cublasStrmv cublasStrsv cublasCgbmv cublasCgemv cublasCgerc cublasCgeru cublasChbmv cublasChemv cublasCher cublasCher2 cublasChpmv cublasChpr cublasChpr2 cublasCtbmv cublasCtbsv cublasCtpmv cublasCtpsv cublasCtrmv cublasCtrsv Double Precision BLAS2 Routines ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: cublasDgbmv cublasDgemv cublasDger cublasDsbmv cublasDspmv cublasDspr cublasDspr2 cublasDsymv cublasDsyr cublasDsyr2 cublasDtbmv cublasDtbsv cublasDtpmv cublasDtpsv cublasDtrmv cublasDtrsv cublasZgbmv cublasZgemv cublasZgerc cublasZgeru cublasZhbmv cublasZhemv cublasZher cublasZher2 cublasZhpmv cublasZhpr cublasZhpr2 cublasZtbmv cublasZtbsv cublasZtpmv cublasZtpsv cublasZtrmv cublasZtrsv Single Precision BLAS3 Routines ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: cublasSgemm cublasSsymm cublasSsyrk cublasSsyr2k cublasStrmm cublasStrsm cublasCgemm cublasChemm cublasCherk cublasCher2k cublasCsymm cublasCsyrk cublasCsyr2k cublasCtrmm cublasCtrsm Double Precision BLAS3 Routines ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: cublasDgemm cublasDsymm cublasDsyrk cublasDsyr2k cublasDtrmm cublasDtrsm cublasZgemm cublasZhemm cublasZherk cublasZher2k cublasZsymm cublasZsyrk cublasZsyr2k cublasZtrmm cublasZtrsm Single-Precision BLAS-like Extension Routines ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
autosummary:: :toctree: generated/ :nosignatures: cublasSdgmm cublasSgeam cublasSgemmBatched cublasCgemmBatched cublasStrsmBatched cublasSgetrfBatched cublasCdgmm cublasCgeam Double-Precision BLAS-like Extension Routines ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: cublasDdgmm cublasDgeam cublasDgemmBatched cublasZgemmBatched cublasDtrsmBatched cublasDgetrfBatched cublasZdgmm cublasZgeam scikit-cuda-0.5.1/docs/source/reference_cufft.rst000066400000000000000000000010221261465507300220300ustar00rootroot00000000000000.. -*- rst -*- .. currentmodule:: skcuda.cufft CUFFT Routines ============== Helper Routines --------------- .. autosummary:: :toctree: generated/ :nosignatures: cufftCheckStatus Wrapper Routines ---------------- .. autosummary:: :toctree: generated/ :nosignatures: cufftPlan1d cufftPlan2d cufftPlan3d cufftPlanMany cufftDestroy cufftExecC2C cufftExecR2C cufftExecC2R cufftExecZ2Z cufftExecD2Z cufftExecZ2D cufftSetCompatibilityMode cufftSetStream scikit-cuda-0.5.1/docs/source/reference_cula.rst000066400000000000000000000063411261465507300216560ustar00rootroot00000000000000.. -*- rst -*- .. currentmodule:: skcuda.cula CULA Routines ============= Framework Routines ------------------ .. autosummary:: :toctree: generated/ :nosignatures: culaCheckStatus culaFreeBuffers culaGetCublasMinimumVersion culaGetCublasRuntimeVersion culaGetCudaDriverVersion culaGetCudaMinimumVersion culaGetCudaRuntimeVersion culaGetDeviceCount culaGetErrorInfo culaGetErrorInfoString culaGetExecutingDevice culaGetLastStatus culaGetStatusString culaGetVersion culaInitialize culaSelectDevice culaShutdown Auxiliary Routines ------------------ Single Precision Real ^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: culaDeviceSgeNancheck culaDeviceSgeTranspose culaDeviceSgeTransposeInplace Single Precision Complex ^^^^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: culaDeviceCgeConjugate culaDeviceCgeNancheck culaDeviceCgeTranspose culaDeviceCgeTransposeConjugate culaDeviceCgeTransposeInplace culaDeviceCgeTransposeConjugateInplace Double Precision Real ^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: culaDeviceDgeNancheck culaDeviceDgeTranspose culaDeviceDgeTransposeInplace Double Precision Complex ^^^^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: culaDeviceZgeConjugate culaDeviceZgeNancheck culaDeviceZgeTranspose culaDeviceZgeTransposeConjugate culaDeviceZgeTransposeInplace culaDeviceZgeTransposeConjugateInplace BLAS Routines ------------- Single Precision Real ^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: culaDeviceSgemm culaDeviceSgemv Single Precision Complex ^^^^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: culaDeviceCgemm culaDeviceCgemv Double Precision Real ^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: culaDeviceDgemm culaDeviceDgemv Double Precision Complex ^^^^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: culaDeviceZgemm culaDeviceZgemv LAPACK Routines --------------- Single Precision Real ^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: culaDeviceSgels culaDeviceSgeqrf culaDeviceSgesv culaDeviceSgesvd culaDeviceSgetrf culaDeviceSgglse culaDeviceSposv culaDeviceSpotrf Single Precision Complex ^^^^^^^^^^^^^^^^^^^^^^^^ .. 
autosummary:: :toctree: generated/ :nosignatures: culaDeviceCgels culaDeviceCgeqrf culaDeviceCgesv culaDeviceCgesvd culaDeviceCgetrf culaDeviceCgglse culaDeviceCposv culaDeviceCpotrf Double Precision Real ^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: culaDeviceDgels culaDeviceDgeqrf culaDeviceDgesv culaDeviceDgesvd culaDeviceDgetrf culaDeviceDgglse culaDeviceDposv culaDeviceDpotrf Double Precision Complex ^^^^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: culaDeviceZgels culaDeviceZgeqrf culaDeviceZgesv culaDeviceZgesvd culaDeviceZgetrf culaDeviceZgglse culaDeviceZposv culaDeviceZpotrf scikit-cuda-0.5.1/docs/source/reference_cusolver.rst000066400000000000000000000024001261465507300225640ustar00rootroot00000000000000.. -*- rst -*- .. currentmodule:: skcuda.cusolver CUSOLVER Routines ================= These routines are only available in CUDA 7.0 and later. Helper Routines --------------- .. autosummary:: :toctree: generated/ :nosignatures: cusolverDnCreate cusolverDnGetStream cusolverDnDestroy cusolverDnSetStream Wrapper Routines ---------------- Single Precision Routines ^^^^^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: cusolverDnSgeqrf_bufferSize cusolverDnSgeqrf cusolverDnSgesvd_bufferSize cusolverDnSgesvd cusolverDnSgetrf_bufferSize cusolverDnSgetrf cusolverDnSgetrs cusolverDnCgeqrf_bufferSize cusolverDnCgeqrf cusolverDnCgesvd_bufferSize cusolverDnCgesvd cusolverDnCgetrf_bufferSize cusolverDnCgetrf cusolverDnCgetrs Double Precision Routines ^^^^^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: cusolverDnDgeqrf_bufferSize cusolverDnDgeqrf cusolverDnDgesvd_bufferSize cusolverDnDgesvd cusolverDnDgetrf_bufferSize cusolverDnDgetrf cusolverDnDgetrs cusolverDnZgeqrf_bufferSize cusolverDnZgeqrf cusolverDnZgesvd_bufferSize cusolverDnZgesvd cusolverDnZgetrf_bufferSize cusolverDnZgetrf cusolverDnZgetrs scikit-cuda-0.5.1/docs/source/reference_fft.rst000066400000000000000000000002641261465507300215070ustar00rootroot00000000000000.. -*- rst -*- .. currentmodule:: skcuda.fft Fast Fourier Transform ====================== .. autosummary:: :toctree: generated/ :nosignatures: fft ifft Plan scikit-cuda-0.5.1/docs/source/reference_integrate.rst000066400000000000000000000002601261465507300227060ustar00rootroot00000000000000.. -*- rst -*- .. currentmodule:: skcuda.integrate Integration Routines ==================== .. autosummary:: :toctree: generated/ :nosignatures: trapz trapz2d scikit-cuda-0.5.1/docs/source/reference_linalg.rst000066400000000000000000000006711261465507300222000ustar00rootroot00000000000000.. -*- rst -*- .. currentmodule:: skcuda.linalg Linear Algebra Routines ======================= .. autosummary:: :toctree: generated/ :nosignatures: add_diag add_dot add_matvec cho_factor cho_solve conj det diag div_matvec dmd dot_diag dot eig eye hermitian inv mdot mult_matvec multiply norm pinv qr scale svd trace transpose tril triu vander scikit-cuda-0.5.1/docs/source/reference_misc.rst000066400000000000000000000011161261465507300216600ustar00rootroot00000000000000.. -*- rst -*- .. currentmodule:: skcuda.misc Miscellaneous Routines ====================== .. 
autosummary:: :toctree: generated/ :nosignatures: add add_matvec argmax argmin cumsum diff divide done_context get_by_index get_compute_capability get_current_device get_dev_attrs inf init init_context init_device iscomplextype isdoubletype max maxabs mean min multiply ones ones_like select_block_grid_sizes set_by_index set_realloc shutdown std subtract sum var zeros zeros_like scikit-cuda-0.5.1/docs/source/reference_pcula.rst000066400000000000000000000031051261465507300220310ustar00rootroot00000000000000.. -*- rst -*- .. currentmodule:: skcuda.pcula Multi-GPU CULA Routines ======================= Framework Routines ------------------ .. autosummary:: :toctree: generated/ :nosignatures: pculaConfigInit BLAS Routines ------------- Single Precision Real ^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: pculaSgemm pculaStrsm Single Precision Complex ^^^^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: pculaCgemm pculaCtrsm Double Precision Real ^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: pculaDgemm pculaDtrsm Double Precision Complex ^^^^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: pculaZgemm pculaZtrsm LAPACK Routines --------------- Single Precision Real ^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: pculaSgesv pculaSgetrf pculaSgetrs pculaSposv pculaSpotrf pculaSpotrs Single Precision Complex ^^^^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: pculaCgesv pculaCgetrf pculaCgetrs pculaCposv pculaCpotrf pculaCpotrs Double Precision Real ^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: pculaDgesv pculaDgetrf pculaDgetrs pculaDposv pculaDpotrf pculaDpotrs Double Precision Complex ^^^^^^^^^^^^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :nosignatures: pculaZgesv pculaZgetrf pculaZgetrs pculaZposv pculaZpotrf pculaZpotrs scikit-cuda-0.5.1/docs/source/reference_rlinalg.rst000066400000000000000000000003061261465507300223550ustar00rootroot00000000000000.. -*- rst -*- .. currentmodule:: skcuda.rlinalg Randomized Linear Algebra Routines ================================== .. autosummary:: :toctree: generated/ :nosignatures: rdmd rsvd scikit-cuda-0.5.1/docs/source/reference_special.rst000066400000000000000000000002651261465507300223510ustar00rootroot00000000000000.. -*- rst -*- .. currentmodule:: skcuda.special Special Math Functions ====================== .. autosummary:: :toctree: generated/ :nosignatures: exp1 expi sici scikit-cuda-0.5.1/docs/sphinxext/000077500000000000000000000000001261465507300167105ustar00rootroot00000000000000scikit-cuda-0.5.1/docs/sphinxext/LICENSE.txt000066400000000000000000000135071261465507300205410ustar00rootroot00000000000000------------------------------------------------------------------------------- The files - numpydoc.py - docscrape.py - docscrape_sphinx.py - phantom_import.py have the following license: Copyright (C) 2008 Stefan van der Walt , Pauli Virtanen Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------- The files - compiler_unparse.py - comment_eater.py - traitsdoc.py have the following license: This software is OSI Certified Open Source Software. OSI Certified is a certification mark of the Open Source Initiative. Copyright (c) 2006, Enthought, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Enthought, Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------- The file - plot_directive.py originates from Matplotlib (http://matplotlib.sf.net/) which has the following license: Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved. 1. This LICENSE AGREEMENT is between John D. Hunter (“JDH”), and the Individual or Organization (“Licensee”) accessing and otherwise using matplotlib software in source or binary form and its associated documentation. 2. Subject to the terms and conditions of this License Agreement, JDH hereby grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, analyze, test, perform and/or display publicly, prepare derivative works, distribute, and otherwise use matplotlib 0.98.3 alone or in any derivative version, provided, however, that JDH’s License Agreement and JDH’s notice of copyright, i.e., “Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved” are retained in matplotlib 0.98.3 alone or in any derivative version prepared by Licensee. 3. 
In the event Licensee prepares a derivative work that is based on or incorporates matplotlib 0.98.3 or any part thereof, and wants to make the derivative work available to others as provided herein, then Licensee hereby agrees to include in any such work a brief summary of the changes made to matplotlib 0.98.3. 4. JDH is making matplotlib 0.98.3 available to Licensee on an “AS IS” basis. JDH MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, JDH MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF MATPLOTLIB 0.98.3 WILL NOT INFRINGE ANY THIRD PARTY RIGHTS. 5. JDH SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF MATPLOTLIB 0.98.3 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING MATPLOTLIB 0.98.3, OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. 6. This License Agreement will automatically terminate upon a material breach of its terms and conditions. 7. Nothing in this License Agreement shall be deemed to create any relationship of agency, partnership, or joint venture between JDH and Licensee. This License Agreement does not grant permission to use JDH trademarks or trade name in a trademark sense to endorse or promote products or services of Licensee, or any third party. 8. By copying, installing or otherwise using matplotlib 0.98.3, Licensee agrees to be bound by the terms and conditions of this License Agreement. scikit-cuda-0.5.1/docs/sphinxext/comment_eater.py000066400000000000000000000124511261465507300221070ustar00rootroot00000000000000from __future__ import division, absolute_import, print_function import sys if sys.version_info[0] >= 3: from io import StringIO else: from io import StringIO import compiler import inspect import textwrap import tokenize from .compiler_unparse import unparse class Comment(object): """ A comment block. """ is_comment = True def __init__(self, start_lineno, end_lineno, text): # int : The first line number in the block. 1-indexed. self.start_lineno = start_lineno # int : The last line number. Inclusive! self.end_lineno = end_lineno # str : The text block including '#' character but not any leading spaces. self.text = text def add(self, string, start, end, line): """ Add a new comment line. """ self.start_lineno = min(self.start_lineno, start[0]) self.end_lineno = max(self.end_lineno, end[0]) self.text += string def __repr__(self): return '%s(%r, %r, %r)' % (self.__class__.__name__, self.start_lineno, self.end_lineno, self.text) class NonComment(object): """ A non-comment block of code. """ is_comment = False def __init__(self, start_lineno, end_lineno): self.start_lineno = start_lineno self.end_lineno = end_lineno def add(self, string, start, end, line): """ Add lines to the block. """ if string.strip(): # Only add if not entirely whitespace. self.start_lineno = min(self.start_lineno, start[0]) self.end_lineno = max(self.end_lineno, end[0]) def __repr__(self): return '%s(%r, %r)' % (self.__class__.__name__, self.start_lineno, self.end_lineno) class CommentBlocker(object): """ Pull out contiguous comment blocks. """ def __init__(self): # Start with a dummy. self.current_block = NonComment(0, 0) # All of the blocks seen so far. self.blocks = [] # The index mapping lines of code to their associated comment blocks. self.index = {} def process_file(self, file): """ Process a file object. 
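        Tokenizes the input with tokenize.generate_tokens, dispatches each token to process_token, and then calls make_index to map lines of code to their preceding comment blocks.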
""" if sys.version_info[0] >= 3: nxt = file.__next__ else: nxt = file.next for token in tokenize.generate_tokens(nxt): self.process_token(*token) self.make_index() def process_token(self, kind, string, start, end, line): """ Process a single token. """ if self.current_block.is_comment: if kind == tokenize.COMMENT: self.current_block.add(string, start, end, line) else: self.new_noncomment(start[0], end[0]) else: if kind == tokenize.COMMENT: self.new_comment(string, start, end, line) else: self.current_block.add(string, start, end, line) def new_noncomment(self, start_lineno, end_lineno): """ We are transitioning from a noncomment to a comment. """ block = NonComment(start_lineno, end_lineno) self.blocks.append(block) self.current_block = block def new_comment(self, string, start, end, line): """ Possibly add a new comment. Only adds a new comment if this comment is the only thing on the line. Otherwise, it extends the noncomment block. """ prefix = line[:start[1]] if prefix.strip(): # Oops! Trailing comment, not a comment block. self.current_block.add(string, start, end, line) else: # A comment block. block = Comment(start[0], end[0], string) self.blocks.append(block) self.current_block = block def make_index(self): """ Make the index mapping lines of actual code to their associated prefix comments. """ for prev, block in zip(self.blocks[:-1], self.blocks[1:]): if not block.is_comment: self.index[block.start_lineno] = prev def search_for_comment(self, lineno, default=None): """ Find the comment block just before the given line number. Returns None (or the specified default) if there is no such block. """ if not self.index: self.make_index() block = self.index.get(lineno, None) text = getattr(block, 'text', default) return text def strip_comment_marker(text): """ Strip # markers at the front of a block of comment text. """ lines = [] for line in text.splitlines(): lines.append(line.lstrip('#')) text = textwrap.dedent('\n'.join(lines)) return text def get_class_traits(klass): """ Yield all of the documentation for trait definitions on a class object. """ # FIXME: gracefully handle errors here or in the caller? source = inspect.getsource(klass) cb = CommentBlocker() cb.process_file(StringIO(source)) mod_ast = compiler.parse(source) class_ast = mod_ast.node.nodes[0] for node in class_ast.code.nodes: # FIXME: handle other kinds of assignments? if isinstance(node, compiler.ast.Assign): name = node.nodes[0].name rhs = unparse(node.expr).strip() doc = strip_comment_marker(cb.search_for_comment(node.lineno, default='')) yield name, rhs, doc scikit-cuda-0.5.1/docs/sphinxext/compiler_unparse.py000066400000000000000000000602441261465507300226370ustar00rootroot00000000000000""" Turn compiler.ast structures back into executable python code. The unparse method takes a compiler.ast tree and transforms it back into valid python code. It is incomplete and currently only works for import statements, function calls, function definitions, assignments, and basic expressions. Inspired by python-2.5-svn/Demo/parser/unparse.py fixme: We may want to move to using _ast trees because the compiler for them is about 6 times faster than compiler.compile. 
""" from __future__ import division, absolute_import, print_function import sys from compiler.ast import Const, Name, Tuple, Div, Mul, Sub, Add if sys.version_info[0] >= 3: from io import StringIO else: from StringIO import StringIO def unparse(ast, single_line_functions=False): s = StringIO() UnparseCompilerAst(ast, s, single_line_functions) return s.getvalue().lstrip() op_precedence = { 'compiler.ast.Power':3, 'compiler.ast.Mul':2, 'compiler.ast.Div':2, 'compiler.ast.Add':1, 'compiler.ast.Sub':1 } class UnparseCompilerAst: """ Methods in this class recursively traverse an AST and output source code for the abstract syntax; original formatting is disregarged. """ ######################################################################### # object interface. ######################################################################### def __init__(self, tree, file = sys.stdout, single_line_functions=False): """ Unparser(tree, file=sys.stdout) -> None. Print the source for tree to file. """ self.f = file self._single_func = single_line_functions self._do_indent = True self._indent = 0 self._dispatch(tree) self._write("\n") self.f.flush() ######################################################################### # Unparser private interface. ######################################################################### ### format, output, and dispatch methods ################################ def _fill(self, text = ""): "Indent a piece of text, according to the current indentation level" if self._do_indent: self._write("\n"+" "*self._indent + text) else: self._write(text) def _write(self, text): "Append a piece of text to the current line." self.f.write(text) def _enter(self): "Print ':', and increase the indentation." self._write(": ") self._indent += 1 def _leave(self): "Decrease the indentation level." self._indent -= 1 def _dispatch(self, tree): "_dispatcher function, _dispatching tree type T to method _T." if isinstance(tree, list): for t in tree: self._dispatch(t) return meth = getattr(self, "_"+tree.__class__.__name__) if tree.__class__.__name__ == 'NoneType' and not self._do_indent: return meth(tree) ######################################################################### # compiler.ast unparsing methods. # # There should be one method per concrete grammar type. They are # organized in alphabetical order. ######################################################################### def _Add(self, t): self.__binary_op(t, '+') def _And(self, t): self._write(" (") for i, node in enumerate(t.nodes): self._dispatch(node) if i != len(t.nodes)-1: self._write(") and (") self._write(")") def _AssAttr(self, t): """ Handle assigning an attribute of an object """ self._dispatch(t.expr) self._write('.'+t.attrname) def _Assign(self, t): """ Expression Assignment such as "a = 1". This only handles assignment in expressions. Keyword assignment is handled separately. """ self._fill() for target in t.nodes: self._dispatch(target) self._write(" = ") self._dispatch(t.expr) if not self._do_indent: self._write('; ') def _AssName(self, t): """ Name on left hand side of expression. Treat just like a name on the right side of an expression. """ self._Name(t) def _AssTuple(self, t): """ Tuple on left hand side of an expression. """ # _write each elements, separated by a comma. for element in t.nodes[:-1]: self._dispatch(element) self._write(", ") # Handle the last one without writing comma last_element = t.nodes[-1] self._dispatch(last_element) def _AugAssign(self, t): """ +=,-=,*=,/=,**=, etc. 
operations """ self._fill() self._dispatch(t.node) self._write(' '+t.op+' ') self._dispatch(t.expr) if not self._do_indent: self._write(';') def _Bitand(self, t): """ Bit and operation. """ for i, node in enumerate(t.nodes): self._write("(") self._dispatch(node) self._write(")") if i != len(t.nodes)-1: self._write(" & ") def _Bitor(self, t): """ Bit or operation """ for i, node in enumerate(t.nodes): self._write("(") self._dispatch(node) self._write(")") if i != len(t.nodes)-1: self._write(" | ") def _CallFunc(self, t): """ Function call. """ self._dispatch(t.node) self._write("(") comma = False for e in t.args: if comma: self._write(", ") else: comma = True self._dispatch(e) if t.star_args: if comma: self._write(", ") else: comma = True self._write("*") self._dispatch(t.star_args) if t.dstar_args: if comma: self._write(", ") else: comma = True self._write("**") self._dispatch(t.dstar_args) self._write(")") def _Compare(self, t): self._dispatch(t.expr) for op, expr in t.ops: self._write(" " + op + " ") self._dispatch(expr) def _Const(self, t): """ A constant value such as an integer value, 3, or a string, "hello". """ self._dispatch(t.value) def _Decorators(self, t): """ Handle function decorators (eg. @has_units) """ for node in t.nodes: self._dispatch(node) def _Dict(self, t): self._write("{") for i, (k, v) in enumerate(t.items): self._dispatch(k) self._write(": ") self._dispatch(v) if i < len(t.items)-1: self._write(", ") self._write("}") def _Discard(self, t): """ Node for when return value is ignored such as in "foo(a)". """ self._fill() self._dispatch(t.expr) def _Div(self, t): self.__binary_op(t, '/') def _Ellipsis(self, t): self._write("...") def _From(self, t): """ Handle "from xyz import foo, bar as baz". """ # fixme: Are From and ImportFrom handled differently? self._fill("from ") self._write(t.modname) self._write(" import ") for i, (name,asname) in enumerate(t.names): if i != 0: self._write(", ") self._write(name) if asname is not None: self._write(" as "+asname) def _Function(self, t): """ Handle function definitions """ if t.decorators is not None: self._fill("@") self._dispatch(t.decorators) self._fill("def "+t.name + "(") defaults = [None] * (len(t.argnames) - len(t.defaults)) + list(t.defaults) for i, arg in enumerate(zip(t.argnames, defaults)): self._write(arg[0]) if arg[1] is not None: self._write('=') self._dispatch(arg[1]) if i < len(t.argnames)-1: self._write(', ') self._write(")") if self._single_func: self._do_indent = False self._enter() self._dispatch(t.code) self._leave() self._do_indent = True def _Getattr(self, t): """ Handle getting an attribute of an object """ if isinstance(t.expr, (Div, Mul, Sub, Add)): self._write('(') self._dispatch(t.expr) self._write(')') else: self._dispatch(t.expr) self._write('.'+t.attrname) def _If(self, t): self._fill() for i, (compare,code) in enumerate(t.tests): if i == 0: self._write("if ") else: self._write("elif ") self._dispatch(compare) self._enter() self._fill() self._dispatch(code) self._leave() self._write("\n") if t.else_ is not None: self._write("else") self._enter() self._fill() self._dispatch(t.else_) self._leave() self._write("\n") def _IfExp(self, t): self._dispatch(t.then) self._write(" if ") self._dispatch(t.test) if t.else_ is not None: self._write(" else (") self._dispatch(t.else_) self._write(")") def _Import(self, t): """ Handle "import xyz.foo". 
""" self._fill("import ") for i, (name,asname) in enumerate(t.names): if i != 0: self._write(", ") self._write(name) if asname is not None: self._write(" as "+asname) def _Keyword(self, t): """ Keyword value assignment within function calls and definitions. """ self._write(t.name) self._write("=") self._dispatch(t.expr) def _List(self, t): self._write("[") for i,node in enumerate(t.nodes): self._dispatch(node) if i < len(t.nodes)-1: self._write(", ") self._write("]") def _Module(self, t): if t.doc is not None: self._dispatch(t.doc) self._dispatch(t.node) def _Mul(self, t): self.__binary_op(t, '*') def _Name(self, t): self._write(t.name) def _NoneType(self, t): self._write("None") def _Not(self, t): self._write('not (') self._dispatch(t.expr) self._write(')') def _Or(self, t): self._write(" (") for i, node in enumerate(t.nodes): self._dispatch(node) if i != len(t.nodes)-1: self._write(") or (") self._write(")") def _Pass(self, t): self._write("pass\n") def _Printnl(self, t): self._fill("print ") if t.dest: self._write(">> ") self._dispatch(t.dest) self._write(", ") comma = False for node in t.nodes: if comma: self._write(', ') else: comma = True self._dispatch(node) def _Power(self, t): self.__binary_op(t, '**') def _Return(self, t): self._fill("return ") if t.value: if isinstance(t.value, Tuple): text = ', '.join([ name.name for name in t.value.asList() ]) self._write(text) else: self._dispatch(t.value) if not self._do_indent: self._write('; ') def _Slice(self, t): self._dispatch(t.expr) self._write("[") if t.lower: self._dispatch(t.lower) self._write(":") if t.upper: self._dispatch(t.upper) #if t.step: # self._write(":") # self._dispatch(t.step) self._write("]") def _Sliceobj(self, t): for i, node in enumerate(t.nodes): if i != 0: self._write(":") if not (isinstance(node, Const) and node.value is None): self._dispatch(node) def _Stmt(self, tree): for node in tree.nodes: self._dispatch(node) def _Sub(self, t): self.__binary_op(t, '-') def _Subscript(self, t): self._dispatch(t.expr) self._write("[") for i, value in enumerate(t.subs): if i != 0: self._write(",") self._dispatch(value) self._write("]") def _TryExcept(self, t): self._fill("try") self._enter() self._dispatch(t.body) self._leave() for handler in t.handlers: self._fill('except ') self._dispatch(handler[0]) if handler[1] is not None: self._write(', ') self._dispatch(handler[1]) self._enter() self._dispatch(handler[2]) self._leave() if t.else_: self._fill("else") self._enter() self._dispatch(t.else_) self._leave() def _Tuple(self, t): if not t.nodes: # Empty tuple. self._write("()") else: self._write("(") # _write each elements, separated by a comma. 
for element in t.nodes[:-1]: self._dispatch(element) self._write(", ") # Handle the last one without writing comma last_element = t.nodes[-1] self._dispatch(last_element) self._write(")") def _UnaryAdd(self, t): self._write("+") self._dispatch(t.expr) def _UnarySub(self, t): self._write("-") self._dispatch(t.expr) def _With(self, t): self._fill('with ') self._dispatch(t.expr) if t.vars: self._write(' as ') self._dispatch(t.vars.name) self._enter() self._dispatch(t.body) self._leave() self._write('\n') def _int(self, t): self._write(repr(t)) def __binary_op(self, t, symbol): # Check if parenthesis are needed on left side and then dispatch has_paren = False left_class = str(t.left.__class__) if (left_class in op_precedence.keys() and op_precedence[left_class] < op_precedence[str(t.__class__)]): has_paren = True if has_paren: self._write('(') self._dispatch(t.left) if has_paren: self._write(')') # Write the appropriate symbol for operator self._write(symbol) # Check if parenthesis are needed on the right side and then dispatch has_paren = False right_class = str(t.right.__class__) if (right_class in op_precedence.keys() and op_precedence[right_class] < op_precedence[str(t.__class__)]): has_paren = True if has_paren: self._write('(') self._dispatch(t.right) if has_paren: self._write(')') def _float(self, t): # if t is 0.1, str(t)->'0.1' while repr(t)->'0.1000000000001' # We prefer str here. self._write(str(t)) def _str(self, t): self._write(repr(t)) def _tuple(self, t): self._write(str(t)) ######################################################################### # These are the methods from the _ast modules unparse. # # As our needs to handle more advanced code increase, we may want to # modify some of the methods below so that they work for compiler.ast. ######################################################################### # # stmt # def _Expr(self, tree): # self._fill() # self._dispatch(tree.value) # # def _Import(self, t): # self._fill("import ") # first = True # for a in t.names: # if first: # first = False # else: # self._write(", ") # self._write(a.name) # if a.asname: # self._write(" as "+a.asname) # ## def _ImportFrom(self, t): ## self._fill("from ") ## self._write(t.module) ## self._write(" import ") ## for i, a in enumerate(t.names): ## if i == 0: ## self._write(", ") ## self._write(a.name) ## if a.asname: ## self._write(" as "+a.asname) ## # XXX(jpe) what is level for? 
## # # def _Break(self, t): # self._fill("break") # # def _Continue(self, t): # self._fill("continue") # # def _Delete(self, t): # self._fill("del ") # self._dispatch(t.targets) # # def _Assert(self, t): # self._fill("assert ") # self._dispatch(t.test) # if t.msg: # self._write(", ") # self._dispatch(t.msg) # # def _Exec(self, t): # self._fill("exec ") # self._dispatch(t.body) # if t.globals: # self._write(" in ") # self._dispatch(t.globals) # if t.locals: # self._write(", ") # self._dispatch(t.locals) # # def _Print(self, t): # self._fill("print ") # do_comma = False # if t.dest: # self._write(">>") # self._dispatch(t.dest) # do_comma = True # for e in t.values: # if do_comma:self._write(", ") # else:do_comma=True # self._dispatch(e) # if not t.nl: # self._write(",") # # def _Global(self, t): # self._fill("global") # for i, n in enumerate(t.names): # if i != 0: # self._write(",") # self._write(" " + n) # # def _Yield(self, t): # self._fill("yield") # if t.value: # self._write(" (") # self._dispatch(t.value) # self._write(")") # # def _Raise(self, t): # self._fill('raise ') # if t.type: # self._dispatch(t.type) # if t.inst: # self._write(", ") # self._dispatch(t.inst) # if t.tback: # self._write(", ") # self._dispatch(t.tback) # # # def _TryFinally(self, t): # self._fill("try") # self._enter() # self._dispatch(t.body) # self._leave() # # self._fill("finally") # self._enter() # self._dispatch(t.finalbody) # self._leave() # # def _excepthandler(self, t): # self._fill("except ") # if t.type: # self._dispatch(t.type) # if t.name: # self._write(", ") # self._dispatch(t.name) # self._enter() # self._dispatch(t.body) # self._leave() # # def _ClassDef(self, t): # self._write("\n") # self._fill("class "+t.name) # if t.bases: # self._write("(") # for a in t.bases: # self._dispatch(a) # self._write(", ") # self._write(")") # self._enter() # self._dispatch(t.body) # self._leave() # # def _FunctionDef(self, t): # self._write("\n") # for deco in t.decorators: # self._fill("@") # self._dispatch(deco) # self._fill("def "+t.name + "(") # self._dispatch(t.args) # self._write(")") # self._enter() # self._dispatch(t.body) # self._leave() # # def _For(self, t): # self._fill("for ") # self._dispatch(t.target) # self._write(" in ") # self._dispatch(t.iter) # self._enter() # self._dispatch(t.body) # self._leave() # if t.orelse: # self._fill("else") # self._enter() # self._dispatch(t.orelse) # self._leave # # def _While(self, t): # self._fill("while ") # self._dispatch(t.test) # self._enter() # self._dispatch(t.body) # self._leave() # if t.orelse: # self._fill("else") # self._enter() # self._dispatch(t.orelse) # self._leave # # # expr # def _Str(self, tree): # self._write(repr(tree.s)) ## # def _Repr(self, t): # self._write("`") # self._dispatch(t.value) # self._write("`") # # def _Num(self, t): # self._write(repr(t.n)) # # def _ListComp(self, t): # self._write("[") # self._dispatch(t.elt) # for gen in t.generators: # self._dispatch(gen) # self._write("]") # # def _GeneratorExp(self, t): # self._write("(") # self._dispatch(t.elt) # for gen in t.generators: # self._dispatch(gen) # self._write(")") # # def _comprehension(self, t): # self._write(" for ") # self._dispatch(t.target) # self._write(" in ") # self._dispatch(t.iter) # for if_clause in t.ifs: # self._write(" if ") # self._dispatch(if_clause) # # def _IfExp(self, t): # self._dispatch(t.body) # self._write(" if ") # self._dispatch(t.test) # if t.orelse: # self._write(" else ") # self._dispatch(t.orelse) # # unop = {"Invert":"~", "Not": "not", "UAdd":"+", 
"USub":"-"} # def _UnaryOp(self, t): # self._write(self.unop[t.op.__class__.__name__]) # self._write("(") # self._dispatch(t.operand) # self._write(")") # # binop = { "Add":"+", "Sub":"-", "Mult":"*", "Div":"/", "Mod":"%", # "LShift":">>", "RShift":"<<", "BitOr":"|", "BitXor":"^", "BitAnd":"&", # "FloorDiv":"//", "Pow": "**"} # def _BinOp(self, t): # self._write("(") # self._dispatch(t.left) # self._write(")" + self.binop[t.op.__class__.__name__] + "(") # self._dispatch(t.right) # self._write(")") # # boolops = {_ast.And: 'and', _ast.Or: 'or'} # def _BoolOp(self, t): # self._write("(") # self._dispatch(t.values[0]) # for v in t.values[1:]: # self._write(" %s " % self.boolops[t.op.__class__]) # self._dispatch(v) # self._write(")") # # def _Attribute(self,t): # self._dispatch(t.value) # self._write(".") # self._write(t.attr) # ## def _Call(self, t): ## self._dispatch(t.func) ## self._write("(") ## comma = False ## for e in t.args: ## if comma: self._write(", ") ## else: comma = True ## self._dispatch(e) ## for e in t.keywords: ## if comma: self._write(", ") ## else: comma = True ## self._dispatch(e) ## if t.starargs: ## if comma: self._write(", ") ## else: comma = True ## self._write("*") ## self._dispatch(t.starargs) ## if t.kwargs: ## if comma: self._write(", ") ## else: comma = True ## self._write("**") ## self._dispatch(t.kwargs) ## self._write(")") # # # slice # def _Index(self, t): # self._dispatch(t.value) # # def _ExtSlice(self, t): # for i, d in enumerate(t.dims): # if i != 0: # self._write(': ') # self._dispatch(d) # # # others # def _arguments(self, t): # first = True # nonDef = len(t.args)-len(t.defaults) # for a in t.args[0:nonDef]: # if first:first = False # else: self._write(", ") # self._dispatch(a) # for a,d in zip(t.args[nonDef:], t.defaults): # if first:first = False # else: self._write(", ") # self._dispatch(a), # self._write("=") # self._dispatch(d) # if t.vararg: # if first:first = False # else: self._write(", ") # self._write("*"+t.vararg) # if t.kwarg: # if first:first = False # else: self._write(", ") # self._write("**"+t.kwarg) # ## def _keyword(self, t): ## self._write(t.arg) ## self._write("=") ## self._dispatch(t.value) # # def _Lambda(self, t): # self._write("lambda ") # self._dispatch(t.args) # self._write(": ") # self._dispatch(t.body) scikit-cuda-0.5.1/docs/sphinxext/docscrape.py000066400000000000000000000407151261465507300212340ustar00rootroot00000000000000"""Extract reference documentation from the NumPy source tree. """ from __future__ import division, absolute_import, print_function import inspect import textwrap import re import pydoc from warnings import warn import collections import sys class Reader(object): """A line-based string reader. """ def __init__(self, data): """ Parameters ---------- data : str String with lines separated by '\n'. 
""" if isinstance(data,list): self._str = data else: self._str = data.split('\n') # store string as list of lines self.reset() def __getitem__(self, n): return self._str[n] def reset(self): self._l = 0 # current line nr def read(self): if not self.eof(): out = self[self._l] self._l += 1 return out else: return '' def seek_next_non_empty_line(self): for l in self[self._l:]: if l.strip(): break else: self._l += 1 def eof(self): return self._l >= len(self._str) def read_to_condition(self, condition_func): start = self._l for line in self[start:]: if condition_func(line): return self[start:self._l] self._l += 1 if self.eof(): return self[start:self._l+1] return [] def read_to_next_empty_line(self): self.seek_next_non_empty_line() def is_empty(line): return not line.strip() return self.read_to_condition(is_empty) def read_to_next_unindented_line(self): def is_unindented(line): return (line.strip() and (len(line.lstrip()) == len(line))) return self.read_to_condition(is_unindented) def peek(self,n=0): if self._l + n < len(self._str): return self[self._l + n] else: return '' def is_empty(self): return not ''.join(self._str).strip() class NumpyDocString(object): def __init__(self, docstring, config={}): docstring = textwrap.dedent(docstring).split('\n') self._doc = Reader(docstring) self._parsed_data = { 'Signature': '', 'Summary': [''], 'Extended Summary': [], 'Parameters': [], 'Returns': [], 'Raises': [], 'Warns': [], 'Other Parameters': [], 'Attributes': [], 'Methods': [], 'See Also': [], 'Notes': [], 'Warnings': [], 'References': '', 'Examples': '', 'index': {} } self._parse() def __getitem__(self,key): return self._parsed_data[key] def __setitem__(self,key,val): if key not in self._parsed_data: warn("Unknown section %s" % key) else: self._parsed_data[key] = val def _is_at_section(self): self._doc.seek_next_non_empty_line() if self._doc.eof(): return False l1 = self._doc.peek().strip() # e.g. Parameters if l1.startswith('.. 
index::'): return True l2 = self._doc.peek(1).strip() # ---------- or ========== return l2.startswith('-'*len(l1)) or l2.startswith('='*len(l1)) def _strip(self,doc): i = 0 j = 0 for i,line in enumerate(doc): if line.strip(): break for j,line in enumerate(doc[::-1]): if line.strip(): break return doc[i:len(doc)-j] def _read_to_next_section(self): section = self._doc.read_to_next_empty_line() while not self._is_at_section() and not self._doc.eof(): if not self._doc.peek(-1).strip(): # previous line was empty section += [''] section += self._doc.read_to_next_empty_line() return section def _read_sections(self): while not self._doc.eof(): data = self._read_to_next_section() name = data[0].strip() if name.startswith('..'): # index section yield name, data[1:] elif len(data) < 2: yield StopIteration else: yield name, self._strip(data[2:]) def _parse_param_list(self,content): r = Reader(content) params = [] while not r.eof(): header = r.read().strip() if ' : ' in header: arg_name, arg_type = header.split(' : ')[:2] else: arg_name, arg_type = header, '' desc = r.read_to_next_unindented_line() desc = dedent_lines(desc) params.append((arg_name,arg_type,desc)) return params _name_rgx = re.compile(r"^\s*(:(?P\w+):`(?P[a-zA-Z0-9_.-]+)`|" r" (?P[a-zA-Z0-9_.-]+))\s*", re.X) def _parse_see_also(self, content): """ func_name : Descriptive text continued text another_func_name : Descriptive text func_name1, func_name2, :meth:`func_name`, func_name3 """ items = [] def parse_item_name(text): """Match ':role:`name`' or 'name'""" m = self._name_rgx.match(text) if m: g = m.groups() if g[1] is None: return g[3], None else: return g[2], g[1] raise ValueError("%s is not a item name" % text) def push_item(name, rest): if not name: return name, role = parse_item_name(name) items.append((name, list(rest), role)) del rest[:] current_func = None rest = [] for line in content: if not line.strip(): continue m = self._name_rgx.match(line) if m and line[m.end():].strip().startswith(':'): push_item(current_func, rest) current_func, line = line[:m.end()], line[m.end():] rest = [line.split(':', 1)[1].strip()] if not rest[0]: rest = [] elif not line.startswith(' '): push_item(current_func, rest) current_func = None if ',' in line: for func in line.split(','): if func.strip(): push_item(func, []) elif line.strip(): current_func = line elif current_func is not None: rest.append(line.strip()) push_item(current_func, rest) return items def _parse_index(self, section, content): """ .. 
index: default :refguide: something, else, and more """ def strip_each_in(lst): return [s.strip() for s in lst] out = {} section = section.split('::') if len(section) > 1: out['default'] = strip_each_in(section[1].split(','))[0] for line in content: line = line.split(':') if len(line) > 2: out[line[1]] = strip_each_in(line[2].split(',')) return out def _parse_summary(self): """Grab signature (if given) and summary""" if self._is_at_section(): return # If several signatures present, take the last one while True: summary = self._doc.read_to_next_empty_line() summary_str = " ".join([s.strip() for s in summary]).strip() if re.compile('^([\w., ]+=)?\s*[\w\.]+\(.*\)$').match(summary_str): self['Signature'] = summary_str if not self._is_at_section(): continue break if summary is not None: self['Summary'] = summary if not self._is_at_section(): self['Extended Summary'] = self._read_to_next_section() def _parse(self): self._doc.reset() self._parse_summary() for (section,content) in self._read_sections(): if not section.startswith('..'): section = ' '.join([s.capitalize() for s in section.split(' ')]) if section in ('Parameters', 'Returns', 'Raises', 'Warns', 'Other Parameters', 'Attributes', 'Methods'): self[section] = self._parse_param_list(content) elif section.startswith('.. index::'): self['index'] = self._parse_index(section, content) elif section == 'See Also': self['See Also'] = self._parse_see_also(content) else: self[section] = content # string conversion routines def _str_header(self, name, symbol='-'): return [name, len(name)*symbol] def _str_indent(self, doc, indent=4): out = [] for line in doc: out += [' '*indent + line] return out def _str_signature(self): if self['Signature']: return [self['Signature'].replace('*','\*')] + [''] else: return [''] def _str_summary(self): if self['Summary']: return self['Summary'] + [''] else: return [] def _str_extended_summary(self): if self['Extended Summary']: return self['Extended Summary'] + [''] else: return [] def _str_param_list(self, name): out = [] if self[name]: out += self._str_header(name) for param,param_type,desc in self[name]: if param_type: out += ['%s : %s' % (param, param_type)] else: out += [param] out += self._str_indent(desc) out += [''] return out def _str_section(self, name): out = [] if self[name]: out += self._str_header(name) out += self[name] out += [''] return out def _str_see_also(self, func_role): if not self['See Also']: return [] out = [] out += self._str_header("See Also") last_had_desc = True for func, desc, role in self['See Also']: if role: link = ':%s:`%s`' % (role, func) elif func_role: link = ':%s:`%s`' % (func_role, func) else: link = "`%s`_" % func if desc or last_had_desc: out += [''] out += [link] else: out[-1] += ", %s" % link if desc: out += self._str_indent([' '.join(desc)]) last_had_desc = True else: last_had_desc = False out += [''] return out def _str_index(self): idx = self['index'] out = [] out += ['.. 
index:: %s' % idx.get('default','')] for section, references in idx.items(): if section == 'default': continue out += [' :%s: %s' % (section, ', '.join(references))] return out def __str__(self, func_role=''): out = [] out += self._str_signature() out += self._str_summary() out += self._str_extended_summary() for param_list in ('Parameters', 'Returns', 'Other Parameters', 'Raises', 'Warns'): out += self._str_param_list(param_list) out += self._str_section('Warnings') out += self._str_see_also(func_role) for s in ('Notes','References','Examples'): out += self._str_section(s) for param_list in ('Attributes', 'Methods'): out += self._str_param_list(param_list) out += self._str_index() return '\n'.join(out) def indent(str,indent=4): indent_str = ' '*indent if str is None: return indent_str lines = str.split('\n') return '\n'.join(indent_str + l for l in lines) def dedent_lines(lines): """Deindent a list of lines maximally""" return textwrap.dedent("\n".join(lines)).split("\n") def header(text, style='-'): return text + '\n' + style*len(text) + '\n' class FunctionDoc(NumpyDocString): def __init__(self, func, role='func', doc=None, config={}): self._f = func self._role = role # e.g. "func" or "meth" if doc is None: if func is None: raise ValueError("No function or docstring given") doc = inspect.getdoc(func) or '' NumpyDocString.__init__(self, doc) if not self['Signature'] and func is not None: func, func_name = self.get_func() try: # try to read signature if sys.version_info[0] >= 3: argspec = inspect.getfullargspec(func) else: argspec = inspect.getargspec(func) argspec = inspect.formatargspec(*argspec) argspec = argspec.replace('*','\*') signature = '%s%s' % (func_name, argspec) except TypeError as e: signature = '%s()' % func_name self['Signature'] = signature def get_func(self): func_name = getattr(self._f, '__name__', self.__class__.__name__) if inspect.isclass(self._f): func = getattr(self._f, '__call__', self._f.__init__) else: func = self._f return func, func_name def __str__(self): out = '' func, func_name = self.get_func() signature = self['Signature'].replace('*', '\*') roles = {'func': 'function', 'meth': 'method'} if self._role: if self._role not in roles: print("Warning: invalid role %s" % self._role) out += '.. %s:: %s\n \n\n' % (roles.get(self._role,''), func_name) out += super(FunctionDoc, self).__str__(func_role=self._role) return out class ClassDoc(NumpyDocString): extra_public_methods = ['__call__'] def __init__(self, cls, doc=None, modulename='', func_doc=FunctionDoc, config={}): if not inspect.isclass(cls) and cls is not None: raise ValueError("Expected a class or None, but got %r" % cls) self._cls = cls self.show_inherited_members = config.get('show_inherited_class_members', True) if modulename and not modulename.endswith('.'): modulename += '.' 
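        # keep the normalized module prefix (the branch above ensures a trailing '.') on the instance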
self._mod = modulename if doc is None: if cls is None: raise ValueError("No class or documentation string given") doc = pydoc.getdoc(cls) NumpyDocString.__init__(self, doc) if config.get('show_class_members', True): def splitlines_x(s): if not s: return [] else: return s.splitlines() for field, items in [('Methods', self.methods), ('Attributes', self.properties)]: if not self[field]: doc_list = [] for name in sorted(items): try: doc_item = pydoc.getdoc(getattr(self._cls, name)) doc_list.append((name, '', splitlines_x(doc_item))) except AttributeError: pass # method doesn't exist self[field] = doc_list @property def methods(self): if self._cls is None: return [] return [name for name, func in inspect.getmembers(self._cls) if ((not name.startswith('_') or name in self.extra_public_methods) and isinstance(func, collections.Callable) and self._is_show_member(name))] @property def properties(self): if self._cls is None: return [] return [name for name, func in inspect.getmembers(self._cls) if (not name.startswith('_') and (func is None or isinstance(func, property) or inspect.isgetsetdescriptor(func)) and self._is_show_member(name))] def _is_show_member(self, name): if self.show_inherited_members: return True # show all class members if name not in self._cls.__dict__: return False # class member is inherited, we do not show it return True scikit-cuda-0.5.1/docs/sphinxext/docscrape_sphinx.py000066400000000000000000000223341261465507300226220ustar00rootroot00000000000000from __future__ import division, absolute_import, print_function import sys, re, inspect, textwrap, pydoc import sphinx import collections from docscrape import NumpyDocString, FunctionDoc, ClassDoc if sys.version_info[0] >= 3: sixu = lambda s: s else: sixu = lambda s: unicode(s, 'unicode_escape') class SphinxDocString(NumpyDocString): def __init__(self, docstring, config={}): NumpyDocString.__init__(self, docstring, config=config) self.load_config(config) def load_config(self, config): self.use_plots = config.get('use_plots', False) self.class_members_toctree = config.get('class_members_toctree', True) # string conversion routines def _str_header(self, name, symbol='`'): return ['.. 
rubric:: ' + name, ''] def _str_field_list(self, name): return [':' + name + ':'] def _str_indent(self, doc, indent=4): out = [] for line in doc: out += [' '*indent + line] return out def _str_signature(self): return [''] if self['Signature']: return ['``%s``' % self['Signature']] + [''] else: return [''] def _str_summary(self): return self['Summary'] + [''] def _str_extended_summary(self): return self['Extended Summary'] + [''] def _str_returns(self): out = [] if self['Returns']: out += self._str_field_list('Returns') out += [''] for param, param_type, desc in self['Returns']: if param_type: out += self._str_indent(['**%s** : %s' % (param.strip(), param_type)]) else: out += self._str_indent([param.strip()]) if desc: out += [''] out += self._str_indent(desc, 8) out += [''] return out def _str_param_list(self, name): out = [] if self[name]: out += self._str_field_list(name) out += [''] for param, param_type, desc in self[name]: if param_type: out += self._str_indent(['**%s** : %s' % (param.strip(), param_type)]) else: out += self._str_indent(['**%s**' % param.strip()]) if desc: out += [''] out += self._str_indent(desc, 8) out += [''] return out @property def _obj(self): if hasattr(self, '_cls'): return self._cls elif hasattr(self, '_f'): return self._f return None def _str_member_list(self, name): """ Generate a member listing, autosummary:: table where possible, and a table where not. """ out = [] if self[name]: out += ['.. rubric:: %s' % name, ''] prefix = getattr(self, '_name', '') if prefix: prefix = '~%s.' % prefix autosum = [] others = [] for param, param_type, desc in self[name]: param = param.strip() # Check if the referenced member can have a docstring or not param_obj = getattr(self._obj, param, None) if not (callable(param_obj) or isinstance(param_obj, property) or inspect.isgetsetdescriptor(param_obj)): param_obj = None if param_obj and (pydoc.getdoc(param_obj) or not desc): # Referenced object has a docstring autosum += [" %s%s" % (prefix, param)] else: others.append((param, param_type, desc)) if autosum: out += ['.. autosummary::'] if self.class_members_toctree: out += [' :toctree:'] out += [''] + autosum if others: maxlen_0 = max(3, max([len(x[0]) for x in others])) hdr = sixu("=")*maxlen_0 + sixu(" ") + sixu("=")*10 fmt = sixu('%%%ds %%s ') % (maxlen_0,) out += ['', hdr] for param, param_type, desc in others: desc = sixu(" ").join(x.strip() for x in desc).strip() if param_type: desc = "(%s) %s" % (param_type, desc) out += [fmt % (param.strip(), desc)] out += [hdr] out += [''] return out def _str_section(self, name): out = [] if self[name]: out += self._str_header(name) out += [''] content = textwrap.dedent("\n".join(self[name])).split("\n") out += content out += [''] return out def _str_see_also(self, func_role): out = [] if self['See Also']: see_also = super(SphinxDocString, self)._str_see_also(func_role) out = ['.. seealso::', ''] out += self._str_indent(see_also[2:]) return out def _str_warnings(self): out = [] if self['Warnings']: out = ['.. warning::', ''] out += self._str_indent(self['Warnings']) return out def _str_index(self): idx = self['index'] out = [] if len(idx) == 0: return out out += ['.. 
index:: %s' % idx.get('default','')] for section, references in idx.items(): if section == 'default': continue elif section == 'refguide': out += [' single: %s' % (', '.join(references))] else: out += [' %s: %s' % (section, ','.join(references))] return out def _str_references(self): out = [] if self['References']: out += self._str_header('References') if isinstance(self['References'], str): self['References'] = [self['References']] out.extend(self['References']) out += [''] # Latex collects all references to a separate bibliography, # so we need to insert links to it if sphinx.__version__ >= "0.6": out += ['.. only:: latex',''] else: out += ['.. latexonly::',''] items = [] for line in self['References']: m = re.match(r'.. \[([a-z0-9._-]+)\]', line, re.I) if m: items.append(m.group(1)) out += [' ' + ", ".join(["[%s]_" % item for item in items]), ''] return out def _str_examples(self): examples_str = "\n".join(self['Examples']) if (self.use_plots and 'import matplotlib' in examples_str and 'plot::' not in examples_str): out = [] out += self._str_header('Examples') out += ['.. plot::', ''] out += self._str_indent(self['Examples']) out += [''] return out else: return self._str_section('Examples') def __str__(self, indent=0, func_role="obj"): out = [] out += self._str_signature() out += self._str_index() + [''] out += self._str_summary() out += self._str_extended_summary() out += self._str_param_list('Parameters') out += self._str_returns() for param_list in ('Other Parameters', 'Raises', 'Warns'): out += self._str_param_list(param_list) out += self._str_warnings() out += self._str_see_also(func_role) out += self._str_section('Notes') out += self._str_references() out += self._str_examples() for param_list in ('Attributes', 'Methods'): out += self._str_member_list(param_list) out = self._str_indent(out,indent) return '\n'.join(out) class SphinxFunctionDoc(SphinxDocString, FunctionDoc): def __init__(self, obj, doc=None, config={}): self.load_config(config) FunctionDoc.__init__(self, obj, doc=doc, config=config) class SphinxClassDoc(SphinxDocString, ClassDoc): def __init__(self, obj, doc=None, func_doc=None, config={}): self.load_config(config) ClassDoc.__init__(self, obj, doc=doc, func_doc=None, config=config) class SphinxObjDoc(SphinxDocString): def __init__(self, obj, doc=None, config={}): self._f = obj self.load_config(config) SphinxDocString.__init__(self, doc, config=config) def get_doc_object(obj, what=None, doc=None, config={}): if what is None: if inspect.isclass(obj): what = 'class' elif inspect.ismodule(obj): what = 'module' elif isinstance(obj, collections.Callable): what = 'function' else: what = 'object' if what == 'class': return SphinxClassDoc(obj, func_doc=SphinxFunctionDoc, doc=doc, config=config) elif what in ('function', 'method'): return SphinxFunctionDoc(obj, doc=doc, config=config) else: if doc is None: doc = pydoc.getdoc(obj) return SphinxObjDoc(obj, doc, config=config) scikit-cuda-0.5.1/docs/sphinxext/linkcode.py000066400000000000000000000047131261465507300210570ustar00rootroot00000000000000# -*- coding: utf-8 -*- """ linkcode ~~~~~~~~ Add external links to module code in Python object descriptions. :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. :license: BSD, see LICENSE for details. """ from __future__ import division, absolute_import, print_function import warnings import collections warnings.warn("This extension has been accepted to Sphinx upstream. 
" "Use the version from there (Sphinx >= 1.2) " "https://bitbucket.org/birkenfeld/sphinx/pull-request/47/sphinxextlinkcode", FutureWarning, stacklevel=1) from docutils import nodes from sphinx import addnodes from sphinx.locale import _ from sphinx.errors import SphinxError class LinkcodeError(SphinxError): category = "linkcode error" def doctree_read(app, doctree): env = app.builder.env resolve_target = getattr(env.config, 'linkcode_resolve', None) if not isinstance(env.config.linkcode_resolve, collections.Callable): raise LinkcodeError( "Function `linkcode_resolve` is not given in conf.py") domain_keys = dict( py=['module', 'fullname'], c=['names'], cpp=['names'], js=['object', 'fullname'], ) for objnode in doctree.traverse(addnodes.desc): domain = objnode.get('domain') uris = set() for signode in objnode: if not isinstance(signode, addnodes.desc_signature): continue # Convert signode to a specified format info = {} for key in domain_keys.get(domain, []): value = signode.get(key) if not value: value = '' info[key] = value if not info: continue # Call user code to resolve the link uri = resolve_target(domain, info) if not uri: # no source continue if uri in uris or not uri: # only one link per name, please continue uris.add(uri) onlynode = addnodes.only(expr='html') onlynode += nodes.reference('', '', internal=False, refuri=uri) onlynode[0] += nodes.inline('', _('[source]'), classes=['viewcode-link']) signode += onlynode def setup(app): app.connect('doctree-read', doctree_read) app.add_config_value('linkcode_resolve', None, '') scikit-cuda-0.5.1/docs/sphinxext/numpydoc.py000066400000000000000000000146511261465507300211270ustar00rootroot00000000000000""" ======== numpydoc ======== Sphinx extension that handles docstrings in the Numpy standard format. [1] It will: - Convert Parameters etc. sections to field lists. - Convert See Also section to a See also entry. - Renumber references. - Extract the signature from the docstring, if it can't be determined otherwise. .. [1] https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt """ from __future__ import division, absolute_import, print_function import sys import re import pydoc import sphinx import inspect import collections if sphinx.__version__ < '1.0.1': raise RuntimeError("Sphinx 1.0.1 or newer is required") from docscrape_sphinx import get_doc_object, SphinxDocString from sphinx.util.compat import Directive if sys.version_info[0] >= 3: sixu = lambda s: s else: sixu = lambda s: unicode(s, 'unicode_escape') def mangle_docstrings(app, what, name, obj, options, lines, reference_offset=[0]): cfg = dict( use_plots=app.config.numpydoc_use_plots, show_class_members=app.config.numpydoc_show_class_members, show_inherited_class_members=app.config.numpydoc_show_inherited_class_members, class_members_toctree=app.config.numpydoc_class_members_toctree, ) if what == 'module': # Strip top title title_re = re.compile(sixu('^\\s*[#*=]{4,}\\n[a-z0-9 -]+\\n[#*=]{4,}\\s*'), re.I|re.S) lines[:] = title_re.sub(sixu(''), sixu("\n").join(lines)).split(sixu("\n")) else: doc = get_doc_object(obj, what, sixu("\n").join(lines), config=cfg) if sys.version_info[0] >= 3: doc = str(doc) else: doc = unicode(doc) lines[:] = doc.split(sixu("\n")) if app.config.numpydoc_edit_link and hasattr(obj, '__name__') and \ obj.__name__: if hasattr(obj, '__module__'): v = dict(full_name=sixu("%s.%s") % (obj.__module__, obj.__name__)) else: v = dict(full_name=obj.__name__) lines += [sixu(''), sixu('.. 
htmlonly::'), sixu('')] lines += [sixu(' %s') % x for x in (app.config.numpydoc_edit_link % v).split("\n")] # replace reference numbers so that there are no duplicates references = [] for line in lines: line = line.strip() m = re.match(sixu('^.. \\[([a-z0-9_.-])\\]'), line, re.I) if m: references.append(m.group(1)) # start renaming from the longest string, to avoid overwriting parts references.sort(key=lambda x: -len(x)) if references: for i, line in enumerate(lines): for r in references: if re.match(sixu('^\\d+$'), r): new_r = sixu("R%d") % (reference_offset[0] + int(r)) else: new_r = sixu("%s%d") % (r, reference_offset[0]) lines[i] = lines[i].replace(sixu('[%s]_') % r, sixu('[%s]_') % new_r) lines[i] = lines[i].replace(sixu('.. [%s]') % r, sixu('.. [%s]') % new_r) reference_offset[0] += len(references) def mangle_signature(app, what, name, obj, options, sig, retann): # Do not try to inspect classes that don't define `__init__` if (inspect.isclass(obj) and (not hasattr(obj, '__init__') or 'initializes x; see ' in pydoc.getdoc(obj.__init__))): return '', '' if not (isinstance(obj, collections.Callable) or hasattr(obj, '__argspec_is_invalid_')): return if not hasattr(obj, '__doc__'): return doc = SphinxDocString(pydoc.getdoc(obj)) if doc['Signature']: sig = re.sub(sixu("^[^(]*"), sixu(""), doc['Signature']) return sig, sixu('') def setup(app, get_doc_object_=get_doc_object): if not hasattr(app, 'add_config_value'): return # probably called by nose, better bail out global get_doc_object get_doc_object = get_doc_object_ app.connect('autodoc-process-docstring', mangle_docstrings) app.connect('autodoc-process-signature', mangle_signature) app.add_config_value('numpydoc_edit_link', None, False) app.add_config_value('numpydoc_use_plots', None, False) app.add_config_value('numpydoc_show_class_members', True, True) app.add_config_value('numpydoc_show_inherited_class_members', True, True) app.add_config_value('numpydoc_class_members_toctree', True, True) # Extra mangling domains app.add_domain(NumpyPythonDomain) app.add_domain(NumpyCDomain) #------------------------------------------------------------------------------ # Docstring-mangling domains #------------------------------------------------------------------------------ from docutils.statemachine import ViewList from sphinx.domains.c import CDomain from sphinx.domains.python import PythonDomain class ManglingDomainBase(object): directive_mangling_map = {} def __init__(self, *a, **kw): super(ManglingDomainBase, self).__init__(*a, **kw) self.wrap_mangling_directives() def wrap_mangling_directives(self): for name, objtype in list(self.directive_mangling_map.items()): self.directives[name] = wrap_mangling_directive( self.directives[name], objtype) class NumpyPythonDomain(ManglingDomainBase, PythonDomain): name = 'np' directive_mangling_map = { 'function': 'function', 'class': 'class', 'exception': 'class', 'method': 'function', 'classmethod': 'function', 'staticmethod': 'function', 'attribute': 'attribute', } indices = [] class NumpyCDomain(ManglingDomainBase, CDomain): name = 'np-c' directive_mangling_map = { 'function': 'function', 'member': 'attribute', 'macro': 'function', 'type': 'class', 'var': 'object', } def wrap_mangling_directive(base_directive, objtype): class directive(base_directive): def run(self): env = self.state.document.settings.env name = None if self.arguments: m = re.match(r'^(.*\s+)?(.*?)(\(.*)?', self.arguments[0]) name = m.group(2).strip() if not name: name = self.arguments[0] lines = list(self.content) 
mangle_docstrings(env.app, objtype, name, None, None, lines) self.content = ViewList(lines, self.content.parent) return base_directive.run(self) return directive scikit-cuda-0.5.1/docs/sphinxext/phantom_import.py000066400000000000000000000133361261465507300223300ustar00rootroot00000000000000""" ============== phantom_import ============== Sphinx extension to make directives from ``sphinx.ext.autodoc`` and similar extensions to use docstrings loaded from an XML file. This extension loads an XML file in the Pydocweb format [1] and creates a dummy module that contains the specified docstrings. This can be used to get the current docstrings from a Pydocweb instance without needing to rebuild the documented module. .. [1] http://code.google.com/p/pydocweb """ from __future__ import division, absolute_import, print_function import imp, sys, compiler, types, os, inspect, re def setup(app): app.connect('builder-inited', initialize) app.add_config_value('phantom_import_file', None, True) def initialize(app): fn = app.config.phantom_import_file if (fn and os.path.isfile(fn)): print("[numpydoc] Phantom importing modules from", fn, "...") import_phantom_module(fn) #------------------------------------------------------------------------------ # Creating 'phantom' modules from an XML description #------------------------------------------------------------------------------ def import_phantom_module(xml_file): """ Insert a fake Python module to sys.modules, based on a XML file. The XML file is expected to conform to Pydocweb DTD. The fake module will contain dummy objects, which guarantee the following: - Docstrings are correct. - Class inheritance relationships are correct (if present in XML). - Function argspec is *NOT* correct (even if present in XML). Instead, the function signature is prepended to the function docstring. - Class attributes are *NOT* correct; instead, they are dummy objects. 
Parameters ---------- xml_file : str Name of an XML file to read """ import lxml.etree as etree object_cache = {} tree = etree.parse(xml_file) root = tree.getroot() # Sort items so that # - Base classes come before classes inherited from them # - Modules come before their contents all_nodes = dict([(n.attrib['id'], n) for n in root]) def _get_bases(node, recurse=False): bases = [x.attrib['ref'] for x in node.findall('base')] if recurse: j = 0 while True: try: b = bases[j] except IndexError: break if b in all_nodes: bases.extend(_get_bases(all_nodes[b])) j += 1 return bases type_index = ['module', 'class', 'callable', 'object'] def base_cmp(a, b): x = cmp(type_index.index(a.tag), type_index.index(b.tag)) if x != 0: return x if a.tag == 'class' and b.tag == 'class': a_bases = _get_bases(a, recurse=True) b_bases = _get_bases(b, recurse=True) x = cmp(len(a_bases), len(b_bases)) if x != 0: return x if a.attrib['id'] in b_bases: return -1 if b.attrib['id'] in a_bases: return 1 return cmp(a.attrib['id'].count('.'), b.attrib['id'].count('.')) nodes = root.getchildren() nodes.sort(base_cmp) # Create phantom items for node in nodes: name = node.attrib['id'] doc = (node.text or '').decode('string-escape') + "\n" if doc == "\n": doc = "" # create parent, if missing parent = name while True: parent = '.'.join(parent.split('.')[:-1]) if not parent: break if parent in object_cache: break obj = imp.new_module(parent) object_cache[parent] = obj sys.modules[parent] = obj # create object if node.tag == 'module': obj = imp.new_module(name) obj.__doc__ = doc sys.modules[name] = obj elif node.tag == 'class': bases = [object_cache[b] for b in _get_bases(node) if b in object_cache] bases.append(object) init = lambda self: None init.__doc__ = doc obj = type(name, tuple(bases), {'__doc__': doc, '__init__': init}) obj.__name__ = name.split('.')[-1] elif node.tag == 'callable': funcname = node.attrib['id'].split('.')[-1] argspec = node.attrib.get('argspec') if argspec: argspec = re.sub('^[^(]*', '', argspec) doc = "%s%s\n\n%s" % (funcname, argspec, doc) obj = lambda: 0 obj.__argspec_is_invalid_ = True if sys.version_info[0] >= 3: obj.__name__ = funcname else: obj.func_name = funcname obj.__name__ = name obj.__doc__ = doc if inspect.isclass(object_cache[parent]): obj.__objclass__ = object_cache[parent] else: class Dummy(object): pass obj = Dummy() obj.__name__ = name obj.__doc__ = doc if inspect.isclass(object_cache[parent]): obj.__get__ = lambda: None object_cache[name] = obj if parent: if inspect.ismodule(object_cache[parent]): obj.__module__ = parent setattr(object_cache[parent], name.split('.')[-1], obj) # Populate items for node in root: obj = object_cache.get(node.attrib['id']) if obj is None: continue for ref in node.findall('ref'): if node.tag == 'class': if ref.attrib['ref'].startswith(node.attrib['id'] + '.'): setattr(obj, ref.attrib['name'], object_cache.get(ref.attrib['ref'])) else: setattr(obj, ref.attrib['name'], object_cache.get(ref.attrib['ref'])) scikit-cuda-0.5.1/docs/sphinxext/plot_directive.py000066400000000000000000000500621261465507300223010ustar00rootroot00000000000000""" A special directive for generating a matplotlib plot. .. warning:: This is a hacked version of plot_directive.py from Matplotlib. It's very much subject to change! Usage ----- Can be used like this:: .. plot:: examples/example.py .. plot:: import matplotlib.pyplot as plt plt.plot([1,2,3], [4,5,6]) .. 
plot:: A plotting example: >>> import matplotlib.pyplot as plt >>> plt.plot([1,2,3], [4,5,6]) The content is interpreted as doctest formatted if it has a line starting with ``>>>``. The ``plot`` directive supports the options format : {'python', 'doctest'} Specify the format of the input include-source : bool Whether to display the source code. Default can be changed in conf.py and the ``image`` directive options ``alt``, ``height``, ``width``, ``scale``, ``align``, ``class``. Configuration options --------------------- The plot directive has the following configuration options: plot_include_source Default value for the include-source option plot_pre_code Code that should be executed before each plot. plot_basedir Base directory, to which plot:: file names are relative to. (If None or empty, file names are relative to the directoly where the file containing the directive is.) plot_formats File formats to generate. List of tuples or strings:: [(suffix, dpi), suffix, ...] that determine the file format and the DPI. For entries whose DPI was omitted, sensible defaults are chosen. plot_html_show_formats Whether to show links to the files in HTML. TODO ---- * Refactor Latex output; now it's plain images, but it would be nice to make them appear side-by-side, or in floats. """ from __future__ import division, absolute_import, print_function import sys, os, glob, shutil, imp, warnings, re, textwrap, traceback import sphinx if sys.version_info[0] >= 3: from io import StringIO else: from io import StringIO import warnings warnings.warn("A plot_directive module is also available under " "matplotlib.sphinxext; expect this numpydoc.plot_directive " "module to be deprecated after relevant features have been " "integrated there.", FutureWarning, stacklevel=2) #------------------------------------------------------------------------------ # Registration hook #------------------------------------------------------------------------------ def setup(app): setup.app = app setup.config = app.config setup.confdir = app.confdir app.add_config_value('plot_pre_code', '', True) app.add_config_value('plot_include_source', False, True) app.add_config_value('plot_formats', ['png', 'hires.png', 'pdf'], True) app.add_config_value('plot_basedir', None, True) app.add_config_value('plot_html_show_formats', True, True) app.add_directive('plot', plot_directive, True, (0, 1, False), **plot_directive_options) #------------------------------------------------------------------------------ # plot:: directive #------------------------------------------------------------------------------ from docutils.parsers.rst import directives from docutils import nodes def plot_directive(name, arguments, options, content, lineno, content_offset, block_text, state, state_machine): return run(arguments, content, options, state_machine, state, lineno) plot_directive.__doc__ = __doc__ def _option_boolean(arg): if not arg or not arg.strip(): # no argument given, assume used as a flag return True elif arg.strip().lower() in ('no', '0', 'false'): return False elif arg.strip().lower() in ('yes', '1', 'true'): return True else: raise ValueError('"%s" unknown boolean' % arg) def _option_format(arg): return directives.choice(arg, ('python', 'lisp')) def _option_align(arg): return directives.choice(arg, ("top", "middle", "bottom", "left", "center", "right")) plot_directive_options = {'alt': directives.unchanged, 'height': directives.length_or_unitless, 'width': directives.length_or_percentage_or_unitless, 'scale': directives.nonnegative_int, 'align': 
_option_align, 'class': directives.class_option, 'include-source': _option_boolean, 'format': _option_format, } #------------------------------------------------------------------------------ # Generating output #------------------------------------------------------------------------------ from docutils import nodes, utils try: # Sphinx depends on either Jinja or Jinja2 import jinja2 def format_template(template, **kw): return jinja2.Template(template).render(**kw) except ImportError: import jinja def format_template(template, **kw): return jinja.from_string(template, **kw) TEMPLATE = """ {{ source_code }} {{ only_html }} {% if source_link or (html_show_formats and not multi_image) %} ( {%- if source_link -%} `Source code <{{ source_link }}>`__ {%- endif -%} {%- if html_show_formats and not multi_image -%} {%- for img in images -%} {%- for fmt in img.formats -%} {%- if source_link or not loop.first -%}, {% endif -%} `{{ fmt }} <{{ dest_dir }}/{{ img.basename }}.{{ fmt }}>`__ {%- endfor -%} {%- endfor -%} {%- endif -%} ) {% endif %} {% for img in images %} .. figure:: {{ build_dir }}/{{ img.basename }}.png {%- for option in options %} {{ option }} {% endfor %} {% if html_show_formats and multi_image -%} ( {%- for fmt in img.formats -%} {%- if not loop.first -%}, {% endif -%} `{{ fmt }} <{{ dest_dir }}/{{ img.basename }}.{{ fmt }}>`__ {%- endfor -%} ) {%- endif -%} {% endfor %} {{ only_latex }} {% for img in images %} .. image:: {{ build_dir }}/{{ img.basename }}.pdf {% endfor %} """ class ImageFile(object): def __init__(self, basename, dirname): self.basename = basename self.dirname = dirname self.formats = [] def filename(self, format): return os.path.join(self.dirname, "%s.%s" % (self.basename, format)) def filenames(self): return [self.filename(fmt) for fmt in self.formats] def run(arguments, content, options, state_machine, state, lineno): if arguments and content: raise RuntimeError("plot:: directive can't have both args and content") document = state_machine.document config = document.settings.env.config options.setdefault('include-source', config.plot_include_source) # determine input rst_file = document.attributes['source'] rst_dir = os.path.dirname(rst_file) if arguments: if not config.plot_basedir: source_file_name = os.path.join(rst_dir, directives.uri(arguments[0])) else: source_file_name = os.path.join(setup.confdir, config.plot_basedir, directives.uri(arguments[0])) code = open(source_file_name, 'r').read() output_base = os.path.basename(source_file_name) else: source_file_name = rst_file code = textwrap.dedent("\n".join(map(str, content))) counter = document.attributes.get('_plot_counter', 0) + 1 document.attributes['_plot_counter'] = counter base, ext = os.path.splitext(os.path.basename(source_file_name)) output_base = '%s-%d.py' % (base, counter) base, source_ext = os.path.splitext(output_base) if source_ext in ('.py', '.rst', '.txt'): output_base = base else: source_ext = '' # ensure that LaTeX includegraphics doesn't choke in foo.bar.pdf filenames output_base = output_base.replace('.', '-') # is it in doctest format? 
is_doctest = contains_doctest(code) if 'format' in options: if options['format'] == 'python': is_doctest = False else: is_doctest = True # determine output directory name fragment source_rel_name = relpath(source_file_name, setup.confdir) source_rel_dir = os.path.dirname(source_rel_name) while source_rel_dir.startswith(os.path.sep): source_rel_dir = source_rel_dir[1:] # build_dir: where to place output files (temporarily) build_dir = os.path.join(os.path.dirname(setup.app.doctreedir), 'plot_directive', source_rel_dir) if not os.path.exists(build_dir): os.makedirs(build_dir) # output_dir: final location in the builder's directory dest_dir = os.path.abspath(os.path.join(setup.app.builder.outdir, source_rel_dir)) # how to link to files from the RST file dest_dir_link = os.path.join(relpath(setup.confdir, rst_dir), source_rel_dir).replace(os.path.sep, '/') build_dir_link = relpath(build_dir, rst_dir).replace(os.path.sep, '/') source_link = dest_dir_link + '/' + output_base + source_ext # make figures try: results = makefig(code, source_file_name, build_dir, output_base, config) errors = [] except PlotError as err: reporter = state.memo.reporter sm = reporter.system_message( 2, "Exception occurred in plotting %s: %s" % (output_base, err), line=lineno) results = [(code, [])] errors = [sm] # generate output restructuredtext total_lines = [] for j, (code_piece, images) in enumerate(results): if options['include-source']: if is_doctest: lines = [''] lines += [row.rstrip() for row in code_piece.split('\n')] else: lines = ['.. code-block:: python', ''] lines += [' %s' % row.rstrip() for row in code_piece.split('\n')] source_code = "\n".join(lines) else: source_code = "" opts = [':%s: %s' % (key, val) for key, val in list(options.items()) if key in ('alt', 'height', 'width', 'scale', 'align', 'class')] only_html = ".. only:: html" only_latex = ".. only:: latex" if j == 0: src_link = source_link else: src_link = None result = format_template( TEMPLATE, dest_dir=dest_dir_link, build_dir=build_dir_link, source_link=src_link, multi_image=len(images) > 1, only_html=only_html, only_latex=only_latex, options=opts, images=images, source_code=source_code, html_show_formats=config.plot_html_show_formats) total_lines.extend(result.split("\n")) total_lines.extend("\n") if total_lines: state_machine.insert_input(total_lines, source=source_file_name) # copy image files to builder's output directory if not os.path.exists(dest_dir): os.makedirs(dest_dir) for code_piece, images in results: for img in images: for fn in img.filenames(): shutil.copyfile(fn, os.path.join(dest_dir, os.path.basename(fn))) # copy script (if necessary) if source_file_name == rst_file: target_name = os.path.join(dest_dir, output_base + source_ext) f = open(target_name, 'w') f.write(unescape_doctest(code)) f.close() return errors #------------------------------------------------------------------------------ # Run code and capture figures #------------------------------------------------------------------------------ import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt import matplotlib.image as image from matplotlib import _pylab_helpers import exceptions def contains_doctest(text): try: # check if it's valid Python as-is compile(text, '', 'exec') return False except SyntaxError: pass r = re.compile(r'^\s*>>>', re.M) m = r.search(text) return bool(m) def unescape_doctest(text): """ Extract code from a piece of text, which contains either Python code or doctests. 
""" if not contains_doctest(text): return text code = "" for line in text.split("\n"): m = re.match(r'^\s*(>>>|\.\.\.) (.*)$', line) if m: code += m.group(2) + "\n" elif line.strip(): code += "# " + line.strip() + "\n" else: code += "\n" return code def split_code_at_show(text): """ Split code at plt.show() """ parts = [] is_doctest = contains_doctest(text) part = [] for line in text.split("\n"): if (not is_doctest and line.strip() == 'plt.show()') or \ (is_doctest and line.strip() == '>>> plt.show()'): part.append(line) parts.append("\n".join(part)) part = [] else: part.append(line) if "\n".join(part).strip(): parts.append("\n".join(part)) return parts class PlotError(RuntimeError): pass def run_code(code, code_path, ns=None): # Change the working directory to the directory of the example, so # it can get at its data files, if any. pwd = os.getcwd() old_sys_path = list(sys.path) if code_path is not None: dirname = os.path.abspath(os.path.dirname(code_path)) os.chdir(dirname) sys.path.insert(0, dirname) # Redirect stdout stdout = sys.stdout sys.stdout = StringIO() # Reset sys.argv old_sys_argv = sys.argv sys.argv = [code_path] try: try: code = unescape_doctest(code) if ns is None: ns = {} if not ns: exec(setup.config.plot_pre_code, ns) exec(code, ns) except (Exception, SystemExit) as err: raise PlotError(traceback.format_exc()) finally: os.chdir(pwd) sys.argv = old_sys_argv sys.path[:] = old_sys_path sys.stdout = stdout return ns #------------------------------------------------------------------------------ # Generating figures #------------------------------------------------------------------------------ def out_of_date(original, derived): """ Returns True if derivative is out-of-date wrt original, both of which are full file paths. """ return (not os.path.exists(derived) or os.stat(derived).st_mtime < os.stat(original).st_mtime) def makefig(code, code_path, output_dir, output_base, config): """ Run a pyplot script *code* and save the images under *output_dir* with file names derived from *output_base* """ # -- Parse format list default_dpi = {'png': 80, 'hires.png': 200, 'pdf': 50} formats = [] for fmt in config.plot_formats: if isinstance(fmt, str): formats.append((fmt, default_dpi.get(fmt, 80))) elif type(fmt) in (tuple, list) and len(fmt)==2: formats.append((str(fmt[0]), int(fmt[1]))) else: raise PlotError('invalid image format "%r" in plot_formats' % fmt) # -- Try to determine if all images already exist code_pieces = split_code_at_show(code) # Look for single-figure output files first all_exists = True img = ImageFile(output_base, output_dir) for format, dpi in formats: if out_of_date(code_path, img.filename(format)): all_exists = False break img.formats.append(format) if all_exists: return [(code, [img])] # Then look for multi-figure output files results = [] all_exists = True for i, code_piece in enumerate(code_pieces): images = [] for j in range(1000): img = ImageFile('%s_%02d_%02d' % (output_base, i, j), output_dir) for format, dpi in formats: if out_of_date(code_path, img.filename(format)): all_exists = False break img.formats.append(format) # assume that if we have one, we have them all if not all_exists: all_exists = (j > 0) break images.append(img) if not all_exists: break results.append((code_piece, images)) if all_exists: return results # -- We didn't find the files, so build them results = [] ns = {} for i, code_piece in enumerate(code_pieces): # Clear between runs plt.close('all') # Run code run_code(code_piece, code_path, ns) # Collect images images = [] fig_managers 
= _pylab_helpers.Gcf.get_all_fig_managers() for j, figman in enumerate(fig_managers): if len(fig_managers) == 1 and len(code_pieces) == 1: img = ImageFile(output_base, output_dir) else: img = ImageFile("%s_%02d_%02d" % (output_base, i, j), output_dir) images.append(img) for format, dpi in formats: try: figman.canvas.figure.savefig(img.filename(format), dpi=dpi) except exceptions.BaseException as err: raise PlotError(traceback.format_exc()) img.formats.append(format) # Results results.append((code_piece, images)) return results #------------------------------------------------------------------------------ # Relative pathnames #------------------------------------------------------------------------------ try: from os.path import relpath except ImportError: # Copied from Python 2.7 if 'posix' in sys.builtin_module_names: def relpath(path, start=os.path.curdir): """Return a relative version of a path""" from os.path import sep, curdir, join, abspath, commonprefix, \ pardir if not path: raise ValueError("no path specified") start_list = abspath(start).split(sep) path_list = abspath(path).split(sep) # Work out how much of the filepath is shared by start and path. i = len(commonprefix([start_list, path_list])) rel_list = [pardir] * (len(start_list)-i) + path_list[i:] if not rel_list: return curdir return join(*rel_list) elif 'nt' in sys.builtin_module_names: def relpath(path, start=os.path.curdir): """Return a relative version of a path""" from os.path import sep, curdir, join, abspath, commonprefix, \ pardir, splitunc if not path: raise ValueError("no path specified") start_list = abspath(start).split(sep) path_list = abspath(path).split(sep) if start_list[0].lower() != path_list[0].lower(): unc_path, rest = splitunc(path) unc_start, rest = splitunc(start) if bool(unc_path) ^ bool(unc_start): raise ValueError("Cannot mix UNC and non-UNC paths (%s and %s)" % (path, start)) else: raise ValueError("path is on drive %s, start on drive %s" % (path_list[0], start_list[0])) # Work out how much of the filepath is shared by start and path. for i in range(min(len(start_list), len(path_list))): if start_list[i].lower() != path_list[i].lower(): break else: i += 1 rel_list = [pardir] * (len(start_list)-i) + path_list[i:] if not rel_list: return curdir return join(*rel_list) else: raise RuntimeError("Unsupported platform (no relpath available!)") scikit-cuda-0.5.1/docs/sphinxext/traitsdoc.py000066400000000000000000000102611261465507300212560ustar00rootroot00000000000000""" ========= traitsdoc ========= Sphinx extension that handles docstrings in the Numpy standard format, [1] and support Traits [2]. This extension can be used as a replacement for ``numpydoc`` when support for Traits is required. .. [1] http://projects.scipy.org/numpy/wiki/CodingStyleGuidelines#docstring-standard .. [2] http://code.enthought.com/projects/traits/ """ from __future__ import division, absolute_import, print_function import inspect import os import pydoc import collections from . import docscrape from . import docscrape_sphinx from .docscrape_sphinx import SphinxClassDoc, SphinxFunctionDoc, SphinxDocString from . import numpydoc from . import comment_eater class SphinxTraitsDoc(SphinxClassDoc): def __init__(self, cls, modulename='', func_doc=SphinxFunctionDoc): if not inspect.isclass(cls): raise ValueError("Initialise using a class. Got %r" % cls) self._cls = cls if modulename and not modulename.endswith('.'): modulename += '.' 
self._mod = modulename self._name = cls.__name__ self._func_doc = func_doc docstring = pydoc.getdoc(cls) docstring = docstring.split('\n') # De-indent paragraph try: indent = min(len(s) - len(s.lstrip()) for s in docstring if s.strip()) except ValueError: indent = 0 for n,line in enumerate(docstring): docstring[n] = docstring[n][indent:] self._doc = docscrape.Reader(docstring) self._parsed_data = { 'Signature': '', 'Summary': '', 'Description': [], 'Extended Summary': [], 'Parameters': [], 'Returns': [], 'Raises': [], 'Warns': [], 'Other Parameters': [], 'Traits': [], 'Methods': [], 'See Also': [], 'Notes': [], 'References': '', 'Example': '', 'Examples': '', 'index': {} } self._parse() def _str_summary(self): return self['Summary'] + [''] def _str_extended_summary(self): return self['Description'] + self['Extended Summary'] + [''] def __str__(self, indent=0, func_role="func"): out = [] out += self._str_signature() out += self._str_index() + [''] out += self._str_summary() out += self._str_extended_summary() for param_list in ('Parameters', 'Traits', 'Methods', 'Returns','Raises'): out += self._str_param_list(param_list) out += self._str_see_also("obj") out += self._str_section('Notes') out += self._str_references() out += self._str_section('Example') out += self._str_section('Examples') out = self._str_indent(out,indent) return '\n'.join(out) def looks_like_issubclass(obj, classname): """ Return True if the object has a class or superclass with the given class name. Ignores old-style classes. """ t = obj if t.__name__ == classname: return True for klass in t.__mro__: if klass.__name__ == classname: return True return False def get_doc_object(obj, what=None, config=None): if what is None: if inspect.isclass(obj): what = 'class' elif inspect.ismodule(obj): what = 'module' elif isinstance(obj, collections.Callable): what = 'function' else: what = 'object' if what == 'class': doc = SphinxTraitsDoc(obj, '', func_doc=SphinxFunctionDoc, config=config) if looks_like_issubclass(obj, 'HasTraits'): for name, trait, comment in comment_eater.get_class_traits(obj): # Exclude private traits. if not name.startswith('_'): doc['Traits'].append((name, trait, comment.splitlines())) return doc elif what in ('function', 'method'): return SphinxFunctionDoc(obj, '', config=config) else: return SphinxDocString(pydoc.getdoc(obj), config=config) def setup(app): # init numpydoc numpydoc.setup(app, get_doc_object) scikit-cuda-0.5.1/ez_setup.py000066400000000000000000000262501261465507300161430ustar00rootroot00000000000000#!/usr/bin/env python """ Setuptools bootstrapping installer. Run this script to install or upgrade setuptools. """ import os import shutil import sys import tempfile import zipfile import optparse import subprocess import platform import textwrap import contextlib import warnings from distutils import log try: from urllib.request import urlopen except ImportError: from urllib2 import urlopen try: from site import USER_SITE except ImportError: USER_SITE = None DEFAULT_VERSION = "15.1" DEFAULT_URL = "https://pypi.python.org/packages/source/s/setuptools/" DEFAULT_SAVE_DIR = os.curdir def _python_cmd(*args): """ Execute a command. Return True if the command succeeded. 
""" args = (sys.executable,) + args return subprocess.call(args) == 0 def _install(archive_filename, install_args=()): """Install Setuptools.""" with archive_context(archive_filename): # installing log.warn('Installing Setuptools') if not _python_cmd('setup.py', 'install', *install_args): log.warn('Something went wrong during the installation.') log.warn('See the error message above.') # exitcode will be 2 return 2 def _build_egg(egg, archive_filename, to_dir): """Build Setuptools egg.""" with archive_context(archive_filename): # building an egg log.warn('Building a Setuptools egg in %s', to_dir) _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir) # returning the result log.warn(egg) if not os.path.exists(egg): raise IOError('Could not build the egg.') class ContextualZipFile(zipfile.ZipFile): """Supplement ZipFile class to support context manager for Python 2.6.""" def __enter__(self): return self def __exit__(self, type, value, traceback): self.close() def __new__(cls, *args, **kwargs): """Construct a ZipFile or ContextualZipFile as appropriate.""" if hasattr(zipfile.ZipFile, '__exit__'): return zipfile.ZipFile(*args, **kwargs) return super(ContextualZipFile, cls).__new__(cls) @contextlib.contextmanager def archive_context(filename): """ Unzip filename to a temporary directory, set to the cwd. The unzipped target is cleaned up after. """ tmpdir = tempfile.mkdtemp() log.warn('Extracting in %s', tmpdir) old_wd = os.getcwd() try: os.chdir(tmpdir) with ContextualZipFile(filename) as archive: archive.extractall() # going in the directory subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0]) os.chdir(subdir) log.warn('Now working in %s', subdir) yield finally: os.chdir(old_wd) shutil.rmtree(tmpdir) def _do_download(version, download_base, to_dir, download_delay): """Download Setuptools.""" egg = os.path.join(to_dir, 'setuptools-%s-py%d.%d.egg' % (version, sys.version_info[0], sys.version_info[1])) if not os.path.exists(egg): archive = download_setuptools(version, download_base, to_dir, download_delay) _build_egg(egg, archive, to_dir) sys.path.insert(0, egg) # Remove previously-imported pkg_resources if present (see # https://bitbucket.org/pypa/setuptools/pull-request/7/ for details). if 'pkg_resources' in sys.modules: del sys.modules['pkg_resources'] import setuptools setuptools.bootstrap_install_from = egg def use_setuptools( version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=DEFAULT_SAVE_DIR, download_delay=15): """ Ensure that a setuptools version is installed. Return None. Raise SystemExit if the requested version or later cannot be installed. """ to_dir = os.path.abspath(to_dir) # prior to importing, capture the module state for # representative modules. rep_modules = 'pkg_resources', 'setuptools' imported = set(sys.modules).intersection(rep_modules) try: import pkg_resources pkg_resources.require("setuptools>=" + version) # a suitable version is already installed return except ImportError: # pkg_resources not available; setuptools is not installed; download pass except pkg_resources.DistributionNotFound: # no version of setuptools was found; allow download pass except pkg_resources.VersionConflict as VC_err: if imported: _conflict_bail(VC_err, version) # otherwise, unload pkg_resources to allow the downloaded version to # take precedence. del pkg_resources _unload_pkg_resources() return _do_download(version, download_base, to_dir, download_delay) def _conflict_bail(VC_err, version): """ Setuptools was imported prior to invocation, so it is unsafe to unload it. 
Bail out. """ conflict_tmpl = textwrap.dedent(""" The required version of setuptools (>={version}) is not available, and can't be installed while this script is running. Please install a more recent version first, using 'easy_install -U setuptools'. (Currently using {VC_err.args[0]!r}) """) msg = conflict_tmpl.format(**locals()) sys.stderr.write(msg) sys.exit(2) def _unload_pkg_resources(): del_modules = [ name for name in sys.modules if name.startswith('pkg_resources') ] for mod_name in del_modules: del sys.modules[mod_name] def _clean_check(cmd, target): """ Run the command to download target. If the command fails, clean up before re-raising the error. """ try: subprocess.check_call(cmd) except subprocess.CalledProcessError: if os.access(target, os.F_OK): os.unlink(target) raise def download_file_powershell(url, target): """ Download the file at url to target using Powershell. Powershell will validate trust. Raise an exception if the command cannot complete. """ target = os.path.abspath(target) ps_cmd = ( "[System.Net.WebRequest]::DefaultWebProxy.Credentials = " "[System.Net.CredentialCache]::DefaultCredentials; " "(new-object System.Net.WebClient).DownloadFile(%(url)r, %(target)r)" % vars() ) cmd = [ 'powershell', '-Command', ps_cmd, ] _clean_check(cmd, target) def has_powershell(): """Determine if Powershell is available.""" if platform.system() != 'Windows': return False cmd = ['powershell', '-Command', 'echo test'] with open(os.path.devnull, 'wb') as devnull: try: subprocess.check_call(cmd, stdout=devnull, stderr=devnull) except Exception: return False return True download_file_powershell.viable = has_powershell def download_file_curl(url, target): cmd = ['curl', url, '--silent', '--output', target] _clean_check(cmd, target) def has_curl(): cmd = ['curl', '--version'] with open(os.path.devnull, 'wb') as devnull: try: subprocess.check_call(cmd, stdout=devnull, stderr=devnull) except Exception: return False return True download_file_curl.viable = has_curl def download_file_wget(url, target): cmd = ['wget', url, '--quiet', '--output-document', target] _clean_check(cmd, target) def has_wget(): cmd = ['wget', '--version'] with open(os.path.devnull, 'wb') as devnull: try: subprocess.check_call(cmd, stdout=devnull, stderr=devnull) except Exception: return False return True download_file_wget.viable = has_wget def download_file_insecure(url, target): """Use Python to download the file, without connection authentication.""" src = urlopen(url) try: # Read all the data in one block. data = src.read() finally: src.close() # Write all the data in one block to avoid creating a partial file. with open(target, "wb") as dst: dst.write(data) download_file_insecure.viable = lambda: True def get_best_downloader(): downloaders = ( download_file_powershell, download_file_curl, download_file_wget, download_file_insecure, ) viable_downloaders = (dl for dl in downloaders if dl.viable()) return next(viable_downloaders, None) def download_setuptools( version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=DEFAULT_SAVE_DIR, delay=15, downloader_factory=get_best_downloader): """ Download setuptools from a specified location and return its filename. `version` should be a valid setuptools version number that is available as an sdist for download under the `download_base` URL (which should end with a '/'). `to_dir` is the directory where the egg will be downloaded. `delay` is the number of seconds to pause before an actual download attempt. 
``downloader_factory`` should be a function taking no arguments and returning a function for downloading a URL to a target. """ # making sure we use the absolute path to_dir = os.path.abspath(to_dir) zip_name = "setuptools-%s.zip" % version url = download_base + zip_name saveto = os.path.join(to_dir, zip_name) if not os.path.exists(saveto): # Avoid repeated downloads log.warn("Downloading %s", url) downloader = downloader_factory() downloader(url, saveto) return os.path.realpath(saveto) def _build_install_args(options): """ Build the arguments to 'python setup.py install' on the setuptools package. Returns list of command line arguments. """ return ['--user'] if options.user_install else [] def _parse_args(): """Parse the command line for options.""" parser = optparse.OptionParser() parser.add_option( '--user', dest='user_install', action='store_true', default=False, help='install in user site package (requires Python 2.6 or later)') parser.add_option( '--download-base', dest='download_base', metavar="URL", default=DEFAULT_URL, help='alternative URL from where to download the setuptools package') parser.add_option( '--insecure', dest='downloader_factory', action='store_const', const=lambda: download_file_insecure, default=get_best_downloader, help='Use internal, non-validating downloader' ) parser.add_option( '--version', help="Specify which version to download", default=DEFAULT_VERSION, ) parser.add_option( '--to-dir', help="Directory to save (and re-use) package", default=DEFAULT_SAVE_DIR, ) options, args = parser.parse_args() # positional arguments are ignored return options def _download_args(options): """Return args for download_setuptools function from cmdline args.""" return dict( version=options.version, download_base=options.download_base, downloader_factory=options.downloader_factory, to_dir=options.to_dir, ) def main(): """Install or upgrade setuptools and EasyInstall.""" options = _parse_args() archive = download_setuptools(**_download_args(options)) return _install(archive, _build_install_args(options)) if __name__ == '__main__': sys.exit(main()) scikit-cuda-0.5.1/scikits/000077500000000000000000000000001261465507300153775ustar00rootroot00000000000000scikit-cuda-0.5.1/scikits/__init__.py000066400000000000000000000002501261465507300175050ustar00rootroot00000000000000try: __import__('pkg_resources').declare_namespace(__name__) except ImportError: from pkgutil import extend_path __path__ = extend_path(__path__, __name__) scikit-cuda-0.5.1/scikits/cuda/000077500000000000000000000000001261465507300163135ustar00rootroot00000000000000scikit-cuda-0.5.1/scikits/cuda/__init__.py000066400000000000000000000013031261465507300204210ustar00rootroot00000000000000from __future__ import absolute_import import os import warnings warnings.warn('The scikits.cuda namespace package is deprecated and will be ' 'removed in the future; please import the skcuda package ' 'instead.', DeprecationWarning, stacklevel=2) # This import must precede the invocation of extend_path() to work with Python # 3: import skcuda from pkgutil import extend_path __path__ = extend_path(__path__, 'skcuda') from .info import __doc__ from .version import __version__ # Needed to ensure correct header location even when modules are import as # scikits.cuda.something: install_headers = skcuda.__file__.replace(os.path.basename(skcuda.__file__), '') + 'include' scikit-cuda-0.5.1/setup.py000066400000000000000000000044331261465507300154440ustar00rootroot00000000000000#!/usr/bin/env python import sys, os from glob import glob # 
Install setuptools if it isn't available: try: import setuptools except ImportError: from ez_setup import use_setuptools use_setuptools() from distutils.command.install_headers import install_headers from setuptools import find_packages from setuptools import setup NAME = 'scikit-cuda' VERSION = '0.5.1' AUTHOR = 'Lev Givon' AUTHOR_EMAIL = 'lev@columbia.edu' URL = 'https://github.com/lebedov/scikit-cuda/' DESCRIPTION = 'Python interface to GPU-powered libraries' LONG_DESCRIPTION = DESCRIPTION DOWNLOAD_URL = URL LICENSE = 'BSD' CLASSIFIERS = [ 'Development Status :: 4 - Beta', 'Intended Audience :: Developers', 'Intended Audience :: Science/Research', 'License :: OSI Approved :: BSD License', 'Operating System :: OS Independent', 'Programming Language :: Python', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3.4', 'Topic :: Scientific/Engineering', 'Topic :: Software Development'] NAMESPACE_PACKAGES = ['scikits'] PACKAGES = find_packages() on_rtd = os.environ.get('READTHEDOCS', None) == 'True' if not on_rtd: install_requires = ['mako >= 1.0.1', 'numpy >= 1.2.0', 'pycuda >= 2014.1'] tests_require = ['nose >= 0.11', 'scipy >= 0.14.0'], extras_require = dict(scipy = ['scipy >= 0.14.0'], sphinx_rtd_theme = ['sphinx_rtd_theme >= 0.1.6']) else: install_requires = [] tests_require = [] extras_require = {} if __name__ == "__main__": if os.path.exists('MANIFEST'): os.remove('MANIFEST') setup( name = NAME, version = VERSION, author = AUTHOR, author_email = AUTHOR_EMAIL, license = LICENSE, classifiers = CLASSIFIERS, description = DESCRIPTION, long_description = LONG_DESCRIPTION, url = URL, namespace_packages = NAMESPACE_PACKAGES, packages = PACKAGES, include_package_data = True, install_requires = install_requires, tests_require = tests_require, extras_require = extras_require, test_suite='nose.collector') scikit-cuda-0.5.1/skcuda/000077500000000000000000000000001261465507300152005ustar00rootroot00000000000000scikit-cuda-0.5.1/skcuda/__init__.py000066400000000000000000000006031261465507300173100ustar00rootroot00000000000000from __future__ import absolute_import try: __import__('pkg_resources').declare_namespace(__name__) except ImportError: from pkgutil import extend_path __path__ = extend_path(__path__, __name__) from .info import __doc__ from .version import __version__ # Location of headers: import os install_headers = \ __file__.replace(os.path.basename(__file__), '') + 'include' scikit-cuda-0.5.1/skcuda/autoinit.py000066400000000000000000000004701261465507300174070ustar00rootroot00000000000000#!/usr/bin/env python """ Autoinitialize CUDA tools. """ from __future__ import absolute_import import atexit from . import misc try: import cula _has_cula = True except (ImportError, OSError): _has_cula = False misc.init() if _has_cula: cula.culaInitialize() atexit.register(misc.shutdown) scikit-cuda-0.5.1/skcuda/cublas.py000066400000000000000000007571031261465507300170400ustar00rootroot00000000000000#!/usr/bin/env python """ Python interface to CUBLAS functions. Note: this module does not explicitly depend on PyCUDA. """ from __future__ import absolute_import import re import os import sys import warnings import ctypes import ctypes.util import atexit import numpy as np from string import Template from . import cuda from . 
import utils # Load library: _version_list = [7.5, 7.0, 6.5, 6.0, 5.5, 5.0, 4.0] if 'linux' in sys.platform: _libcublas_libname_list = ['libcublas.so'] + \ ['libcublas.so.%s' % v for v in _version_list] elif sys.platform == 'darwin': _libcublas_libname_list = ['libcublas.dylib'] elif sys.platform == 'win32': if sys.maxsize > 2**32: _libcublas_libname_list = ['cublas.dll'] + \ ['cublas64_%s.dll' % int(10*v) for v in _version_list] else: _libcublas_libname_list = ['cublas.dll'] + \ ['cublas32_%s.dll' % int(10*v) for v in _version_list] else: raise RuntimeError('unsupported platform') # Print understandable error message when library cannot be found: _libcublas = None for _libcublas_libname in _libcublas_libname_list: try: if sys.platform == 'win32': _libcublas = ctypes.windll.LoadLibrary(_libcublas_libname) else: _libcublas = ctypes.cdll.LoadLibrary(_libcublas_libname) except OSError: pass else: break if _libcublas == None: raise OSError('cublas library not found') # Generic CUBLAS error: class cublasError(Exception): """CUBLAS error""" pass # Exceptions corresponding to different CUBLAS errors: class cublasNotInitialized(cublasError): """CUBLAS library not initialized.""" pass class cublasAllocFailed(cublasError): """Resource allocation failed.""" pass class cublasInvalidValue(cublasError): """Unsupported numerical value was passed to function.""" pass class cublasArchMismatch(cublasError): """Function requires an architectural feature absent from the device.""" pass class cublasMappingError(cublasError): """Access to GPU memory space failed.""" pass class cublasExecutionFailed(cublasError): """GPU program failed to execute.""" pass class cublasInternalError(cublasError): """An internal CUBLAS operation failed.""" pass cublasExceptions = { 0x1: cublasNotInitialized, 0x3: cublasAllocFailed, 0x7: cublasInvalidValue, 0x8: cublasArchMismatch, 0xb: cublasMappingError, 0xd: cublasExecutionFailed, 0xe: cublasInternalError, } _CUBLAS_OP = { 0: 0, # CUBLAS_OP_N 'n': 0, 'N': 0, 1: 1, # CUBLAS_OP_T 't': 1, 'T': 1, 2: 2, # CUBLAS_OP_C 'c': 2, 'C': 2, } _CUBLAS_FILL_MODE = { 0: 0, # CUBLAS_FILL_MODE_LOWER 'l': 0, 'L': 0, 1: 1, # CUBLAS_FILL_MODE_UPPER 'u': 1, 'U': 1, } _CUBLAS_DIAG = { 0: 0, # CUBLAS_DIAG_NON_UNIT, 'n': 0, 'N': 0, 1: 1, # CUBLAS_DIAG_UNIT 'u': 1, 'U': 1, } _CUBLAS_SIDE_MODE = { 0: 0, # CUBLAS_SIDE_LEFT 'l': 0, 'L': 0, 1: 1, # CUBLAS_SIDE_RIGHT 'r': 1, 'R': 1 } class _types: """Some alias types.""" handle = ctypes.c_void_p stream = ctypes.c_void_p def cublasCheckStatus(status): """ Raise CUBLAS exception Raise an exception corresponding to the specified CUBLAS error code. Parameters ---------- status : int CUBLAS error code. See Also -------- cublasExceptions """ if status != 0: try: raise cublasExceptions[status] except KeyError: raise cublasError # Helper functions: _libcublas.cublasCreate_v2.restype = int _libcublas.cublasCreate_v2.argtypes = [_types.handle] def cublasCreate(): """ Initialize CUBLAS. Initializes CUBLAS and creates a handle to a structure holding the CUBLAS library context. Returns ------- handle : int CUBLAS context. References ---------- `cublasCreate `_ """ handle = _types.handle() status = _libcublas.cublasCreate_v2(ctypes.byref(handle)) cublasCheckStatus(status) return handle.value _libcublas.cublasDestroy_v2.restype = int _libcublas.cublasDestroy_v2.argtypes = [_types.handle] def cublasDestroy(handle): """ Release CUBLAS resources. Releases hardware resources used by CUBLAS. Parameters ---------- handle : int CUBLAS context. 
References ---------- `cublasDestroy `_ """ status = _libcublas.cublasDestroy_v2(handle) cublasCheckStatus(status) _libcublas.cublasGetVersion_v2.restype = int _libcublas.cublasGetVersion_v2.argtypes = [_types.handle, ctypes.c_void_p] def cublasGetVersion(handle): """ Get CUBLAS version. Returns version number of installed CUBLAS libraries. Parameters ---------- handle : int CUBLAS context. Returns ------- version : int CUBLAS version. References ---------- `cublasGetVersion `_ """ version = ctypes.c_int() status = _libcublas.cublasGetVersion_v2(handle, ctypes.byref(version)) cublasCheckStatus(status) return version.value def _get_cublas_version(): """ Get and save CUBLAS version using the CUBLAS library's SONAME. This function tries to avoid calling cublasGetVersion because creating a CUBLAS context can subtly affect the performance of subsequent CUDA operations in certain circumstances. Results ------- version : str Zeros are appended to match format of version returned by cublasGetVersion() (e.g., '6050' corresponds to version 6.5). Notes ----- Since the version number does not appear to be obtainable from the MacOSX CUBLAS library, this function must call cublasGetVersion() on MacOSX (but raises a warning to let the user know). """ cublas_path = utils.find_lib_path('cublas') try: major, minor = re.search('[\D\.]+\.+(\d+)\.(\d+)', utils.get_soname(cublas_path)).groups() except: # Create a temporary context to run cublasGetVersion(): warnings.warn('creating CUBLAS context to get version number') h = cublasCreate() version = cublasGetVersion(h) cublasDestroy(h) return str(version) else: return major.ljust(2, '0')+minor.ljust(2, '0') _cublas_version = int(_get_cublas_version()) class _cublas_version_req(object): """ Decorator to replace function with a placeholder that raises an exception if the installed CUBLAS version is not greater than `v`. """ def __init__(self, v): self.vs = str(v) if isinstance(v, int): major = str(v) minor = '0' else: major, minor = re.search('(\d+)\.(\d+)', self.vs).groups() self.vi = major.ljust(2, '0')+minor.ljust(2, '0') def __call__(self,f): def f_new(*args,**kwargs): raise NotImplementedError('CUBLAS '+self.vs+' required') f_new.__doc__ = f.__doc__ if _cublas_version >= int(self.vi): return f else: return f_new _libcublas.cublasSetStream_v2.restype = int _libcublas.cublasSetStream_v2.argtypes = [_types.handle, _types.stream] def cublasSetStream(handle, id): """ Set current CUBLAS library stream. Parameters ---------- handle : id CUBLAS context. id : int Stream ID. References ---------- `cublasSetStream `_ """ status = _libcublas.cublasSetStream_v2(handle, id) cublasCheckStatus(status) _libcublas.cublasGetStream_v2.restype = int _libcublas.cublasGetStream_v2.argtypes = [_types.handle, ctypes.c_void_p] def cublasGetStream(handle): """ Set current CUBLAS library stream. Parameters ---------- handle : int CUBLAS context. Returns ------- id : int Stream ID. References ---------- `cublasGetStream `_ """ id = _types.stream() status = _libcublas.cublasGetStream_v2(handle, ctypes.byref(id)) cublasCheckStatus(status) return id.value try: _libcublas.cublasGetCurrentCtx.restype = int except AttributeError: def cublasGetCurrentCtx(): raise NotImplementedError( 'cublasGetCurrentCtx() not found; CULA CUBLAS library probably\n' 'precedes NVIDIA CUBLAS library in library search path') else: def cublasGetCurrentCtx(): return _libcublas.cublasGetCurrentCtx() cublasGetCurrentCtx.__doc__ = """ Get current CUBLAS context. Returns the current context used by CUBLAS. 
Returns ------- handle : int CUBLAS context. """ ### BLAS Level 1 Functions ### # ISAMAX, IDAMAX, ICAMAX, IZAMAX I_AMAX_doc = Template( """ Index of maximum magnitude element. Finds the smallest index of the maximum magnitude element of a ${precision} ${real} vector. Note: for complex arguments x, the "magnitude" is defined as `abs(x.real) + abs(x.imag)`, *not* as `abs(x)`. Parameters ---------- handle : int CUBLAS context. n : int Number of elements in input vector. x : ctypes.c_void_p Pointer to ${precision} ${real} input vector. incx : int Storage spacing between elements of `x`. Returns ------- idx : int Index of maximum magnitude element. Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray as gpuarray >>> import numpy as np >>> x = ${data} >>> x_gpu = gpuarray.to_gpu(x) >>> h = cublasCreate() >>> m = ${func}(h, x_gpu.size, x_gpu.gpudata, 1) >>> cublasDestroy(h) >>> np.allclose(m, np.argmax(abs(x.real) + abs(x.imag))) True Notes ----- This function returns a 0-based index. References ---------- `cublasIamax `_ """) _libcublas.cublasIsamax_v2.restype = int _libcublas.cublasIsamax_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasIsamax(handle, n, x, incx): result = ctypes.c_int() status = \ _libcublas.cublasIsamax_v2(handle, n, int(x), incx, ctypes.byref(result)) cublasCheckStatus(status) return result.value-1 cublasIsamax.__doc__ = \ I_AMAX_doc.substitute(precision='single precision', real='real', data='np.random.rand(5).astype(np.float32)', func='cublasIsamax') _libcublas.cublasIdamax_v2.restype = int _libcublas.cublasIdamax_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasIdamax(handle, n, x, incx): result = ctypes.c_int() status = \ _libcublas.cublasIdamax_v2(handle, n, int(x), incx, ctypes.byref(result)) cublasCheckStatus(status) return result.value-1 cublasIdamax.__doc__ = \ I_AMAX_doc.substitute(precision='double precision', real='real', data='np.random.rand(5).astype(np.float64)', func='cublasIdamax') _libcublas.cublasIcamax_v2.restype = int _libcublas.cublasIcamax_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasIcamax(handle, n, x, incx): result = ctypes.c_int() status = \ _libcublas.cublasIcamax_v2(handle, n, int(x), incx, ctypes.byref(result)) cublasCheckStatus(status) return result.value-1 cublasIcamax.__doc__ = \ I_AMAX_doc.substitute(precision='single precision', real='complex', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64)', func='cublasIcamax') _libcublas.cublasIzamax_v2.restype = int _libcublas.cublasIzamax_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasIzamax(handle, n, x, incx): result = ctypes.c_int() status = \ _libcublas.cublasIzamax_v2(handle, n, int(x), incx, ctypes.byref(result)) cublasCheckStatus(status) return result.value-1 cublasIzamax.__doc__ = \ I_AMAX_doc.substitute(precision='double precision', real='complex', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128)', func='cublasIzamax') # ISAMIN, IDAMIN, ICAMIN, IZAMIN I_AMIN_doc = Template( """ Index of minimum magnitude element (${precision} ${real}). Finds the smallest index of the minimum magnitude element of a ${precision} ${real} vector. Note: for complex arguments x, the "magnitude" is defined as `abs(x.real) + abs(x.imag)`, *not* as `abs(x)`. Parameters ---------- handle : int CUBLAS context. 
n : int Number of elements in input vector. x : ctypes.c_void_p Pointer to ${precision} ${real} input vector. incx : int Storage spacing between elements of `x`. Returns ------- idx : int Index of minimum magnitude element. Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray as gpuarray >>> import numpy as np >>> x = ${data} >>> x_gpu = gpuarray.to_gpu(x) >>> h = cublasCreate() >>> m = ${func}(h, x_gpu.size, x_gpu.gpudata, 1) >>> cublasDestroy(h) >>> np.allclose(m, np.argmin(abs(x.real) + abs(x.imag))) True Notes ----- This function returns a 0-based index. References ---------- `cublasIamin `_ """ ) _libcublas.cublasIsamin_v2.restype = int _libcublas.cublasIsamin_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasIsamin(handle, n, x, incx): result = ctypes.c_int() status = \ _libcublas.cublasIsamin_v2(handle, n, int(x), incx, ctypes.byref(result)) cublasCheckStatus(status) return result.value-1 cublasIsamin.__doc__ = \ I_AMIN_doc.substitute(precision='single precision', real='real', data='np.random.rand(5).astype(np.float32)', func='cublasIsamin') _libcublas.cublasIdamin_v2.restype = int _libcublas.cublasIdamin_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasIdamin(handle, n, x, incx): result = ctypes.c_int() status = \ _libcublas.cublasIdamin_v2(handle, n, int(x), incx, ctypes.byref(result)) cublasCheckStatus(status) return result.value-1 cublasIdamin.__doc__ = \ I_AMIN_doc.substitute(precision='double precision', real='real', data='np.random.rand(5).astype(np.float64)', func='cublasIdamin') _libcublas.cublasIcamin_v2.restype = int _libcublas.cublasIcamin_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasIcamin(handle, n, x, incx): result = ctypes.c_int() status = \ _libcublas.cublasIcamin_v2(handle, n, int(x), incx, ctypes.byref(result)) cublasCheckStatus(status) return result.value-1 cublasIcamin.__doc__ = \ I_AMIN_doc.substitute(precision='single precision', real='complex', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64)', func='cublasIcamin') _libcublas.cublasIzamin_v2.restype = int _libcublas.cublasIzamin_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasIzamin(handle, n, x, incx): result = ctypes.c_int() status = \ _libcublas.cublasIzamin_v2(handle, n, int(x), incx, ctypes.byref(result)) cublasCheckStatus(status) return result.value-1 cublasIzamin.__doc__ = \ I_AMIN_doc.substitute(precision='double precision', real='complex', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128)', func='cublasIzamin') # SASUM, DASUM, SCASUM, DZASUM _ASUM_doc = Template( """ Sum of absolute values of ${precision} ${real} vector. Computes the sum of the absolute values of the elements of a ${precision} ${real} vector. Note: if the vector is complex, then this computes the sum `sum(abs(x.real)) + sum(abs(x.imag))` Parameters ---------- handle : int CUBLAS context. n : int Number of elements in input vector. x : ctypes.c_void_p Pointer to ${precision} ${real} input vector. incx : int Storage spacing between elements of `x`. 
Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray as gpuarray >>> import numpy as np >>> x = ${data} >>> x_gpu = gpuarray.to_gpu(x) >>> h = cublasCreate() >>> s = ${func}(h, x_gpu.size, x_gpu.gpudata, 1) >>> cublasDestroy(h) >>> np.allclose(s, abs(x.real).sum() + abs(x.imag).sum()) True Returns ------- s : ${ret_type} Sum of absolute values. References ---------- `cublassum `_ """ ) _libcublas.cublasSasum_v2.restype = int _libcublas.cublasSasum_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasSasum(handle, n, x, incx): result = ctypes.c_float() status = _libcublas.cublasSasum_v2(handle, n, int(x), incx, ctypes.byref(result)) cublasCheckStatus(status) return np.float32(result.value) cublasSasum.__doc__ = \ _ASUM_doc.substitute(precision='single precision', real='real', data='np.random.rand(5).astype(np.float32)', func='cublasSasum', ret_type='numpy.float32') _libcublas.cublasDasum_v2.restype = int _libcublas.cublasDasum_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasDasum(handle, n, x, incx): result = ctypes.c_double() status = _libcublas.cublasDasum_v2(handle, n, int(x), incx, ctypes.byref(result)) cublasCheckStatus(status) return np.float64(result.value) cublasDasum.__doc__ = \ _ASUM_doc.substitute(precision='double precision', real='real', data='np.random.rand(5).astype(np.float64)', func='cublasDasum', ret_type='numpy.float64') _libcublas.cublasScasum_v2.restype = int _libcublas.cublasScasum_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasScasum(handle, n, x, incx): result = ctypes.c_float() status = _libcublas.cublasScasum_v2(handle, n, int(x), incx, ctypes.byref(result)) cublasCheckStatus(status) return np.float32(result.value) cublasScasum.__doc__ = \ _ASUM_doc.substitute(precision='single precision', real='complex', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64)', func='cublasScasum', ret_type='numpy.float32') _libcublas.cublasDzasum_v2.restype = int _libcublas.cublasDzasum_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasDzasum(handle, n, x, incx): result = ctypes.c_double() status = _libcublas.cublasDzasum_v2(handle, n, int(x), incx, ctypes.byref(result)) cublasCheckStatus(status) return np.float64(result.value) cublasDzasum.__doc__ = \ _ASUM_doc.substitute(precision='double precision', real='complex', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128)', func='cublasDzasum', ret_type='numpy.float64') # SAXPY, DAXPY, CAXPY, ZAXPY _AXPY_doc = Template( """ Vector addition (${precision} ${real}). Computes the sum of a ${precision} ${real} vector scaled by a ${precision} ${real} scalar and another ${precision} ${real} vector. Parameters ---------- handle : int CUBLAS context. n : int Number of elements in input vectors. alpha : ${type} Scalar. x : ctypes.c_void_p Pointer to single precision input vector. incx : int Storage spacing between elements of `x`. y : ctypes.c_void_p Pointer to single precision input/output vector. incy : int Storage spacing between elements of `y`. 
Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray as gpuarray >>> import numpy as np >>> alpha = ${alpha} >>> x = ${data} >>> y = ${data} >>> x_gpu = gpuarray.to_gpu(x) >>> y_gpu = gpuarray.to_gpu(y) >>> h = cublasCreate() >>> ${func}(h, x_gpu.size, alpha, x_gpu.gpudata, 1, y_gpu.gpudata, 1) >>> cublasDestroy(h) >>> np.allclose(y_gpu.get(), alpha*x+y) True Notes ----- Both `x` and `y` must contain `n` elements. References ---------- `cublasaxpy `_ """ ) _libcublas.cublasSaxpy_v2.restype = int _libcublas.cublasSaxpy_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasSaxpy(handle, n, alpha, x, incx, y, incy): status = _libcublas.cublasSaxpy_v2(handle, n, ctypes.byref(ctypes.c_float(alpha)), int(x), incx, int(y), incy) cublasCheckStatus(status) cublasSaxpy.__doc__ = \ _AXPY_doc.substitute(precision='single precision', real='real', type='numpy.float32', alpha='np.float32(np.random.rand())', data='np.random.rand(5).astype(np.float32)', func='cublasSaxpy') _libcublas.cublasDaxpy_v2.restype = int _libcublas.cublasDaxpy_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasDaxpy(handle, n, alpha, x, incx, y, incy): status = _libcublas.cublasDaxpy_v2(handle, n, ctypes.byref(ctypes.c_double(alpha)), int(x), incx, int(y), incy) cublasCheckStatus(status) cublasDaxpy.__doc__ = \ _AXPY_doc.substitute(precision='double precision', real='real', type='numpy.float64', alpha='np.float64(np.random.rand())', data='np.random.rand(5).astype(np.float64)', func='cublasDaxpy') _libcublas.cublasCaxpy_v2.restype = int _libcublas.cublasCaxpy_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasCaxpy(handle, n, alpha, x, incx, y, incy): status = _libcublas.cublasCaxpy_v2(handle, n, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(x), incx, int(y), incy) cublasCheckStatus(status) cublasCaxpy.__doc__ = \ _AXPY_doc.substitute(precision='single precision', real='complex', type='numpy.complex64', alpha='np.complex64(np.random.rand()+1j*np.random.rand())', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64)', func='cublasCaxpy') _libcublas.cublasZaxpy_v2.restype = int _libcublas.cublasZaxpy_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasZaxpy(handle, n, alpha, x, incx, y, incy): status = _libcublas.cublasZaxpy_v2(handle, n, ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(x), incx, int(y), incy) cublasCheckStatus(status) cublasZaxpy.__doc__ = \ _AXPY_doc.substitute(precision='double precision', real='complex', type='numpy.complex128', alpha='np.complex128(np.random.rand()+1j*np.random.rand())', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128)', func='cublasZaxpy') # SCOPY, DCOPY, CCOPY, ZCOPY _COPY_doc = Template( """ Vector copy (${precision} ${real}) Copies a ${precision} ${real} vector to another ${precision} ${real} vector. Parameters ---------- handle : int CUBLAS context. n : int Number of elements in input vectors. x : ctypes.c_void_p Pointer to ${precision} ${real} input vector. incx : int Storage spacing between elements of `x`. y : ctypes.c_void_p Pointer to ${precision} ${real} output vector. incy : int Storage spacing between elements of `y`. 
Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray as gpuarray >>> import numpy as np >>> x = ${data} >>> x_gpu = gpuarray.to_gpu(x) >>> y_gpu = gpuarray.zeros_like(x_gpu) >>> h = cublasCreate() >>> ${func}(h, x_gpu.size, x_gpu.gpudata, 1, y_gpu.gpudata, 1) >>> cublasDestroy(h) >>> np.allclose(y_gpu.get(), x_gpu.get()) True Notes ----- Both `x` and `y` must contain `n` elements. References ---------- `cublascopy `_ """) _libcublas.cublasScopy_v2.restype = int _libcublas.cublasScopy_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasScopy(handle, n, x, incx, y, incy): status = _libcublas.cublasScopy_v2(handle, n, int(x), incx, int(y), incy) cublasCheckStatus(status) cublasScopy.__doc__ = \ _COPY_doc.substitute(precision='single precision', real='real', data='np.random.rand(5).astype(np.float32)', func='cublasScopy') _libcublas.cublasDcopy_v2.restype = int _libcublas.cublasDcopy_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasDcopy(handle, n, x, incx, y, incy): status = _libcublas.cublasDcopy_v2(handle, n, int(x), incx, int(y), incy) cublasCheckStatus(status) cublasDcopy.__doc__ = \ _COPY_doc.substitute(precision='double precision', real='real', data='np.random.rand(5).astype(np.float64)', func='cublasDcopy') _libcublas.cublasCcopy_v2.restype = int _libcublas.cublasCcopy_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasCcopy(handle, n, x, incx, y, incy): status = _libcublas.cublasCcopy_v2(handle, n, int(x), incx, int(y), incy) cublasCheckStatus(status) cublasCcopy.__doc__ = \ _COPY_doc.substitute(precision='single precision', real='complex', data='(np.random.rand(5)+np.random.rand(5)).astype(np.complex64)', func='cublasCcopy') _libcublas.cublasZcopy_v2.restype = int _libcublas.cublasZcopy_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasZcopy(handle, n, x, incx, y, incy): status = _libcublas.cublasZcopy_v2(handle, n, int(x), incx, int(y), incy) cublasCheckStatus(status) cublasZcopy.__doc__ = \ _COPY_doc.substitute(precision='double precision', real='complex', data='(np.random.rand(5)+np.random.rand(5)).astype(np.complex128)', func='cublasZcopy') # SDOT, DDOT, CDOTU, CDOTC, ZDOTU, ZDOTC _DOT_doc = Template( """ Vector dot product (${precision} ${real}) Computes the dot product of two ${precision} ${real} vectors. cublasCdotc and cublasZdotc use the conjugate of the first vector when computing the dot product. Parameters ---------- handle : int CUBLAS context. n : int Number of elements in input vectors. x : ctypes.c_void_p Pointer to ${precision} ${real} input vector. incx : int Storage spacing between elements of `x`. y : ctypes.c_void_p Pointer to ${precision} ${real} input/output vector. incy : int Storage spacing between elements of `y`. Returns ------- d : ${ret_type} Dot product of `x` and `y`. Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray as gpuarray >>> import numpy as np >>> x = ${data} >>> y = ${data} >>> x_gpu = gpuarray.to_gpu(x) >>> y_gpu = gpuarray.to_gpu(y) >>> h = cublasCreate() >>> d = ${func}(h, x_gpu.size, x_gpu.gpudata, 1, y_gpu.gpudata, 1) >>> cublasDestroy(h) >>> ${check} True Notes ----- Both `x` and `y` must contain `n` elements. 
References ---------- `cublasdot `_ """) _libcublas.cublasSdot_v2.restype = int _libcublas.cublasSdot_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasSdot(handle, n, x, incx, y, incy): result = ctypes.c_float() status = _libcublas.cublasSdot_v2(handle, n, int(x), incx, int(y), incy, ctypes.byref(result)) cublasCheckStatus(status) return np.float32(result.value) cublasSdot.__doc__ = _DOT_doc.substitute(precision='single precision', real='real', data='np.float32(np.random.rand(5))', ret_type='np.float32', func='cublasSdot', check='np.allclose(d, np.dot(x, y))') _libcublas.cublasDdot_v2.restype = int _libcublas.cublasDdot_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasDdot(handle, n, x, incx, y, incy): result = ctypes.c_double() status = _libcublas.cublasDdot_v2(handle, n, int(x), incx, int(y), incy, ctypes.byref(result)) cublasCheckStatus(status) return np.float64(result.value) cublasDdot.__doc__ = _DOT_doc.substitute(precision='double precision', real='real', data='np.float64(np.random.rand(5))', ret_type='np.float64', func='cublasDdot', check='np.allclose(d, np.dot(x, y))') _libcublas.cublasCdotu_v2.restype = int _libcublas.cublasCdotu_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasCdotu(handle, n, x, incx, y, incy): result = cuda.cuFloatComplex() status = _libcublas.cublasCdotu_v2(handle, n, int(x), incx, int(y), incy, ctypes.byref(result)) cublasCheckStatus(status) return np.complex64(result.value) cublasCdotu.__doc__ = _DOT_doc.substitute(precision='single precision', real='complex', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64)', ret_type='np.complex64', func='cublasCdotu', check='np.allclose(d, np.dot(x, y))') _libcublas.cublasCdotc_v2.restype = int _libcublas.cublasCdotc_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasCdotc(handle, n, x, incx, y, incy): result = cuda.cuFloatComplex() status = _libcublas.cublasCdotc_v2(handle, n, int(x), incx, int(y), incy, ctypes.byref(result)) cublasCheckStatus(status) return np.complex64(result.value) cublasCdotc.__doc__ = _DOT_doc.substitute(precision='single precision', real='complex', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64)', ret_type='np.complex64', func='cublasCdotc', check='np.allclose(d, np.dot(np.conj(x), y))') _libcublas.cublasZdotu_v2.restype = int _libcublas.cublasZdotu_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasZdotu(handle, n, x, incx, y, incy): result = cuda.cuDoubleComplex() status = _libcublas.cublasZdotu_v2(handle, n, int(x), incx, int(y), incy, ctypes.byref(result)) cublasCheckStatus(status) return np.complex128(result.value) cublasZdotu.__doc__ = _DOT_doc.substitute(precision='double precision', real='complex', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128)', ret_type='np.complex128', func='cublasZdotu', check='np.allclose(d, np.dot(x, y))') _libcublas.cublasZdotc_v2.restype = int _libcublas.cublasZdotc_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasZdotc(handle, n, x, incx, y, incy): result = cuda.cuDoubleComplex() status = 
_libcublas.cublasZdotc_v2(handle, n, int(x), incx, int(y), incy, ctypes.byref(result)) cublasCheckStatus(status) return np.complex128(result.value) cublasZdotc.__doc__ = _DOT_doc.substitute(precision='double precision', real='complex', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128)', ret_type='np.complex128', func='cublasZdotc', check='np.allclose(d, np.dot(np.conj(x), y))') # SNRM2, DNRM2, SCNRM2, DZNRM2 _NRM2_doc = Template( """ Euclidean norm (2-norm) of real vector. Computes the Euclidean norm of a ${precision} ${real} vector. Parameters ---------- handle : int CUBLAS context. n : int Number of elements in input vectors. x : ctypes.c_void_p Pointer to ${precision} ${real} input vector. incx : int Storage spacing between elements of `x`. Returns ------- nrm : ${ret_type} Euclidean norm of `x`. Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray as gpuarray >>> import numpy as np >>> x = ${data} >>> x_gpu = gpuarray.to_gpu(x) >>> h = cublasCreate() >>> nrm = ${func}(h, x.size, x_gpu.gpudata, 1) >>> cublasDestroy(h) >>> np.allclose(nrm, np.linalg.norm(x)) True References ---------- `cublasnrm2 `_ """) _libcublas.cublasSnrm2_v2.restype = int _libcublas.cublasSnrm2_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasSnrm2(handle, n, x, incx): result = ctypes.c_float() status = _libcublas.cublasSnrm2_v2(handle, n, int(x), incx, ctypes.byref(result)) cublasCheckStatus(status) return np.float32(result.value) cublasSnrm2.__doc__ = \ _NRM2_doc.substitute(precision='single precision', real='real', data='np.float32(np.random.rand(5))', ret_type = 'numpy.float32', func='cublasSnrm2') _libcublas.cublasDnrm2_v2.restype = int _libcublas.cublasDnrm2_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasDnrm2(handle, n, x, incx): result = ctypes.c_double() status = _libcublas.cublasDnrm2_v2(handle, n, int(x), incx, ctypes.byref(result)) cublasCheckStatus(status) return np.float64(result.value) cublasDnrm2.__doc__ = \ _NRM2_doc.substitute(precision='double precision', real='real', data='np.float64(np.random.rand(5))', ret_type = 'numpy.float64', func='cublasDnrm2') _libcublas.cublasScnrm2_v2.restype = int _libcublas.cublasScnrm2_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasScnrm2(handle, n, x, incx): result = ctypes.c_float() status = _libcublas.cublasScnrm2_v2(handle, n, int(x), incx, ctypes.byref(result)) cublasCheckStatus(status) return np.float32(result.value) cublasScnrm2.__doc__ = \ _NRM2_doc.substitute(precision='single precision', real='complex', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64)', ret_type = 'numpy.complex64', func='cublasScnrm2') _libcublas.cublasDznrm2_v2.restype = int _libcublas.cublasDznrm2_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasDznrm2(handle, n, x, incx): result = ctypes.c_double() status = _libcublas.cublasDznrm2_v2(handle, n, int(x), incx, ctypes.byref(result)) cublasCheckStatus(status) return np.float64(result.value) cublasDznrm2.__doc__ = \ _NRM2_doc.substitute(precision='double precision', real='complex', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128)', ret_type = 'numpy.complex128', func='cublasDznrm2') # SROT, DROT, CROT, CSROT, ZROT, ZDROT _ROT_doc = Template( """ Apply a ${real} rotation to ${real} vectors (${precision}) Multiplies the ${precision} matrix `[[c, s], 
[-s.conj(), c]]` with the 2 x `n` ${precision} matrix `[[x.T], [y.T]]`. Parameters ---------- handle : int CUBLAS context. n : int Number of elements in input vectors. x : ctypes.c_void_p Pointer to ${precision} ${real} input/output vector. incx : int Storage spacing between elements of `x`. y : ctypes.c_void_p Pointer to ${precision} ${real} input/output vector. incy : int Storage spacing between elements of `y`. c : ${c_type} Element of rotation matrix. s : ${s_type} Element of rotation matrix. Notes ----- Both `x` and `y` must contain `n` elements. Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray as gpuarray >>> import numpy as np >>> s = ${s_val}; c = ${c_val}; >>> x = ${data} >>> y = ${data} >>> x_gpu = gpuarray.to_gpu(x) >>> y_gpu = gpuarray.to_gpu(y) >>> h = cublasCreate() >>> ${func}(h, x.size, x_gpu.gpudata, 1, y_gpu.gpudata, 1, c, s) >>> cublasDestroy(h) >>> np.allclose(x_gpu.get(), c*x+s*y) True >>> np.allclose(y_gpu.get(), -s.conj()*x+c*y) True References ---------- `cublasrot `_ """) _libcublas.cublasSrot_v2.restype = int _libcublas.cublasSrot_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p] def cublasSrot(handle, n, x, incx, y, incy, c, s): status = _libcublas.cublasSrot_v2(handle, n, int(x), incx, int(y), incy, ctypes.byref(ctypes.c_float(c)), ctypes.byref(ctypes.c_float(s))) cublasCheckStatus(status) cublasSrot.__doc__ = _ROT_doc.substitute(precision='single precision', real='real', c_type='numpy.float32', s_type='numpy.float32', c_val='np.float32(np.random.rand())', s_val='np.float32(np.random.rand())', data='np.random.rand(5).astype(np.float32)', func='cublasSrot') _libcublas.cublasDrot_v2.restype = int _libcublas.cublasDrot_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p] def cublasDrot(handle, n, x, incx, y, incy, c, s): status = _libcublas.cublasDrot_v2(handle, n, int(x), incx, int(y), incy, ctypes.byref(ctypes.c_double(c)), ctypes.byref(ctypes.c_double(s))) cublasCheckStatus(status) cublasDrot.__doc__ = _ROT_doc.substitute(precision='double precision', real='real', c_type='numpy.float64', s_type='numpy.float64', c_val='np.float64(np.random.rand())', s_val='np.float64(np.random.rand())', data='np.random.rand(5).astype(np.float64)', func='cublasDrot') _libcublas.cublasCrot_v2.restype = int _libcublas.cublasCrot_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p] def cublasCrot(handle, n, x, incx, y, incy, c, s): status = _libcublas.cublasCrot_v2(handle, n, int(x), incx, int(y), incy, ctypes.byref(ctypes.c_float(c)), ctypes.byref(cuda.cuFloatComplex(s.real, s.imag))) cublasCheckStatus(status) cublasCrot.__doc__ = _ROT_doc.substitute(precision='single precision', real='complex', c_type='numpy.float32', s_type='numpy.complex64', c_val='np.float32(np.random.rand())', s_val='np.complex64(np.random.rand()+1j*np.random.rand())', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64)', func='cublasCrot') _libcublas.cublasCsrot_v2.restype = int _libcublas.cublasCsrot_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p] def cublasCsrot(handle, n, x, incx, y, incy, c, s): status = _libcublas.cublasCsrot_v2(handle, n, int(x), incx, int(y), incy, ctypes.byref(ctypes.c_float(c)), 
ctypes.byref(ctypes.c_float(s))) cublasCheckStatus(status) cublasCsrot.__doc__ = _ROT_doc.substitute(precision='single precision', real='complex', c_type='numpy.float32', s_type='numpy.float32', c_val='np.float32(np.random.rand())', s_val='np.float32(np.random.rand())', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64)', func='cublasCsrot') _libcublas.cublasZrot_v2.restype = int _libcublas.cublasZrot_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p] def cublasZrot(handle, n, x, incx, y, incy, c, s): status = _libcublas.cublasZrot_v2(handle, n, int(x), incx, int(y), incy, ctypes.byref(ctypes.c_double(c)), ctypes.byref(cuda.cuDoubleComplex(s.real, s.imag))) cublasCheckStatus(status) cublasZrot.__doc__ = _ROT_doc.substitute(precision='double precision', real='complex', c_type='numpy.float64', s_type='numpy.complex128', c_val='np.float64(np.random.rand())', s_val='np.complex128(np.random.rand()+1j*np.random.rand())', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128)', func='cublasZrot') _libcublas.cublasZdrot_v2.restype = int _libcublas.cublasZdrot_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p] def cublasZdrot(handle, n, x, incx, y, incy, c, s): status = _libcublas.cublasZdrot_v2(handle, n, int(x), incx, int(y), incy, ctypes.byref(ctypes.c_double(c)), ctypes.byref(ctypes.c_double(s))) cublasCheckStatus(status) cublasZdrot.__doc__ = _ROT_doc.substitute(precision='double precision', real='complex', c_type='numpy.float64', s_type='numpy.float64', c_val='np.float64(np.random.rand())', s_val='np.float64(np.random.rand())', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128)', func='cublasZdrot') # SROTG, DROTG, CROTG, ZROTG _ROTG_doc = Template( """ Construct a ${precision} ${real} Givens rotation matrix. Constructs the ${precision} ${real} Givens rotation matrix `G = [[c, s], [-s.conj(), c]]` such that `dot(G, [[a], [b]] == [[r], [0]]`, where `c**2+s**2 == 1` and `r == a**2+b**2` for real numbers and `c**2+(conj(s)*s) == 1` and `r == (a/abs(a))*sqrt(abs(a)**2+abs(b)**2)` for `a != 0` and `r == b` for `a == 0`. Parameters ---------- handle : int CUBLAS context. a, b : ${type} Entries of vector whose second entry should be zeroed out by the rotation. Returns ------- r : ${type} Defined above. c : ${c_type} Cosine component of rotation matrix. s : ${s_type} Sine component of rotation matrix. 
Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray as gpuarray >>> import numpy as np >>> a = ${a_val} >>> b = ${b_val} >>> h = cublasCreate() >>> r, c, s = ${func}(h, a, b) >>> cublasDestroy(h) >>> np.allclose(np.dot(np.array([[c, s], [-np.conj(s), c]]), np.array([[a], [b]])), np.array([[r], [0.0]]), atol=1e-6) True References ---------- `cublasrotg `_ """) _libcublas.cublasSrotg_v2.restype = int _libcublas.cublasSrotg_v2.argtypes = [_types.handle, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p] def cublasSrotg(handle, a, b): _a = ctypes.c_float(a) _b = ctypes.c_float(b) _c = ctypes.c_float() _s = ctypes.c_float() status = _libcublas.cublasSrotg_v2(handle, ctypes.byref(_a), ctypes.byref(_b), ctypes.byref(_c), ctypes.byref(_s)) cublasCheckStatus(status) return np.float32(_a.value), np.float32(_c.value), np.float32(_s.value) cublasSrotg.__doc__ = \ _ROTG_doc.substitute(precision='single precision', real='real', type='numpy.float32', c_type='numpy.float32', s_type='numpy.float32', a_val='np.float32(np.random.rand())', b_val='np.float32(np.random.rand())', func='cublasSrotg') _libcublas.cublasDrotg_v2.restype = int _libcublas.cublasDrotg_v2.argtypes = [_types.handle, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p] def cublasDrotg(handle, a, b): _a = ctypes.c_double(a) _b = ctypes.c_double(b) _c = ctypes.c_double() _s = ctypes.c_double() status = _libcublas.cublasDrotg_v2(handle, ctypes.byref(_a), ctypes.byref(_b), ctypes.byref(_c), ctypes.byref(_s)) cublasCheckStatus(status) return np.float64(_a.value), np.float64(_c.value), np.float64(_s.value) cublasDrotg.__doc__ = \ _ROTG_doc.substitute(precision='double precision', real='real', type='numpy.float64', c_type='numpy.float64', s_type='numpy.float64', a_val='np.float64(np.random.rand())', b_val='np.float64(np.random.rand())', func='cublasDrotg') _libcublas.cublasCrotg_v2.restype = int _libcublas.cublasCrotg_v2.argtypes = [_types.handle, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p] def cublasCrotg(handle, a, b): _a = cuda.cuFloatComplex(a.real, a.imag) _b = cuda.cuFloatComplex(b.real, b.imag) _c = ctypes.c_float() _s = cuda.cuFloatComplex() status = _libcublas.cublasCrotg_v2(handle, ctypes.byref(_a), ctypes.byref(_b), ctypes.byref(_c), ctypes.byref(_s)) cublasCheckStatus(status) return np.complex64(_a.value), np.float32(_c.value), np.complex64(_s.value) cublasCrotg.__doc__ = \ _ROTG_doc.substitute(precision='single precision', real='complex', type='numpy.complex64', c_type='numpy.float32', s_type='numpy.complex64', a_val='np.complex64(np.random.rand()+1j*np.random.rand())', b_val='np.complex64(np.random.rand()+1j*np.random.rand())', func='cublasCrotg') _libcublas.cublasZrotg_v2.restype = int _libcublas.cublasZrotg_v2.argtypes = [_types.handle, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p] def cublasZrotg(handle, a, b): _a = cuda.cuDoubleComplex(a.real, a.imag) _b = cuda.cuDoubleComplex(b.real, b.imag) _c = ctypes.c_double() _s = cuda.cuDoubleComplex() status = _libcublas.cublasZrotg_v2(handle, ctypes.byref(_a), ctypes.byref(_b), ctypes.byref(_c), ctypes.byref(_s)) cublasCheckStatus(status) return np.complex128(_a.value), np.float64(_c.value), np.complex128(_s.value) cublasZrotg.__doc__ = \ _ROTG_doc.substitute(precision='double precision', real='complex', type='numpy.complex128', c_type='numpy.float64', s_type='numpy.complex128', a_val='np.complex128(np.random.rand()+1j*np.random.rand())', 
b_val='np.complex128(np.random.rand()+1j*np.random.rand())', func='cublasZrotg') # SROTM, DROTM (need to add example) _ROTM_doc = Template( """ Apply a ${precision} real modified Givens rotation. Applies the ${precision} real modified Givens rotation matrix `h` to the 2 x `n` matrix `[[x.T], [y.T]]`. Parameters ---------- handle : int CUBLAS context. n : int Number of elements in input vectors. x : ctypes.c_void_p Pointer to ${precision} real input/output vector. incx : int Storage spacing between elements of `x`. y : ctypes.c_void_p Pointer to ${precision} real input/output vector. incy : int Storage spacing between elements of `y`. sparam : numpy.ndarray sparam[0] contains the `flag` described below; sparam[1:5] contains the values `[h00, h10, h01, h11]` that determine the rotation matrix `h`. Notes ----- The rotation matrix may assume the following values: for `flag` == -1.0, `h` == `[[h00, h01], [h10, h11]]` for `flag` == 0.0, `h` == `[[1.0, h01], [h10, 1.0]]` for `flag` == 1.0, `h` == `[[h00, 1.0], [-1.0, h11]]` for `flag` == -2.0, `h` == `[[1.0, 0.0], [0.0, 1.0]]` Both `x` and `y` must contain `n` elements. References ---------- `cublassrotm `_ """) _libcublas.cublasSrotm_v2.restype = int _libcublas.cublasSrotm_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasSrotm(handle, n, x, incx, y, incy, sparam): status = _libcublas.cublasSrotm_v2(handle, n, int(x), incx, int(y), incy, int(sparam.ctypes.data)) cublasCheckStatus(status) cublasSrotm.__doc__ = \ _ROTM_doc.substitute(precision='single precision') _libcublas.cublasDrotm_v2.restype = int _libcublas.cublasDrotm_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasDrotm(handle, n, x, incx, y, incy, sparam): status = _libcublas.cublasDrotm_v2(handle, n, int(x), incx, int(y), incy, int(sparam.ctypes.data)) cublasCheckStatus(status) cublasDrotm.__doc__ = \ _ROTM_doc.substitute(precision='double precision') # SROTMG, DROTMG (need to add example) _ROTMG_doc = Template( """ Construct a ${precision} real modified Givens rotation matrix. Constructs the ${precision} real modified Givens rotation matrix `h = [[h11, h12], [h21, h22]]` that zeros out the second entry of the vector `[[sqrt(d1)*x1], [sqrt(d2)*x2]]`. Parameters ---------- handle : int CUBLAS context. d1 : ${type} ${precision} real value. d2 : ${type} ${precision} real value. x1 : ${type} ${precision} real value. x2 : ${type} ${precision} real value. Returns ------- sparam : numpy.ndarray sparam[0] contains the `flag` described below; sparam[1:5] contains the values `[h00, h10, h01, h11]` that determine the rotation matrix `h`. 
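
    Examples
    --------
    A minimal call sketch (the scalar inputs are arbitrary and assume a
    working CUDA context; the single precision variant is shown, and
    `cublasDrotmg` is called the same way):

    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> h = cublasCreate()
    >>> sparam = cublasSrotmg(h, 3.0, 2.0, 1.5, 0.5)
    >>> cublasDestroy(h)
    >>> sparam.size
    5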
Notes ----- The rotation matrix may assume the following values: for `flag` == -1.0, `h` == `[[h00, h01], [h10, h11]]` for `flag` == 0.0, `h` == `[[1.0, h01], [h10, 1.0]]` for `flag` == 1.0, `h` == `[[h00, 1.0], [-1.0, h11]]` for `flag` == -2.0, `h` == `[[1.0, 0.0], [0.0, 1.0]]` References ---------- `cublasrotmg `_ """) _libcublas.cublasSrotmg_v2.restype = int _libcublas.cublasSrotmg_v2.argtypes = [_types.handle, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p] def cublasSrotmg(handle, d1, d2, x1, y1): _d1 = ctypes.c_float(d1) _d2 = ctypes.c_float(d2) _x1 = ctypes.c_float(x1) _y1 = ctypes.c_float(y1) sparam = np.empty(5, np.float32) status = _libcublas.cublasSrotmg_v2(handle, ctypes.byref(_d1), ctypes.byref(_d2), ctypes.byref(_x1), ctypes.byref(_y1), int(sparam.ctypes.data)) cublasCheckStatus(status) return sparam cublasSrotmg.__doc__ = \ _ROTMG_doc.substitute(precision='single precision', type='numpy.float32') _libcublas.cublasDrotmg_v2.restype = int _libcublas.cublasDrotmg_v2.argtypes = [_types.handle, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p] def cublasDrotmg(handle, d1, d2, x1, y1): _d1 = ctypes.c_double(d1) _d2 = ctypes.c_double(d2) _x1 = ctypes.c_double(x1) _y1 = ctypes.c_double(y1) sparam = np.empty(5, np.float64) status = _libcublas.cublasDrotmg_v2(handle, ctypes.byref(_d1), ctypes.byref(_d2), ctypes.byref(_x1), ctypes.byref(_y1), int(sparam.ctypes.data)) cublasCheckStatus(status) return sparam cublasDrotmg.__doc__ = \ _ROTMG_doc.substitute(precision='double precision', type='numpy.float64') # SSCAL, DSCAL, CSCAL, CSCAL, CSSCAL, ZSCAL, ZDSCAL _SCAL_doc = Template( """ Scale a ${precision} ${real} vector by a ${precision} ${a_real} scalar. Replaces a ${precision} ${real} vector `x` with `alpha * x`. Parameters ---------- handle : int CUBLAS context. n : int Number of elements in input vectors. alpha : ${a_type} Scalar multiplier. x : ctypes.c_void_p Pointer to ${precision} ${real} input/output vector. incx : int Storage spacing between elements of `x`. 
Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray as gpuarray >>> import numpy as np >>> x = ${data} >>> x_gpu = gpuarray.to_gpu(x) >>> alpha = ${alpha} >>> h = cublasCreate() >>> ${func}(h, x.size, alpha, x_gpu.gpudata, 1) >>> cublasDestroy(h) >>> np.allclose(x_gpu.get(), alpha*x) True References ---------- `cublasscal `_ """) _libcublas.cublasSscal_v2.restype = int _libcublas.cublasSscal_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasSscal(handle, n, alpha, x, incx): status = _libcublas.cublasSscal_v2(handle, n, ctypes.byref(ctypes.c_float(alpha)), int(x), incx) cublasCheckStatus(status) cublasSscal.__doc__ = \ _SCAL_doc.substitute(precision='single precision', real='real', a_real='real', a_type='numpy.float32', alpha='np.float32(np.random.rand())', data='np.random.rand(5).astype(np.float32)', func='cublasSscal') _libcublas.cublasDscal_v2.restype = int _libcublas.cublasDscal_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasDscal(handle, n, alpha, x, incx): status = _libcublas.cublasDscal_v2(handle, n, ctypes.byref(ctypes.c_double(alpha)), int(x), incx) cublasCheckStatus(status) cublasDscal.__doc__ = \ _SCAL_doc.substitute(precision='double precision', real='real', a_real='real', a_type='numpy.float64', alpha='np.float64(np.random.rand())', data='np.random.rand(5).astype(np.float64)', func='cublasDscal') _libcublas.cublasCscal_v2.restype = int _libcublas.cublasCscal_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasCscal(handle, n, alpha, x, incx): status = _libcublas.cublasCscal_v2(handle, n, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(x), incx) cublasCheckStatus(status) cublasCscal.__doc__ = \ _SCAL_doc.substitute(precision='single precision', real='complex', a_real='complex', a_type='numpy.complex64', alpha='np.complex64(np.random.rand()+1j*np.random.rand())', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64)', func='cublasCscal') _libcublas.cublasCsscal_v2.restype = int _libcublas.cublasCsscal_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasCsscal(handle, n, alpha, x, incx): status = _libcublas.cublasCsscal_v2(handle, n, ctypes.byref(ctypes.c_float(alpha)), int(x), incx) cublasCheckStatus(status) cublasCsscal.__doc__ = \ _SCAL_doc.substitute(precision='single precision', real='complex', a_real='real', a_type='numpy.float32', alpha='np.float32(np.random.rand())', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64)', func='cublasCsscal') _libcublas.cublasZscal_v2.restype = int _libcublas.cublasZscal_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasZscal(handle, n, alpha, x, incx): status = _libcublas.cublasZscal_v2(handle, n, ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(x), incx) cublasCheckStatus(status) cublasZscal.__doc__ = \ _SCAL_doc.substitute(precision='double precision', real='complex', a_real='complex', a_type='numpy.complex128', alpha='np.complex128(np.random.rand()+1j*np.random.rand())', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128)', func='cublasZscal') _libcublas.cublasZdscal_v2.restype = int _libcublas.cublasZdscal_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasZdscal(handle, n, alpha, x, incx): status = _libcublas.cublasZdscal_v2(handle, n, 
ctypes.byref(ctypes.c_double(alpha)), int(x), incx) cublasCheckStatus(status) cublasZdscal.__doc__ = \ _SCAL_doc.substitute(precision='double precision', real='complex', a_real='real', a_type='numpy.float64', alpha='np.float64(np.random.rand())', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128)', func='cublasZdscal') # SSWAP, DSWAP, CSWAP, ZSWAP _SWAP_doc = Template( """ Swap ${precision} ${real} vectors. Swaps the contents of one ${precision} ${real} vector with those of another ${precision} ${real} vector. Parameters ---------- handle : int CUBLAS context. n : int Number of elements in input vectors. x : ctypes.c_void_p Pointer to ${precision} ${real} input/output vector. incx : int Storage spacing between elements of `x`. y : ctypes.c_void_p Pointer to ${precision} ${real} input/output vector. incy : int Storage spacing between elements of `y`. Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray as gpuarray >>> import numpy as np >>> x = ${data} >>> y = ${data} >>> x_gpu = gpuarray.to_gpu(x) >>> y_gpu = gpuarray.to_gpu(y) >>> h = cublasCreate() >>> ${func}(h, x.size, x_gpu.gpudata, 1, y_gpu.gpudata, 1) >>> cublasDestroy(h) >>> np.allclose(x_gpu.get(), y) True >>> np.allclose(y_gpu.get(), x) True Notes ----- Both `x` and `y` must contain `n` elements. References ---------- `cublasswap `_ """) _libcublas.cublasSswap_v2.restype = int _libcublas.cublasSswap_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasSswap(handle, n, x, incx, y, incy): status = _libcublas.cublasSswap_v2(handle, n, int(x), incx, int(y), incy) cublasCheckStatus(status) cublasSswap.__doc__ = \ _SWAP_doc.substitute(precision='single precision', real='real', data='np.random.rand(5).astype(np.float32)', func='cublasSswap') _libcublas.cublasDswap_v2.restype = int _libcublas.cublasDswap_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasDswap(handle, n, x, incx, y, incy): status = _libcublas.cublasDswap_v2(handle, n, int(x), incx, int(y), incy) cublasCheckStatus(status) cublasDswap.__doc__ = \ _SWAP_doc.substitute(precision='double precision', real='real', data='np.random.rand(5).astype(np.float64)', func='cublasDswap') _libcublas.cublasCswap_v2.restype = int _libcublas.cublasCswap_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasCswap(handle, n, x, incx, y, incy): status = _libcublas.cublasCswap_v2(handle, n, int(x), incx, int(y), incy) cublasCheckStatus(status) cublasCswap.__doc__ = \ _SWAP_doc.substitute(precision='single precision', real='complex', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64)', func='cublasCswap') _libcublas.cublasZswap_v2.restype = int _libcublas.cublasZswap_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasZswap(handle, n, x, incx, y, incy): status = _libcublas.cublasZswap_v2(handle, n, int(x), incx, int(y), incy) cublasCheckStatus(status) cublasZswap.__doc__ = \ _SWAP_doc.substitute(precision='double precision', real='complex', data='(np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128)', func='cublasZswap') ### BLAS Level 2 Functions ### # SGBMV, DGVMV, CGBMV, ZGBMV _libcublas.cublasSgbmv_v2.restype = int _libcublas.cublasSgbmv_v2.argtypes = [_types.handle, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, 
                                      ctypes.c_int, ctypes.c_void_p,
                                      ctypes.c_int, ctypes.c_void_p,
                                      ctypes.c_void_p, ctypes.c_int]
def cublasSgbmv(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy):
    """
    Matrix-vector product for real single precision general banded matrix.

    References
    ----------
    `cublasgbmv `_
    """

    trans = trans.encode('ascii')
    status = _libcublas.cublasSgbmv_v2(handle,
                                       trans, m, n, kl, ku,
                                       ctypes.byref(ctypes.c_float(alpha)),
                                       int(A), lda,
                                       int(x), incx,
                                       ctypes.byref(ctypes.c_float(beta)),
                                       int(y), incy)
    cublasCheckStatus(status)

_libcublas.cublasDgbmv_v2.restype = int
_libcublas.cublasDgbmv_v2.argtypes = [_types.handle,
                                      ctypes.c_char,
                                      ctypes.c_int, ctypes.c_int,
                                      ctypes.c_int, ctypes.c_int,
                                      ctypes.c_void_p, ctypes.c_void_p,
                                      ctypes.c_int, ctypes.c_void_p,
                                      ctypes.c_int, ctypes.c_void_p,
                                      ctypes.c_void_p, ctypes.c_int]
def cublasDgbmv(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy):
    """
    Matrix-vector product for real double precision general banded matrix.

    References
    ----------
    `cublasgbmv `_
    """

    trans = trans.encode('ascii')
    status = _libcublas.cublasDgbmv_v2(handle,
                                       trans, m, n, kl, ku,
                                       ctypes.byref(ctypes.c_double(alpha)),
                                       int(A), lda,
                                       int(x), incx,
                                       ctypes.byref(ctypes.c_double(beta)),
                                       int(y), incy)
    cublasCheckStatus(status)

_libcublas.cublasCgbmv_v2.restype = int
_libcublas.cublasCgbmv_v2.argtypes = [_types.handle,
                                      ctypes.c_char,
                                      ctypes.c_int, ctypes.c_int,
                                      ctypes.c_int, ctypes.c_int,
                                      ctypes.c_void_p, ctypes.c_void_p,
                                      ctypes.c_int, ctypes.c_void_p,
                                      ctypes.c_int, ctypes.c_void_p,
                                      ctypes.c_void_p, ctypes.c_int]
def cublasCgbmv(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy):
    """
    Matrix-vector product for complex single precision general banded matrix.

    References
    ----------
    `cublasgbmv `_
    """

    trans = trans.encode('ascii')
    status = _libcublas.cublasCgbmv_v2(handle,
                                       trans, m, n, kl, ku,
                                       ctypes.byref(cuda.cuFloatComplex(alpha.real,
                                                                        alpha.imag)),
                                       int(A), lda,
                                       int(x), incx,
                                       ctypes.byref(cuda.cuFloatComplex(beta.real,
                                                                        beta.imag)),
                                       int(y), incy)
    cublasCheckStatus(status)

_libcublas.cublasZgbmv_v2.restype = int
_libcublas.cublasZgbmv_v2.argtypes = [_types.handle,
                                      ctypes.c_char,
                                      ctypes.c_int, ctypes.c_int,
                                      ctypes.c_int, ctypes.c_int,
                                      ctypes.c_void_p, ctypes.c_void_p,
                                      ctypes.c_int, ctypes.c_void_p,
                                      ctypes.c_int, ctypes.c_void_p,
                                      ctypes.c_void_p, ctypes.c_int]
def cublasZgbmv(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy):
    """
    Matrix-vector product for complex double precision general banded matrix.

    References
    ----------
    `cublasgbmv `_
    """

    trans = trans.encode('ascii')
    status = _libcublas.cublasZgbmv_v2(handle,
                                       trans, m, n, kl, ku,
                                       ctypes.byref(cuda.cuDoubleComplex(alpha.real,
                                                                         alpha.imag)),
                                       int(A), lda,
                                       int(x), incx,
                                       ctypes.byref(cuda.cuDoubleComplex(beta.real,
                                                                         beta.imag)),
                                       int(y), incy)
    cublasCheckStatus(status)

# SGEMV, DGEMV, CGEMV, ZGEMV
# XXX need to adjust
# _GEMV_doc = Template(
# """
#     Matrix-vector product for ${precision} ${real} general matrix.
#
#     Computes the product `alpha*op(A)*x+beta*y`, where `op(A)` == `A`
#     or `op(A)` == `A.T`, and stores it in `y`.
#
#     Parameters
#     ----------
#     trans : char
#         If `upper(trans)` in `['T', 'C']`, assume that `A` is
#         transposed.
#     m : int
#         Number of rows in `A`.
#     n : int
#         Number of columns in `A`.
#     alpha : ${a_type}
#         `A` is multiplied by this quantity.
#     A : ctypes.c_void_p
#         Pointer to ${precision} matrix. The matrix has
#         shape `(lda, n)` if `upper(trans)` == 'N', `(lda, m)`
#         otherwise.
#     lda : int
#         Leading dimension of `A`.
# X : ctypes.c_void_p # Pointer to ${precision} array of length at least # `(1+(n-1)*abs(incx))` if `upper(trans) == 'N', # `(1+(m+1)*abs(incx))` otherwise. # incx : int # Spacing between elements of `x`. Must be nonzero. # beta : ${a_type} # `y` is multiplied by this quantity. If zero, `y` is ignored. # y : ctypes.c_void_p # Pointer to ${precision} array of length at least # `(1+(m+1)*abs(incy))` if `upper(trans)` == `N`, # `(1+(n+1)*abs(incy))` otherwise. # incy : int # Spacing between elements of `y`. Must be nonzero. # Examples # -------- # >>> import pycuda.autoinit # >>> import pycuda.gpuarray as gpuarray # >>> import numpy as np # >>> a = np.random.rand(2, 3).astype(np.float32) # >>> x = np.random.rand(3, 1).astype(np.float32) # >>> a_gpu = gpuarray.to_gpu(a.T.copy()) # >>> x_gpu = gpuarray.to_gpu(x) # >>> y_gpu = gpuarray.empty((2, 1), np.float32) # >>> alpha = np.float32(1.0) # >>> beta = np.float32(0) # >>> h = cublasCreate() # >>> ${func}(h, 'n', 2, 3, alpha, a_gpu.gpudata, 2, x_gpu.gpudata, 1, beta, y_gpu.gpudata, 1) # >>> cublasDestroy(h) # >>> np.allclose(y_gpu.get(), np.dot(a, x)) # True # """ _libcublas.cublasSgemv_v2.restype = int _libcublas.cublasSgemv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasSgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy): """ Matrix-vector product for real single precision general matrix. References ---------- `cublasgemv `_ """ status = _libcublas.cublasSgemv_v2(handle, _CUBLAS_OP[trans], m, n, ctypes.byref(ctypes.c_float(alpha)), int(A), lda, int(x), incx, ctypes.byref(ctypes.c_float(beta)), int(y), incy) cublasCheckStatus(status) _libcublas.cublasDgemv_v2.restype = int _libcublas.cublasDgemv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasDgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy): """ Matrix-vector product for real double precision general matrix. References ---------- `cublasgemv `_ """ status = _libcublas.cublasDgemv_v2(handle, _CUBLAS_OP[trans], m, n, ctypes.byref(ctypes.c_double(alpha)), int(A), lda, int(x), incx, ctypes.byref(ctypes.c_double(beta)), int(y), incy) cublasCheckStatus(status) _libcublas.cublasCgemv_v2.restype = int _libcublas.cublasCgemv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasCgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy): """ Matrix-vector product for complex single precision general matrix. References ---------- `cublasgemv `_ """ status = _libcublas.cublasCgemv_v2(handle, _CUBLAS_OP[trans], m, n, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(A), lda, int(x), incx, ctypes.byref(cuda.cuFloatComplex(beta.real, beta.imag)), int(y), incy) cublasCheckStatus(status) _libcublas.cublasZgemv_v2.restype = int _libcublas.cublasZgemv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasZgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy): """ Matrix-vector product for complex double precision general matrix. 
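
    Examples
    --------
    Illustrative sketch (assumes a CUDA context via `pycuda.autoinit`);
    the matrix is passed in column-major order by copying its transpose:

    >>> import pycuda.autoinit
    >>> import pycuda.gpuarray as gpuarray
    >>> import numpy as np
    >>> a = (np.random.rand(2, 3)+1j*np.random.rand(2, 3)).astype(np.complex128)
    >>> x = (np.random.rand(3, 1)+1j*np.random.rand(3, 1)).astype(np.complex128)
    >>> a_gpu = gpuarray.to_gpu(a.T.copy())
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> y_gpu = gpuarray.empty((2, 1), np.complex128)
    >>> alpha = np.complex128(1.0)
    >>> beta = np.complex128(0.0)
    >>> h = cublasCreate()
    >>> cublasZgemv(h, 'n', 2, 3, alpha, a_gpu.gpudata, 2, x_gpu.gpudata, 1, beta, y_gpu.gpudata, 1)
    >>> cublasDestroy(h)
    >>> np.allclose(y_gpu.get(), np.dot(a, x))
    True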
References ---------- `cublasgemv `_ """ status = _libcublas.cublasZgemv_v2(handle, _CUBLAS_OP[trans], m, n, ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(A), lda, int(x), incx, ctypes.byref(cuda.cuDoubleComplex(beta.real, beta.imag)), int(y), incy) cublasCheckStatus(status) # SGER, DGER, CGERU, CGERC, ZGERU, ZGERC _libcublas.cublasSger_v2.restype = int _libcublas.cublasSger_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasSger(handle, m, n, alpha, x, incx, y, incy, A, lda): """ Rank-1 operation on real single precision general matrix. References ---------- `cublasger `_ """ status = _libcublas.cublasSger_v2(handle, m, n, ctypes.byref(ctypes.c_float(alpha)), int(x), incx, int(y), incy, int(A), lda) cublasCheckStatus(status) _libcublas.cublasDger_v2.restype = int _libcublas.cublasDger_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasDger(handle, m, n, alpha, x, incx, y, incy, A, lda): """ Rank-1 operation on real double precision general matrix. References ---------- `cublasger `_ """ status = _libcublas.cublasDger_v2(handle, m, n, ctypes.byref(ctypes.c_double(alpha)), int(x), incx, int(y), incy, int(A), lda) cublasCheckStatus(status) _libcublas.cublasCgerc_v2.restype = int _libcublas.cublasCgerc_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasCgerc(handle, m, n, alpha, x, incx, y, incy, A, lda): """ Rank-1 operation on complex single precision general matrix. References ---------- `cublasger `_ """ status = _libcublas.cublasCgerc_v2(handle, m, n, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(x), incx, int(y), incy, int(A), lda) cublasCheckStatus(status) _libcublas.cublasCgeru_v2.restype = int _libcublas.cublasCgeru_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasCgeru(handle, m, n, alpha, x, incx, y, incy, A, lda): """ Rank-1 operation on complex single precision general matrix. References ---------- `cublasger `_ """ status = _libcublas.cublasCgeru_v2(handle, m, n, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(x), incx, int(y), incy, int(A), lda) cublasCheckStatus(status) _libcublas.cublasZgerc_v2.restype = int _libcublas.cublasZgerc_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasZgerc(handle, m, n, alpha, x, incx, y, incy, A, lda): """ Rank-1 operation on complex double precision general matrix. References ---------- `cublasger `_ """ status = _libcublas.cublasZgerc_v2(handle, m, n, ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(x), incx, int(y), incy, int(A), lda) cublasCheckStatus(status) _libcublas.cublasZgeru_v2.restype = int _libcublas.cublasZgeru_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasZgeru(handle, m, n, alpha, x, incx, y, incy, A, lda): """ Rank-1 operation on complex double precision general matrix. 
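
    Examples
    --------
    Illustrative sketch (assumes a CUDA context via `pycuda.autoinit`);
    the `(3, 2)` array below holds the 2 x 3 result in column-major order:

    >>> import pycuda.autoinit
    >>> import pycuda.gpuarray as gpuarray
    >>> import numpy as np
    >>> x = (np.random.rand(2)+1j*np.random.rand(2)).astype(np.complex128)
    >>> y = (np.random.rand(3)+1j*np.random.rand(3)).astype(np.complex128)
    >>> alpha = np.complex128(np.random.rand()+1j*np.random.rand())
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> y_gpu = gpuarray.to_gpu(y)
    >>> a_gpu = gpuarray.to_gpu(np.zeros((3, 2), np.complex128))
    >>> h = cublasCreate()
    >>> cublasZgeru(h, 2, 3, alpha, x_gpu.gpudata, 1, y_gpu.gpudata, 1, a_gpu.gpudata, 2)
    >>> cublasDestroy(h)
    >>> np.allclose(a_gpu.get().T, alpha*np.outer(x, y))
    True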
    References
    ----------
    `cublasger `_
    """

    status = _libcublas.cublasZgeru_v2(handle, m, n,
                                       ctypes.byref(cuda.cuDoubleComplex(alpha.real,
                                                                         alpha.imag)),
                                       int(x), incx, int(y), incy,
                                       int(A), lda)
    cublasCheckStatus(status)

# SSBMV, DSBMV
_libcublas.cublasSsbmv_v2.restype = int
_libcublas.cublasSsbmv_v2.argtypes = [_types.handle,
                                      ctypes.c_int,
                                      ctypes.c_int, ctypes.c_int,
                                      ctypes.c_void_p, ctypes.c_void_p,
                                      ctypes.c_int, ctypes.c_void_p,
                                      ctypes.c_int, ctypes.c_void_p,
                                      ctypes.c_void_p, ctypes.c_int]
def cublasSsbmv(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy):
    """
    Matrix-vector product for real single precision symmetric-banded matrix.

    References
    ----------
    `cublassbmv `_
    """

    status = _libcublas.cublasSsbmv_v2(handle,
                                       _CUBLAS_FILL_MODE[uplo],
                                       n, k,
                                       ctypes.byref(ctypes.c_float(alpha)),
                                       int(A), lda,
                                       int(x), incx,
                                       ctypes.byref(ctypes.c_float(beta)),
                                       int(y), incy)
    cublasCheckStatus(status)

_libcublas.cublasDsbmv_v2.restype = int
_libcublas.cublasDsbmv_v2.argtypes = [_types.handle,
                                      ctypes.c_int,
                                      ctypes.c_int, ctypes.c_int,
                                      ctypes.c_void_p, ctypes.c_void_p,
                                      ctypes.c_int, ctypes.c_void_p,
                                      ctypes.c_int, ctypes.c_void_p,
                                      ctypes.c_void_p, ctypes.c_int]
def cublasDsbmv(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy):
    """
    Matrix-vector product for real double precision symmetric-banded matrix.

    References
    ----------
    `cublassbmv `_
    """

    status = _libcublas.cublasDsbmv_v2(handle,
                                       _CUBLAS_FILL_MODE[uplo],
                                       n, k,
                                       ctypes.byref(ctypes.c_double(alpha)),
                                       int(A), lda,
                                       int(x), incx,
                                       ctypes.byref(ctypes.c_double(beta)),
                                       int(y), incy)
    cublasCheckStatus(status)

# SSPMV, DSPMV
_libcublas.cublasSspmv_v2.restype = int
_libcublas.cublasSspmv_v2.argtypes = [_types.handle,
                                      ctypes.c_int,
                                      ctypes.c_int,
                                      ctypes.c_void_p, ctypes.c_void_p,
                                      ctypes.c_void_p, ctypes.c_int,
                                      ctypes.c_void_p, ctypes.c_void_p,
                                      ctypes.c_int]
def cublasSspmv(handle, uplo, n, alpha, AP, x, incx, beta, y, incy):
    """
    Matrix-vector product for real single precision symmetric packed matrix.

    References
    ----------
    `cublasspmv `_
    """

    status = _libcublas.cublasSspmv_v2(handle,
                                       _CUBLAS_FILL_MODE[uplo],
                                       n,
                                       ctypes.byref(ctypes.c_float(alpha)),
                                       int(AP),
                                       int(x), incx,
                                       ctypes.byref(ctypes.c_float(beta)),
                                       int(y), incy)
    cublasCheckStatus(status)

_libcublas.cublasDspmv_v2.restype = int
_libcublas.cublasDspmv_v2.argtypes = [_types.handle,
                                      ctypes.c_int,
                                      ctypes.c_int,
                                      ctypes.c_void_p, ctypes.c_void_p,
                                      ctypes.c_void_p, ctypes.c_int,
                                      ctypes.c_void_p, ctypes.c_void_p,
                                      ctypes.c_int]
def cublasDspmv(handle, uplo, n, alpha, AP, x, incx, beta, y, incy):
    """
    Matrix-vector product for real double precision symmetric packed matrix.

    References
    ----------
    `cublasspmv `_
    """

    status = _libcublas.cublasDspmv_v2(handle,
                                       _CUBLAS_FILL_MODE[uplo],
                                       n,
                                       ctypes.byref(ctypes.c_double(alpha)),
                                       int(AP),
                                       int(x), incx,
                                       ctypes.byref(ctypes.c_double(beta)),
                                       int(y), incy)
    cublasCheckStatus(status)

# SSPR, DSPR
_libcublas.cublasSspr_v2.restype = int
_libcublas.cublasSspr_v2.argtypes = [_types.handle,
                                     ctypes.c_int,
                                     ctypes.c_int,
                                     ctypes.c_void_p, ctypes.c_void_p,
                                     ctypes.c_int, ctypes.c_void_p]
def cublasSspr(handle, uplo, n, alpha, x, incx, AP):
    """
    Rank-1 operation on real single precision symmetric packed matrix.
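
    Examples
    --------
    Illustrative sketch; it assumes that `'l'` selects the lower triangle
    and that `AP` holds that triangle packed column by column (for this
    symmetric update the packed result equals the row-major upper triangle
    used in the check):

    >>> import pycuda.autoinit
    >>> import pycuda.gpuarray as gpuarray
    >>> import numpy as np
    >>> x = np.random.rand(3).astype(np.float32)
    >>> alpha = np.float32(np.random.rand())
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> ap_gpu = gpuarray.zeros(6, np.float32)
    >>> h = cublasCreate()
    >>> cublasSspr(h, 'l', 3, alpha, x_gpu.gpudata, 1, ap_gpu.gpudata)
    >>> cublasDestroy(h)
    >>> np.allclose(ap_gpu.get(), alpha*np.outer(x, x)[np.triu_indices(3)])
    True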
References ---------- `cublasspr `_ """ status = _libcublas.cublasSspr_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(ctypes.c_float(alpha)), int(x), incx, int(AP)) cublasCheckStatus(status) _libcublas.cublasDspr_v2.restype = int _libcublas.cublasDspr_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasDspr(handle, uplo, n, alpha, x, incx, AP): """ Rank-1 operation on real double precision symmetric packed matrix. References ---------- `cublasspr `_ """ status = _libcublas.cublasDspr_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(ctypes.c_double(alpha)), int(x), incx, int(AP)) cublasCheckStatus(status) # SSPR2, DSPR2 _libcublas.cublasSspr2_v2.restype = int _libcublas.cublasSspr2_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasSspr2(handle, uplo, n, alpha, x, incx, y, incy, AP): """ Rank-2 operation on real single precision symmetric packed matrix. References ---------- `cublasspr2 `_ """ status = _libcublas.cublasSspr2_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(ctypes.c_float(alpha)), int(x), incx, int(y), incy, int(AP)) cublasCheckStatus(status) _libcublas.cublasDspr2_v2.restype = int _libcublas.cublasDspr2_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasDspr2(handle, uplo, n, alpha, x, incx, y, incy, AP): """ Rank-2 operation on real double precision symmetric packed matrix. References ---------- `cublasspr2 `_ """ status = _libcublas.cublasDspr2_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(ctypes.c_double(alpha)), int(x), incx, int(y), incy, int(AP)) cublasCheckStatus(status) # SSYMV, DSYMV, CSYMV, ZSYMV _libcublas.cublasSsymv_v2.restype = int _libcublas.cublasSsymv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasSsymv(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy): """ Matrix-vector product for real symmetric matrix. References ---------- `cublassymv `_ """ status = _libcublas.cublasSsymv_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(ctypes.c_float(alpha)), int(A), lda, int(x), incx, ctypes.byref(ctypes.c_float(beta)), int(y), incy) cublasCheckStatus(status) _libcublas.cublasDsymv_v2.restype = int _libcublas.cublasDsymv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasDsymv(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy): """ Matrix-vector product for real double precision symmetric matrix. 
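
    Examples
    --------
    Illustrative sketch; `'l'` is assumed to select the lower triangle,
    and because `a` is symmetric its row-major buffer can be passed
    directly as a column-major matrix:

    >>> import pycuda.autoinit
    >>> import pycuda.gpuarray as gpuarray
    >>> import numpy as np
    >>> a = np.random.rand(3, 3).astype(np.float64)
    >>> a = a+a.T
    >>> x = np.random.rand(3).astype(np.float64)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> y_gpu = gpuarray.zeros(3, np.float64)
    >>> alpha = np.float64(1.0)
    >>> beta = np.float64(0.0)
    >>> h = cublasCreate()
    >>> cublasDsymv(h, 'l', 3, alpha, a_gpu.gpudata, 3, x_gpu.gpudata, 1, beta, y_gpu.gpudata, 1)
    >>> cublasDestroy(h)
    >>> np.allclose(y_gpu.get(), alpha*np.dot(a, x))
    True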
References ---------- `cublassymv `_ """ status = _libcublas.cublasDsymv_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(ctypes.c_double(alpha)), int(A), lda, int(x), incx, ctypes.byref(ctypes.c_double(beta)), int(y), incy) cublasCheckStatus(status) if _cublas_version >= 5000: _libcublas.cublasCsymv_v2.restype = int _libcublas.cublasCsymv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] @_cublas_version_req(5.0) def cublasCsymv(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy): """ Matrix-vector product for complex single precision symmetric matrix. References ---------- `cublassymv `_ """ status = _libcublas.cublasCsymv_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(A), lda, int(x), incx, ctypes.byref(cuda.cuFloatComplex(beta.real, beta.imag)), int(y), incy) cublasCheckStatus(status) if _cublas_version >= 5000: _libcublas.cublasZsymv_v2.restype = int _libcublas.cublasZsymv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] @_cublas_version_req(5.0) def cublasZsymv(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy): """ Matrix-vector product for complex double precision symmetric matrix. References ---------- `cublassymv `_ """ status = _libcublas.cublasZsymv_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(A), lda, int(x), incx, ctypes.byref(cuda.cuDoubleComplex(beta.real, beta.imag)), int(y), incy) cublasCheckStatus(status) # SSYR, DSYR, CSYR, ZSYR _libcublas.cublasSsyr_v2.restype = int _libcublas.cublasSsyr_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasSsyr(handle, uplo, n, alpha, x, incx, A, lda): """ Rank-1 operation on real single precision symmetric matrix. References ---------- `cublassyr `_ """ status = _libcublas.cublasSsyr_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(ctypes.c_float(alpha)), int(x), incx, int(A), lda) cublasCheckStatus(status) _libcublas.cublasDsyr_v2.restype = int _libcublas.cublasDsyr_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasDsyr(handle, uplo, n, alpha, x, incx, A, lda): """ Rank-1 operation on real double precision symmetric matrix. References ---------- `cublassyr `_ """ status = _libcublas.cublasDsyr_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(ctypes.c_double(alpha)), int(x), incx, int(A), lda) cublasCheckStatus(status) if _cublas_version >= 5000: _libcublas.cublasCsyr_v2.restype = int _libcublas.cublasCsyr_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] @_cublas_version_req(5.0) def cublasCsyr(handle, uplo, n, alpha, x, incx, A, lda): """ Rank-1 operation on complex single precision symmetric matrix. 
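Examples
--------
A minimal usage sketch (illustrative only), assuming pycuda and CUBLAS >= 5.0.
Only the triangle selected by `uplo` is updated; with a C-ordered (row-major)
array, the 'l' triangle of the column-major view seen by CUBLAS corresponds
to numpy's upper triangle, so only that part is compared below.

>>> import pycuda.autoinit
>>> import pycuda.gpuarray as gpuarray
>>> import numpy as np
>>> a = (np.random.rand(3, 3) + 1j*np.random.rand(3, 3)).astype(np.complex64)
>>> a = a + a.T   # symmetric (not Hermitian)
>>> x = (np.random.rand(3) + 1j*np.random.rand(3)).astype(np.complex64)
>>> alpha = np.complex64(1.0 + 1.0j)
>>> a_gpu = gpuarray.to_gpu(a)
>>> x_gpu = gpuarray.to_gpu(x)
>>> h = cublasCreate()
>>> cublasCsyr(h, 'l', 3, alpha, x_gpu.gpudata, 1, a_gpu.gpudata, 3)
>>> np.allclose(np.triu(a_gpu.get()), np.triu(a + alpha*np.outer(x, x)))
True
>>> cublasDestroy(h)
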
References ---------- `cublassyr `_ """ status = _libcublas.cublasCsyr_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(x), incx, int(A), lda) cublasCheckStatus(status) if _cublas_version >= 5000: _libcublas.cublasZsyr_v2.restype = int _libcublas.cublasZsyr_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] @_cublas_version_req(5.0) def cublasZsyr(handle, uplo, n, alpha, x, incx, A, lda): """ Rank-1 operation on complex double precision symmetric matrix. References ---------- `cublassyr `_ """ status = _libcublas.cublasZsyr_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(x), incx, int(A), lda) cublasCheckStatus(status) # SSYR2, DSYR2, CSYR2, ZSYR2 _libcublas.cublasSsyr2_v2.restype = int _libcublas.cublasSsyr2_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasSsyr2(handle, uplo, n, alpha, x, incx, y, incy, A, lda): """ Rank-2 operation on real single precision symmetric matrix. References ---------- `cublassyr2 `_ """ status = _libcublas.cublasSsyr2_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(ctypes.c_float(alpha)), int(x), incx, int(y), incy, int(A), lda) cublasCheckStatus(status) _libcublas.cublasDsyr2_v2.restype = int _libcublas.cublasDsyr2_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasDsyr2(handle, uplo, n, alpha, x, incx, y, incy, A, lda): """ Rank-2 operation on real double precision symmetric matrix. References ---------- `cublassyr2 `_ """ status = _libcublas.cublasDsyr2_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(ctypes.c_double(alpha)), int(x), incx, int(y), incy, int(A), lda) cublasCheckStatus(status) if _cublas_version >= 5000: _libcublas.cublasCsyr2_v2.restype = int _libcublas.cublasCsyr2_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] @_cublas_version_req(5.0) def cublasCsyr2(handle, uplo, n, alpha, x, incx, y, incy, A, lda): """ Rank-2 operation on complex single precision symmetric matrix. References ---------- `cublassyr2 `_ """ status = _libcublas.cublasCsyr2_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(x), incx, int(y), incy, int(A), lda) cublasCheckStatus(status) if _cublas_version >= 5000: _libcublas.cublasZsyr2_v2.restype = int _libcublas.cublasZsyr2_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] @_cublas_version_req(5.0) def cublasZsyr2(handle, uplo, n, alpha, x, incx, y, incy, A, lda): """ Rank-2 operation on complex double precision symmetric matrix. 
References ---------- `cublassyr2 `_ """ status = _libcublas.cublasZsyr2_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(x), incx, int(y), incy, int(A), lda) cublasCheckStatus(status) # STBMV, DTBMV, CTBMV, ZTBMV _libcublas.cublasStbmv_v2.restype = int _libcublas.cublasStbmv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasStbmv(handle, uplo, trans, diag, n, k, A, lda, x, incx): """ Matrix-vector product for real single precision triangular banded matrix. References ---------- `cublastbmv `_ """ status = _libcublas.cublasStbmv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, k, int(A), lda, int(x), incx) cublasCheckStatus(status) _libcublas.cublasDtbmv_v2.restype = int _libcublas.cublasDtbmv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasDtbmv(handle, uplo, trans, diag, n, k, A, lda, x, incx): """ Matrix-vector product for real double precision triangular banded matrix. References ---------- `cublastbmv `_ """ status = _libcublas.cublasDtbmv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, k, int(A), lda, int(x), incx) cublasCheckStatus(status) _libcublas.cublasCtbmv_v2.restype = int _libcublas.cublasCtbmv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasCtbmv(handle, uplo, trans, diag, n, k, A, lda, x, incx): """ Matrix-vector product for complex single precision triangular banded matrix. References ---------- `cublastbmv `_ """ status = _libcublas.cublasCtbmv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, k, int(A), lda, int(x), incx) cublasCheckStatus(status) _libcublas.cublasZtbmv_v2.restype = int _libcublas.cublasZtbmv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasZtbmv(handle, uplo, trans, diag, n, k, A, lda, x, incx): """ Matrix-vector product for complex double triangular banded matrix. References ---------- `cublastbmv `_ """ status = _libcublas.cublasZtbmv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, k, int(A), lda, int(x), incx) cublasCheckStatus(status) # STBSV, DTBSV, CTBSV, ZTBSV _libcublas.cublasStbsv_v2.restype = int _libcublas.cublasStbsv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasStbsv(handle, uplo, trans, diag, n, k, A, lda, x, incx): """ Solve real single precision triangular banded system with one right-hand side. 
References ---------- `cublastbsv `_ """ status = _libcublas.cublasStbsv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, k, int(A), lda, int(x), incx) cublasCheckStatus(status) _libcublas.cublasDtbsv_v2.restype = int _libcublas.cublasDtbsv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasDtbsv(handle, uplo, trans, diag, n, k, A, lda, x, incx): """ Solve real double precision triangular banded system with one right-hand side. References ---------- `cublastbsv `_ """ status = _libcublas.cublasDtbsv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, k, int(A), lda, int(x), incx) cublasCheckStatus(status) _libcublas.cublasCtbsv_v2.restype = int _libcublas.cublasCtbsv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasCtbsv(handle, uplo, trans, diag, n, k, A, lda, x, incx): """ Solve complex single precision triangular banded system with one right-hand side. References ---------- `cublastbsv `_ """ status = _libcublas.cublasCtbsv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, k, int(A), lda, int(x), incx) cublasCheckStatus(status) _libcublas.cublasZtbsv_v2.restype = int _libcublas.cublasZtbsv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasZtbsv(handle, uplo, trans, diag, n, k, A, lda, x, incx): """ Solve complex double precision triangular banded system with one right-hand side. References ---------- `cublastbsv `_ """ status = _libcublas.cublasZtbsv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, k, int(A), lda, int(x), incx) cublasCheckStatus(status) # STPMV, DTPMV, CTPMV, ZTPMV _libcublas.cublasStpmv_v2.restype = int _libcublas.cublasStpmv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasStpmv(handle, uplo, trans, diag, n, AP, x, incx): """ Matrix-vector product for real single precision triangular packed matrix. References ---------- `cublastpmv `_ """ status = _libcublas.cublasStpmv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, int(AP), int(x), incx) cublasCheckStatus(status) _libcublas.cublasCtpmv_v2.restype = int _libcublas.cublasCtpmv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasCtpmv(handle, uplo, trans, diag, n, AP, x, incx): """ Matrix-vector product for complex single precision triangular packed matrix. References ---------- `cublastpmv `_ """ status = _libcublas.cublasCtpmv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, int(AP), int(x), incx) cublasCheckStatus(status) _libcublas.cublasDtpmv_v2.restype = int _libcublas.cublasDtpmv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasDtpmv(handle, uplo, trans, diag, n, AP, x, incx): """ Matrix-vector product for real double precision triangular packed matrix. 
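Examples
--------
A minimal usage sketch (illustrative only), assuming pycuda is installed.
The packed array is built column by column from the lower triangle, which is
the layout CUBLAS expects for `uplo` = 'l'; the result overwrites `x`.

>>> import pycuda.autoinit
>>> import pycuda.gpuarray as gpuarray
>>> import numpy as np
>>> a = np.tril(np.random.rand(4, 4))   # lower triangular, double precision
>>> ap = np.concatenate([a[j:, j] for j in range(4)])
>>> x = np.random.rand(4)
>>> ap_gpu = gpuarray.to_gpu(ap)
>>> x_gpu = gpuarray.to_gpu(x)
>>> h = cublasCreate()
>>> cublasDtpmv(h, 'l', 'n', 'n', 4, ap_gpu.gpudata, x_gpu.gpudata, 1)
>>> np.allclose(x_gpu.get(), np.dot(a, x))
True
>>> cublasDestroy(h)
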
References ---------- `cublastpmv `_ """ status = _libcublas.cublasDtpmv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, int(AP), int(x), incx) cublasCheckStatus(status) _libcublas.cublasZtpmv_v2.restype = int _libcublas.cublasZtpmv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasZtpmv(handle, uplo, trans, diag, n, AP, x, incx): """ Matrix-vector product for complex double precision triangular packed matrix. References ---------- `cublastpmv `_ """ status = _libcublas.cublasZtpmv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, int(AP), int(x), incx) cublasCheckStatus(status) # STPSV, DTPSV, CTPSV, ZTPSV _libcublas.cublasStpsv_v2.restype = int _libcublas.cublasStpsv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasStpsv(handle, uplo, trans, diag, n, AP, x, incx): """ Solve real triangular packed system with one right-hand side. References ---------- `cublastpsv `_ """ status = _libcublas.cublasStpsv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, int(AP), int(x), incx) cublasCheckStatus(status) _libcublas.cublasDtpsv_v2.restype = int _libcublas.cublasDtpsv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasDtpsv(handle, uplo, trans, diag, n, AP, x, incx): """ Solve real double precision triangular packed system with one right-hand side. References ---------- `cublastpsv `_ """ status = _libcublas.cublasDtpsv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, int(AP), int(x), incx) cublasCheckStatus(status) _libcublas.cublasCtpsv_v2.restype = int _libcublas.cublasCtpsv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasCtpsv(handle, uplo, trans, diag, n, AP, x, incx): """ Solve complex single precision triangular packed system with one right-hand side. References ---------- `cublastpsv `_ """ status = _libcublas.cublasCtpsv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, int(AP), int(x), incx) cublasCheckStatus(status) _libcublas.cublasZtpsv_v2.restype = int _libcublas.cublasZtpsv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasZtpsv(handle, uplo, trans, diag, n, AP, x, incx): """ Solve complex double precision triangular packed system with one right-hand size. References ---------- `cublastpsv `_ """ status = _libcublas.cublasZtpsv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, int(AP), int(x), incx) cublasCheckStatus(status) # STRMV, DTRMV, CTRMV, ZTRMV _libcublas.cublasStrmv_v2.restype = int _libcublas.cublasStrmv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasStrmv(handle, uplo, trans, diag, n, A, lda, x, inx): """ Matrix-vector product for real single precision triangular matrix. 
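Examples
--------
A minimal usage sketch (illustrative only), assuming pycuda is installed.
A C-ordered (row-major) lower triangular array appears upper triangular to
CUBLAS, so `uplo` = 'u' together with `trans` = 't' yields the product with
the original, untransposed matrix; the result overwrites `x`.

>>> import pycuda.autoinit
>>> import pycuda.gpuarray as gpuarray
>>> import numpy as np
>>> a = np.tril(np.random.rand(4, 4)).astype(np.float32)
>>> x = np.random.rand(4).astype(np.float32)
>>> a_gpu = gpuarray.to_gpu(a)
>>> x_gpu = gpuarray.to_gpu(x)
>>> h = cublasCreate()
>>> cublasStrmv(h, 'u', 't', 'n', 4, a_gpu.gpudata, 4, x_gpu.gpudata, 1)
>>> np.allclose(x_gpu.get(), np.dot(a, x))
True
>>> cublasDestroy(h)
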
References ---------- `cublastrmv `_ """ status = _libcublas.cublasStrmv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, int(A), lda, int(x), inx) cublasCheckStatus(status) _libcublas.cublasCtrmv_v2.restype = int _libcublas.cublasCtrmv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasCtrmv(handle, uplo, trans, diag, n, A, lda, x, incx): """ Matrix-vector product for complex single precision triangular matrix. References ---------- `cublastrmv `_ """ status = _libcublas.cublasCtrmv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, int(A), lda, int(x), incx) cublasCheckStatus(status) _libcublas.cublasDtrmv_v2.restype = int _libcublas.cublasDtrmv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasDtrmv(handle, uplo, trans, diag, n, A, lda, x, inx): """ Matrix-vector product for real double precision triangular matrix. References ---------- `cublastrmv `_ """ status = _libcublas.cublasDtrmv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, int(A), lda, int(x), inx) cublasCheckStatus(status) _libcublas.cublasZtrmv_v2.restype = int _libcublas.cublasZtrmv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasZtrmv(handle, uplo, trans, diag, n, A, lda, x, incx): """ Matrix-vector product for complex double precision triangular matrix. References ---------- `cublastrmv `_ """ status = _libcublas.cublasZtrmv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, int(A), lda, int(x), incx) cublasCheckStatus(status) # STRSV, DTRSV, CTRSV, ZTRSV _libcublas.cublasStrsv_v2.restype = int _libcublas.cublasStrsv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasStrsv(handle, uplo, trans, diag, n, A, lda, x, incx): """ Solve real triangular system with one right-hand side. References ---------- `cublastrsv `_ """ status = _libcublas.cublasStrsv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, int(A), lda, int(x), incx) cublasCheckStatus(status) _libcublas.cublasDtrsv_v2.restype = int _libcublas.cublasDtrsv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasDtrsv(handle, uplo, trans, diag, n, A, lda, x, incx): """ Solve real double precision triangular system with one right-hand side. References ---------- `cublastrsv `_ """ status = _libcublas.cublasDtrsv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, int(A), lda, int(x), incx) cublasCheckStatus(status) _libcublas.cublasCtrsv_v2.restype = int _libcublas.cublasCtrsv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasCtrsv(handle, uplo, trans, diag, n, A, lda, x, incx): """ Solve complex single precision triangular system with one right-hand side. 
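Examples
--------
A minimal usage sketch (illustrative only), assuming pycuda is installed.
As in the triangular matrix-vector product, a row-major lower triangular
array is passed with `uplo` = 'u' and `trans` = 't' so that the original
(untransposed) system is solved; the solution overwrites the right-hand side.

>>> import pycuda.autoinit
>>> import pycuda.gpuarray as gpuarray
>>> import numpy as np
>>> a = (np.tril(np.random.rand(4, 4) + 1j*np.random.rand(4, 4)) + 4*np.eye(4)).astype(np.complex64)
>>> x = (np.random.rand(4) + 1j*np.random.rand(4)).astype(np.complex64)
>>> b = np.dot(a, x)
>>> a_gpu = gpuarray.to_gpu(a)
>>> b_gpu = gpuarray.to_gpu(b)
>>> h = cublasCreate()
>>> cublasCtrsv(h, 'u', 't', 'n', 4, a_gpu.gpudata, 4, b_gpu.gpudata, 1)
>>> np.allclose(b_gpu.get(), x, atol=1e-5)
True
>>> cublasDestroy(h)
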
References ---------- `cublastrsv `_ """ status = _libcublas.cublasCtrsv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, int(A), lda, int(x), incx) cublasCheckStatus(status) _libcublas.cublasZtrsv_v2.restype = int _libcublas.cublasZtrsv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasZtrsv(handle, uplo, trans, diag, n, A, lda, x, incx): """ Solve complex double precision triangular system with one right-hand side. References ---------- `cublastrsv `_ """ status = _libcublas.cublasZtrsv_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], n, int(A), lda, int(x), incx) cublasCheckStatus(status) # CHEMV, ZHEMV _libcublas.cublasChemv_v2.restype = int _libcublas.cublasChemv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasChemv(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy): """ Matrix vector product for single precision Hermitian matrix. References ---------- `cublashemv `_ """ status = _libcublas.cublasChemv_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(A), lda, int(x), incx, ctypes.byref(cuda.cuFloatComplex(beta.real, beta.imag)), int(y), incy) cublasCheckStatus(status) _libcublas.cublasZhemv_v2.restype = int _libcublas.cublasZhemv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasZhemv(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy): """ Matrix-vector product for double precision Hermitian matrix. References ---------- `cublashemv `_ """ status = _libcublas.cublasZhemv_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(A), lda, int(x), incx, ctypes.byref(cuda.cuDoubleComplex(beta.real, beta.imag)), int(y), incy) cublasCheckStatus(status) # CHBMV, ZHBMV _libcublas.cublasChbmv_v2.restype = int _libcublas.cublasChbmv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasChbmv(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy): """ Matrix-vector product for single precision Hermitian banded matrix. References ---------- `cublashbmv `_ """ status = _libcublas.cublasChbmv_v2(handle, _CUBLAS_FILL_MODE[uplo], n, k, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(A), lda, int(x), incx, ctypes.byref(cuda.cuFloatComplex(beta.real, beta.imag)), int(y), incy) cublasCheckStatus(status) _libcublas.cublasZhbmv_v2.restype = int _libcublas.cublasZhbmv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasZhbmv(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy): """ Matrix-vector product for double precision Hermitian banded matrix. 
References ---------- `cublashbmv `_ """ status = _libcublas.cublasZhbmv_v2(handle, _CUBLAS_FILL_MODE[uplo], n, k, ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(A), lda, int(x), incx, ctypes.byref(cuda.cuDoubleComplex(beta.real, beta.imag)), int(y), incy) cublasCheckStatus(status) # CHPMV, ZHPMV _libcublas.cublasChpmv_v2.restype = int _libcublas.cublasChpmv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasChpmv(handle, uplo, n, alpha, AP, x, incx, beta, y, incy): """ Matrix-vector product for single precision Hermitian packed matrix. References ---------- `cublashpmv `_ """ status = _libcublas.cublasChpmv_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(AP), int(x), incx, ctypes.byref(cuda.cuFloatComplex(beta.real, beta.imag)), int(y), incy) cublasCheckStatus(status) _libcublas.cublasZhpmv_v2.restype = int _libcublas.cublasZhpmv_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasZhpmv(handle, uplo, n, alpha, AP, x, incx, beta, y, incy): """ Matrix-vector product for double precision Hermitian packed matrix. References ---------- `cublashpmv `_ """ status = _libcublas.cublasZhpmv_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(AP), int(x), incx, ctypes.byref(cuda.cuDoubleComplex(beta.real, beta.imag)), int(y), incy) cublasCheckStatus(status) # CHER, ZHER _libcublas.cublasCher_v2.restype = int _libcublas.cublasCher_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasCher(handle, uplo, n, alpha, x, incx, A, lda): """ Rank-1 operation on single precision Hermitian matrix. References ---------- `cublasher `_ """ status = _libcublas.cublasCher_v2(handle, _CUBLAS_FILL_MODE[uplo], n, alpha, int(x), incx, int(A), lda) cublasCheckStatus(status) _libcublas.cublasZher_v2.restype = int _libcublas.cublasZher_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasZher(handle, uplo, n, alpha, x, incx, A, lda): """ Rank-1 operation on double precision Hermitian matrix. References ---------- `cublasher `_ """ status = _libcublas.cublasZher_v2(handle, _CUBLAS_FILL_MODE[uplo], n, alpha, int(x), incx, int(A), lda) cublasCheckStatus(status) # CHER2, ZHER2 _libcublas.cublasCher2_v2.restype = int _libcublas.cublasCher2_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasCher2(handle, uplo, n, alpha, x, incx, y, incy, A, lda): """ Rank-2 operation on single precision Hermitian matrix. 
References ---------- `cublasher2 `_ """ status = _libcublas.cublasCher2_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(x), incx, int(y), incy, int(A), lda) cublasCheckStatus(status) _libcublas.cublasZher2_v2.restype = int _libcublas.cublasZher2_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasZher2(handle, uplo, n, alpha, x, incx, y, incy, A, lda): """ Rank-2 operation on double precision Hermitian matrix. References ---------- `cublasher2 `_ """ status = _libcublas.cublasZher2_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(x), incx, int(y), incy, int(A), lda) cublasCheckStatus(status) # CHPR, ZHPR _libcublas.cublasChpr_v2.restype = int _libcublas.cublasChpr_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasChpr(handle, uplo, n, alpha, x, incx, AP): """ Rank-1 operation on single precision Hermitian packed matrix. References ---------- `cublashpr `_ """ status = _libcublas.cublasChpr_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(ctypes.c_float(alpha)), int(x), incx, int(AP)) cublasCheckStatus(status) _libcublas.cublasZhpr_v2.restype = int _libcublas.cublasZhpr_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasZhpr(handle, uplo, n, alpha, x, incx, AP): """ Rank-1 operation on double precision Hermitian packed matrix. References ---------- `cublashpr `_ """ status = _libcublas.cublasZhpr_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(ctypes.c_double(alpha)), int(x), incx, int(AP)) cublasCheckStatus(status) # CHPR2, ZHPR2 _libcublas.cublasChpr2.restype = int _libcublas.cublasChpr2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasChpr2(handle, uplo, n, alpha, x, inx, y, incy, AP): """ Rank-2 operation on single precision Hermitian packed matrix. References ---------- `cublashpr2 `_ """ status = _libcublas.cublasChpr2_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(x), incx, int(y), incy, int(AP)) cublasCheckStatus(status) _libcublas.cublasZhpr2_v2.restype = int _libcublas.cublasZhpr2_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cublasZhpr2(handle, uplo, n, alpha, x, inx, y, incy, AP): """ Rank-2 operation on double precision Hermitian packed matrix. References ---------- `cublashpr2 `_ """ status = _libcublas.cublasZhpr2_v2(handle, _CUBLAS_FILL_MODE[uplo], n, ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(x), incx, int(y), incy, int(AP)) cublasCheckStatus(status) # SGEMM, CGEMM, DGEMM, ZGEMM _libcublas.cublasSgemm_v2.restype = int _libcublas.cublasSgemm_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc): """ Matrix-matrix product for real single precision general matrix. 
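Examples
--------
A minimal usage sketch (illustrative only), assuming pycuda is installed.
CUBLAS is column-major, so C-ordered numpy arrays are multiplied by swapping
the operands: the GPU computes op(B)*op(A) = (A*B)^T, which reads back as
A*B in row-major order.

>>> import pycuda.autoinit
>>> import pycuda.gpuarray as gpuarray
>>> import numpy as np
>>> a = np.random.rand(4, 2).astype(np.float32)
>>> b = np.random.rand(2, 3).astype(np.float32)
>>> a_gpu = gpuarray.to_gpu(a)
>>> b_gpu = gpuarray.to_gpu(b)
>>> c_gpu = gpuarray.empty((4, 3), np.float32)
>>> h = cublasCreate()
>>> cublasSgemm(h, 'n', 'n', 3, 4, 2, 1.0, b_gpu.gpudata, 3, a_gpu.gpudata, 2, 0.0, c_gpu.gpudata, 3)
>>> np.allclose(c_gpu.get(), np.dot(a, b))
True
>>> cublasDestroy(h)
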
References ---------- `cublasgemm `_ """ status = _libcublas.cublasSgemm_v2(handle, _CUBLAS_OP[transa], _CUBLAS_OP[transb], m, n, k, ctypes.byref(ctypes.c_float(alpha)), int(A), lda, int(B), ldb, ctypes.byref(ctypes.c_float(beta)), int(C), ldc) cublasCheckStatus(status) _libcublas.cublasCgemm_v2.restype = int _libcublas.cublasCgemm_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasCgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc): """ Matrix-matrix product for complex single precision general matrix. References ---------- `cublasgemm `_ """ status = _libcublas.cublasCgemm_v2(handle, _CUBLAS_OP[transa], _CUBLAS_OP[transb], m, n, k, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(A), lda, int(B), ldb, ctypes.byref(cuda.cuFloatComplex(beta.real, beta.imag)), int(C), ldc) cublasCheckStatus(status) _libcublas.cublasDgemm_v2.restype = int _libcublas.cublasDgemm_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc): """ Matrix-matrix product for real double precision general matrix. References ---------- `cublasgemm `_ """ status = _libcublas.cublasDgemm_v2(handle, _CUBLAS_OP[transa], _CUBLAS_OP[transb], m, n, k, ctypes.byref(ctypes.c_double(alpha)), int(A), lda, int(B), ldb, ctypes.byref(ctypes.c_double(beta)), int(C), ldc) cublasCheckStatus(status) _libcublas.cublasZgemm_v2.restype = int _libcublas.cublasZgemm_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasZgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc): """ Matrix-matrix product for complex double precision general matrix. References ---------- `cublasgemm `_ """ status = _libcublas.cublasZgemm_v2(handle, _CUBLAS_OP[transa], _CUBLAS_OP[transb], m, n, k, ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(A), lda, int(B), ldb, ctypes.byref(cuda.cuDoubleComplex(beta.real, beta.imag)), int(C), ldc) cublasCheckStatus(status) # SSYMM, DSYMM, CSYMM, ZSYMM _libcublas.cublasSsymm_v2.restype = int _libcublas.cublasSsymm_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc): """ Matrix-matrix product for real single precision symmetric matrix. 
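Examples
--------
A minimal usage sketch (illustrative only), assuming pycuda is installed.
With row-major arrays the product A*B is obtained by placing the symmetric
matrix on the right-hand side (`side` = 'r'), since CUBLAS sees the buffers
transposed; the symmetric operand itself is unaffected by the transposition.

>>> import pycuda.autoinit
>>> import pycuda.gpuarray as gpuarray
>>> import numpy as np
>>> a = np.random.rand(3, 3).astype(np.float32)
>>> a = a + a.T   # symmetric
>>> b = np.random.rand(3, 2).astype(np.float32)
>>> a_gpu = gpuarray.to_gpu(a)
>>> b_gpu = gpuarray.to_gpu(b)
>>> c_gpu = gpuarray.empty((3, 2), np.float32)
>>> h = cublasCreate()
>>> cublasSsymm(h, 'r', 'l', 2, 3, 1.0, a_gpu.gpudata, 3, b_gpu.gpudata, 2, 0.0, c_gpu.gpudata, 2)
>>> np.allclose(c_gpu.get(), np.dot(a, b))
True
>>> cublasDestroy(h)
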
References ---------- `cublassymm `_ """ status = _libcublas.cublasSsymm_v2(handle, _CUBLAS_SIDE_MODE[side], _CUBLAS_FILL_MODE[uplo], m, n, ctypes.byref(ctypes.c_float(alpha)), int(A), lda, int(B), ldb, ctypes.byref(ctypes.c_float(beta)), int(C), ldc) cublasCheckStatus(status) _libcublas.cublasDsymm_v2.restype = int _libcublas.cublasDsymm_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc): """ Matrix-matrix product for real double precision symmetric matrix. References ---------- `cublassymm `_ """ status = _libcublas.cublasDsymm_v2(handle, _CUBLAS_SIDE_MODE[side], _CUBLAS_FILL_MODE[uplo], m, n, ctypes.byref(ctypes.c_double(alpha)), int(A), lda, int(B), ldb, ctypes.byref(ctypes.c_double(beta)), int(C), ldc) cublasCheckStatus(status) _libcublas.cublasCsymm_v2.restype = int _libcublas.cublasCsymm_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasCsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc): """ Matrix-matrix product for complex single precision symmetric matrix. References ---------- `cublassymm `_ """ status = _libcublas.cublasCsymm_v2(handle, _CUBLAS_SIDE_MODE[side], _CUBLAS_FILL_MODE[uplo], m, n, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(A), lda, int(B), ldb, ctypes.byref(cuda.cuFloatComplex(beta.real, beta.imag)), int(C), ldc) cublasCheckStatus(status) _libcublas.cublasZsymm_v2.restype = int _libcublas.cublasZsymm_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasZsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc): """ Matrix-matrix product for complex double precision symmetric matrix. References ---------- `cublassymm `_ """ status = _libcublas.cublasZsymm_v2(handle, _CUBLAS_SIDE_MODE[side], _CUBLAS_FILL_MODE[uplo], m, n, ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(A), lda, int(B), ldb, ctypes.byref(cuda.cuDoubleComplex(beta.real, beta.imag)), int(C), ldc) cublasCheckStatus(status) # SSYRK, DSYRK, CSYRK, ZSYRK _libcublas.cublasSsyrk_v2.restype = int _libcublas.cublasSsyrk_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasSsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc): """ Rank-k operation on real single precision symmetric matrix. References ---------- `cublassyrk `_ """ status = _libcublas.cublasSsyrk_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], n, k, ctypes.byref(ctypes.c_float(alpha)), int(A), lda, ctypes.byref(ctypes.c_float(beta)), int(C), ldc) cublasCheckStatus(status) _libcublas.cublasDsyrk_v2.restype = int _libcublas.cublasDsyrk_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasDsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc): """ Rank-k operation on real double precision symmetric matrix. 
References ---------- `cublassyrk `_ """ status = _libcublas.cublasDsyrk_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], n, k, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(A), lda, ctypes.byref(cuda.cuFloatComplex(beta.real, beta.imag)), int(C), ldc) cublasCheckStatus(status) _libcublas.cublasCsyrk_v2.restype = int _libcublas.cublasCsyrk_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasCsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc): """ Rank-k operation on complex single precision symmetric matrix. References ---------- `cublassyrk `_ """ status = _libcublas.cublasCsyrk_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], n, k, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(A), lda, ctypes.byref(cuda.cuFloatComplex(beta.real, beta.imag)), int(C), ldc) cublasCheckStatus(status) _libcublas.cublasZsyrk_v2.restype = int _libcublas.cublasZsyrk_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasZsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc): """ Rank-k operation on complex double precision symmetric matrix. References ---------- `cublassyrk `_ """ status = _libcublas.cublasZsyrk_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], n, k, ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(A), lda, ctypes.byref(cuda.cuDoubleComplex(beta.real, beta.imag)), int(C), ldc) cublasCheckStatus(status) # SSYR2K, DSYR2K, CSYR2K, ZSYR2K _libcublas.cublasSsyr2k_v2.restype = int _libcublas.cublasSsyr2k_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasSsyr2k(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc): """ Rank-2k operation on real single precision symmetric matrix. References ---------- `cublassyr2k `_ """ status = _libcublas.cublasSsyr2k_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], n, k, ctypes.byref(ctypes.c_float(alpha)), int(A), lda, int(B), ldb, ctypes.byref(ctypes.c_float(beta)), int(C), ldc) cublasCheckStatus(status) _libcublas.cublasDsyr2k_v2.restype = int _libcublas.cublasDsyr2k_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasDsyr2k(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc): """ Rank-2k operation on real double precision symmetric matrix. References ---------- `cublassyr2k `_ """ status = _libcublas.cublasDsyr2k_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], n, k, ctypes.byref(ctypes.c_double(alpha)), int(A), lda, int(B), ldb, ctypes.byref(ctypes.c_double(beta)), int(C), ldc) cublasCheckStatus(status) _libcublas.cublasCsyr2k_v2.restype = int _libcublas.cublasCsyr2k_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasCsyr2k(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc): """ Rank-2k operation on complex single precision symmetric matrix. 
References ---------- `cublassyr2k `_ """ status = _libcublas.cublasCsyr2k_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], n, k, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(A), lda, int(B), ldb, ctypes.byref(cuda.cuFloatComplex(beta.real, beta.imag)), int(C), ldc) cublasCheckStatus(status) _libcublas.cublasZsyr2k_v2.restype = int _libcublas.cublasZsyr2k_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasZsyr2k(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc): """ Rank-2k operation on complex double precision symmetric matrix. References ---------- `cublassyr2k `_ """ status = _libcublas.cublasZsyr2k_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], n, k, ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(A), lda, int(B), ldb, ctypes.byref(cuda.cuDoubleComplex(beta.real, beta.imag)), int(C), ldc) cublasCheckStatus(status) # STRMM, DTRMM, CTRMM, ZTRMM _libcublas.cublasStrmm_v2.restype = int _libcublas.cublasStrmm_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasStrmm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc): """ Matrix-matrix product for real single precision triangular matrix. References ---------- `cublastrmm `_ """ status = _libcublas.cublasStrmm_v2(handle, _CUBLAS_SIDE_MODE[side], _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], m, n, ctypes.byref(ctypes.c_float(alpha)), int(A), lda, int(B), ldb, int(C), ldc) cublasCheckStatus(status) _libcublas.cublasDtrmm_v2.restype = int _libcublas.cublasDtrmm_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasDtrmm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc): """ Matrix-matrix product for real double precision triangular matrix. References ---------- `cublastrmm `_ """ status = _libcublas.cublasDtrmm_v2(handle, _CUBLAS_SIDE_MODE[side], _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], m, n, ctypes.byref(ctypes.c_double(alpha)), int(A), lda, int(B), ldb, int(C), ldc) cublasCheckStatus(status) _libcublas.cublasCtrmm_v2.restype = int _libcublas.cublasCtrmm_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasCtrmm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc): """ Matrix-matrix product for complex single precision triangular matrix. 
References ---------- `cublastrmm `_ """ status = _libcublas.cublasCtrmm_v2(handle, _CUBLAS_SIDE_MODE[side], _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], m, n, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(A), lda, int(B), ldb) cublasCheckStatus(status) _libcublas.cublasZtrmm_v2.restype = int _libcublas.cublasZtrmm_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasZtrmm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc): """ Matrix-matrix product for complex double precision triangular matrix. References ---------- `cublastrmm `_ """ status = _libcublas.cublasZtrmm_v2(handle, _CUBLAS_SIDE_MODE[side], _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], m, n, ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(A), lda, int(B), ldb, int(C), ldc) cublasCheckStatus(status) # STRSM, DTRSM, CTRSM, ZTRSM _libcublas.cublasStrsm_v2.restype = int _libcublas.cublasStrsm_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb): """ Solve a real single precision triangular system with multiple right-hand sides. References ---------- `cublastrsm `_ """ status = _libcublas.cublasStrsm_v2(handle, _CUBLAS_SIDE_MODE[side], _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], m, n, ctypes.byref(ctypes.c_float(alpha)), int(A), lda, int(B), ldb) cublasCheckStatus(status) _libcublas.cublasDtrsm_v2.restype = int _libcublas.cublasDtrsm_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb): """ Solve a real double precision triangular system with multiple right-hand sides. References ---------- `cublastrsm `_ """ status = _libcublas.cublasDtrsm_v2(handle, _CUBLAS_SIDE_MODE[side], _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], m, n, ctypes.byref(ctypes.c_double(alpha)), int(A), lda, int(B), ldb) cublasCheckStatus(status) _libcublas.cublasCtrsm_v2.restype = int _libcublas.cublasCtrsm_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasCtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb): """ Solve a complex single precision triangular system with multiple right-hand sides. 
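Examples
--------
A minimal usage sketch (illustrative only), assuming pycuda is installed.
Solving A*X = B with row-major arrays is done by solving the transposed
system on the GPU: `side` = 'r' and `uplo` = 'u' describe the column-major
(transposed) view of the lower triangular matrix `a`; the solution
overwrites `b`.

>>> import pycuda.autoinit
>>> import pycuda.gpuarray as gpuarray
>>> import numpy as np
>>> a = (np.tril(np.random.rand(3, 3) + 1j*np.random.rand(3, 3)) + 3*np.eye(3)).astype(np.complex64)
>>> b = (np.random.rand(3, 2) + 1j*np.random.rand(3, 2)).astype(np.complex64)
>>> a_gpu = gpuarray.to_gpu(a)
>>> b_gpu = gpuarray.to_gpu(b)
>>> h = cublasCreate()
>>> cublasCtrsm(h, 'r', 'u', 'n', 'n', 2, 3, np.complex64(1.0), a_gpu.gpudata, 3, b_gpu.gpudata, 2)
>>> np.allclose(b_gpu.get(), np.linalg.solve(a, b), atol=1e-5)
True
>>> cublasDestroy(h)
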
References ---------- `cublastrsm `_ """ status = _libcublas.cublasCtrsm_v2(handle, _CUBLAS_SIDE_MODE[side], _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], m, n, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(A), lda, int(B), ldb) cublasCheckStatus(status) _libcublas.cublasZtrsm_v2.restype = int _libcublas.cublasZtrsm_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def cublasZtrsm(handle, side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb): """ Solve complex double precision triangular system with multiple right-hand sides. References ---------- `cublastrsm `_ """ status = _libcublas.cublasZtrsm_v2(handle, _CUBLAS_SIDE_MODE[side], _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], _CUBLAS_DIAG[diag], m, n, ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(A), lda, int(B), ldb) cublasCheckStatus(status) # CHEMM, ZHEMM _libcublas.cublasChemm_v2.restype = int _libcublas.cublasChemm_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasChemm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc): """ Matrix-matrix product for single precision Hermitian matrix. References ---------- `cublashemm `_ """ status = _libcublas.cublasChemm_v2(handle, _CUBLAS_SIDE_MODE[side], _CUBLAS_FILL_MODE[uplo], m, n, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(A), lda, int(B), ldb, ctypes.byref(cuda.cuFloatComplex(beta.real, beta.imag)), int(C), ldc) cublasCheckStatus(status) _libcublas.cublasZhemm_v2.restype = int _libcublas.cublasZhemm_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasZhemm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc): """ Matrix-matrix product for double precision Hermitian matrix. References ---------- `cublashemm `_ """ status = _libcublas.cublasZhemm_v2(handle, _CUBLAS_SIDE_MODE[side], _CUBLAS_FILL_MODE[uplo], m, n, ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(A), lda, int(B), ldb, ctypes.byref(cuda.cuDoubleComplex(beta.real, beta.imag)), int(C), ldc) cublasCheckStatus(status) # CHERK, ZHERK _libcublas.cublasCherk_v2.restype = int _libcublas.cublasCherk_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasCherk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc): """ Rank-k operation on single precision Hermitian matrix. References ---------- `cublasherk `_ """ status = _libcublas.cublasCherk_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], n, k, ctypes.byref(ctypes.c_float(alpha)), int(A), lda, ctypes.byref(ctypes.c_float(beta)), int(C), ldc) cublasCheckStatus(status) _libcublas.cublasZherk_v2.restype = int _libcublas.cublasZherk_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasZherk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc): """ Rank-k operation on double precision Hermitian matrix. 
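Examples
--------
A minimal usage sketch (illustrative only), assuming pycuda is installed.
`alpha` and `beta` are real scalars for the Hermitian rank-k update. With a
row-major k x n array `a`, the call below accumulates a.conj().T * a into the
triangle of C selected by `uplo`; only that triangle is written, so only it
is compared.

>>> import pycuda.autoinit
>>> import pycuda.gpuarray as gpuarray
>>> import numpy as np
>>> a = (np.random.rand(2, 3) + 1j*np.random.rand(2, 3)).astype(np.complex128)
>>> a_gpu = gpuarray.to_gpu(a)
>>> c_gpu = gpuarray.zeros((3, 3), np.complex128)
>>> h = cublasCreate()
>>> cublasZherk(h, 'l', 'n', 3, 2, 1.0, a_gpu.gpudata, 3, 0.0, c_gpu.gpudata, 3)
>>> np.allclose(np.triu(c_gpu.get()), np.triu(np.dot(a.conj().T, a)))
True
>>> cublasDestroy(h)
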
References ---------- `cublasherk `_ """ status = _libcublas.cublasZherk_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], n, k, ctypes.byref(ctypes.c_double(alpha)), int(A), lda, ctypes.byref(ctypes.c_double(beta)), int(C), ldc) cublasCheckStatus(status) # CHER2K, ZHER2K _libcublas.cublasCher2k_v2.restype = int _libcublas.cublasCher2k_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int] def cublasCher2k(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc): """ Rank-2k operation on single precision Hermitian matrix. References ---------- `cublasher2k `_ """ status = _libcublas.cublasCher2k_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], n, k, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(A), lda, int(B), ldb, ctypes.byref(cuda.cuFloatComplex(beta.real, beta.imag)), int(C), ldc) cublasCheckStatus(status) _libcublas.cublasZher2k_v2.restype = int _libcublas.cublasZher2k_v2.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cublasZher2k(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc): """ Rank-2k operation on double precision Hermitian matrix. References ---------- `cublasher2k `_ """ status = _libcublas.cublasZher2k_v2(handle, _CUBLAS_FILL_MODE[uplo], _CUBLAS_OP[trans], n, k, ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(A), lda, int(B), ldb, ctypes.byref(cuda.cuDoubleComplex(beta.real, beta.imag)), int(C), ldc) cublasCheckStatus(status) ### BLAS-like extension routines ### # SGEAM, DGEAM, CGEAM, ZGEAM _GEAM_doc = Template( """ Matrix-matrix addition/transposition (${precision} ${real}). Computes the sum of two ${precision} ${real} scaled and possibly (conjugate) transposed matrices. Parameters ---------- handle : int CUBLAS context transa, transb : char 't' if they are transposed, 'c' if they are conjugate transposed, 'n' if otherwise. m : int Number of rows in `A` and `C`. n : int Number of columns in `B` and `C`. alpha : ${num_type} Constant by which to scale `A`. A : ctypes.c_void_p Pointer to first matrix operand (`A`). lda : int Leading dimension of `A`. beta : ${num_type} Constant by which to scale `B`. B : ctypes.c_void_p Pointer to second matrix operand (`B`). ldb : int Leading dimension of `A`. C : ctypes.c_void_p Pointer to result matrix (`C`). ldc : int Leading dimension of `C`. 
Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray as gpuarray >>> import numpy as np >>> alpha = ${alpha_data} >>> beta = ${beta_data} >>> a = ${a_data_1} >>> b = ${b_data_1} >>> c = ${c_data_1} >>> a_gpu = gpuarray.to_gpu(a) >>> b_gpu = gpuarray.to_gpu(b) >>> c_gpu = gpuarray.empty(c.shape, c.dtype) >>> h = cublasCreate() >>> ${func}(h, 'n', 'n', c.shape[0], c.shape[1], alpha, a_gpu.gpudata, a.shape[0], beta, b_gpu.gpudata, b.shape[0], c_gpu.gpudata, c.shape[0]) >>> np.allclose(c_gpu.get(), c) True >>> a = ${a_data_2} >>> b = ${b_data_2} >>> c = ${c_data_2} >>> a_gpu = gpuarray.to_gpu(a.T.copy()) >>> b_gpu = gpuarray.to_gpu(b.T.copy()) >>> c_gpu = gpuarray.empty(c.T.shape, c.dtype) >>> transa = 'c' if np.iscomplexobj(a) else 't' >>> ${func}(h, transa, 'n', c.shape[0], c.shape[1], alpha, a_gpu.gpudata, a.shape[0], beta, b_gpu.gpudata, b.shape[0], c_gpu.gpudata, c.shape[0]) >>> np.allclose(c_gpu.get().T, c) True >>> cublasDestroy(h) References ---------- `cublasgeam `_ """) if _cublas_version >= 5000: _libcublas.cublasSgeam.restype = int _libcublas.cublasSgeam.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] @_cublas_version_req(5.0) def cublasSgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc): """ Real matrix-matrix addition/transposition. """ status = _libcublas.cublasSgeam(handle, _CUBLAS_OP[transa], _CUBLAS_OP[transb], m, n, ctypes.byref(ctypes.c_float(alpha)), int(A), lda, ctypes.byref(ctypes.c_float(beta)), int(B), ldb, int(C), ldc) cublasCheckStatus(status) cublasSgeam.__doc__ = _GEAM_doc.substitute(precision='single precision', real='real', num_type='numpy.float32', alpha_data='np.float32(np.random.rand())', beta_data='np.float32(np.random.rand())', a_data_1='np.random.rand(2, 3).astype(np.float32)', b_data_1='np.random.rand(2, 3).astype(np.float32)', a_data_2='np.random.rand(2, 3).astype(np.float32)', b_data_2='np.random.rand(3, 2).astype(np.float32)', c_data_1='alpha*a+beta*b', c_data_2='alpha*a.T+beta*b', func='cublasSgeam') if _cublas_version >= 5000: _libcublas.cublasDgeam.restype = int _libcublas.cublasDgeam.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] @_cublas_version_req(5.0) def cublasDgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc): """ Real matrix-matrix addition/transposition. 
""" status = _libcublas.cublasDgeam(handle, _CUBLAS_OP[transa], _CUBLAS_OP[transb], m, n, ctypes.byref(ctypes.c_double(alpha)), int(A), lda, ctypes.byref(ctypes.c_double(beta)), int(B), ldb, int(C), ldc) cublasCheckStatus(status) cublasDgeam.__doc__ = _GEAM_doc.substitute(precision='double precision', real='real', num_type='numpy.float64', alpha_data='np.float64(np.random.rand())', beta_data='np.float64(np.random.rand())', a_data_1='np.random.rand(2, 3).astype(np.float64)', b_data_1='np.random.rand(2, 3).astype(np.float64)', a_data_2='np.random.rand(2, 3).astype(np.float64)', b_data_2='np.random.rand(3, 2).astype(np.float64)', c_data_1='alpha*a+beta*b', c_data_2='alpha*a.T+beta*b', func='cublasDgeam') if _cublas_version >= 5000: _libcublas.cublasCgeam.restype = int _libcublas.cublasCgeam.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] @_cublas_version_req(5.0) def cublasCgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc): """ Complex matrix-matrix addition/transposition. """ status = _libcublas.cublasCgeam(handle, _CUBLAS_OP[transa], _CUBLAS_OP[transb], m, n, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(A), lda, ctypes.byref(cuda.cuFloatComplex(beta.real, beta.imag)), int(B), ldb, int(C), ldc) cublasCheckStatus(status) cublasCgeam.__doc__ = _GEAM_doc.substitute(precision='single precision', real='complex', num_type='numpy.complex64', alpha_data='np.complex64(np.random.rand()+1j*np.random.rand())', beta_data='np.complex64(np.random.rand()+1j*np.random.rand())', a_data_1='(np.random.rand(2, 3)+1j*np.random.rand(2, 3)).astype(np.complex64)', a_data_2='(np.random.rand(2, 3)+1j*np.random.rand(2, 3)).astype(np.complex64)', b_data_1='(np.random.rand(2, 3)+1j*np.random.rand(2, 3)).astype(np.complex64)', b_data_2='(np.random.rand(3, 2)+1j*np.random.rand(3, 2)).astype(np.complex64)', c_data_1='alpha*a+beta*b', c_data_2='alpha*np.conj(a).T+beta*b', func='cublasCgeam') if _cublas_version >= 5000: _libcublas.cublasZgeam.restype = int _libcublas.cublasZgeam.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] @_cublas_version_req(5.0) def cublasZgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc): """ Complex matrix-matrix addition/transposition. 
""" status = _libcublas.cublasZgeam(handle, _CUBLAS_OP[transa], _CUBLAS_OP[transb], m, n, ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(A), lda, ctypes.byref(cuda.cuDoubleComplex(beta.real, beta.imag)), int(B), ldb, int(C), ldc) cublasCheckStatus(status) cublasZgeam.__doc__ = _GEAM_doc.substitute(precision='double precision', real='complex', num_type='numpy.complex128', alpha_data='np.complex128(np.random.rand()+1j*np.random.rand())', beta_data='np.complex128(np.random.rand()+1j*np.random.rand())', a_data_1='(np.random.rand(2, 3)+1j*np.random.rand(2, 3)).astype(np.complex128)', a_data_2='(np.random.rand(2, 3)+1j*np.random.rand(2, 3)).astype(np.complex128)', b_data_1='(np.random.rand(2, 3)+1j*np.random.rand(2, 3)).astype(np.complex128)', b_data_2='(np.random.rand(3, 2)+1j*np.random.rand(3, 2)).astype(np.complex128)', c_data_1='alpha*a+beta*b', c_data_2='alpha*np.conj(a).T+beta*b', func='cublasZgeam') ### Batched routines ### # SgemmBatched, DgemmBatched if _cublas_version >= 5000: _libcublas.cublasSgemmBatched.restype = int _libcublas.cublasSgemmBatched.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int] @_cublas_version_req(5.0) def cublasSgemmBatched(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, batchCount): """ Matrix-matrix product for arrays of real single precision general matrices. References ---------- `cublasgemmBatched `_ """ status = _libcublas.cublasSgemmBatched(handle, _CUBLAS_OP[transa], _CUBLAS_OP[transb], m, n, k, ctypes.byref(ctypes.c_float(alpha)), int(A), lda, int(B), ldb, ctypes.byref(ctypes.c_float(beta)), int(C), ldc, batchCount) cublasCheckStatus(status) if _cublas_version >= 5000: _libcublas.cublasDgemmBatched.restype = int _libcublas.cublasDgemmBatched.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int] @_cublas_version_req(5.0) def cublasDgemmBatched(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, batchCount): """ Matrix-matrix product for arrays of real double precision general matrices. References ---------- `cublasgemmBatched `_ """ status = _libcublas.cublasDgemmBatched(handle, _CUBLAS_OP[transa], _CUBLAS_OP[transb], m, n, k, ctypes.byref(ctypes.c_double(alpha)), int(A), lda, int(B), ldb, ctypes.byref(ctypes.c_double(beta)), int(C), ldc, batchCount) cublasCheckStatus(status) # CgemmBatched, ZgemmBatched if _cublas_version >= 5000: _libcublas.cublasCgemmBatched.restype = int _libcublas.cublasCgemmBatched.argtypes = [_types.handle, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int] @_cublas_version_req(5.0) def cublasCgemmBatched(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, batchCount): """ Matrix-matrix product for arrays of complex single precision general matrices. 
    References
    ----------
    `cublasgemmBatched `_
    """

    status = _libcublas.cublasCgemmBatched(handle, _CUBLAS_OP[transa],
                                           _CUBLAS_OP[transb], m, n, k,
                                           ctypes.byref(cuda.cuFloatComplex(alpha.real,
                                                                            alpha.imag)),
                                           int(A), lda, int(B), ldb,
                                           ctypes.byref(cuda.cuFloatComplex(beta.real,
                                                                            beta.imag)),
                                           int(C), ldc, batchCount)
    cublasCheckStatus(status)

if _cublas_version >= 5000:
    _libcublas.cublasZgemmBatched.restype = int
    _libcublas.cublasZgemmBatched.argtypes = [_types.handle, ctypes.c_int,
        ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int,
        ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p,
        ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int,
        ctypes.c_int]

@_cublas_version_req(5.0)
def cublasZgemmBatched(handle, transa, transb, m, n, k,
                       alpha, A, lda, B, ldb, beta, C, ldc, batchCount):
    """
    Matrix-matrix product for arrays of complex double precision general matrices.

    References
    ----------
    `cublasgemmBatched `_
    """

    status = _libcublas.cublasZgemmBatched(handle, _CUBLAS_OP[transa],
                                           _CUBLAS_OP[transb], m, n, k,
                                           ctypes.byref(cuda.cuDoubleComplex(alpha.real,
                                                                             alpha.imag)),
                                           int(A), lda, int(B), ldb,
                                           ctypes.byref(cuda.cuDoubleComplex(beta.real,
                                                                             beta.imag)),
                                           int(C), ldc, batchCount)
    cublasCheckStatus(status)

# StrsmBatched, DtrsmBatched
if _cublas_version >= 5000:
    _libcublas.cublasStrsmBatched.restype = int
    _libcublas.cublasStrsmBatched.argtypes = [_types.handle, ctypes.c_int,
        ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int,
        ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p,
        ctypes.c_int, ctypes.c_int]

@_cublas_version_req(5.0)
def cublasStrsmBatched(handle, side, uplo, trans, diag, m, n, alpha,
                       A, lda, B, ldb, batchCount):
    """
    This function solves an array of triangular linear systems
    with multiple right-hand-sides.

    References
    ----------
    `cublastrsmBatched `_
    """

    status = _libcublas.cublasStrsmBatched(handle, _CUBLAS_SIDE_MODE[side],
                                           _CUBLAS_FILL_MODE[uplo],
                                           _CUBLAS_OP[trans],
                                           _CUBLAS_DIAG[diag], m, n,
                                           ctypes.byref(ctypes.c_float(alpha)),
                                           int(A), lda, int(B), ldb,
                                           batchCount)
    cublasCheckStatus(status)

if _cublas_version >= 5000:
    _libcublas.cublasDtrsmBatched.restype = int
    _libcublas.cublasDtrsmBatched.argtypes = [_types.handle, ctypes.c_int,
        ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int,
        ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p,
        ctypes.c_int, ctypes.c_int]

@_cublas_version_req(5.0)
def cublasDtrsmBatched(handle, side, uplo, trans, diag, m, n, alpha,
                       A, lda, B, ldb, batchCount):
    """
    This function solves an array of triangular linear systems
    with multiple right-hand-sides.

    References
    ----------
    `cublastrsmBatched `_
    """

    status = _libcublas.cublasDtrsmBatched(handle, _CUBLAS_SIDE_MODE[side],
                                           _CUBLAS_FILL_MODE[uplo],
                                           _CUBLAS_OP[trans],
                                           _CUBLAS_DIAG[diag], m, n,
                                           ctypes.byref(ctypes.c_double(alpha)),
                                           int(A), lda, int(B), ldb,
                                           batchCount)
    cublasCheckStatus(status)

# SgetrfBatched, DgetrfBatched
if _cublas_version >= 5000:
    _libcublas.cublasSgetrfBatched.restype = int
    _libcublas.cublasSgetrfBatched.argtypes = [_types.handle, ctypes.c_int,
        ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p,
        ctypes.c_int]

@_cublas_version_req(5.0)
def cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize):
    """
    This function performs the LU factorization of an array of n x n matrices.
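    Notes
    -----
    `A` is a device pointer to an array of per-matrix device pointers, `P`
    is a device array with room for `n*batchSize` 32-bit pivot indices, and
    `info` is a device array of `batchSize` status values. A rough sketch,
    assuming PyCUDA for allocation and `cublasCreate`/`cublasDestroy` from
    this module::

        import numpy as np
        import pycuda.autoinit
        import pycuda.gpuarray as gpuarray
        import skcuda.cublas as cublas

        n, batch = 3, 2
        h = cublas.cublasCreate()
        a = [gpuarray.to_gpu(np.asfortranarray(
                 np.random.rand(n, n).astype(np.float32)))
             for _ in range(batch)]
        a_ptrs = gpuarray.to_gpu(np.array([int(mat.gpudata) for mat in a],
                                          dtype=np.uint64))
        p = gpuarray.zeros(n*batch, np.int32)
        info = gpuarray.zeros(batch, np.int32)
        cublas.cublasSgetrfBatched(h, n, a_ptrs.gpudata, n,
                                   p.gpudata, info.gpudata, batch)
        cublas.cublasDestroy(h)

    Each matrix in `a` then holds its LU factors in place.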
    References
    ----------
    `cublasgetrfBatched `_
    """

    status = _libcublas.cublasSgetrfBatched(handle, n, int(A), lda,
                                            int(P), int(info), batchSize)
    cublasCheckStatus(status)

if _cublas_version >= 5000:
    _libcublas.cublasDgetrfBatched.restype = int
    _libcublas.cublasDgetrfBatched.argtypes = [_types.handle, ctypes.c_int,
        ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p,
        ctypes.c_int]

@_cublas_version_req(5.0)
def cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize):
    """
    This function performs the LU factorization of an array of n x n matrices.

    References
    ----------
    `cublasgetrfBatched `_
    """

    status = _libcublas.cublasDgetrfBatched(handle, n, int(A), lda,
                                            int(P), int(info), batchSize)
    cublasCheckStatus(status)

# SgetriBatched, DgetriBatched
if _cublas_version >= 5050:
    _libcublas.cublasSgetriBatched.restype = int
    _libcublas.cublasSgetriBatched.argtypes = [_types.handle, ctypes.c_int,
        ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p,
        ctypes.c_int, ctypes.c_void_p, ctypes.c_int]

@_cublas_version_req(5.5)
def cublasSgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize):
    """
    This function performs the inversion of an array of n x n matrices.

    Notes
    -----
    The matrices must be factorized first using cublasSgetrfBatched.

    References
    ----------
    `cublasgetriBatched `_
    """

    status = _libcublas.cublasSgetriBatched(handle, n, int(A), lda,
                                            int(P), int(C), ldc,
                                            int(info), batchSize)
    cublasCheckStatus(status)

if _cublas_version >= 5050:
    _libcublas.cublasDgetriBatched.restype = int
    _libcublas.cublasDgetriBatched.argtypes = [_types.handle, ctypes.c_int,
        ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p,
        ctypes.c_int, ctypes.c_void_p, ctypes.c_int]

@_cublas_version_req(5.5)
def cublasDgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize):
    """
    This function performs the inversion of an array of n x n matrices.

    Notes
    -----
    The matrices must be factorized first using cublasDgetrfBatched.

    References
    ----------
    `cublasgetriBatched `_
    """

    status = _libcublas.cublasDgetriBatched(handle, n, int(A), lda,
                                            int(P), int(C), ldc,
                                            int(info), batchSize)
    cublasCheckStatus(status)

if _cublas_version >= 5000:
    _libcublas.cublasSdgmm.restype = \
    _libcublas.cublasDdgmm.restype = \
    _libcublas.cublasCdgmm.restype = \
    _libcublas.cublasZdgmm.restype = int
    _libcublas.cublasSdgmm.argtypes = \
    _libcublas.cublasDdgmm.argtypes = \
    _libcublas.cublasCdgmm.argtypes = \
    _libcublas.cublasZdgmm.argtypes = [_types.handle, ctypes.c_int,
        ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int,
        ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int]

@_cublas_version_req(5.0)
def cublasSdgmm(handle, side, m, n, A, lda, x, incx, C, ldc):
    """
    Multiplies a matrix with a diagonal matrix.

    References
    ----------
    `cublasdgmm `_
    """

    status = _libcublas.cublasSdgmm(handle, _CUBLAS_SIDE_MODE[side],
                                    m, n, int(A), lda, int(x), incx,
                                    int(C), ldc)
    cublasCheckStatus(status)

@_cublas_version_req(5.0)
def cublasDdgmm(handle, side, m, n, A, lda, x, incx, C, ldc):
    """
    Multiplies a matrix with a diagonal matrix.

    References
    ----------
    `cublasdgmm `_
    """

    status = _libcublas.cublasDdgmm(handle, _CUBLAS_SIDE_MODE[side],
                                    m, n, int(A), lda, int(x), incx,
                                    int(C), ldc)
    cublasCheckStatus(status)

@_cublas_version_req(5.0)
def cublasCdgmm(handle, side, m, n, A, lda, x, incx, C, ldc):
    """
    Multiplies a matrix with a diagonal matrix.
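    Notes
    -----
    With ``side='R'`` the columns of `A` are scaled by the entries of `x`
    (C = A*diag(x)); with ``side='L'`` the rows are scaled (C = diag(x)*A).
    A sketch of the same call pattern using the single precision real
    variant `cublasSdgmm`, assuming PyCUDA for allocation and that the
    module's side-mode map accepts the character code ``'R'``::

        import numpy as np
        import pycuda.autoinit
        import pycuda.gpuarray as gpuarray
        import skcuda.cublas as cublas

        h = cublas.cublasCreate()
        a = np.asfortranarray(np.random.rand(4, 3).astype(np.float32))
        x = np.random.rand(3).astype(np.float32)
        a_gpu, x_gpu = gpuarray.to_gpu(a), gpuarray.to_gpu(x)
        c_gpu = gpuarray.empty((4, 3), np.float32, order='F')
        cublas.cublasSdgmm(h, 'R', 4, 3, a_gpu.gpudata, 4,
                           x_gpu.gpudata, 1, c_gpu.gpudata, 4)
        np.allclose(c_gpu.get(), np.dot(a, np.diag(x)))   # expected: True
        cublas.cublasDestroy(h)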
References ---------- `cublasdgmm `_ """ status = _libcublas.cublasCdgmm(handle, _CUBLAS_SIDE_MODE[side], m, n, int(A), lda, int(x), incx, int(C), ldc) cublasCheckStatus(status) @_cublas_version_req(5.0) def cublasZdgmm(handle, side, m, n, A, lda, x, incx, C, ldc): """ Multiplies a matrix with a diagonal matrix. References ---------- `cublasdgmm `_ """ status = _libcublas.cublasZdgmm(handle, _CUBLAS_SIDE_MODE[side], m, n, int(A), lda, int(x), incx, int(C), ldc) cublasCheckStatus(status) if __name__ == "__main__": import doctest doctest.testmod() scikit-cuda-0.5.1/skcuda/cuda.py000066400000000000000000000002311261465507300164620ustar00rootroot00000000000000#!/usr/bin/env python """ Python interface to CUDA functions. """ from __future__ import absolute_import from .cudart import * from .cudadrv import * scikit-cuda-0.5.1/skcuda/cudadrv.py000066400000000000000000000165011261465507300172050ustar00rootroot00000000000000#!/usr/bin/env python """ Python interface to CUDA driver functions. """ import sys, ctypes # Load library: if 'linux' in sys.platform: _libcuda_libname_list = ['libcuda.so'] elif sys.platform == 'darwin': _libcuda_libname_list = ['libcuda.dylib'] elif sys.platform == 'win32': _libcuda_libname_list = ['cuda.dll', 'nvcuda.dll'] else: raise RuntimeError('unsupported platform') # Print understandable error message when library cannot be found: _libcuda = None for _libcuda_libname in _libcuda_libname_list: try: if sys.platform == 'win32': _libcuda = ctypes.windll.LoadLibrary(_libcuda_libname) else: _libcuda = ctypes.cdll.LoadLibrary(_libcuda_libname) except OSError: pass else: break if _libcuda == None: raise OSError('CUDA driver library not found') # Exceptions corresponding to various CUDA driver errors: class CUDA_ERROR(Exception): """CUDA error.""" pass class CUDA_ERROR_INVALID_VALUE(CUDA_ERROR): pass class CUDA_ERROR_OUT_OF_MEMORY(CUDA_ERROR): pass class CUDA_ERROR_NOT_INITIALIZED(CUDA_ERROR): pass class CUDA_ERROR_DEINITIALIZED(CUDA_ERROR): pass class CUDA_ERROR_PROFILER_DISABLED(CUDA_ERROR): pass class CUDA_ERROR_PROFILER_NOT_INITIALIZED(CUDA_ERROR): pass class CUDA_ERROR_PROFILER_ALREADY_STARTED(CUDA_ERROR): pass class CUDA_ERROR_PROFILER_ALREADY_STOPPED(CUDA_ERROR): pass class CUDA_ERROR_NO_DEVICE(CUDA_ERROR): pass class CUDA_ERROR_INVALID_DEVICE(CUDA_ERROR): pass class CUDA_ERROR_INVALID_IMAGE(CUDA_ERROR): pass class CUDA_ERROR_INVALID_CONTEXT(CUDA_ERROR): pass class CUDA_ERROR_CONTEXT_ALREADY_CURRENT(CUDA_ERROR): pass class CUDA_ERROR_MAP_FAILED(CUDA_ERROR): pass class CUDA_ERROR_UNMAP_FAILED(CUDA_ERROR): pass class CUDA_ERROR_ARRAY_IS_MAPPED(CUDA_ERROR): pass class CUDA_ERROR_ALREADY_MAPPED(CUDA_ERROR): pass class CUDA_ERROR_NO_BINARY_FOR_GPU(CUDA_ERROR): pass class CUDA_ERROR_ALREADY_ACQUIRED(CUDA_ERROR): pass class CUDA_ERROR_NOT_MAPPED(CUDA_ERROR): pass class CUDA_ERROR_NOT_MAPPED_AS_ARRAY(CUDA_ERROR): pass class CUDA_ERROR_NOT_MAPPED_AS_POINTER(CUDA_ERROR): pass class CUDA_ERROR_ECC_UNCORRECTABLE(CUDA_ERROR): pass class CUDA_ERROR_UNSUPPORTED_LIMIT(CUDA_ERROR): pass class CUDA_ERROR_CONTEXT_ALREADY_IN_USE(CUDA_ERROR): pass class CUDA_ERROR_PEER_ACCESS_UNSUPPORTED(CUDA_ERROR): pass class CUDA_ERROR_INVALID_PTX(CUDA_ERROR): pass class CUDA_ERROR_INVALID_GRAPHICS_CONTEXT(CUDA_ERROR): pass class CUDA_ERROR_INVALID_SOURCE(CUDA_ERROR): pass class CUDA_ERROR_FILE_NOT_FOUND(CUDA_ERROR): pass class CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND(CUDA_ERROR): pass class CUDA_ERROR_SHARED_OBJECT_INIT_FAILED(CUDA_ERROR): pass class CUDA_ERROR_OPERATING_SYSTEM(CUDA_ERROR): pass class 
CUDA_ERROR_INVALID_HANDLE(CUDA_ERROR): pass class CUDA_ERROR_NOT_FOUND(CUDA_ERROR): pass class CUDA_ERROR_NOT_READY(CUDA_ERROR): pass class CUDA_ERROR_ILLEGAL_ADDRESS(CUDA_ERROR): pass class CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES(CUDA_ERROR): pass class CUDA_ERROR_LAUNCH_TIMEOUT(CUDA_ERROR): pass class CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING(CUDA_ERROR): pass class CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED(CUDA_ERROR): pass class CUDA_ERROR_PEER_ACCESS_NOT_ENABLED(CUDA_ERROR): pass class CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE(CUDA_ERROR): pass class CUDA_ERROR_CONTEXT_IS_DESTROYED(CUDA_ERROR): pass class CUDA_ERROR_ASSERT(CUDA_ERROR): pass class CUDA_ERROR_TOO_MANY_PEERS(CUDA_ERROR): pass class CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED(CUDA_ERROR): pass class CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED(CUDA_ERROR): pass class CUDA_ERROR_HARDWARE_STACK_ERROR(CUDA_ERROR): pass class CUDA_ERROR_ILLEGAL_INSTRUCTION(CUDA_ERROR): pass class CUDA_ERROR_MISALIGNED_ADDRESS(CUDA_ERROR): pass class CUDA_ERROR_INVALID_ADDRESS_SPACE(CUDA_ERROR): pass class CUDA_ERROR_INVALID_PC(CUDA_ERROR): pass class CUDA_ERROR_LAUNCH_FAILED(CUDA_ERROR): pass class CUDA_ERROR_NOT_PERMITTED(CUDA_ERROR): pass class CUDA_ERROR_NOT_SUPPORTED(CUDA_ERROR): pass class CUDA_ERROR_UNKNOWN(CUDA_ERROR): pass CUDA_EXCEPTIONS = { 1: CUDA_ERROR_INVALID_VALUE, 2: CUDA_ERROR_OUT_OF_MEMORY, 3: CUDA_ERROR_NOT_INITIALIZED, 4: CUDA_ERROR_DEINITIALIZED, 5: CUDA_ERROR_PROFILER_DISABLED, 6: CUDA_ERROR_PROFILER_NOT_INITIALIZED, 7: CUDA_ERROR_PROFILER_ALREADY_STARTED, 8: CUDA_ERROR_PROFILER_ALREADY_STOPPED, 100: CUDA_ERROR_NO_DEVICE, 101: CUDA_ERROR_INVALID_DEVICE, 200: CUDA_ERROR_INVALID_IMAGE, 201: CUDA_ERROR_INVALID_CONTEXT, 202: CUDA_ERROR_CONTEXT_ALREADY_CURRENT, 205: CUDA_ERROR_MAP_FAILED, 206: CUDA_ERROR_UNMAP_FAILED, 207: CUDA_ERROR_ARRAY_IS_MAPPED, 208: CUDA_ERROR_ALREADY_MAPPED, 209: CUDA_ERROR_NO_BINARY_FOR_GPU, 210: CUDA_ERROR_ALREADY_ACQUIRED, 211: CUDA_ERROR_NOT_MAPPED, 212: CUDA_ERROR_NOT_MAPPED_AS_ARRAY, 213: CUDA_ERROR_NOT_MAPPED_AS_POINTER, 214: CUDA_ERROR_ECC_UNCORRECTABLE, 215: CUDA_ERROR_UNSUPPORTED_LIMIT, 216: CUDA_ERROR_CONTEXT_ALREADY_IN_USE, 217: CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, 218: CUDA_ERROR_INVALID_PTX, 219: CUDA_ERROR_INVALID_GRAPHICS_CONTEXT, 300: CUDA_ERROR_INVALID_SOURCE, 301: CUDA_ERROR_FILE_NOT_FOUND, 302: CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, 303: CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, 304: CUDA_ERROR_OPERATING_SYSTEM, 400: CUDA_ERROR_INVALID_HANDLE, 500: CUDA_ERROR_NOT_FOUND, 600: CUDA_ERROR_NOT_READY, 700: CUDA_ERROR_ILLEGAL_ADDRESS, 701: CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, 702: CUDA_ERROR_LAUNCH_TIMEOUT, 703: CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, 704: CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, 705: CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, 708: CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, 709: CUDA_ERROR_CONTEXT_IS_DESTROYED, 710: CUDA_ERROR_ASSERT, 711: CUDA_ERROR_TOO_MANY_PEERS, 712: CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED, 713: CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, 714: CUDA_ERROR_HARDWARE_STACK_ERROR, 715: CUDA_ERROR_ILLEGAL_INSTRUCTION, 716: CUDA_ERROR_MISALIGNED_ADDRESS, 717: CUDA_ERROR_INVALID_ADDRESS_SPACE, 718: CUDA_ERROR_INVALID_PC, 719: CUDA_ERROR_LAUNCH_FAILED, 800: CUDA_ERROR_NOT_PERMITTED, 801: CUDA_ERROR_NOT_SUPPORTED, 999: CUDA_ERROR_UNKNOWN } def cuCheckStatus(status): """ Raise CUDA exception. Raise an exception corresponding to the specified CUDA driver error code. Parameters ---------- status : int CUDA driver error code. 
See Also -------- CUDA_EXCEPTIONS """ if status != 0: try: raise CUDA_EXCEPTIONS[status] except KeyError: raise CUDA_ERROR CU_POINTER_ATTRIBUTE_CONTEXT = 1 CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2 CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3 CU_POINTER_ATTRIBUTE_HOST_POINTER = 4 _libcuda.cuPointerGetAttribute.restype = int _libcuda.cuPointerGetAttribute.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_uint] def cuPointerGetAttribute(attribute, ptr): data = ctypes.c_void_p() status = _libcuda.cuPointerGetAttribute(data, attribute, ptr) cuCheckStatus(status) return data scikit-cuda-0.5.1/skcuda/cudart.py000066400000000000000000000551711261465507300170450ustar00rootroot00000000000000#!/usr/bin/env python """ Python interface to CUDA runtime functions. """ import atexit, ctypes, platform, re, sys, warnings import numpy as np # Load library: _version_list = [7.5, 7.0, 6.5, 6.0, 5.5, 5.0, 4.0] if 'linux' in sys.platform: _libcudart_libname_list = ['libcudart.so'] + \ ['libcudart.so.%s' % v for v in _version_list] elif sys.platform == 'darwin': _libcudart_libname_list = ['libcudart.dylib'] elif sys.platform == 'win32': if sys.maxsize > 2**32: _libcudart_libname_list = ['cudart.dll'] + \ ['cudart64_%s.dll' % int(10*v) for v in _version_list] else: _libcudart_libname_list = ['cudart.dll'] + \ ['cudart32_%s.dll' % int(10*v) for v in _version_list] else: raise RuntimeError('unsupported platform') # Print understandable error message when library cannot be found: _libcudart = None for _libcudart_libname in _libcudart_libname_list: try: if sys.platform == 'win32': _libcudart = ctypes.windll.LoadLibrary(_libcudart_libname) else: _libcudart = ctypes.cdll.LoadLibrary(_libcudart_libname) except OSError: pass else: break if _libcudart == None: raise OSError('CUDA runtime library not found') # Code adapted from PARRET: def POINTER(obj): """ Create ctypes pointer to object. Notes ----- This function converts None to a real NULL pointer because of bug in how ctypes handles None on 64-bit platforms. """ p = ctypes.POINTER(obj) if not isinstance(p.from_param, classmethod): def from_param(cls, x): if x is None: return cls() else: return x p.from_param = classmethod(from_param) return p # Classes corresponding to CUDA vector structures: class float2(ctypes.Structure): _fields_ = [ ('x', ctypes.c_float), ('y', ctypes.c_float) ] class cuFloatComplex(float2): @property def value(self): return complex(self.x, self.y) class double2(ctypes.Structure): _fields_ = [ ('x', ctypes.c_double), ('y', ctypes.c_double) ] class cuDoubleComplex(double2): @property def value(self): return complex(self.x, self.y) def gpuarray_ptr(g): """ Return ctypes pointer to data in GPUAarray object. 
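    Notes
    -----
    A short usage sketch, assuming PyCUDA for the GPUArray itself; the
    returned ctypes pointer can be handed directly to ctypes-level CUDA
    calls::

        import numpy as np
        import pycuda.autoinit
        import pycuda.gpuarray as gpuarray
        import skcuda.cudart as cudart

        x_gpu = gpuarray.to_gpu(np.arange(5, dtype=np.float32))
        ptr = cudart.gpuarray_ptr(x_gpu)   # POINTER(c_float) to device data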
""" addr = int(g.gpudata) if g.dtype == np.int8: return ctypes.cast(addr, POINTER(ctypes.c_byte)) if g.dtype == np.uint8: return ctypes.cast(addr, POINTER(ctypes.c_ubyte)) if g.dtype == np.int16: return ctypes.cast(addr, POINTER(ctypes.c_short)) if g.dtype == np.uint16: return ctypes.cast(addr, POINTER(ctypes.c_ushort)) if g.dtype == np.int32: return ctypes.cast(addr, POINTER(ctypes.c_int)) if g.dtype == np.uint32: return ctypes.cast(addr, POINTER(ctypes.c_uint)) if g.dtype == np.int64: return ctypes.cast(addr, POINTER(ctypes.c_long)) if g.dtype == np.uint64: return ctypes.cast(addr, POINTER(ctypes.c_ulong)) if g.dtype == np.float32: return ctypes.cast(addr, POINTER(ctypes.c_float)) elif g.dtype == np.float64: return ctypes.cast(addr, POINTER(ctypes.c_double)) elif g.dtype == np.complex64: return ctypes.cast(addr, POINTER(cuFloatComplex)) elif g.dtype == np.complex128: return ctypes.cast(addr, POINTER(cuDoubleComplex)) else: raise ValueError('unrecognized type') _libcudart.cudaGetErrorString.restype = ctypes.c_char_p _libcudart.cudaGetErrorString.argtypes = [ctypes.c_int] def cudaGetErrorString(e): """ Retrieve CUDA error string. Return the string associated with the specified CUDA error status code. Parameters ---------- e : int Error number. Returns ------- s : str Error string. """ return _libcudart.cudaGetErrorString(e) # Generic CUDA error: class cudaError(Exception): """CUDA error.""" pass # Exceptions corresponding to various CUDA runtime errors: class cudaErrorMissingConfiguration(cudaError): __doc__ = _libcudart.cudaGetErrorString(1) pass class cudaErrorMemoryAllocation(cudaError): __doc__ = _libcudart.cudaGetErrorString(2) pass class cudaErrorInitializationError(cudaError): __doc__ = _libcudart.cudaGetErrorString(3) pass class cudaErrorLaunchFailure(cudaError): __doc__ = _libcudart.cudaGetErrorString(4) pass class cudaErrorPriorLaunchFailure(cudaError): __doc__ = _libcudart.cudaGetErrorString(5) pass class cudaErrorLaunchTimeout(cudaError): __doc__ = _libcudart.cudaGetErrorString(6) pass class cudaErrorLaunchOutOfResources(cudaError): __doc__ = _libcudart.cudaGetErrorString(7) pass class cudaErrorInvalidDeviceFunction(cudaError): __doc__ = _libcudart.cudaGetErrorString(8) pass class cudaErrorInvalidConfiguration(cudaError): __doc__ = _libcudart.cudaGetErrorString(9) pass class cudaErrorInvalidDevice(cudaError): __doc__ = _libcudart.cudaGetErrorString(10) pass class cudaErrorInvalidValue(cudaError): __doc__ = _libcudart.cudaGetErrorString(11) pass class cudaErrorInvalidPitchValue(cudaError): __doc__ = _libcudart.cudaGetErrorString(12) pass class cudaErrorInvalidSymbol(cudaError): __doc__ = _libcudart.cudaGetErrorString(13) pass class cudaErrorMapBufferObjectFailed(cudaError): __doc__ = _libcudart.cudaGetErrorString(14) pass class cudaErrorUnmapBufferObjectFailed(cudaError): __doc__ = _libcudart.cudaGetErrorString(15) pass class cudaErrorInvalidHostPointer(cudaError): __doc__ = _libcudart.cudaGetErrorString(16) pass class cudaErrorInvalidDevicePointer(cudaError): __doc__ = _libcudart.cudaGetErrorString(17) pass class cudaErrorInvalidTexture(cudaError): __doc__ = _libcudart.cudaGetErrorString(18) pass class cudaErrorInvalidTextureBinding(cudaError): __doc__ = _libcudart.cudaGetErrorString(19) pass class cudaErrorInvalidChannelDescriptor(cudaError): __doc__ = _libcudart.cudaGetErrorString(20) pass class cudaErrorInvalidMemcpyDirection(cudaError): __doc__ = _libcudart.cudaGetErrorString(21) pass class cudaErrorTextureFetchFailed(cudaError): __doc__ = _libcudart.cudaGetErrorString(23) 
pass class cudaErrorTextureNotBound(cudaError): __doc__ = _libcudart.cudaGetErrorString(24) pass class cudaErrorSynchronizationError(cudaError): __doc__ = _libcudart.cudaGetErrorString(25) pass class cudaErrorInvalidFilterSetting(cudaError): __doc__ = _libcudart.cudaGetErrorString(26) pass class cudaErrorInvalidNormSetting(cudaError): __doc__ = _libcudart.cudaGetErrorString(27) pass class cudaErrorMixedDeviceExecution(cudaError): __doc__ = _libcudart.cudaGetErrorString(28) pass class cudaErrorCudartUnloading(cudaError): __doc__ = _libcudart.cudaGetErrorString(29) pass class cudaErrorUnknown(cudaError): __doc__ = _libcudart.cudaGetErrorString(30) pass class cudaErrorNotYetImplemented(cudaError): __doc__ = _libcudart.cudaGetErrorString(31) pass class cudaErrorMemoryValueTooLarge(cudaError): __doc__ = _libcudart.cudaGetErrorString(32) pass class cudaErrorInvalidResourceHandle(cudaError): __doc__ = _libcudart.cudaGetErrorString(33) pass class cudaErrorNotReady(cudaError): __doc__ = _libcudart.cudaGetErrorString(34) pass class cudaErrorInsufficientDriver(cudaError): __doc__ = _libcudart.cudaGetErrorString(35) pass class cudaErrorSetOnActiveProcess(cudaError): __doc__ = _libcudart.cudaGetErrorString(36) pass class cudaErrorInvalidSurface(cudaError): __doc__ = _libcudart.cudaGetErrorString(37) pass class cudaErrorNoDevice(cudaError): __doc__ = _libcudart.cudaGetErrorString(38) pass class cudaErrorECCUncorrectable(cudaError): __doc__ = _libcudart.cudaGetErrorString(39) pass class cudaErrorSharedObjectSymbolNotFound(cudaError): __doc__ = _libcudart.cudaGetErrorString(40) pass class cudaErrorSharedObjectInitFailed(cudaError): __doc__ = _libcudart.cudaGetErrorString(41) pass class cudaErrorUnsupportedLimit(cudaError): __doc__ = _libcudart.cudaGetErrorString(42) pass class cudaErrorDuplicateVariableName(cudaError): __doc__ = _libcudart.cudaGetErrorString(43) pass class cudaErrorDuplicateTextureName(cudaError): __doc__ = _libcudart.cudaGetErrorString(44) pass class cudaErrorDuplicateSurfaceName(cudaError): __doc__ = _libcudart.cudaGetErrorString(45) pass class cudaErrorDevicesUnavailable(cudaError): __doc__ = _libcudart.cudaGetErrorString(46) pass class cudaErrorInvalidKernelImage(cudaError): __doc__ = _libcudart.cudaGetErrorString(47) pass class cudaErrorNoKernelImageForDevice(cudaError): __doc__ = _libcudart.cudaGetErrorString(48) pass class cudaErrorIncompatibleDriverContext(cudaError): __doc__ = _libcudart.cudaGetErrorString(49) pass class cudaErrorPeerAccessAlreadyEnabled(cudaError): __doc__ = _libcudart.cudaGetErrorString(50) pass class cudaErrorPeerAccessNotEnabled(cudaError): __doc__ = _libcudart.cudaGetErrorString(51) pass class cudaErrorDeviceAlreadyInUse(cudaError): __doc__ = _libcudart.cudaGetErrorString(54) pass class cudaErrorProfilerDisabled(cudaError): __doc__ = _libcudart.cudaGetErrorString(55) pass class cudaErrorProfilerNotInitialized(cudaError): __doc__ = _libcudart.cudaGetErrorString(56) pass class cudaErrorProfilerAlreadyStarted(cudaError): __doc__ = _libcudart.cudaGetErrorString(57) pass class cudaErrorProfilerAlreadyStopped(cudaError): __doc__ = _libcudart.cudaGetErrorString(58) pass class cudaErrorAssert(cudaError): __doc__ = _libcudart.cudaGetErrorString(59) pass class cudaErrorTooManyPeers(cudaError): __doc__ = _libcudart.cudaGetErrorString(60) pass class cudaErrorHostMemoryAlreadyRegistered(cudaError): __doc__ = _libcudart.cudaGetErrorString(61) pass class cudaErrorHostMemoryNotRegistered(cudaError): __doc__ = _libcudart.cudaGetErrorString(62) pass class 
cudaErrorOperatingSystem(cudaError): __doc__ = _libcudart.cudaGetErrorString(63) pass class cudaErrorPeerAccessUnsupported(cudaError): __doc__ = _libcudart.cudaGetErrorString(64) pass class cudaErrorLaunchMaxDepthExceeded(cudaError): __doc__ = _libcudart.cudaGetErrorString(65) pass class cudaErrorLaunchFileScopedTex(cudaError): __doc__ = _libcudart.cudaGetErrorString(66) pass class cudaErrorLaunchFileScopedSurf(cudaError): __doc__ = _libcudart.cudaGetErrorString(67) pass class cudaErrorSyncDepthExceeded(cudaError): __doc__ = _libcudart.cudaGetErrorString(68) pass class cudaErrorLaunchPendingCountExceeded(cudaError): __doc__ = _libcudart.cudaGetErrorString(69) pass class cudaErrorNotPermitted(cudaError): __doc__ = _libcudart.cudaGetErrorString(70) pass class cudaErrorNotSupported(cudaError): __doc__ = _libcudart.cudaGetErrorString(71) pass class cudaErrorHardwareStackError(cudaError): __doc__ = _libcudart.cudaGetErrorString(72) pass class cudaErrorIllegalInstruction(cudaError): __doc__ = _libcudart.cudaGetErrorString(73) pass class cudaErrorMisalignedAddress(cudaError): __doc__ = _libcudart.cudaGetErrorString(74) pass class cudaErrorInvalidAddressSpace(cudaError): __doc__ = _libcudart.cudaGetErrorString(75) pass class cudaErrorInvalidPc(cudaError): __doc__ = _libcudart.cudaGetErrorString(76) pass class cudaErrorIllegalAddress(cudaError): __doc__ = _libcudart.cudaGetErrorString(77) pass class cudaErrorInvalidPtx(cudaError): __doc__ = _libcudart.cudaGetErrorString(78) pass class cudaErrorInvalidGraphicsContext(cudaError): __doc__ = _libcudart.cudaGetErrorString(79) class cudaErrorStartupFailure(cudaError): __doc__ = _libcudart.cudaGetErrorString(127) pass cudaExceptions = { 1: cudaErrorMissingConfiguration, 2: cudaErrorMemoryAllocation, 3: cudaErrorInitializationError, 4: cudaErrorLaunchFailure, 5: cudaErrorPriorLaunchFailure, 6: cudaErrorLaunchTimeout, 7: cudaErrorLaunchOutOfResources, 8: cudaErrorInvalidDeviceFunction, 9: cudaErrorInvalidConfiguration, 10: cudaErrorInvalidDevice, 11: cudaErrorInvalidValue, 12: cudaErrorInvalidPitchValue, 13: cudaErrorInvalidSymbol, 14: cudaErrorMapBufferObjectFailed, 15: cudaErrorUnmapBufferObjectFailed, 16: cudaErrorInvalidHostPointer, 17: cudaErrorInvalidDevicePointer, 18: cudaErrorInvalidTexture, 19: cudaErrorInvalidTextureBinding, 20: cudaErrorInvalidChannelDescriptor, 21: cudaErrorInvalidMemcpyDirection, 22: cudaError, 23: cudaErrorTextureFetchFailed, 24: cudaErrorTextureNotBound, 25: cudaErrorSynchronizationError, 26: cudaErrorInvalidFilterSetting, 27: cudaErrorInvalidNormSetting, 28: cudaErrorMixedDeviceExecution, 29: cudaErrorCudartUnloading, 30: cudaErrorUnknown, 31: cudaErrorNotYetImplemented, 32: cudaErrorMemoryValueTooLarge, 33: cudaErrorInvalidResourceHandle, 34: cudaErrorNotReady, 35: cudaErrorInsufficientDriver, 36: cudaErrorSetOnActiveProcess, 37: cudaErrorInvalidSurface, 38: cudaErrorNoDevice, 39: cudaErrorECCUncorrectable, 40: cudaErrorSharedObjectSymbolNotFound, 41: cudaErrorSharedObjectInitFailed, 42: cudaErrorUnsupportedLimit, 43: cudaErrorDuplicateVariableName, 44: cudaErrorDuplicateTextureName, 45: cudaErrorDuplicateSurfaceName, 46: cudaErrorDevicesUnavailable, 47: cudaErrorInvalidKernelImage, 48: cudaErrorNoKernelImageForDevice, 49: cudaErrorIncompatibleDriverContext, 50: cudaErrorPeerAccessAlreadyEnabled, 51: cudaErrorPeerAccessNotEnabled, 52: cudaError, 53: cudaError, 54: cudaErrorDeviceAlreadyInUse, 55: cudaErrorProfilerDisabled, 56: cudaErrorProfilerNotInitialized, 57: cudaErrorProfilerAlreadyStarted, 58: 
cudaErrorProfilerAlreadyStopped, 59: cudaErrorAssert, 60: cudaErrorTooManyPeers, 61: cudaErrorHostMemoryAlreadyRegistered, 62: cudaErrorHostMemoryNotRegistered, 63: cudaErrorOperatingSystem, 64: cudaErrorPeerAccessUnsupported, 65: cudaErrorLaunchMaxDepthExceeded, 66: cudaErrorLaunchFileScopedTex, 67: cudaErrorLaunchFileScopedSurf, 68: cudaErrorSyncDepthExceeded, 69: cudaErrorLaunchPendingCountExceeded, 70: cudaErrorNotPermitted, 71: cudaErrorNotSupported, 72: cudaErrorHardwareStackError, 73: cudaErrorIllegalInstruction, 74: cudaErrorMisalignedAddress, 75: cudaErrorInvalidAddressSpace, 76: cudaErrorInvalidPc, 77: cudaErrorIllegalAddress, 78: cudaErrorInvalidPtx, 79: cudaErrorInvalidGraphicsContext, 127: cudaErrorStartupFailure } def cudaCheckStatus(status): """ Raise CUDA exception. Raise an exception corresponding to the specified CUDA runtime error code. Parameters ---------- status : int CUDA runtime error code. See Also -------- cudaExceptions """ if status != 0: try: raise cudaExceptions[status] except KeyError: raise cudaError('unknown CUDA error %s' % status) # Memory allocation functions (adapted from pystream): _libcudart.cudaMalloc.restype = int _libcudart.cudaMalloc.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t] def cudaMalloc(count, ctype=None): """ Allocate device memory. Allocate memory on the device associated with the current active context. Parameters ---------- count : int Number of bytes of memory to allocate ctype : _ctypes.SimpleType, optional ctypes type to cast returned pointer. Returns ------- ptr : ctypes pointer Pointer to allocated device memory. """ ptr = ctypes.c_void_p() status = _libcudart.cudaMalloc(ctypes.byref(ptr), count) cudaCheckStatus(status) if ctype != None: ptr = ctypes.cast(ptr, ctypes.POINTER(ctype)) return ptr _libcudart.cudaFree.restype = int _libcudart.cudaFree.argtypes = [ctypes.c_void_p] def cudaFree(ptr): """ Free device memory. Free allocated memory on the device associated with the current active context. Parameters ---------- ptr : ctypes pointer Pointer to allocated device memory. """ status = _libcudart.cudaFree(ptr) cudaCheckStatus(status) _libcudart.cudaMallocPitch.restype = int _libcudart.cudaMallocPitch.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.POINTER(ctypes.c_size_t), ctypes.c_size_t, ctypes.c_size_t] def cudaMallocPitch(pitch, rows, cols, elesize): """ Allocate pitched device memory. Allocate pitched memory on the device associated with the current active context. Parameters ---------- pitch : int Pitch for allocation. rows : int Requested pitched allocation height. cols : int Requested pitched allocation width. elesize : int Size of memory element. Returns ------- ptr : ctypes pointer Pointer to allocated device memory. """ ptr = ctypes.c_void_p() status = _libcudart.cudaMallocPitch(ctypes.byref(ptr), ctypes.c_size_t(pitch), cols*elesize, rows) cudaCheckStatus(status) return ptr, pitch # Memory copy modes: cudaMemcpyHostToHost = 0 cudaMemcpyHostToDevice = 1 cudaMemcpyDeviceToHost = 2 cudaMemcpyDeviceToDevice = 3 cudaMemcpyDefault = 4 _libcudart.cudaMemcpy.restype = int _libcudart.cudaMemcpy.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int] def cudaMemcpy_htod(dst, src, count): """ Copy memory from host to device. Copy data from host memory to device memory. Parameters ---------- dst : ctypes pointer Device memory pointer. src : ctypes pointer Host memory pointer. count : int Number of bytes to copy. 
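    Notes
    -----
    A host-to-device-to-host round trip sketch using only this module; the
    host pointers come from NumPy's ctypes interface::

        import numpy as np
        import skcuda.cudart as cudart

        x = np.random.rand(4)                    # 4 x float64 = 32 bytes
        y = np.empty_like(x)
        d_ptr = cudart.cudaMalloc(x.nbytes)
        cudart.cudaMemcpy_htod(d_ptr, x.ctypes.data, x.nbytes)
        cudart.cudaMemcpy_dtoh(y.ctypes.data, d_ptr, y.nbytes)
        cudart.cudaFree(d_ptr)
        np.allclose(x, y)                        # expected: True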
""" status = _libcudart.cudaMemcpy(dst, src, ctypes.c_size_t(count), cudaMemcpyHostToDevice) cudaCheckStatus(status) def cudaMemcpy_dtoh(dst, src, count): """ Copy memory from device to host. Copy data from device memory to host memory. Parameters ---------- dst : ctypes pointer Host memory pointer. src : ctypes pointer Device memory pointer. count : int Number of bytes to copy. """ status = _libcudart.cudaMemcpy(dst, src, ctypes.c_size_t(count), cudaMemcpyDeviceToHost) cudaCheckStatus(status) _libcudart.cudaMemGetInfo.restype = int _libcudart.cudaMemGetInfo.argtypes = [ctypes.c_void_p, ctypes.c_void_p] def cudaMemGetInfo(): """ Return the amount of free and total device memory. Returns ------- free : long Free memory in bytes. total : long Total memory in bytes. """ free = ctypes.c_size_t() total = ctypes.c_size_t() status = _libcudart.cudaMemGetInfo(ctypes.byref(free), ctypes.byref(total)) cudaCheckStatus(status) return free.value, total.value _libcudart.cudaSetDevice.restype = int _libcudart.cudaSetDevice.argtypes = [ctypes.c_int] def cudaSetDevice(dev): """ Set current CUDA device. Select a device to use for subsequent CUDA operations. Parameters ---------- dev : int Device number. """ status = _libcudart.cudaSetDevice(dev) cudaCheckStatus(status) _libcudart.cudaGetDevice.restype = int _libcudart.cudaGetDevice.argtypes = [ctypes.POINTER(ctypes.c_int)] def cudaGetDevice(): """ Get current CUDA device. Return the identifying number of the device currently used to process CUDA operations. Returns ------- dev : int Device number. """ dev = ctypes.c_int() status = _libcudart.cudaGetDevice(ctypes.byref(dev)) cudaCheckStatus(status) return dev.value _libcudart.cudaDriverGetVersion.restype = int _libcudart.cudaDriverGetVersion.argtypes = [ctypes.POINTER(ctypes.c_int)] def cudaDriverGetVersion(): """ Get installed CUDA driver version. Return the version of the installed CUDA driver as an integer. If no driver is detected, 0 is returned. Returns ------- version : int Driver version. """ version = ctypes.c_int() status = _libcudart.cudaDriverGetVersion(ctypes.byref(version)) cudaCheckStatus(status) return version.value try: _cudart_version = str(cudaDriverGetVersion()) except: _cudart_version = '9999' class _cudart_version_req(object): """ Decorator to replace function with a placeholder that raises an exception if the installed CUDA Runtime version is not greater than `v`. """ def __init__(self, v): self.vs = str(v) if isinstance(v, int): major = str(v) minor = '0' else: major, minor = re.search('(\d+)\.(\d+)', self.vs).groups() self.vi = major.ljust(2, '0')+minor.ljust(2, '0') def __call__(self,f): def f_new(*args,**kwargs): raise NotImplementedError('CUDART '+self.vs+' required') f_new.__doc__ = f.__doc__ if _cudart_version >= self.vi: return f else: return f_new # Memory types: cudaMemoryTypeHost = 1 cudaMemoryTypeDevice = 2 class cudaPointerAttributes(ctypes.Structure): _fields_ = [ ('memoryType', ctypes.c_int), ('device', ctypes.c_int), ('devicePointer', ctypes.c_void_p), ('hostPointer', ctypes.c_void_p) ] _libcudart.cudaPointerGetAttributes.restype = int _libcudart.cudaPointerGetAttributes.argtypes = [ctypes.c_void_p, ctypes.c_void_p] def cudaPointerGetAttributes(ptr): """ Get memory pointer attributes. Returns attributes of the specified pointer. Parameters ---------- ptr : ctypes pointer Memory pointer to examine. Returns ------- memory_type : int Memory type; 1 indicates host memory, 2 indicates device memory. device : int Number of device associated with pointer. 
Notes ----- This function only works with CUDA 4.0 and later. """ attributes = cudaPointerAttributes() status = \ _libcudart.cudaPointerGetAttributes(ctypes.byref(attributes), ptr) cudaCheckStatus(status) return attributes.memoryType, attributes.device scikit-cuda-0.5.1/skcuda/cufft.py000066400000000000000000000255471261465507300166760ustar00rootroot00000000000000#!/usr/bin/env python """ Python interface to CUFFT functions. Note: this module does not explicitly depend on PyCUDA. """ import ctypes import sys # Load library: _version_list = [7.5, 7.0, 6.5, 6.0, 5.5, 5.0, 4.0] if 'linux' in sys.platform: _libcufft_libname_list = ['libcufft.so'] + \ ['libcufft.so.%s' % v for v in _version_list] elif sys.platform == 'darwin': _libcufft_libname_list = ['libcufft.dylib'] elif sys.platform == 'win32': if sys.maxsize > 2**32: _libcufft_libname_list = ['cufft.dll'] + \ ['cufft64_%s.dll' % int(10*v) for v in _version_list] else: _libcufft_libname_list = ['cufft.dll'] + \ ['cufft32_%s.dll' % int(10*v) for v in _version_list] else: raise RuntimeError('unsupported platform') # Print understandable error message when library cannot be found: _libcufft = None for _libcufft_libname in _libcufft_libname_list: try: if sys.platform == 'win32': _libcufft = ctypes.windll.LoadLibrary(_libcufft_libname) else: _libcufft = ctypes.cdll.LoadLibrary(_libcufft_libname) except OSError: pass else: break if _libcufft == None: raise OSError('cufft library not found') # General CUFFT error: class cufftError(Exception): """CUFFT error""" pass # Exceptions corresponding to different CUFFT errors: class cufftInvalidPlan(cufftError): """CUFFT was passed an invalid plan handle.""" pass class cufftAllocFailed(cufftError): """CUFFT failed to allocate GPU memory.""" pass class cufftInvalidType(cufftError): """The user requested an unsupported type.""" pass class cufftInvalidValue(cufftError): """The user specified a bad memory pointer.""" pass class cufftInternalError(cufftError): """Internal driver error.""" pass class cufftExecFailed(cufftError): """CUFFT failed to execute an FFT on the GPU.""" pass class cufftSetupFailed(cufftError): """The CUFFT library failed to initialize.""" pass class cufftInvalidSize(cufftError): """The user specified an unsupported FFT size.""" pass class cufftUnalignedData(cufftError): """Input or output does not satisfy texture alignment requirements.""" pass cufftExceptions = { 0x1: cufftInvalidPlan, 0x2: cufftAllocFailed, 0x3: cufftInvalidType, 0x4: cufftInvalidValue, 0x5: cufftInternalError, 0x6: cufftExecFailed, 0x7: cufftSetupFailed, 0x8: cufftInvalidSize, 0x9: cufftUnalignedData } class _types: """Some alias types.""" plan = ctypes.c_int stream = ctypes.c_void_p def cufftCheckStatus(status): """Raise an exception if the specified CUBLAS status is an error.""" if status != 0: try: raise cufftExceptions[status] except KeyError: raise cufftError # Data transformation types: CUFFT_R2C = 0x2a CUFFT_C2R = 0x2c CUFFT_C2C = 0x29 CUFFT_D2Z = 0x6a CUFFT_Z2D = 0x6c CUFFT_Z2Z = 0x69 # Transformation directions: CUFFT_FORWARD = -1 CUFFT_INVERSE = 1 # FFTW compatibility modes: CUFFT_COMPATIBILITY_NATIVE = 0x00 CUFFT_COMPATIBILITY_FFTW_PADDING = 0x01 CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC = 0x02 CUFFT_COMPATIBILITY_FFTW_ALL = 0x03 # FFT functions implemented by CUFFT: _libcufft.cufftPlan1d.restype = int _libcufft.cufftPlan1d.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_int] def cufftPlan1d(nx, fft_type, batch): """ Create 1D FFT plan configuration. 
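    Notes
    -----
    A minimal single precision forward transform, assuming PyCUDA for the
    device arrays (this module itself only needs raw device pointers)::

        import numpy as np
        import pycuda.autoinit
        import pycuda.gpuarray as gpuarray
        import skcuda.cufft as cufft

        x = np.random.rand(8).astype(np.complex64)
        x_gpu = gpuarray.to_gpu(x)
        y_gpu = gpuarray.empty(8, np.complex64)
        plan = cufft.cufftPlan1d(8, cufft.CUFFT_C2C, 1)
        cufft.cufftExecC2C(plan, int(x_gpu.gpudata), int(y_gpu.gpudata),
                           cufft.CUFFT_FORWARD)
        cufft.cufftDestroy(plan)
        np.allclose(y_gpu.get(), np.fft.fft(x), atol=1e-4)   # expected: True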
References ---------- `cufftPlan1d `_ """ plan = _types.plan() status = _libcufft.cufftPlan1d(ctypes.byref(plan), nx, fft_type, batch) cufftCheckStatus(status) return plan _libcufft.cufftPlan2d.restype = int _libcufft.cufftPlan2d.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_int] def cufftPlan2d(nx, ny, fft_type): """ Create 2D FFT plan configuration. References ---------- `cufftPlan2d `_ """ plan = _types.plan() status = _libcufft.cufftPlan2d(ctypes.byref(plan), nx, ny, fft_type) cufftCheckStatus(status) return plan _libcufft.cufftPlan3d.restype = int _libcufft.cufftPlan3d.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int] def cufftPlan3d(nx, ny, nz, fft_type): """ Create 3D FFT plan configuration. References ---------- `cufftPlan3d `_ """ plan = _types.plan() status = _libcufft.cufftPlan3d(ctypes.byref(plan), nx, ny, nz, fft_type) cufftCheckStatus(status) return plan _libcufft.cufftPlanMany.restype = int _libcufft.cufftPlanMany.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int] def cufftPlanMany(rank, n, inembed, istride, idist, onembed, ostride, odist, fft_type, batch): """ Create batched FFT plan configuration. References ---------- `cufftPlanMany `_ """ plan = _types.plan() status = _libcufft.cufftPlanMany(ctypes.byref(plan), rank, n, inembed, istride, idist, onembed, ostride, odist, fft_type, batch) cufftCheckStatus(status) return plan _libcufft.cufftDestroy.restype = int _libcufft.cufftDestroy.argtypes = [_types.plan] def cufftDestroy(plan): """Destroy FFT plan. References ---------- `cufftDestroy `_ """ status = _libcufft.cufftDestroy(plan) cufftCheckStatus(status) _libcufft.cufftSetCompatibilityMode.restype = int _libcufft.cufftSetCompatibilityMode.argtypes = [_types.plan, ctypes.c_int] def cufftSetCompatibilityMode(plan, mode): """ Set FFTW compatibility mode. References ---------- `cufftSetCompatibilityMode `_ """ status = _libcufft.cufftSetCompatibilityMode(plan, mode) cufftCheckStatus(status) _libcufft.cufftExecC2C.restype = int _libcufft.cufftExecC2C.argtypes = [_types.plan, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cufftExecC2C(plan, idata, odata, direction): """Execute single precision complex-to-complex transform plan as specified by `direction`. References ---------- `cufftExecC2C `_ """ status = _libcufft.cufftExecC2C(plan, idata, odata, direction) cufftCheckStatus(status) _libcufft.cufftExecR2C.restype = int _libcufft.cufftExecR2C.argtypes = [_types.plan, ctypes.c_void_p, ctypes.c_void_p] def cufftExecR2C(plan, idata, odata): """ Execute single precision real-to-complex forward transform plan. References ---------- `cufftExecR2C `_ """ status = _libcufft.cufftExecR2C(plan, idata, odata) cufftCheckStatus(status) _libcufft.cufftExecC2R.restype = int _libcufft.cufftExecC2R.argtypes = [_types.plan, ctypes.c_void_p, ctypes.c_void_p] def cufftExecC2R(plan, idata, odata): """ Execute single precision complex-to-real reverse transform plan. References ---------- `cufftExecC2R `_ """ status = _libcufft.cufftExecC2R(plan, idata, odata) cufftCheckStatus(status) _libcufft.cufftExecZ2Z.restype = int _libcufft.cufftExecZ2Z.argtypes = [_types.plan, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def cufftExecZ2Z(plan, idata, odata, direction): """ Execute double precision complex-to-complex transform plan as specified by `direction`. 
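    Notes
    -----
    `direction` should be one of the module-level constants `CUFFT_FORWARD`
    or `CUFFT_INVERSE`. As in FFTW, the inverse transform is unnormalized,
    so a forward/inverse round trip returns the input scaled by the
    transform length. A sketch, assuming PyCUDA for the device arrays::

        import numpy as np
        import pycuda.autoinit
        import pycuda.gpuarray as gpuarray
        import skcuda.cufft as cufft

        n = 16
        x_gpu = gpuarray.to_gpu(np.random.rand(n).astype(np.complex128))
        y_gpu = gpuarray.empty(n, np.complex128)
        plan = cufft.cufftPlan1d(n, cufft.CUFFT_Z2Z, 1)
        cufft.cufftExecZ2Z(plan, int(x_gpu.gpudata), int(y_gpu.gpudata),
                           cufft.CUFFT_FORWARD)
        cufft.cufftExecZ2Z(plan, int(y_gpu.gpudata), int(y_gpu.gpudata),
                           cufft.CUFFT_INVERSE)
        cufft.cufftDestroy(plan)
        np.allclose(y_gpu.get()/n, x_gpu.get())   # expected: True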
References ---------- `cufftExecZ2Z `_ """ status = _libcufft.cufftExecZ2Z(plan, idata, odata, direction) cufftCheckStatus(status) _libcufft.cufftExecD2Z.restype = int _libcufft.cufftExecD2Z.argtypes = [_types.plan, ctypes.c_void_p, ctypes.c_void_p] def cufftExecD2Z(plan, idata, odata): """ Execute double precision real-to-complex forward transform plan. References ---------- `cufftExecD2Z `_ """ status = _libcufft.cufftExecD2Z(plan, idata, odata) cufftCheckStatus(status) _libcufft.cufftExecZ2D.restype = int _libcufft.cufftExecZ2D.argtypes = [_types.plan, ctypes.c_void_p, ctypes.c_void_p] def cufftExecZ2D(plan, idata, odata): """ Execute double precision complex-to-real transform plan. References ---------- `cufftExecZ2D `_ """ status = _libcufft.cufftExecZ2D(plan, idata, odata) cufftCheckStatus(status) _libcufft.cufftSetStream.restype = int _libcufft.cufftSetStream.argtypes = [_types.plan, _types.stream] def cufftSetStream(plan, stream): """ Associate a CUDA stream with a CUFFT plan. References ---------- `cufftSetStream `_ """ status = _libcufft.cufftSetStream(plan, stream) cufftCheckStatus(status) scikit-cuda-0.5.1/skcuda/cula.py000066400000000000000000001717171261465507300165140ustar00rootroot00000000000000#!/usr/bin/env python """ Python interface to CULA toolkit. """ from __future__ import absolute_import import sys import ctypes import atexit import numpy as np from . import cuda # Load CULA library: if 'linux' in sys.platform: _libcula_libname_list = ['libcula_lapack.so', 'libcula_lapack_basic.so', 'libcula.so'] elif sys.platform == 'darwin': _libcula_libname_list = ['libcula_lapack.dylib', 'libcula.dylib'] elif sys.platform == 'win32': _libcula_libname_list = ['cula_lapack.dll', 'cula_lapack_basic.dll'] else: raise RuntimeError('unsupported platform') _load_err = '' for _lib in _libcula_libname_list: try: _libcula = ctypes.cdll.LoadLibrary(_lib) except OSError: _load_err += ('' if _load_err == '' else ', ') + _lib else: _load_err = '' break if _load_err: raise OSError('%s not found' % _load_err) # Check whether the free or standard version of the toolkit is # installed by trying to access a function that is only available in # the latter: try: _libcula.culaDeviceMalloc except AttributeError: _libcula_toolkit = 'free' else: _libcula_toolkit = 'standard' # Function for retrieving string associated with specific CULA error # code: _libcula.culaGetStatusString.restype = ctypes.c_char_p _libcula.culaGetStatusString.argtypes = [ctypes.c_int] def culaGetStatusString(e): """ Get string associated with the specified CULA status code. Parameters ---------- e : int Status code. Returns ------- s : str Status string. 
""" return _libcula.culaGetStatusString(e) # Generic CULA error: class culaError(Exception): """CULA error.""" pass # Exceptions corresponding to various CULA errors: class culaNotFound(culaError): """CULA shared library not found""" pass class culaStandardNotFound(culaError): """Standard CULA Dense toolkit unavailable""" pass class culaNotInitialized(culaError): try: __doc__ = culaGetStatusString(1) except: pass pass class culaNoHardware(culaError): try: __doc__ = culaGetStatusString(2) except: pass pass class culaInsufficientRuntime(culaError): try: __doc__ = culaGetStatusString(3) except: pass pass class culaInsufficientComputeCapability(culaError): try: __doc__ = culaGetStatusString(4) except: pass pass class culaInsufficientMemory(culaError): try: __doc__ = culaGetStatusString(5) except: pass pass class culaFeatureNotImplemented(culaError): try: __doc__ = culaGetStatusString(6) except: pass pass class culaArgumentError(culaError): try: __doc__ = culaGetStatusString(7) except: pass pass class culaDataError(culaError): try: __doc__ = culaGetStatusString(8) except: pass pass class culaBlasError(culaError): try: __doc__ = culaGetStatusString(9) except: pass pass class culaRuntimeError(culaError): try: __doc__ = culaGetStatusString(10) except: pass pass class culaBadStorageFormat(culaError): try: __doc__ = culaGetStatusString(11) except: pass pass class culaInvalidReferenceHandle(culaError): try: __doc__ = culaGetStatusString(12) except: pass pass class culaUnspecifiedError(culaError): try: __doc__ = culaGetStatusString(13) except: pass pass culaExceptions = { -1: culaNotFound, 1: culaNotInitialized, 2: culaNoHardware, 3: culaInsufficientRuntime, 4: culaInsufficientComputeCapability, 5: culaInsufficientMemory, 6: culaFeatureNotImplemented, 7: culaArgumentError, 8: culaDataError, 9: culaBlasError, 10: culaRuntimeError, 11: culaBadStorageFormat, 12: culaInvalidReferenceHandle, 13: culaUnspecifiedError, } # CULA functions: _libcula.culaGetErrorInfo.restype = int def culaGetErrorInfo(): """ Returns extended information code for the last CULA error. Returns ------- err : int Extended information code. """ return _libcula.culaGetErrorInfo() _libcula.culaGetErrorInfoString.restype = int _libcula.culaGetErrorInfoString.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def culaGetErrorInfoString(e, i, bufsize=100): """ Returns a readable CULA error string. Returns a readable error string corresponding to a given CULA error code and extended error information code. Parameters ---------- e : int CULA error code. i : int Extended information code. bufsize : int Length of string to return. Returns ------- s : str Error string. """ buf = ctypes.create_string_buffer(bufsize) status = _libcula.culaGetErrorInfoString(e, i, buf, bufsize) culaCheckStatus(status) return buf.value def culaGetLastStatus(): """ Returns the last status code returned from a CULA function. Returns ------- s : int Status code. """ return _libcula.culaGetLastStatus() def culaCheckStatus(status): """ Raise an exception corresponding to the specified CULA status code. Parameters ---------- status : int CULA status code. """ if status != 0: error = culaGetErrorInfo() try: raise culaExceptions[status](error) except KeyError: raise culaError(error) _libcula.culaSelectDevice.restype = int _libcula.culaSelectDevice.argtypes = [ctypes.c_int] def culaSelectDevice(dev): """ Selects a device with which CULA will operate. Parameters ---------- dev : int GPU device number. 
Notes ----- Must be called before `culaInitialize`. """ status = _libcula.culaSelectDevice(dev) culaCheckStatus(status) _libcula.culaGetExecutingDevice.restype = int _libcula.culaGetExecutingDevice.argtypes = [ctypes.c_void_p] def culaGetExecutingDevice(): """ Reports the id of the GPU device used by CULA. Returns ------- dev : int Device id. """ dev = ctypes.c_int() status = _libcula.culaGetExecutingDevice(ctypes.byref(dev)) culaCheckStatus(status) return dev.value def culaFreeBuffers(): """ Releases any memory buffers stored internally by CULA. """ _libcula.culaFreeBuffers() _libcula.culaGetVersion.restype = int def culaGetVersion(): """ Report the version number of CULA. """ return _libcula.culaGetVersion() _libcula.culaGetCudaMinimumVersion.restype = int def culaGetCudaMinimumVersion(): """ Report the minimum version of CUDA required by CULA. """ return _libcula.culaGetCudaMinimumVersion() _libcula.culaGetCudaRuntimeVersion.restype = int def culaGetCudaRuntimeVersion(): """ Report the version of the CUDA runtime linked to by the CULA library. """ return _libcula.culaGetCudaRuntimeVersion() _libcula.culaGetCudaDriverVersion.restype = int def culaGetCudaDriverVersion(): """ Report the version of the CUDA driver installed on the system. """ return _libcula.culaGetCudaDriverVersion() _libcula.culaGetCublasMinimumVersion.restype = int def culaGetCublasMinimumVersion(): """ Report the version of CUBLAS required by CULA. """ return _libcula.culaGetCublasMinimumVersion() _libcula.culaGetCublasRuntimeVersion.restype = int def culaGetCublasRuntimeVersion(): """ Report the version of CUBLAS linked to by CULA. """ return _libcula.culaGetCublasRuntimeVersion() _libcula.culaGetDeviceCount.restype = int def culaGetDeviceCount(): """ Report the number of available GPU devices. """ return _libcula.culaGetDeviceCount() _libcula.culaInitialize.restype = int def culaInitialize(): """ Initialize CULA. Notes ----- Must be called before using any other CULA functions. """ status = _libcula.culaInitialize() culaCheckStatus(status) _libcula.culaShutdown.restype = int def culaShutdown(): """ Shuts down CULA. """ status = _libcula.culaShutdown() culaCheckStatus(status) # Shut down CULA upon exit: atexit.register(_libcula.culaShutdown) # LAPACK functions available in CULA Dense Free: # SGESV, CGESV _libcula.culaDeviceSgesv.restype = \ _libcula.culaDeviceCgesv.restype = int _libcula.culaDeviceSgesv.argtypes = \ _libcula.culaDeviceCgesv.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def culaDeviceSgesv(n, nrhs, a, lda, ipiv, b, ldb): """ Solve linear system with LU factorization. """ status = _libcula.culaDeviceSgesv(n, nrhs, int(a), lda, int(ipiv), int(b), ldb) culaCheckStatus(status) def culaDeviceCgesv(n, nrhs, a, lda, ipiv, b, ldb): """ Solve linear system with LU factorization. """ status = _libcula.culaDeviceCgesv(n, nrhs, int(a), lda, int(ipiv), int(b), ldb) culaCheckStatus(status) # SGETRF, CGETRF _libcula.culaDeviceSgetrf.restype = \ _libcula.culaDeviceCgetrf.restype = int _libcula.culaDeviceSgetrf.argtypes = \ _libcula.culaDeviceCgetrf.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def culaDeviceSgetrf(m, n, a, lda, ipiv): """ LU factorization. """ status = _libcula.culaDeviceSgetrf(m, n, int(a), lda, int(ipiv)) culaCheckStatus(status) def culaDeviceCgetrf(m, n, a, lda, ipiv): """ LU factorization. 
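    Notes
    -----
    `a` and `ipiv` are raw device pointers; the factorization overwrites
    `a` with its LU factors. A sketch with PyCUDA-managed memory (an
    assumption of this example; CULA itself only sees device addresses)::

        import numpy as np
        import pycuda.autoinit
        import pycuda.gpuarray as gpuarray
        import skcuda.cula as cula

        cula.culaInitialize()
        n = 4
        a = np.asfortranarray((np.random.rand(n, n) +
                               1j*np.random.rand(n, n)).astype(np.complex64))
        a_gpu = gpuarray.to_gpu(a)
        ipiv_gpu = gpuarray.zeros(n, np.int32)
        cula.culaDeviceCgetrf(n, n, a_gpu.gpudata, n, ipiv_gpu.gpudata)
        # a_gpu now holds L and U; ipiv_gpu holds the pivot indices.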
""" status = _libcula.culaDeviceCgetrf(m, n, int(a), lda, int(ipiv)) culaCheckStatus(status) # SGEQRF, CGEQRF _libcula.culaDeviceSgeqrf.restype = \ _libcula.culaDeviceCgeqrf.restype = int _libcula.culaDeviceSgeqrf.argtypes = \ _libcula.culaDeviceCgeqrf.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def culaDeviceSgeqrf(m, n, a, lda, tau): """ QR factorization. """ status = _libcula.culaDeviceSgeqrf(m, n, int(a), lda, int(tau)) culaCheckStatus(status) def culaDeviceCgeqrf(m, n, a, lda, tau): """ QR factorization. """ status = _libcula.culaDeviceCgeqrf(m, n, int(a), lda, int(tau)) culaCheckStatus(status) # SORGQR, CUNGQR _libcula.culaDeviceSorgqr.restype = \ _libcula.culaDeviceCungqr.restype = int _libcula.culaDeviceSorgqr.argtypes = \ _libcula.culaDeviceCungqr.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def culaDeviceSorgqr(m, n, k, a, lda, tau): """ QR factorization - Generate Q from QR factorization """ status = _libcula.culaDeviceSorgqr(m, n, k, int(a), lda, int(tau)) culaCheckStatus(status) def culaDeviceCungqr(m, n, k, a, lda, tau): """ QR factorization - Generate Q from QR factorization """ status = _libcula.culaDeviceCungqr(m, n, k, int(a), lda, int(tau)) culaCheckStatus(status) # SGELS, CGELS _libcula.culaDeviceSgels.restype = \ _libcula.culaDeviceCgels.restype = int _libcula.culaDeviceSgels.argtypes = \ _libcula.culaDeviceCgels.argtypes = [ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def culaDeviceSgels(trans, m, n, nrhs, a, lda, b, ldb): """ Solve linear system with QR or LQ factorization. """ trans = trans.encode('ascii') status = _libcula.culaDeviceSgels(trans, m, n, nrhs, int(a), lda, int(b), ldb) culaCheckStatus(status) def culaDeviceCgels(trans, m, n, nrhs, a, lda, b, ldb): """ Solve linear system with QR or LQ factorization. """ trans = trans.encode('ascii') status = _libcula.culaDeviceCgels(trans, m, n, nrhs, int(a), lda, int(b), ldb) culaCheckStatus(status) # SGGLSE, CGGLSE _libcula.culaDeviceSgglse.restype = \ _libcula.culaDeviceCgglse.restype = int _libcula.culaDeviceSgglse.argtypes = \ _libcula.culaDeviceCgglse.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p] def culaDeviceSgglse(m, n, p, a, lda, b, ldb, c, d, x): """ Solve linear equality-constrained least squares problem. """ status = _libcula.culaDeviceSgglse(m, n, p, int(a), lda, int(b), ldb, int(c), int(d), int(x)) culaCheckStatus(status) def culaDeviceCgglse(m, n, p, a, lda, b, ldb, c, d, x): """ Solve linear equality-constrained least squares problem. """ status = _libcula.culaDeviceCgglse(m, n, p, int(a), lda, int(b), ldb, int(c), int(d), int(x)) culaCheckStatus(status) # SGESVD, CGESVD _libcula.culaDeviceSgesvd.restype = \ _libcula.culaDeviceCgesvd.restype = int _libcula.culaDeviceSgesvd.argtypes = \ _libcula.culaDeviceCgesvd.argtypes = [ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def culaDeviceSgesvd(jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt): """ SVD decomposition. 
""" jobu = jobu.encode('ascii') jobvt = jobvt.encode('ascii') status = _libcula.culaDeviceSgesvd(jobu, jobvt, m, n, int(a), lda, int(s), int(u), ldu, int(vt), ldvt) culaCheckStatus(status) def culaDeviceCgesvd(jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt): """ SVD decomposition. """ jobu = jobu.encode('ascii') jobvt = jobvt.encode('ascii') status = _libcula.culaDeviceCgesvd(jobu, jobvt, m, n, int(a), lda, int(s), int(u), ldu, int(vt), ldvt) culaCheckStatus(status) # LAPACK functions available in CULA Dense: # DGESV, ZGESV try: _libcula.culaDeviceDgesv.restype = \ _libcula.culaDeviceZgesv.restype = int _libcula.culaDeviceDgesv.argtypes = \ _libcula.culaDeviceZgesv.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] except AttributeError: def culaDeviceDgesv(n, nrhs, a, lda, ipiv, b, ldb): """ Solve linear system with LU factorization. """ raise NotImplementedError('CULA Dense required') def culaDeviceZgesv(n, nrhs, a, lda, ipiv, b, ldb): """ Solve linear system with LU factorization. """ raise NotImplementedError('CULA Dense required') else: def culaDeviceDgesv(n, nrhs, a, lda, ipiv, b, ldb): """ Solve linear system with LU factorization. """ status = _libcula.culaDeviceDgesv(n, nrhs, int(a), lda, int(ipiv), int(b), ldb) culaCheckStatus(status) def culaDeviceZgesv(n, nrhs, a, lda, ipiv, b, ldb): """ Solve linear system with LU factorization. """ status = _libcula.culaDeviceZgesv(n, nrhs, int(a), lda, int(ipiv), int(b), ldb) culaCheckStatus(status) # DGETRF, ZGETRF try: _libcula.culaDeviceDgetrf.restype = \ _libcula.culaDeviceZgetrf.restype = int _libcula.culaDeviceDgetrf.argtypes = \ _libcula.culaDeviceZgetrf.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] except AttributeError: def culaDeviceDgetrf(m, n, a, lda, ipiv): """ LU factorization. """ raise NotImplementedError('CULA Dense required') def culaDeviceZgetrf(m, n, a, lda, ipiv): """ LU factorization. """ raise NotImplementedError('CULA Dense required') else: def culaDeviceDgetrf(m, n, a, lda, ipiv): """ LU factorization. """ status = _libcula.culaDeviceDgetrf(m, n, int(a), lda, int(ipiv)) culaCheckStatus(status) def culaDeviceZgetrf(m, n, a, lda, ipiv): """ LU factorization. """ status = _libcula.culaDeviceZgetrf(m, n, int(a), lda, int(ipiv)) culaCheckStatus(status) # DGEQRF, ZGEQRF try: _libcula.culaDeviceDgeqrf.restype = \ _libcula.culaDeviceZgeqrf.restype = int _libcula.culaDeviceDgeqrf.argtypes = \ _libcula.culaDeviceZgeqrf.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] except AttributeError: def culaDeviceDgeqrf(m, n, a, lda, tau): """ QR factorization. """ raise NotImplementedError('CULA Dense required') def culaDeviceZgeqrf(m, n, a, lda, tau): """ QR factorization. """ raise NotImplementedError('CULA Dense required') else: def culaDeviceDgeqrf(m, n, a, lda, tau): """ QR factorization. """ status = _libcula.culaDeviceDgeqrf(m, n, int(a), lda, int(tau)) culaCheckStatus(status) def culaDeviceZgeqrf(m, n, a, lda, tau): """ QR factorization. 
""" status = _libcula.culaDeviceZgeqrf(m, n, int(a), lda, int(tau)) culaCheckStatus(status) # DORGQR, ZUNGQR try: _libcula.culaDeviceDorgqr.restype = \ _libcula.culaDeviceZungqr.restype = int _libcula.culaDeviceDorgqr.argtypes = \ _libcula.culaDeviceZungqr.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] except AttributeError: def culaDeviceDorgqr(m, n, k, a, lda, tau): """ QR factorization - Generate Q from QR factorization """ raise NotImplementedError('CULA Dense required') def culaDeviceDorgqr(m, n, k, a, lda, tau): """ QR factorization - Generate Q from QR factorization """ raise NotImplementedError('CULA Dense required') else: def culaDeviceDorgqr(m, n, k, a, lda, tau): """ QR factorization. """ status = _libcula.culaDeviceDorgqr(m, n, k, int(a), lda, int(tau)) culaCheckStatus(status) def culaDeviceZungqr(m, n, k, a, lda, tau): """ QR factorization. """ status = _libcula.culaDeviceZungqr(m, n, k, int(a), lda, int(tau)) culaCheckStatus(status) # SGETRI, CGETRI, DGETRI, ZGETRI try: _libcula.culaDeviceSgetri.restype = \ _libcula.culaDeviceCgetri.restype = \ _libcula.culaDeviceDgetri.restype = \ _libcula.culaDeviceZgetri.restype = int _libcula.culaDeviceSgetri.argtypes = \ _libcula.culaDeviceCgetri.argtypes = \ _libcula.culaDeviceDgetri.argtypes = \ _libcula.culaDeviceZgetri.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] except AttributeError: def culaDeviceSgetri(n, a, lda, ipiv): """ Compute Inverse Matrix. """ raise NotImplementedError('CULA Dense required') def culaDeviceCgetri(n, a, lda, ipiv): """ Compute Inverse Matrix. """ raise NotImplementedError('CULA Dense required') def culaDeviceDgetri(n, a, lda, ipiv): """ Compute Inverse Matrix. """ raise NotImplementedError('CULA Dense required') def culaDeviceZgetri(n, a, lda, ipiv): """ Compute Inverse Matrix. """ raise NotImplementedError('CULA Dense required') else: def culaDeviceSgetri(n, a, lda, ipiv): """ Compute Inverse Matrix. """ status = _libcula.culaDeviceSgetri(n, int(a), lda, int(ipiv)) culaCheckStatus(status) def culaDeviceCgetri(n, a, lda, ipiv): """ Compute Inverse Matrix. """ status = _libcula.culaDeviceCgetri(n, int(a), lda, int(ipiv)) culaCheckStatus(status) def culaDeviceDgetri(n, a, lda, ipiv): """ Compute Inverse Matrix. """ status = _libcula.culaDeviceDgetri(n, int(a), lda, int(ipiv)) culaCheckStatus(status) def culaDeviceZgetri(n, a, lda, ipiv): """ Compute Inverse Matrix. """ status = _libcula.culaDeviceZgetri(n, int(a), lda, int(ipiv)) culaCheckStatus(status) # DGELS, ZGELS try: _libcula.culaDeviceDgels.restype = \ _libcula.culaDeviceZgels.restype = int _libcula.culaDeviceDgels.argtypes = \ _libcula.culaDeviceZgels.argtypes = [ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] except AttributeError: def culaDeviceDgels(trans, m, n, nrhs, a, lda, b, ldb): """ Solve linear system with QR or LQ factorization. """ raise NotImplementedError('CULA Dense required') def culaDeviceZgels(trans, m, n, nrhs, a, lda, b, ldb): """ Solve linear system with QR or LQ factorization. """ raise NotImplementedError('CULA Dense required') else: def culaDeviceDgels(trans, m, n, nrhs, a, lda, b, ldb): """ Solve linear system with QR or LQ factorization. 
""" trans = trans.encode('ascii') status = _libcula.culaDeviceDgels(trans, m, n, nrhs, int(a), lda, int(b), ldb) culaCheckStatus(status) def culaDeviceZgels(trans, m, n, nrhs, a, lda, b, ldb): """ Solve linear system with QR or LQ factorization. """ trans = trans.encode('ascii') status = _libcula.culaDeviceZgels(trans, m, n, nrhs, int(a), lda, int(b), ldb) culaCheckStatus(status) # DGGLSE, ZGGLSE try: _libcula.culaDeviceDgglse.restype = \ _libcula.culaDeviceZgglse.restype = int _libcula.culaDeviceDgglse.argtypes = \ _libcula.culaDeviceZgglse.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p] except AttributeError: def culaDeviceDgglse(m, n, p, a, lda, b, ldb, c, d, x): """ Solve linear equality-constrained least squares problem. """ raise NotImplementedError('CULA Dense required') def culaDeviceZgglse(m, n, p, a, lda, b, ldb, c, d, x): """ Solve linear equality-constrained least squares problem. """ raise NotImplementedError('CULA Dense required') else: def culaDeviceDgglse(m, n, p, a, lda, b, ldb, c, d, x): """ Solve linear equality-constrained least squares problem. """ status = _libcula.culaDeviceDgglse(m, n, p, int(a), lda, int(b), ldb, int(c), int(d), int(x)) culaCheckStatus(status) def culaDeviceZgglse(m, n, p, a, lda, b, ldb, c, d, x): """ Solve linear equality-constrained least squares problem. """ status = _libcula.culaDeviceZgglse(m, n, p, int(a), lda, int(b), ldb, int(c), int(d), int(x)) culaCheckStatus(status) # DGESVD, ZGESVD try: _libcula.culaDeviceDgesvd.restype = \ _libcula.culaDeviceZgesvd.restype = int _libcula.culaDeviceDgesvd.argtypes = \ _libcula.culaDeviceZgesvd.argtypes = [ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] except AttributeError: def culaDeviceDgesvd(jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt): """ SVD decomposition. """ raise NotImplementedError('CULA Dense required') def culaDeviceZgesvd(jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt): """ SVD decomposition. """ raise NotImplementedError('CULA Dense required') else: def culaDeviceDgesvd(jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt): """ SVD decomposition. """ jobu = jobu.encode('ascii') jobvt = jobvt.encode('ascii') status = _libcula.culaDeviceDgesvd(jobu, jobvt, m, n, int(a), lda, int(s), int(u), ldu, int(vt), ldvt) culaCheckStatus(status) def culaDeviceZgesvd(jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt): """ SVD decomposition. """ jobu = jobu.encode('ascii') jobvt = jobvt.encode('ascii') status = _libcula.culaDeviceZgesvd(jobu, jobvt, m, n, int(a), lda, int(s), int(u), ldu, int(vt), ldvt) culaCheckStatus(status) # SPOSV, CPOSV, DPOSV, ZPOSV try: _libcula.culaDeviceSposv.restype = \ _libcula.culaDeviceCposv.restype = \ _libcula.culaDeviceDposv.restype = \ _libcula.culaDeviceZposv.restype = int _libcula.culaDeviceSposv.argtypes = \ _libcula.culaDeviceCposv.argtypes = \ _libcula.culaDeviceDposv.argtypes = \ _libcula.culaDeviceZposv.argtypes = [ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] except AttributeError: def culaDeviceSposv(upio, n, nrhs, a, lda, b, ldb): """ Solve positive definite linear system with Cholesky factorization. """ raise NotImplementedError('CULA Dense required') def culaDeviceCposv(upio, n, nrhs, a, lda, b, ldb): """ Solve positive definite linear system with Cholesky factorization. 
""" raise NotImplementedError('CULA Dense required') def culaDeviceDposv(upio, n, nrhs, a, lda, b, ldb): """ Solve positive definite linear system with Cholesky factorization. """ raise NotImplementedError('CULA Dense required') def culaDeviceDposv(upio, n, nrhs, a, lda, b, ldb): """ Solve positive definite linear system with Cholesky factorization. """ raise NotImplementedError('CULA Dense required') else: def culaDeviceSposv(upio, n, nrhs, a, lda, b, ldb): """ Solve positive definite linear system with Cholesky factorization. """ upio = upio.encode('ascii') status = _libcula.culaDeviceSposv(upio, n, nrhs, int(a), lda, int(b), ldb) culaCheckStatus(status) def culaDeviceCposv(upio, n, nrhs, a, lda, b, ldb): """ Solve positive definite linear system with Cholesky factorization. """ upio = upio.encode('ascii') status = _libcula.culaDeviceCposv(upio, n, nrhs, int(a), lda, int(b), ldb) culaCheckStatus(status) def culaDeviceDposv(upio, n, nrhs, a, lda, b, ldb): """ Solve positive definite linear system with Cholesky factorization. """ upio = upio.encode('ascii') status = _libcula.culaDeviceDposv(upio, n, nrhs, int(a), lda, int(b), ldb) culaCheckStatus(status) def culaDeviceZposv(upio, n, nrhs, a, lda, b, ldb): """ Solve positive definite linear system with Cholesky factorization. """ upio = upio.encode('ascii') status = _libcula.culaDeviceZposv(upio, n, nrhs, int(a), lda, int(b), ldb) culaCheckStatus(status) # SPOTRF, CPOTRF, DPOTRF, ZPOTRF try: _libcula.culaDeviceSpotrf.restype = \ _libcula.culaDeviceCpotrf.restype = \ _libcula.culaDeviceDpotrf.restype = \ _libcula.culaDeviceZpotrf.restype = int _libcula.culaDeviceSpotrf.argtypes = \ _libcula.culaDeviceCpotrf.argtypes = \ _libcula.culaDeviceDpotrf.argtypes = \ _libcula.culaDeviceZpotrf.argtypes = [ctypes.c_char, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] except AttributeError: def culaDeviceSpotrf(uplo, n, a, lda): """ Cholesky factorization. """ raise NotImplementedError('CULA Dense required') def culaDeviceCpotrf(uplo, n, a, lda): """ Cholesky factorization. """ raise NotImplementedError('CULA Dense required') def culaDeviceDpotrf(uplo, n, a, lda): """ Cholesky factorization. """ raise NotImplementedError('CULA Dense required') def culaDeviceZpotrf(uplo, n, a, lda): """ Cholesky factorization. """ raise NotImplementedError('CULA Dense required') else: def culaDeviceSpotrf(uplo, n, a, lda): """ Cholesky factorization. """ uplo = uplo.encode('ascii') status = _libcula.culaDeviceSpotrf(uplo, n, int(a), lda) culaCheckStatus(status) def culaDeviceCpotrf(uplo, n, a, lda): """ Cholesky factorization. """ uplo = uplo.encode('ascii') status = _libcula.culaDeviceCpotrf(uplo, n, int(a), lda) culaCheckStatus(status) def culaDeviceDpotrf(uplo, n, a, lda): """ Cholesky factorization. """ uplo = uplo.encode('ascii') status = _libcula.culaDeviceDpotrf(uplo, n, int(a), lda) culaCheckStatus(status) def culaDeviceZpotrf(uplo, n, a, lda): """ Cholesky factorization. 
""" uplo = uplo.encode('ascii') status = _libcula.culaDeviceZpotrf(uplo, n, int(a), lda) culaCheckStatus(status) # SSYEV, DSYEV, CHEEV, ZHEEV try: _libcula.culaDeviceSsyev.restype = \ _libcula.culaDeviceDsyev.restype = \ _libcula.culaDeviceCheev.restype = \ _libcula.culaDeviceZheev.restype = int _libcula.culaDeviceSsyev.argtypes = \ _libcula.culaDeviceDsyev.argtypes = \ _libcula.culaDeviceCheev.argtypes = \ _libcula.culaDeviceZheev.argtypes = [ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] except AttributeError: def culaDeviceSsyev(jobz, uplo, n, a, lda, w): """ Symmetric eigenvalue decomposition. """ raise NotImplementedError('CULA Dense required') def culaDeviceDsyev(jobz, uplo, n, a, lda, w): """ Symmetric eigenvalue decomposition. """ raise NotImplementedError('CULA Dense required') def culaDeviceCheev(jobz, uplo, n, a, lda, w): """ Hermitian eigenvalue decomposition. """ raise NotImplementedError('CULA Dense required') def culaDeviceZheev(jobz, uplo, n, a, lda, w): """ Hermitian eigenvalue decomposition. """ raise NotImplementedError('CULA Dense required') else: def culaDeviceSsyev(jobz, uplo, n, a, lda, w): """ Symmetric eigenvalue decomposition. """ jobz = jobz.encode('ascii') uplo = uplo.encode('ascii') status = _libcula.culaDeviceSsyev(jobz, uplo, n, int(a), lda, int(w)) culaCheckStatus(status) def culaDeviceDsyev(jobz, uplo, n, a, lda, w): """ Symmetric eigenvalue decomposition. """ jobz = jobz.encode('ascii') uplo = uplo.encode('ascii') status = _libcula.culaDeviceDsyev(jobz, uplo, n, int(a), lda, int(w)) culaCheckStatus(status) def culaDeviceCheev(jobz, uplo, n, a, lda, w): """ Hermitian eigenvalue decomposition. """ jobz = jobz.encode('ascii') uplo = uplo.encode('ascii') status = _libcula.culaDeviceCheev(jobz, uplo, n, int(a), lda, int(w)) culaCheckStatus(status) def culaDeviceZheev(jobz, uplo, n, a, lda, w): """ Hermitian eigenvalue decomposition. """ jobz = jobz.encode('ascii') uplo = uplo.encode('ascii') status = _libcula.culaDeviceZheev(jobz, uplo, n, int(a), lda, int(w)) culaCheckStatus(status) # BLAS routines provided by CULA: # SGEMM, DGEMM, CGEMM, ZGEMM _libcula.culaDeviceSgemm.restype = \ _libcula.culaDeviceDgemm.restype = \ _libcula.culaDeviceCgemm.restype = \ _libcula.culaDeviceZgemm.restype = int _libcula.culaDeviceSgemm.argtypes = [ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int] _libcula.culaDeviceDgemm.argtypes = [ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_double, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_double, ctypes.c_void_p, ctypes.c_int] _libcula.culaDeviceCgemm.argtypes = [ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_int, cuda.cuFloatComplex, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, cuda.cuFloatComplex, ctypes.c_void_p, ctypes.c_int] _libcula.culaDeviceZgemm.argtypes = [ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_int, cuda.cuDoubleComplex, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, cuda.cuDoubleComplex, ctypes.c_void_p, ctypes.c_int] def culaDeviceSgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc): """ Matrix-matrix product for general matrix. 
""" transa = transa.encode('ascii') transb = transb.encode('ascii') status = _libcula.culaDeviceSgemm(transa, transb, m, n, k, alpha, int(A), lda, int(B), ldb, beta, int(C), ldc) culaCheckStatus(status) def culaDeviceDgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc): """ Matrix-matrix product for general matrix. """ transa = transa.encode('ascii') transb = transb.encode('ascii') status = _libcula.culaDeviceDgemm(transa, transb, m, n, k, alpha, int(A), lda, int(B), ldb, beta, int(C), ldc) culaCheckStatus(status) def culaDeviceCgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc): """ Matrix-matrix product for complex general matrix. """ transa = transa.encode('ascii') transb = transb.encode('ascii') status = _libcula.culaDeviceCgemm(transa, transb, m, n, k, cuda.cuFloatComplex(alpha.real, alpha.imag), int(A), lda, int(B), ldb, cuda.cuFloatComplex(beta.real, beta.imag), int(C), ldc) culaCheckStatus(status) def culaDeviceZgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc): """ Matrix-matrix product for complex general matrix. """ transa = transa.encode('ascii') transb = transb.encode('ascii') status = _libcula.culaDeviceZgemm(transa, transb, m, n, k, cuda.cuDoubleComplex(alpha.real, alpha.imag), int(A), lda, int(B), ldb, cuda.cuDoubleComplex(beta.real, beta.imag), int(C), ldc) culaCheckStatus(status) # SGEMV, DGEMV, CGEMV, ZGEMV _libcula.culaDeviceSgemv.restype = \ _libcula.culaDeviceDgemv.restype = \ _libcula.culaDeviceCgemv.restype = \ _libcula.culaDeviceZgemv.restype = int _libcula.culaDeviceSgemv.argtypes = [ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int] _libcula.culaDeviceDgemv.argtypes = [ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_double, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_double, ctypes.c_void_p, ctypes.c_int] _libcula.culaDeviceCgemv.argtypes = [ctypes.c_char, ctypes.c_int, ctypes.c_int, cuda.cuFloatComplex, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, cuda.cuFloatComplex, ctypes.c_void_p, ctypes.c_int] _libcula.culaDeviceZgemv.argtypes = [ctypes.c_char, ctypes.c_int, ctypes.c_int, cuda.cuDoubleComplex, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, cuda.cuDoubleComplex, ctypes.c_void_p, ctypes.c_int] def culaDeviceSgemv(trans, m, n, alpha, A, lda, x, incx, beta, y, incy): """ Matrix-vector product for real general matrix. """ trans = trans.encode('ascii') status = _libcula.culaDeviceSgemv(trans, m, n, alpha, int(A), lda, int(x), incx, beta, int(y), incy) culaCheckStatus(status) def culaDeviceDgemv(trans, m, n, alpha, A, lda, x, incx, beta, y, incy): """ Matrix-vector product for real general matrix. """ trans = trans.encode('ascii') status = _libcula.culaDeviceDgemv(trans, m, n, alpha, int(A), lda, int(x), incx, beta, int(y), incy) culaCheckStatus(status) def culaDeviceCgemv(trans, m, n, alpha, A, lda, x, incx, beta, y, incy): """ Matrix-vector product for complex general matrix. """ trans = trans.encode('ascii') status = _libcula.culaDeviceCgemv(trans, m, n, cuda.cuFloatComplex(alpha.real, alpha.imag), int(A), lda, int(x), incx, cuda.cuFloatComplex(beta.real, beta.imag), int(y), incy) culaCheckStatus(status) def culaDeviceZgemv(trans, m, n, alpha, A, lda, x, incx, beta, y, incy): """ Matrix-vector product for complex general matrix. 
""" trans = trans.encode('ascii') status = _libcula.culaDeviceZgemv(trans, m, n, cuda.cuDoubleComplex(alpha.real, alpha.imag), int(A), lda, int(x), incx, cuda.cuDoubleComplex(beta.real, beta.imag), int(y), incy) culaCheckStatus(status) #GEEV # Sgeev, Dgeev, Cgeev, Zgeev _libcula.culaDeviceSgeev.restype = \ _libcula.culaDeviceDgeev.restype = \ _libcula.culaDeviceCgeev.restype = \ _libcula.culaDeviceZgeev.restype = int _libcula.culaDeviceSgeev.argtypes = \ _libcula.culaDeviceDgeev.argtypes = [ctypes.c_char, #jobvl ctypes.c_char, #jobvr ctypes.c_int, #n, the order of the matrix ctypes.c_void_p, #a ctypes.c_int, #lda ctypes.c_void_p, #wr ctypes.c_void_p, #wi ctypes.c_void_p, #vl ctypes.c_int, #ldvl ctypes.c_void_p, #vr ctypes.c_int] #ldvr _libcula.culaDeviceCgeev.argtypes = \ _libcula.culaDeviceZgeev.argtypes = [ctypes.c_char, #jobvl ctypes.c_char, #jobvr ctypes.c_int, #n, the order of the matrix ctypes.c_void_p, #a ctypes.c_int, #lda ctypes.c_void_p, #w ctypes.c_void_p, #vl ctypes.c_int, #ldvl ctypes.c_void_p, #vr ctypes.c_int] #ldvr def culaDeviceSgeev(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr): """ General Eigenproblem solver. """ jobvl = jobvl.encode('ascii') jobvr = jobvr.encode('ascii') status = _libcula.culaDeviceSgeev(jobvl, jobvr, n, int(a), lda, int(wr), int(wi), int(vl), ldvl, int(vr), ldvr) culaCheckStatus(status) def culaDeviceDgeev(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr): """ General Eigenproblem solver. """ jobvl = jobvl.encode('ascii') jobvr = jobvr.encode('ascii') status = _libcula.culaDeviceDgeev(jobvl, jobvr, n, int(a), lda, int(wr), int(wi), int(vl), ldvl, int(vr), ldvr) culaCheckStatus(status) def culaDeviceCgeev(jobvl, jobvr, n, a, lda, w, vl, ldvl, vr, ldvr): """ General Eigenproblem solver. """ jobvl = jobvl.encode('ascii') jobvr = jobvr.encode('ascii') status = _libcula.culaDeviceCgeev(jobvl, jobvr, n, int(a), lda, int(w), int(vl), ldvl, int(vr), ldvr) culaCheckStatus(status) def culaDeviceZgeev(jobvl, jobvr, n, a, lda, w, vl, ldvl, vr, ldvr): """ General Eigenproblem solver. """ jobvl = jobvl.encode('ascii') jobvr = jobvr.encode('ascii') status = _libcula.culaDeviceZgeev(jobvl, jobvr, n, int(a), lda, int(w), int(vl), ldvl, int(vr), ldvr) culaCheckStatus(status) # Auxiliary routines: try: _libcula.culaDeviceSgeTranspose.restype = \ _libcula.culaDeviceDgeTranspose.restype = \ _libcula.culaDeviceCgeTranspose.restype = \ _libcula.culaDeviceZgeTranspose.restype = int _libcula.culaDeviceSgeTranspose.argtypes = \ _libcula.culaDeviceDgeTranspose.argtypes = \ _libcula.culaDeviceCgeTranspose.argtypes = \ _libcula.culaDeviceZgeTranspose.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] except AttributeError: def culaDeviceSgeTranspose(m, n, A, lda, B, ldb): """ Transpose of real general matrix. """ raise NotImplementedError('CULA Dense required') def culaDeviceDgeTranspose(m, n, A, lda, B, ldb): """ Transpose of real general matrix. """ raise NotImplementedError('CULA Dense required') def culaDeviceCgeTranspose(m, n, A, lda, B, ldb): """ Transpose of complex general matrix. """ raise NotImplementedError('CULA Dense required') def culaDeviceZgeTranspose(m, n, A, lda, B, ldb): """ Transpose of complex general matrix. """ raise NotImplementedError('CULA Dense required') else: def culaDeviceSgeTranspose(m, n, A, lda, B, ldb): """ Transpose of real general matrix. 
""" status = _libcula.culaDeviceSgeTranspose(m, n, int(A), lda, int(B), ldb) culaCheckStatus(status) def culaDeviceDgeTranspose(m, n, A, lda, B, ldb): """ Transpose of real general matrix. """ status = _libcula.culaDeviceDgeTranspose(m, n, int(A), lda, int(B), ldb) culaCheckStatus(status) def culaDeviceCgeTranspose(m, n, A, lda, B, ldb): """ Transpose of complex general matrix. """ status = _libcula.culaDeviceCgeTranspose(m, n, int(A), lda, int(B), ldb) culaCheckStatus(status) def culaDeviceZgeTranspose(m, n, A, lda, B, ldb): """ Transpose of complex general matrix. """ status = _libcula.culaDeviceZgeTranspose(m, n, int(A), lda, int(B), ldb) culaCheckStatus(status) try: _libcula.culaDeviceSgeTransposeInplace.restype = \ _libcula.culaDeviceDgeTransposeInplace.restype = \ _libcula.culaDeviceCgeTransposeInplace.restype = \ _libcula.culaDeviceZgeTransposeInplace.restype = int _libcula.culaDeviceSgeTransposeInplace.argtypes = \ _libcula.culaDeviceDgeTransposeInplace.argtypes = \ _libcula.culaDeviceCgeTransposeInplace.argtypes = \ _libcula.culaDeviceZgeTransposeInplace.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int] except AttributeError: def culaDeviceSgeTransposeInplace(n, A, lda): """ Inplace transpose of real square matrix. """ raise NotImplementedError('CULA Dense required') def culaDeviceDgeTransposeInplace(n, A, lda): """ Inplace transpose of real square matrix. """ raise NotImplementedError('CULA Dense required') def culaDeviceCgeTransposeInplace(n, A, lda): """ Inplace transpose of complex square matrix. """ raise NotImplementedError('CULA Dense required') def culaDeviceZgeTransposeInplace(n, A, lda): """ Inplace transpose of complex square matrix. """ raise NotImplementedError('CULA Dense required') else: def culaDeviceSgeTransposeInplace(n, A, lda): """ Inplace transpose of real square matrix. """ status = _libcula.culaDeviceSgeTransposeInplace(n, int(A), lda) culaCheckStatus(status) def culaDeviceDgeTransposeInplace(n, A, lda): """ Inplace transpose of real square matrix. """ status = _libcula.culaDeviceDgeTransposeInplace(n, int(A), lda) culaCheckStatus(status) def culaDeviceCgeTransposeInplace(n, A, lda): """ Inplace transpose of complex square matrix. """ status = _libcula.culaDeviceCgeTransposeInplace(n, int(A), lda) culaCheckStatus(status) def culaDeviceZgeTransposeInplace(n, A, lda): """ Inplace transpose of complex square matrix. """ status = _libcula.culaDeviceZgeTransposeInplace(n, int(A), lda) culaCheckStatus(status) try: _libcula.culaDeviceCgeTransposeConjugate.restype = \ _libcula.culaDeviceZgeTransposeConjugate.restype = int _libcula.culaDeviceCgeTransposeConjugate.argtypes = \ _libcula.culaDeviceZgeTransposeConjugate.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] except AttributeError: def culaDeviceCgeTransposeConjugate(m, n, A, lda, B, ldb): """ Conjugate transpose of complex general matrix. """ raise NotImplementedError('CULA Dense required') def culaDeviceZgeTransposeConjugate(m, n, A, lda, B, ldb): """ Conjugate transpose of complex general matrix. """ raise NotImplementedError('CULA Dense required') else: def culaDeviceCgeTransposeConjugate(m, n, A, lda, B, ldb): """ Conjugate transpose of complex general matrix. """ status = _libcula.culaDeviceCgeTransposeConjugate(m, n, int(A), lda, int(B), ldb) culaCheckStatus(status) def culaDeviceZgeTransposeConjugate(m, n, A, lda, B, ldb): """ Conjugate transpose of complex general matrix. 
""" status = _libcula.culaDeviceZgeTransposeConjugate(m, n, int(A), lda, int(B), ldb) culaCheckStatus(status) try: _libcula.culaDeviceCgeTransposeConjugateInplace.restype = \ _libcula.culaDeviceZgeTransposeConjugateInplace.restype = int _libcula.culaDeviceCgeTransposeConjugateInplace.argtypes = \ _libcula.culaDeviceZgeTransposeConjugateInplace.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int] except AttributeError: def culaDeviceCgeTransposeConjugateInplace(n, A, lda): """ Inplace conjugate transpose of complex square matrix. """ raise NotImplementedError('CULA Dense required') def culaDeviceZgeTransposeConjugateInplace(n, A, lda): """ Inplace conjugate transpose of complex square matrix. """ raise NotImplementedError('CULA Dense required') else: def culaDeviceCgeTransposeConjugateInplace(n, A, lda): """ Inplace conjugate transpose of complex square matrix. """ status = _libcula.culaDeviceCgeTransposeConjugateInplace(n, int(A), lda) culaCheckStatus(status) def culaDeviceZgeTransposeConjugateInplace(n, A, lda): """ Inplace conjugate transpose of complex square matrix. """ status = _libcula.culaDeviceZgeTransposeConjugateInplace(n, int(A), lda) culaCheckStatus(status) try: _libcula.culaDeviceCgeConjugate.restype = \ _libcula.culaDeviceZgeConjugate.restype = int _libcula.culaDeviceCgeConjugate.argtypes = \ _libcula.culaDeviceZgeConjugate.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] except AttributeError: def culaDeviceCgeConjugate(m, n, A, lda): """ Conjugate of complex general matrix. """ raise NotImplementedError('CULA Dense required') def culaDeviceZgeConjugate(m, n, A, lda): """ Conjugate of complex general matrix. """ raise NotImplementedError('CULA Dense required') else: def culaDeviceCgeConjugate(m, n, A, lda): """ Conjugate of complex general matrix. """ status = _libcula.culaDeviceCgeConjugate(m, n, int(A), lda) culaCheckStatus(status) def culaDeviceZgeConjugate(m, n, A, lda): """ Conjugate of complex general matrix. """ status = _libcula.culaDeviceZgeConjugate(m, n, int(A), lda) culaCheckStatus(status) try: _libcula.culaDeviceCtrConjugate.restype = \ _libcula.culaDeviceZtrConjugate.restype = int _libcula.culaDeviceCtrConjugate.argtypes = \ _libcula.culaDeviceZtrConjugate.argtypes = [ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] except AttributeError: def culaDeviceCtrConjugate(uplo, diag, m, n, A, lda): """ Conjugate of complex upper or lower triangle matrix. """ raise NotImplementedError('CULA Dense required') def culaDeviceZtrConjugate(uplo, diag, m, n, A, lda): """ Conjugate of complex upper or lower triangle matrix. """ raise NotImplementedError('CULA Dense required') else: def culaDeviceCtrConjugate(uplo, diag, m, n, A, lda): """ Conjugate of complex upper or lower triangle matrix. """ uplo = uplo.encode('ascii') status = _libcula.culaDeviceCtrConjugate(uplo, diag, m, n, int(A), lda) culaCheckStatus(status) def culaDeviceZtrConjugate(uplo, diag, m, n, A, lda): """ Conjugate of complex upper or lower triangle matrix. 
""" uplo = uplo.encode('ascii') status = _libcula.culaDeviceZtrConjugate(uplo, diag, m, n, int(A), lda) culaCheckStatus(status) try: _libcula.culaDeviceSgeNancheck.restype = \ _libcula.culaDeviceDgeNancheck.restype = \ _libcula.culaDeviceCgeNancheck.restype = \ _libcula.culaDeviceZgeNancheck.restype = int _libcula.culaDeviceSgeNancheck.argtypes = \ _libcula.culaDeviceDgeNancheck.argtypes = \ _libcula.culaDeviceCgeNancheck.argtypes = \ _libcula.culaDeviceZgeNancheck.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] except AttributeError: def culaDeviceSgeNancheck(m, n, A, lda): """ Check a real general matrix for invalid entries """ raise NotImplementedError('CULA Dense required') def culaDeviceDgeNancheck(m, n, A, lda): """ Check a real general matrix for invalid entries """ raise NotImplementedError('CULA Dense required') def culaDeviceCgeNancheck(m, n, A, lda): """ Check a complex general matrix for invalid entries """ raise NotImplementedError('CULA Dense required') def culaDeviceZgeNancheck(m, n, A, lda): """ Check a complex general matrix for invalid entries """ raise NotImplementedError('CULA Dense required') else: def culaDeviceSgeNancheck(m, n, A, lda): """ Check a real general matrix for invalid entries """ status = _libcula.culaDeviceSgeNancheck(m, n, int(A), lda) try: culaCheckStatus(status) except culaDataError: return True return False def culaDeviceDgeNancheck(m, n, A, lda): """ Check a real general matrix for invalid entries """ status = _libcula.culaDeviceDgeNancheck(m, n, int(A), lda) try: culaCheckStatus(status) except culaDataError: return True return False def culaDeviceCgeNancheck(m, n, A, lda): """ Check a complex general matrix for invalid entries """ status = _libcula.culaDeviceCgeNancheck(m, n, int(A), lda) try: culaCheckStatus(status) except culaDataError: return True return False def culaDeviceZgeNancheck(m, n, A, lda): """ Check a complex general matrix for invalid entries """ status = _libcula.culaDeviceZgeNancheck(m, n, int(A), lda) try: culaCheckStatus(status) except culaDataError: return True return False if __name__ == "__main__": import doctest doctest.testmod() scikit-cuda-0.5.1/skcuda/cusolver.py000066400000000000000000001102251261465507300174150ustar00rootroot00000000000000#!/usr/bin/env python """ Python interface to CUSOLVER functions. Note: this module does not explicitly depend on PyCUDA. """ from . import cudart if int(cudart._cudart_version) < 7000: raise ImportError('CUSOLVER library only available in CUDA 7.0 and later') import ctypes import sys import numpy as np from . 
import cuda # Load library: _version_list = [7.5, 7.0] if 'linux' in sys.platform: _libcusolver_libname_list = ['libcusolver.so'] + \ ['libsolver.so.%s' % v for v in _version_list] elif sys.platform == 'darwin': _libcusolver_libname_list = ['libcusolver.dylib'] elif sys.platform == 'win32': if sys.maxsize > 2**32: _libcusolver_libname_list = ['cusolver.dll'] + \ ['cusolver64_%s.dll' % int(10*v) for v in _version_list] else: _libcusolver_libname_list = ['cusolver.dll'] + \ ['cusolver32_%s.dll' % int(10*v) for v in _version_list] else: raise RuntimeError('unsupported platform') # Print understandable error message when library cannot be found: _libcusolver = None for _libcusolver_libname in _libcusolver_libname_list: try: if sys.platform == 'win32': _libcusolver = ctypes.windll.LoadLibrary(_libcusolver_libname) else: _libcusolver = ctypes.cdll.LoadLibrary(_libcusolver_libname) except OSError: pass else: break if _libcusolver == None: raise OSError('cusolver library not found') class CUSOLVER_ERROR(Exception): """CUSOLVER error.""" pass class CUSOLVER_STATUS_NOT_INITIALIZED(CUSOLVER_ERROR): """CUSOLVER library not initialized.""" pass class CUSOLVER_STATUS_ALLOC_FAILED(CUSOLVER_ERROR): """CUSOLVER memory allocation failed.""" pass class CUSOLVER_STATUS_INVALID_VALUE(CUSOLVER_ERROR): """Invalid value passed to CUSOLVER function.""" pass class CUSOLVER_STATUS_ARCH_MISMATCH(CUSOLVER_ERROR): """CUSOLVER architecture mismatch.""" pass class CUSOLVER_STATUS_MAPPING_ERROR(CUSOLVER_ERROR): """CUSOLVER mapping error.""" pass class CUSOLVER_STATUS_EXECUTION_FAILED(CUSOLVER_ERROR): """CUSOLVER execution failed.""" pass class CUSOLVER_STATUS_INTERNAL_ERROR(CUSOLVER_ERROR): """CUSOLVER internal error.""" pass class CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED(CUSOLVER_ERROR): """Matrix type not supported by CUSOLVER.""" pass class CUSOLVER_STATUS_NOT_SUPPORTED(CUSOLVER_ERROR): """Operation not supported by CUSOLVER.""" pass class CUSOLVER_STATUS_ZERO_PIVOT(CUSOLVER_ERROR): """Zero pivot encountered by CUSOLVER.""" pass class CUSOLVER_STATUS_INVALID_LICENSE(CUSOLVER_ERROR): """Invalid CUSOLVER license.""" pass CUSOLVER_EXCEPTIONS = { 1: CUSOLVER_STATUS_NOT_INITIALIZED, 2: CUSOLVER_STATUS_ALLOC_FAILED, 3: CUSOLVER_STATUS_INVALID_VALUE, 4: CUSOLVER_STATUS_ARCH_MISMATCH, 5: CUSOLVER_STATUS_MAPPING_ERROR, 6: CUSOLVER_STATUS_EXECUTION_FAILED, 7: CUSOLVER_STATUS_INTERNAL_ERROR, 8: CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED, 9: CUSOLVER_STATUS_NOT_SUPPORTED, 10: CUSOLVER_STATUS_ZERO_PIVOT, 11: CUSOLVER_STATUS_INVALID_LICENSE } def cusolverCheckStatus(status): """ Raise CUSOLVER exception. Raise an exception corresponding to the specified CUSOLVER error code. Parameters ---------- status : int CUSOLVER error code. See Also -------- CUSOLVER_EXCEPTIONS """ if status != 0: try: raise CUSOLVER_EXCEPTIONS[status] except KeyError: raise CUSOLVER_ERROR # Helper functions: _libcusolver.cusolverDnCreate.restype = int _libcusolver.cusolverDnCreate.argtypes = [ctypes.c_void_p] def cusolverDnCreate(): """ Create cuSolverDn context. Returns ------- handle : int cuSolverDn context. References ---------- `cusolverDnCreate `_ """ handle = ctypes.c_void_p() status = _libcusolver.cusolverDnCreate(ctypes.byref(handle)) cusolverCheckStatus(status) return handle.value _libcusolver.cusolverDnDestroy.restype = int _libcusolver.cusolverDnDestroy.argtypes = [ctypes.c_void_p] def cusolverDnDestroy(handle): """ Destroy cuSolverDn context. Parameters ---------- handle : int cuSolverDn context. 
References ---------- `cusolverDnDestroy `_ """ status = _libcusolver.cusolverDnDestroy(handle) cusolverCheckStatus(status) _libcusolver.cusolverDnSetStream.restype = int _libcusolver.cusolverDnSetStream.argtypes = [ctypes.c_int, ctypes.c_int] def cusolverDnSetStream(handle, stream): """ Set stream used by cuSolverDN library. Parameters ---------- handle : int cuSolverDN context. stream : int Stream to be used. References ---------- `cusolverDnSetStream `_ """ status = _libcusolver.cusolverDnSetStream(handle, stream) cusolverCheckStatus(status) _libcusolver.cusolverDnGetStream.restype = int _libcusolver.cusolverDnGetStream.argtypes = [ctypes.c_int, ctypes.c_void_p] def cusolverDnGetStream(handle): """ Get stream used by cuSolverDN library. Parameters ---------- handle : int cuSolverDN context. Returns ------- stream : int Stream used by context. References ---------- `cusolverDnGetStream `_ """ stream = ctypes.c_int() status = _libcusolver.cusolverDnGetStream(handle, ctypes.byref(stream)) cusolverCheckStatus(status) return status.value # Dense solver functions: # SGETRF, DGETRF, CGETRF, ZGETRF _libcusolver.cusolverDnSgetrf_bufferSize.restype = int _libcusolver.cusolverDnSgetrf_bufferSize.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cusolverDnSgetrf_bufferSize(handle, m, n, A, lda): """ Calculate size of work buffer used by cusolverDnSgetrf. References ---------- `cusolvernSgetrf `_ """ Lwork = ctypes.c_int() status = _libcusolver.cusolverDnSgetrf_bufferSize(handle, m, n, int(A), n, ctypes.byref(Lwork)) cusolverCheckStatus(status) return Lwork.value _libcusolver.cusolverDnSgetrf.restype = int _libcusolver.cusolverDnSgetrf.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p] def cusolverDnSgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo): """ Compute LU factorization of a real single precision m x n matrix. References ---------- `cusolverDngetrf `_ """ status = _libcusolver.cusolverDnSgetrf(handle, m, n, int(A), lda, int(Workspace), int(devIpiv), int(devInfo)) cusolverCheckStatus(status) _libcusolver.cusolverDnDgetrf_bufferSize.restype = int _libcusolver.cusolverDnDgetrf_bufferSize.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cusolverDnDgetrf_bufferSize(handle, m, n, A, lda): """ Calculate size of work buffer used by cusolverDnDgetrf. References ---------- `cusolverDngetrf `_ """ Lwork = ctypes.c_int() status = _libcusolver.cusolverDnDgetrf_bufferSize(handle, m, n, int(A), n, ctypes.byref(Lwork)) cusolverCheckStatus(status) return Lwork.value _libcusolver.cusolverDnDgetrf.restype = int _libcusolver.cusolverDnDgetrf.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p] def cusolverDnDgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo): """ Compute LU factorization of a real double precision m x n matrix. 
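# A small sketch: create a cuSolverDN context and attach a PyCUDA stream to it
# before queueing any factorizations. Handle and stream names are illustrative.
import pycuda.autoinit
import pycuda.driver as drv
import skcuda.cusolver as solver

handle = solver.cusolverDnCreate()
stream = drv.Stream()
solver.cusolverDnSetStream(handle, stream.handle)
# ... cusolverDn* calls issued with this handle now run on `stream` ...
solver.cusolverDnDestroy(handle)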
References ---------- `cusolverDngetrf `_ """ status = _libcusolver.cusolverDnDgetrf(handle, m, n, int(A), lda, int(Workspace), int(devIpiv), int(devInfo)) cusolverCheckStatus(status) _libcusolver.cusolverDnCgetrf_bufferSize.restype = int _libcusolver.cusolverDnCgetrf_bufferSize.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cusolverDnCgetrf_bufferSize(handle, m, n, A, lda): """ Calculate size of work buffer used by cusolverDnCgetrf. References ---------- `cusolverDngetrf `_ """ Lwork = ctypes.c_int() status = _libcusolver.cusolverDnCgetrf_bufferSize(handle, m, n, int(A), n, ctypes.byref(Lwork)) cusolverCheckStatus(status) return Lwork.value _libcusolver.cusolverDnCgetrf.restype = int _libcusolver.cusolverDnCgetrf.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p] def cusolverDnCgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo): """ Compute LU factorization of a complex single precision m x n matrix. References ---------- `cusolverDngetrf `_ """ status = _libcusolver.cusolverDnCgetrf(handle, m, n, int(A), lda, int(Workspace), int(devIpiv), int(devInfo)) cusolverCheckStatus(status) _libcusolver.cusolverDnZgetrf_bufferSize.restype = int _libcusolver.cusolverDnZgetrf_bufferSize.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cusolverDnZgetrf_bufferSize(handle, m, n, A, lda): """ Calculate size of work buffer used by cusolverDnZgetrf. References ---------- `cusolverDngetrf `_ """ Lwork = ctypes.c_int() status = _libcusolver.cusolverDnZgetrf_bufferSize(handle, m, n, int(A), n, ctypes.byref(Lwork)) cusolverCheckStatus(status) return Lwork.value _libcusolver.cusolverDnZgetrf.restype = int _libcusolver.cusolverDnZgetrf.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p] def cusolverDnZgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo): """ Compute LU factorization of a complex double precision m x n matrix. References ---------- `cusolverDngetrf `_ """ status = _libcusolver.cusolverDnZgetrf(handle, m, n, int(A), lda, int(Workspace), int(devIpiv), int(devInfo)) cusolverCheckStatus(status) # SGETRS, DGETRS, CGETRS, ZGETRS _libcusolver.cusolverDnSgetrs.restype = int _libcusolver.cusolverDnSgetrs.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo): """ Solve real single precision linear system. References ---------- `cusolverDngetrs `_ """ status = _libcusolver.cusolverDnSgetrs(handle, trans, n, nrhs, int(A), lda, int(devIpiv), int(B), ldb, int(devInfo)) cusolverCheckStatus(status) _libcusolver.cusolverDnDgetrs.restype = int _libcusolver.cusolverDnDgetrs.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo): """ Solve real double precision linear system. 
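# Hedged sketch: solve A x = b in single precision with cusolverDnSgetrf and
# cusolverDnSgetrs; the 0 below stands for CUBLAS_OP_N (no transpose). Array
# names are illustrative; a must be stored column-major for cuSOLVER.
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.cusolver as solver

handle = solver.cusolverDnCreate()
n = 4
a = np.asarray(np.random.rand(n, n) + n*np.eye(n), np.float32, order='F')
b = np.asarray(np.random.rand(n), np.float32)
a_gpu = gpuarray.to_gpu(a)
b_gpu = gpuarray.to_gpu(b)
ipiv_gpu = gpuarray.zeros(n, np.int32)
info_gpu = gpuarray.zeros(1, np.int32)
lwork = solver.cusolverDnSgetrf_bufferSize(handle, n, n, a_gpu.gpudata, n)
work_gpu = gpuarray.zeros(max(lwork, 1), np.float32)
solver.cusolverDnSgetrf(handle, n, n, a_gpu.gpudata, n,
                        work_gpu.gpudata, ipiv_gpu.gpudata, info_gpu.gpudata)
solver.cusolverDnSgetrs(handle, 0, n, 1, a_gpu.gpudata, n,
                        ipiv_gpu.gpudata, b_gpu.gpudata, n, info_gpu.gpudata)
print(np.allclose(np.dot(a, b_gpu.get()), b, atol=1e-4))
solver.cusolverDnDestroy(handle)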
References ---------- `cusolverDngetrs `_ """ status = _libcusolver.cusolverDnDgetrs(handle, trans, n, nrhs, int(A), lda, int(devIpiv), int(B), ldb, int(devInfo)) cusolverCheckStatus(status) _libcusolver.cusolverDnCgetrs.restype = int _libcusolver.cusolverDnCgetrs.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cusolverDnCgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo): """ Solve complex single precision linear system. References ---------- `cusolverDngetrs `_ """ status = _libcusolver.cusolverDnCgetrs(handle, trans, n, nrhs, int(A), lda, int(devIpiv), int(B), ldb, int(devInfo)) cusolverCheckStatus(status) _libcusolver.cusolverDnZgetrs.restype = int _libcusolver.cusolverDnZgetrs.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cusolverDnZgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo): """ Solve complex double precision linear system. References ---------- `cusolverDngetrs `_ """ status = _libcusolver.cusolverDnZgetrs(handle, trans, n, nrhs, int(A), lda, int(devIpiv), int(B), ldb, int(devInfo)) cusolverCheckStatus(status) # SGESVD, DGESVD, CGESVD, ZGESVD _libcusolver.cusolverDnSgesvd_bufferSize.restype = int _libcusolver.cusolverDnSgesvd_bufferSize.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p] def cusolverDnSgesvd_bufferSize(handle, m, n): """ Calculate size of work buffer used by cusolverDnSgesvd. References ---------- `cusolverDngesvd `_ """ Lwork = ctypes.c_int() status = _libcusolver.cusolverDnSgesvd_bufferSize(handle, m, n, ctypes.byref(Lwork)) cusolverCheckStatus(status) return Lwork.value _libcusolver.cusolverDnSgesvd.restype = int _libcusolver.cusolverDnSgesvd.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p] def cusolverDnSgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, Work, Lwork, rwork, devInfo): """ Compute real single precision singular value decomposition. References ---------- `cusolverDngesvd `_ """ status = _libcusolver.cusolverDnSgesvd(handle, jobu, jobvt, m, n, int(A), lda, int(S), int(U), ldu, int(VT), ldvt, int(Work), Lwork, int(rwork), int(devInfo)) cusolverCheckStatus(status) _libcusolver.cusolverDnDgesvd_bufferSize.restype = int _libcusolver.cusolverDnDgesvd_bufferSize.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p] def cusolverDnDgesvd_bufferSize(handle, m, n): """ Calculate size of work buffer used by cusolverDnDgesvd. References ---------- `cusolverDngesvd `_ """ Lwork = ctypes.c_int() status = _libcusolver.cusolverDnDgesvd_bufferSize(handle, m, n, ctypes.byref(Lwork)) cusolverCheckStatus(status) return Lwork.value _libcusolver.cusolverDnDgesvd.restype = int _libcusolver.cusolverDnDgesvd.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p] def cusolverDnDgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, Work, Lwork, rwork, devInfo): """ Compute real double precision singular value decomposition. 
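# Rough sketch of a single precision SVD with cusolverDnSgesvd (gesvd supports
# m >= n). The raw wrapper does not encode strings, so the job codes are passed
# as single bytes; buffer and array names are illustrative.
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.cusolver as solver

handle = solver.cusolverDnCreate()
m, n = 6, 4
a = np.asarray(np.random.rand(m, n), np.float32, order='F')
a_gpu = gpuarray.to_gpu(a)
s_gpu = gpuarray.empty(n, np.float32)
u_gpu = gpuarray.empty(m*m, np.float32)
vt_gpu = gpuarray.empty(n*n, np.float32)
info_gpu = gpuarray.zeros(1, np.int32)
lwork = solver.cusolverDnSgesvd_bufferSize(handle, m, n)
work_gpu = gpuarray.zeros(max(lwork, 1), np.float32)
rwork_gpu = gpuarray.zeros(max(n - 1, 1), np.float32)
solver.cusolverDnSgesvd(handle, b'A', b'A', m, n, a_gpu.gpudata, m,
                        s_gpu.gpudata, u_gpu.gpudata, m, vt_gpu.gpudata, n,
                        work_gpu.gpudata, lwork, rwork_gpu.gpudata,
                        info_gpu.gpudata)
print(np.allclose(s_gpu.get(), np.linalg.svd(a, compute_uv=False), atol=1e-4))
solver.cusolverDnDestroy(handle)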
References ---------- `cusolverDngesvd `_ """ status = _libcusolver.cusolverDnDgesvd(handle, jobu, jobvt, m, n, int(A), lda, int(S), int(U), ldu, int(VT), ldvt, int(Work), Lwork, int(rwork), int(devInfo)) cusolverCheckStatus(status) _libcusolver.cusolverDnCgesvd_bufferSize.restype = int _libcusolver.cusolverDnCgesvd_bufferSize.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p] def cusolverDnCgesvd_bufferSize(handle, m, n): """ Calculate size of work buffer used by cusolverDnCgesvd. References ---------- `cusolverDngesvd `_ """ Lwork = ctypes.c_int() status = _libcusolver.cusolverDnCgesvd_bufferSize(handle, m, n, ctypes.byref(Lwork)) cusolverCheckStatus(status) return Lwork.value _libcusolver.cusolverDnCgesvd.restype = int _libcusolver.cusolverDnCgesvd.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p] def cusolverDnCgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, Work, Lwork, rwork, devInfo): """ Compute complex single precision singular value decomposition. References ---------- `cusolverDngesvd `_ """ status = _libcusolver.cusolverDnCgesvd(handle, jobu, jobvt, m, n, int(A), lda, int(S), int(U), ldu, int(VT), ldvt, int(Work), Lwork, int(rwork), int(devInfo)) cusolverCheckStatus(status) _libcusolver.cusolverDnZgesvd_bufferSize.restype = int _libcusolver.cusolverDnZgesvd_bufferSize.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p] def cusolverDnZgesvd_bufferSize(handle, m, n): """ Calculate size of work buffer used by cusolverDnZgesvd. References ---------- `cusolverDngesvd `_ """ Lwork = ctypes.c_int() status = _libcusolver.cusolverDnZgesvd_bufferSize(handle, m, n, ctypes.byref(Lwork)) cusolverCheckStatus(status) return Lwork.value _libcusolver.cusolverDnZgesvd.restype = int _libcusolver.cusolverDnZgesvd.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p] def cusolverDnZgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, Work, Lwork, rwork, devInfo): """ Compute complex double precision singular value decomposition. References ---------- `cusolverDngesvd `_ """ status = _libcusolver.cusolverDnZgesvd(handle, jobu, jobvt, m, n, int(A), lda, int(S), int(U), ldu, int(VT), ldvt, int(Work), Lwork, int(rwork), int(devInfo)) cusolverCheckStatus(status) # SGEQRF, DGEQRF, CGEQRF, ZGEQRF _libcusolver.cusolverDnSgeqrf_bufferSize.restype = int _libcusolver.cusolverDnSgeqrf_bufferSize.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cusolverDnSgeqrf_bufferSize(handle, m, n, A, lda): """ Calculate size of work buffer used by cusolverDnSgeqrf. 
References ---------- `cusolverDngeqrf `_ """ Lwork = ctypes.c_int() status = _libcusolver.cusolverDnSgeqrf_bufferSize(handle, m, n, int(A), n, ctypes.byref(Lwork)) cusolverCheckStatus(status) return Lwork.value _libcusolver.cusolverDnSgeqrf.restype = int _libcusolver.cusolverDnSgeqrf.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cusolverDnSgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo): """ Compute QR factorization of a real single precision m x n matrix. References ---------- `cusolverDngeqrf `_ """ status = _libcusolver.cusolverDnSgeqrf(handle, m, n, int(A), lda, int(TAU), int(Workspace), Lwork, int(devInfo)) cusolverCheckStatus(status) _libcusolver.cusolverDnDgeqrf_bufferSize.restype = int _libcusolver.cusolverDnDgeqrf_bufferSize.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cusolverDnDgeqrf_bufferSize(handle, m, n, A, lda): """ Calculate size of work buffer used by cusolverDnDgeqrf. References ---------- `cusolverDngeqrf `_ """ Lwork = ctypes.c_int() status = _libcusolver.cusolverDnDgeqrf_bufferSize(handle, m, n, int(A), n, ctypes.byref(Lwork)) cusolverCheckStatus(status) return Lwork.value _libcusolver.cusolverDnDgeqrf.restype = int _libcusolver.cusolverDnDgeqrf.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cusolverDnDgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo): """ Compute QR factorization of a real double precision m x n matrix. References ---------- `cusolverDngeqrf `_ """ status = _libcusolver.cusolverDnDgeqrf(handle, m, n, int(A), lda, int(TAU), int(Workspace), Lwork, int(devInfo)) cusolverCheckStatus(status) _libcusolver.cusolverDnCgeqrf_bufferSize.restype = int _libcusolver.cusolverDnCgeqrf_bufferSize.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cusolverDnCgeqrf_bufferSize(handle, m, n, A, lda): """ Calculate size of work buffer used by cusolverDnCgeqrf. References ---------- `cusolverDngeqrf `_ """ Lwork = ctypes.c_int() status = _libcusolver.cusolverDnCgeqrf_bufferSize(handle, m, n, int(A), n, ctypes.byref(Lwork)) cusolverCheckStatus(status) return Lwork.value _libcusolver.cusolverDnCgeqrf.restype = int _libcusolver.cusolverDnCgeqrf.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cusolverDnCgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo): """ Compute QR factorization of a complex single precision m x n matrix. References ---------- `cusolverDngeqrf `_ """ status = _libcusolver.cusolverDnCgeqrf(handle, m, n, int(A), lda, int(TAU), int(Workspace), Lwork, int(devInfo)) cusolverCheckStatus(status) _libcusolver.cusolverDnZgeqrf_bufferSize.restype = int _libcusolver.cusolverDnZgeqrf_bufferSize.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cusolverDnZgeqrf_bufferSize(handle, m, n, A, lda): """ Calculate size of work buffer used by cusolverDnZgeqrf. 
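# Illustrative sketch: QR factorization with cusolverDnSgeqrf. On return a_gpu
# holds R in its upper triangle plus the Householder vectors below it, and
# tau_gpu holds the Householder scalars (LAPACK geqrf layout).
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.cusolver as solver

handle = solver.cusolverDnCreate()
m, n = 6, 4
a = np.asarray(np.random.rand(m, n), np.float32, order='F')
a_gpu = gpuarray.to_gpu(a)
tau_gpu = gpuarray.empty(n, np.float32)
info_gpu = gpuarray.zeros(1, np.int32)
lwork = solver.cusolverDnSgeqrf_bufferSize(handle, m, n, a_gpu.gpudata, m)
work_gpu = gpuarray.zeros(max(lwork, 1), np.float32)
solver.cusolverDnSgeqrf(handle, m, n, a_gpu.gpudata, m, tau_gpu.gpudata,
                        work_gpu.gpudata, lwork, info_gpu.gpudata)
print(info_gpu.get())                          # [0] on success
solver.cusolverDnDestroy(handle)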
References ---------- `cusolverDngeqrf `_ """ Lwork = ctypes.c_int() status = _libcusolver.cusolverDnZgeqrf_bufferSize(handle, m, n, int(A), n, ctypes.byref(Lwork)) cusolverCheckStatus(status) return Lwork.value _libcusolver.cusolverDnZgeqrf.restype = int _libcusolver.cusolverDnZgeqrf.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def cusolverDnZgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo): """ Compute QR factorization of a complex double precision m x n matrix. References ---------- `cusolverDngeqrf `_ """ status = _libcusolver.cusolverDnZgeqrf(handle, m, n, int(A), lda, int(TAU), int(Workspace), Lwork, int(devInfo)) cusolverCheckStatus(status) scikit-cuda-0.5.1/skcuda/cusparse.py000066400000000000000000000246621261465507300174110ustar00rootroot00000000000000#!/usr/bin/env python """ Python interface to CUSPARSE functions. Note: this module does not explicitly depend on PyCUDA. """ import atexit import ctypes.util import platform from string import Template import sys import warnings import numpy as np import cuda # Load library: _version_list = [7.5, 7.0, 6.5, 6.0, 5.5, 5.0, 4.0] if 'linux' in sys.platform: _libcusparse_libname_list = ['libcusparse.so'] + \ ['libcusparse.so.%s' % v for v in _version_list] elif sys.platform == 'darwin': _libcusparse_libname_list = ['libcusparse.dylib'] elif sys.platform == 'win32': if platform.machine().endswith('64'): _libcusparse_libname_list = ['cusparse.dll'] + \ ['cusparse64_%s.dll' % int(10*v) for v in _version_list] else: _libcusparse_libname_list = ['cusparse.dll'] + \ ['cusparse32_%s.dll' % int(10*v) for v in _version_list] else: raise RuntimeError('unsupported platform') # Print understandable error message when library cannot be found: _libcusparse = None for _libcusparse_libname in _libcusparse_libname_list: try: if sys.platform == 'win32': _libcusparse = ctypes.windll.LoadLibrary(_libcusparse_libname) else: _libcusparse = ctypes.cdll.LoadLibrary(_libcusparse_libname) except OSError: pass else: break if _libcusparse == None: OSError('CUDA sparse library not found') class cusparseError(Exception): """CUSPARSE error""" pass class cusparseStatusNotInitialized(cusparseError): """CUSPARSE library not initialized""" pass class cusparseStatusAllocFailed(cusparseError): """CUSPARSE resource allocation failed""" pass class cusparseStatusInvalidValue(cusparseError): """Unsupported value passed to the function""" pass class cusparseStatusArchMismatch(cusparseError): """Function requires a feature absent from the device architecture""" pass class cusparseStatusMappingError(cusparseError): """An access to GPU memory space failed""" pass class cusparseStatusExecutionFailed(cusparseError): """GPU program failed to execute""" pass class cusparseStatusInternalError(cusparseError): """An internal CUSPARSE operation failed""" pass class cusparseStatusMatrixTypeNotSupported(cusparseError): """The matrix type is not supported by this function""" pass cusparseExceptions = { 1: cusparseStatusNotInitialized, 2: cusparseStatusAllocFailed, 3: cusparseStatusInvalidValue, 4: cusparseStatusArchMismatch, 5: cusparseStatusMappingError, 6: cusparseStatusExecutionFailed, 7: cusparseStatusInternalError, 8: cusparseStatusMatrixTypeNotSupported, } # Matrix types: CUSPARSE_MATRIX_TYPE_GENERAL = 0 CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1 CUSPARSE_MATRIX_TYPE_HERMITIAN = 2 CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3 CUSPARSE_FILL_MODE_LOWER = 0 CUSPARSE_FILL_MODE_UPPER = 1 # 
Whether or not a matrix' diagonal entries are unity: CUSPARSE_DIAG_TYPE_NON_UNIT = 0 CUSPARSE_DIAG_TYPE_UNIT = 1 # Matrix index bases: CUSPARSE_INDEX_BASE_ZERO = 0 CUSPARSE_INDEX_BASE_ONE = 1 # Operation types: CUSPARSE_OPERATION_NON_TRANSPOSE = 0 CUSPARSE_OPERATION_TRANSPOSE = 1 CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2 # Whether or not to parse elements of a dense matrix row or column-wise. CUSPARSE_DIRECTION_ROW = 0 CUSPARSE_DIRECTION_COLUMN = 1 # Helper functions: class cusparseMatDescr(ctypes.Structure): _fields_ = [ ('MatrixType', ctypes.c_int), ('FillMode', ctypes.c_int), ('DiagType', ctypes.c_int), ('IndexBase', ctypes.c_int) ] def cusparseCheckStatus(status): """ Raise CUSPARSE exception Raise an exception corresponding to the specified CUSPARSE error code. Parameters ---------- status : int CUSPARSE error code. See Also -------- cusparseExceptions """ if status != 0: try: raise cusparseExceptions[status] except KeyError: raise cusparseError _libcusparse.cusparseCreate.restype = int _libcusparse.cusparseCreate.argtypes = [ctypes.c_void_p] def cusparseCreate(): """ Initialize CUSPARSE. Initializes CUSPARSE and creates a handle to a structure holding the CUSPARSE library context. Returns ------- handle : int CUSPARSE library context. """ handle = ctypes.c_int() status = _libcusparse.cusparseCreate(ctypes.byref(handle)) cusparseCheckStatus(status) return handle.value _libcusparse.cusparseDestroy.restype = int _libcusparse.cusparseDestroy.argtypes = [ctypes.c_int] def cusparseDestroy(handle): """ Release CUSPARSE resources. Releases hardware resources used by CUSPARSE Parameters ---------- handle : int CUSPARSE library context. """ status = _libcusparse.cusparseDestroy(handle) cusparseCheckStatus(status) _libcusparse.cusparseGetVersion.restype = int _libcusparse.cusparseGetVersion.argtypes = [ctypes.c_int, ctypes.c_void_p] def cusparseGetVersion(handle): """ Return CUSPARSE library version. Returns the version number of the CUSPARSE library. Parameters ---------- handle : int CUSPARSE library context. Returns ------- version : int CUSPARSE library version number. """ version = ctypes.c_int() status = _libcusparse.cusparseGetVersion(handle, ctypes.byref(version)) cusparseCheckStatus(status) return version.value _libcusparse.cusparseSetStream.restype = int _libcusparse.cusparseSetStream.argtypes = [ctypes.c_int, ctypes.c_int] def cusparseSetStream(handle, id): """ Sets the CUSPARSE stream in which kernels will run. Parameters ---------- handle : int CUSPARSE library context. id : int Stream ID. """ status = _libcusparse.cusparseSetStream(handle, id) cusparseCheckStatus(status) _libcusparse.cusparseCreateMatDescr.restype = int _libcusparse.cusparseCreateMatDescr.argtypes = [cusparseMatDescr] def cusparseCreateMatDescr(): """ Initialize a sparse matrix descriptor. Initializes the `MatrixType` and `IndexBase` fields of the matrix descriptor to the default values `CUSPARSE_MATRIX_TYPE_GENERAL` and `CUSPARSE_INDEX_BASE_ZERO`. Returns ------- desc : cusparseMatDescr Matrix descriptor. """ desc = cusparseMatrixDesc() status = _libcusparse.cusparseCreateMatDescr(ctypes.byref(desc)) cusparseCheckStatus(status) return desc _libcusparse.cusparseDestroyMatDescr.restype = int _libcusparse.cusparseDestroyMatDescr.argtypes = [ctypes.c_int] def cusparseDestroyMatDescr(desc): """ Releases the memory allocated for the matrix descriptor. Parameters ---------- desc : cusparseMatDescr Matrix descriptor. 
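# Minimal sketch of the CUSPARSE helper wrappers defined in this module: create
# a library context, query its version number, then release the context.
import skcuda.cusparse as cusparse

handle = cusparse.cusparseCreate()
print(cusparse.cusparseGetVersion(handle))     # version reported as an integer
cusparse.cusparseDestroy(handle)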
""" status = _libcusparse.cusparseDestroyMatDescr(desc) cusparseCheckStatus(status) _libcusparse.cusparseSetMatType.restype = int _libcusparse.cusparseSetMatType.argtypes = [cusparseMatDescr, ctypes.c_int] def cusparseSetMatType(desc, type): """ Sets the matrix type of the specified matrix. Parameters ---------- desc : cusparseMatDescr Matrix descriptor. type : int Matrix type. """ status = _libcusparse.cusparseSetMatType(desc, type) cusparseCheckStatus(status) _libcusparse.cusparseGetMatType.restype = int _libcusparse.cusparseGetMatType.argtypes = [cusparseMatDescr] def cusparseGetMatType(desc): """ Gets the matrix type of the specified matrix. Parameters ---------- desc : cusparseMatDescr Matrix descriptor. Returns ------- type : int Matrix type. """ return _libcusparse.cusparseGetMatType(desc) # Format conversion functions: _libcusparse.cusparseSnnz.restype = int _libcusparse.cusparseSnnz.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, cusparseMatDescr, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p] def cusparseSnnz(handle, dirA, m, n, descrA, A, lda, nnzPerRowColumn, nnzTotalDevHostPtr): """ Compute number of non-zero elements per row, column, or dense matrix. Parameters ---------- handle : int CUSPARSE library context. dirA : int Data direction of elements. m : int Rows in A. n : int Columns in A. descrA : cusparseMatDescr Matrix descriptor. A : pycuda.gpuarray.GPUArray Dense matrix of dimensions (lda, n). lda : int Leading dimension of A. Returns ------- nnzPerRowColumn : pycuda.gpuarray.GPUArray Array of length m or n containing the number of non-zero elements per row or column, respectively. nnzTotalDevHostPtr : pycuda.gpuarray.GPUArray Total number of non-zero elements in device or host memory. """ # Unfinished: nnzPerRowColumn = gpuarray.empty() nnzTotalDevHostPtr = gpuarray.empty() status = _libcusparse.cusparseSnnz(handle, dirA, m, n, descrA, int(A), lda, int(nnzPerRowColumn), int(nnzTotalDevHostPtr)) cusparseCheckStatus(status) return nnzPerVector, nnzHost _libcusparse.cusparseSdense2csr.restype = int _libcusparse.cusparseSdense2csr.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, cusparseMatDescr, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p] def cusparseSdense2csr(handle, m, n, descrA, A, lda, nnzPerRow, csrValA, csrRowPtrA, csrColIndA): # Unfinished pass scikit-cuda-0.5.1/skcuda/fft.py000066400000000000000000000231261261465507300163350ustar00rootroot00000000000000#!/usr/bin/env python """ PyCUDA-based FFT functions. """ import pycuda.driver as drv import pycuda.gpuarray as gpuarray import pycuda.elementwise as el import pycuda.tools as tools import numpy as np from . import cufft from .cufft import CUFFT_COMPATIBILITY_NATIVE, \ CUFFT_COMPATIBILITY_FFTW_PADDING, \ CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC, \ CUFFT_COMPATIBILITY_FFTW_ALL from . import misc class Plan: """ CUFFT plan class. This class represents an FFT plan for CUFFT. Parameters ---------- shape : tuple of ints Transform shape. May contain more than 3 elements. in_dtype : { numpy.float32, numpy.float64, numpy.complex64, numpy.complex128 } Type of input data. out_dtype : { numpy.float32, numpy.float64, numpy.complex64, numpy.complex128 } Type of output data. batch : int Number of FFTs to configure in parallel (default is 1). stream : pycuda.driver.Stream Stream with which to associate the plan. If no stream is specified, the default stream is used. mode : int FFTW compatibility mode. 
inembed : numpy.array with dtype=numpy.int32 number of elements in each dimension of the input array istride : int distance between two successive input elements in the least significant (innermost) dimension idist : int distance between the first element of two consective batches in the input data onembed : numpy.array with dtype=numpy.int32 number of elements in each dimension of the output array ostride : int distance between two successive output elements in the least significant (innermost) dimension odist : int distance between the first element of two consective batches in the output data """ def __init__(self, shape, in_dtype, out_dtype, batch=1, stream=None, mode=0x01, inembed=None, istride=1, idist=0, onembed=None, ostride=1, odist=0): if np.isscalar(shape): self.shape = (shape, ) else: self.shape = shape self.in_dtype = in_dtype self.out_dtype = out_dtype if batch <= 0: raise ValueError('batch size must be greater than 0') self.batch = batch # Determine type of transformation: if in_dtype == np.float32 and out_dtype == np.complex64: self.fft_type = cufft.CUFFT_R2C self.fft_func = cufft.cufftExecR2C elif in_dtype == np.complex64 and out_dtype == np.float32: self.fft_type = cufft.CUFFT_C2R self.fft_func = cufft.cufftExecC2R elif in_dtype == np.complex64 and out_dtype == np.complex64: self.fft_type = cufft.CUFFT_C2C self.fft_func = cufft.cufftExecC2C elif in_dtype == np.float64 and out_dtype == np.complex128: self.fft_type = cufft.CUFFT_D2Z self.fft_func = cufft.cufftExecD2Z elif in_dtype == np.complex128 and out_dtype == np.float64: self.fft_type = cufft.CUFFT_Z2D self.fft_func = cufft.cufftExecZ2D elif in_dtype == np.complex128 and out_dtype == np.complex128: self.fft_type = cufft.CUFFT_Z2Z self.fft_func = cufft.cufftExecZ2Z else: raise ValueError('unsupported input/output type combination') # Check for double precision support: capability = misc.get_compute_capability(misc.get_current_device()) if capability < 1.3 and \ (misc.isdoubletype(in_dtype) or misc.isdoubletype(out_dtype)): raise RuntimeError('double precision requires compute capability ' '>= 1.3 (you have %g)' % capability) if inembed is not None: inembed = inembed.ctypes.data if onembed is not None: onembed = onembed.ctypes.data # Set up plan: if len(self.shape) > 0: n = np.asarray(self.shape, np.int32) self.handle = cufft.cufftPlanMany( len(self.shape), n.ctypes.data, inembed, istride, idist, onembed, ostride, odist, self.fft_type, self.batch) else: raise ValueError('invalid transform size') # Set FFTW compatibility mode: cufft.cufftSetCompatibilityMode(self.handle, mode) # Associate stream with plan: if stream != None: cufft.cufftSetStream(self.handle, stream.handle) def __del__(self): # Don't complain if handle destruction fails because the plan # may have already been cleaned up: try: cufft.cufftDestroy(self.handle) except: pass def _scale_inplace(a, x_gpu): """ Scale an array by a specified value in-place. """ # Cache the kernel to avoid invoking the compiler if the # specified scale factor and array type have already been encountered: try: func = _scale_inplace.cache[(a, x_gpu.dtype)] except KeyError: ctype = tools.dtype_to_ctype(x_gpu.dtype) func = el.ElementwiseKernel( "{ctype} a, {ctype} *x".format(ctype=ctype), "x[i] /= a") _scale_inplace.cache[(a, x_gpu.dtype)] = func func(x_gpu.dtype.type(a), x_gpu) _scale_inplace.cache = {} def _fft(x_gpu, y_gpu, plan, direction, scale=None): """ Fast Fourier Transform. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Input array. 
y_gpu : pycuda.gpuarray.GPUArray Output array. plan : Plan FFT plan. direction : { cufft.CUFFT_FORWARD, cufft.CUFFT_INVERSE } Transform direction. Only affects in-place transforms. Optional Parameters ------------------- scale : int or float Scale the values in the output array by dividing them by this value. Notes ----- This function should not be called directly. """ if (x_gpu.gpudata == y_gpu.gpudata) and \ plan.fft_type not in [cufft.CUFFT_C2C, cufft.CUFFT_Z2Z]: raise ValueError('can only compute in-place transform of complex data') if direction == cufft.CUFFT_FORWARD and \ plan.in_dtype in np.sctypes['complex'] and \ plan.out_dtype in np.sctypes['float']: raise ValueError('cannot compute forward complex -> real transform') if direction == cufft.CUFFT_INVERSE and \ plan.in_dtype in np.sctypes['float'] and \ plan.out_dtype in np.sctypes['complex']: raise ValueError('cannot compute inverse real -> complex transform') if plan.fft_type in [cufft.CUFFT_C2C, cufft.CUFFT_Z2Z]: plan.fft_func(plan.handle, int(x_gpu.gpudata), int(y_gpu.gpudata), direction) else: plan.fft_func(plan.handle, int(x_gpu.gpudata), int(y_gpu.gpudata)) # Scale the result by dividing it by the number of elements: if scale != None: _scale_inplace(scale, y_gpu) def fft(x_gpu, y_gpu, plan, scale=False): """ Fast Fourier Transform. Compute the FFT of some data in device memory using the specified plan. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Input array. y_gpu : pycuda.gpuarray.GPUArray FFT of input array. plan : Plan FFT plan. scale : bool, optional If True, scale the computed FFT by the number of elements in the input array. Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray as gpuarray >>> import numpy as np >>> N = 128 >>> x = np.asarray(np.random.rand(N), np.float32) >>> xf = np.fft.fft(x) >>> x_gpu = gpuarray.to_gpu(x) >>> xf_gpu = gpuarray.empty(N/2+1, np.complex64) >>> plan = Plan(x.shape, np.float32, np.complex64) >>> fft(x_gpu, xf_gpu, plan) >>> np.allclose(xf[0:N/2+1], xf_gpu.get(), atol=1e-6) True Returns ------- y_gpu : pycuda.gpuarray.GPUArray Computed FFT. Notes ----- For real to complex transformations, this function computes N/2+1 non-redundant coefficients of a length-N input signal. """ if scale == True: return _fft(x_gpu, y_gpu, plan, cufft.CUFFT_FORWARD, x_gpu.size/plan.batch) else: return _fft(x_gpu, y_gpu, plan, cufft.CUFFT_FORWARD) def ifft(x_gpu, y_gpu, plan, scale=False): """ Inverse Fast Fourier Transform. Compute the inverse FFT of some data in device memory using the specified plan. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Input array. y_gpu : pycuda.gpuarray.GPUArray Inverse FFT of input array. plan : Plan FFT plan. scale : bool, optional If True, scale the computed inverse FFT by the number of elements in the output array. Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray as gpuarray >>> import numpy as np >>> N = 128 >>> x = np.asarray(np.random.rand(N), np.float32) >>> xf = np.asarray(np.fft.fft(x), np.complex64) >>> xf_gpu = gpuarray.to_gpu(xf[0:N/2+1]) >>> x_gpu = gpuarray.empty(N, np.float32) >>> plan = Plan(N, np.complex64, np.float32) >>> ifft(xf_gpu, x_gpu, plan, True) >>> np.allclose(x, x_gpu.get(), atol=1e-6) True Notes ----- For complex to real transformations, this function assumes the input contains N/2+1 non-redundant FFT coefficents of a signal of length N. 
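A batch of 1D inverse transforms can be computed with a single plan by
passing ``batch=n`` when constructing `Plan`. The following minimal
sketch (assuming the ``batch`` signals are stored contiguously as the
rows of a 2D array, as in the package's batched FFT demos) illustrates
this usage::

    batch = 4
    x = np.asarray(np.random.rand(batch, N), np.float32)
    xf = np.asarray(np.fft.fft(x)[:, 0:N/2+1], np.complex64)
    xf_gpu = gpuarray.to_gpu(xf)
    x_gpu = gpuarray.empty((batch, N), np.float32)
    plan = Plan(N, np.complex64, np.float32, batch=batch)
    ifft(xf_gpu, x_gpu, plan, True)
    np.allclose(x, x_gpu.get(), atol=1e-6)    # -> True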
""" if scale == True: return _fft(x_gpu, y_gpu, plan, cufft.CUFFT_INVERSE, y_gpu.size/plan.batch) else: return _fft(x_gpu, y_gpu, plan, cufft.CUFFT_INVERSE) if __name__ == "__main__": import doctest doctest.testmod() scikit-cuda-0.5.1/skcuda/include/000077500000000000000000000000001261465507300166235ustar00rootroot00000000000000scikit-cuda-0.5.1/skcuda/include/cuConstants.h000066400000000000000000000007771261465507300213130ustar00rootroot00000000000000// Various mathematical/computational constants #if !defined(CU_CONSTANTS_H_) #define CU_CONSTANTS_H_ #define PI 3.14159265358979323846 // pi #define PIO2 1.57079632679489661923 // pi/2 #define EUL 0.577215664901532860606512090082402 // Euler's constant // maximum single and double precision floating point numbers #ifndef FLT_MAX #define FLT_MAX 3.402823466E+38F #endif #ifndef DBL_MAX #define DBL_MAX 1.7976931348623158e+308 #endif #endif /* !defined(CU_CONSTANTS_H_) */ scikit-cuda-0.5.1/skcuda/include/cuSpecialFuncs.h000066400000000000000000000302211261465507300217010ustar00rootroot00000000000000// Special functions for CUDA // Some of these functions are adapted from the Cephes and specfun // libraries included in scipy: // http://www.netlib.org/cephes/ // http://www.netlib.org/specfun/ #include #include "cuConstants.h" #if !defined(CU_SPECIAL_FUNCS_H_) #define CU_SPECIAL_FUNCS_H_ #define CFLOAT pycuda::complex #define CDOUBLE pycuda::complex /* Sinc function. */ __device__ float sincf(float x) { if (x == 0.0) return 1.0; else return sinpif(x)/(PI*x); } __device__ double sinc(double x) { if (x == 0.0) return 1.0; else return sinpi(x)/(PI*x); } /* Polynomial evaluation. */ __device__ float polevlf(float x, float *coef, int N) { float ans; float *p; int i; p = coef; ans = *p++; i = N; do ans = ans*x + *p++; while (--i); return (ans); } __device__ float p1evlf(float x, float *coef, int N) { float ans; float *p; int i; p = coef; ans = x + *p++; i = N-1; do ans = ans*x + *p++; while (--i); return (ans); } __device__ double polevl(double x, double *coef, int N) { double ans; double *p; int i; p = coef; ans = *p++; i = N; do ans = ans*x + *p++; while (--i); return (ans); } __device__ double p1evl(double x, double *coef, int N) { double ans; double *p; int i; p = coef; ans = x + *p++; i = N-1; do ans = ans*x + *p++; while (--i); return (ans); } /* Constants used to compute the sine/cosine integrals. 
*/ __constant__ float SNf[] = { -8.39167827910303881427E-11, 4.62591714427012837309E-8, -9.75759303843632795789E-6, 9.76945438170435310816E-4, -4.13470316229406538752E-2, 1.00000000000000000302E0, }; __constant__ float SDf[] = { 2.03269266195951942049E-12, 1.27997891179943299903E-9, 4.41827842801218905784E-7, 9.96412122043875552487E-5, 1.42085239326149893930E-2, 9.99999999999999996984E-1, }; __constant__ float CNf[] = { 2.02524002389102268789E-11, -1.35249504915790756375E-8, 3.59325051419993077021E-6, -4.74007206873407909465E-4, 2.89159652607555242092E-2, -1.00000000000000000080E0, }; __constant__ float CDf[] = { 4.07746040061880559506E-12, 3.06780997581887812692E-9, 1.23210355685883423679E-6, 3.17442024775032769882E-4, 5.10028056236446052392E-2, 4.00000000000000000080E0, }; __constant__ float FN4f[] = { 4.23612862892216586994E0, 5.45937717161812843388E0, 1.62083287701538329132E0, 1.67006611831323023771E-1, 6.81020132472518137426E-3, 1.08936580650328664411E-4, 5.48900223421373614008E-7, }; __constant__ float FD4f[] = { /* 1.00000000000000000000E0,*/ 8.16496634205391016773E0, 7.30828822505564552187E0, 1.86792257950184183883E0, 1.78792052963149907262E-1, 7.01710668322789753610E-3, 1.10034357153915731354E-4, 5.48900252756255700982E-7, }; __constant__ float FN8f[] = { 4.55880873470465315206E-1, 7.13715274100146711374E-1, 1.60300158222319456320E-1, 1.16064229408124407915E-2, 3.49556442447859055605E-4, 4.86215430826454749482E-6, 3.20092790091004902806E-8, 9.41779576128512936592E-11, 9.70507110881952024631E-14, }; __constant__ float FD8f[] = { /* 1.00000000000000000000E0,*/ 9.17463611873684053703E-1, 1.78685545332074536321E-1, 1.22253594771971293032E-2, 3.58696481881851580297E-4, 4.92435064317881464393E-6, 3.21956939101046018377E-8, 9.43720590350276732376E-11, 9.70507110881952025725E-14, }; __constant__ float GN4f[] = { 8.71001698973114191777E-2, 6.11379109952219284151E-1, 3.97180296392337498885E-1, 7.48527737628469092119E-2, 5.38868681462177273157E-3, 1.61999794598934024525E-4, 1.97963874140963632189E-6, 7.82579040744090311069E-9, }; __constant__ float GD4f[] = { /* 1.00000000000000000000E0,*/ 1.64402202413355338886E0, 6.66296701268987968381E-1, 9.88771761277688796203E-2, 6.22396345441768420760E-3, 1.73221081474177119497E-4, 2.02659182086343991969E-6, 7.82579218933534490868E-9, }; __constant__ float GN8f[] = { 6.97359953443276214934E-1, 3.30410979305632063225E-1, 3.84878767649974295920E-2, 1.71718239052347903558E-3, 3.48941165502279436777E-5, 3.47131167084116673800E-7, 1.70404452782044526189E-9, 3.85945925430276600453E-12, 3.14040098946363334640E-15, }; __constant__ float GD8f[] = { /* 1.00000000000000000000E0,*/ 1.68548898811011640017E0, 4.87852258695304967486E-1, 4.67913194259625806320E-2, 1.90284426674399523638E-3, 3.68475504442561108162E-5, 3.57043223443740838771E-7, 1.72693748966316146736E-9, 3.87830166023954706752E-12, 3.14040098946363335242E-15, }; __constant__ double SN[] = { -8.39167827910303881427E-11, 4.62591714427012837309E-8, -9.75759303843632795789E-6, 9.76945438170435310816E-4, -4.13470316229406538752E-2, 1.00000000000000000302E0, }; __constant__ double SD[] = { 2.03269266195951942049E-12, 1.27997891179943299903E-9, 4.41827842801218905784E-7, 9.96412122043875552487E-5, 1.42085239326149893930E-2, 9.99999999999999996984E-1, }; __constant__ double CN[] = { 2.02524002389102268789E-11, -1.35249504915790756375E-8, 3.59325051419993077021E-6, -4.74007206873407909465E-4, 2.89159652607555242092E-2, -1.00000000000000000080E0, }; __constant__ double CD[] = { 4.07746040061880559506E-12, 
3.06780997581887812692E-9, 1.23210355685883423679E-6, 3.17442024775032769882E-4, 5.10028056236446052392E-2, 4.00000000000000000080E0, }; __constant__ double FN4[] = { 4.23612862892216586994E0, 5.45937717161812843388E0, 1.62083287701538329132E0, 1.67006611831323023771E-1, 6.81020132472518137426E-3, 1.08936580650328664411E-4, 5.48900223421373614008E-7, }; __constant__ double FD4[] = { /* 1.00000000000000000000E0,*/ 8.16496634205391016773E0, 7.30828822505564552187E0, 1.86792257950184183883E0, 1.78792052963149907262E-1, 7.01710668322789753610E-3, 1.10034357153915731354E-4, 5.48900252756255700982E-7, }; __constant__ double FN8[] = { 4.55880873470465315206E-1, 7.13715274100146711374E-1, 1.60300158222319456320E-1, 1.16064229408124407915E-2, 3.49556442447859055605E-4, 4.86215430826454749482E-6, 3.20092790091004902806E-8, 9.41779576128512936592E-11, 9.70507110881952024631E-14, }; __constant__ double FD8[] = { /* 1.00000000000000000000E0,*/ 9.17463611873684053703E-1, 1.78685545332074536321E-1, 1.22253594771971293032E-2, 3.58696481881851580297E-4, 4.92435064317881464393E-6, 3.21956939101046018377E-8, 9.43720590350276732376E-11, 9.70507110881952025725E-14, }; __constant__ double GN4[] = { 8.71001698973114191777E-2, 6.11379109952219284151E-1, 3.97180296392337498885E-1, 7.48527737628469092119E-2, 5.38868681462177273157E-3, 1.61999794598934024525E-4, 1.97963874140963632189E-6, 7.82579040744090311069E-9, }; __constant__ double GD4[] = { /* 1.00000000000000000000E0,*/ 1.64402202413355338886E0, 6.66296701268987968381E-1, 9.88771761277688796203E-2, 6.22396345441768420760E-3, 1.73221081474177119497E-4, 2.02659182086343991969E-6, 7.82579218933534490868E-9, }; __constant__ double GN8[] = { 6.97359953443276214934E-1, 3.30410979305632063225E-1, 3.84878767649974295920E-2, 1.71718239052347903558E-3, 3.48941165502279436777E-5, 3.47131167084116673800E-7, 1.70404452782044526189E-9, 3.85945925430276600453E-12, 3.14040098946363334640E-15, }; __constant__ double GD8[] = { /* 1.00000000000000000000E0,*/ 1.68548898811011640017E0, 4.87852258695304967486E-1, 4.67913194259625806320E-2, 1.90284426674399523638E-3, 3.68475504442561108162E-5, 3.57043223443740838771E-7, 1.72693748966316146736E-9, 3.87830166023954706752E-12, 3.14040098946363335242E-15, }; /* Sine/cosine integrals. 
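   sicif() and sici() write Si(x) and Ci(x) through the si and ci
   pointers; Si is odd (a negative argument flips its sign), and x == 0
   yields Si = 0 and Ci = -FLT_MAX (or -DBL_MAX).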
*/ __device__ void sicif(float x, float *si, float *ci) { float z, c, s, f, g; short sign; if (x < 0.0) { sign = -1; x = -x; } else sign = 0; if (x == 0.0) { *si = 0; *ci = -FLT_MAX; return; } if (x > 1.0e9) { *si = PIO2 - cosf(x)/x; *ci = sinf(x)/x; } if (x > 4.0) goto asympt; z = x*x; s = x*polevlf(z, SNf, 5)/polevlf(z, SDf, 5); c = z*polevlf(z, CNf, 5)/polevlf(z, CDf, 5); if (sign) s = -s; *si = s; *ci = EUL + logf(x) + c; return; asympt: s = sinf(x); c = cosf(x); z = 1.0/(x*x); if (x < 8.0) { f = polevlf(z, FN4f, 6)/(x*p1evlf(z, FD4f, 7)); g = z*polevlf(z, GN4f, 7)/p1evlf(z, GD4f, 7); } else { f = polevlf(z, FN8f, 8)/(x*p1evlf(z, FD8f, 8)); g = z*polevlf(z, GN8f, 8)/p1evlf(z, GD8f, 9); } *si = PIO2 - f*c - g*s; if (sign) *si = -(*si); *ci = f*s - g*c; return; } __device__ void sici(double x, double *si, double *ci) { double z, c, s, f, g; short sign; if (x < 0.0) { sign = -1; x = -x; } else sign = 0; if (x == 0.0) { *si = 0; *ci = -DBL_MAX; return; } if (x > 1.0e9) { *si = PIO2 - cos(x)/x; *ci = sin(x)/x; } if (x > 4.0) goto asympt; z = x*x; s = x*polevl(z, SN, 5)/polevl(z, SD, 5); c = z*polevl(z, CN, 5)/polevl(z, CD, 5); if (sign) s = -s; *si = s; *ci = EUL + log(x) + c; return; asympt: s = sin(x); c = cos(x); z = 1.0/(x*x); if (x < 8.0) { f = polevl(z, FN4, 6)/(x*p1evl(z, FD4, 7)); g = z*polevl(z, GN4, 7)/p1evl(z, GD4, 7); } else { f = polevl(z, FN8, 8)/(x*p1evl(z, FD8, 8)); g = z*polevl(z, GN8, 8)/p1evl(z, GD8, 9); } *si = PIO2 - f*c - g*s; if (sign) *si = -(*si); *ci = f*s - g*c; return; } /* exponential integrals */ __device__ CFLOAT exp1f(CFLOAT z) { float x = real(z); float a0 = abs(z); CFLOAT ce1, cr, ct0, kc, ct; if (a0 == 0.0) ce1 = CFLOAT(1.0e300, 0.0); else if ((a0 < 10.0) || (x < 0.0 && a0 < 20.0)) { ce1 = CFLOAT(1.0, 0.0); cr = CFLOAT(1.0, 0.0); for (int k = 1; k <= 150; k++) { cr = -(cr * float(k) * z)/CFLOAT((k + 1.0) * (k + 1.0), 0.0); ce1 = ce1 + cr; if (abs(cr) <= abs(ce1)*1.0e-15) break; } ce1 = CFLOAT(-EUL,0.0)-log(z)+(z*ce1); } else { ct0 = CFLOAT(0.0, 0.0); for (int k = 120; k >= 1; k--) { kc = CFLOAT(k, 0.0); ct0 = kc/(CFLOAT(1.0,0.0)+(kc/(z+ct0))); } ct = CFLOAT(1.0, 0.0)/(z+ct0); ce1 = exp(-z)*ct; if (x <= 0.0 && imag(z) == 0.0) ce1 = ce1-CFLOAT(0.0, -PI); } return ce1; } __device__ CFLOAT expif(CFLOAT z) { CFLOAT cei = exp1f(-z); cei = -cei+(log(z)-log(CFLOAT(1.0)/z))/CFLOAT(2.0)-log(-z); return cei; } __device__ CDOUBLE exp1(CDOUBLE z) { double x = real(z); double a0 = abs(z); CDOUBLE ce1, cr, ct0, kc, ct; if (a0 == 0.0) ce1 = CDOUBLE(1.0e300, 0.0); else if ((a0 < 10.0) || (x < 0.0 && a0 < 20.0)) { ce1 = CDOUBLE(1.0, 0.0); cr = CDOUBLE(1.0, 0.0); for (int k = 1; k <= 150; k++) { cr = -(cr * double(k) * z)/CDOUBLE((k + 1.0) * (k + 1.0), 0.0); ce1 = ce1 + cr; if (abs(cr) <= abs(ce1)*1.0e-15) break; } ce1 = CDOUBLE(-EUL,0.0)-log(z)+(z*ce1); } else { ct0 = CDOUBLE(0.0, 0.0); for (int k = 120; k >= 1; k--) { kc = CDOUBLE(k, 0.0); ct0 = kc/(CDOUBLE(1.0,0.0)+(kc/(z+ct0))); } ct = CDOUBLE(1.0, 0.0)/(z+ct0); ce1 = exp(-z)*ct; if (x <= 0.0 && imag(z) == 0.0) ce1 = ce1-CDOUBLE(0.0, -PI); } return ce1; } __device__ CDOUBLE expi(CDOUBLE z) { CDOUBLE cei = exp1(-z); cei = -cei+(log(z)-log(CDOUBLE(1.0)/z))/CDOUBLE(2.0)-log(-z); return cei; } #endif /* !defined(CU_SPECIAL_FUNCS_H_) */ scikit-cuda-0.5.1/skcuda/info.py000066400000000000000000000025731261465507300165140ustar00rootroot00000000000000#!/usr/bin/env python """ scikit-cuda =========== scikit-cuda provides Python interfaces to many of the functions in the CUDA device/runtime, CUBLAS, CUFFT, and CUSOLVER libraries 
distributed as part of NVIDIA's CUDA Programming Toolkit [1]_, as well as interfaces to select functions in the free and standard versions of the CULA Dense Toolkit [2]_. Both low-level wrapper functions similar to their C counterparts and high-level functions comparable to those in NumPy and Scipy [3]_ are provided. High-level modules ------------------ - autoinit Automatic GPU library initialization module. - fft Fast Fourier Transform functions. - integrate Numerical integration functions. - linalg Linear algebra functions. - rlinalg Randomized linear algebra functions. - misc Miscellaneous support functions. - special Special math functions. Low-level modules ----------------- - cublas Function wrappers for the CUBLAS library. - cufft Function wrappers for the CUFFT library. - cuda Function wrappers for the CUDA device/runtime libraries. - cula Function wrappers for the CULA library. - cusolver Function wrappers for the CUSOLVER library. - pcula Function wrappers for the multi-GPU CULA library. .. [1] http://www.nvidia.com/cuda .. [2] http://www.culatools.com/ .. [3] http://www.scipy.org/ """ scikit-cuda-0.5.1/skcuda/integrate.py000066400000000000000000000165561261465507300175470ustar00rootroot00000000000000#!/usr/bin/env python """ PyCUDA-based integration functions. """ from string import Template from pycuda.tools import context_dependent_memoize from pycuda.compiler import SourceModule import pycuda.elementwise as elementwise import pycuda.gpuarray as gpuarray import pycuda.tools as tools import numpy as np from . import cublas from . import misc from .misc import init def gen_trapz_mult(N, dtype): """ Generate multiplication array for 1D trapezoidal integration. Generates an array whose dot product with some array of equal length is equivalent to the definite integral of the latter computed using trapezoidal integration. Parameters ---------- N : int Length of array. dtype : float type Floating point type to use when generating the array. Returns ------- result : pycuda.gpuarray.GPUArray Generated array. """ if dtype not in [np.float32, np.float64, np.complex64, np.complex128]: raise ValueError('unrecognized type') ctype = tools.dtype_to_ctype(dtype) func = elementwise.ElementwiseKernel("{ctype} *x".format(ctype=ctype), "x[i] = ((i == 0) || (i == {M})) ? 0.5 : 1".format(M=N-1)) x_gpu = gpuarray.empty(N, dtype) func(x_gpu) return x_gpu def trapz(x_gpu, dx=1.0, handle=None): """ 1D trapezoidal integration. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Input array to integrate. dx : scalar Spacing. handle : int CUBLAS context. If no context is specified, the default handle from `skcuda.misc._global_cublas_handle` is used. Returns ------- result : float Definite integral as approximated by the trapezoidal rule.
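Notes
-----
The integral is evaluated as a single CUBLAS dot product between
`x_gpu` and the weight vector produced by `gen_trapz_mult`, which
holds 0.5 at both endpoints and 1 elsewhere, i.e.

    dx*(0.5*x[0] + x[1] + ... + x[N-2] + 0.5*x[N-1]),

the composite trapezoidal rule for uniformly spaced samples.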
Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray >>> import numpy as np >>> import integrate >>> integrate.init() >>> x = np.asarray(np.random.rand(10), np.float32) >>> x_gpu = gpuarray.to_gpu(x) >>> z = integrate.trapz(x_gpu) >>> np.allclose(np.trapz(x), z) True """ if handle is None: handle = misc._global_cublas_handle if len(x_gpu.shape) > 1: raise ValueError('input array must be 1D') if np.iscomplex(dx): raise ValueError('dx must be real') float_type = x_gpu.dtype.type if float_type == np.complex64: cublas_func = cublas.cublasCdotu elif float_type == np.float32: cublas_func = cublas.cublasSdot elif float_type == np.complex128: cublas_func = cublas.cublasZdotu elif float_type == np.float64: cublas_func = cublas.cublasDdot else: raise ValueError('unsupported input type') trapz_mult_gpu = gen_trapz_mult(x_gpu.size, float_type) result = cublas_func(handle, x_gpu.size, x_gpu.gpudata, 1, trapz_mult_gpu.gpudata, 1) return float_type(dx)*result @context_dependent_memoize def _get_trapz2d_mult_kernel(use_double, use_complex): template = Template(""" #include #if ${use_double} #if ${use_complex} #define TYPE pycuda::complex #else #define TYPE double #endif #else #if ${use_complex} #define TYPE pycuda::complex #else #define TYPE float #endif #endif // Ny: number of rows // Nx: number of columns __global__ void gen_trapz2d_mult(TYPE *mult, unsigned int Ny, unsigned int Nx) { unsigned int idx = blockIdx.y*blockDim.x*gridDim.x+ blockIdx.x*blockDim.x+threadIdx.x; if (idx < Nx*Ny) { if (idx == 0 || idx == Nx-1 || idx == Nx*(Ny-1) || idx == Nx*Ny-1) mult[idx] = TYPE(0.25); else if ((idx > 0 && idx < Nx-1) || (idx % Nx == 0) || (((idx + 1) % Nx) == 0) || (idx > Nx*(Ny-1) && idx < Nx*Ny-1)) mult[idx] = TYPE(0.5); else mult[idx] = TYPE(1.0); } } """) # Set this to False when debugging to make sure the compiled kernel is # not cached: tmpl = template.substitute(use_double=use_double, use_complex=use_complex) cache_dir=None mod = SourceModule(tmpl, cache_dir=cache_dir) return mod.get_function("gen_trapz2d_mult") def gen_trapz2d_mult(mat_shape, dtype): """ Generate multiplication matrix for 2D trapezoidal integration. Generates a matrix whose dot product with some other matrix of equal length (when flattened) is equivalent to the definite double integral of the latter computed using trapezoidal integration. Parameters ---------- mat_shape : tuple Shape of matrix. dtype : float type Floating point type to use when generating the array. Returns ------- result : pycuda.gpuarray.GPUArray Generated matrix. """ if dtype not in [np.float32, np.float64, np.complex64, np.complex128]: raise ValueError('unrecognized type') use_double = int(dtype in [np.float64, np.complex128]) use_complex = int(dtype in [np.complex64, np.complex128]) # Allocate output matrix: Ny, Nx = mat_shape mult_gpu = gpuarray.empty(mat_shape, dtype) # Get block/grid sizes: dev = misc.get_current_device() block_dim, grid_dim = misc.select_block_grid_sizes(dev, mat_shape) gen_trapz2d_mult = _get_trapz2d_mult_kernel(use_double, use_complex) gen_trapz2d_mult(mult_gpu, np.uint32(Ny), np.uint32(Nx), block=block_dim, grid=grid_dim) return mult_gpu def trapz2d(x_gpu, dx=1.0, dy=1.0, handle=None): """ 2D trapezoidal integration. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Input matrix to integrate. dx : float X-axis spacing. dy : float Y-axis spacing handle : int CUBLAS context. If no context is specified, the default handle from `skcuda.misc._global_cublas_handle` is used. 
Returns ------- result : float Definite double integral as approximated by the trapezoidal rule. Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray >>> import numpy as np >>> import integrate >>> integrate.init() >>> x = np.asarray(np.random.rand(10, 10), np.float32) >>> x_gpu = gpuarray.to_gpu(x) >>> z = integrate.trapz2d(x_gpu) >>> np.allclose(np.trapz(np.trapz(x)), z) True """ if handle is None: handle = misc._global_cublas_handle if len(x_gpu.shape) != 2: raise ValueError('input array must be 2D') if np.iscomplex(dx) or np.iscomplex(dy): raise ValueError('dx and dy must be real') float_type = x_gpu.dtype.type if float_type == np.complex64: cublas_func = cublas.cublasCdotu elif float_type == np.float32: cublas_func = cublas.cublasSdot elif float_type == np.complex128: cublas_func = cublas.cublasZdotu elif float_type == np.float64: cublas_func = cublas.cublasDdot else: raise ValueError('unsupported input type') trapz_mult_gpu = gen_trapz2d_mult(x_gpu.shape, float_type) result = cublas_func(handle, x_gpu.size, x_gpu.gpudata, 1, trapz_mult_gpu.gpudata, 1) return float_type(dx)*float_type(dy)*result if __name__ == "__main__": import doctest doctest.testmod() scikit-cuda-0.5.1/skcuda/linalg.py000066400000000000000000002504441261465507300170310ustar00rootroot00000000000000#!/usr/bin/env python """ PyCUDA-based linear algebra functions. """ from __future__ import absolute_import, division from pprint import pprint from string import Template from pycuda.tools import context_dependent_memoize from pycuda.compiler import SourceModule from pycuda.reduction import ReductionKernel import pycuda.gpuarray as gpuarray import pycuda.driver as drv import pycuda.elementwise as el import pycuda.tools as tools import numpy as np from . import cublas from . import misc import sys if sys.version_info < (3,): range = xrange class LinAlgError(Exception): """Linear Algebra Error.""" pass try: from . import cula _has_cula = True except (ImportError, OSError): _has_cula = False from .misc import init, add_matvec, div_matvec, mult_matvec # Get installation location of C headers: from . import install_headers def svd(a_gpu, jobu='A', jobvt='A'): """ Singular Value Decomposition. Factors the matrix `a` into two unitary matrices, `u` and `vh`, and a 1-dimensional array of real, non-negative singular values, `s`, such that `a == dot(u.T, dot(diag(s), vh.T))`. Parameters ---------- a : pycuda.gpuarray.GPUArray Input matrix of shape `(m, n)` to decompose. jobu : {'A', 'S', 'O', 'N'} If 'A', return the full `u` matrix with shape `(m, m)`. If 'S', return the `u` matrix with shape `(m, k)`. If 'O', return the `u` matrix with shape `(m, k) without allocating a new matrix. If 'N', don't return `u`. jobvt : {'A', 'S', 'O', 'N'} If 'A', return the full `vh` matrix with shape `(n, n)`. If 'S', return the `vh` matrix with shape `(k, n)`. If 'O', return the `vh` matrix with shape `(k, n) without allocating a new matrix. If 'N', don't return `vh`. Returns ------- u : pycuda.gpuarray.GPUArray Unitary matrix of shape `(m, m)` or `(m, k)` depending on value of `jobu`. s : pycuda.gpuarray.GPUArray Array containing the singular values, sorted such that `s[i] >= s[i+1]`. `s` is of length `min(m, n)`. vh : pycuda.gpuarray.GPUArray Unitary matrix of shape `(n, n)` or `(k, n)`, depending on `jobvt`. Notes ----- Double precision is only supported if the standard version of the CULA Dense toolkit is installed. This function destroys the contents of the input matrix regardless of the values of `jobu` and `jobvt`. 
Only one of `jobu` or `jobvt` may be set to `O`, and then only for a square matrix. Examples -------- >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import linalg >>> linalg.init() >>> a = np.random.randn(9, 6) + 1j*np.random.randn(9, 6) >>> a = np.asarray(a, np.complex64) >>> a_gpu = gpuarray.to_gpu(a) >>> u_gpu, s_gpu, vh_gpu = linalg.svd(a_gpu, 'S', 'S') >>> np.allclose(a, np.dot(u_gpu.get(), np.dot(np.diag(s_gpu.get()), vh_gpu.get())), 1e-4) True """ if not _has_cula: raise NotImplementedError('CULA not installed') alloc = misc._global_cublas_allocator # The free version of CULA only supports single precision floating # point numbers: data_type = a_gpu.dtype.type real_type = np.float32 if data_type == np.complex64: cula_func = cula.culaDeviceCgesvd elif data_type == np.float32: cula_func = cula.culaDeviceSgesvd else: if cula._libcula_toolkit == 'standard': if data_type == np.complex128: cula_func = cula.culaDeviceZgesvd elif data_type == np.float64: cula_func = cula.culaDeviceDgesvd else: raise ValueError('unsupported type') real_type = np.float64 else: raise ValueError('double precision not supported') # Since CUDA assumes that arrays are stored in column-major # format, the input matrix is assumed to be transposed: n, m = np.array(a_gpu.shape, int) # workaround for bug #131 square = (n == m) # Since the input matrix is transposed, jobu and jobvt must also # be switched because the computed matrices will be returned in # reversed order: jobvt, jobu = jobu, jobvt # Set the leading dimension of the input matrix: lda = max(1, m) # Allocate the array of singular values: s_gpu = gpuarray.empty(min(m, n), real_type, allocator=alloc) # Set the leading dimension and allocate u: jobu = jobu.upper() jobvt = jobvt.upper() ldu = m if jobu == 'A': u_gpu = gpuarray.empty((ldu, m), data_type, allocator=alloc) elif jobu == 'S': u_gpu = gpuarray.empty((min(m, n), ldu), data_type, allocator=alloc) elif jobu == 'O': if not square: raise ValueError('in-place computation of singular vectors '+ 'of non-square matrix not allowed') ldu = 1 u_gpu = a_gpu else: ldu = 1 u_gpu = gpuarray.empty((), data_type, allocator=alloc) # Set the leading dimension and allocate vh: if jobvt == 'A': ldvt = n vh_gpu = gpuarray.empty((n, n), data_type, allocator=alloc) elif jobvt == 'S': ldvt = min(m, n) vh_gpu = gpuarray.empty((n, ldvt), data_type, allocator=alloc) elif jobvt == 'O': if jobu == 'O': raise ValueError('jobu and jobvt cannot both be O') if not square: raise ValueError('in-place computation of singular vectors '+ 'of non-square matrix not allowed') ldvt = 1 vh_gpu = a_gpu else: ldvt = 1 vh_gpu = gpuarray.empty((), data_type, allocator=alloc) # Compute SVD and check error status: cula_func(jobu, jobvt, m, n, int(a_gpu.gpudata), lda, int(s_gpu.gpudata), int(u_gpu.gpudata), ldu, int(vh_gpu.gpudata), ldvt) # Free internal CULA memory: cula.culaFreeBuffers() # Since the input is assumed to be transposed, it is necessary to # return the computed matrices in reverse order: if jobu in ['A', 'S', 'O'] and jobvt in ['A', 'S', 'O']: return vh_gpu, s_gpu, u_gpu elif jobu == 'N' and jobvt != 'N': return vh_gpu, s_gpu elif jobu != 'N' and jobvt == 'N': return s_gpu, u_gpu else: return s_gpu def cho_factor(a_gpu, uplo='L'): """ Cholesky factorisation Performs an in-place cholesky factorisation on the matrix `a` such that `a = x*x.T` or `x.T*x`, if the lower='L' or upper='U' triangle of `a` is used, respectively. 
Parameters ---------- a : pycuda.gpuarray.GPUArray Input matrix of shape `(m, m)` to decompose. uplo: use the upper='U' or lower='L' (default) triangle of 'a' Returns ------- a: pycuda.gpuarray.GPUArray Cholesky factorised matrix Notes ----- Double precision is only supported if the standard version of the CULA Dense toolkit is installed. Examples -------- >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import scipy.linalg >>> import linalg >>> linalg.init() >>> a = np.array([[3.0,0.0],[0.0,7.0]]) >>> a = np.asarray(a, np.float64) >>> a_gpu = gpuarray.to_gpu(a) >>> cho_factor(a_gpu) >>> np.allclose(a_gpu.get(), scipy.linalg.cho_factor(a)[0]) True """ if not _has_cula: raise NotImplementError('CULA not installed') data_type = a_gpu.dtype.type real_type = np.float32 if cula._libcula_toolkit == 'standard': if data_type == np.complex64: cula_func = cula.culaDeviceCpotrf elif data_type == np.float32: cula_func = cula.culaDeviceSpotrf elif data_type == np.complex128: cula_func = cula.culaDeviceZpotrf elif data_type == np.float64: cula_func = cula.culaDeviceDpotrf else: raise ValueError('unsupported type') real_type = np.float64 else: raise ValueError('Cholesky factorisation not included in CULA Dense Free version') # Since CUDA assumes that arrays are stored in column-major # format, the input matrix is assumed to be transposed: n, m = a_gpu.shape if (n!=m): raise ValueError('Matrix must be symmetric positive-definite') # Set the leading dimension of the input matrix: lda = max(1, m) cula_func(uplo, n, int(a_gpu.gpudata), lda) # Free internal CULA memory: cula.culaFreeBuffers() # In-place operation. No return matrix. Result is stored in the input matrix. def cho_solve(a_gpu, b_gpu, uplo='L'): """ Cholesky solver Solve a system of equations via cholesky factorization, i.e. `a*x = b`. Overwrites `b` to give `inv(a)*b`, and overwrites the chosen triangle of `a` with factorized triangle Parameters ---------- a : pycuda.gpuarray.GPUArray Input matrix of shape `(m, m)` to decompose. b : pycuda.gpuarray.GPUArray Input matrix of shape `(m, 1)` to decompose. uplo: chr use the upper='U' or lower='L' (default) triangle of `a`. Returns ------- a: pycuda.gpuarray.GPUArray Cholesky factorised matrix Notes ----- Double precision is only supported if the standard version of the CULA Dense toolkit is installed. 
Examples -------- >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import scipy.linalg >>> import linalg >>> linalg.init() >>> a = np.array([[3.0,0.0],[0.0,7.0]]) >>> a = np.asarray(a, np.float64) >>> a_gpu = gpuarray.to_gpu(a) >>> b = np.array([11.,19.]) >>> b = np.asarray(b, np.float64) >>> b_gpu = gpuarray.to_gpu(b) >>> cho_solve(a_gpu,b_gpu) >>> np.allclose(b_gpu.get(), scipy.linalg.cho_solve(scipy.linalg.cho_factor(a), b)) True """ if not _has_cula: raise NotImplementError('CULA not installed') data_type = a_gpu.dtype.type real_type = np.float32 if cula._libcula_toolkit == 'standard': if data_type == np.complex64: cula_func = cula.culaDeviceCposv elif data_type == np.float32: cula_func = cula.culaDeviceSposv elif data_type == np.complex128: cula_func = cula.culaDeviceZposv elif data_type == np.float64: cula_func = cula.culaDeviceDposv else: raise ValueError('unsupported type') real_type = np.float64 else: raise ValueError('Cholesky factorisation not included in CULA Dense Free version') # Since CUDA assumes that arrays are stored in column-major # format, the input matrix is assumed to be transposed: na, ma = a_gpu.shape if (na!=ma): raise ValueError('Matrix must be symmetric positive-definite') if a_gpu.flags.c_contiguous != b_gpu.flags.c_contiguous: raise ValueError('unsupported combination of input order') b_shape = b_gpu.shape if len(b_shape) == 1: b_shape = (b_shape[0], 1) if a_gpu.flags.f_contiguous: lda = max(1, na) ldb = max(1, b_shape[0]) else: lda = max(1, ma) ldb = lda if b_shape[1] > 1: raise ValueError('only vectors allowed in c-order RHS') # Assuming we are only solving for a vector. Hence, nrhs = 1 cula_func(uplo, na, b_shape[1], int(a_gpu.gpudata), lda, int(b_gpu.gpudata), ldb) # Free internal CULA memory: cula.culaFreeBuffers() # In-place operation. No return matrix. Result is stored in the input matrix # and in the input vector. def add_dot(a_gpu, b_gpu, c_gpu, transa='N', transb='N', alpha=1.0, beta=1.0, handle=None): """ Calculates the dot product of two arrays and adds it to a third matrix. In essence, this computes C = alpha * (A B) + beta * C For 2D arrays of shapes `(m, k)` and `(k, n)`, it computes the matrix product; the result has shape `(m, n)`. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Input array. b_gpu : pycuda.gpuarray.GPUArray Input array. c_gpu : pycuda.gpuarray.GPUArray Cummulative array. transa : char If 'T', compute the product of the transpose of `a_gpu`. If 'C', compute the product of the Hermitian of `a_gpu`. transb : char If 'T', compute the product of the transpose of `b_gpu`. If 'C', compute the product of the Hermitian of `b_gpu`. handle : int (optional) CUBLAS context. If no context is specified, the default handle from `skcuda.misc._global_cublas_handle` is used. Returns ------- c_gpu : pycuda.gpuarray.GPUArray Notes ----- The matrices must all contain elements of the same data type. 
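Examples
--------
A short example of the update ``c = alpha*a*b + beta*c`` (a minimal
sketch along the lines of the `dot` examples):

>>> import pycuda.gpuarray as gpuarray
>>> import pycuda.autoinit
>>> import numpy as np
>>> import linalg
>>> linalg.init()
>>> a = np.asarray(np.random.rand(4, 2), np.float32)
>>> b = np.asarray(np.random.rand(2, 2), np.float32)
>>> c = np.asarray(np.random.rand(4, 2), np.float32)
>>> a_gpu = gpuarray.to_gpu(a)
>>> b_gpu = gpuarray.to_gpu(b)
>>> c_gpu = gpuarray.to_gpu(c)
>>> c_gpu = linalg.add_dot(a_gpu, b_gpu, c_gpu, alpha=2.0, beta=3.0)
>>> np.allclose(2.0*np.dot(a, b) + 3.0*c, c_gpu.get(), atol=1e-5)
True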
""" if handle is None: handle = misc._global_cublas_handle # Get the shapes of the arguments (accounting for the # possibility that one of them may only have one dimension): a_shape = a_gpu.shape b_shape = b_gpu.shape if len(a_shape) == 1: a_shape = (1, a_shape[0]) if len(b_shape) == 1: b_shape = (1, b_shape[0]) # Perform matrix multiplication for 2D arrays: if (a_gpu.dtype == np.complex64 and b_gpu.dtype == np.complex64): cublas_func = cublas.cublasCgemm alpha = np.complex64(alpha) beta = np.complex64(beta) elif (a_gpu.dtype == np.float32 and b_gpu.dtype == np.float32): cublas_func = cublas.cublasSgemm alpha = np.float32(alpha) beta = np.float32(beta) elif (a_gpu.dtype == np.complex128 and b_gpu.dtype == np.complex128): cublas_func = cublas.cublasZgemm alpha = np.complex128(alpha) beta = np.complex128(beta) elif (a_gpu.dtype == np.float64 and b_gpu.dtype == np.float64): cublas_func = cublas.cublasDgemm alpha = np.float64(alpha) beta = np.float64(beta) else: raise ValueError('unsupported combination of input types') transa = transa.lower() transb = transb.lower() a_f_order = a_gpu.strides[1] > a_gpu.strides[0] b_f_order = b_gpu.strides[1] > b_gpu.strides[0] c_f_order = c_gpu.strides[1] > c_gpu.strides[0] if a_f_order != b_f_order: raise ValueError('unsupported combination of input order') if a_f_order != c_f_order: raise ValueError('invalid order for c_gpu') if a_f_order: # F order array if transa in ['t', 'c']: k, m = a_shape elif transa in ['n']: m, k = a_shape else: raise ValueError('invalid value for transa') if transb in ['t', 'c']: n, l = b_shape elif transb in ['n']: l, n = b_shape else: raise ValueError('invalid value for transb') if l != k: raise ValueError('objects are not aligned') lda = max(1, a_gpu.strides[1] // a_gpu.dtype.itemsize) ldb = max(1, b_gpu.strides[1] // b_gpu.dtype.itemsize) ldc = max(1, c_gpu.strides[1] // c_gpu.dtype.itemsize) if c_gpu.shape != (m, n) or c_gpu.dtype != a_gpu.dtype: raise ValueError('invalid value for c_gpu') cublas_func(handle, transa, transb, m, n, k, alpha, a_gpu.gpudata, lda, b_gpu.gpudata, ldb, beta, c_gpu.gpudata, ldc) else: if transb in ['t', 'c']: m, k = b_shape elif transb in ['n']: k, m = b_shape else: raise ValueError('invalid value for transb') if transa in ['t', 'c']: l, n = a_shape elif transa in ['n']: n, l = a_shape else: raise ValueError('invalid value for transa') if l != k: raise ValueError('objects are not aligned') lda = max(1, a_gpu.strides[0] // a_gpu.dtype.itemsize) ldb = max(1, b_gpu.strides[0] // b_gpu.dtype.itemsize) ldc = max(1, c_gpu.strides[0] // c_gpu.dtype.itemsize) # Note that the desired shape of the output matrix is the transpose # of what CUBLAS assumes: if c_gpu.shape != (n, m) or c_gpu.dtype != a_gpu.dtype: raise ValueError('invalid value for c_gpu') cublas_func(handle, transb, transa, m, n, k, alpha, b_gpu.gpudata, ldb, a_gpu.gpudata, lda, beta, c_gpu.gpudata, ldc) return c_gpu def dot(x_gpu, y_gpu, transa='N', transb='N', handle=None, out=None): """ Dot product of two arrays. For 1D arrays, this function computes the inner product. For 2D arrays of shapes `(m, k)` and `(k, n)`, it computes the matrix product; the result has shape `(m, n)`. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Input array. y_gpu : pycuda.gpuarray.GPUArray Input array. transa : char If 'T', compute the product of the transpose of `x_gpu`. If 'C', compute the product of the Hermitian of `x_gpu`. transb : char If 'T', compute the product of the transpose of `y_gpu`. If 'C', compute the product of the Hermitian of `y_gpu`. 
handle : int CUBLAS context. If no context is specified, the default handle from `skcuda.misc._global_cublas_handle` is used. out : pycuda.gpuarray.GPUArray, optional Output argument. Will be used to store the result. Returns ------- c_gpu : pycuda.gpuarray.GPUArray, float{32,64}, or complex{64,128} Inner product of `x_gpu` and `y_gpu`. When the inputs are 1D arrays, the result will be returned as a scalar. Notes ----- The input matrices must all contain elements of the same data type. Examples -------- >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import linalg >>> import misc >>> linalg.init() >>> a = np.asarray(np.random.rand(4, 2), np.float32) >>> b = np.asarray(np.random.rand(2, 2), np.float32) >>> a_gpu = gpuarray.to_gpu(a) >>> b_gpu = gpuarray.to_gpu(b) >>> c_gpu = linalg.dot(a_gpu, b_gpu) >>> np.allclose(np.dot(a, b), c_gpu.get()) True >>> d = np.asarray(np.random.rand(5), np.float32) >>> e = np.asarray(np.random.rand(5), np.float32) >>> d_gpu = gpuarray.to_gpu(d) >>> e_gpu = gpuarray.to_gpu(e) >>> f = linalg.dot(d_gpu, e_gpu) >>> np.allclose(np.dot(d, e), f) True """ if handle is None: handle = misc._global_cublas_handle x_shape = tuple(int(i) for i in x_gpu.shape) # workaround for bug #131 y_shape = tuple(int(i) for i in y_gpu.shape) if len(x_shape) == 1: x_shape = (1, x_shape[0]) if len(y_shape) == 1: y_shape = (1, y_shape[0]) if len(x_gpu.shape) == 1 and len(y_gpu.shape) == 1: if x_gpu.size != y_gpu.size: raise ValueError('arrays must be of same length') # Compute inner product for 1D arrays: if (x_gpu.dtype == np.complex64 and y_gpu.dtype == np.complex64): cublas_func = cublas.cublasCdotu elif (x_gpu.dtype == np.float32 and y_gpu.dtype == np.float32): cublas_func = cublas.cublasSdot elif (x_gpu.dtype == np.complex128 and y_gpu.dtype == np.complex128): cublas_func = cublas.cublasZdotu elif (x_gpu.dtype == np.float64 and y_gpu.dtype == np.float64): cublas_func = cublas.cublasDdot else: raise ValueError('unsupported combination of input types') return cublas_func(handle, x_gpu.size, x_gpu.gpudata, 1, y_gpu.gpudata, 1) else: transa = transa.lower() transb = transb.lower() if out is None: if transa in ['t', 'c']: k, m = x_shape else: m, k = x_shape if transb in ['t', 'c']: n, l = y_shape else: l, n = y_shape alloc = misc._global_cublas_allocator if x_gpu.strides[1] > x_gpu.strides[0]: # F order out = gpuarray.empty((m, n), x_gpu.dtype, order="F", allocator=alloc) else: out = gpuarray.empty((m, n), x_gpu.dtype, order="C", allocator=alloc) return add_dot(x_gpu, y_gpu, out, transa, transb, 1.0, 0.0, handle) def mdot(*args, **kwargs): """ Product of several matrices. Computes the matrix product of several arrays of shapes. Parameters ---------- a_gpu, b_gpu, ... : pycuda.gpuarray.GPUArray Arrays to multiply. handle : int CUBLAS context. If no context is specified, the default handle from `skcuda.misc._global_cublas_handle` is used. Returns ------- c_gpu : pycuda.gpuarray.GPUArray Matrix product of `a_gpu`, `b_gpu`, etc. Notes ----- The input matrices must all contain elements of the same data type. 
Examples -------- >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import linalg >>> linalg.init() >>> a = np.asarray(np.random.rand(4, 2), np.float32) >>> b = np.asarray(np.random.rand(2, 2), np.float32) >>> c = np.asarray(np.random.rand(2, 2), np.float32) >>> a_gpu = gpuarray.to_gpu(a) >>> b_gpu = gpuarray.to_gpu(b) >>> c_gpu = gpuarray.to_gpu(c) >>> d_gpu = linalg.mdot(a_gpu, b_gpu, c_gpu) >>> np.allclose(np.dot(a, np.dot(b, c)), d_gpu.get()) True """ if ' handle' in kwargs and kwargs['handle'] is not None: handle = kwargs['handle'] else: handle = misc._global_cublas_handle # Free the temporary matrix allocated when computing the dot # product: out_gpu = args[0] for next_gpu in args[1:]: temp_gpu = dot(out_gpu, next_gpu, handle=handle) out_gpu.gpudata.free() del(out_gpu) out_gpu = temp_gpu del(temp_gpu) return out_gpu def dot_diag(d_gpu, a_gpu, trans='N', overwrite=False, handle=None): """ Dot product of diagonal and non-diagonal arrays. Computes the matrix product of a diagonal array represented as a vector and a non-diagonal array. Parameters ---------- d_gpu : pycuda.gpuarray.GPUArray Array of length `N` corresponding to the diagonal of the multiplier. a_gpu : pycuda.gpuarray.GPUArray Multiplicand array with shape `(N, M)`. Must have same data type as `d_gpu`. trans : char If 'T', compute the product of the transpose of `a_gpu`. overwrite : bool (default: False) If true, save the result in `a_gpu`. handle : int CUBLAS context. If no context is specified, the default handle from `skcuda.misc._global_cublas_handle` is used. Returns ------- r_gpu : pycuda.gpuarray.GPUArray The computed matrix product. Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray as gpuarray >>> import numpy as np >>> import linalg >>> linalg.init() >>> d = np.random.rand(4) >>> a = np.random.rand(4, 4) >>> d_gpu = gpuarray.to_gpu(d) >>> a_gpu = gpuarray.to_gpu(a) >>> r_gpu = linalg.dot_diag(d_gpu, a_gpu) >>> np.allclose(np.dot(np.diag(d), a), r_gpu.get()) True """ if handle is None: handle = misc._global_cublas_handle if not (len(d_gpu.shape) == 1 or (d_gpu.shape[0] == 1 or d_gpu.shape[1] == 1)): raise ValueError('d_gpu must be a vector') if len(a_gpu.shape) != 2: raise ValueError('a_gpu must be a matrix') trans = trans.lower() if trans == 'n': rows, cols = a_gpu.shape else: cols, rows = a_gpu.shape N = d_gpu.size if N != rows: raise ValueError('incompatible dimensions') if a_gpu.dtype != d_gpu.dtype: raise ValueError('argument types must be the same') if (a_gpu.dtype == np.complex64): cublas_func = cublas.cublasCdgmm elif (a_gpu.dtype == np.float32): cublas_func = cublas.cublasSdgmm elif (a_gpu.dtype == np.complex128): cublas_func = cublas.cublasZdgmm elif (a_gpu.dtype == np.float64): cublas_func = cublas.cublasDdgmm else: raise ValueError('unsupported input type') if overwrite: r_gpu = a_gpu else: r_gpu = a_gpu.copy() if (trans == 'n' and a_gpu.flags.c_contiguous) \ or (trans == 't' and a_gpu.flags.f_contiguous): side = "R" else: side = "L" lda = a_gpu.shape[1] if a_gpu.flags.c_contiguous else a_gpu.shape[0] ldr = lda n, m = a_gpu.shape if a_gpu.flags.f_contiguous else (a_gpu.shape[1], a_gpu.shape[0]) cublas_func(handle, side, n, m, a_gpu.gpudata, lda, d_gpu.gpudata, 1, r_gpu.gpudata, ldr) return r_gpu def add_diag(d_gpu, a_gpu, overwrite=False, handle=None): """ Adds a vector to the diagonal of an array. This is the same as A + diag(D), but faster. 
Parameters ---------- d_gpu : pycuda.gpuarray.GPUArray Array of length `N` corresponding to the vector to be added to the diagonal. a_gpu : pycuda.gpuarray.GPUArray Summand array with shape `(N, N)`. overwrite : bool (default: False) If true, save the result in `a_gpu`. handle : int CUBLAS context. If no context is specified, the default handle from `skcuda.misc._global_cublas_handle` is used. Returns ------- r_gpu : pycuda.gpuarray.GPUArray The computed sum product. Notes ----- `d_gpu` and `a_gpu` must have the same precision data type. """ if handle is None: handle = misc._global_cublas_handle if not (len(d_gpu.shape) == 1 or (d_gpu.shape[0] == 1 or d_gpu.shape[1] == 1)): raise ValueError('d_gpu must be a vector') if len(a_gpu.shape) != 2: raise ValueError('a_gpu must be a matrix') if a_gpu.shape[0] != a_gpu.shape[1]: raise ValueError('a_gpu must be square') if d_gpu.size != a_gpu.shape[0]: raise ValueError('incompatible dimensions') if a_gpu.dtype != d_gpu.dtype: raise ValueError('precision of argument types must be the same') if (a_gpu.dtype == np.complex64): axpy = cublas.cublasCaxpy elif (a_gpu.dtype == np.float32): axpy = cublas.cublasSaxpy elif (a_gpu.dtype == np.complex128): axpy = cublas.cublasZaxpy elif (a_gpu.dtype == np.float64): axpy = cublas.cublasDaxpy else: raise ValueError('unsupported input type') if overwrite: r_gpu = a_gpu else: r_gpu = a_gpu.copy() n = a_gpu.shape[0] axpy(handle, n, 1.0, d_gpu.gpudata, int(1), r_gpu.gpudata, int(n+1)) return r_gpu def _transpose(a_gpu, conj=False, handle=None): if handle is None: handle = misc._global_cublas_handle if len(a_gpu.shape) != 2: raise ValueError('a_gpu must be a matrix') if (a_gpu.dtype == np.complex64): func = cublas.cublasCgeam elif (a_gpu.dtype == np.float32): func = cublas.cublasSgeam elif (a_gpu.dtype == np.complex128): func = cublas.cublasZgeam elif (a_gpu.dtype == np.float64): func = cublas.cublasDgeam else: raise ValueError('unsupported input type') if conj: transa = 'c' else: transa = 't' M, N = np.array(a_gpu.shape, int) # workaround for bug #131 at_gpu = gpuarray.empty((N, M), a_gpu.dtype) func(handle, transa, 't', M, N, 1.0, a_gpu.gpudata, N, 0.0, a_gpu.gpudata, N, at_gpu.gpudata, M) return at_gpu def transpose(a_gpu, handle=None): """ Matrix transpose. Transpose a matrix in device memory and return an object representing the transposed matrix. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Input matrix of shape `(m, n)`. Returns ------- at_gpu : pycuda.gpuarray.GPUArray Transposed matrix of shape `(n, m)`. handle : int CUBLAS context. If no context is specified, the default handle from `skcuda.misc._global_cublas_handle` is used. Examples -------- >>> import pycuda.autoinit >>> import pycuda.driver as drv >>> import pycuda.gpuarray as gpuarray >>> import numpy as np >>> import linalg >>> linalg.init() >>> a = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], np.float32) >>> a_gpu = gpuarray.to_gpu(a) >>> at_gpu = linalg.transpose(a_gpu) >>> np.all(a.T == at_gpu.get()) True >>> b = np.array([[1j, 2j, 3j, 4j, 5j, 6j], [7j, 8j, 9j, 10j, 11j, 12j]], np.complex64) >>> b_gpu = gpuarray.to_gpu(b) >>> bt_gpu = linalg.transpose(b_gpu) >>> np.all(b.T == bt_gpu.get()) True """ return _transpose(a_gpu, False, handle) def hermitian(a_gpu, handle=None): """ Hermitian (conjugate) matrix transpose. Conjugate transpose a matrix in device memory and return an object representing the transposed matrix. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Input matrix of shape `(m, n)`. handle : int CUBLAS context. 
If no context is specified, the default handle from `skcuda.misc._global_cublas_handle` is used. Returns ------- at_gpu : pycuda.gpuarray.GPUArray Transposed matrix of shape `(n, m)`. Examples -------- >>> import pycuda.autoinit >>> import pycuda.driver as drv >>> import pycuda.gpuarray as gpuarray >>> import numpy as np >>> import linalg >>> linalg.init() >>> a = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], np.float32) >>> a_gpu = gpuarray.to_gpu(a) >>> at_gpu = linalg.hermitian(a_gpu) >>> np.all(a.T == at_gpu.get()) True >>> b = np.array([[1j, 2j, 3j, 4j, 5j, 6j], [7j, 8j, 9j, 10j, 11j, 12j]], np.complex64) >>> b_gpu = gpuarray.to_gpu(b) >>> bt_gpu = linalg.hermitian(b_gpu) >>> np.all(np.conj(b.T) == bt_gpu.get()) True """ return _transpose(a_gpu, True, handle) def conj(x_gpu, overwrite=False): """ Complex conjugate. Compute the complex conjugate of the array in device memory. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Input array of shape `(m, n)`. overwrite : bool (default: False) If true, save the result in the specified array. If false, return the result in a newly allocated array. Returns ------- xc_gpu : pycuda.gpuarray.GPUArray Conjugate of the input array. If `overwrite` is true, the returned matrix is the same as the input array. Examples -------- >>> import pycuda.driver as drv >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import linalg >>> linalg.init() >>> x = np.array([[1+1j, 2-2j, 3+3j, 4-4j], [5+5j, 6-6j, 7+7j, 8-8j]], np.complex64) >>> x_gpu = gpuarray.to_gpu(x) >>> y_gpu = linalg.conj(x_gpu) >>> np.all(x == np.conj(y_gpu.get())) True """ # Don't attempt to process non-complex matrix types: if x_gpu.dtype in [np.float32, np.float64]: return x_gpu try: func = conj.cache[x_gpu.dtype] except KeyError: ctype = tools.dtype_to_ctype(x_gpu.dtype) func = el.ElementwiseKernel( "{ctype} *x, {ctype} *y".format(ctype=ctype), "y[i] = conj(x[i])") conj.cache[x_gpu.dtype] = func if overwrite: func(x_gpu, x_gpu) return x_gpu else: y_gpu = gpuarray.empty_like(x_gpu) func(x_gpu, y_gpu) return y_gpu conj.cache = {} @context_dependent_memoize def _get_diag_kernel(use_double, use_complex): template = Template(""" #include #if ${use_double} #if ${use_complex} #define FLOAT pycuda::complex #else #define FLOAT double #endif #else #if ${use_complex} #define FLOAT pycuda::complex #else #define FLOAT float #endif #endif // Assumes that d already contains zeros in all positions. // N must contain the number of elements in v. __global__ void diag(FLOAT *v, FLOAT *d, int N) { unsigned int idx = blockIdx.y*blockDim.x*gridDim.x+ blockIdx.x*blockDim.x+threadIdx.x; if (idx < N) d[idx*(N+1)] = v[idx]; } """) # Set this to False when debugging to make sure the compiled kernel is # not cached: tmpl = template.substitute(use_double=use_double, use_complex=use_complex) cache_dir=None diag_mod = SourceModule(tmpl, cache_dir=cache_dir) return diag_mod.get_function("diag") def diag(v_gpu): """ Construct a diagonal matrix if input array is one-dimensional, or extracts diagonal entries of a two-dimensional array. If input-array is one-dimensional: Constructs a matrix in device memory whose diagonal elements correspond to the elements in the specified array; all non-diagonal elements are set to 0. If input-array is two-dimensional: Constructs an array in device memory whose elements correspond to the elements along the main-diagonal of the specified array. Parameters ---------- v_obj : pycuda.gpuarray.GPUArray Input array of shape `(n,m)`. 
Returns ------- d_gpu : pycuda.gpuarray.GPUArray If v_obj has shape `(n,1)`, output is diagonal matrix of dimensions `[n, n]`. If v_obj has shape `(n,m)`, output is array of length `min(n,m)`. Examples -------- >>> import pycuda.driver as drv >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import linalg >>> linalg.init() >>> v = np.array([1, 2, 3, 4, 5, 6], np.float32) >>> v_gpu = gpuarray.to_gpu(v) >>> d_gpu = linalg.diag(v_gpu) >>> np.all(d_gpu.get() == np.diag(v)) True >>> v = np.array([1j, 2j, 3j, 4j, 5j, 6j], np.complex64) >>> v_gpu = gpuarray.to_gpu(v) >>> d_gpu = linalg.diag(v_gpu) >>> np.all(d_gpu.get() == np.diag(v)) True >>> v = np.array([[1., 2., 3.],[4., 5., 6.]], np.float64) >>> v_gpu = gpuarray.to_gpu(v) >>> d_gpu = linalg.diag(v_gpu) >>> d_gpu array([ 1., 5.]) """ if v_gpu.dtype not in [np.float32, np.float64, np.complex64, np.complex128]: raise ValueError('unrecognized type') alloc = misc._global_cublas_allocator if (len(v_gpu.shape) > 1) and (len(v_gpu.shape) < 3): if (v_gpu.dtype == np.complex64): func = cublas.cublasCcopy elif (v_gpu.dtype == np.float32): func = cublas.cublasScopy elif (v_gpu.dtype == np.complex128): func = cublas.cublasZcopy elif (v_gpu.dtype == np.float64): func = cublas.cublasDcopy else: raise ValueError('unsupported input type') n = int(min(v_gpu.shape)) # workaround for bug #131 incx = v_gpu.shape[1]+1 # Allocate the output array d_gpu = gpuarray.empty(n, v_gpu.dtype.type, allocator=alloc) handle = misc._global_cublas_handle func(handle, n, v_gpu.gpudata, incx, d_gpu.gpudata, 1) return d_gpu elif len(v_gpu.shape) >= 3: raise ValueError('input array cannot have greater than 2-dimensions') use_double = int(v_gpu.dtype in [np.float64, np.complex128]) use_complex = int(v_gpu.dtype in [np.complex64, np.complex128]) # Initialize output matrix: d_gpu = misc.zeros((v_gpu.size, v_gpu.size), v_gpu.dtype, allocator=alloc) # Get block/grid sizes: dev = misc.get_current_device() block_dim, grid_dim = misc.select_block_grid_sizes(dev, d_gpu.shape) diag = _get_diag_kernel(use_double, use_complex) diag(v_gpu, d_gpu, np.uint32(v_gpu.size), block=block_dim, grid=grid_dim) return d_gpu @context_dependent_memoize def _get_eye_kernel(dtype): ctype=tools.dtype_to_ctype(dtype) return el.ElementwiseKernel("{ctype} *e".format(ctype=ctype), "e[i] = 1") def eye(N, dtype=np.float32): """ Construct a 2D matrix with ones on the diagonal and zeros elsewhere. Constructs a matrix in device memory whose diagonal elements are set to 1 and non-diagonal elements are set to 0. Parameters ---------- N : int Number of rows or columns in the output matrix. dtype : type Matrix data type. Returns ------- e_gpu : pycuda.gpuarray.GPUArray Diagonal matrix of dimensions `[N, N]` with diagonal values set to 1. Examples -------- >>> import pycuda.driver as drv >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import linalg >>> linalg.init() >>> N = 5 >>> e_gpu = linalg.eye(N) >>> np.all(e_gpu.get() == np.eye(N)) True >>> e_gpu = linalg.eye(N, np.complex64) >>> np.all(e_gpu.get() == np.eye(N, dtype=np.complex64)) True """ if dtype not in [np.float32, np.float64, np.complex64, np.complex128]: raise ValueError('unrecognized type') if N <= 0: raise ValueError('N must be greater than 0') alloc = misc._global_cublas_allocator e_gpu = misc.zeros((N, N), dtype, allocator=alloc) func = _get_eye_kernel(dtype) func(e_gpu, slice=slice(0, N*N, N+1)) return e_gpu def pinv(a_gpu, rcond=1e-15): """ Moore-Penrose pseudoinverse. 
Compute the Moore-Penrose pseudoinverse of the specified matrix. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Input matrix of shape `(m, n)`. rcond : float Singular values smaller than `rcond`*max(singular_values)` are set to zero. Returns ------- a_inv_gpu : pycuda.gpuarray.GPUArray Pseudoinverse of input matrix. Notes ----- Double precision is only supported if the standard version of the CULA Dense toolkit is installed. This function destroys the contents of the input matrix. If the input matrix is square, the pseudoinverse uses less memory. Examples -------- >>> import pycuda.driver as drv >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import linalg >>> linalg.init() >>> a = np.asarray(np.random.rand(8, 4), np.float32) >>> a_gpu = gpuarray.to_gpu(a) >>> a_inv_gpu = linalg.pinv(a_gpu) >>> np.allclose(np.linalg.pinv(a), a_inv_gpu.get(), 1e-4) True >>> b = np.asarray(np.random.rand(8, 4)+1j*np.random.rand(8, 4), np.complex64) >>> b_gpu = gpuarray.to_gpu(b) >>> b_inv_gpu = linalg.pinv(b_gpu) >>> np.allclose(np.linalg.pinv(b), b_inv_gpu.get(), 1e-4) True """ if not _has_cula: raise NotImplementedError('CULA not installed') # Perform in-place SVD if the matrix is square to save memory: if a_gpu.shape[0] == a_gpu.shape[1]: u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', 'o') else: u_gpu, s_gpu, vh_gpu = svd(a_gpu, 's', 's') # Suppress very small singular values and convert the singular value array # to complex if the original matrix is complex so that the former can be # handled by dot_diag(): cutoff_gpu = gpuarray.max(s_gpu)*rcond real_ctype = tools.dtype_to_ctype(s_gpu.dtype) if a_gpu.dtype in [np.complex64, np.complex128]: if s_gpu.dtype == np.float32: complex_dtype = np.complex64 elif s_gpu.dtype == np.float64: complex_dtype = np.complex128 else: raise ValueError('cannot convert singular values to complex') s_complex_gpu = gpuarray.empty(len(s_gpu), complex_dtype) complex_ctype = tools.dtype_to_ctype(complex_dtype) cutoff_func = el.ElementwiseKernel("{real_ctype} *s_real, {complex_ctype} *s_complex," " {real_ctype} *cutoff".format(real_ctype=real_ctype, complex_ctype=complex_ctype), "if (s_real[i] > cutoff[0]) {s_complex[i] = 1/s_real[i];} else {s_complex[i] = 0;}") cutoff_func(s_gpu, s_complex_gpu, cutoff_gpu) # Compute the pseudoinverse without allocating a new diagonal matrix: return dot(vh_gpu, dot_diag(s_complex_gpu, u_gpu, 't'), 'c', 'c') else: cutoff_func = el.ElementwiseKernel("{real_ctype} *s, {real_ctype} *cutoff".format(real_ctype=real_ctype), "if (s[i] > cutoff[0]) {s[i] = 1/s[i];} else {s[i] = 0;}") cutoff_func(s_gpu, cutoff_gpu) # Compute the pseudoinverse without allocating a new diagonal matrix: return dot(vh_gpu, dot_diag(s_gpu, u_gpu, 't'), 'c', 'c') @context_dependent_memoize def _get_tril_kernel(use_double, use_complex, cols): template = Template(""" #include #if ${use_double} #if ${use_complex} #define FLOAT pycuda::complex #else #define FLOAT double #endif #else #if ${use_complex} #define FLOAT pycuda::complex #else #define FLOAT float #endif #endif __global__ void tril(FLOAT *a, unsigned int N) { unsigned int idx = blockIdx.y*blockDim.x*gridDim.x+ blockIdx.x*blockDim.x+threadIdx.x; unsigned int ix = idx/${cols}; unsigned int iy = idx%${cols}; if (idx < N) { if (ix < iy) a[idx] = 0.0; } } """) # Set this to False when debugging to make sure the compiled kernel is # not cached: cache_dir=None tmpl = template.substitute(use_double=use_double, use_complex=use_complex, cols=cols) mod = SourceModule(tmpl, cache_dir=cache_dir) 
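# Usage sketch (not part of the library, CULA required): using pinv() to
# solve an overdetermined least-squares problem. pinv() destroys its input,
# hence the copy; the tolerance is an arbitrary choice for this sketch.
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.linalg as linalg
linalg.init()
a = np.asarray(np.random.rand(8, 4), np.float32)
b = np.asarray(np.random.rand(8, 1), np.float32)
apinv_gpu = linalg.pinv(gpuarray.to_gpu(a.copy()))
x_gpu = linalg.dot(apinv_gpu, gpuarray.to_gpu(b))
print(np.allclose(x_gpu.get(), np.linalg.lstsq(a, b)[0], atol=1e-3))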
return mod.get_function("tril") def tril(a_gpu, overwrite=False, handle=None): """ Lower triangle of a matrix. Return the lower triangle of a square matrix. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Input matrix of shape `(m, m)` overwrite : bool (default: False) If true, zero out the upper triangle of the matrix. If false, return the result in a newly allocated matrix. handle : int CUBLAS context. If no context is specified, the default handle from `skcuda.misc._global_cublas_handle` is used. Returns ------- l_gpu : pycuda.gpuarray The lower triangle of the original matrix. Examples -------- >>> import pycuda.driver as drv >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import linalg >>> linalg.init() >>> a = np.asarray(np.random.rand(4, 4), np.float32) >>> a_gpu = gpuarray.to_gpu(a) >>> l_gpu = linalg.tril(a_gpu, False) >>> np.allclose(np.tril(a), l_gpu.get()) True """ if handle is None: handle = misc._global_cublas_handle alloc = misc._global_cublas_allocator if len(a_gpu.shape) != 2 or a_gpu.shape[0] != a_gpu.shape[1]: raise ValueError('matrix must be square') if a_gpu.dtype == np.float32: swap_func = cublas.cublasSswap copy_func = cublas.cublasScopy use_double = 0 use_complex = 0 elif a_gpu.dtype == np.float64: swap_func = cublas.cublasDswap copy_func = cublas.cublasDcopy use_double = 1 use_complex = 0 elif a_gpu.dtype == np.complex64: swap_func = cublas.cublasCswap copy_func = cublas.cublasCcopy use_double = 0 use_complex = 1 elif a_gpu.dtype == np.complex128: swap_func = cublas.cublasZswap copy_func = cublas.cublasZcopy use_double = 1 use_complex = 1 else: raise ValueError('unrecognized type') N = a_gpu.shape[0] # Get block/grid sizes: dev = misc.get_current_device() block_dim, grid_dim = misc.select_block_grid_sizes(dev, a_gpu.shape) tril = _get_tril_kernel(use_double, use_complex, cols=N) if not overwrite: # workaround for bug #131 a_orig_gpu = gpuarray.empty(tuple(int(i) for i in a_gpu.shape), a_gpu.dtype, allocator=alloc) copy_func(handle, a_gpu.size, int(a_gpu.gpudata), 1, int(a_orig_gpu.gpudata), 1) tril(a_gpu, np.uint32(a_gpu.size), block=block_dim, grid=grid_dim) if overwrite: return a_gpu else: # Restore original contents of a_gpu: swap_func(handle, a_gpu.size, int(a_gpu.gpudata), 1, int(a_orig_gpu.gpudata), 1) return a_orig_gpu @context_dependent_memoize def _get_triu_kernel(use_double, use_complex, cols): template = Template(""" #include #if ${use_double} #if ${use_complex} #define FLOAT pycuda::complex #else #define FLOAT double #endif #else #if ${use_complex} #define FLOAT pycuda::complex #else #define FLOAT float #endif #endif __global__ void triu(FLOAT *a, unsigned int N) { unsigned int idx = blockIdx.y*blockDim.x*gridDim.x+ blockIdx.x*blockDim.x+threadIdx.x; unsigned int ix = idx/${cols}; unsigned int iy = idx%${cols}; if (idx < N) { if (ix > iy) a[idx] = 0.0; } } """) # Set this to False when debugging to make sure the compiled kernel is # not cached: cache_dir=None tmpl = template.substitute(use_double=use_double, use_complex=use_complex, cols=cols) mod = SourceModule(tmpl, cache_dir=cache_dir) return mod.get_function("triu") def triu(a_gpu, k=0, overwrite=False, handle=None): """ Upper triangle of a matrix. Return the upper triangle of a square matrix. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Input matrix of shape `(m, m)` overwrite : bool (default: False) If true, zero out the lower triangle of the matrix. If false, return the result in a newly allocated matrix. handle : int CUBLAS context. 
If no context is specified, the default handle from `skcuda.misc._global_cublas_handle` is used. Returns ------- u_gpu : pycuda.gpuarray The upper triangle of the original matrix. Examples -------- >>> import pycuda.driver as drv >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import linalg >>> linalg.init() >>> a = np.asarray(np.random.rand(4, 4), np.float32) >>> a_gpu = gpuarray.to_gpu(a) >>> u_gpu = linalg.triu(a_gpu, False) >>> np.allclose(np.triu(a), u_gpu.get()) True """ if handle is None: handle = misc._global_cublas_handle alloc = misc._global_cublas_allocator if len(a_gpu.shape) != 2 or a_gpu.shape[0] != a_gpu.shape[1]: raise ValueError('matrix must be square') if a_gpu.dtype == np.float32: swap_func = cublas.cublasSswap copy_func = cublas.cublasScopy use_double = 0 use_complex = 0 elif a_gpu.dtype == np.float64: swap_func = cublas.cublasDswap copy_func = cublas.cublasDcopy use_double = 1 use_complex = 0 elif a_gpu.dtype == np.complex64: swap_func = cublas.cublasCswap copy_func = cublas.cublasCcopy use_double = 0 use_complex = 1 elif a_gpu.dtype == np.complex128: swap_func = cublas.cublasZswap copy_func = cublas.cublasZcopy use_double = 1 use_complex = 1 else: raise ValueError('unrecognized type') N = int(a_gpu.shape[0]) # Get block/grid sizes: dev = misc.get_current_device() block_dim, grid_dim = misc.select_block_grid_sizes(dev, a_gpu.shape) tril = _get_triu_kernel(use_double, use_complex, cols=N) if not overwrite: a_orig_gpu = gpuarray.empty( (N,N), a_gpu.dtype, allocator=alloc) copy_func(handle, a_gpu.size, int(a_gpu.gpudata), 1, int(a_orig_gpu.gpudata), 1) tril(a_gpu, np.uint32(a_gpu.size), block=block_dim, grid=grid_dim) if overwrite: return a_gpu else: # Restore original contents of a_gpu: swap_func(handle, a_gpu.size, int(a_gpu.gpudata), 1, int(a_orig_gpu.gpudata), 1) return a_orig_gpu def multiply(x_gpu, y_gpu, overwrite=False): """ Multiply arguments element-wise. Parameters ---------- x_gpu, y_gpu : pycuda.gpuarray.GPUArray Input arrays to be multiplied. dev : pycuda.driver.Device Device object to be used. overwrite : bool (default: False) If true, return the result in `y_gpu`. is false, return the result in a newly allocated array. Returns ------- z_gpu : pycuda.gpuarray.GPUArray The element-wise product of the input arrays. 
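# Usage sketch (not part of the library): the overwrite flag controls
# whether tril()/triu() allocate a new matrix or zero the complementary
# triangle of the input in place.
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.linalg as linalg
linalg.init()
a = np.asarray(np.random.rand(5, 5), np.float32)
a_gpu = gpuarray.to_gpu(a)
l_gpu = linalg.tril(a_gpu)                    # new array, a_gpu untouched
u_gpu = linalg.triu(a_gpu, overwrite=True)    # a_gpu itself is modified
print(np.allclose(l_gpu.get(), np.tril(a)))
print(np.allclose(u_gpu.get(), np.triu(a)))
print(np.allclose(a_gpu.get(), np.triu(a)))   # only the upper triangle remains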
Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray as gpuarray >>> import numpy as np >>> import linalg >>> linalg.init() >>> x = np.asarray(np.random.rand(4, 4), np.float32) >>> y = np.asarray(np.random.rand(4, 4), np.float32) >>> x_gpu = gpuarray.to_gpu(x) >>> y_gpu = gpuarray.to_gpu(y) >>> z_gpu = linalg.multiply(x_gpu, y_gpu) >>> np.allclose(x*y, z_gpu.get()) True """ alloc = misc._global_cublas_allocator if x_gpu.shape != y_gpu.shape: raise ValueError('input arrays must have the same shape') if x_gpu.dtype not in [np.float32, np.float64, np.complex64, np.complex128]: raise ValueError('unrecognized type') x_ctype = tools.dtype_to_ctype(x_gpu.dtype) y_ctype = tools.dtype_to_ctype(y_gpu.dtype) if overwrite: func = el.ElementwiseKernel("{x_ctype} *x, {y_ctype} *y".format(x_ctype=x_ctype, y_ctype=y_ctype), "y[i] *= x[i]") func(x_gpu, y_gpu) return y_gpu else: result_type = np.result_type(x_gpu.dtype, y_gpu.dtype) # workaround for bug #131 z_gpu = gpuarray.empty(tuple(int(i) for i in x_gpu.shape), result_type, allocator=alloc) func = \ el.ElementwiseKernel("{x_ctype} *x, {y_ctype} *y, {z_type} *z".format(x_ctype=x_ctype, y_ctype=y_ctype, z_type=tools.dtype_to_ctype(result_type)), "z[i] = x[i]*y[i]") func(x_gpu, y_gpu, z_gpu) return z_gpu def norm(x_gpu, handle=None): """ Euclidean norm (2-norm) of real vector. Computes the Euclidean norm of an array. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Input array. handle : int CUBLAS context. If no context is specified, the default handle from `skcuda.misc._global_cublas_handle` is used. Returns ------- nrm : real Euclidean norm of `x`. Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray as gpuarray >>> import numpy as np >>> import linalg >>> linalg.init() >>> x = np.asarray(np.random.rand(4, 4), np.float32) >>> x_gpu = gpuarray.to_gpu(x) >>> nrm = linalg.norm(x_gpu) >>> np.allclose(nrm, np.linalg.norm(x)) True >>> x_gpu = gpuarray.to_gpu(np.array([3+4j, 12-84j])) >>> linalg.norm(x_gpu) 85.0 """ if handle is None: handle = misc._global_cublas_handle if len(x_gpu.shape) != 1: x_gpu = x_gpu.ravel() # Compute inner product for 1D arrays: if (x_gpu.dtype == np.complex64): cublas_func = cublas.cublasScnrm2 elif (x_gpu.dtype == np.float32): cublas_func = cublas.cublasSnrm2 elif (x_gpu.dtype == np.complex128): cublas_func = cublas.cublasDznrm2 elif (x_gpu.dtype == np.float64): cublas_func = cublas.cublasDnrm2 else: raise ValueError('unsupported input type') return cublas_func(handle, x_gpu.size, x_gpu.gpudata, 1) def scale(alpha, x_gpu, alpha_real=False, handle=None): """ Scale a vector by a factor alpha. Parameters ---------- alpha : scalar Scale parameter x_gpu : pycuda.gpuarray.GPUArray Input array. alpha_real : bool If `True` and `x_gpu` is complex, then one of the specialized versions `cublasCsscal` or `cublasZdscal` is used which might improve performance for large arrays. (By default, `alpha` is coerced to the corresponding complex type.) handle : int CUBLAS context. If no context is specified, the default handle from `skcuda.misc._global_cublas_handle` is used. 
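# Usage sketch (not part of the library): multiply() either allocates a new
# output array or, with overwrite=True, writes the element-wise product into
# its second argument.
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.linalg as linalg
linalg.init()
x = np.asarray(np.random.rand(4, 4), np.float32)
y = np.asarray(np.random.rand(4, 4), np.float32)
x_gpu, y_gpu = gpuarray.to_gpu(x), gpuarray.to_gpu(y)
z_gpu = linalg.multiply(x_gpu, y_gpu)                  # new array
linalg.multiply(x_gpu, y_gpu, overwrite=True)          # result stored in y_gpu
print(np.allclose(z_gpu.get(), x*y))
print(np.allclose(y_gpu.get(), x*y))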
Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray as gpuarray >>> import numpy as np >>> import linalg >>> linalg.init() >>> x = np.asarray(np.random.rand(4, 4), np.float32) >>> x_gpu = gpuarray.to_gpu(x) >>> alpha = 2.4 >>> linalg.scale(alpha, x_gpu) >>> np.allclose(x_gpu.get(), alpha*x) True """ if handle is None: handle = misc._global_cublas_handle if len(x_gpu.shape) != 1: x_gpu = x_gpu.ravel() cublas_func = { np.float32: cublas.cublasSscal, np.float64: cublas.cublasDscal, np.complex64: cublas.cublasCsscal if alpha_real else cublas.cublasCscal, np.complex128: cublas.cublasZdscal if alpha_real else cublas.cublasZscal }.get(x_gpu.dtype.type, None) if cublas_func: return cublas_func(handle, x_gpu.size, alpha, x_gpu.gpudata, 1) else: raise ValueError('unsupported input type') def inv(a_gpu, overwrite=False, ipiv_gpu=None): """ Compute the inverse of a matrix. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Square (n, n) matrix to be inverted. overwrite : bool (default: False) Discard data in `a` (may improve performance). ipiv_gpu : pycuda.gpuarray.GPUArray (optional) Temporary array of size n, can be supplied to save allocations. Returns ------- ainv_gpu : pycuda.gpuarray.GPUArray Inverse of the matrix `a`. Raises ------ LinAlgError : If `a` is singular. ValueError : * If `a` is not square, or not 2-dimensional. * If ipiv was not None but had the wrong dtype or shape. """ if len(a_gpu.shape) != 2 or a_gpu.shape[0] != a_gpu.shape[1]: raise ValueError('expected square matrix') if (a_gpu.dtype == np.complex64): getrf = cula.culaDeviceCgetrf getri = cula.culaDeviceCgetri elif (a_gpu.dtype == np.float32): getrf = cula.culaDeviceSgetrf getri = cula.culaDeviceSgetri elif (a_gpu.dtype == np.complex128): getrf = cula.culaDeviceZgetrf getri = cula.culaDeviceZgetri elif (a_gpu.dtype == np.float64): getrf = cula.culaDeviceDgetrf getri = cula.culaDeviceDgetri n = int(a_gpu.shape[0]) # workaround for bug #131 if ipiv_gpu is None: alloc = misc._global_cublas_allocator ipiv_gpu = gpuarray.empty((n, 1), np.int32, allocator=alloc) elif ipiv_gpu.dtype != np.int32 or np.prod(ipiv_gpu.shape) < n: raise ValueError('invalid ipiv provided') out = a_gpu if overwrite else a_gpu.copy() try: getrf(n, n, out.gpudata, n, ipiv_gpu.gpudata) getri(n, out.gpudata, n, ipiv_gpu.gpudata) except cula.culaDataError as e: raise LinAlgError(e) return out def trace(x_gpu, handle=None): """ Return the sum along the main diagonal of the array. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Matrix to calculate the trace of. handle : int CUBLAS context. If no context is specified, the default handle from `skcuda.misc._global_cublas_handle` is used. 
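# Usage sketch (not part of the library, CULA required): inverting a
# well-conditioned matrix and checking A * A^-1 against the identity; adding
# a multiple of the identity is only done here to keep the example stable.
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.linalg as linalg
linalg.init()
a = np.asarray(np.random.rand(4, 4) + 4*np.eye(4), np.float32)
a_gpu = gpuarray.to_gpu(a)
ainv_gpu = linalg.inv(a_gpu)            # a_gpu is preserved (overwrite=False)
print(np.allclose(np.dot(a, ainv_gpu.get()), np.eye(4), atol=1e-4))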
Returns ------- trace : number trace of x_gpu """ if handle is None: handle = misc._global_cublas_handle if len(x_gpu.shape) != 2: raise ValueError('Only 2D matrices are supported') one = gpuarray.to_gpu(np.ones(1, dtype=x_gpu.dtype)) if (x_gpu.dtype == np.complex64): cublas_func = cublas.cublasCdotu elif (x_gpu.dtype == np.float32): cublas_func = cublas.cublasSdot elif (x_gpu.dtype == np.complex128): cublas_func = cublas.cublasZdotu elif (x_gpu.dtype == np.float64): cublas_func = cublas.cublasDdot if not cublas_func: raise ValueError('unsupported input type') if x_gpu.flags.c_contiguous: incx = x_gpu.shape[1] + 1 else: incx = x_gpu.shape[0] + 1 return cublas_func(handle, np.min(x_gpu.shape), x_gpu.gpudata, incx, one.gpudata, 0) @context_dependent_memoize def _get_det_kernel(dtype): ctype = tools.dtype_to_ctype(dtype) args = "int* ipiv, {ctype}* x, unsigned xn".format(ctype=ctype) return ReductionKernel(dtype, "1.0", "a*b", "(ipiv[i] != i+1) ? -x[i*xn+i] : x[i*xn+i]", args) def det(a_gpu, overwrite=False, ipiv_gpu=None, handle=None): """ Compute the determinant of a square matrix. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray The square n*n matrix of which to calculate the determinant. overwrite : bool (default: False) Discard data in `a` (may improve performance). handle : int CUBLAS context. If no context is specified, the default handle from `skcuda.misc._global_cublas_handle` is used. ipiv_gpu : pycuda.gpuarray.GPUArray (optional) Temporary array of size n, can be supplied to save allocations. Returns ------- det : number determinant of a_gpu """ if handle is None: handle = misc._global_cublas_handle if len(a_gpu.shape) != 2: raise ValueError('Only 2D matrices are supported') if a_gpu.shape[0] != a_gpu.shape[1]: raise ValueError('Only square matrices are supported') if (a_gpu.dtype == np.complex64): getrf = cula.culaDeviceCgetrf elif (a_gpu.dtype == np.float32): getrf = cula.culaDeviceSgetrf elif (a_gpu.dtype == np.complex128): getrf = cula.culaDeviceZgetrf elif (a_gpu.dtype == np.float64): getrf = cula.culaDeviceDgetrf else: raise ValueError('unsupported input type') n = int(a_gpu.shape[0]) # workaround for bug #131 if ipiv_gpu is None: alloc = misc._global_cublas_allocator ipiv_gpu = gpuarray.empty((n, 1), np.int32, allocator=alloc) elif ipiv_gpu.dtype != np.int32 or np.prod(ipiv_gpu.shape) < n: raise ValueError('invalid ipiv provided') out = a_gpu if overwrite else a_gpu.copy() try: getrf(n, n, out.gpudata, n, ipiv_gpu.gpudata) return _get_det_kernel(a_gpu.dtype)(ipiv_gpu, out, n).get() except cula.culaDataError as e: raise LinAlgError(e) def qr(a_gpu, mode='reduced', handle=None): """ QR Decomposition. Factor the real/complex matrix `a` as `QR`, where `Q` is an orthonormal/unitary matrix and `R` is an upper triangular matrix. Parameters ---------- a_gpu: pycuda.gpuarray.GPUArray Real/complex input matrix `a` with dimensions `(m, n)`. `a` is assumed to be `m`>=`n`. mode : {'reduced', 'economic', 'r'} 'reduced' : returns `Q`, `R` with dimensions `(m, k)` and `(k, n)` (default). 'economic' : returns `Q` only with dimensions `(m, k)`. 'r' : returns `R` only with dimensions `(k, n)` with `k`=min`(m,n)`. handle : int CUBLAS context. If no context is specified, the default handle from `skcuda.misc._global_cublas_handle` is used. Returns ------- q_gpu : pycuda.gpuarray.GPUArray Orthonormal/unitary matrix (depending on whether or not `A` is real/complex). r_gpu : pycuda.gpuarray.GPUArray The upper-triangular matrix. 
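# Usage sketch (not part of the library): trace() needs only CUBLAS, while
# det() additionally relies on CULA's LU factorization.
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.linalg as linalg
linalg.init()
a = np.asarray(np.random.rand(5, 5), np.float32)
a_gpu = gpuarray.to_gpu(a)
print(np.allclose(linalg.trace(a_gpu), np.trace(a)))
print(np.allclose(linalg.det(a_gpu), np.linalg.det(a), atol=1e-4))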
Notes ----- Double precision is only supported if the standard version of the CULA Dense toolkit is installed. This function destroys the contents of the input matrix. Arrays are assumed to be stored in column-major order, i.e., order='F'. Examples -------- >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> from skcuda import linalg >>> linalg.init() >>> # Rectangular matrix A, np.float32 >>> A = np.array(np.random.randn(9, 7), np.float32, order='F') >>> A_gpu = gpuarray.to_gpu(A) >>> Q_gpu, R_gpu = linalg.qr(A_gpu, 'reduced') >>> np.allclose(A, np.dot(Q_gpu.get(), R_gpu.get()), 1e-4) True >>> # Square matrix A, np.complex128 >>> A = np.random.randn(9, 9) + 1j*np.random.randn(9, 9) >>> A = np.asarray(A, np.complex128, order='F') >>> A_gpu = gpuarray.to_gpu(A) >>> Q_gpu, R_gpu = linalg.qr(A_gpu, 'reduced') >>> np.allclose(A, np.dot(Q_gpu.get(), R_gpu.get()), 1e-4) True >>> np.allclose(np.identity(Q_gpu.shape[0]) + 1j*0, np.dot(Q_gpu.get().conj().T, Q_gpu.get()), 1e-4) True >>> # Numpy QR and CULA QR >>> A = np.array(np.random.randn(9, 7), np.float32, order='F') >>> Q, R = np.linalg.qr(A, 'reduced') >>> a_gpu = gpuarray.to_gpu(A) >>> Q_gpu, R_gpu = linalg.qr(a_gpu, 'reduced') >>> np.allclose(Q, Q_gpu.get(), 1e-4) True >>> np.allclose(R, R_gpu.get(), 1e-4) True """ if handle is None: handle = misc._global_cublas_handle alloc = misc._global_cublas_allocator # The free version of CULA only supports single precision floating # point numbers: data_type = a_gpu.dtype.type real_type = np.float32 if data_type == np.complex64: cula_func_qr = cula.culaDeviceCgeqrf cula_func_q = cula.culaDeviceCungqr copy_func = cublas.cublasCcopy use_double = 0 use_complex = 1 isreal=False elif data_type == np.float32: cula_func_qr = cula.culaDeviceSgeqrf cula_func_q = cula.culaDeviceSorgqr copy_func = cublas.cublasScopy use_double = 0 use_complex = 0 isreal=True else: if cula._libcula_toolkit == 'standard': if data_type == np.complex128: cula_func_qr = cula.culaDeviceZgeqrf cula_func_q = cula.culaDeviceZungqr copy_func = cublas.cublasZcopy use_double = 1 use_complex = 1 isreal=False elif data_type == np.float64: cula_func_qr = cula.culaDeviceDgeqrf cula_func_q = cula.culaDeviceDorgqr copy_func = cublas.cublasDcopy use_double = 1 use_complex = 0 isreal=True else: raise ValueError('unsupported type') real_type = np.float64 else: raise ValueError('double precision not supported') # CUDA assumes that arrays are stored in column-major order m, n = np.array(a_gpu.shape, int) if m>> # Compute right eigenvectors of a symmetric matrix A and verify A*vr = vr*w >>> a = np.array(([1,3],[3,5]), np.float32, order='F') >>> a_gpu = gpuarray.to_gpu(a) >>> vr_gpu, w_gpu = linalg.eig(a_gpu, 'N', 'V') >>> np.allclose(np.dot(a, vr_gpu.get()), np.dot(vr_gpu.get(), np.diag(w_gpu.get())), 1e-4) True >>> # Compute left eigenvectors of a symmetric matrix A and verify vl.T*A = w*vl.T >>> a = np.array(([1,3],[3,5]), np.float32, order='F') >>> a_gpu = gpuarray.to_gpu(a) >>> w_gpu, vl_gpu = linalg.eig(a_gpu, 'V', 'N') >>> np.allclose(np.dot(vl_gpu.get().T, a), np.dot(np.diag(w_gpu.get()), vl_gpu.get().T), 1e-4) True >>> # Compute left/right eigenvectors of a symmetric matrix A and verify A = vr*w*vl.T >>> a = np.array(([1,3],[3,5]), np.float32, order='F') >>> a_gpu = gpuarray.to_gpu(a) >>> vr_gpu, w_gpu, vl_gpu = linalg.eig(a_gpu, 'V', 'V') >>> np.allclose(a, np.dot(vr_gpu.get(), np.dot(np.diag(w_gpu.get()), vl_gpu.get().T)), 1e-4) True >>> # Compute eigenvalues of a square matrix A and verify that 
trace(A)=sum(w) >>> a = np.array(np.random.rand(9,9), np.float32, order='F') >>> a_gpu = gpuarray.to_gpu(a) >>> w_gpu = linalg.eig(a_gpu, 'N', 'N') >>> np.allclose(np.trace(a), sum(w_gpu.get()), 1e-4) True >>> # Compute eigenvalues of a real valued matrix A possessing complex e-valuesand >>> a = np.array(np.array(([1, -2], [1, 3])), np.float32, order='F') >>> a_gpu = gpuarray.to_gpu(a) >>> w_gpu = linalg.eig(a_gpu, 'N', 'N', imag='T') True >>> # Compute eigenvalues of a complex valued matrix A and verify that trace(A)=sum(w) >>> a = np.array(np.random.rand(2,2) + 1j*np.random.rand(2,2), np.complex64, order='F') >>> a_gpu = gpuarray.to_gpu(a) >>> w_gpu = linalg.eig(a_gpu, 'N', 'N') >>> np.allclose(np.trace(a), sum(w_gpu.get()), 1e-4) True """ if not _has_cula: raise NotImplementedError('CULA not installed') alloc = misc._global_cublas_allocator # The free version of CULA only supports single precision floating # point numbers: data_type = a_gpu.dtype.type real_type = np.float32 if data_type == np.complex64: cula_func_geev = cula.culaDeviceCgeev imag='F' elif data_type == np.float32: cula_func_geev = cula.culaDeviceSgeev else: if cula._libcula_toolkit == 'standard': if data_type == np.complex128: cula_func_geev = cula.culaDeviceZgeev imag='F' elif data_type == np.float64: cula_func_geev = cula.culaDeviceDgeev else: raise ValueError('unsupported type') real_type = np.float64 else: raise ValueError('double precision not supported') # CUDA assumes that arrays are stored in column-major order m, n = np.array(a_gpu.shape, int) #Check input if(m!=n): raise ValueError('matrix is not square!') jobvl = jobvl.upper() jobvr = jobvr.upper() if jobvl not in ['N', 'V'] : raise ValueError('jobvl has to be "N" or "V" ') if jobvr not in ['N', 'V'] : raise ValueError('jobvr has to be "N" or "V" ') if imag not in ['T', 'F'] : raise ValueError('imag has to be "T" or "F" ') # Allocate vl, vr, and w: vl_gpu = gpuarray.empty((m,m), data_type, order="F", allocator=alloc) vr_gpu = gpuarray.empty((m,m), data_type, order="F", allocator=alloc) w_gpu = gpuarray.empty(m, data_type, order="F", allocator=alloc) if data_type == np.complex64 or data_type == np.complex128: #culaDeviceCgeev(jobvl, jobvr, n, a, lda, w, vl, ldvl, vr, ldvr) cula_func_geev(jobvl, jobvr, m, a_gpu.gpudata, m, w_gpu.gpudata, vl_gpu.gpudata , m , vr_gpu.gpudata, m ) elif data_type == np.float32: wi_gpu = gpuarray.zeros(m, data_type, order="F", allocator=alloc) cula_func_geev(jobvl, jobvr, m, a_gpu.gpudata, m, w_gpu.gpudata, wi_gpu.gpudata, vl_gpu.gpudata , m , vr_gpu.gpudata, m ) elif data_type == np.float64: wi_gpu = gpuarray.zeros(m, data_type, order="F", allocator=alloc) cula_func_geev(jobvl, jobvr, m, a_gpu.gpudata, m, w_gpu.gpudata, wi_gpu.gpudata, vl_gpu.gpudata , m , vr_gpu.gpudata, m ) if imag == 'T': w_gpu = w_gpu + (1j)*wi_gpu # Free internal CULA memory: cula.culaFreeBuffers() if jobvl == 'N' and jobvr == 'N': return w_gpu elif jobvl == 'V' and jobvr == 'V': return vr_gpu, w_gpu, vl_gpu elif jobvl == 'V' and jobvr == 'N': return w_gpu, vl_gpu, elif jobvl == 'N' and jobvr == 'V': return vr_gpu, w_gpu @context_dependent_memoize def _get_vander_kernel(use_double, use_complex, rows, cols): template = Template(""" #include #if ${use_double} #if ${use_complex} #define FLOAT pycuda::complex #else #define FLOAT double #endif #else #if ${use_complex} #define FLOAT pycuda::complex #else #define FLOAT float #endif #endif __global__ void vander(FLOAT *a, FLOAT *b, int m, int n) { unsigned int ix; unsigned int r = blockIdx.x*blockDim.x+threadIdx.x; if(r 
< m) { for(int i=1; i>> a = np.array(np.array([1, 2, 3]), np.float32, order='F') >>> a_gpu = gpuarray.to_gpu(a) >>> linalg.vander(a_gpu, n=4) array([[ 1., 1., 2., 12.], [ 1., 2., 6., 36.], [ 1., 3., 3., 18.]], dtype=float32) """ if handle is None: handle = misc._global_cublas_handle alloc = misc._global_cublas_allocator data_type = a_gpu.dtype.type if a_gpu.dtype == np.float32: use_double = 0 use_complex = 0 elif a_gpu.dtype == np.float64: use_double = 1 use_complex = 0 elif a_gpu.dtype == np.complex64: use_double = 0 use_complex = 1 elif a_gpu.dtype == np.complex128: use_double = 1 use_complex = 1 else: raise ValueError('unrecognized type') m = int(a_gpu.shape[0]) if n == None: n = int(m) vander_gpu = gpuarray.empty((m, n), data_type, order='F', allocator=alloc) vander_gpu[ : , 0 ] = vander_gpu[ : , 0 ] * 0 + 1 # Get block/grid sizes: dev = misc.get_current_device() block_dim, grid_dim = misc.select_block_grid_sizes(dev, vander_gpu.shape) # Allocate Vandermonde matrix: vander = _get_vander_kernel(use_double, use_complex, rows=m, cols=n) # Call kernel: vander(vander_gpu, a_gpu, np.uint32(m), np.uint32(n), block=block_dim, grid=grid_dim) # Return return vander_gpu def dmd(a_gpu, k=None, modes='exact', handle=None): """ Dynamic Mode Decomposition. Dynamic Mode Decomposition (DMD) is a data processing algorithm which allows to decompose a matrix `a` in space and time. The matrix `a` is decomposed as `a = FBV`, where the columns of `F` contain the dynamic modes. The modes are ordered corresponding to the amplitudes stored in the diagonal matrix `B`. `V` is a Vandermonde matrix describing the temporal evolution. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Real/complex input matrix `a` with dimensions `(m, n)`. k : int, optional If `k < (n-1)` low-rank Dynamic Mode Decomposition is computed. modes : `{'standard', 'exact'}` 'standard' : uses the standard definition to compute the dynamic modes, `F = U * W`. 'exact' : computes the exact dynamic modes, `F = Y * V * (S**-1) * W`. handle : int CUBLAS context. If no context is specified, the default handle from `skcuda.misc._global_cublas_handle` is used. Returns ------- f_gpu : pycuda.gpuarray.GPUArray Matrix containing the dynamic modes of shape `(m, n-1)` or `(m, k)`. b_gpu : pycuda.gpuarray.GPUArray 1-D array containing the amplitudes of length `min(n-1, k)`. v_gpu : pycuda.gpuarray.GPUArray Vandermonde matrix of shape `(n-1, n-1)` or `(k, n-1)`. Notes ----- Double precision is only supported if the standard version of the CULA Dense toolkit is installed. This function destroys the contents of the input matrix. Arrays are assumed to be stored in column-major order, i.e., order='F'. References ---------- M. R. Jovanovic, P. J. Schmid, and J. W. Nichols. "Low-rank and sparse dynamic mode decomposition." Center for Turbulence Research Annual Research Briefs (2012): 139-152. J. H. Tu, et al. "On dynamic mode decomposition: theory and applications." arXiv preprint arXiv:1312.0041 (2013). """ #************************************************************************* #*** Author: N. 
Benjamin Erichson *** #*** <2015> *** #*** License: BSD 3 clause *** #************************************************************************* if not _has_cula: raise NotImplementedError('CULA not installed') if handle is None: handle = misc._global_cublas_handle alloc = misc._global_cublas_allocator # The free version of CULA only supports single precision floating data_type = a_gpu.dtype.type real_type = np.float32 if data_type == np.complex64: cula_func_gesvd = cula.culaDeviceCgesvd cublas_func_gemm = cublas.cublasCgemm cublas_func_dgmm = cublas.cublasCdgmm cula_func_gels = cula.culaDeviceCgels copy_func = cublas.cublasCcopy alpha = np.complex64(1.0) beta = np.complex64(0.0) TRANS_type = 'C' elif data_type == np.float32: cula_func_gesvd = cula.culaDeviceSgesvd cublas_func_gemm = cublas.cublasSgemm cublas_func_dgmm = cublas.cublasSdgmm cula_func_gels = cula.culaDeviceSgels copy_func = cublas.cublasScopy alpha = np.float32(1.0) beta = np.float32(0.0) TRANS_type = 'T' else: if cula._libcula_toolkit == 'standard': if data_type == np.complex128: cula_func_gesvd = cula.culaDeviceZgesvd cublas_func_gemm = cublas.cublasZgemm cublas_func_dgmm = cublas.cublasZdgmm cula_func_gels = cula.culaDeviceZgels copy_func = cublas.cublasZcopy alpha = np.complex128(1.0) beta = np.complex128(0.0) TRANS_type = 'C' elif data_type == np.float64: cula_func_gesvd = cula.culaDeviceDgesvd cublas_func_gemm = cublas.cublasDgemm cublas_func_dgmm = cublas.cublasDdgmm cula_func_gels = cula.culaDeviceDgels copy_func = cublas.cublasDcopy alpha = np.float64(1.0) beta = np.float64(0.0) TRANS_type = 'T' else: raise ValueError('unsupported type') real_type = np.float64 else: raise ValueError('double precision not supported') #CUDA assumes that arrays are stored in column-major order m, n = np.array(a_gpu.shape, int) nx = n-1 #Set k if k == None : k = nx if k > nx or k < 1: raise ValueError('k is not valid') #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Split data into lef and right snapshot sequence #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Note: we need a copy of X_gpu, because SVD destroys X_gpu #While Y_gpu is just a pointer X_gpu = gpuarray.empty((m, n), data_type, order="F", allocator=alloc) copy_func(handle, X_gpu.size, int(a_gpu.gpudata), 1, int(X_gpu.gpudata), 1) X_gpu = X_gpu[:, :nx] Y_gpu = a_gpu[:, 1:] #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Singular Value Decomposition #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #gesvd(jobu, jobvt, m, n, int(a), lda, int(s), int(u), ldu, int(vt), ldvt) #Parameters #---------- #a : pycuda.gpuarray.GPUArray of shape (m, n) #jobu : {'A', 'S', 'O', 'N'} # If 'A', return the full `u` matrix with shape `(m, m)`. # If 'S', return the `u` matrix with shape `(m, nx)`. # If 'O', return the `u` matrix with shape `(m, nx) without # allocating a new matrix. #jobvt : {'A', 'S', 'O', 'N'} # If 'A', return the full `vh` matrix with shape `(nx, nx)`. # If 'S', return the `vh` matrix with shape `(nx, nx)`. # If 'O', return the `vh` matrix with shape `(nx, nx) without # allocating a new matrix. # #Returns #------- #u : pycuda.gpuarray.GPUArray # Unitary matrix of shape `(m, m)` or `(m, nx)` #s : pycuda.gpuarray.GPUArray # Array containing the singular values, sorted such that `s[i] >= s[i+1]`. # `s` is of length `min(m, nx)`. 
#v : pycuda.gpuarray.GPUArray # Unitary matrix of shape `(nx, nx)` or `(nx, nx)` #Allocate s, U, Vt for economic SVD #Note: singular values are always real s_gpu = gpuarray.empty(nx, real_type, order="F", allocator=alloc) U_gpu = gpuarray.empty((m,nx), data_type, order="F", allocator=alloc) Vt_gpu = gpuarray.empty((nx,nx), data_type, order="F", allocator=alloc) #Economic SVD cula_func_gesvd('S', 'S', m, nx, int(X_gpu.gpudata), m, int(s_gpu.gpudata), int(U_gpu.gpudata), m, int(Vt_gpu.gpudata), nx) #Low-rank DMD: trancate SVD if k < nx if k != nx: s_gpu = s_gpu[:k] U_gpu = U_gpu[: , :k] Vt_gpu = Vt_gpu[:k , : ] #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Solve the LS problem to find estimate for M using the pseudo-inverse #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #real: M = U.T * Y * Vt.T * S**-1 #complex: M = U.H * Y * Vt.H * S**-1 #Let G = Y * Vt.H * S**-1, hence M = M * G #Allocate G and M G_gpu = gpuarray.empty((m,k), data_type, order="F", allocator=alloc) M_gpu = gpuarray.empty((k,k), data_type, order="F", allocator=alloc) #i) s = s **-1 (inverse) if data_type == np.complex64 or data_type == np.complex128: s_gpu = 1/s_gpu s_gpu = s_gpu + 1j * gpuarray.zeros_like(s_gpu) else: s_gpu = 1/s_gpu #ii) real/complex: scale Vt = diag(s**-1) * Vt cublas_func_dgmm(handle, 'l', k, k, int(Vt_gpu.gpudata), k, int(s_gpu.gpudata), 1, int(Vt_gpu.gpudata), k) #iii) real: G = Y * (S**-1 * Vt).T, complex: G = Y * (S**-1 * Vt).H cublas_func_gemm(handle, 'n', TRANS_type, m, k, k, alpha, int(Y_gpu.gpudata), m, int(Vt_gpu.gpudata), k, beta, int(G_gpu.gpudata), m ) #iv) real/complex: M = M * G cublas_func_gemm(handle, TRANS_type, 'n', k, k, m, alpha, int(U_gpu.gpudata), m, int(G_gpu.gpudata), m, beta, int(M_gpu.gpudata), k ) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Eigen Decomposition #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Note: If a_gpu is real the imag part is omitted Vr_gpu, w_gpu = eig(M_gpu, 'N', 'V', 'F') #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Compute DMD Modes #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ F_gpu = gpuarray.empty((m,k), data_type, order="F", allocator=alloc) modes = modes.lower() if modes == 'exact': #Compute (exact) DMD modes: F = Y * V * S**-1 * W = G * W cublas_func_gemm(handle, 'n', 'n', m, k, k, alpha, G_gpu.gpudata, m, Vr_gpu.gpudata, k, beta, G_gpu.gpudata, m ) F_gpu_temp = G_gpu elif modes == 'standard': #Compute (standard) DMD modes: F = U * W cublas_func_gemm(handle, 'n', 'n', m, k, k, alpha, U_gpu.gpudata, m, Vr_gpu.gpudata, k, beta, U_gpu.gpudata, m ) F_gpu_temp = U_gpu else: raise ValueError('Type of modes is not supported, choose "exact" or "standard".') #Copy is required, because gels destroys input copy_func(handle, F_gpu_temp.size, int(F_gpu_temp.gpudata), 1, int(F_gpu.gpudata), 1) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Compute amplitueds b using least-squares: Fb=x1 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #x1_gpu = a_gpu[:,0].copy() x1_gpu = gpuarray.empty(m, data_type, order="F", allocator=alloc) copy_func(handle, x1_gpu.size, int(a_gpu[:,0].gpudata), 1, int(x1_gpu.gpudata), 1) cula_func_gels( 'N', m, k, int(1) , F_gpu_temp.gpudata, m, x1_gpu.gpudata, m) b_gpu = x1_gpu #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Compute Vandermonde matrix (CPU) 
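#Usage sketch (not part of the library, CULA required): low-rank DMD of a
#random snapshot matrix stored column-major. dmd() destroys its input, and
#F * diag(b) * V gives a rank-k approximation of the first n-1 snapshots;
#the shapes below follow the Returns section above.
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.linalg as linalg
linalg.init()
a = np.asarray(np.random.rand(32, 10), np.float32, order='F')
f_gpu, b_gpu, v_gpu = linalg.dmd(gpuarray.to_gpu(a), k=5, modes='exact')
rec = np.dot(f_gpu.get(), np.dot(np.diag(b_gpu.get()), v_gpu.get()))
print(rec.shape)     # (32, 9), to be compared against a[:, :9]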
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ V_gpu = vander(w_gpu, n=nx) # Free internal CULA memory: cula.culaFreeBuffers() #Return return F_gpu, b_gpu[:k], V_gpu if __name__ == "__main__": import doctest doctest.testmod() scikit-cuda-0.5.1/skcuda/magma.py000066400000000000000000002025601261465507300166410ustar00rootroot00000000000000#!/usr/bin/env python """ Python interface to MAGMA toolkit. """ from __future__ import absolute_import, division, print_function import sys import ctypes import atexit import numpy as np from . import cuda # Load MAGMA library: if 'linux' in sys.platform: _libmagma_libname_list = ['libmagma.so'] elif sys.platform == 'darwin': _libmagma_libname_list = ['magma.so', 'libmagma.dylib'] elif sys.platform == 'win32': _libmagma_libname_list = ['magma.dll'] else: raise RuntimeError('unsupported platform') _load_err = '' for _lib in _libmagma_libname_list: try: _libmagma = ctypes.cdll.LoadLibrary(_lib) except OSError: _load_err += ('' if _load_err == '' else ', ') + _lib else: _load_err = '' break if _load_err: raise OSError('%s not found' % _load_err) # Exceptions corresponding to various MAGMA errors: _libmagma.magma_strerror.restype = ctypes.c_char_p _libmagma.magma_strerror.argtypes = [ctypes.c_int] # MAGMA below 1.4.0 uses "L" and "U" to select upper/lower triangular # matrices, MAGMA 1.5+ uses numeric constants. This dict will be filled # in magma_init() and will convert between the two modes accordingly _uplo_conversion = {} def magma_strerror(error): """ Return string corresponding to specified MAGMA error code. """ return _libmagma.magma_strerror(error) class MagmaError(Exception): def __init__(self, status, info=None): self._status = status self._info = info errstr = "%s (Code: %d)" % (magma_strerror(status), status) super(MagmaError,self).__init__(errstr) def magmaCheckStatus(status): """ Raise an exception corresponding to the specified MAGMA status code. """ if status != 0: raise MagmaError(status) # Utility functions: _libmagma.magma_version.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p] def magma_version(): """ Get MAGMA version. """ majv = ctypes.c_int() minv = ctypes.c_int() micv = ctypes.c_int() _libmagma.magma_version(ctypes.byref(majv), ctypes.byref(minv), ctypes.byref(micv)) return (majv.value, minv.value, micv.value) _libmagma.magma_uplo_const.restype = ctypes.c_int _libmagma.magma_uplo_const.argtypes = [ctypes.c_char] _libmagma.magma_init.restype = int def magma_init(): """ Initialize MAGMA. """ global _uplo_conversion status = _libmagma.magma_init() magmaCheckStatus(status) v = magma_version() if v >= (1, 5, 0): _uplo_conversion.update({"L": _libmagma.magma_uplo_const(b"L"), "l": _libmagma.magma_uplo_const(b"l"), "U": _libmagma.magma_uplo_const(b"U"), "u": _libmagma.magma_uplo_const(b"u")}) else: _uplo_conversion.update({"L": "L", "l": "l", "U": "u", "u": "u"}) _libmagma.magma_finalize.restype = int def magma_finalize(): """ Finalize MAGMA. """ status = _libmagma.magma_finalize() magmaCheckStatus(status) _libmagma.magma_getdevice_arch.restype = int def magma_getdevice_arch(): """ Get device architecture. """ return _libmagma.magma_getdevice_arch() _libmagma.magma_getdevice.argtypes = [ctypes.c_void_p] def magma_getdevice(): """ Get current device used by MAGMA. """ dev = ctypes.c_int() _libmagma.magma_getdevice(ctypes.byref(dev)) return dev.value _libmagma.magma_setdevice.argtypes = [ctypes.c_int] def magma_setdevice(dev): """ Get current device used by MAGMA. 
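# Usage sketch (not part of the wrappers): typical start-up sequence for the
# MAGMA bindings; the printed values depend on the installed MAGMA build and
# the selected GPU.
import skcuda.magma as magma
magma.magma_init()
print(magma.magma_version())         # e.g. (1, 6, 1)
print(magma.magma_getdevice())       # ordinal of the device MAGMA will use
print(magma.magma_getdevice_arch())  # compute capability, e.g. 350
magma.magma_finalize()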
""" _libmagma.magma_setdevice(dev) def magma_device_sync(): """ Synchronize device used by MAGMA. """ _libmagma.magma_device_sync() # BLAS routines # ISAMAX, IDAMAX, ICAMAX, IZAMAX _libmagma.magma_isamax.restype = int _libmagma.magma_isamax.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_isamax(n, dx, incx): """ Index of maximum magnitude element. """ return _libmagma.magma_isamax(n, int(dx), incx) _libmagma.magma_idamax.restype = int _libmagma.magma_idamax.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_idamax(n, dx, incx): """ Index of maximum magnitude element. """ return _libmagma.magma_idamax(n, int(dx), incx) _libmagma.magma_icamax.restype = int _libmagma.magma_icamax.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_icamax(n, dx, incx): """ Index of maximum magnitude element. """ return _libmagma.magma_icamax(n, int(dx), incx) _libmagma.magma_izamax.restype = int _libmagma.magma_izamax.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_izamax(n, dx, incx): """ Index of maximum magnitude element. """ return _libmagma.magma_izamax(n, int(dx), incx) # ISAMIN, IDAMIN, ICAMIN, IZAMIN _libmagma.magma_isamin.restype = int _libmagma.magma_isamin.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_isamin(n, dx, incx): """ Index of minimum magnitude element. """ return _libmagma.magma_isamin(n, int(dx), incx) _libmagma.magma_idamin.restype = int _libmagma.magma_idamin.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_idamin(n, dx, incx): """ Index of minimum magnitude element. """ return _libmagma.magma_idamin(n, int(dx), incx) _libmagma.magma_icamin.restype = int _libmagma.magma_icamin.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_icamin(n, dx, incx): """ Index of minimum magnitude element. """ return _libmagma.magma_icamin(n, int(dx), incx) _libmagma.magma_izamin.restype = int _libmagma.magma_izamin.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_izamin(n, dx, incx): """ Index of minimum magnitude element. """ return _libmagma.magma_izamin(n, int(dx), incx) # SASUM, DASUM, SCASUM, DZASUM _libmagma.magma_sasum.restype = int _libmagma.magma_sasum.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_sasum(n, dx, incx): """ Sum of absolute values of vector. """ return _libmagma.magma_sasum(n, int(dx), incx) _libmagma.magma_dasum.restype = int _libmagma.magma_dasum.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_dasum(n, dx, incx): """ Sum of absolute values of vector. """ return _libmagma.magma_dasum(n, int(dx), incx) _libmagma.magma_scasum.restype = int _libmagma.magma_scasum.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_scasum(n, dx, incx): """ Sum of absolute values of vector. """ return _libmagma.magma_scasum(n, int(dx), incx) _libmagma.magma_dzasum.restype = int _libmagma.magma_dzasum.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_dzasum(n, dx, incx): """ Sum of absolute values of vector. """ return _libmagma.magma_dzasum(n, int(dx), incx) # SAXPY, DAXPY, CAXPY, ZAXPY _libmagma.magma_saxpy.restype = int _libmagma.magma_saxpy.argtypes = [ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_saxpy(n, alpha, dx, incx, dy, incy): """ Vector addition. 
""" _libmagma.magma_saxpy(n, alpha, int(dx), incx, int(dy), incy) _libmagma.magma_daxpy.restype = int _libmagma.magma_daxpy.argtypes = [ctypes.c_int, ctypes.c_double, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_daxpy(n, alpha, dx, incx, dy, incy): """ Vector addition. """ _libmagma.magma_daxpy(n, alpha, int(dx), incx, int(dy), incy) _libmagma.magma_caxpy.restype = int _libmagma.magma_caxpy.argtypes = [ctypes.c_int, cuda.cuFloatComplex, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_caxpy(n, alpha, dx, incx, dy, incy): """ Vector addition. """ _libmagma.magma_caxpy(n, ctypes.byref(cuda.cuFloatComplex(alpha.real, alpha.imag)), int(dx), incx, int(dy), incy) _libmagma.magma_zaxpy.restype = int _libmagma.magma_zaxpy.argtypes = [ctypes.c_int, cuda.cuDoubleComplex, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_zaxpy(n, alpha, dx, incx, dy, incy): """ Vector addition. """ _libmagma.magma_zaxpy(n, ctypes.byref(cuda.cuDoubleComplex(alpha.real, alpha.imag)), int(dx), incx, int(dy), incy) # SCOPY, DCOPY, CCOPY, ZCOPY _libmagma.magma_scopy.restype = int _libmagma.magma_scopy.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_scopy(n, dx, incx, dy, incy): """ Vector copy. """ _libmagma.magma_scopy(n, int(dx), incx, int(dy), incy) _libmagma.magma_dcopy.restype = int _libmagma.magma_dcopy.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_dcopy(n, dx, incx, dy, incy): """ Vector copy. """ _libmagma.magma_dcopy(n, int(dx), incx, int(dy), incy) _libmagma.magma_ccopy.restype = int _libmagma.magma_ccopy.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_ccopy(n, dx, incx, dy, incy): """ Vector copy. """ _libmagma.magma_ccopy(n, int(dx), incx, int(dy), incy) _libmagma.magma_zcopy.restype = int _libmagma.magma_zcopy.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_zcopy(n, dx, incx, dy, incy): """ Vector copy. """ _libmagma.magma_zcopy(n, int(dx), incx, int(dy), incy) # SDOT, DDOT, CDOTU, CDOTC, ZDOTU, ZDOTC _libmagma.magma_sdot.restype = ctypes.c_float _libmagma.magma_sdot.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_sdot(n, dx, incx, dy, incy): """ Vector dot product. """ return _libmagma.magma_sdot(n, int(dx), incx, int(dy), incy) _libmagma.magma_ddot.restype = ctypes.c_double _libmagma.magma_ddot.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_ddot(n, dx, incx, dy, incy): """ Vector dot product. """ return _libmagma.magma_ddot(n, int(dx), incx, int(dy), incy) _libmagma.magma_cdotc.restype = cuda.cuFloatComplex _libmagma.magma_cdotc.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_cdotc(n, dx, incx, dy, incy): """ Vector dot product. """ return _libmagma.magma_cdotc(n, int(dx), incx, int(dy), incy) _libmagma.magma_cdotu.restype = cuda.cuFloatComplex _libmagma.magma_cdotu.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_cdotu(n, dx, incx, dy, incy): """ Vector dot product. 
""" return _libmagma.magma_cdotu(n, int(dx), incx, int(dy), incy) _libmagma.magma_zdotc.restype = cuda.cuDoubleComplex _libmagma.magma_zdotc.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_zdotc(n, dx, incx, dy, incy): """ Vector dot product. """ return _libmagma.magma_zdotc(n, int(dx), incx, int(dy), incy) _libmagma.magma_zdotu.restype = cuda.cuDoubleComplex _libmagma.magma_zdotu.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_zdotu(n, dx, incx, dy, incy): """ Vector dot product. """ return _libmagma.magma_zdotu(n, int(dx), incx, int(dy), incy) # SNRM2, DNRM2, SCNRM2, DZNRM2 _libmagma.magma_snrm2.restype = ctypes.c_float _libmagma.magma_snrm2.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_snrm2(n, dx, incx): """ Euclidean norm (2-norm) of vector. """ return _libmagma.magma_snrm2(n, int(dx), incx) _libmagma.magma_dnrm2.restype = ctypes.c_double _libmagma.magma_dnrm2.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_dnrm2(n, dx, incx): """ Euclidean norm (2-norm) of vector. """ return _libmagma.magma_dnrm2(n, int(dx), incx) _libmagma.magma_scnrm2.restype = ctypes.c_float _libmagma.magma_scnrm2.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_scnrm2(n, dx, incx): """ Euclidean norm (2-norm) of vector. """ return _libmagma.magma_scnrm2(n, int(dx), incx) _libmagma.magma_dznrm2.restype = ctypes.c_double _libmagma.magma_dznrm2.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_dznrm2(n, dx, incx): """ Euclidean norm (2-norm) of vector. """ return _libmagma.magma_dznrm2(n, int(dx), incx) # SROT, DROT, CROT, CSROT, ZROT, ZDROT _libmagma.magma_srot.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_float, ctypes.c_float] def magma_srot(n, dx, incx, dy, incy, dc, ds): """ Apply a rotation to vectors. """ _libmagma.magma_srot(n, int(dx), incx, int(dy), incy, dc, ds) # SROTM, DROTM _libmagma.magma_srotm.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def magma_srotm(n, dx, incx, dy, incy, param): """ Apply a real modified Givens rotation. """ _libmagma.magma_srotm(n, int(dx), incx, int(dy), incy, param) # SROTMG, DROTMG _libmagma.magma_srotmg.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p] def magma_srotmg(d1, d2, x1, y1, param): """ Construct a real modified Givens rotation matrix. """ _libmagma.magma_srotmg(int(d1), int(d2), int(x1), int(y1), param) # SSCAL, DSCAL, CSCAL, CSCAL, CSSCAL, ZSCAL, ZDSCAL _libmagma.magma_sscal.argtypes = [ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int] def magma_sscal(n, alpha, dx, incx): """ Scale a vector by a scalar. """ _libmagma.magma_sscal(n, alpha, int(dx), incx) # SSWAP, DSWAP, CSWAP, ZSWAP _libmagma.magma_sswap.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_sswap(n, dA, ldda, dB, lddb): """ Swap vectors. """ _libmagma.magma_sswap(n, int(dA), ldda, int(dB), lddb) # SGEMV, DGEMV, CGEMV, ZGEMV _libmagma.magma_sgemv.argtypes = [ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int] def magma_sgemv(trans, m, n, alpha, dA, ldda, dx, incx, beta, dy, incy): """ Matrix-vector product for general matrix. 
""" _libmagma.magma_sgemv(trans, m, n, alpha, int(dA), ldda, dx, incx, beta, int(dy), incy) # SGER, DGER, CGERU, CGERC, ZGERU, ZGERC _libmagma.magma_sger.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_sger(m, n, alpha, dx, incx, dy, incy, dA, ldda): """ Rank-1 operation on real general matrix. """ _libmagma.magma_sger(m, n, alpha, int(dx), incx, int(dy), incy, int(dA), ldda) # SSYMV, DSYMV, CSYMV, ZSYMV _libmagma.magma_ssymv.argtypes = [ctypes.c_char, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int] def magma_ssymv(uplo, n, alpha, dA, ldda, dx, incx, beta, dy, incy): _libmagma.magma_ssymv(uplo, n, alpha, int(dA), ldda, int(dx), incx, beta, int(dy), incy) # SSYR, DSYR, CSYR, ZSYR _libmagma.magma_ssyr.argtypes = [ctypes.c_char, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_ssyr(uplo, n, alpha, dx, incx, dA, ldda): _libmagma.magma_ssyr(uplo, n, alpha, int(dx), incx, int(dA), ldda) # SSYR2, DSYR2, CSYR2, ZSYR2 _libmagma.magma_ssyr2.argtypes = [ctypes.c_char, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_ssyr2(uplo, n, alpha, dx, incx, dy, incy, dA, ldda): _libmagma.magma_ssyr2(uplo, n, alpha, int(dx), incx, int(dy), incy, int(dA), ldda) # STRMV, DTRMV, CTRMV, ZTRMV _libmagma.magma_strmv.argtypes = [ctypes.c_char, ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_strmv(uplo, trans, diag, n, dA, ldda, dx, incx): _libmagma.magma_strmv(uplo, trans, diag, n, int(dA), ldda, int(dx), incx) # STRSV, DTRSV, CTRSV, ZTRSV _libmagma.magma_strsv.argtypes = [ctypes.c_char, ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_strsv(uplo, trans, diag, n, dA, ldda, dx, incx): _libmagma.magma_strsv(uplo, trans, diag, n, int(dA), ldda, int(dx), incx) # SGEMM, DGEMM, CGEMM, ZGEMM _libmagma.magma_sgemm.argtypes = [ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int] def magma_sgemm(transA, transB, m, n, k, alpha, dA, ldda, dB, lddb, beta, dC, lddc): _libmagma.magma_sgemm(transA, transB, m, n, k, alpha, int(dA), ldda, int(dB), lddb, beta, int(dC), lddc) _libmagma.magma_zgemm.argtypes = [ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int] def magma_zgemm(transA, transB, m, n, k, alpha, dA, ldda, dB, lddb, beta, dC, lddc): _libmagma.magma_zgemm(transA, transB, m, n, k, alpha, int(dA), ldda, int(dB), lddb, beta, int(dC), lddc) # SSYMM, DSYMM, CSYMM, ZSYMM _libmagma.magma_ssymm.argtypes = [ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int] def magma_ssymm(side, uplo, m, n, alpha, dA, ldda, dB, lddb, beta, dC, lddc): _libmagma.magma_ssymm(side, uplo, m, n, alpha, int(dA), ldda, int(dB), lddb, beta, int(dC), lddc) # SSYRK, DSYRK, CSYRK, ZSYRK _libmagma.magma_ssyrk.argtypes = [ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, 
ctypes.c_float, ctypes.c_void_p, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int] def magma_ssyrk(uplo, trans, n, k, alpha, dA, ldda, beta, dC, lddc): _libmagma.magma_ssyrk(uplo, trans, n, k, alpha, int(dA), ldda, beta, int(dC), lddc) # SSYR2K, DSYR2K, CSYR2K, ZSYR2K _libmagma.magma_ssyr2k.argtypes = [ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int] def magma_ssyr2k(uplo, trans, n, k, alpha, dA, ldda, dB, lddb, beta, dC, lddc): _libmagma.magma_ssyr2k(uplo, trans, n, k, alpha, int(dA), ldda, int(dB), lddb, beta, int(dC), lddc) # STRMM, DTRMM, CTRMM, ZTRMM _libmagma.magma_strmm.argtypes = [ctypes.c_char, ctypes.c_char, ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_strmm(side, uplo, trans, diag, m, n, alpha, dA, ldda, dB, lddb): _libmagma.magma_strmm(side, uplo, trans, diag, m, n, alpha, int(dA), ldda, int(dB), lddb) # STRSM, DTRSM, CTRSM, ZTRSM _libmagma.magma_strsm.argtypes = [ctypes.c_char, ctypes.c_char, ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magma_strsm(side, uplo, trans, diag, m, n, alpha, dA, ldda, dB, lddb): _libmagma.magma_strsm(side, uplo, trans, diag, m, n, alpha, int(dA), ldda, int(dB), lddb) # Auxiliary routines: _libmagma.magma_vec_const.restype = int _libmagma.magma_vec_const.argtypes = [ctypes.c_char] def magma_vec_const(job): return _libmagma.magma_vec_const(job) _libmagma.magma_get_spotrf_nb.restype = int _libmagma.magma_get_spotrf_nb.argtypes = [ctypes.c_int] def magma_get_spotrf_nb(m): return _libmagma.magma_get_spotrf_nb(m) _libmagma.magma_get_sgetrf_nb.restype = int _libmagma.magma_get_sgetrf_nb.argtypes = [ctypes.c_int] def magma_get_sgetrf_nb(m): return _libmagma.magma_get_sgetrf_nb(m) _libmagma.magma_get_sgetri_nb.restype = int _libmagma.magma_get_sgetri_nb.argtypes = [ctypes.c_int] def magma_get_sgetri_nb(m): return _libmagma.magma_get_sgetri_nb(m) _libmagma.magma_get_sgeqp3_nb.restype = int _libmagma.magma_get_sgeqp3_nb.argtypes = [ctypes.c_int] def magma_get_sgeqp3_nb(m): return _libmagma.magma_get_sgeqp3_nb(m) _libmagma.magma_get_sgeqrf_nb.restype = int _libmagma.magma_get_sgeqrf_nb.argtypes = [ctypes.c_int] def magma_get_sgeqrf_nb(m): return _libmagma.magma_get_sgeqrf_nb(m) _libmagma.magma_get_sgeqlf_nb.restype = int _libmagma.magma_get_sgeqlf_nb.argtypes = [ctypes.c_int] def magma_get_sgeqlf_nb(m): return _libmagma.magma_get_sgeqlf_nb(m) _libmagma.magma_get_sgehrd_nb.restype = int _libmagma.magma_get_sgehrd_nb.argtypes = [ctypes.c_int] def magma_get_sgehrd_nb(m): return _libmagma.magma_get_sgehrd_nb(m) _libmagma.magma_get_ssytrd_nb.restype = int _libmagma.magma_get_ssytrd_nb.argtypes = [ctypes.c_int] def magma_get_ssytrd_nb(m): return _libmagma.magma_get_ssytrd_nb(m) _libmagma.magma_get_sgelqf_nb.restype = int _libmagma.magma_get_sgelqf_nb.argtypes = [ctypes.c_int] def magma_get_sgelqf_nb(m): return _libmagma.magma_get_sgelqf_nb(m) _libmagma.magma_get_sgebrd_nb.restype = int _libmagma.magma_get_sgebrd_nb.argtypes = [ctypes.c_int] def magma_get_sgebrd_nb(m): return _libmagma.magma_get_sgebrd_nb(m) _libmagma.magma_get_ssygst_nb.restype = int _libmagma.magma_get_ssygst_nb.argtypes = [ctypes.c_int] def magma_get_ssygst_nb(m): return _libmagma.magma_get_ssygst_nb(m) _libmagma.magma_get_sgesvd_nb.restype = int 
_libmagma.magma_get_sgesvd_nb.argtypes = [ctypes.c_int] def magma_get_sgesvd_nb(m): return _libmagma.magma_get_sgesvd_nb(m) _libmagma.magma_get_dgesvd_nb.restype = int _libmagma.magma_get_dgesvd_nb.argtypes = [ctypes.c_int] def magma_get_dgesvd_nb(m): return _libmagma.magma_get_dgesvd_nb(m) _libmagma.magma_get_cgesvd_nb.restype = int _libmagma.magma_get_cgesvd_nb.argtypes = [ctypes.c_int] def magma_get_cgesvd_nb(m): return _libmagma.magma_get_cgesvd_nb(m) _libmagma.magma_get_zgesvd_nb.restype = int _libmagma.magma_get_zgesvd_nb.argtypes = [ctypes.c_int] def magma_get_zgesvd_nb(m): return _libmagma.magma_get_zgesvd_nb(m) _libmagma.magma_get_ssygst_nb_m.restype = int _libmagma.magma_get_ssygst_nb_m.argtypes = [ctypes.c_int] def magma_get_ssygst_nb_m(m): return _libmagma.magma_get_ssygst_nb_m(m) _libmagma.magma_get_sbulge_nb.restype = int _libmagma.magma_get_sbulge_nb.argtypes = [ctypes.c_int] def magma_get_sbulge_nb(m): return _libmagma.magma_get_sbulge_nb(m) _libmagma.magma_get_sbulge_nb_mgpu.restype = int _libmagma.magma_get_sbulge_nb_mgpu.argtypes = [ctypes.c_int] def magma_get_sbulge_nb_mgpu(m): return _libmagma.magma_get_sbulge_nb_mgpu(m) _libmagma.magma_get_dsytrd_nb.restype = int _libmagma.magma_get_dsytrd_nb.argtypes = [ctypes.c_int] def magma_get_dsytrd_nb(m): return _libmagma.magma_get_dsytrd_nb(m) # LAPACK routines # SGEBRD, DGEBRD, CGEBRD, ZGEBRD _libmagma.magma_sgebrd.restype = int _libmagma.magma_sgebrd.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def magma_sgebrd(m, n, A, lda, d, e, tauq, taup, work, lwork, info): """ Reduce matrix to bidiagonal form. """ status = _libmagma.magma_sgebrd(m, n, int(A), lda, int(d), int(e), int(tauq), int(taup), int(work), int(lwork), int(info)) magmaCheckStatus(status) # SGEHRD2, DGEHRD2, CGEHRD2, ZGEHRD2 _libmagma.magma_sgehrd2.restype = int _libmagma.magma_sgehrd2.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def magma_sgehrd2(n, ilo, ihi, A, lda, tau, work, lwork, info): """ Reduce matrix to upper Hessenberg form. """ status = _libmagma.magma_sgehrd2(n, ilo, ihi, int(A), lda, int(tau), int(work), lwork, int(info)) magmaCheckStatus(status) # SGEHRD, DGEHRD, CGEHRD, ZGEHRD _libmagma.magma_sgehrd.restype = int _libmagma.magma_sgehrd.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def magma_sgehrd(n, ilo, ihi, A, lda, tau, work, lwork, dT, info): """ Reduce matrix to upper Hessenberg form (fast algorithm). """ status = _libmagma.magma_sgehrd(n, ilo, ihi, int(A), lda, int(tau), int(work), lwork, int(dT), int(info)) magmaCheckStatus(status) # SGELQF, DGELQF, CGELQF, ZGELQF _libmagma.magma_sgelqf.restype = int _libmagma.magma_sgelqf.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def magma_sgelqf(m, n, A, lda, tau, work, lwork, info): """ LQ factorization. 
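# Usage sketch (not part of the wrappers): matrix multiplication with the
# level-3 wrapper magma_sgemm() defined earlier. MAGMA assumes column-major
# storage, so the gpuarrays are created with order='F' and the leading
# dimensions equal the row counts.
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.magma as magma
magma.magma_init()
m, k, n = 4, 3, 5
a = np.asarray(np.random.rand(m, k), np.float32, order='F')
b = np.asarray(np.random.rand(k, n), np.float32, order='F')
a_gpu, b_gpu = gpuarray.to_gpu(a), gpuarray.to_gpu(b)
c_gpu = gpuarray.zeros((m, n), np.float32, order='F')
# 'n' = no transpose; on Python 3 these single characters would need to be bytes.
magma.magma_sgemm('n', 'n', m, n, k, 1.0, a_gpu.gpudata, m,
                  b_gpu.gpudata, k, 0.0, c_gpu.gpudata, m)
print(np.allclose(c_gpu.get(), np.dot(a, b), atol=1e-5))
magma.magma_finalize()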
""" status = _libmagma.magma_sgelqf(m, n, int(A), lda, int(tau), int(work), lwork, int(info)) magmaCheckStatus(status) # SGEQRF, DGEQRF, CGEQRF, ZGEQRF _libmagma.magma_sgeqrf.restype = int _libmagma.magma_sgeqrf.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def magma_sgeqrf(m, n, A, lda, tau, work, lwork, info): """ QR factorization. """ status = _libmagma.magma_sgeqrf(m, n, int(A), lda, int(tau), int(work), lwork, int(info)) magmaCheckStatus(status) # SGEQRF4, DGEQRF4, CGEQRF4, ZGEQRF4 _libmagma.magma_sgeqrf4.restype = int _libmagma.magma_sgeqrf4.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def magma_sgeqrf4(num_gpus, m, n, a, lda, tau, work, lwork, info): """ """ status = _libmagma.magma_sgeqrf4(num_gpus, m, n, int(a), lda, int(tau), int(work), lwork, int(info)) magmaCheckStatus(status) # SGEQRF, DGEQRF, CGEQRF, ZGEQRF (ooc) _libmagma.magma_sgeqrf_ooc.restype = int _libmagma.magma_sgeqrf_ooc.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def magma_sgeqrf_ooc(m, n, A, lda, tau, work, lwork, info): """ QR factorization (ooc). """ status = _libmagma.magma_sgeqrf_ooc(m, n, int(A), lda, int(tau), int(work), lwork, int(info)) magmaCheckStatus(status) # SGESV, DGESV, CGESV, ZGESV _libmagma.magma_sgesv.restype = int _libmagma.magma_sgesv.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def magma_sgesv(n, nhrs, A, lda, ipiv, B, ldb, info): """ Solve system of linear equations. """ status = _libmagma.magma_sgesv(n, nhrs, int(A), lda, int(ipiv), int(B), ldb, int(info)) magmaCheckStatus(status) # SGETRF, DGETRF, CGETRF, ZGETRF _libmagma.magma_sgetrf.restype = int _libmagma.magma_sgetrf.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p] def magma_sgetrf(m, n, A, lda, ipiv, info): """ LU factorization. """ status = _libmagma.magma_sgetrf(m, n, int(A), lda, int(ipiv), int(info)) magmaCheckStatus(status) ## SGETRF2, DGETRF2, CGETRF2, ZGETRF2 #_libmagma.magma_sgetrf2.restype = int #_libmagma.magma_sgetrf2.argtypes = [ctypes.c_int, # ctypes.c_int, # ctypes.c_void_p, # ctypes.c_int, # ctypes.c_void_p, # ctypes.c_void_p] #def magma_sgetrf2(m, n, A, lda, ipiv, info): # # """ # LU factorization (multi-GPU). # """ # # status = _libmagma.magma_sgetrf2(m, n, int(A), lda, # int(ipiv), int(info)) # magmaCheckStatus(status) # SGEEV, DGEEV, CGEEV, ZGEEV _libmagma.magma_sgeev.restype = int _libmagma.magma_sgeev.argtypes = [ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p] def magma_sgeev(jobvl, jobvr, n, a, lda, w, vl, ldvl, vr, ldvr, work, lwork, rwork, info): """ Compute eigenvalues and eigenvectors. 
""" status = _libmagma.magma_sgeev(jobvl, jobvr, n, int(a), lda, int(w), int(vl), ldvl, int(vr), ldvr, int(work), lwork, int(rwork), int(info)) magmaCheckStatus(status) # SGESVD, DGESVD, CGESVD, ZGESVD _libmagma.magma_sgesvd.restype = int _libmagma.magma_sgesvd.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def magma_sgesvd(jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, work, lwork, info): """ SVD decomposition. """ status = _libmagma.magma_sgesvd(jobu, jobvt, m, n, int(a), lda, int(s), int(u), ldu, int(vt), ldvt, int(work), lwork, int(info)) magmaCheckStatus(status) _libmagma.magma_dgesvd.restype = int _libmagma.magma_dgesvd.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def magma_dgesvd(jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, work, lwork, info): """ SVD decomposition. """ status = _libmagma.magma_dgesvd(jobu, jobvt, m, n, int(a), lda, int(s), int(u), ldu, int(vt), ldvt, int(work), lwork, int(info)) magmaCheckStatus(status) _libmagma.magma_cgesvd.restype = int _libmagma.magma_cgesvd.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def magma_cgesvd(jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, work, lwork, rwork, info): """ SVD decomposition. """ status = _libmagma.magma_cgesvd(jobu, jobvt, m, n, int(a), lda, int(s), int(u), ldu, int(vt), ldvt, int(work), lwork, int(rwork), int(info)) magmaCheckStatus(status) _libmagma.magma_zgesvd.restype = int _libmagma.magma_zgesvd.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p] def magma_zgesvd(jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, work, lwork, rwork, info): """ SVD decomposition. """ status = _libmagma.magma_zgesvd(jobu, jobvt, m, n, int(a), lda, int(s), int(u), ldu, int(vt), ldvt, int(work), lwork, int(rwork), int(info)) magmaCheckStatus(status) # SGESDD, DGESDD, CGESDD, ZGESDD _libmagma.magma_sgesdd.restype = int _libmagma.magma_sgesdd.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p] def magma_sgesdd(jobz, m, n, a, lda, s, u, ldu, vt, ldvt, work, lwork, iwork, info): """ SDD decomposition. """ status = _libmagma.magma_sgesdd(jobz, m, n, int(a), lda, int(s), int(u), ldu, int(vt), ldvt, int(work), lwork, int(iwork), int(info)) magmaCheckStatus(status) _libmagma.magma_dgesdd.restype = int _libmagma.magma_dgesdd.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p] def magma_dgesdd(jobz, m, n, a, lda, s, u, ldu, vt, ldvt, work, lwork, iwork, info): """ SDD decomposition. 
""" status = _libmagma.magma_dgesdd(jobz, m, n, int(a), lda, int(s), int(u), ldu, int(vt), ldvt, int(work), lwork, int(iwork), int(info)) magmaCheckStatus(status) _libmagma.magma_cgesdd.restype = int _libmagma.magma_cgesdd.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p] def magma_cgesdd(jobz, m, n, a, lda, s, u, ldu, vt, ldvt, work, lwork, rwork, iwork, info): """ SDD decomposition. """ status = _libmagma.magma_cgesdd(jobz, m, n, int(a), lda, int(s), int(u), ldu, int(vt), ldvt, int(work), lwork, int(rwork), int(iwork), int(info)) magmaCheckStatus(status) _libmagma.magma_zgesdd.restype = int _libmagma.magma_zgesdd.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p] def magma_zgesdd(jobz, m, n, a, lda, s, u, ldu, vt, ldvt, work, lwork, rwork, iwork, info): """ SDD decomposition. """ status = _libmagma.magma_zgesdd(jobz, m, n, int(a), lda, int(s), int(u), ldu, int(vt), ldvt, int(work), lwork, int(rwork), int(iwork), int(info)) magmaCheckStatus(status) # SPOSV, DPOSV, CPOSV, ZPOSV _libmagma.magma_sposv_gpu.restype = int _libmagma.magma_sposv_gpu.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def magma_sposv_gpu(uplo, n, nhrs, a_gpu, lda, b_gpu, ldb): """ Solve linear system with positive semidefinite coefficient matrix. """ uplo = _uplo_conversion[uplo] info = ctypes.c_int() status = _libmagma.magma_sposv_gpu(uplo, n, nhrs, int(a_gpu), lda, int(b_gpu), ldb, ctypes.byref(info)) magmaCheckStatus(status) _libmagma.magma_dposv_gpu.restype = int _libmagma.magma_dposv_gpu.argtypes = _libmagma.magma_sposv_gpu.argtypes def magma_dposv_gpu(uplo, n, nhrs, a_gpu, lda, b_gpu, ldb): """ Solve linear system with positive semidefinite coefficient matrix. """ uplo = _uplo_conversion[uplo] info = ctypes.c_int() status = _libmagma.magma_dposv_gpu(uplo, n, nhrs, int(a_gpu), lda, int(b_gpu), ldb, ctypes.byref(info)) magmaCheckStatus(status) _libmagma.magma_cposv_gpu.restype = int _libmagma.magma_cposv_gpu.argtypes = _libmagma.magma_sposv_gpu.argtypes def magma_cposv_gpu(uplo, n, nhrs, a_gpu, lda, b_gpu, ldb): """ Solve linear system with positive semidefinite coefficient matrix. """ uplo = _uplo_conversion[uplo] info = ctypes.c_int() status = _libmagma.magma_cposv_gpu(uplo, n, nhrs, int(a_gpu), lda, int(b_gpu), ldb, ctypes.byref(info)) magmaCheckStatus(status) _libmagma.magma_zposv_gpu.restype = int _libmagma.magma_zposv_gpu.argtypes = _libmagma.magma_sposv_gpu.argtypes def magma_zposv_gpu(uplo, n, nhrs, a_gpu, lda, b_gpu, ldb): """ Solve linear system with positive semidefinite coefficient matrix. """ uplo = _uplo_conversion[uplo] info = ctypes.c_int() status = _libmagma.magma_zposv_gpu(uplo, n, nhrs, int(a_gpu), lda, int(b_gpu), ldb, ctypes.byref(info)) magmaCheckStatus(status) # SGESV, DGESV, CGESV, ZGESV _libmagma.magma_sgesv_gpu.restype = int _libmagma.magma_sgesv_gpu.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def magma_sgesv_gpu(n, nhrs, A, lda, ipiv, B, ldb): """ Solve system of linear equations. 
""" info = ctypes.c_int() status = _libmagma.magma_sgesv_gpu(n, nhrs, int(A), lda, int(ipiv), int(B), ldb, ctypes.byref(info)) magmaCheckStatus(status) _libmagma.magma_dgesv_gpu.restype = int _libmagma.magma_dgesv_gpu.argtypes = _libmagma.magma_sgesv_gpu.argtypes def magma_dgesv_gpu(n, nhrs, A, lda, ipiv, B, ldb): """ Solve system of linear equations. """ info = ctypes.c_int() status = _libmagma.magma_dgesv_gpu(n, nhrs, int(A), lda, int(ipiv), int(B), ldb, ctypes.byref(info)) magmaCheckStatus(status) _libmagma.magma_cgesv_gpu.restype = int _libmagma.magma_cgesv_gpu.argtypes = _libmagma.magma_sgesv_gpu.argtypes def magma_cgesv_gpu(n, nhrs, A, lda, ipiv, B, ldb): """ Solve system of linear equations. """ info = ctypes.c_int() status = _libmagma.magma_cgesv_gpu(n, nhrs, int(A), lda, int(ipiv), int(B), ldb, ctypes.byref(info)) magmaCheckStatus(status) _libmagma.magma_zgesv_gpu.restype = int _libmagma.magma_zgesv_gpu.argtypes = _libmagma.magma_sgesv_gpu.argtypes def magma_zgesv_gpu(n, nhrs, A, lda, ipiv, B, ldb): """ Solve system of linear equations. """ info = ctypes.c_int() status = _libmagma.magma_zgesv_gpu(n, nhrs, int(A), lda, int(ipiv), int(B), ldb, ctypes.byref(info)) magmaCheckStatus(status) _libmagma.magma_sgesv_nopiv_gpu.restype = int _libmagma.magma_sgesv_nopiv_gpu.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def magma_sgesv_nopiv_gpu(n, nhrs, A, lda, B, ldb): """ Solve system of linear equations. """ info = ctypes.c_int() status = _libmagma.magma_sgesv_nopiv_gpu(n, nhrs, int(A), lda, int(B), ldb, ctypes.byref(info)) magmaCheckStatus(status) _libmagma.magma_dgesv_nopiv_gpu.restype = int _libmagma.magma_dgesv_nopiv_gpu.argtypes = _libmagma.magma_sgesv_nopiv_gpu.argtypes def magma_dgesv_nopiv_gpu(n, nhrs, A, lda, B, ldb): """ Solve system of linear equations. """ info = ctypes.c_int() status = _libmagma.magma_dgesv_nopiv_gpu(n, nhrs, int(A), lda, int(B), ldb, ctypes.byref(info)) magmaCheckStatus(status) _libmagma.magma_cgesv_nopiv_gpu.restype = int _libmagma.magma_cgesv_nopiv_gpu.argtypes = _libmagma.magma_sgesv_nopiv_gpu.argtypes def magma_cgesv_nopiv_gpu(n, nhrs, A, lda, B, ldb): """ Solve system of linear equations. """ info = ctypes.c_int() status = _libmagma.magma_cgesv_nopiv_gpu(n, nhrs, int(A), lda, int(B), ldb, ctypes.byref(info)) magmaCheckStatus(status) _libmagma.magma_zgesv_nopiv_gpu.restype = int _libmagma.magma_zgesv_nopiv_gpu.argtypes = _libmagma.magma_sgesv_nopiv_gpu.argtypes def magma_zgesv_nopiv_gpu(n, nhrs, A, lda, B, ldb): """ Solve system of linear equations. """ info = ctypes.c_int() status = _libmagma.magma_zgesv_nopiv_gpu(n, nhrs, int(A), lda, int(B), ldb, ctypes.byref(info)) magmaCheckStatus(status) # SPOTRF, DPOTRF, CPOTRF, ZPOTRF _libmagma.magma_spotrf_gpu.restype = int _libmagma.magma_spotrf_gpu.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def magma_spotrf_gpu(uplo, n, A, lda): """ Cholesky factorization of positive symmetric matrix. """ uplo = _uplo_conversion[uplo] info = ctypes.c_int() status = _libmagma.magma_spotrf_gpu(uplo, n, int(A), lda, ctypes.byref(info)) magmaCheckStatus(status) _libmagma.magma_dpotrf_gpu.restype = int _libmagma.magma_dpotrf_gpu.argtypes = _libmagma.magma_spotrf_gpu.argtypes def magma_dpotrf_gpu(uplo, n, A, lda): """ Cholesky factorization of positive symmetric matrix. 
""" uplo = _uplo_conversion[uplo] info = ctypes.c_int() status = _libmagma.magma_dpotrf_gpu(uplo, n, int(A), lda, ctypes.byref(info)) magmaCheckStatus(status) _libmagma.magma_cpotrf_gpu.restype = int _libmagma.magma_cpotrf_gpu.argtypes = _libmagma.magma_spotrf_gpu.argtypes def magma_cpotrf_gpu(uplo, n, A, lda): """ Cholesky factorization of positive symmetric matrix. """ uplo = _uplo_conversion[uplo] info = ctypes.c_int() status = _libmagma.magma_cpotrf_gpu(uplo, n, int(A), lda, ctypes.byref(info)) magmaCheckStatus(status) _libmagma.magma_zpotrf_gpu.restype = int _libmagma.magma_zpotrf_gpu.argtypes = _libmagma.magma_zpotrf_gpu.argtypes def magma_zpotrf_gpu(uplo, n, A, lda): """ Cholesky factorization of positive symmetric matrix. """ uplo = _uplo_conversion[uplo] info = ctypes.c_int() status = _libmagma.magma_zpotrf_gpu(uplo, n, int(A), lda, ctypes.byref(info)) magmaCheckStatus(status) # SPOTRI, DPOTRI, CPOTRI, ZPOTRI _libmagma.magma_spotri_gpu.restype = int _libmagma.magma_spotri_gpu.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def magma_spotri_gpu(uplo, n, A, lda): """ Inverse using the Cholesky factorization of positive symmetric matrix. """ uplo = _uplo_conversion[uplo] info = ctypes.c_int() status = _libmagma.magma_spotri_gpu(uplo, n, int(A), lda, ctypes.byref(info)) magmaCheckStatus(status) _libmagma.magma_dpotri_gpu.restype = int _libmagma.magma_dpotri_gpu.argtypes = _libmagma.magma_spotri_gpu.argtypes def magma_dpotri_gpu(uplo, n, A, lda): """ Inverse using the Cholesky factorization of positive symmetric matrix. """ uplo = _uplo_conversion[uplo] info = ctypes.c_int() status = _libmagma.magma_dpotri_gpu(uplo, n, int(A), lda, ctypes.byref(info)) magmaCheckStatus(status) _libmagma.magma_cpotri_gpu.restype = int _libmagma.magma_cpotri_gpu.argtypes = _libmagma.magma_spotri_gpu.argtypes def magma_cpotri_gpu(uplo, n, A, lda): """ Inverse using the Cholesky factorization of positive symmetric matrix. """ uplo = _uplo_conversion[uplo] info = ctypes.c_int() status = _libmagma.magma_cpotri_gpu(uplo, n, int(A), lda, ctypes.byref(info)) magmaCheckStatus(status) _libmagma.magma_zpotri_gpu.restype = int _libmagma.magma_zpotri_gpu.argtypes = _libmagma.magma_spotri_gpu.argtypes def magma_zpotri_gpu(uplo, n, A, lda): """ Inverse using the Cholesky factorization of positive symmetric matrix. """ uplo = _uplo_conversion[uplo] info = ctypes.c_int() status = _libmagma.magma_zpotri_gpu(uplo, n, int(A), lda, ctypes.byref(info)) magmaCheckStatus(status) # SGETRF, DGETRF, CGETRF, ZGETRF _libmagma.magma_sgetrf_gpu.restype = int _libmagma.magma_sgetrf_gpu.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p] def magma_sgetrf_gpu(n, m, A, lda, ipiv): """ LU factorization. """ info = ctypes.c_int() status = _libmagma.magma_sgetrf_gpu(n, m, int(A), lda, int(ipiv), ctypes.byref(info)) magmaCheckStatus(status) _libmagma.magma_dgetrf_gpu.restype = int _libmagma.magma_dgetrf_gpu.argtypes = _libmagma.magma_sgetrf_gpu.argtypes def magma_dgetrf_gpu(n, m, A, lda, ipiv): """ LU factorization. """ info = ctypes.c_int() status = _libmagma.magma_dgetrf_gpu(n, m, int(A), lda, int(ipiv), ctypes.byref(info)) magmaCheckStatus(status) _libmagma.magma_cgetrf_gpu.restype = int _libmagma.magma_cgetrf_gpu.argtypes = _libmagma.magma_sgetrf_gpu.argtypes def magma_cgetrf_gpu(n, m, A, lda, ipiv): """ LU factorization. 
""" info = ctypes.c_int() status = _libmagma.magma_cgetrf_gpu(n, m, int(A), lda, int(ipiv), ctypes.byref(info)) magmaCheckStatus(status) _libmagma.magma_zgetrf_gpu.restype = int _libmagma.magma_zgetrf_gpu.argtypes = _libmagma.magma_sgetrf_gpu.argtypes def magma_zgetrf_gpu(n, m, A, lda, ipiv): """ LU factorization. """ info = ctypes.c_int() status = _libmagma.magma_zgetrf_gpu(n, m, int(A), lda, int(ipiv), ctypes.byref(info)) magmaCheckStatus(status) # SSYEVD, DSYEVD _libmagma.magma_ssyevd_gpu.restype = int _libmagma.magma_ssyevd_gpu.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def magma_ssyevd_gpu(jobz, uplo, n, dA, ldda, w, wA, ldwa, work, lwork, iwork, liwork): """ Compute eigenvalues of real symmetric matrix. """ info = ctypes.c_int() status = _libmagma.magma_ssyevd_gpu(jobz, uplo, n, int(dA), ldda, int(w), int(wA), ldwa, int(work), lwork, int(iwork), liwork, ctypes.byref(info)) magmaCheckStatus(status) _libmagma.magma_dsyevd_gpu.restype = int _libmagma.magma_dsyevd_gpu.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def magma_dsyevd_gpu(jobz, uplo, n, dA, ldda, w, wA, ldwa, work, lwork, iwork, liwork): """ Compute eigenvalues of real symmetric matrix. """ info = ctypes.c_int() status = _libmagma.magma_dsyevd_gpu(jobz, uplo, n, int(dA), ldda, int(w), int(wA), ldwa, int(work), lwork, int(iwork), liwork, ctypes.byref(info)) magmaCheckStatus(status) # SYMMETRIZE _libmagma.magmablas_ssymmetrize.restype = int _libmagma.magmablas_ssymmetrize.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def magmablas_ssymmetrize(uplo, n, A, lda): """ Symmetrize a triangular matrix. """ uplo = _uplo_conversion[uplo] status = _libmagma.magmablas_ssymmetrize(uplo, n, int(A), lda) magmaCheckStatus(status) _libmagma.magmablas_dsymmetrize.restype = int _libmagma.magmablas_dsymmetrize.argtypes = _libmagma.magmablas_ssymmetrize.argtypes def magmablas_dsymmetrize(uplo, n, A, lda): """ Symmetrize a triangular matrix. """ uplo = _uplo_conversion[uplo] status = _libmagma.magmablas_dsymmetrize(uplo, n, int(A), lda) magmaCheckStatus(status) _libmagma.magmablas_csymmetrize.restype = int _libmagma.magmablas_csymmetrize.argtypes = _libmagma.magmablas_ssymmetrize.argtypes def magmablas_csymmetrize(uplo, n, A, lda): """ Symmetrize a triangular matrix. """ uplo = _uplo_conversion[uplo] status = _libmagma.magmablas_csymmetrize(uplo, n, int(A), lda) magmaCheckStatus(status) _libmagma.magmablas_zsymmetrize.restype = int _libmagma.magmablas_zsymmetrize.argtypes = _libmagma.magmablas_ssymmetrize.argtypes def magmablas_zsymmetrize(uplo, n, A, lda): """ Symmetrize a triangular matrix. """ uplo = _uplo_conversion[uplo] status = _libmagma.magmablas_zsymmetrize(uplo, n, int(A), lda) magmaCheckStatus(status) scikit-cuda-0.5.1/skcuda/misc.py000066400000000000000000001413341261465507300165130ustar00rootroot00000000000000#!/usr/bin/env python """ Miscellaneous PyCUDA functions. 
""" from __future__ import absolute_import, division import atexit import numbers from string import Template import pycuda.driver as drv import pycuda.gpuarray as gpuarray import pycuda.elementwise as elementwise import pycuda.reduction as reduction import pycuda.scan as scan import pycuda.tools as tools from pycuda.tools import context_dependent_memoize, dtype_to_ctype from pycuda.compiler import SourceModule from pytools import memoize import numpy as np from . import cuda from . import cublas try: from . import cula _has_cula = True except (ImportError, OSError): _has_cula = False try: from . import cusolver _has_cusolver = True except (ImportError, OSError): _has_cusolver = False try: from . import magma _has_magma = True except (ImportError, OSError): _has_magma = False isdoubletype = lambda x : True if x == np.float64 or \ x == np.complex128 else False isdoubletype.__doc__ = """ Check whether a type has double precision. Parameters ---------- t : numpy float type Type to test. Returns ------- result : bool Result. """ iscomplextype = lambda x : True if x == np.complex64 or \ x == np.complex128 else False iscomplextype.__doc__ = """ Check whether a type is complex. Parameters ---------- t : numpy float type Type to test. Returns ------- result : bool Result. """ def init_device(n=0): """ Initialize a GPU device. Initialize a specified GPU device rather than the default device found by `pycuda.autoinit`. Parameters ---------- n : int Device number. Returns ------- dev : pycuda.driver.Device Initialized device. """ drv.init() dev = drv.Device(n) return dev def init_context(dev): """ Create a context that will be cleaned up properly. Create a context on the specified device and register its pop() method with atexit. Parameters ---------- dev : pycuda.driver.Device GPU device. Returns ------- ctx : pycuda.driver.Context Created context. """ ctx = dev.make_context() atexit.register(ctx.pop) return ctx def done_context(ctx): """ Detach from a context cleanly. Detach from a context and remove its pop() from atexit. Parameters ---------- ctx : pycuda.driver.Context Context from which to detach. """ for i in xrange(len(atexit._exithandlers)): if atexit._exithandlers[i][0] == ctx.pop: del atexit._exithandlers[i] break ctx.detach() global _global_cublas_handle _global_cublas_handle = None global _global_cusolver_handle _global_cusolver_handle = None global _global_cublas_allocator _global_cublas_allocator = None def init(allocator=drv.mem_alloc): """ Initialize libraries used by scikit-cuda. Initialize the CUBLAS, CUSOLVER, and CULA libraries used by high-level functions provided by scikit-cuda. Parameters ---------- allocator : an allocator used internally by some of the high-level functions. Notes ----- This function does not initialize PyCUDA; it uses whatever device and context were initialized in the current host thread. """ # CUBLAS uses whatever device is being used by the host thread: global _global_cublas_handle, _global_cublas_allocator if not _global_cublas_handle: from . import cublas # nest to avoid requiring cublas e.g. for FFT _global_cublas_handle = cublas.cublasCreate() if _global_cublas_allocator is None: _global_cublas_allocator = allocator global _global_cusolver_handle if not _global_cusolver_handle: from . 
import cusolver _global_cusolver_handle = cusolver.cusolverDnCreate() # culaSelectDevice() need not (and, in fact, cannot) be called # here because the host thread has already been bound to a GPU # device: if _has_cula: cula.culaInitialize() if _has_magma: magma.magma_init() def shutdown(): """ Shutdown libraries used by scikit-cuda. Shutdown the CUBLAS and CULA libraries used by high-level functions provided by scikits-cuda. Notes ----- This function does not shutdown PyCUDA. """ global _global_cublas_handle if _global_cublas_handle: from . import cublas # nest to avoid requiring cublas e.g. for FFT cublas.cublasDestroy(_global_cublas_handle) _global_cublas_handle = None global _global_cusolver_handle if _global_cusolver_handle: from . import cusolver cusolver.cusolverDnDestroy(_global_cusolver_handle) _global_cusolver_handle = None if _has_cula: cula.culaShutdown() def get_compute_capability(dev): """ Get the compute capability of the specified device. Retrieve the compute capability of the specified CUDA device and return it as a floating point value. Parameters ---------- d : pycuda.driver.Device Device object to examine. Returns ------- c : float Compute capability. """ return np.float('.'.join([str(i) for i in dev.compute_capability()])) def get_current_device(): """ Get the device in use by the current context. Returns ------- d : pycuda.driver.Device Device in use by current context. """ return drv.Device(cuda.cudaGetDevice()) @memoize def get_dev_attrs(dev): """ Get select CUDA device attributes. Retrieve select attributes of the specified CUDA device that relate to maximum thread block and grid sizes. Parameters ---------- d : pycuda.driver.Device Device object to examine. Returns ------- attrs : list List containing [MAX_THREADS_PER_BLOCK, (MAX_BLOCK_DIM_X, MAX_BLOCK_DIM_Y, MAX_BLOCK_DIM_Z), (MAX_GRID_DIM_X, MAX_GRID_DIM_Y, MAX_GRID_DIM_Z)] """ attrs = dev.get_attributes() return [attrs[drv.device_attribute.MAX_THREADS_PER_BLOCK], (attrs[drv.device_attribute.MAX_BLOCK_DIM_X], attrs[drv.device_attribute.MAX_BLOCK_DIM_Y], attrs[drv.device_attribute.MAX_BLOCK_DIM_Z]), (attrs[drv.device_attribute.MAX_GRID_DIM_X], attrs[drv.device_attribute.MAX_GRID_DIM_Y], attrs[drv.device_attribute.MAX_GRID_DIM_Z])] iceil = lambda n: int(np.ceil(n)) @memoize def select_block_grid_sizes(dev, data_shape, threads_per_block=None): """ Determine CUDA block and grid dimensions given device constraints. Determine the CUDA block and grid dimensions allowed by a GPU device that are sufficient for processing every element of an array in a separate thread. Parameters ---------- d : pycuda.driver.Device Device object to be used. data_shape : tuple Shape of input data array. Must be of length 2. threads_per_block : int, optional Number of threads to execute in each block. If this is None, the maximum number of threads per block allowed by device `d` is used. Returns ------- block_dim : tuple X, Y, and Z dimensions of minimal required thread block. grid_dim : tuple X and Y dimensions of minimal required block grid. Notes ----- Using the scheme in this function, all of the threads in the grid can be enumerated as `i = blockIdx.y*max_threads_per_block*max_blocks_per_grid+ blockIdx.x*max_threads_per_block+threadIdx.x`. For 2D shapes, the subscripts of the element `data[a, b]` where `data.shape == (A, B)` can be computed as `a = i/B` `b = mod(i,B)`. 
For 3D shapes, the subscripts of the element `data[a, b, c]` where `data.shape == (A, B, C)` can be computed as `a = i/(B*C)` `b = mod(i, B*C)/C` `c = mod(mod(i, B*C), C)`. For 4D shapes, the subscripts of the element `data[a, b, c, d]` where `data.shape == (A, B, C, D)` can be computed as `a = i/(B*C*D)` `b = mod(i, B*C*D)/(C*D)` `c = mod(mod(i, B*C*D)%(C*D))/D` `d = mod(mod(mod(i, B*C*D)%(C*D)), D)` It is advisable that the number of threads per block be a multiple of the warp size to fully utilize a device's computing resources. """ # Sanity checks: if np.isscalar(data_shape): data_shape = (data_shape,) # Number of elements to process; we need to cast the result of # np.prod to a Python int to prevent PyCUDA's kernel execution # framework from getting confused when N = int(np.prod(data_shape)) # Get device constraints: max_threads_per_block, max_block_dim, max_grid_dim = get_dev_attrs(dev) if threads_per_block is not None: if threads_per_block > max_threads_per_block: raise ValueError('threads per block exceeds device maximum') else: max_threads_per_block = threads_per_block # Actual number of thread blocks needed: blocks_needed = iceil(N/float(max_threads_per_block)) if blocks_needed <= max_grid_dim[0]: return (max_threads_per_block, 1, 1), (blocks_needed, 1, 1) elif blocks_needed > max_grid_dim[0] and \ blocks_needed <= max_grid_dim[0]*max_grid_dim[1]: return (max_threads_per_block, 1, 1), \ (max_grid_dim[0], iceil(blocks_needed/float(max_grid_dim[0])), 1) elif blocks_needed > max_grid_dim[0]*max_grid_dim[1] and \ blocks_needed <= max_grid_dim[0]*max_grid_dim[1]*max_grid_dim[2]: return (max_threads_per_block, 1, 1), \ (max_grid_dim[0], max_grid_dim[1], iceil(blocks_needed/float(max_grid_dim[0]*max_grid_dim[1]))) else: raise ValueError('array size too large') def zeros(shape, dtype, allocator=drv.mem_alloc): """ Return an array of the given shape and dtype filled with zeros. Parameters ---------- shape : tuple Array shape. dtype : data-type Data type for the array. allocator : callable Returns an object that represents the memory allocated for the requested array. Returns ------- out : pycuda.gpuarray.GPUArray Array of zeros with the given shape and dtype. Notes ----- This function exists to work around the following numpy bug that prevents pycuda.gpuarray.zeros() from working properly with complex types in pycuda 2011.1.2: http://projects.scipy.org/numpy/ticket/1898 """ out = gpuarray.GPUArray(shape, dtype, allocator) out.fill(0) return out def zeros_like(a): """ Return an array of zeros with the same shape and type as a given array. Parameters ---------- a : array_like The shape and data type of `a` determine the corresponding attributes of the returned array. Returns ------- out : pycuda.gpuarray.GPUArray Array of zeros with the shape and dtype of `a`. """ out = gpuarray.GPUArray(a.shape, a.dtype, drv.mem_alloc) out.fill(0) return out def ones(shape, dtype, allocator=drv.mem_alloc): """ Return an array of the given shape and dtype filled with ones. Parameters ---------- shape : tuple Array shape. dtype : data-type Data type for the array. allocator : callable Returns an object that represents the memory allocated for the requested array. Returns ------- out : pycuda.gpuarray.GPUArray Array of ones with the given shape and dtype. """ out = gpuarray.GPUArray(shape, dtype, allocator) out.fill(1) return out def ones_like(other): """ Return an array of ones with the same shape and type as a given array. 
Parameters ---------- other : pycuda.gpuarray.GPUArray Array whose shape and dtype are to be used to allocate a new array. Returns ------- out : pycuda.gpuarray.GPUArray Array of ones with the shape and dtype of `other`. """ out = gpuarray.GPUArray(other.shape, other.dtype, other.allocator) out.fill(1) return out def inf(shape, dtype, allocator=drv.mem_alloc): """ Return an array of the given shape and dtype filled with infs. Parameters ---------- shape : tuple Array shape. dtype : data-type Data type for the array. allocator : callable Returns an object that represents the memory allocated for the requested array. Returns ------- out : pycuda.gpuarray.GPUArray Array of infs with the given shape and dtype. """ out = gpuarray.GPUArray(shape, dtype, allocator) out.fill(np.inf) return out def maxabs(x_gpu): """ Get maximum absolute value. Find maximum absolute value in the specified array. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Input array. Returns ------- m_gpu : pycuda.gpuarray.GPUArray Array containing maximum absolute value in `x_gpu`. Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray as gpuarray >>> import misc >>> x_gpu = gpuarray.to_gpu(np.array([-1, 2, -3], np.float32)) >>> m_gpu = misc.maxabs(x_gpu) >>> np.allclose(m_gpu.get(), 3.0) True """ try: func = maxabs.cache[x_gpu.dtype] except KeyError: ctype = tools.dtype_to_ctype(x_gpu.dtype) use_double = int(x_gpu.dtype in [np.float64, np.complex128]) ret_type = np.float64 if use_double else np.float32 func = reduction.ReductionKernel(ret_type, neutral="0", reduce_expr="max(a,b)", map_expr="abs(x[i])", arguments="{ctype} *x".format(ctype=ctype)) maxabs.cache[x_gpu.dtype] = func return func(x_gpu) maxabs.cache = {} def cumsum(x_gpu): """ Cumulative sum. Return the cumulative sum of the elements in the specified array. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Input array. Returns ------- c_gpu : pycuda.gpuarray.GPUArray Output array containing cumulative sum of `x_gpu`. Notes ----- Higher dimensional arrays are implicitly flattened row-wise by this function. Examples -------- >>> import pycuda.autoinit >>> import pycuda.gpuarray as gpuarray >>> import misc >>> x_gpu = gpuarray.to_gpu(np.random.rand(5).astype(np.float32)) >>> c_gpu = misc.cumsum(x_gpu) >>> np.allclose(c_gpu.get(), np.cumsum(x_gpu.get())) True """ try: func = cumsum.cache[x_gpu.dtype] except KeyError: func = scan.InclusiveScanKernel(x_gpu.dtype, 'a+b', preamble='#include ') cumsum.cache[x_gpu.dtype] = func return func(x_gpu) cumsum.cache = {} def diff(x_gpu): """ Calculate the discrete difference. Calculates the first order difference between the successive entries of a vector. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Input vector. Returns ------- y_gpu : pycuda.gpuarray.GPUArray Discrete difference. 
Examples -------- >>> import pycuda.driver as drv >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import misc >>> x = np.asarray(np.random.rand(5), np.float32) >>> x_gpu = gpuarray.to_gpu(x) >>> y_gpu = misc.diff(x_gpu) >>> np.allclose(np.diff(x), y_gpu.get()) True """ y_gpu = gpuarray.empty(len(x_gpu)-1, x_gpu.dtype) try: func = diff.cache[x_gpu.dtype] except KeyError: ctype = tools.dtype_to_ctype(x_gpu.dtype) func = elementwise.ElementwiseKernel("{ctype} *a, {ctype} *b".format(ctype=ctype), "b[i] = a[i+1]-a[i]") diff.cache[x_gpu.dtype] = func func(x_gpu, y_gpu) return y_gpu diff.cache = {} # List of available numerical types provided by numpy: num_types = [np.typeDict[t] for t in \ np.typecodes['AllInteger']+np.typecodes['AllFloat']] # Numbers of bytes occupied by each numerical type: num_nbytes = dict((np.dtype(t),t(1).nbytes) for t in num_types) def set_realloc(x_gpu, data): """ Transfer data into a GPUArray instance. Copies the contents of a numpy array into a GPUArray instance. If the array has a different type or dimensions than the instance, the GPU memory used by the instance is reallocated and the instance updated appropriately. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray GPUArray instance to modify. data : numpy.ndarray Array of data to transfer to the GPU. Examples -------- >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import misc >>> x = np.asarray(np.random.rand(5), np.float32) >>> x_gpu = gpuarray.to_gpu(x) >>> x = np.asarray(np.random.rand(10, 1), np.float64) >>> set_realloc(x_gpu, x) >>> np.allclose(x, x_gpu.get()) True """ # Only reallocate if absolutely necessary: if x_gpu.shape != data.shape or x_gpu.size != data.size or \ x_gpu.strides != data.strides or x_gpu.dtype != data.dtype: # Free old memory: x_gpu.gpudata.free() # Allocate new memory: nbytes = num_nbytes[data.dtype] x_gpu.gpudata = drv.mem_alloc(nbytes*data.size) # Set array attributes: x_gpu.shape = data.shape x_gpu.size = data.size x_gpu.strides = data.strides x_gpu.dtype = data.dtype # Update the GPU memory: x_gpu.set(data) def get_by_index(src_gpu, ind): """ Get values in a GPUArray by index. Parameters ---------- src_gpu : pycuda.gpuarray.GPUArray GPUArray instance from which to extract values. ind : pycuda.gpuarray.GPUArray or numpy.ndarray Array of element indices to set. Must have an integer dtype. Returns ------- res_gpu : pycuda.gpuarray.GPUArray GPUArray with length of `ind` and dtype of `src_gpu` containing selected values. Examples -------- >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import misc >>> src = np.random.rand(5).astype(np.float32) >>> src_gpu = gpuarray.to_gpu(src) >>> ind = gpuarray.to_gpu(np.array([0, 2, 4])) >>> res_gpu = misc.get_by_index(src_gpu, ind) >>> np.allclose(res_gpu.get(), src[[0, 2, 4]]) True Notes ----- Only supports 1D index arrays. May not be efficient for certain index patterns because of lack of inability to coalesce memory operations. 
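    Indices are not bounds-checked on the GPU, so out-of-range entries in
    `ind` lead to undefined reads; validate `ind` on the host if its values
    are not known to be in range.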
""" # Only support 1D index arrays: assert len(np.shape(ind)) == 1 assert issubclass(ind.dtype.type, numbers.Integral) N = len(ind) assert N <= len(src_gpu) if not isinstance(ind, gpuarray.GPUArray): ind = gpuarray.to_gpu(ind) dest_gpu = gpuarray.empty(N, dtype=src_gpu.dtype) # Manually handle empty index array because it will cause the kernel to # fail if processed: if N == 0: return dest_gpu try: func = get_by_index.cache[(src_gpu.dtype, ind.dtype)] except KeyError: data_ctype = tools.dtype_to_ctype(src_gpu.dtype) ind_ctype = tools.dtype_to_ctype(ind.dtype) v = "{data_ctype} *dest, {ind_ctype} *ind, {data_ctype} *src".format(data_ctype=data_ctype, ind_ctype=ind_ctype) func = elementwise.ElementwiseKernel(v, "dest[i] = src[ind[i]]") get_by_index.cache[(src_gpu.dtype, ind.dtype)] = func func(dest_gpu, ind, src_gpu, range=slice(0, N, 1)) return dest_gpu get_by_index.cache = {} def set_by_index(dest_gpu, ind, src_gpu, ind_which='dest'): """ Set values in a GPUArray by index. Parameters ---------- dest_gpu : pycuda.gpuarray.GPUArray GPUArray instance to modify. ind : pycuda.gpuarray.GPUArray or numpy.ndarray 1D array of element indices to set. Must have an integer dtype. src_gpu : pycuda.gpuarray.GPUArray GPUArray instance from which to set values. ind_which : str If set to 'dest', set the elements in `dest_gpu` with indices `ind` to the successive values in `src_gpu`; the lengths of `ind` and `src_gpu` must be equal. If set to 'src', set the successive values in `dest_gpu` to the values in `src_gpu` with indices `ind`; the lengths of `ind` and `dest_gpu` must be equal. Examples -------- >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import misc >>> dest_gpu = gpuarray.to_gpu(np.arange(5, dtype=np.float32)) >>> ind = gpuarray.to_gpu(np.array([0, 2, 4])) >>> src_gpu = gpuarray.to_gpu(np.array([1, 1, 1], dtype=np.float32)) >>> misc.set_by_index(dest_gpu, ind, src_gpu, 'dest') >>> np.allclose(dest_gpu.get(), np.array([1, 1, 1, 3, 1], dtype=np.float32)) True >>> dest_gpu = gpuarray.to_gpu(np.zeros(3, dtype=np.float32)) >>> ind = gpuarray.to_gpu(np.array([0, 2, 4])) >>> src_gpu = gpuarray.to_gpu(np.arange(5, dtype=np.float32)) >>> misc.set_by_index(dest_gpu, ind, src_gpu) >>> np.allclose(dest_gpu.get(), np.array([0, 2, 4], dtype=np.float32)) True Notes ----- Only supports 1D index arrays. May not be efficient for certain index patterns because of lack of inability to coalesce memory operations. 
""" # Only support 1D index arrays: assert len(np.shape(ind)) == 1 assert dest_gpu.dtype == src_gpu.dtype assert issubclass(ind.dtype.type, numbers.Integral) N = len(ind) # Manually handle empty index array because it will cause the kernel to # fail if processed: if N == 0: return if ind_which == 'dest': assert N == len(src_gpu) elif ind_which == 'src': assert N == len(dest_gpu) else: raise ValueError('invalid value for `ind_which`') if not isinstance(ind, gpuarray.GPUArray): ind = gpuarray.to_gpu(ind) try: func = set_by_index.cache[(dest_gpu.dtype, ind.dtype, ind_which)] except KeyError: data_ctype = tools.dtype_to_ctype(dest_gpu.dtype) ind_ctype = tools.dtype_to_ctype(ind.dtype) v = "{data_ctype} *dest, {ind_ctype} *ind, {data_ctype} *src".format(data_ctype=data_ctype, ind_ctype=ind_ctype) if ind_which == 'dest': func = elementwise.ElementwiseKernel(v, "dest[ind[i]] = src[i]") else: func = elementwise.ElementwiseKernel(v, "dest[i] = src[ind[i]]") set_by_index.cache[(dest_gpu.dtype, ind.dtype, ind_which)] = func func(dest_gpu, ind, src_gpu, range=slice(0, N, 1)) set_by_index.cache = {} @context_dependent_memoize def _get_binaryop_vecmat_kernel(dtype, binary_op): template = Template(""" #include __global__ void opColVecToMat(const ${type} *mat, const ${type} *vec, ${type} *out, const int32_t n, const int32_t m){ const int tx = threadIdx.x; const int ty = threadIdx.y; const int tidx = blockIdx.x * blockDim.x + threadIdx.x; const int tidy = blockIdx.y * blockDim.y + threadIdx.y; extern __shared__ ${type} shared_vec[]; if ((ty == 0) & (tidx < n)) shared_vec[tx] = vec[tidx]; __syncthreads(); if ((tidy < m) & (tidx < n)) { out[tidx*m+tidy] = mat[tidx*m+tidy] ${binary_op} shared_vec[tx]; } } __global__ void opRowVecToMat(const ${type}* mat, const ${type}* vec, ${type}* out, const int n, const int m){ const int tx = threadIdx.x; const int ty = threadIdx.y; const int tidx = blockIdx.x * blockDim.x + threadIdx.x; const int tidy = blockIdx.y * blockDim.y + threadIdx.y; extern __shared__ ${type} shared_vec[]; if ((tx == 0) & (tidy < m)) shared_vec[ty] = vec[tidy]; __syncthreads(); if ((tidy < m) & (tidx < n)) { out[tidx*m+tidy] = mat[tidx*m+tidy] ${binary_op} shared_vec[ty]; } }""") cache_dir=None ctype = dtype_to_ctype(dtype) tmpl = template.substitute(type=ctype, binary_op=binary_op) mod = SourceModule(tmpl) add_row_vec_kernel = mod.get_function('opRowVecToMat') add_col_vec_kernel = mod.get_function('opColVecToMat') return add_row_vec_kernel, add_col_vec_kernel def binaryop_matvec(binary_op, x_gpu, a_gpu, axis=None, out=None, stream=None): """ Applies a binary operation to a vector and each column/row of a matrix. The numpy broadcasting rules apply so this would yield the same result as `x_gpu.get()` op `a_gpu.get()` in host-code. Parameters ---------- binary_op : string, ['+', '-', '/', '*' '%'] The operator to apply x_gpu : pycuda.gpuarray.GPUArray Matrix to which to add the vector. a_gpu : pycuda.gpuarray.GPUArray Vector to add to `x_gpu`. axis : int (optional) The axis onto which the vector is added. By default this is determined automatically by using the first axis with the correct dimensionality. out : pycuda.gpuarray.GPUArray (optional) Optional destination matrix. stream : pycuda.driver.Stream (optional) Optional Stream in which to perform this calculation. 
Returns ------- out : pycuda.gpuarray.GPUArray result of `x_gpu` + `a_gpu` """ if axis is None: if len(a_gpu.shape) == 1: if a_gpu.shape[0] == x_gpu.shape[1]: axis = 1 else: raise ValueError( "operands could not be broadcast together " "with shapes %s %s" % (x_gpu.shape, a_gpu.shape)) elif a_gpu.shape[1] == x_gpu.shape[1]: # numpy matches inner axes first axis = 1 elif a_gpu.shape[0] == x_gpu.shape[0]: axis = 0 else: raise ValueError( "operands could not be broadcast together " "with shapes %s %s" % (x_gpu.shape, a_gpu.shape)) else: if axis < 0: axis += 2 if axis > 1: raise ValueError('invalid axis') if binary_op not in ['+', '-', '/', '*', '%']: raise ValueError('invalid operator') row_kernel, col_kernel = _get_binaryop_vecmat_kernel(x_gpu.dtype, binary_op) n, m = np.int32(x_gpu.shape[0]), np.int32(x_gpu.shape[1]) block = (24, 24, 1) gridx = int(n // block[0] + 1 * (n % block[0] != 0)) gridy = int(m // block[1] + 1 * (m % block[1] != 0)) grid = (gridx, gridy, 1) if out is None: alloc = _global_cublas_allocator out = gpuarray.empty_like(x_gpu) else: assert out.dtype == x_gpu.dtype assert out.shape == x_gpu.shape if x_gpu.flags.c_contiguous: if axis == 0: col_kernel(x_gpu, a_gpu, out, n, m, block=block, grid=grid, stream=stream, shared=24*x_gpu.dtype.itemsize) elif axis == 1: row_kernel(x_gpu, a_gpu, out, n, m, block=block, grid=grid, stream=stream, shared=24*x_gpu.dtype.itemsize) else: if axis == 0: row_kernel(x_gpu, a_gpu, out, m, n, block=block, grid=grid, stream=stream, shared=24*x_gpu.dtype.itemsize) elif axis == 1: col_kernel(x_gpu, a_gpu, out, m, n, block=block, grid=grid, stream=stream, shared=24*x_gpu.dtype.itemsize) return out import operator def binaryop_2d(c_op, py_op, commutative, x_gpu, y_gpu): if x_gpu.shape == y_gpu.shape: return py_op(x_gpu, y_gpu) elif x_gpu.size == 1: return py_op(x_gpu.get().reshape(()), y_gpu) elif y_gpu.size == 1: return py_op(x_gpu, y_gpu.get().reshape(())) if len(x_gpu.shape) == 2: m, n = x_gpu.shape if y_gpu.shape == (n,): return binaryop_matvec(c_op, x_gpu, y_gpu, axis=1) elif y_gpu.shape == (1, n): return binaryop_matvec(c_op, x_gpu, y_gpu[0], axis=1) elif y_gpu.shape == (m, 1): return binaryop_matvec(c_op, x_gpu, y_gpu.ravel(), axis=0) if len(y_gpu.shape) == 2 and commutative: m, n = y_gpu.shape if x_gpu.shape == (n,): return binaryop_matvec(c_op, y_gpu, x_gpu, axis=1) elif x_gpu.shape == (1, n): return binaryop_matvec(c_op, y_gpu, x_gpu[0], axis=1) elif x_gpu.shape == (m, 1): return binaryop_matvec(c_op, y_gpu, x_gpu.ravel(), axis=0) raise TypeError("unsupported combination of shapes") def add(x_gpu, y_gpu): """ Adds two scalars, vectors, or matrices. The numpy broadcasting rules apply so this would yield the same result as `x_gpu.get()` + `y_gpu.get()` in host code. Parameters ---------- x_gpu, y_gpu : pycuda.gpuarray.GPUArray The arrays to be added. Returns ------- out : pycuda.gpuarray.GPUArray Equivalent to `x_gpu.get()` + `y_gpu.get()`. Notes ----- The `out` and `stream` options are not supported because `GPUArray.__add__` doesn't provide them. """ return binaryop_2d("+", operator.add, True, x_gpu, y_gpu) def subtract(x_gpu, y_gpu): """ Subtracts two scalars, vectors, or matrices with broadcasting. The numpy broadcasting rules apply so this would yield the same result as `x_gpu.get()` - `y_gpu.get()` in host code. Parameters ---------- x_gpu, y_gpu : pycuda.gpuarray.GPUArray The arrays to be subtracted. Returns ------- out : pycuda.gpuarray.GPUArray Equivalent to `x_gpu.get()` - `y_gpu.get()`. 
Notes ----- The `out` and `stream` options are not supported because `GPUArray.__sub__` doesn't provide them. """ return binaryop_2d("-", operator.sub, False, x_gpu, y_gpu) def multiply(x_gpu, y_gpu): """ Multiplies two scalars, vectors, or matrices with broadcasting. The numpy broadcasting rules apply so this would yield the same result as `x_gpu.get()` * `y_gpu.get()` in host code. Parameters ---------- x_gpu, y_gpu : pycuda.gpuarray.GPUArray The arrays to be multiplied. Returns ------- out : pycuda.gpuarray.GPUArray Equivalent to `x_gpu.get()` * `y_gpu.get()`. Notes ----- The `out` and `stream` options are not supported because `GPUArray.__mul__` doesn't provide them. """ return binaryop_2d("*", operator.mul, True, x_gpu, y_gpu) def divide(x_gpu, y_gpu): """ Divides two scalars, vectors, or matrices with broadcasting. The numpy broadcasting rules apply so this would yield the same result as `x_gpu.get()` / `y_gpu.get()` in host code. Parameters ---------- x_gpu, y_gpu : pycuda.gpuarray.GPUArray The arrays to be divided. Returns ------- out : pycuda.gpuarray.GPUArray Equivalent to `x_gpu.get()` / `y_gpu.get()`. Notes ----- The `out` and `stream` options are not supported because `GPUArray.__div__` doesn't provide them. """ return binaryop_2d("/", operator.div, False, x_gpu, y_gpu) def add_matvec(x_gpu, a_gpu, axis=None, out=None, stream=None): """ Adds a vector to each column/row of the matrix. The numpy broadcasting rules apply so this would yield the same result as `x_gpu.get()` + `a_gpu.get()` in host-code. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Matrix to which to add the vector. a_gpu : pycuda.gpuarray.GPUArray Vector to add to `x_gpu`. axis : int (optional) The axis onto which the vector is added. By default this is determined automatically by using the first axis with the correct dimensionality. out : pycuda.gpuarray.GPUArray (optional) Optional destination matrix. stream : pycuda.driver.Stream (optional) Optional Stream in which to perform this calculation. Returns ------- out : pycuda.gpuarray.GPUArray Result of `x_gpu` + `a_gpu` """ return binaryop_matvec('+', x_gpu, a_gpu, axis, out, stream) def div_matvec(x_gpu, a_gpu, axis=None, out=None, stream=None): """ Divides each column/row of a matrix by a vector. The numpy broadcasting rules apply so this would yield the same result as `x_gpu.get()` / `a_gpu.get()` in host-code. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Matrix to which to add the vector. a_gpu : pycuda.gpuarray.GPUArray Vector to add to `x_gpu`. axis : int (optional) The axis onto which the vector is added. By default this is determined automatically by using the first axis with the correct dimensionality. out : pycuda.gpuarray.GPUArray (optional) Optional destination matrix. stream : pycuda.driver.Stream (optional) Optional Stream in which to perform this calculation. Returns ------- out : pycuda.gpuarray.GPUArray result of `x_gpu` + `a_gpu` """ return binaryop_matvec('/', x_gpu, a_gpu, axis, out, stream) def mult_matvec(x_gpu, a_gpu, axis=None, out=None, stream=None): """ Multiplies a vector elementwise with each column/row of the matrix. The numpy broadcasting rules apply so this would yield the same result as `x_gpu.get()` + `a_gpu.get()` in host-code. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Matrix to which to add the vector. a_gpu : pycuda.gpuarray.GPUArray Vector to add to `x_gpu`. axis : int (optional) The axis onto which the vector is added. 
By default this is determined automatically by using the first axis with the correct dimensionality. out : pycuda.gpuarray.GPUArray (optional) Optional destination matrix. stream : pycuda.driver.Stream (optional) Optional Stream in which to perform this calculation. Returns ------- out : pycuda.gpuarray.GPUArray result of `x_gpu` + `a_gpu` """ return binaryop_matvec('*', x_gpu, a_gpu, axis, out, stream) def _sum_axis(x_gpu, axis=None, out=None, calc_mean=False, ddof=0, keepdims=False): global _global_cublas_allocator assert isinstance(ddof, numbers.Integral) if axis is None or len(x_gpu.shape) <= 1: out_shape = (1,)*len(x_gpu.shape) if keepdims else () if calc_mean == False: return gpuarray.sum(x_gpu).reshape(out_shape) else: return gpuarray.sum(x_gpu).reshape(out_shape) / (x_gpu.dtype.type(x_gpu.size-ddof)) if axis < 0: axis += 2 if axis > 1: raise ValueError('invalid axis') if x_gpu.flags.c_contiguous: n, m = x_gpu.shape[1], x_gpu.shape[0] lda = x_gpu.shape[1] trans = "n" if axis == 0 else "t" sum_axis, out_axis = (m, n) if axis == 0 else (n, m) else: n, m = x_gpu.shape[0], x_gpu.shape[1] lda = x_gpu.shape[0] trans = "t" if axis == 0 else "n" sum_axis, out_axis = (n, m) if axis == 0 else (m, n) if calc_mean: alpha = (1.0 / (sum_axis-ddof)) else: alpha = 1.0 if (x_gpu.dtype == np.complex64): gemv = cublas.cublasCgemv elif (x_gpu.dtype == np.float32): gemv = cublas.cublasSgemv elif (x_gpu.dtype == np.complex128): gemv = cublas.cublasZgemv elif (x_gpu.dtype == np.float64): gemv = cublas.cublasDgemv alloc = _global_cublas_allocator ons = ones((sum_axis, ), x_gpu.dtype, alloc) if keepdims: out_shape = (1, out_axis) if axis == 0 else (out_axis, 1) else: out_shape = (out_axis,) if out is None: out = gpuarray.empty(out_shape, x_gpu.dtype, alloc) else: assert out.dtype == x_gpu.dtype assert out.size >= out_axis gemv(_global_cublas_handle, trans, n, m, alpha, x_gpu.gpudata, lda, ons.gpudata, 1, 0.0, out.gpudata, 1) return out def sum(x_gpu, axis=None, out=None, keepdims=False): """ Compute the sum along the specified axis. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Array containing numbers whose sum is desired. axis : int (optional) Axis along which the sums are computed. The default is to compute the sum of the flattened array. out : pycuda.gpuarray.GPUArray (optional) Output array in which to place the result. keepdims : bool (optional, default False) If True, the axes which are reduced are left in the result as dimensions with size one. Returns ------- out : pycuda.gpuarray.GPUArray sum of elements, or sums of elements along the desired axis. """ return _sum_axis(x_gpu, axis, out=out, keepdims=keepdims) def mean(x_gpu, axis=None, out=None, keepdims=False): """ Compute the arithmetic means along the specified axis. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Array containing numbers whose mean is desired. axis : int (optional) Axis along which the means are computed. The default is to compute the mean of the flattened array. out : pycuda.gpuarray.GPUArray (optional) Output array in which to place the result. keepdims : bool (optional, default False) If True, the axes which are reduced are left in the result as dimensions with size one. Returns ------- out : pycuda.gpuarray.GPUArray mean of elements, or means of elements along the desired axis. """ return _sum_axis(x_gpu, axis, calc_mean=True, out=out, keepdims=keepdims) def var(x_gpu, ddof=0, axis=None, stream=None, keepdims=False): """ Compute the variance along the specified axis. 
Returns the variance of the array elements, a measure of the spread of a distribution. The variance is computed for the flattened array by default, otherwise over the specified axis. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Array containing numbers whose variance is desired. ddof : int (optional) "Delta Degrees of Freedom": the divisor used in computing the variance is ``N - ddof``, where ``N`` is the number of elements. Setting ``ddof = 1`` is equivalent to applying Bessel's correction. axis : int (optional) Axis along which the variance are computed. The default is to compute the variance of the flattened array. stream : pycuda.driver.Stream (optional) Optional CUDA stream in which to perform this calculation keepdims : bool (optional, default False) If True, the axes which are reduced are left in the result as dimensions with size one. Returns ------- out : pycuda.gpuarray.GPUArray variance of elements, or variances of elements along the desired axis. """ def _inplace_pow(x_gpu, p, stream): func = elementwise.get_pow_kernel(x_gpu.dtype) func.prepared_async_call(x_gpu._grid, x_gpu._block, stream, p, x_gpu.gpudata, x_gpu.gpudata, x_gpu.mem_size) if axis is None: m = mean(x_gpu).get() out = x_gpu - m out **= 2 out = _sum_axis(out, axis=None, calc_mean=True, ddof=ddof, out=None, keepdims=keepdims) else: if axis < 0: axis += 2 m = mean(x_gpu, axis=axis) out = add_matvec(x_gpu, -m, axis=1-axis, stream=stream) _inplace_pow(out, 2, stream) out = _sum_axis(out, axis=axis, calc_mean=True, ddof=ddof, out=None, keepdims=keepdims) return out def std(x_gpu, ddof=0, axis=None, stream=None, keepdims=False): """ Compute the standard deviation along the specified axis. Returns the standard deviation of the array elements, a measure of the spread of a distribution. The standard deviation is computed for the flattened array by default, otherwise over the specified axis. Parameters ---------- x_gpu : pycuda.gpuarray.GPUArray Array containing numbers whose std is desired. ddof : int (optional) "Delta Degrees of Freedom": the divisor used in computing the variance is ``N - ddof``, where ``N`` is the number of elements. Setting ``ddof = 1`` is equivalent to applying Bessel's correction. axis : int (optional) Axis along which the std are computed. The default is to compute the std of the flattened array. stream : pycuda.driver.Stream (optional) Optional CUDA stream in which to perform this calculation keepdims : bool (optional, default False) If True, the axes which are reduced are left in the result as dimensions with size one. Returns ------- out : pycuda.gpuarray.GPUArray or float std of elements, or stds of elements along the desired axis. 
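    Notes
    -----
    The result is the square root of `var` computed with the same `ddof`, so
    it matches `numpy.std` when the same `ddof` is used.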
""" def _inplace_pow(x_gpu, p, stream): func = elementwise.get_pow_kernel(x_gpu.dtype) func.prepared_async_call(x_gpu._grid, x_gpu._block, stream, p, x_gpu.gpudata, x_gpu.gpudata, x_gpu.mem_size) if axis is None: return var(x_gpu, ddof=ddof, stream=stream, keepdims=keepdims) ** 0.5 else: out = var(x_gpu, ddof=ddof, axis=axis, stream=stream, keepdims=keepdims) _inplace_pow(out, 0.5, stream) return out @context_dependent_memoize def _get_minmax_kernel(dtype, min_or_max): template = Template(""" #include __global__ void minmax_column_kernel(${type}* mat, ${type}* target, unsigned int *idx_target, unsigned int width, unsigned int height) { __shared__ ${type} max_vals[32]; __shared__ unsigned int max_idxs[32]; ${type} cur_max = ${init_value}; unsigned int cur_idx = 0; ${type} val = 0; for (unsigned int i = threadIdx.x; i < height; i += 32) { val = mat[blockIdx.x + i * width]; if (val ${cmp_op} cur_max) { cur_max = val; cur_idx = i; } } max_vals[threadIdx.x] = cur_max; max_idxs[threadIdx.x] = cur_idx; __syncthreads(); if (threadIdx.x == 0) { cur_max = ${init_value}; cur_idx = 0; for (unsigned int i = 0; i < 32; i++) if (max_vals[i] ${cmp_op} cur_max) { cur_max = max_vals[i]; cur_idx = max_idxs[i]; } target[blockIdx.x] = cur_max; idx_target[blockIdx.x] = cur_idx; } } __global__ void minmax_row_kernel(${type}* mat, ${type}* target, unsigned int* idx_target, unsigned int width, unsigned int height) { __shared__ ${type} max_vals[32]; __shared__ unsigned int max_idxs[32]; ${type} cur_max = ${init_value}; unsigned int cur_idx = 0; ${type} val = 0; for (unsigned int i = threadIdx.x; i < width; i += 32) { val = mat[blockIdx.x * width + i]; if (val ${cmp_op} cur_max) { cur_max = val; cur_idx = i; } } max_vals[threadIdx.x] = cur_max; max_idxs[threadIdx.x] = cur_idx; __syncthreads(); if (threadIdx.x == 0) { cur_max = ${init_value}; cur_idx = 0; for (unsigned int i = 0; i < 32; i++) if (max_vals[i] ${cmp_op} cur_max) { cur_max = max_vals[i]; cur_idx = max_idxs[i]; } target[blockIdx.x] = cur_max; idx_target[blockIdx.x] = cur_idx; } } """) cache_dir=None ctype = dtype_to_ctype(dtype) if min_or_max=='max': iv = str(np.finfo(dtype).min) tmpl = template.substitute(type=ctype, cmp_op='>', init_value=iv) elif min_or_max=='min': iv = str(np.finfo(dtype).max) tmpl = template.substitute(type=ctype, cmp_op='<', init_value=iv) else: raise ValueError('invalid argument') mod = SourceModule(tmpl) minmax_col_kernel = mod.get_function('minmax_column_kernel') minmax_row_kernel = mod.get_function('minmax_row_kernel') return minmax_col_kernel, minmax_row_kernel def _minmax_impl(a_gpu, axis, min_or_max, stream=None, keepdims=False): ''' Returns both max and argmax (min/argmin) along an axis.''' assert len(a_gpu.shape) < 3 if iscomplextype(a_gpu.dtype): raise ValueError("Cannot compute min/max of complex values") if axis is None or len(a_gpu.shape) <= 1: ## Note: PyCUDA doesn't have an overall argmax/argmin! 
out_shape = (1,) * len(a_gpu.shape) if min_or_max == 'max': return gpuarray.max(a_gpu).reshape(out_shape), None else: return gpuarray.min(a_gpu).reshape(out_shape), None else: if axis < 0: axis += 2 assert axis in (0, 1) global _global_cublas_allocator alloc = _global_cublas_allocator n, m = a_gpu.shape if a_gpu.flags.c_contiguous else (a_gpu.shape[1], a_gpu.shape[0]) col_kernel, row_kernel = _get_minmax_kernel(a_gpu.dtype, min_or_max) if (axis == 0 and a_gpu.flags.c_contiguous) or (axis == 1 and a_gpu.flags.f_contiguous): if keepdims: out_shape = (1, m) if axis == 0 else (m, 1) else: out_shape = (m,) target = gpuarray.empty(out_shape, dtype=a_gpu.dtype, allocator=alloc) idx = gpuarray.empty(out_shape, dtype=np.uint32, allocator=alloc) col_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n), block=(32, 1, 1), grid=(m, 1, 1), stream=stream) else: if keepdims: out_shape = (1, n) if axis == 0 else (n, 1) else: out_shape = (n,) target = gpuarray.empty(out_shape, dtype=a_gpu, allocator=alloc) idx = gpuarray.empty(out_shape, dtype=np.uint32, allocator=alloc) row_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n), block=(32, 1, 1), grid=(n, 1, 1), stream=stream) return target, idx def max(a_gpu, axis=None, keepdims=False): ''' Return the maximum of an array or maximum along an axis. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Input array axis : int (optional) Axis along which the maxima are computed. The default is to compute the maximum of the flattened array. keepdims : bool (optional, default False) If True, the axes which are reduced are left in the result as dimensions with size one. Returns ------- out : pycuda.gpuarray.GPUArray or float maximum of elements, or maxima of elements along the desired axis. ''' return _minmax_impl(a_gpu, axis, "max", keepdims=keepdims)[0] def min(a_gpu, axis=None, keepdims=False): ''' Return the minimum of an array or minimum along an axis. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Input array axis : int (optional) Axis along which the minima are computed. The default is to compute the minimum of the flattened array. keepdims : bool (optional, default False) If True, the axes which are reduced are left in the result as dimensions with size one. Returns ------- out : pycuda.gpuarray.GPUArray or float minimum of elements, or minima of elements along the desired axis. ''' return _minmax_impl(a_gpu, axis, "min", keepdims=keepdims)[0] def argmax(a_gpu, axis, keepdims=False): ''' Indices of the maximum values along an axis. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Input array axis : int Axis along which the maxima are computed. keepdims : bool (optional, default False) If True, the axes which are reduced are left in the result as dimensions with size one. Returns ------- out : pycuda.gpuarray.GPUArray Array of indices into the array. ''' if axis is None: raise NotImplementedError("Can't compute global argmax") return _minmax_impl(a_gpu, axis, "max", keepdims=keepdims)[1] def argmin(a_gpu, axis, keepdims=False): ''' Indices of the minimum values along an axis. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Input array axis : int Axis along which the minima are computed. keepdims : bool (optional, default False) If True, the axes which are reduced are left in the result as dimensions with size one. Returns ------- out : pycuda.gpuarray.GPUArray Array of indices into the array. 
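Examples
--------
Illustrative usage (assumes a CUDA-capable device; ``pycuda.autoinit``
creates the context and ``misc.init()`` sets up the allocator used
internally):

>>> import pycuda.autoinit
>>> import pycuda.gpuarray as gpuarray
>>> import numpy as np
>>> import skcuda.misc as misc
>>> misc.init()
>>> a = np.asarray(np.random.rand(4, 4), np.float32)
>>> a_gpu = gpuarray.to_gpu(a)
>>> np.all(misc.argmin(a_gpu, axis=0).get() == np.argmin(a, axis=0))
True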
''' if axis is None: raise NotImplementedError("Can't compute global argmax") return _minmax_impl(a_gpu, axis, "min", keepdims=keepdims)[1] if __name__ == "__main__": import doctest doctest.testmod() scikit-cuda-0.5.1/skcuda/pcula.py000066400000000000000000000611131261465507300166600ustar00rootroot00000000000000#!/usr/bin/env/python """ Python interface to multi-GPU CULA toolkit functions. """ import ctypes import sys import cuda from cula import culaCheckStatus if 'linux' in sys.platform: _libpcula_libname_list = ['libcula_scalapack.so'] elif sys.platform == 'darwin': _libpcula_libname_list = ['libcula_scalapack.dylib'] else: raise RuntimeError('unsupported platform') _load_err = '' for _lib in _libpcula_libname_list: try: _libpcula = ctypes.cdll.LoadLibrary(_lib) except OSError: _load_err += ('' if _load_err == '' else ', ') + _lib else: _load_err = '' break if _load_err: raise OSError('%s not found' % _load_err) class pculaConfig(ctypes.Structure): _fields_ = [ ('ncuda', ctypes.c_int), ('cudaDeviceList', ctypes.c_void_p), ('maxCudaMemoryUsage', ctypes.c_void_p), ('preserveTuningResult', ctypes.c_int), ('dotFileName', ctypes.c_char_p), ('timelineFileName', ctypes.c_char_p)] _libpcula.pculaConfigInit.restype = int _libpcula.pculaConfigInit.argtypes = [ctypes.c_void_p] def pculaConfigInit(config): """ Initialize pCULA configuration structure to sensible defaults. """ status = _libpcula.pculaConfigInit(ctypes.byref(config)) culaCheckStatus(status) # SGEMM, DGEMM, CGEMM, ZGEMM _libpcula.pculaSgemm.restype = int _libpcula.pculaSgemm.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int] def pculaSgemm(config, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc): """ Matrix-matrix product for general matrix. """ status = _libpcula.pculaSgemm(ctypes.byref(config), transa, transb, m, n, k, alpha, int(A), lda, int(B), ldb, beta, int(C), ldc) culaCheckStatus(status) _libpcula.pculaDgemm.restype = int _libpcula.pculaDgemm.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_double, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_double, ctypes.c_void_p, ctypes.c_int] def pculaDgemm(config, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc): """ Matrix-matrix product for general matrix. """ status = _libpcula.pculaDgemm(ctypes.byref(config), transa, transb, m, n, k, alpha, int(A), lda, int(B), ldb, beta, int(C), ldc) culaCheckStatus(status) _libpcula.pculaCgemm.restype = int _libpcula.pculaCgemm.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_int, cuda.cuFloatComplex, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, cuda.cuFloatComplex, ctypes.c_void_p, ctypes.c_int] def pculaCgemm(config, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc): """ Matrix-matrix product for general matrix. 
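Computes ``C = alpha*op(A)*op(B) + beta*C``, where ``op()`` denotes the
identity, transpose, or conjugate transpose selected by `transa`/`transb`
('N', 'T', or 'C'), following the standard BLAS ``gemm`` convention.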
""" status = _libpcula.pculaCgemm(ctypes.byref(config), transa, transb, m, n, k, alpha, int(A), lda, int(B), ldb, beta, int(C), ldc) culaCheckStatus(status) _libpcula.pculaZgemm.restype = int _libpcula.pculaZgemm.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_int, cuda.cuDoubleComplex, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, cuda.cuDoubleComplex, ctypes.c_void_p, ctypes.c_int] def pculaZgemm(config, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc): """ Matrix-matrix product for general matrix. """ status = _libpcula.pculaZgemm(ctypes.byref(config), transa, transb, m, n, k, alpha, int(A), lda, int(B), ldb, beta, int(C), ldc) culaCheckStatus(status) # STRSM, DTRSM, CTRSM, ZTRSM _libpcula.pculaStrsm.restype = int _libpcula.pculaStrsm.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_char, ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_float, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def pculaStrsm(config, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb): """ Triangular system solve. """ status = _libpcula.pculaStrsm(ctypes.byref(config), side, uplo, transa, diag, m, n, alpha, int(a), lda, int(b), ldb) culaCheckStatus(status) _libpcula.pculaDtrsm.restype = int _libpcula.pculaDtrsm.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_char, ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_double, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def pculaDtrsm(config, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb): """ Triangular system solve. """ status = _libpcula.pculaDtrsm(ctypes.byref(config), side, uplo, transa, diag, m, n, alpha, int(a), lda, int(b), ldb) culaCheckStatus(status) _libpcula.pculaCtrsm.restype = int _libpcula.pculaCtrsm.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_char, ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, cuda.cuFloatComplex, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def pculaCtrsm(config, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb): """ Triangular system solve. """ status = _libpcula.pculaCtrsm(ctypes.byref(config), side, uplo, transa, diag, m, n, alpha, int(a), lda, int(b), ldb) culaCheckStatus(status) _libpcula.pculaZtrsm.restype = int _libpcula.pculaZtrsm.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_char, ctypes.c_char, ctypes.c_char, ctypes.c_int, ctypes.c_int, cuda.cuDoubleComplex, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def pculaZtrsm(config, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb): """ Triangular system solve. """ status = _libpcula.pculaZtrsm(ctypes.byref(config), side, uplo, transa, diag, m, n, alpha, int(a), lda, int(b), ldb) culaCheckStatus(status) # SGESV, DGESV, CGESV, ZGESV _libpcula.pculaSgesv.restype = int _libpcula.pculaSgesv.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def pculaSgesv(config, n, nrhs, a, lda, ipiv, b, ldb): """ General system solve using LU decomposition. """ status = _libpcula.pculaSgesv(ctypes.byref(config), n, nrhs, int(a), lda, int(ipiv), int(b), ldb) culaCheckStatus(status) _libpcula.pculaDgesv.restype = int _libpcula.pculaDgesv.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def pculaDgesv(config, n, nrhs, a, lda, ipiv, b, ldb): """ General system solve using LU decomposition. 
""" status = _libpcula.pculaDgesv(ctypes.byref(config), n, nrhs, int(a), lda, int(ipiv), int(b), ldb) culaCheckStatus(status) _libpcula.pculaCgesv.restype = int _libpcula.pculaCgesv.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def pculaCgesv(config, n, nrhs, a, lda, ipiv, b, ldb): """ General system solve using LU decomposition. """ status = _libpcula.pculaCgesv(ctypes.byref(config), n, nrhs, int(a), lda, int(ipiv), int(b), ldb) culaCheckStatus(status) _libpcula.pculaZgesv.restype = int _libpcula.pculaZgesv.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def pculaZgesv(config, n, nrhs, a, lda, ipiv, b, ldb): """ General system solve using LU decomposition. """ status = _libpcula.pculaZgesv(ctypes.byref(config), n, nrhs, int(a), lda, int(ipiv), int(b), ldb) culaCheckStatus(status) # SGETRF, DGETRF, CGETRF, ZGETRF _libpcula.pculaSgetrf.restype = int _libpcula.pculaSgetrf.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def pculaSgetrf(config, m, n, a, lda, ipiv): """ LU decomposition. """ status = _libpcula.pculaSgetrf(ctypes.byref(config), m, n, int(a), lda, int(ipiv)) culaCheckStatus(status) _libpcula.pculaDgetrf.restype = int _libpcula.pculaDgetrf.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def pculaDgetrf(config, m, n, a, lda, ipiv): """ LU decomposition. """ status = _libpcula.pculaDgetrf(ctypes.byref(config), m, n, int(a), lda, int(ipiv)) culaCheckStatus(status) _libpcula.pculaCgetrf.restype = int _libpcula.pculaCgetrf.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def pculaCgetrf(config, m, n, a, lda, ipiv): """ LU decomposition. """ status = _libpcula.pculaCgetrf(ctypes.byref(config), m, n, int(a), lda, int(ipiv)) culaCheckStatus(status) _libpcula.pculaZgetrf.restype = int _libpcula.pculaZgetrf.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p] def pculaZgetrf(config, m, n, a, lda, ipiv): """ LU decomposition. """ status = _libpcula.pculaZgetrf(ctypes.byref(config), m, n, int(a), lda, int(ipiv)) culaCheckStatus(status) # SGETRS, DGETRS, CGETRS, ZGETRS _libpcula.pculaSgetrs.restype = int _libpcula.pculaSgetrs.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def pculaSgetrs(config, trans, n, nrhs, a, lda, ipiv, b, ldb): """ LU solve. """ status = _libpcula.pculaSgetrs(ctypes.byref(config), trans, n, nrhs, int(a), lda, int(ipiv), int(b), ldb) culaCheckStatus(status) _libpcula.pculaDgetrs.restype = int _libpcula.pculaDgetrs.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def pculaDgetrs(config, trans, n, nrhs, a, lda, ipiv, b, ldb): """ LU solve. """ status = _libpcula.pculaDgetrs(ctypes.byref(config), trans, n, nrhs, int(a), lda, int(ipiv), int(b), ldb) culaCheckStatus(status) _libpcula.pculaCgetrs.restype = int _libpcula.pculaCgetrs.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def pculaCgetrs(config, trans, n, nrhs, a, lda, ipiv, b, ldb): """ LU solve. 
""" status = _libpcula.pculaCgetrs(ctypes.byref(config), trans, n, nrhs, int(a), lda, int(ipiv), int(b), ldb) culaCheckStatus(status) _libpcula.pculaZgetrs.restype = int _libpcula.pculaZgetrs.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int] def pculaZgetrs(config, trans, n, nrhs, a, lda, ipiv, b, ldb): """ LU solve. """ status = _libpcula.pculaZgetrs(ctypes.byref(config), trans, n, nrhs, int(a), lda, int(ipiv), int(b), ldb) culaCheckStatus(status) # SPOSV, DPOSV, CPOSV, ZPOSV _libpcula.pculaSposv.restype = int _libpcula.pculaSposv.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def pculaSposv(config, uplo, n, nrhs, a, lda, b, ldb): """ QR factorization. """ status = _libpcula.pculaSposv(ctypes.byref(config), uplo, n, nrhs, int(a), lda, int(b), ldb) culaCheckStatus(status) _libpcula.pculaDposv.restype = int _libpcula.pculaDposv.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def pculaDposv(config, uplo, n, nrhs, a, lda, b, ldb): """ QR factorization. """ status = _libpcula.pculaDposv(ctypes.byref(config), uplo, n, nrhs, int(a), lda, int(b), ldb) culaCheckStatus(status) _libpcula.pculaCposv.restype = int _libpcula.pculaCposv.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def pculaCposv(config, uplo, n, nrhs, a, lda, b, ldb): """ QR factorization. """ status = _libpcula.pculaCposv(ctypes.byref(config), uplo, n, nrhs, int(a), lda, int(b), ldb) culaCheckStatus(status) _libpcula.pculaZposv.restype = int _libpcula.pculaZposv.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def pculaZposv(config, uplo, n, nrhs, a, lda, b, ldb): """ QR factorization. """ status = _libpcula.pculaZposv(ctypes.byref(config), uplo, n, nrhs, int(a), lda, int(b), ldb) culaCheckStatus(status) # SPOTRF, DPOTRF, CPOTRF, ZPOTRF _libpcula.pculaSpotrf.restype = int _libpcula.pculaSpotrf.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def pculaSpotrf(config, uplo, n, a, lda): """ Cholesky decomposition. """ status = _libpcula.pculaSpotrf(ctypes.byref(config), uplo, n, int(a), lda) culaCheckStatus(status) _libpcula.pculaDpotrf.restype = int _libpcula.pculaDpotrf.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def pculaDpotrf(config, uplo, n, a, lda): """ Cholesky decomposition. """ status = _libpcula.pculaDpotrf(ctypes.byref(config), uplo, n, int(a), lda) culaCheckStatus(status) _libpcula.pculaCpotrf.restype = int _libpcula.pculaCpotrf.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def pculaCpotrf(config, uplo, n, a, lda): """ Cholesky decomposition. """ status = _libpcula.pculaCpotrf(ctypes.byref(config), uplo, n, int(a), lda) culaCheckStatus(status) _libpcula.pculaZpotrf.restype = int _libpcula.pculaZpotrf.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def pculaZpotrf(config, uplo, n, a, lda): """ Cholesky decomposition. 
""" status = _libpcula.pculaZpotrf(ctypes.byref(config), uplo, n, int(a), lda) culaCheckStatus(status) # SPOTRS, DPOTRS, CPOTRS, ZPOTRS _libpcula.pculaSpotrs.restype = int _libpcula.pculaSpotrs.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def pculaSpotrs(config, uplo, n, nrhs, a, lda, b, ldb): """ Cholesky solve. """ status = _libpcula.pculaSpotrs(ctypes.byref(config), uplo, n, nrhs, int(a), lda, int(b), ldb) culaCheckStatus(status) _libpcula.pculaDpotrs.restype = int _libpcula.pculaDpotrs.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def pculaDpotrs(config, uplo, n, nrhs, a, lda, b, ldb): """ Cholesky solve. """ status = _libpcula.pculaDpotrs(ctypes.byref(config), uplo, n, nrhs, int(a), lda, int(b), ldb) culaCheckStatus(status) _libpcula.pculaCpotrs.restype = int _libpcula.pculaCpotrs.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def pculaCpotrs(config, uplo, n, nrhs, a, lda, b, ldb): """ Cholesky solve. """ status = _libpcula.pculaCpotrs(ctypes.byref(config), uplo, n, nrhs, int(a), lda, int(b), ldb) culaCheckStatus(status) _libpcula.pculaZpotrs.restype = int _libpcula.pculaZpotrs.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int] def pculaZpotrs(config, uplo, n, nrhs, a, lda, b, ldb): """ Cholesky solve. """ status = _libpcula.pculaZpotrs(ctypes.byref(config), uplo, n, nrhs, int(a), lda, int(b), ldb) culaCheckStatus(status) scikit-cuda-0.5.1/skcuda/rlinalg.py000066400000000000000000000567051261465507300172170ustar00rootroot00000000000000#!/usr/bin/env python """ PyCUDA-based randomized linear algebra functions. """ from __future__ import absolute_import, division from pprint import pprint from string import Template from pycuda.tools import context_dependent_memoize from pycuda.compiler import SourceModule from pycuda.reduction import ReductionKernel from pycuda import curandom import pycuda.gpuarray as gpuarray import pycuda.driver as drv import pycuda.elementwise as el import pycuda.tools as tools import numpy as np from . import cublas from . import misc from . import linalg rand = curandom.MRG32k3aRandomNumberGenerator() import sys if sys.version_info < (3,): range = xrange class LinAlgError(Exception): """Randomized Linear Algebra Error.""" pass try: from . import cula _has_cula = True except (ImportError, OSError): _has_cula = False from .misc import init, add_matvec, div_matvec, mult_matvec from .linalg import hermitian, transpose # Get installation location of C headers: from . import install_headers def rsvd(a_gpu, k=None, p=0, q=0, method="standard", handle=None): """ Randomized Singular Value Decomposition. Randomized algorithm for computing the approximate low-rank singular value decomposition of a rectangular (m, n) matrix `a` with target rank `k << n`. The input matrix a is factored as `a = U * diag(s) * Vt`. The right singluar vectors are the columns of the real or complex unitary matrix `U`. The left singular vectors are the columns of the real or complex unitary matrix `V`. The singular values `s` are non-negative and real numbers. The paramter `p` is a oversampling parameter to improve the approximation. A value between 2 and 10 is recommended. 
The paramter `q` specifies the number of normlized power iterations (subspace iterations) to reduce the approximation error. This is recommended if the the singular values decay slowly and in practice 1 or 2 iterations achive good results. However, computing power iterations is increasing the computational time. If k > (n/1.5), partial SVD or trancated SVD might be faster. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Real/complex input matrix `a` with dimensions `(m, n)`. k : int `k` is the target rank of the low-rank decomposition, k << min(m,n). p : int `p` sets the oversampling parameter (default k=0). q : int `q` sets the number of power iterations (default=0). method : `{'standard', 'fast'}` 'standard' : Standard algorithm as described in [1, 2] 'fast' : Version II algorithm as described in [2] handle : int CUBLAS context. If no context is specified, the default handle from `skcuda.misc._global_cublas_handle` is used. Returns ------- u_gpu : pycuda.gpuarray Right singular values, array of shape `(m, k)`. s_gpu : pycuda.gpuarray Singular values, 1-d array of length `k`. vt_gpu : pycuda.gpuarray Left singular values, array of shape `(k, n)`. Notes ----- Double precision is only supported if the standard version of the CULA Dense toolkit is installed. This function destroys the contents of the input matrix. Arrays are assumed to be stored in column-major order, i.e., order='F'. Input matrix of shape `(m, n)`, where `n>m` is not supported yet. References ---------- N. Halko, P. Martinsson, and J. Tropp. "Finding structure with randomness: probabilistic algorithms for constructing approximate matrix decompositions" (2009). (available at `arXiv `_). S. Voronin and P.Martinsson. "RSVDPACK: Subroutines for computing partial singular value decompositions via randomized sampling on single core, multi core, and GPU architectures" (2015). (available at `arXiv `_). Examples -------- >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> from skcuda import linalg, rlinalg >>> linalg.init() >>> rlinalg.init() >>> #Randomized SVD decomposition of the square matrix `a` with single precision. >>> #Note: There is no gain to use rsvd if k > int(n/1.5) >>> a = np.array(np.random.randn(5, 5), np.float32, order='F') >>> a_gpu = gpuarray.to_gpu(a) >>> U, s, Vt = rlinalg.rsvd(a_gpu, k=5, method='standard') >>> np.allclose(a, np.dot(U.get(), np.dot(np.diag(s.get()), Vt.get())), 1e-4) True >>> #Low-rank SVD decomposition with target rank k=2 >>> a = np.array(np.random.randn(5, 5), np.float32, order='F') >>> a_gpu = gpuarray.to_gpu(a) >>> U, s, Vt = rlinalg.rsvd(a_gpu, k=2, method='standard') """ #************************************************************************* #*** Author: N. 
Benjamin Erichson *** #*** *** #*** License: BSD 3 clause *** #************************************************************************* if not _has_cula: raise NotImplementedError('CULA not installed') if handle is None: handle = misc._global_cublas_handle alloc = misc._global_cublas_allocator # The free version of CULA only supports single precision floating data_type = a_gpu.dtype.type real_type = np.float32 if data_type == np.complex64: cula_func_gesvd = cula.culaDeviceCgesvd cublas_func_gemm = cublas.cublasCgemm copy_func = cublas.cublasCcopy alpha = np.complex64(1.0) beta = np.complex64(0.0) TRANS_type = 'C' isreal = False elif data_type == np.float32: cula_func_gesvd = cula.culaDeviceSgesvd cublas_func_gemm = cublas.cublasSgemm copy_func = cublas.cublasScopy alpha = np.float32(1.0) beta = np.float32(0.0) TRANS_type = 'T' isreal = True else: if cula._libcula_toolkit == 'standard': if data_type == np.complex128: cula_func_gesvd = cula.culaDeviceZgesvd cublas_func_gemm = cublas.cublasZgemm copy_func = cublas.cublasZcopy alpha = np.complex128(1.0) beta = np.complex128(0.0) TRANS_type = 'C' isreal = False elif data_type == np.float64: cula_func_gesvd = cula.culaDeviceDgesvd cublas_func_gemm = cublas.cublasDgemm copy_func = cublas.cublasDcopy alpha = np.float64(1.0) beta = np.float64(0.0) TRANS_type = 'T' isreal = True else: raise ValueError('unsupported type') real_type = np.float64 else: raise ValueError('double precision not supported') #CUDA assumes that arrays are stored in column-major order m, n = np.array(a_gpu.shape, int) if n>m : raise ValueError('input matrix of shape (m,n), where n>m is not supported') #Set k if k == None : raise ValueError('k must be provided') if k > n or k < 1: raise ValueError('k must be 0 < k <= n') kt = k k = k + p if k > n: k=n #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Generate a random sampling matrix O #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Allocate O if isreal==False: Oc_gpu = gpuarray.empty((n,k), real_type, order="F", allocator=alloc) #End if O_gpu = gpuarray.empty((n,k), real_type, order="F", allocator=alloc) #Draw random samples from a ~ Uniform(-1,1) distribution if isreal==True: rand.fill_uniform(O_gpu) O_gpu = O_gpu * 2 - 1 #Scale to [-1,1] else: rand.fill_uniform(O_gpu) rand.fill_uniform(Oc_gpu) O_gpu = (O_gpu + 1j * Oc_gpu) * 2 -1 #End if #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Build sample matrix Y : Y = A * O #Note: Y should approximate the range of A #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Allocate Y Y_gpu = gpuarray.zeros((m,k), data_type, order="F", allocator=alloc) #Dot product Y = A * O cublas_func_gemm(handle, 'n', 'n', m, k, n, alpha, a_gpu.gpudata, m, O_gpu.gpudata, n, beta, Y_gpu.gpudata, m ) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Orthogonalize Y using economic QR decomposition: Y=QR #If q > 0 perfrom q subspace iterations #Note: economic QR just returns Q, and destroys Y_gpu #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if q > 0: Z_gpu = gpuarray.empty((n,k), data_type, order="F", allocator=alloc) for i in np.arange(1, q+1 ): if( (2*i-2)%q == 0 ): Y_gpu = linalg.qr(Y_gpu, 'economic') cublas_func_gemm(handle, TRANS_type, 'n', n, k, m, alpha, a_gpu.gpudata, m, Y_gpu.gpudata, m, beta, Z_gpu.gpudata, n ) if( (2*i-1)%q == 0 ): Z_gpu = linalg.qr(Z_gpu, 'economic') cublas_func_gemm(handle, 'n', 'n', m, k, n, alpha, 
a_gpu.gpudata, m, Z_gpu.gpudata, n, beta, Y_gpu.gpudata, m ) #End for #End if Q_gpu = linalg.qr(Y_gpu, 'economic') #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Project the data matrix a into a lower dimensional subspace #B = Q.T * A #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Allocate B B_gpu = gpuarray.empty((k,n), data_type, order="F", allocator=alloc) cublas_func_gemm(handle, TRANS_type, 'n', k, n, m, alpha, Q_gpu.gpudata, m, a_gpu.gpudata, m, beta, B_gpu.gpudata, k ) if method == 'standard': #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Singular Value Decomposition #Note: B = U" * S * Vt #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #gesvd(jobu, jobvt, m, n, int(a), lda, int(s), int(u), ldu, int(vt), ldvt) #Allocate s, U, Vt for economic SVD #Note: singular values are always real s_gpu = gpuarray.empty(k, real_type, order="F", allocator=alloc) U_gpu = gpuarray.empty((k,k), data_type, order="F", allocator=alloc) Vt_gpu = gpuarray.empty((k,n), data_type, order="F", allocator=alloc) #Economic SVD cula_func_gesvd('S', 'S', k, n, int(B_gpu.gpudata), k, int(s_gpu.gpudata), int(U_gpu.gpudata), k, int(Vt_gpu.gpudata), k) #Compute right singular vectors as U = Q * U" cublas_func_gemm(handle, 'n', 'n', m, k, k, alpha, Q_gpu.gpudata, m, U_gpu.gpudata, k, beta, Q_gpu.gpudata, m ) U_gpu = Q_gpu #Set pointer # Free internal CULA memory: cula.culaFreeBuffers() #Return return U_gpu[ : , 0:kt ], s_gpu[ 0:kt ], Vt_gpu[ 0:kt , : ] elif method == 'fast': #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Orthogonalize B.T using reduced QR decomposition: B.T = Q" * R" #Note: reduced QR returns Q and R, and destroys B_gpu #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if isreal==True: B_gpu = transpose(B_gpu) #transpose B else: B_gpu = hermitian(B_gpu) #transpose B Qstar_gpu, Rstar_gpu = linalg.qr(B_gpu, 'reduced') #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Singular Value Decomposition of R" #Note: R" = U" * S" * Vt" #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #gesvd(jobu, jobvt, m, n, int(a), lda, int(s), int(u), ldu, int(vt), ldvt) #Allocate s, U, Vt for economic SVD #Note: singular values are always real s_gpu = gpuarray.empty(k, real_type, order="F", allocator=alloc) Ustar_gpu = gpuarray.empty((k,k), data_type, order="F", allocator=alloc) Vtstar_gpu = gpuarray.empty((k,k), data_type, order="F", allocator=alloc) #Economic SVD cula_func_gesvd('A', 'A', k, k, int(Rstar_gpu.gpudata), k, int(s_gpu.gpudata), int(Ustar_gpu.gpudata), k, int(Vtstar_gpu.gpudata), k) #Compute right singular vectors as U = Q * Vt.T" cublas_func_gemm(handle, 'n', TRANS_type, m, k, k, alpha, Q_gpu.gpudata, m, Vtstar_gpu.gpudata, k, beta, Q_gpu.gpudata, m ) U_gpu = Q_gpu #Set pointer #Compute left singular vectors as Vt = U".T * Q".T Vt_gpu = gpuarray.empty((k,n), data_type, order="F", allocator=alloc) cublas_func_gemm(handle, TRANS_type, TRANS_type, k, n, k, alpha, Ustar_gpu.gpudata, k, Qstar_gpu.gpudata, n, beta, Vt_gpu.gpudata, k ) # Free internal CULA memory: cula.culaFreeBuffers() #Return return U_gpu[ : , 0:kt ], s_gpu[ 0:kt ], Vt_gpu[ 0:kt , : ] #End if def rdmd(a_gpu, k=None, p=5, q=1, modes='exact', method_rsvd='standard', handle=None): """ Dynamic Mode Decomposition. 
Dynamic Mode Decomposition (DMD) is a data processing algorithm which allows to decompose a matrix `a` in space and time. The matrix `a` is decomposed as `a = FBV`, where the columns of `F` contain the dynamic modes. The modes are ordered corresponding to the amplitudes stored in the diagonal matrix `B`. `V` is a Vandermonde matrix describing the temporal evolution. Parameters ---------- a_gpu : pycuda.gpuarray.GPUArray Real/complex input matrix `a` with dimensions `(m, n)`. k : int, optional If `k < (n-1)` low-rank Dynamic Mode Decomposition is computed. p : int `p` sets the oversampling parameter for rSVD (default k=5). q : int `q` sets the number of power iterations for rSVD (default=1). modes : `{'standard', 'exact'}` 'standard' : uses the standard definition to compute the dynamic modes, `F = U * W`. 'exact' : computes the exact dynamic modes, `F = Y * V * (S**-1) * W`. method_rsvd : `{'standard', 'fast'}` 'standard' : (default) Standard algorithm as described in [1, 2] 'fast' : Version II algorithm as described in [2] handle : int CUBLAS context. If no context is specified, the default handle from `skcuda.misc._global_cublas_handle` is used. Returns ------- f_gpu : pycuda.gpuarray.GPUArray Matrix containing the dynamic modes of shape `(m, n-1)` or `(m, k)`. b_gpu : pycuda.gpuarray.GPUArray 1-D array containing the amplitudes of length `min(n-1, k)`. v_gpu : pycuda.gpuarray.GPUArray Vandermonde matrix of shape `(n-1, n-1)` or `(k, n-1)`. Notes ----- Double precision is only supported if the standard version of the CULA Dense toolkit is installed. This function destroys the contents of the input matrix. Arrays are assumed to be stored in column-major order, i.e., order='F'. References ---------- N. B. Erichson and C. Donovan. "Randomized Low-Rank Dynamic Mode Decomposition for Motion Detection" Under Review. N. Halko, P. Martinsson, and J. Tropp. "Finding structure with randomness: probabilistic algorithms for constructing approximate matrix decompositions" (2009). (available at `arXiv `_). J. H. Tu, et al. "On dynamic mode decomposition: theory and applications." arXiv preprint arXiv:1312.0041 (2013). """ #************************************************************************* #*** Author: N. 
Benjamin Erichson *** #*** <2015> *** #*** License: BSD 3 clause *** #************************************************************************* if not _has_cula: raise NotImplementedError('CULA not installed') if handle is None: handle = misc._global_cublas_handle alloc = misc._global_cublas_allocator # The free version of CULA only supports single precision floating data_type = a_gpu.dtype.type real_type = np.float32 if data_type == np.complex64: cula_func_gesvd = cula.culaDeviceCgesvd cublas_func_gemm = cublas.cublasCgemm cublas_func_dgmm = cublas.cublasCdgmm cula_func_gels = cula.culaDeviceCgels copy_func = cublas.cublasCcopy alpha = np.complex64(1.0) beta = np.complex64(0.0) TRANS_type = 'C' elif data_type == np.float32: cula_func_gesvd = cula.culaDeviceSgesvd cublas_func_gemm = cublas.cublasSgemm cublas_func_dgmm = cublas.cublasSdgmm cula_func_gels = cula.culaDeviceSgels copy_func = cublas.cublasScopy alpha = np.float32(1.0) beta = np.float32(0.0) TRANS_type = 'T' else: if cula._libcula_toolkit == 'standard': if data_type == np.complex128: cula_func_gesvd = cula.culaDeviceZgesvd cublas_func_gemm = cublas.cublasZgemm cublas_func_dgmm = cublas.cublasZdgmm cula_func_gels = cula.culaDeviceZgels copy_func = cublas.cublasZcopy alpha = np.complex128(1.0) beta = np.complex128(0.0) TRANS_type = 'C' elif data_type == np.float64: cula_func_gesvd = cula.culaDeviceDgesvd cublas_func_gemm = cublas.cublasDgemm cublas_func_dgmm = cublas.cublasDdgmm cula_func_gels = cula.culaDeviceDgels copy_func = cublas.cublasDcopy alpha = np.float64(1.0) beta = np.float64(0.0) TRANS_type = 'T' else: raise ValueError('unsupported type') real_type = np.float64 else: raise ValueError('double precision not supported') #CUDA assumes that arrays are stored in column-major order m, n = np.array(a_gpu.shape, int) nx = n-1 #Set k if k == None : k = nx if k > nx or k < 1: raise ValueError('k is not valid') #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Split data into lef and right snapshot sequence #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Note: we need a copy of X_gpu, because SVD destroys X_gpu #While Y_gpu is just a pointer X_gpu = gpuarray.empty((m, n), data_type, order="F", allocator=alloc) copy_func(handle, X_gpu.size, int(a_gpu.gpudata), 1, int(X_gpu.gpudata), 1) X_gpu = X_gpu[:, :nx] Y_gpu = a_gpu[:, 1:] #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Randomized Singular Value Decomposition #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ U_gpu, s_gpu, Vt_gpu = rsvd(X_gpu, k=k, p=p, q=q, method=method_rsvd, handle=handle) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Solve the LS problem to find estimate for M using the pseudo-inverse #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #real: M = U.T * Y * Vt.T * S**-1 #complex: M = U.H * Y * Vt.H * S**-1 #Let G = Y * Vt.H * S**-1, hence M = M * G #Allocate G and M G_gpu = gpuarray.empty((m,k), data_type, order="F", allocator=alloc) M_gpu = gpuarray.empty((k,k), data_type, order="F", allocator=alloc) #i) s = s **-1 (inverse) if data_type == np.complex64 or data_type == np.complex128: s_gpu = 1/s_gpu s_gpu = s_gpu + 1j * gpuarray.zeros_like(s_gpu) else: s_gpu = 1/s_gpu #ii) real/complex: scale Vt = diag(s**-1) * Vt cublas_func_dgmm(handle, 'l', k, k, int(Vt_gpu.gpudata), k, int(s_gpu.gpudata), 1, int(Vt_gpu.gpudata), k) #iii) real: G = Y * (S**-1 * Vt).T, complex: G = Y * (S**-1 * Vt).H 
cublas_func_gemm(handle, 'n', TRANS_type, m, k, k, alpha, int(Y_gpu.gpudata), m, int(Vt_gpu.gpudata), k, beta, int(G_gpu.gpudata), m ) #iv) real/complex: M = M * G cublas_func_gemm(handle, TRANS_type, 'n', k, k, m, alpha, int(U_gpu.gpudata), m, int(G_gpu.gpudata), m, beta, int(M_gpu.gpudata), k ) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Eigen Decomposition #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Note: If a_gpu is real the imag part is omitted Vr_gpu, w_gpu = linalg.eig(M_gpu, 'N', 'V', 'F') #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Compute DMD Modes #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ F_gpu = gpuarray.empty((m,k), data_type, order="F", allocator=alloc) modes = modes.lower() if modes == 'exact': #Compute (exact) DMD modes: F = Y * V * S**-1 * W = G * W cublas_func_gemm(handle, 'n', 'n', m, k, k, alpha, G_gpu.gpudata, m, Vr_gpu.gpudata, k, beta, G_gpu.gpudata, m ) F_gpu_temp = G_gpu elif modes == 'standard': #Compute (standard) DMD modes: F = U * W cublas_func_gemm(handle, 'n', 'n', m, k, k, alpha, U_gpu.gpudata, m, Vr_gpu.gpudata, k, beta, U_gpu.gpudata, m ) F_gpu_temp = U_gpu else: raise ValueError('Type of modes is not supported, choose "exact" or "standard".') #Copy is required, because gels destroys input copy_func(handle, F_gpu_temp.size, int(F_gpu_temp.gpudata), 1, int(F_gpu.gpudata), 1) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Compute amplitueds b using least-squares: Fb=x1 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #x1_gpu = a_gpu[:,0].copy() x1_gpu = gpuarray.empty(m, data_type, order="F", allocator=alloc) copy_func(handle, x1_gpu.size, int(a_gpu[:,0].gpudata), 1, int(x1_gpu.gpudata), 1) cula_func_gels( 'N', m, k, int(1) , F_gpu_temp.gpudata, m, x1_gpu.gpudata, m) b_gpu = x1_gpu #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Compute Vandermonde matrix (CPU) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ V_gpu = linalg.vander(w_gpu, n=nx) # Free internal CULA memory: cula.culaFreeBuffers() #Return return F_gpu, b_gpu[:k], V_gpu if __name__ == "__main__": import doctest doctest.testmod() scikit-cuda-0.5.1/skcuda/special.py000066400000000000000000000114431261465507300171750ustar00rootroot00000000000000#!/usr/bin/env python """ PyCUDA-based special functions. """ import os import pycuda.gpuarray as gpuarray import pycuda.elementwise as elementwise import numpy as np from . import misc from .misc import init # Get installation location of C headers: from . import install_headers def sici(x_gpu): """ Sine/Cosine integral. Computes the sine and cosine integral of every element in the input matrix. Parameters ---------- x_gpu : GPUArray Input matrix of shape `(m, n)`. Returns ------- (si_gpu, ci_gpu) : tuple of GPUArrays Tuple of GPUarrays containing the sine integrals and cosine integrals of the entries of `x_gpu`. 
Examples -------- >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import scipy.special >>> import special >>> x = np.array([[1, 2], [3, 4]], np.float32) >>> x_gpu = gpuarray.to_gpu(x) >>> (si_gpu, ci_gpu) = sici(x_gpu) >>> (si, ci) = scipy.special.sici(x) >>> np.allclose(si, si_gpu.get()) True >>> np.allclose(ci, ci_gpu.get()) True """ if x_gpu.dtype == np.float32: args = 'float *x, float *si, float *ci' op = 'sicif(x[i], &si[i], &ci[i])' elif x_gpu.dtype == np.float64: args = 'double *x, double *si, double *ci' op = 'sici(x[i], &si[i], &ci[i])' else: raise ValueError('unsupported type') try: func = sici.cache[x_gpu.dtype] except KeyError: func = elementwise.ElementwiseKernel(args, op, options=["-I", install_headers], preamble='#include "cuSpecialFuncs.h"') sici.cache[x_gpu.dtype] = func si_gpu = gpuarray.empty_like(x_gpu) ci_gpu = gpuarray.empty_like(x_gpu) func(x_gpu, si_gpu, ci_gpu) return (si_gpu, ci_gpu) sici.cache = {} def exp1(z_gpu): """ Exponential integral with `n = 1` of complex arguments. Parameters ---------- z_gpu : GPUArray Input matrix of shape `(m, n)`. Returns ------- e_gpu : GPUArray GPUarrays containing the exponential integrals of the entries of `z_gpu`. Examples -------- >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import scipy.special >>> import special >>> z = np.asarray(np.random.rand(4, 4)+1j*np.random.rand(4, 4), np.complex64) >>> z_gpu = gpuarray.to_gpu(z) >>> e_gpu = exp1(z_gpu) >>> e_sp = scipy.special.exp1(z) >>> np.allclose(e_sp, e_gpu.get()) True """ if z_gpu.dtype == np.complex64: args = 'pycuda::complex *z, pycuda::complex *e' elif z_gpu.dtype == np.complex128: args = 'pycuda::complex *z, pycuda::complex *e' else: raise ValueError('unsupported type') op = 'e[i] = exp1(z[i])' try: func = exp1.cache[z_gpu.dtype] except KeyError: func = elementwise.ElementwiseKernel(args, op, options=["-I", install_headers], preamble='#include "cuSpecialFuncs.h"') exp1.cache[z_gpu.dtype] = func e_gpu = gpuarray.empty_like(z_gpu) func(z_gpu, e_gpu) return e_gpu exp1.cache = {} def expi(z_gpu): """ Exponential integral of complex arguments. Parameters ---------- z_gpu : GPUArray Input matrix of shape `(m, n)`. Returns ------- e_gpu : GPUArray GPUarrays containing the exponential integrals of the entries of `z_gpu`. Examples -------- >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import scipy.special >>> import special >>> z = np.asarray(np.random.rand(4, 4)+1j*np.random.rand(4, 4), np.complex64) >>> z_gpu = gpuarray.to_gpu(z) >>> e_gpu = expi(z_gpu) >>> e_sp = scipy.special.expi(z) >>> np.allclose(e_sp, e_gpu.get()) True """ if z_gpu.dtype == np.complex64: args = 'pycuda::complex *z, pycuda::complex *e' elif z_gpu.dtype == np.complex128: args = 'pycuda::complex *z, pycuda::complex *e' else: raise ValueError('unsupported type') op = 'e[i] = expi(z[i])' try: func = expi.cache[z_gpu.dtype] except KeyError: func = elementwise.ElementwiseKernel(args, op, options=["-I", install_headers], preamble='#include "cuSpecialFuncs.h"') expi.cache[z_gpu.dtype] = func e_gpu = gpuarray.empty_like(z_gpu) func(z_gpu, e_gpu) return e_gpu expi.cache = {} if __name__ == "__main__": import doctest doctest.testmod() scikit-cuda-0.5.1/skcuda/utils.py000066400000000000000000000131241261465507300167130ustar00rootroot00000000000000#!/usr/bin/env python """ Utility functions. 
""" import sys import ctypes.util import os import re import subprocess import struct import sys if sys.version_info < (3,): range = xrange try: import elftools except ImportError: import re def get_soname(filename): """ Retrieve SONAME of shared library. Parameters ---------- filename : str Full path to shared library. Returns ------- soname : str SONAME of shared library. Notes ----- This function uses the `objdump` system command on linux and 'otool' on Mac OS X (darwin). """ if sys.platform == 'darwin': cmds = ['otool', '-L', filename] else: # Fallback to linux... what about windows? cmds = ['objdump', '-p', filename] try: p = subprocess.Popen(cmds, stdout=subprocess.PIPE) out = p.communicate()[0].decode() except: raise RuntimeError('error executing {0}'.format(cmds)) if sys.platform == 'darwin': result = re.search('^\s@rpath/(lib.+.dylib)', out, re.MULTILINE) else: result = re.search('^\s+SONAME\s+(.+)$',out,re.MULTILINE) if result: return result.group(1) else: # No SONAME found: raise RuntimeError('no library name found for {0}'.format( (filename,))) else: import ctypes import elftools.elf.elffile as elffile import elftools.construct.macros as macros import elftools.elf.structs as structs def get_soname(filename): """ Retrieve SONAME of shared library. Parameters ---------- filename : str Full path to shared library. Returns ------- soname : str SONAME of shared library. Notes ----- This function uses the pyelftools [ELF] package. References ---------- .. [ELF] http://pypi.python.org/pypi/pyelftools """ stream = open(filename, 'rb') f = elffile.ELFFile(stream) dynamic = f.get_section_by_name('.dynamic') dynstr = f.get_section_by_name('.dynstr') # Handle libraries built for different machine architectures: if f.header['e_machine'] == 'EM_X86_64': st = structs.Struct('Elf64_Dyn', macros.ULInt64('d_tag'), macros.ULInt64('d_val')) elif f.header['e_machine'] == 'EM_386': st = structs.Struct('Elf32_Dyn', macros.ULInt32('d_tag'), macros.ULInt32('d_val')) else: raise RuntimeError('unsupported machine architecture') entsize = dynamic['sh_entsize'] for k in range(dynamic['sh_size']/entsize): result = st.parse(dynamic.data()[k*entsize:(k+1)*entsize]) # The following value for the SONAME tag is specified in elf.h: if result.d_tag == 14: return dynstr.get_string(result.d_val) # No SONAME found: return '' def find_lib_path(name): """ Find full path of a shared library. Searches for the full path of a shared library. On MacOSX and Posix operating systems, this function checks the directories listed in LD_LIBRARY_PATH (if any) and in the ld.so cache. Parameter --------- name : str Link name of library, e.g., cublas for libcublas.so.*. Returns ------- path : str Full path to library. Notes ----- Code adapted from ctypes.util module. Doesn't check whether the architectures of libraries found in LD_LIBRARY_PATH directories conform to that of the machine. 
""" if sys.platform == 'win32': return ctypes.util.find_library(name) # OSX has no ldconfig, search the DYLD_LIBRARY_PATH directories if sys.platform == 'darwin': # hacky, but as far as I know this is always a symlink # to the latest version of the library available libname = 'lib' + name + '.dylib' for dir_path in os.environ['DYLD_LIBRARY_PATH'].split(':'): if len(dir_path) > 0 and libname in os.listdir(dir_path): return os.path.join(dir_path, libname) return None # First, check the directories in LD_LIBRARY_PATH: expr = r'\s+(lib%s\.[^\s]+)\s+\-\>' % re.escape(name) for dir_path in filter(len, os.environ.get('LD_LIBRARY_PATH', '').split(':')): f = os.popen('/sbin/ldconfig -Nnv %s 2>/dev/null' % dir_path) try: data = f.read() finally: f.close() res = re.search(expr, data) if res: return os.path.join(dir_path, res.group(1)) # Next, check the ld.so cache: uname = os.uname()[4] if uname.startswith("arm"): uname = "arm" if struct.calcsize('l') == 4: machine = uname + '-32' else: machine = uname + '-64' mach_map = { 'x86_64-64': 'libc6,x86-64', 'ppc64-64': 'libc6,64bit', 'sparc64-64': 'libc6,64bit', 's390x-64': 'libc6,64bit', 'ia64-64': 'libc6,IA-64', 'arm-32': 'libc6(,hard-float)?', } abi_type = mach_map.get(machine, 'libc6') expr = r'\s+lib%s\.[^\s]+\s+\(%s.*\=\>\s(.+)' % (re.escape(name), abi_type) f = os.popen('/sbin/ldconfig -p 2>/dev/null') try: data = f.read() finally: f.close() res = re.search(expr, data) if not res: return None return res.group(1) scikit-cuda-0.5.1/skcuda/version.py000066400000000000000000000001231261465507300172330ustar00rootroot00000000000000import pkg_resources __version__ = pkg_resources.require('scikit-cuda')[0].version scikit-cuda-0.5.1/tests/000077500000000000000000000000001261465507300150705ustar00rootroot00000000000000scikit-cuda-0.5.1/tests/test_cublas.py000066400000000000000000000727451261465507300177710ustar00rootroot00000000000000#!/usr/bin/env python """ Unit tests for scikits.cuda.cublas """ from unittest import main, makeSuite, TestCase, TestSuite import pycuda.autoinit import pycuda.gpuarray as gpuarray import numpy as np _SEPS = np.finfo(np.float32).eps _DEPS = np.finfo(np.float64).eps import skcuda.cublas as cublas import skcuda.misc as misc def bptrs(a): """ Pointer array when input represents a batch of matrices. """ return gpuarray.arange(a.ptr,a.ptr+a.shape[0]*a.strides[0],a.strides[0], dtype=cublas.ctypes.c_void_p) class test_cublas(TestCase): def setUp(self): np.random.seed(23) # For reproducible tests. 
self.cublas_handle = cublas.cublasCreate() def tearDown(self): cublas.cublasDestroy(self.cublas_handle) # ISAMAX, IDAMAX, ICAMAX, IZAMAX def test_cublasIsamax(self): x = np.random.rand(5).astype(np.float32) x_gpu = gpuarray.to_gpu(x) result = cublas.cublasIsamax(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1) assert np.allclose(result, np.argmax(x)) def test_cublasIdamax(self): x = np.random.rand(5).astype(np.float64) x_gpu = gpuarray.to_gpu(x) result = cublas.cublasIdamax(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1) assert np.allclose(result, np.argmax(x)) def test_cublasIcamax(self): x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64) x_gpu = gpuarray.to_gpu(x) result = cublas.cublasIcamax(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1) assert np.allclose(result, np.argmax(np.abs(x.real) + np.abs(x.imag))) def test_cublasIzamax(self): x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128) x_gpu = gpuarray.to_gpu(x) result = cublas.cublasIzamax(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1) assert np.allclose(result, np.argmax(np.abs(x.real) + np.abs(x.imag))) # ISAMIN, IDAMIN, ICAMIN, IZAMIN def test_cublasIsamin(self): x = np.random.rand(5).astype(np.float32) x_gpu = gpuarray.to_gpu(x) result = cublas.cublasIsamin(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1) assert np.allclose(result, np.argmin(x)) def test_cublasIdamin(self): x = np.random.rand(5).astype(np.float64) x_gpu = gpuarray.to_gpu(x) result = cublas.cublasIdamin(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1) assert np.allclose(result, np.argmin(x)) def test_cublasIcamin(self): x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64) x_gpu = gpuarray.to_gpu(x) result = cublas.cublasIcamin(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1) assert np.allclose(result, np.argmin(np.abs(x.real) + np.abs(x.imag))) def test_cublasIzamin(self): x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128) x_gpu = gpuarray.to_gpu(x) result = cublas.cublasIzamin(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1) assert np.allclose(result, np.argmin(np.abs(x.real) + np.abs(x.imag))) # SASUM, DASUM, SCASUM, DZASUM def test_cublasSasum(self): x = np.random.rand(5).astype(np.float32) x_gpu = gpuarray.to_gpu(x) result = cublas.cublasSasum(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1) assert np.allclose(result, np.sum(np.abs(x))) def test_cublasDasum(self): x = np.random.rand(5).astype(np.float64) x_gpu = gpuarray.to_gpu(x) result = cublas.cublasDasum(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1) assert np.allclose(result, np.sum(np.abs(x))) def test_cublasScasum(self): x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64) x_gpu = gpuarray.to_gpu(x) result = cublas.cublasScasum(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1) assert np.allclose(result, np.sum(np.abs(x.real)+np.abs(x.imag))) def test_cublasDzasum(self): x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128) x_gpu = gpuarray.to_gpu(x) result = cublas.cublasDzasum(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1) assert np.allclose(result, np.sum(np.abs(x.real)+np.abs(x.imag))) # SAXPY, DAXPY, CAXPY, ZAXPY def test_cublasSaxpy(self): alpha = np.float32(np.random.rand()) x = np.random.rand(5).astype(np.float32) x_gpu = gpuarray.to_gpu(x) y = np.random.rand(5).astype(np.float32) y_gpu = gpuarray.to_gpu(y) cublas.cublasSaxpy(self.cublas_handle, x_gpu.size, alpha, x_gpu.gpudata, 1, y_gpu.gpudata, 1) assert np.allclose(y_gpu.get(), alpha*x+y) def test_cublasDaxpy(self): alpha = 
np.float64(np.random.rand()) x = np.random.rand(5).astype(np.float64) x_gpu = gpuarray.to_gpu(x) y = np.random.rand(5).astype(np.float64) y_gpu = gpuarray.to_gpu(y) cublas.cublasDaxpy(self.cublas_handle, x_gpu.size, alpha, x_gpu.gpudata, 1, y_gpu.gpudata, 1) assert np.allclose(y_gpu.get(), alpha*x+y) def test_cublasCaxpy(self): alpha = np.complex64(np.random.rand()+1j*np.random.rand()) x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64) x_gpu = gpuarray.to_gpu(x) y = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64) y_gpu = gpuarray.to_gpu(y) cublas.cublasCaxpy(self.cublas_handle, x_gpu.size, alpha, x_gpu.gpudata, 1, y_gpu.gpudata, 1) assert np.allclose(y_gpu.get(), alpha*x+y) def test_cublasZaxpy(self): alpha = np.complex128(np.random.rand()+1j*np.random.rand()) x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128) x_gpu = gpuarray.to_gpu(x) y = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128) y_gpu = gpuarray.to_gpu(y) cublas.cublasZaxpy(self.cublas_handle, x_gpu.size, alpha, x_gpu.gpudata, 1, y_gpu.gpudata, 1) assert np.allclose(y_gpu.get(), alpha*x+y) # SCOPY, DCOPY, CCOPY, ZCOPY def test_cublasScopy(self): x = np.random.rand(5).astype(np.float32) x_gpu = gpuarray.to_gpu(x) y_gpu = gpuarray.zeros_like(x_gpu) cublas.cublasScopy(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1, y_gpu.gpudata, 1) assert np.allclose(y_gpu.get(), x_gpu.get()) def test_cublasDcopy(self): x = np.random.rand(5).astype(np.float64) x_gpu = gpuarray.to_gpu(x) y_gpu = gpuarray.zeros_like(x_gpu) cublas.cublasDcopy(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1, y_gpu.gpudata, 1) assert np.allclose(y_gpu.get(), x_gpu.get()) def test_cublasCcopy(self): x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64) x_gpu = gpuarray.to_gpu(x) y_gpu = misc.zeros_like(x_gpu) cublas.cublasCcopy(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1, y_gpu.gpudata, 1) assert np.allclose(y_gpu.get(), x_gpu.get()) def test_cublasZcopy(self): x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128) x_gpu = gpuarray.to_gpu(x) y_gpu = misc.zeros_like(x_gpu) cublas.cublasZcopy(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1, y_gpu.gpudata, 1) assert np.allclose(y_gpu.get(), x_gpu.get()) # SDOT, DDOT, CDOTU, CDOTC, ZDOTU, ZDOTC def test_cublasSdot(self): x = np.random.rand(5).astype(np.float32) x_gpu = gpuarray.to_gpu(x) y = np.random.rand(5).astype(np.float32) y_gpu = gpuarray.to_gpu(y) result = cublas.cublasSdot(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1, y_gpu.gpudata, 1) assert np.allclose(result, np.dot(x, y)) def test_cublasDdot(self): x = np.random.rand(5).astype(np.float64) x_gpu = gpuarray.to_gpu(x) y = np.random.rand(5).astype(np.float64) y_gpu = gpuarray.to_gpu(y) result = cublas.cublasDdot(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1, y_gpu.gpudata, 1) assert np.allclose(result, np.dot(x, y)) def test_cublasCdotu(self): x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64) x_gpu = gpuarray.to_gpu(x) y = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64) y_gpu = gpuarray.to_gpu(y) result = cublas.cublasCdotu(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1, y_gpu.gpudata, 1) assert np.allclose(result, np.dot(x, y)) def test_cublasCdotc(self): x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64) x_gpu = gpuarray.to_gpu(x) y = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64) y_gpu = gpuarray.to_gpu(y) result = cublas.cublasCdotc(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1, y_gpu.gpudata, 1) 
assert np.allclose(result, np.dot(np.conj(x), y)) def test_cublasZdotu(self): x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128) x_gpu = gpuarray.to_gpu(x) y = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128) y_gpu = gpuarray.to_gpu(y) result = cublas.cublasZdotu(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1, y_gpu.gpudata, 1) assert np.allclose(result, np.dot(x, y)) def test_cublasZdotc(self): x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128) x_gpu = gpuarray.to_gpu(x) y = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128) y_gpu = gpuarray.to_gpu(y) result = cublas.cublasZdotc(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1, y_gpu.gpudata, 1) assert np.allclose(result, np.dot(np.conj(x), y)) # SNRM2, DNRM2, SCNRM2, DZNRM2 def test_cublasSrnm2(self): x = np.random.rand(5).astype(np.float32) x_gpu = gpuarray.to_gpu(x) result = cublas.cublasSnrm2(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1) assert np.allclose(result, np.linalg.norm(x)) def test_cublasDrnm2(self): x = np.random.rand(5).astype(np.float64) x_gpu = gpuarray.to_gpu(x) result = cublas.cublasDnrm2(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1) assert np.allclose(result, np.linalg.norm(x)) def test_cublasScrnm2(self): x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64) x_gpu = gpuarray.to_gpu(x) result = cublas.cublasScnrm2(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1) assert np.allclose(result, np.linalg.norm(x)) def test_cublasDzrnm2(self): x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128) x_gpu = gpuarray.to_gpu(x) result = cublas.cublasDznrm2(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1) assert np.allclose(result, np.linalg.norm(x)) # SSCAL, DSCAL, CSCAL, CSSCAL, ZSCAL, ZDSCAL def test_cublasSscal(self): x = np.random.rand(5).astype(np.float32) x_gpu = gpuarray.to_gpu(x) alpha = np.float32(np.random.rand()) cublas.cublasSscal(self.cublas_handle, x_gpu.size, alpha, x_gpu.gpudata, 1) assert np.allclose(x_gpu.get(), alpha*x) def test_cublasCscal(self): x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64) x_gpu = gpuarray.to_gpu(x) alpha = np.complex64(np.random.rand()+1j*np.random.rand()) cublas.cublasCscal(self.cublas_handle, x_gpu.size, alpha, x_gpu.gpudata, 1) assert np.allclose(x_gpu.get(), alpha*x) def test_cublasCsscal(self): x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64) x_gpu = gpuarray.to_gpu(x) alpha = np.float32(np.random.rand()) cublas.cublasCscal(self.cublas_handle, x_gpu.size, alpha, x_gpu.gpudata, 1) assert np.allclose(x_gpu.get(), alpha*x) def test_cublasDscal(self): x = np.random.rand(5).astype(np.float64) x_gpu = gpuarray.to_gpu(x) alpha = np.float64(np.random.rand()) cublas.cublasDscal(self.cublas_handle, x_gpu.size, alpha, x_gpu.gpudata, 1) assert np.allclose(x_gpu.get(), alpha*x) def test_cublasZscal(self): x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128) x_gpu = gpuarray.to_gpu(x) alpha = np.complex128(np.random.rand()+1j*np.random.rand()) cublas.cublasZscal(self.cublas_handle, x_gpu.size, alpha, x_gpu.gpudata, 1) assert np.allclose(x_gpu.get(), alpha*x) def test_cublasZdscal(self): x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128) x_gpu = gpuarray.to_gpu(x) alpha = np.float64(np.random.rand()) cublas.cublasZdscal(self.cublas_handle, x_gpu.size, alpha, x_gpu.gpudata, 1) assert np.allclose(x_gpu.get(), alpha*x) # SROT, DROT, CROT, CSROT, ZROT, ZDROT def test_cublasSrot(self): x = np.array([1, 2, 3]).astype(np.float32) y = np.array([4, 5, 
6]).astype(np.float32) s = 2.0 c = 3.0 x_gpu = gpuarray.to_gpu(x) y_gpu = gpuarray.to_gpu(x) cublas.cublasSrot(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1, y_gpu.gpudata, 1, c, s) assert np.allclose(x_gpu.get(), [5, 10, 15]) assert np.allclose(y_gpu.get(), [1, 2, 3]) # SSWAP, DSWAP, CSWAP, ZSWAP def test_cublasSswap(self): x = np.random.rand(5).astype(np.float32) x_gpu = gpuarray.to_gpu(x) y = np.random.rand(5).astype(np.float32) y_gpu = gpuarray.to_gpu(y) cublas.cublasSswap(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1, y_gpu.gpudata, 1) assert np.allclose(x_gpu.get(), y) def test_cublasDswap(self): x = np.random.rand(5).astype(np.float64) x_gpu = gpuarray.to_gpu(x) y = np.random.rand(5).astype(np.float64) y_gpu = gpuarray.to_gpu(y) cublas.cublasDswap(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1, y_gpu.gpudata, 1) assert np.allclose(x_gpu.get(), y) def test_cublasCswap(self): x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64) x_gpu = gpuarray.to_gpu(x) y = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64) y_gpu = gpuarray.to_gpu(y) cublas.cublasCswap(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1, y_gpu.gpudata, 1) assert np.allclose(x_gpu.get(), y) def test_cublasZswap(self): x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128) x_gpu = gpuarray.to_gpu(x) y = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex128) y_gpu = gpuarray.to_gpu(y) cublas.cublasZswap(self.cublas_handle, x_gpu.size, x_gpu.gpudata, 1, y_gpu.gpudata, 1) assert np.allclose(x_gpu.get(), y) # SGEMV, DGEMV, CGEMV, ZGEMV def test_cublasSgemv(self): a = np.random.rand(2, 3).astype(np.float32) x = np.random.rand(3, 1).astype(np.float32) a_gpu = gpuarray.to_gpu(a.T.copy()) x_gpu = gpuarray.to_gpu(x) y_gpu = gpuarray.empty((2, 1), np.float32) alpha = np.float32(1.0) beta = np.float32(0.0) cublas.cublasSgemv(self.cublas_handle, 'n', 2, 3, alpha, a_gpu.gpudata, 2, x_gpu.gpudata, 1, beta, y_gpu.gpudata, 1) assert np.allclose(y_gpu.get(), np.dot(a, x)) def test_cublasDgemv(self): a = np.random.rand(2, 3).astype(np.float64) x = np.random.rand(3, 1).astype(np.float64) a_gpu = gpuarray.to_gpu(a.T.copy()) x_gpu = gpuarray.to_gpu(x) y_gpu = gpuarray.empty((2, 1), np.float64) alpha = np.float64(1.0) beta = np.float64(0.0) cublas.cublasDgemv(self.cublas_handle, 'n', 2, 3, alpha, a_gpu.gpudata, 2, x_gpu.gpudata, 1, beta, y_gpu.gpudata, 1) assert np.allclose(y_gpu.get(), np.dot(a, x)) def test_cublasCgemv(self): a = (np.random.rand(2, 3)+1j*np.random.rand(2, 3)).astype(np.complex64) x = (np.random.rand(3, 1)+1j*np.random.rand(3, 1)).astype(np.complex64) a_gpu = gpuarray.to_gpu(a.T.copy()) x_gpu = gpuarray.to_gpu(x) y_gpu = gpuarray.empty((2, 1), np.complex64) alpha = np.complex64(1.0) beta = np.complex64(0.0) cublas.cublasCgemv(self.cublas_handle, 'n', 2, 3, alpha, a_gpu.gpudata, 2, x_gpu.gpudata, 1, beta, y_gpu.gpudata, 1) assert np.allclose(y_gpu.get(), np.dot(a, x)) def test_cublasZgemv(self): a = (np.random.rand(2, 3)+1j*np.random.rand(2, 3)).astype(np.complex128) x = (np.random.rand(3, 1)+1j*np.random.rand(3, 1)).astype(np.complex128) a_gpu = gpuarray.to_gpu(a.T.copy()) x_gpu = gpuarray.to_gpu(x) y_gpu = gpuarray.empty((2, 1), np.complex128) alpha = np.complex128(1.0) beta = np.complex128(0.0) cublas.cublasZgemv(self.cublas_handle, 'n', 2, 3, alpha, a_gpu.gpudata, 2, x_gpu.gpudata, 1, beta, y_gpu.gpudata, 1) assert np.allclose(y_gpu.get(), np.dot(a, x)) # SGEAM, CGEAM, DGEAM, ZDGEAM def test_cublasSgeam(self): a = np.random.rand(2, 3).astype(np.float32) b = 
np.random.rand(2, 3).astype(np.float32) a_gpu = gpuarray.to_gpu(a.copy()) b_gpu = gpuarray.to_gpu(b.copy()) c_gpu = gpuarray.zeros_like(a_gpu) alpha = np.float32(np.random.rand()) beta = np.float32(np.random.rand()) cublas.cublasSgeam(self.cublas_handle, 'n', 'n', 2, 3, alpha, a_gpu.gpudata, 2, beta, b_gpu.gpudata, 2, c_gpu.gpudata, 2) assert np.allclose(c_gpu.get(), alpha*a+beta*b) def test_cublasCgeam(self): a = (np.random.rand(2, 3)+1j*np.random.rand(2, 3)).astype(np.complex64) b = (np.random.rand(2, 3)+1j*np.random.rand(2, 3)).astype(np.complex64) a_gpu = gpuarray.to_gpu(a.copy()) b_gpu = gpuarray.to_gpu(b.copy()) c_gpu = gpuarray.zeros_like(a_gpu) alpha = np.complex64(np.random.rand()+1j*np.random.rand()) beta = np.complex64(np.random.rand()+1j*np.random.rand()) cublas.cublasCgeam(self.cublas_handle, 'n', 'n', 2, 3, alpha, a_gpu.gpudata, 2, beta, b_gpu.gpudata, 2, c_gpu.gpudata, 2) assert np.allclose(c_gpu.get(), alpha*a+beta*b) def test_cublasDgeam(self): a = np.random.rand(2, 3).astype(np.float64) b = np.random.rand(2, 3).astype(np.float64) a_gpu = gpuarray.to_gpu(a.copy()) b_gpu = gpuarray.to_gpu(b.copy()) c_gpu = gpuarray.zeros_like(a_gpu) alpha = np.float64(np.random.rand()) beta = np.float64(np.random.rand()) cublas.cublasDgeam(self.cublas_handle, 'n', 'n', 2, 3, alpha, a_gpu.gpudata, 2, beta, b_gpu.gpudata, 2, c_gpu.gpudata, 2) assert np.allclose(c_gpu.get(), alpha*a+beta*b) def test_cublasZgeam(self): a = (np.random.rand(2, 3)+1j*np.random.rand(2, 3)).astype(np.complex128) b = (np.random.rand(2, 3)+1j*np.random.rand(2, 3)).astype(np.complex128) a_gpu = gpuarray.to_gpu(a.copy()) b_gpu = gpuarray.to_gpu(b.copy()) c_gpu = gpuarray.zeros_like(a_gpu) alpha = np.complex128(np.random.rand()+1j*np.random.rand()) beta = np.complex128(np.random.rand()+1j*np.random.rand()) cublas.cublasZgeam(self.cublas_handle, 'n', 'n', 2, 3, alpha, a_gpu.gpudata, 2, beta, b_gpu.gpudata, 2, c_gpu.gpudata, 2) assert np.allclose(c_gpu.get(), alpha*a+beta*b) # CgemmBatched, ZgemmBatched def test_cublasCgemmBatched(self): l, m, k, n = 11, 7, 5, 3 A = (np.random.rand(l, m, k)+1j*np.random.rand(l, m, k)).astype(np.complex64) B = (np.random.rand(l, k, n)+1j*np.random.rand(l, k, n)).astype(np.complex64) C_res = np.einsum('nij,njk->nik', A, B) a_gpu = gpuarray.to_gpu(A) b_gpu = gpuarray.to_gpu(B) c_gpu = gpuarray.empty((l, m, n), np.complex64) alpha = np.complex64(1.0) beta = np.complex64(0.0) a_arr = bptrs(a_gpu) b_arr = bptrs(b_gpu) c_arr = bptrs(c_gpu) cublas.cublasCgemmBatched(self.cublas_handle, 'n','n', n, m, k, alpha, b_arr.gpudata, n, a_arr.gpudata, k, beta, c_arr.gpudata, n, l) assert np.allclose(C_res, c_gpu.get()) def test_cublasZgemmBatched(self): l, m, k, n = 11, 7, 5, 3 A = (np.random.rand(l, m, k)+1j*np.random.rand(l, m, k)).astype(np.complex128) B = (np.random.rand(l, k, n)+1j*np.random.rand(l, k, n)).astype(np.complex128) C_res = np.einsum('nij,njk->nik', A, B) a_gpu = gpuarray.to_gpu(A) b_gpu = gpuarray.to_gpu(B) c_gpu = gpuarray.empty((l, m, n), np.complex128) alpha = np.complex128(1.0) beta = np.complex128(0.0) a_arr = bptrs(a_gpu) b_arr = bptrs(b_gpu) c_arr = bptrs(c_gpu) cublas.cublasZgemmBatched(self.cublas_handle, 'n','n', n, m, k, alpha, b_arr.gpudata, n, a_arr.gpudata, k, beta, c_arr.gpudata, n, l) assert np.allclose(C_res, c_gpu.get()) # SgemmBatched, DgemmBatched def test_cublasSgemmBatched(self): l, m, k, n = 11, 7, 5, 3 A = np.random.rand(l, m, k).astype(np.float32) B = np.random.rand(l, k, n).astype(np.float32) C_res = np.einsum('nij,njk->nik', A, B) a_gpu = gpuarray.to_gpu(A) 
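# --- Illustrative note (not part of the original test file) -----------------
# The batched CUBLAS tests in this class build arrays of per-matrix device
# pointers with a helper `bptrs` defined near the top of tests/test_cublas.py,
# outside this excerpt.  A minimal sketch of what such a helper is assumed to
# do for a C-contiguous (batch, rows, cols) GPUArray follows; the real helper
# in scikit-cuda may differ in detail (e.g. the exact pointer dtype used):
def bptrs_sketch(a):
    # One device pointer per 2-D slice along the leading (batch) axis:
    # start at a.ptr and advance by that axis' byte stride.
    return gpuarray.arange(a.ptr, a.ptr + a.shape[0] * a.strides[0],
                           a.strides[0], dtype=np.uintp)
# -----------------------------------------------------------------------------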
b_gpu = gpuarray.to_gpu(B) c_gpu = gpuarray.empty((l, m, n), np.float32) alpha = np.float32(1.0) beta = np.float32(0.0) a_arr = bptrs(a_gpu) b_arr = bptrs(b_gpu) c_arr = bptrs(c_gpu) cublas.cublasSgemmBatched(self.cublas_handle, 'n','n', n, m, k, alpha, b_arr.gpudata, n, a_arr.gpudata, k, beta, c_arr.gpudata, n, l) assert np.allclose(C_res, c_gpu.get()) def test_cublasDgemmBatched(self): l, m, k, n = 11, 7, 5, 3 A = np.random.rand(l, m, k).astype(np.float64) B = np.random.rand(l, k, n).astype(np.float64) C_res = np.einsum('nij,njk->nik',A,B) a_gpu = gpuarray.to_gpu(A) b_gpu = gpuarray.to_gpu(B) c_gpu = gpuarray.empty((l, m, n), np.float64) alpha = np.float64(1.0) beta = np.float64(0.0) a_arr = bptrs(a_gpu) b_arr = bptrs(b_gpu) c_arr = bptrs(c_gpu) cublas.cublasDgemmBatched(self.cublas_handle, 'n','n', n, m, k, alpha, b_arr.gpudata, n, a_arr.gpudata, k, beta, c_arr.gpudata, n, l) assert np.allclose(C_res, c_gpu.get()) # StrsmBatched, DtrsmBatched def test_cublasStrsmBatched(self): l, m, n = 11, 7, 5 A = np.random.rand(l, m, m).astype(np.float32) B = np.random.rand(l, m, n).astype(np.float32) A = np.array(list(map(np.triu, A))) X = np.array([np.linalg.solve(a, b) for a, b in zip(A, B)]) alpha = np.float32(1.0) a_gpu = gpuarray.to_gpu(A) b_gpu = gpuarray.to_gpu(B) a_arr = bptrs(a_gpu) b_arr = bptrs(b_gpu) cublas.cublasStrsmBatched(self.cublas_handle, 'r', 'l', 'n', 'n', n, m, alpha, a_arr.gpudata, m, b_arr.gpudata, n, l) assert np.allclose(X, b_gpu.get(), 5) def test_cublasDtrsmBatched(self): l, m, n = 11, 7, 5 A = np.random.rand(l, m, m).astype(np.float64) B = np.random.rand(l, m, n).astype(np.float64) A = np.array(list(map(np.triu, A))) X = np.array([np.linalg.solve(a, b) for a, b in zip(A, B)]) alpha = np.float64(1.0) a_gpu = gpuarray.to_gpu(A) b_gpu = gpuarray.to_gpu(B) a_arr = bptrs(a_gpu) b_arr = bptrs(b_gpu) cublas.cublasDtrsmBatched(self.cublas_handle, 'r', 'l', 'n', 'n', n, m, alpha, a_arr.gpudata, m, b_arr.gpudata, n, l) assert np.allclose(X, b_gpu.get(), 5) # SgetrfBatched, DgetrfBatched def test_cublasSgetrfBatched(self): from scipy.linalg import lu_factor l, m = 11, 7 A = np.random.rand(l, m, m).astype(np.float32) A = np.array([np.matrix(a)*np.matrix(a).T for a in A]) a_gpu = gpuarray.to_gpu(A) a_arr = bptrs(a_gpu) p_gpu = gpuarray.empty((l, m), np.int32) i_gpu = gpuarray.zeros(1, np.int32) X = np.array([ lu_factor(a)[0] for a in A]) cublas.cublasSgetrfBatched(self.cublas_handle, m, a_arr.gpudata, m, p_gpu.gpudata, i_gpu.gpudata, l) X_ = np.array([a.T for a in a_gpu.get()]) assert np.allclose(X, X_, atol=10*_SEPS) def test_cublasDgetrfBatched(self): from scipy.linalg import lu_factor l, m = 11, 7 A = np.random.rand(l, m, m).astype(np.float64) A = np.array([np.matrix(a)*np.matrix(a).T for a in A]) a_gpu = gpuarray.to_gpu(A) a_arr = bptrs(a_gpu) p_gpu = gpuarray.empty((l, m), np.int32) i_gpu = gpuarray.zeros(1, np.int32) X = np.array([ lu_factor(a)[0] for a in A]) cublas.cublasDgetrfBatched(self.cublas_handle, m, a_arr.gpudata, m, p_gpu.gpudata, i_gpu.gpudata, l) X_ = np.array([a.T for a in a_gpu.get()]) assert np.allclose(X,X_) def suite(): s = TestSuite() s.addTest(test_cublas('test_cublasIsamax')) s.addTest(test_cublas('test_cublasIcamax')) s.addTest(test_cublas('test_cublasIsamin')) s.addTest(test_cublas('test_cublasIcamin')) s.addTest(test_cublas('test_cublasSasum')) s.addTest(test_cublas('test_cublasScasum')) s.addTest(test_cublas('test_cublasSaxpy')) s.addTest(test_cublas('test_cublasCaxpy')) s.addTest(test_cublas('test_cublasScopy')) 
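# --- Illustrative note (not part of the original test file) -----------------
# Why the batched GEMM/TRSM tests above pass their arguments "reversed":
# numpy arrays are row-major while CUBLAS assumes column-major storage, so a
# row-major (m, k) matrix looks like its (k, m) transpose to CUBLAS.  The
# gemmBatched tests therefore request B*A with m and n swapped, which in
# column-major terms computes (A*B)^T -- i.e. exactly A*B once the result
# buffer is read back as a row-major (m, n) array.  Likewise the trsmBatched
# tests use side='r' and uplo='l' so that solving X*A^T = B^T in column-major
# terms corresponds to the row-major system A*X = B with A upper triangular.
# -----------------------------------------------------------------------------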
s.addTest(test_cublas('test_cublasCcopy')) s.addTest(test_cublas('test_cublasSdot')) s.addTest(test_cublas('test_cublasCdotu')) s.addTest(test_cublas('test_cublasCdotc')) s.addTest(test_cublas('test_cublasSrnm2')) s.addTest(test_cublas('test_cublasScrnm2')) s.addTest(test_cublas('test_cublasSscal')) s.addTest(test_cublas('test_cublasCscal')) s.addTest(test_cublas('test_cublasSrot')) s.addTest(test_cublas('test_cublasSswap')) s.addTest(test_cublas('test_cublasCswap')) s.addTest(test_cublas('test_cublasSgemv')) s.addTest(test_cublas('test_cublasCgemv')) s.addTest(test_cublas('test_cublasSgeam')) s.addTest(test_cublas('test_cublasCgeam')) s.addTest(test_cublas('test_cublasSgemmBatched')) s.addTest(test_cublas('test_cublasCgemmBatched')) s.addTest(test_cublas('test_cublasStrsmBatched')) s.addTest(test_cublas('test_cublasSgetrfBatched')) if misc.get_compute_capability(pycuda.autoinit.device) >= 1.3: s.addTest(test_cublas('test_cublasIdamax')) s.addTest(test_cublas('test_cublasIzamax')) s.addTest(test_cublas('test_cublasIdamin')) s.addTest(test_cublas('test_cublasIzamin')) s.addTest(test_cublas('test_cublasDasum')) s.addTest(test_cublas('test_cublasDzasum')) s.addTest(test_cublas('test_cublasDaxpy')) s.addTest(test_cublas('test_cublasZaxpy')) s.addTest(test_cublas('test_cublasDcopy')) s.addTest(test_cublas('test_cublasZcopy')) s.addTest(test_cublas('test_cublasDdot')) s.addTest(test_cublas('test_cublasZdotu')) s.addTest(test_cublas('test_cublasZdotc')) s.addTest(test_cublas('test_cublasDrnm2')) s.addTest(test_cublas('test_cublasDzrnm2')) s.addTest(test_cublas('test_cublasDscal')) s.addTest(test_cublas('test_cublasZscal')) s.addTest(test_cublas('test_cublasZdscal')) s.addTest(test_cublas('test_cublasDswap')) s.addTest(test_cublas('test_cublasZswap')) s.addTest(test_cublas('test_cublasDgemv')) s.addTest(test_cublas('test_cublasZgemv')) s.addTest(test_cublas('test_cublasDgeam')) s.addTest(test_cublas('test_cublasZgeam')) s.addTest(test_cublas('test_cublasDgemmBatched')) s.addTest(test_cublas('test_cublasZgemmBatched')) s.addTest(test_cublas('test_cublasDtrsmBatched')) s.addTest(test_cublas('test_cublasDgetrfBatched')) return s if __name__ == '__main__': main(defaultTest = 'suite') scikit-cuda-0.5.1/tests/test_fft.py000066400000000000000000000251261261465507300172660ustar00rootroot00000000000000#!/usr/bin/env python """ Unit tests for scikits.cuda.fft """ from __future__ import division from unittest import main, makeSuite, TestCase, TestSuite import pycuda.autoinit import pycuda.driver as drv import pycuda.gpuarray as gpuarray import numpy as np import skcuda.fft as fft import skcuda.misc as misc atol_float32 = 1e-6 atol_float64 = 1e-8 class test_fft(TestCase): def setUp(self): np.random.seed(0) # for reproducible tests self.N = 8 self.M = 4 self.B = 3 def test_fft_float32_to_complex64_1d(self): x = np.asarray(np.random.rand(self.N), np.float32) xf = np.fft.rfftn(x) x_gpu = gpuarray.to_gpu(x) xf_gpu = gpuarray.empty(self.N//2+1, np.complex64) plan = fft.Plan(x.shape, np.float32, np.complex64) fft.fft(x_gpu, xf_gpu, plan) assert np.allclose(xf, xf_gpu.get(), atol=atol_float32) def test_fft_float32_to_complex64_2d(self): x = np.asarray(np.random.rand(self.N, self.M), np.float32) xf = np.fft.rfftn(x) x_gpu = gpuarray.to_gpu(x) xf_gpu = gpuarray.empty((self.N, self.M//2+1), np.complex64) plan = fft.Plan(x.shape, np.float32, np.complex64) fft.fft(x_gpu, xf_gpu, plan) assert np.allclose(xf, xf_gpu.get(), atol=atol_float32) def test_batch_fft_float32_to_complex64_1d(self): x = 
np.asarray(np.random.rand(self.B, self.N), np.float32) xf = np.fft.rfft(x, axis=1) x_gpu = gpuarray.to_gpu(x) xf_gpu = gpuarray.empty((self.B, self.N//2+1), np.complex64) plan = fft.Plan(x.shape[1], np.float32, np.complex64, batch=self.B) fft.fft(x_gpu, xf_gpu, plan) assert np.allclose(xf, xf_gpu.get(), atol=atol_float32) def test_batch_fft_float32_to_complex64_2d(self): x = np.asarray(np.random.rand(self.B, self.N, self.M), np.float32) xf = np.fft.rfftn(x, axes=(1,2)) x_gpu = gpuarray.to_gpu(x) xf_gpu = gpuarray.empty((self.B, self.N, self.M//2+1), np.complex64) plan = fft.Plan([self.N, self.M], np.float32, np.complex64, batch=self.B) fft.fft(x_gpu, xf_gpu, plan) assert np.allclose(xf, xf_gpu.get(), atol=atol_float32) def test_fft_float64_to_complex128_1d(self): x = np.asarray(np.random.rand(self.N), np.float64) xf = np.fft.rfftn(x) x_gpu = gpuarray.to_gpu(x) xf_gpu = gpuarray.empty(self.N//2+1, np.complex128) plan = fft.Plan(x.shape, np.float64, np.complex128) fft.fft(x_gpu, xf_gpu, plan) assert np.allclose(xf, xf_gpu.get(), atol=atol_float64) def test_fft_float64_to_complex128_2d(self): x = np.asarray(np.random.rand(self.N, self.M), np.float64) xf = np.fft.rfftn(x) x_gpu = gpuarray.to_gpu(x) xf_gpu = gpuarray.empty((self.N, self.M//2+1), np.complex128) plan = fft.Plan(x.shape, np.float64, np.complex128) fft.fft(x_gpu, xf_gpu, plan) assert np.allclose(xf, xf_gpu.get(), atol=atol_float64) def test_batch_fft_float64_to_complex128_1d(self): x = np.asarray(np.random.rand(self.B, self.N), np.float64) xf = np.fft.rfft(x, axis=1) x_gpu = gpuarray.to_gpu(x) xf_gpu = gpuarray.empty((self.B, self.N//2+1), np.complex128) plan = fft.Plan(x.shape[1], np.float64, np.complex128, batch=self.B) fft.fft(x_gpu, xf_gpu, plan) assert np.allclose(xf, xf_gpu.get(), atol=atol_float64) def test_batch_fft_float64_to_complex128_2d(self): x = np.asarray(np.random.rand(self.B, self.N, self.M), np.float64) xf = np.fft.rfftn(x, axes=(1,2)) x_gpu = gpuarray.to_gpu(x) xf_gpu = gpuarray.empty((self.B, self.N, self.M//2+1), np.complex128) plan = fft.Plan([self.N, self.M], np.float64, np.complex128, batch=self.B) fft.fft(x_gpu, xf_gpu, plan) assert np.allclose(xf, xf_gpu.get(), atol=atol_float64) def test_ifft_complex64_to_float32_1d(self): x = np.asarray(np.random.rand(self.N), np.float32) xf = np.asarray(np.fft.rfftn(x), np.complex64) xf_gpu = gpuarray.to_gpu(xf) x_gpu = gpuarray.empty(self.N, np.float32) plan = fft.Plan(x.shape, np.complex64, np.float32) fft.ifft(xf_gpu, x_gpu, plan, True) assert np.allclose(x, x_gpu.get(), atol=atol_float32) def test_ifft_complex64_to_float32_2d(self): # Note that since rfftn returns a Fortran-ordered array, it # needs to be reformatted as a C-ordered array before being # passed to gpuarray.to_gpu: x = np.asarray(np.random.rand(self.N, self.M), np.float32) xf = np.asarray(np.fft.rfftn(x), np.complex64) xf_gpu = gpuarray.to_gpu(np.ascontiguousarray(xf)) x_gpu = gpuarray.empty((self.N, self.M), np.float32) plan = fft.Plan(x.shape, np.complex64, np.float32) fft.ifft(xf_gpu, x_gpu, plan, True) assert np.allclose(x, x_gpu.get(), atol=atol_float32) def test_batch_ifft_complex64_to_float32_1d(self): # Note that since rfftn returns a Fortran-ordered array, it # needs to be reformatted as a C-ordered array before being # passed to gpuarray.to_gpu: x = np.asarray(np.random.rand(self.B, self.N), np.float32) xf = np.asarray(np.fft.rfft(x, axis=1), np.complex64) xf_gpu = gpuarray.to_gpu(np.ascontiguousarray(xf)) x_gpu = gpuarray.empty((self.B, self.N), np.float32) plan = fft.Plan(x.shape[1], 
np.complex64, np.float32, batch=self.B) fft.ifft(xf_gpu, x_gpu, plan, True) assert np.allclose(x, x_gpu.get(), atol=atol_float32) def test_batch_ifft_complex64_to_float32_2d(self): # Note that since rfftn returns a Fortran-ordered array, it # needs to be reformatted as a C-ordered array before being # passed to gpuarray.to_gpu: x = np.asarray(np.random.rand(self.B, self.N, self.M), np.float32) xf = np.asarray(np.fft.rfftn(x, axes=(1,2)), np.complex64) xf_gpu = gpuarray.to_gpu(np.ascontiguousarray(xf)) x_gpu = gpuarray.empty((self.B, self.N, self.M), np.float32) plan = fft.Plan([self.N, self.M], np.complex64, np.float32, batch=self.B) fft.ifft(xf_gpu, x_gpu, plan, True) assert np.allclose(x, x_gpu.get(), atol=atol_float32) def test_ifft_complex128_to_float64_1d(self): x = np.asarray(np.random.rand(self.N), np.float64) xf = np.asarray(np.fft.rfftn(x), np.complex128) xf_gpu = gpuarray.to_gpu(xf) x_gpu = gpuarray.empty(self.N, np.float64) plan = fft.Plan(x.shape, np.complex128, np.float64) fft.ifft(xf_gpu, x_gpu, plan, True) assert np.allclose(x, x_gpu.get(), atol=atol_float64) def test_ifft_complex128_to_float64_2d(self): # Note that since rfftn returns a Fortran-ordered array, it # needs to be reformatted as a C-ordered array before being # passed to gpuarray.to_gpu: x = np.asarray(np.random.rand(self.N, self.M), np.float64) xf = np.asarray(np.fft.rfftn(x), np.complex128) xf_gpu = gpuarray.to_gpu(np.ascontiguousarray(xf)) x_gpu = gpuarray.empty((self.N, self.M), np.float64) plan = fft.Plan(x.shape, np.complex128, np.float64) fft.ifft(xf_gpu, x_gpu, plan, True) assert np.allclose(x, x_gpu.get(), atol=atol_float64) def test_batch_ifft_complex128_to_float64_1d(self): # Note that since rfftn returns a Fortran-ordered array, it # needs to be reformatted as a C-ordered array before being # passed to gpuarray.to_gpu: x = np.asarray(np.random.rand(self.B, self.N), np.float64) xf = np.asarray(np.fft.rfft(x, axis=1), np.complex128) xf_gpu = gpuarray.to_gpu(np.ascontiguousarray(xf)) x_gpu = gpuarray.empty((self.B, self.N), np.float64) plan = fft.Plan(x.shape[1], np.complex128, np.float64, batch=self.B) fft.ifft(xf_gpu, x_gpu, plan, True) assert np.allclose(x, x_gpu.get(), atol=atol_float64) def test_batch_ifft_complex128_to_float64_2d(self): # Note that since rfftn returns a Fortran-ordered array, it # needs to be reformatted as a C-ordered array before being # passed to gpuarray.to_gpu: x = np.asarray(np.random.rand(self.B, self.N, self.M), np.float64) xf = np.asarray(np.fft.rfftn(x, axes=(1,2)), np.complex128) xf_gpu = gpuarray.to_gpu(np.ascontiguousarray(xf)) x_gpu = gpuarray.empty((self.B, self.N, self.M), np.float64) plan = fft.Plan([self.N, self.M], np.complex128, np.float64, batch=self.B) fft.ifft(xf_gpu, x_gpu, plan, True) assert np.allclose(x, x_gpu.get(), atol=atol_float64) def test_multiple_streams(self): x = np.asarray(np.random.rand(self.N), np.float32) xf = np.fft.rfftn(x) y = np.asarray(np.random.rand(self.N), np.float32) yf = np.fft.rfftn(y) x_gpu = gpuarray.to_gpu(x) y_gpu = gpuarray.to_gpu(y) xf_gpu = gpuarray.empty(self.N//2+1, np.complex64) yf_gpu = gpuarray.empty(self.N//2+1, np.complex64) stream0 = drv.Stream() stream1 = drv.Stream() plan1 = fft.Plan(x.shape, np.float32, np.complex64, stream=stream0) plan2 = fft.Plan(y.shape, np.float32, np.complex64, stream=stream1) fft.fft(x_gpu, xf_gpu, plan1) fft.fft(y_gpu, yf_gpu, plan2) assert np.allclose(xf, xf_gpu.get(), atol=atol_float32) assert np.allclose(yf, yf_gpu.get(), atol=atol_float32) def suite(): s = TestSuite() 
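# --- Illustrative example (not part of the original test file) --------------
# The forward/inverse pattern exercised by the tests in this class, written as
# a standalone round trip.  This is a sketch that assumes a working CUDA
# device and the same modules imported at the top of this file:
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.fft as fft

N = 8
x = np.random.rand(N).astype(np.float32)
x_gpu = gpuarray.to_gpu(x)
xf_gpu = gpuarray.empty(N // 2 + 1, np.complex64)       # R2C output size
fwd_plan = fft.Plan(x.shape, np.float32, np.complex64)
fft.fft(x_gpu, xf_gpu, fwd_plan)

y_gpu = gpuarray.empty(N, np.float32)
inv_plan = fft.Plan(x.shape, np.complex64, np.float32)
fft.ifft(xf_gpu, y_gpu, inv_plan, True)                  # True => scale by 1/N
assert np.allclose(x, y_gpu.get(), atol=1e-6)
# -----------------------------------------------------------------------------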
s.addTest(test_fft('test_fft_float32_to_complex64_1d')) s.addTest(test_fft('test_fft_float32_to_complex64_2d')) s.addTest(test_fft('test_batch_fft_float32_to_complex64_1d')) s.addTest(test_fft('test_batch_fft_float32_to_complex64_2d')) s.addTest(test_fft('test_ifft_complex64_to_float32_1d')) s.addTest(test_fft('test_ifft_complex64_to_float32_2d')) s.addTest(test_fft('test_batch_ifft_complex64_to_float32_1d')) s.addTest(test_fft('test_batch_ifft_complex64_to_float32_2d')) s.addTest(test_fft('test_multiple_streams')) if misc.get_compute_capability(pycuda.autoinit.device) >= 1.3: s.addTest(test_fft('test_fft_float64_to_complex128_1d')) s.addTest(test_fft('test_fft_float64_to_complex128_2d')) s.addTest(test_fft('test_batch_fft_float64_to_complex128_1d')) s.addTest(test_fft('test_batch_fft_float64_to_complex128_2d')) s.addTest(test_fft('test_ifft_complex128_to_float64_1d')) s.addTest(test_fft('test_ifft_complex128_to_float64_2d')) s.addTest(test_fft('test_batch_ifft_complex128_to_float64_1d')) s.addTest(test_fft('test_batch_ifft_complex128_to_float64_2d')) return s if __name__ == '__main__': main(defaultTest = 'suite') scikit-cuda-0.5.1/tests/test_integrate.py000066400000000000000000000053161261465507300204700ustar00rootroot00000000000000""" Unit tests for scikits.cuda.integrate """ from unittest import main, TestCase, TestSuite import pycuda.autoinit import pycuda.gpuarray as gpuarray import numpy as np import skcuda.misc as misc import skcuda.integrate as integrate class test_integrate(TestCase): def setUp(self): np.random.seed(0) integrate.init() def test_trapz_float32(self): x = np.asarray(np.random.rand(10), np.float32) x_gpu = gpuarray.to_gpu(x) z = integrate.trapz(x_gpu) assert np.allclose(np.trapz(x), z) def test_trapz_float64(self): x = np.asarray(np.random.rand(10), np.float64) x_gpu = gpuarray.to_gpu(x) z = integrate.trapz(x_gpu) assert np.allclose(np.trapz(x), z) def test_trapz_complex64(self): x = np.asarray(np.random.rand(10)+1j*np.random.rand(10), np.complex64) x_gpu = gpuarray.to_gpu(x) z = integrate.trapz(x_gpu) assert np.allclose(np.trapz(x), z) def test_trapz_complex128(self): x = np.asarray(np.random.rand(10)+1j*np.random.rand(10), np.complex128) x_gpu = gpuarray.to_gpu(x) z = integrate.trapz(x_gpu) assert np.allclose(np.trapz(x), z) def test_trapz2d_float32(self): x = np.asarray(np.random.rand(5, 5), np.float32) x_gpu = gpuarray.to_gpu(x) z = integrate.trapz2d(x_gpu) assert np.allclose(np.trapz(np.trapz(x)), z) def test_trapz2d_float64(self): x = np.asarray(np.random.rand(5, 5), np.float64) x_gpu = gpuarray.to_gpu(x) z = integrate.trapz2d(x_gpu) assert np.allclose(np.trapz(np.trapz(x)), z) def test_trapz2d_complex64(self): x = np.asarray(np.random.rand(5, 5)+1j*np.random.rand(5, 5), np.complex64) x_gpu = gpuarray.to_gpu(x) z = integrate.trapz2d(x_gpu) assert np.allclose(np.trapz(np.trapz(x)), z) def test_trapz2d_complex128(self): x = np.asarray(np.random.rand(5, 5)+1j*np.random.rand(5, 5), np.complex128) x_gpu = gpuarray.to_gpu(x) z = integrate.trapz2d(x_gpu) assert np.allclose(np.trapz(np.trapz(x)), z) def suite(): s = TestSuite() s.addTest(test_integrate('test_trapz_float32')) s.addTest(test_integrate('test_trapz_complex64')) s.addTest(test_integrate('test_trapz2d_float32')) s.addTest(test_integrate('test_trapz2d_complex64')) if misc.get_compute_capability(pycuda.autoinit.device) >= 1.3: s.addTest(test_integrate('test_trapz_float64')) s.addTest(test_integrate('test_trapz_complex128')) s.addTest(test_integrate('test_trapz2d_float64')) 
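# --- Illustrative example (not part of the original test file) --------------
# The basic usage pattern exercised by the trapz/trapz2d tests above, as a
# standalone sketch; integrate.init() must be called once first, exactly as
# setUp() does:
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.integrate as integrate

integrate.init()
x = np.linspace(0.0, 1.0, 100).astype(np.float32) ** 2
x_gpu = gpuarray.to_gpu(x)
result = integrate.trapz(x_gpu)            # trapezoidal rule along the vector
assert np.allclose(result, np.trapz(x))    # agrees with the CPU reference
# -----------------------------------------------------------------------------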
s.addTest(test_integrate('test_trapz2d_complex128')) return s if __name__ == '__main__': main(defaultTest = 'suite') scikit-cuda-0.5.1/tests/test_linalg.py000066400000000000000000001315321261465507300177540ustar00rootroot00000000000000#!/usr/bin/env python """ Unit tests for scikits.cuda.linalg """ from unittest import main, makeSuite, TestCase, TestSuite import pycuda.autoinit import pycuda.gpuarray as gpuarray import numpy as np from numpy.testing import assert_raises import skcuda.linalg as linalg import skcuda.misc as misc atol_float32 = 1e-6 atol_float64 = 1e-8 class test_linalg(TestCase): def setUp(self): np.random.seed(0) linalg.init() def test_svd_ss_float32(self): a = np.asarray(np.random.randn(9, 6), np.float32) a_gpu = gpuarray.to_gpu(a) u_gpu, s_gpu, vh_gpu = linalg.svd(a_gpu, 's', 's') assert np.allclose(a, np.dot(u_gpu.get(), np.dot(np.diag(s_gpu.get()), vh_gpu.get())), atol=atol_float32) def test_svd_ss_float64(self): a = np.asarray(np.random.randn(9, 6), np.float64) a_gpu = gpuarray.to_gpu(a) u_gpu, s_gpu, vh_gpu = linalg.svd(a_gpu, 's', 's') assert np.allclose(a, np.dot(u_gpu.get(), np.dot(np.diag(s_gpu.get()), vh_gpu.get())), atol=atol_float64) def test_svd_ss_complex64(self): a = np.asarray(np.random.randn(9, 6) + 1j*np.random.randn(9, 6), np.complex64) a_gpu = gpuarray.to_gpu(a) u_gpu, s_gpu, vh_gpu = linalg.svd(a_gpu, 's', 's') assert np.allclose(a, np.dot(u_gpu.get(), np.dot(np.diag(s_gpu.get()), vh_gpu.get())), atol=atol_float32) def test_svd_ss_complex128(self): a = np.asarray(np.random.randn(9, 6) + 1j*np.random.randn(9, 6), np.complex128) a_gpu = gpuarray.to_gpu(a) u_gpu, s_gpu, vh_gpu = linalg.svd(a_gpu, 's', 's') assert np.allclose(a, np.dot(u_gpu.get(), np.dot(np.diag(s_gpu.get()), vh_gpu.get())), atol=atol_float64) def test_svd_so_float32(self): a = np.asarray(np.random.randn(6, 6), np.float32) a_gpu = gpuarray.to_gpu(a) u_gpu, s_gpu, vh_gpu = linalg.svd(a_gpu, 's', 'o') assert np.allclose(a, np.dot(u_gpu.get(), np.dot(np.diag(s_gpu.get()), vh_gpu.get())), atol=atol_float32) def test_svd_so_float64(self): a = np.asarray(np.random.randn(6, 6), np.float64) a_gpu = gpuarray.to_gpu(a) u_gpu, s_gpu, vh_gpu = linalg.svd(a_gpu, 's', 'o') assert np.allclose(a, np.dot(u_gpu.get(), np.dot(np.diag(s_gpu.get()), vh_gpu.get())), atol=atol_float64) def test_svd_so_complex64(self): a = np.asarray(np.random.randn(6, 6) + 1j*np.random.randn(6, 6), np.complex64) a_gpu = gpuarray.to_gpu(a) u_gpu, s_gpu, vh_gpu = linalg.svd(a_gpu, 's', 'o') assert np.allclose(a, np.dot(u_gpu.get(), np.dot(np.diag(s_gpu.get()), vh_gpu.get())), atol=atol_float32) def test_svd_so_complex128(self): a = np.asarray(np.random.randn(6, 6) + 1j*np.random.randn(6, 6), np.complex128) a_gpu = gpuarray.to_gpu(a) u_gpu, s_gpu, vh_gpu = linalg.svd(a_gpu, 's', 'o') assert np.allclose(a, np.dot(u_gpu.get(), np.dot(np.diag(s_gpu.get()), vh_gpu.get())), atol=atol_float64) def _dot_matrix_tests(self, dtype, transa, transb): a = np.asarray(np.random.rand(4, 2), dtype) if transa == 'n': b = np.asarray(np.random.rand(2, 2), dtype) else: b = np.asarray(np.random.rand(4, 4), dtype) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c_gpu = linalg.dot(a_gpu, b_gpu, transa, transb) aa = a if transa == 'n' else a.T bb = b if transb == 'n' else b.T assert np.allclose(np.dot(aa, bb), c_gpu.get()) a = a.astype(dtype, order="F", copy=True) b = b.astype(dtype, order="F", copy=True) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c_gpu = linalg.dot(a_gpu, b_gpu, transa, transb) assert np.allclose(np.dot(aa, bb), 
c_gpu.get()) def test_dot_matrix_float32(self): self._dot_matrix_tests(np.float32, 'n', 'n') self._dot_matrix_tests(np.float32, 'n', 't') self._dot_matrix_tests(np.float32, 't', 'n') self._dot_matrix_tests(np.float32, 't', 't') def test_dot_matrix_float64(self): self._dot_matrix_tests(np.float64, 'n', 'n') self._dot_matrix_tests(np.float64, 'n', 't') self._dot_matrix_tests(np.float64, 't', 'n') self._dot_matrix_tests(np.float64, 't', 't') def test_dot_matrix_complex64(self): self._dot_matrix_tests(np.complex64, 'n', 'n') self._dot_matrix_tests(np.complex64, 'n', 't') self._dot_matrix_tests(np.complex64, 't', 'n') self._dot_matrix_tests(np.complex64, 't', 't') def test_dot_matrix_complex128(self): self._dot_matrix_tests(np.complex128, 'n', 'n') self._dot_matrix_tests(np.complex128, 'n', 't') self._dot_matrix_tests(np.complex128, 't', 'n') self._dot_matrix_tests(np.complex128, 't', 't') def test_dot_matrix_h_complex64(self): a = np.asarray(np.random.rand(2, 4)+1j*np.random.rand(2, 4), np.complex64) b = np.asarray(np.random.rand(2, 2)+1j*np.random.rand(2, 2), np.complex64) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c_gpu = linalg.dot(a_gpu, b_gpu, 'c') assert np.allclose(np.dot(a.conj().T, b), c_gpu.get()) a = a.astype(np.complex64, order="F", copy=True) b = b.astype(np.complex64, order="F", copy=True) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c_gpu = linalg.dot(a_gpu, b_gpu, 'c') assert np.allclose(np.dot(a.conj().T, b), c_gpu.get()) def test_dot_matrix_h_complex128(self): a = np.asarray(np.random.rand(2, 4)+1j*np.random.rand(2, 4), np.complex128) b = np.asarray(np.random.rand(2, 2)+1j*np.random.rand(2, 2), np.complex128) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c_gpu = linalg.dot(a_gpu, b_gpu, 'c') assert np.allclose(np.dot(a.conj().T, b), c_gpu.get()) a = a.astype(np.complex128, order="F", copy=True) b = b.astype(np.complex128, order="F", copy=True) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c_gpu = linalg.dot(a_gpu, b_gpu, 'c') assert np.allclose(np.dot(a.conj().T, b), c_gpu.get()) def test_dot_vector_float32(self): a = np.asarray(np.random.rand(5), np.float32) b = np.asarray(np.random.rand(5), np.float32) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c = linalg.dot(a_gpu, b_gpu) assert np.allclose(np.dot(a, b), c) a = a.astype(np.float32, order="F", copy=True) b = b.astype(np.float32, order="F", copy=True) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c = linalg.dot(a_gpu, b_gpu) assert np.allclose(np.dot(a, b), c) def test_dot_vector_float64(self): a = np.asarray(np.random.rand(5), np.float64) b = np.asarray(np.random.rand(5), np.float64) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c = linalg.dot(a_gpu, b_gpu) assert np.allclose(np.dot(a, b), c) a = a.astype(np.float64, order="F", copy=True) b = b.astype(np.float64, order="F", copy=True) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c = linalg.dot(a_gpu, b_gpu) assert np.allclose(np.dot(a, b), c) def test_dot_vector_complex64(self): a = np.asarray(np.random.rand(5), np.complex64) b = np.asarray(np.random.rand(5), np.complex64) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c = linalg.dot(a_gpu, b_gpu) assert np.allclose(np.dot(a, b), c) a = a.astype(np.complex64, order="F", copy=True) b = b.astype(np.complex64, order="F", copy=True) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c = linalg.dot(a_gpu, b_gpu) assert np.allclose(np.dot(a, b), c) def test_dot_vector_complex128(self): a = np.asarray(np.random.rand(5), np.complex128) b = 
np.asarray(np.random.rand(5), np.complex128) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c = linalg.dot(a_gpu, b_gpu) assert np.allclose(np.dot(a, b), c) a = a.astype(np.complex128, order="F", copy=True) b = b.astype(np.complex128, order="F", copy=True) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c = linalg.dot(a_gpu, b_gpu) assert np.allclose(np.dot(a, b), c) def test_mdot_matrix_float32(self): a = np.asarray(np.random.rand(4, 2), np.float32) b = np.asarray(np.random.rand(2, 2), np.float32) c = np.asarray(np.random.rand(2, 2), np.float32) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c_gpu = gpuarray.to_gpu(c) d_gpu = linalg.mdot(a_gpu, b_gpu, c_gpu) assert np.allclose(np.dot(a, np.dot(b, c)), d_gpu.get()) def test_mdot_matrix_float64(self): a = np.asarray(np.random.rand(4, 2), np.float64) b = np.asarray(np.random.rand(2, 2), np.float64) c = np.asarray(np.random.rand(2, 2), np.float64) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c_gpu = gpuarray.to_gpu(c) d_gpu = linalg.mdot(a_gpu, b_gpu, c_gpu) assert np.allclose(np.dot(a, np.dot(b, c)), d_gpu.get()) def test_mdot_matrix_complex64(self): a = np.asarray(np.random.rand(4, 2), np.complex64) b = np.asarray(np.random.rand(2, 2), np.complex64) c = np.asarray(np.random.rand(2, 2), np.complex64) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c_gpu = gpuarray.to_gpu(c) d_gpu = linalg.mdot(a_gpu, b_gpu, c_gpu) assert np.allclose(np.dot(a, np.dot(b, c)), d_gpu.get()) def test_mdot_matrix_complex128(self): a = np.asarray(np.random.rand(4, 2), np.complex128) b = np.asarray(np.random.rand(2, 2), np.complex128) c = np.asarray(np.random.rand(2, 2), np.complex128) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c_gpu = gpuarray.to_gpu(c) d_gpu = linalg.mdot(a_gpu, b_gpu, c_gpu) assert np.allclose(np.dot(a, np.dot(b, c)), d_gpu.get()) def __impl_test_dot_diag(self, dtype): d = np.asarray(np.random.rand(5), dtype) a = np.asarray(np.random.rand(5, 3), dtype) d_gpu = gpuarray.to_gpu(d) a_gpu = gpuarray.to_gpu(a) r_gpu = linalg.dot_diag(d_gpu, a_gpu) assert np.allclose(np.dot(np.diag(d), a), r_gpu.get()) a = a.astype(dtype, order="F", copy=True) d_gpu = gpuarray.to_gpu(d) a_gpu = gpuarray.to_gpu(a) # note: due to pycuda issue #66, this will fail when overwrite=False r_gpu = linalg.dot_diag(d_gpu, a_gpu, overwrite=True) assert np.allclose(np.dot(np.diag(d), a), r_gpu.get()) def test_dot_diag_float32(self): self.__impl_test_dot_diag(np.float32) def test_dot_diag_float64(self): self.__impl_test_dot_diag(np.float64) def test_dot_diag_complex64(self): self.__impl_test_dot_diag(np.complex64) def test_dot_diag_complex128(self): self.__impl_test_dot_diag(np.complex128) def ___impl_test_dot_diag_t(self, dtype): d = np.asarray(np.random.rand(5), dtype) a = np.asarray(np.random.rand(3, 5), dtype) d_gpu = gpuarray.to_gpu(d) a_gpu = gpuarray.to_gpu(a) r_gpu = linalg.dot_diag(d_gpu, a_gpu, 't') assert np.allclose(np.dot(np.diag(d), a.T).T, r_gpu.get()) a = a.astype(dtype, order="F", copy=True) d_gpu = gpuarray.to_gpu(d) a_gpu = gpuarray.to_gpu(a) # note: due to pycuda issue #66, this will fail when overwrite=False r_gpu = linalg.dot_diag(d_gpu, a_gpu, 't', overwrite=True) assert np.allclose(np.dot(np.diag(d), a.T).T, r_gpu.get()) def test_dot_diag_t_float32(self): self.___impl_test_dot_diag_t(np.float32) def test_dot_diag_t_float64(self): self.___impl_test_dot_diag_t(np.float64) def test_dot_diag_t_complex64(self): self.___impl_test_dot_diag_t(np.complex64) def test_dot_diag_t_complex128(self): 
self.___impl_test_dot_diag_t(np.complex128) def test_transpose_float32(self): # M < N a = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], np.float32) a_gpu = gpuarray.to_gpu(a) at_gpu = linalg.transpose(a_gpu) assert np.all(a.T == at_gpu.get()) # M > N b = a.T.copy() b_gpu = gpuarray.to_gpu(b) bt_gpu = linalg.transpose(b_gpu) assert np.all(b.T == bt_gpu.get()) def test_transpose_float64(self): # M < N a = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], np.float64) a_gpu = gpuarray.to_gpu(a) at_gpu = linalg.transpose(a_gpu) assert np.all(a.T == at_gpu.get()) # M > N b = a.T.copy() b_gpu = gpuarray.to_gpu(b) bt_gpu = linalg.transpose(b_gpu) assert np.all(b.T == bt_gpu.get()) def test_transpose_complex64(self): # M < N a = np.array([[1j, 2j, 3j, 4j, 5j, 6j], [7j, 8j, 9j, 10j, 11j, 12j]], np.complex64) a_gpu = gpuarray.to_gpu(a) at_gpu = linalg.transpose(a_gpu) assert np.all(a.T == at_gpu.get()) # M > N b = a.T.copy() b_gpu = gpuarray.to_gpu(b) bt_gpu = linalg.transpose(b_gpu) assert np.all(b.T == bt_gpu.get()) def test_transpose_complex128(self): # M < N a = np.array([[1j, 2j, 3j, 4j, 5j, 6j], [7j, 8j, 9j, 10j, 11j, 12j]], np.complex128) a_gpu = gpuarray.to_gpu(a) at_gpu = linalg.transpose(a_gpu) assert np.all(a.T == at_gpu.get()) # M > N b = a.T.copy() b_gpu = gpuarray.to_gpu(b) bt_gpu = linalg.transpose(b_gpu) assert np.all(b.T == bt_gpu.get()) def test_hermitian_float32(self): # M < N a = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], np.float32) a_gpu = gpuarray.to_gpu(a) at_gpu = linalg.hermitian(a_gpu) assert np.all(a.T == at_gpu.get()) # M > N b = a.T.copy() b_gpu = gpuarray.to_gpu(b) bt_gpu = linalg.hermitian(b_gpu) assert np.all(b.T == bt_gpu.get()) def test_hermitian_complex64(self): # M < N a = np.array([[1j, 2j, 3j, 4j, 5j, 6j], [7j, 8j, 9j, 10j, 11j, 12j]], np.complex64) a_gpu = gpuarray.to_gpu(a) at_gpu = linalg.hermitian(a_gpu) assert np.all(np.conj(a.T) == at_gpu.get()) # M > N b = a.T.copy() b_gpu = gpuarray.to_gpu(b) bt_gpu = linalg.hermitian(b_gpu) assert np.all(np.conj(b.T) == bt_gpu.get()) def test_hermitian_float64(self): # M < N a = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], np.float64) a_gpu = gpuarray.to_gpu(a) at_gpu = linalg.hermitian(a_gpu) assert np.all(a.T == at_gpu.get()) # M > N b = a.T.copy() b_gpu = gpuarray.to_gpu(b) bt_gpu = linalg.hermitian(b_gpu) assert np.all(b.T == bt_gpu.get()) def test_hermitian_complex128(self): # M < N a = np.array([[1j, 2j, 3j, 4j, 5j, 6j], [7j, 8j, 9j, 10j, 11j, 12j]], np.complex128) a_gpu = gpuarray.to_gpu(a) at_gpu = linalg.hermitian(a_gpu) assert np.all(np.conj(a.T) == at_gpu.get()) # M > N b = a.T.copy() b_gpu = gpuarray.to_gpu(b) bt_gpu = linalg.hermitian(b_gpu) assert np.all(np.conj(b.T) == bt_gpu.get()) def test_conj_complex64(self): a = np.array([[1+1j, 2-2j, 3+3j, 4-4j], [5+5j, 6-6j, 7+7j, 8-8j]], np.complex64) a_gpu = gpuarray.to_gpu(a) r_gpu = linalg.conj(a_gpu) assert np.all(np.conj(a) == r_gpu.get()) def test_conj_complex128(self): a = np.array([[1+1j, 2-2j, 3+3j, 4-4j], [5+5j, 6-6j, 7+7j, 8-8j]], np.complex128) a_gpu = gpuarray.to_gpu(a) r_gpu = linalg.conj(a_gpu) assert np.all(np.conj(a) == r_gpu.get()) def test_diag_1d_float32(self): v = np.array([1, 2, 3, 4, 5, 6], np.float32) v_gpu = gpuarray.to_gpu(v) d_gpu = linalg.diag(v_gpu) assert np.all(np.diag(v) == d_gpu.get()) def test_diag_2d_wide_float32(self): v = np.array(np.random.rand(32, 64), np.float32) v_gpu = gpuarray.to_gpu(v) d_gpu = linalg.diag(v_gpu) assert np.all(np.diag(v) == d_gpu.get()) def test_diag_2d_tall_float32(self): 
v = np.array(np.random.rand(64, 32), np.float32) v_gpu = gpuarray.to_gpu(v) d_gpu = linalg.diag(v_gpu) assert np.all(np.diag(v) == d_gpu.get()) def test_diag_1d_float64(self): v = np.array([1, 2, 3, 4, 5, 6], np.float64) v_gpu = gpuarray.to_gpu(v) d_gpu = linalg.diag(v_gpu) assert np.all(np.diag(v) == d_gpu.get()) def test_diag_2d_wide_float64(self): v = np.array(np.random.rand(32, 64), np.float64) v_gpu = gpuarray.to_gpu(v) d_gpu = linalg.diag(v_gpu) assert np.all(np.diag(v) == d_gpu.get()) def test_diag_2d_tall_float64(self): v = np.array(np.random.rand(64, 32), np.float64) v_gpu = gpuarray.to_gpu(v) d_gpu = linalg.diag(v_gpu) assert np.all(np.diag(v) == d_gpu.get()) def test_diag_1d_complex64(self): v = np.array([1j, 2j, 3j, 4j, 5j, 6j], np.complex64) v_gpu = gpuarray.to_gpu(v) d_gpu = linalg.diag(v_gpu) assert np.all(np.diag(v) == d_gpu.get()) def test_diag_2d_wide_complex64(self): v = np.array(np.random.rand(32, 64)*1j, np.complex64) v_gpu = gpuarray.to_gpu(v) d_gpu = linalg.diag(v_gpu) assert np.all(np.diag(v) == d_gpu.get()) def test_diag_2d_tall_complex64(self): v = np.array(np.random.rand(64, 32)*1j, np.complex64) v_gpu = gpuarray.to_gpu(v) d_gpu = linalg.diag(v_gpu) assert np.all(np.diag(v) == d_gpu.get()) def test_diag_1d_complex128(self): v = np.array([1j, 2j, 3j, 4j, 5j, 6j], np.complex128) v_gpu = gpuarray.to_gpu(v) d_gpu = linalg.diag(v_gpu) assert np.all(np.diag(v) == d_gpu.get()) def test_diag_2d_wide_complex128(self): v = np.array(np.random.rand(32, 64)*1j, np.complex128) v_gpu = gpuarray.to_gpu(v) d_gpu = linalg.diag(v_gpu) assert np.all(np.diag(v) == d_gpu.get()) def test_diag_2d_tall_complex128(self): v = np.array(np.random.rand(64, 32)*1j, np.complex128) v_gpu = gpuarray.to_gpu(v) d_gpu = linalg.diag(v_gpu) assert np.all(np.diag(v) == d_gpu.get()) def test_eye_float32(self): N = 10 e_gpu = linalg.eye(N, dtype=np.float32) assert np.all(np.eye(N, dtype=np.float32) == e_gpu.get()) def test_eye_float64(self): N = 10 e_gpu = linalg.eye(N, dtype=np.float64) assert np.all(np.eye(N, dtype=np.float64) == e_gpu.get()) def test_eye_complex64(self): N = 10 e_gpu = linalg.eye(N, dtype=np.complex64) assert np.all(np.eye(N, dtype=np.complex64) == e_gpu.get()) def test_eye_complex128(self): N = 10 e_gpu = linalg.eye(N, dtype=np.complex128) assert np.all(np.eye(N, dtype=np.complex128) == e_gpu.get()) def test_pinv_float32(self): a = np.asarray(np.random.rand(8, 4), np.float32) a_gpu = gpuarray.to_gpu(a) a_inv_gpu = linalg.pinv(a_gpu) assert np.allclose(np.linalg.pinv(a), a_inv_gpu.get(), atol=atol_float32) def test_pinv_float64(self): a = np.asarray(np.random.rand(8, 4), np.float64) a_gpu = gpuarray.to_gpu(a) a_inv_gpu = linalg.pinv(a_gpu) assert np.allclose(np.linalg.pinv(a), a_inv_gpu.get(), atol=atol_float64) def test_pinv_complex64(self): a = np.asarray(np.random.rand(8, 4) + \ 1j*np.random.rand(8, 4), np.complex64) a_gpu = gpuarray.to_gpu(a) a_inv_gpu = linalg.pinv(a_gpu) assert np.allclose(np.linalg.pinv(a), a_inv_gpu.get(), atol=atol_float32) def test_pinv_complex128(self): a = np.asarray(np.random.rand(8, 4) + \ 1j*np.random.rand(8, 4), np.complex128) a_gpu = gpuarray.to_gpu(a) a_inv_gpu = linalg.pinv(a_gpu) assert np.allclose(np.linalg.pinv(a), a_inv_gpu.get(), atol=atol_float64) def test_tril_float32(self): a = np.asarray(np.random.rand(4, 4), np.float32) a_gpu = gpuarray.to_gpu(a) l_gpu = linalg.tril(a_gpu) assert np.allclose(np.tril(a), l_gpu.get()) def test_tril_float64(self): a = np.asarray(np.random.rand(4, 4), np.float64) a_gpu = gpuarray.to_gpu(a) l_gpu = 
linalg.tril(a_gpu) assert np.allclose(np.tril(a), l_gpu.get()) def test_tril_complex64(self): a = np.asarray(np.random.rand(4, 4), np.complex64) a_gpu = gpuarray.to_gpu(a) l_gpu = linalg.tril(a_gpu) assert np.allclose(np.tril(a), l_gpu.get()) def test_tril_complex128(self): a = np.asarray(np.random.rand(4, 4), np.complex128) a_gpu = gpuarray.to_gpu(a) l_gpu = linalg.tril(a_gpu) assert np.allclose(np.tril(a), l_gpu.get()) def test_triu_float32(self): a = np.asarray(np.random.rand(4, 4), np.float32) a_gpu = gpuarray.to_gpu(a) l_gpu = linalg.triu(a_gpu) assert np.allclose(np.triu(a), l_gpu.get()) def test_triu_float64(self): a = np.asarray(np.random.rand(4, 4), np.float64) a_gpu = gpuarray.to_gpu(a) l_gpu = linalg.triu(a_gpu) assert np.allclose(np.triu(a), l_gpu.get()) def test_triu_complex64(self): a = np.asarray(np.random.rand(4, 4), np.complex64) a_gpu = gpuarray.to_gpu(a) l_gpu = linalg.triu(a_gpu) assert np.allclose(np.triu(a), l_gpu.get()) def test_triu_complex128(self): a = np.asarray(np.random.rand(4, 4), np.complex128) a_gpu = gpuarray.to_gpu(a) l_gpu = linalg.triu(a_gpu) assert np.allclose(np.triu(a), l_gpu.get()) def _impl_test_multiply(self, N, dtype): mk_matrix = lambda N, dtype: np.asarray(np.random.rand(N, N), dtype) x = mk_matrix(N, dtype) y = mk_matrix(N, dtype) if np.iscomplexobj(x): x += 1j*mk_matrix(N, dtype) y += 1j*mk_matrix(N, dtype) x_gpu = gpuarray.to_gpu(x) y_gpu = gpuarray.to_gpu(y) z_gpu = linalg.multiply(x_gpu, y_gpu) assert np.allclose(x*y, z_gpu.get()) def test_multiply_float32(self): self._impl_test_multiply(4, np.float32) def test_multiply_float64(self): self._impl_test_multiply(4, np.float64) def test_multiply_complex64(self): self._impl_test_multiply(4, np.complex64) def test_multiply_complex128(self): self._impl_test_multiply(4, np.complex128) def test_cho_factor_float32(self): from scipy.linalg import cho_factor as cpu_cho_factor x = np.asarray(np.random.rand(4, 4), np.float32) x = np.dot(x.T, x) x_gpu = gpuarray.to_gpu(x) linalg.cho_factor(x_gpu) c = np.triu(cpu_cho_factor(x)[0]) assert np.allclose(c, np.triu(x_gpu.get())) def test_cho_solve_float32(self): x = np.asarray(np.random.rand(4, 4), np.float32) x = np.dot(x.T, x) y = np.asarray(np.random.rand(4), np.float32) c = np.linalg.inv(x).dot(y) x_gpu = gpuarray.to_gpu(x) y_gpu = gpuarray.to_gpu(y) linalg.cho_solve(x_gpu, y_gpu) assert np.allclose(c, y_gpu.get(), atol=1e-4) x = np.asarray(np.random.rand(4, 4), np.float32) x = np.dot(x.T, x).astype(np.float32, order="F", copy=True) y = np.asarray(np.random.rand(4, 4), np.float32, order="F") c = np.linalg.inv(x).dot(y) x_gpu = gpuarray.to_gpu(x) y_gpu = gpuarray.to_gpu(y) linalg.cho_solve(x_gpu, y_gpu) assert np.allclose(c, y_gpu.get(), atol=1e-4) def _impl_test_inv(self, dtype): from scipy.linalg import inv as cpu_inv x = np.asarray(np.random.rand(4, 4), dtype) x = np.dot(x.T, x) x_gpu = gpuarray.to_gpu(x) xinv = cpu_inv(x) xinv_gpu = linalg.inv(x_gpu) assert np.allclose(xinv, xinv_gpu.get(), atol=1e-5) assert xinv_gpu is not x_gpu xinv_gpu = linalg.inv(x_gpu, overwrite=True) assert np.allclose(xinv, xinv_gpu.get(), atol=1e-5) assert xinv_gpu is x_gpu def test_inv_exceptions(self): x = np.asarray([[1, 2], [2, 4]], np.float32) x_gpu = gpuarray.to_gpu(x) assert_raises(linalg.LinAlgError, linalg.inv, x_gpu) def test_inv_float32(self): self._impl_test_inv(np.float32) def test_inv_float64(self): self._impl_test_inv(np.float64) def test_inv_complex64(self): self._impl_test_inv(np.complex64) def test_inv_complex128(self): self._impl_test_inv(np.complex128) def 
_impl_test_add_diag(self, dtype): x = np.asarray(np.random.rand(4, 4), dtype) d = np.asarray(np.random.rand(1, 4), dtype).reshape(-1) x_gpu = gpuarray.to_gpu(x) d_gpu = gpuarray.to_gpu(d) res_cpu = x + np.diag(d) res_gpu = linalg.add_diag(d_gpu, x_gpu, overwrite=False) assert np.allclose(res_cpu, res_gpu.get(), atol=1e-5) assert res_gpu is not x_gpu res_gpu = linalg.add_diag(d_gpu, x_gpu, overwrite=True) assert np.allclose(res_cpu, res_gpu.get(), atol=1e-5) assert res_gpu is x_gpu def test_add_diag_float32(self): self._impl_test_add_diag(np.float32) def test_add_diag_float64(self): self._impl_test_add_diag(np.float64) def test_add_diag_complex64(self): self._impl_test_add_diag(np.complex64) def test_add_diag_complex128(self): self._impl_test_add_diag(np.complex128) def test_eye_large_float32(self): N = 128 e_gpu = linalg.eye(N, dtype=np.float32) assert np.all(np.eye(N, dtype=np.float32) == e_gpu.get()) def _impl_test_trace(self, dtype): # square matrix x = 10*np.asarray(np.random.rand(4, 4), dtype) x_gpu = gpuarray.to_gpu(x) assert np.allclose(linalg.trace(x_gpu), np.trace(x)) # tall matrix x = np.asarray(np.random.rand(5, 2), dtype) x_gpu = gpuarray.to_gpu(x) assert np.allclose(linalg.trace(x_gpu), np.trace(x)) # fat matrix x = np.asarray(np.random.rand(2, 5), dtype) x_gpu = gpuarray.to_gpu(x) assert np.allclose(linalg.trace(x_gpu), np.trace(x)) def test_trace_float32(self): self._impl_test_trace(np.float32) def test_trace_float64(self): self._impl_test_trace(np.float64) def test_trace_complex64(self): self._impl_test_trace(np.complex64) def test_trace_complex128(self): self._impl_test_trace(np.complex128) def _impl_add_dot_matrix_tests(self, dtype, transa, transb): a = np.asarray(np.random.rand(4, 2), dtype) if transa == 'n': b = np.asarray(np.random.rand(2, 2), dtype) else: b = np.asarray(np.random.rand(4, 4), dtype) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) aa = a if transa == 'n' else a.T bb = b if transb == 'n' else b.T c = np.asarray(np.random.rand(aa.shape[0], bb.shape[1]), dtype) c_gpu = gpuarray.to_gpu(c) c_gpu = linalg.add_dot(a_gpu, b_gpu, c_gpu, transa, transb) assert np.allclose(c + np.dot(aa, bb), c_gpu.get()) a = a.astype(dtype, order="F", copy=True) b = b.astype(dtype, order="F", copy=True) c = c.astype(dtype, order="F", copy=True) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c_gpu = gpuarray.to_gpu(c) c_gpu = linalg.add_dot(a_gpu, b_gpu, c_gpu, transa, transb) assert np.allclose(c+np.dot(aa, bb), c_gpu.get()) def test_add_dot_matrix_float32(self): self._impl_add_dot_matrix_tests(np.float32, 'n', 'n') self._impl_add_dot_matrix_tests(np.float32, 'n', 't') self._impl_add_dot_matrix_tests(np.float32, 't', 'n') self._impl_add_dot_matrix_tests(np.float32, 't', 't') def test_add_dot_matrix_float64(self): self._impl_add_dot_matrix_tests(np.float64, 'n', 'n') self._impl_add_dot_matrix_tests(np.float64, 'n', 't') self._impl_add_dot_matrix_tests(np.float64, 't', 'n') self._impl_add_dot_matrix_tests(np.float64, 't', 't') def test_add_dot_matrix_complex64(self): self._impl_add_dot_matrix_tests(np.complex64, 'n', 'n') self._impl_add_dot_matrix_tests(np.complex64, 'n', 't') self._impl_add_dot_matrix_tests(np.complex64, 't', 'n') self._impl_add_dot_matrix_tests(np.complex64, 't', 't') def test_add_dot_matrix_complex128(self): self._impl_add_dot_matrix_tests(np.complex128, 'n', 'n') self._impl_add_dot_matrix_tests(np.complex128, 'n', 't') self._impl_add_dot_matrix_tests(np.complex128, 't', 'n') self._impl_add_dot_matrix_tests(np.complex128, 't', 't') def 
_impl_test_dot_strided(self, dtype): # n/n a = np.asarray(np.random.rand(4, 10), dtype) b = np.asarray(np.random.rand(2, 20), dtype) c = np.zeros((4, 30), dtype) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c_gpu = gpuarray.to_gpu(c) linalg.add_dot(a_gpu[:, 4:6], b_gpu[:, 2:8], c_gpu[:, 1:7], 'n', 'n') res = c_gpu.get() assert np.allclose(np.dot(a[:, 4:6], b[:, 2:8]), res[:, 1:7]) # t/n a = np.asarray(np.random.rand(4, 10), dtype) b = np.asarray(np.random.rand(4, 20), dtype) c = np.zeros((2, 30), dtype) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c_gpu = gpuarray.to_gpu(c) linalg.add_dot(a_gpu[:, 4:6], b_gpu[:, 2:8], c_gpu[:, 1:7], 't', 'n') res = c_gpu.get() assert np.allclose(np.dot(a[:, 4:6].T, b[:, 2:8]), res[:, 1:7]) # n/t a = np.asarray(np.random.rand(4, 10), dtype) b = np.asarray(np.random.rand(6, 20), dtype) c = np.zeros((4, 30), dtype) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c_gpu = gpuarray.to_gpu(c) linalg.add_dot(a_gpu[:, 4:10], b_gpu[:, 2:8], c_gpu[:, 1:7], 'n', 't') res = c_gpu.get() assert np.allclose(np.dot(a[:, 4:10], b[:, 2:8].T), res[:, 1:7]) # t/t a = np.asarray(np.random.rand(6, 10), dtype) b = np.asarray(np.random.rand(8, 20), dtype) c = np.zeros((2, 30), dtype) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c_gpu = gpuarray.to_gpu(c) linalg.add_dot(a_gpu[:, 4:6], b_gpu[:, 2:8], c_gpu[:, 1:9], 't', 't') res = c_gpu.get() assert np.allclose(np.dot(a[:, 4:6].T, b[:, 2:8].T), res[:, 1:9]) def test_dot_strided_float32(self): self._impl_test_dot_strided(np.float32) def test_dot_strided_float64(self): self._impl_test_dot_strided(np.float64) def test_dot_strided_complex64(self): self._impl_test_dot_strided(np.complex64) def test_dot_strided_complex128(self): self._impl_test_dot_strided(np.complex128) def _impl_test_det(self, dtype): # random matrix x = 10*np.asarray(np.random.rand(4, 4), dtype) x_gpu = gpuarray.to_gpu(x) assert np.allclose(linalg.det(x_gpu), np.linalg.det(x)) # known matrix (from http://en.wikipedia.org/wiki/Determinant ) x = np.asarray([[-2.0, 2, -3.0], [-1, 1, 3], [2, 0, -1]], dtype) x_gpu = gpuarray.to_gpu(x) assert np.allclose(linalg.det(x_gpu), 18.0) def test_det_float32(self): self._impl_test_det(np.float32) def test_det_float64(self): self._impl_test_det(np.float64) def test_det_complex64(self): self._impl_test_det(np.complex64) def test_det_complex128(self): self._impl_test_det(np.complex128) def test_qr_reduced_float32(self): a = np.asarray(np.random.randn(5, 3), np.float32, order='F') a_gpu = gpuarray.to_gpu(a) q_gpu, r_gpu = linalg.qr(a_gpu, 'reduced') assert np.allclose(a, np.dot(q_gpu.get(), r_gpu.get()), atol=1e-4) def test_qr_reduced_float64(self): a = np.asarray(np.random.randn(5, 3), np.float32, order='F') a_gpu = gpuarray.to_gpu(a) q_gpu, r_gpu = linalg.qr(a_gpu, 'reduced') assert np.allclose(a, np.dot(q_gpu.get(), r_gpu.get()), atol=atol_float64) def test_qr_reduced_complex64(self): a = np.asarray(np.random.randn(9, 6) + 1j*np.random.randn(9, 6), np.complex64, order='F') a_gpu = gpuarray.to_gpu(a) q_gpu, r_gpu = linalg.qr(a_gpu, 'reduced') assert np.allclose(a, np.dot(q_gpu.get(), r_gpu.get()), atol=1e-4) def test_qr_reduced_complex128(self): a = np.asarray(np.random.randn(9, 6) + 1j*np.random.randn(9, 6), np.complex64, order='F') a_gpu = gpuarray.to_gpu(a) q_gpu, r_gpu = linalg.qr(a_gpu, 'reduced') assert np.allclose(a, np.dot(q_gpu.get(), r_gpu.get()), atol=atol_float64) def test_eig_float32(self): a = np.asarray(np.random.rand(9, 9), np.float32, order='F') a_gpu = gpuarray.to_gpu(a) w_gpu = 
linalg.eig(a_gpu, 'N', 'N') assert np.allclose(np.trace(a), sum(w_gpu.get()), atol=1e-4) def test_eig_float64(self): a = np.asarray(np.random.rand(9, 9), np.float64, order='F') a_gpu = gpuarray.to_gpu(a) w_gpu = linalg.eig(a_gpu, 'N', 'N') assert np.allclose(np.trace(a), sum(w_gpu.get()), atol=atol_float64) def test_eig_complex64(self): a = np.asarray(np.random.rand(9, 9) + 1j*np.random.rand(9, 9), np.complex64, order='F') a_gpu = gpuarray.to_gpu(a) w_gpu = linalg.eig(a_gpu, 'N', 'N') assert np.allclose(np.trace(a), sum(w_gpu.get()), atol=1e-4) def test_eig_complex128(self): a = np.array(np.random.rand(9, 9) + 1j*np.random.rand(9,9), np.complex128, order='F') a_gpu = gpuarray.to_gpu(a) w_gpu = linalg.eig(a_gpu, 'N', 'N') assert np.allclose(np.trace(a), sum(w_gpu.get()), atol=atol_float64) def test_vander_float32(self): a = np.array(np.random.uniform(1,2,5), np.float32, order='F') a_gpu = gpuarray.to_gpu(a) vander_gpu = linalg.vander(a_gpu) assert np.allclose(np.fliplr(np.vander(a)), vander_gpu.get(), atol=atol_float32) def test_vander_float64(self): a = np.array(np.random.uniform(1,2,5), np.float64, order='F') a_gpu = gpuarray.to_gpu(a) vander_gpu = linalg.vander(a_gpu) assert np.allclose(np.fliplr(np.vander(a)), vander_gpu.get(), atol=atol_float64) def test_vander_complex64(self): a = np.array(np.random.uniform(1,2,5) + 1j*np.random.uniform(1,2,5), np.complex64, order='F') a_gpu = gpuarray.to_gpu(a) vander_gpu = linalg.vander(a_gpu) assert np.allclose(np.fliplr(np.vander(a)), vander_gpu.get(), atol=atol_float32) def test_vander_complex128(self): a = np.array(np.random.uniform(1,2,5) + 1j*np.random.uniform(1,2,5), np.complex128, order='F') a_gpu = gpuarray.to_gpu(a) vander_gpu = linalg.vander(a_gpu) assert np.allclose(np.fliplr(np.vander(a)), vander_gpu.get(), atol=atol_float64) def test_dmd_float32(self): m, n = 6, 4 a = np.array(np.fliplr(np.vander(np.random.rand(m)+1, n)), np.float32, order='F') a_gpu = gpuarray.to_gpu(a) f_gpu, b_gpu, v_gpu = linalg.dmd(a_gpu, modes='standard') assert np.allclose(a[:,:(n-1)], np.dot(f_gpu.get(), np.dot(np.diag(b_gpu.get()), v_gpu.get()) ), 1e-4) def test_dmd_float64(self): m, n = 9, 7 a = np.array(np.fliplr(np.vander(np.random.rand(m)+1, n)), np.float64, order='F') a_gpu = gpuarray.to_gpu(a) f_gpu, b_gpu, v_gpu = linalg.dmd(a_gpu, modes='standard') assert np.allclose(a[:,:(n-1)], np.dot(f_gpu.get(), np.dot(np.diag(b_gpu.get()), v_gpu.get()) ), atol_float64) def test_dmd_complex64(self): m, n = 9, 7 a = np.array(np.fliplr(np.vander(np.random.rand(m)+1, n)) + 1j*np.fliplr(np.vander(np.random.rand(m), n)), np.complex64, order='F') a_gpu = gpuarray.to_gpu(a) f_gpu, b_gpu, v_gpu = linalg.dmd(a_gpu, modes='standard') assert np.allclose(a[:,:(n-1)], np.dot(f_gpu.get(), np.dot(np.diag(b_gpu.get()), v_gpu.get()) ), 1e-4) def test_dmd_complex128(self): m, n = 9, 7 a = np.array(np.fliplr(np.vander(np.random.rand(m)+1, n)) + 1j*np.fliplr(np.vander(np.random.rand(m), n)), np.complex128, order='F') a_gpu = gpuarray.to_gpu(a) f_gpu, b_gpu, v_gpu = linalg.dmd(a_gpu, modes='standard') assert np.allclose(a[:,:(n-1)], np.dot(f_gpu.get(), np.dot(np.diag(b_gpu.get()), v_gpu.get()) ), atol_float64) def suite(): s = TestSuite() s.addTest(test_linalg('test_svd_ss_float32')) s.addTest(test_linalg('test_svd_ss_complex64')) s.addTest(test_linalg('test_svd_so_float32')) s.addTest(test_linalg('test_svd_so_complex64')) s.addTest(test_linalg('test_dot_matrix_float32')) s.addTest(test_linalg('test_dot_matrix_complex64')) s.addTest(test_linalg('test_dot_matrix_h_complex64')) 
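# --- Illustrative example (not part of the original test file) --------------
# The reconstruction check performed by the svd tests in this class, written
# as a standalone sketch; linalg.init() must be called first, as in setUp():
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.linalg as linalg

linalg.init()
a = np.random.randn(9, 6).astype(np.float32)
a_gpu = gpuarray.to_gpu(a)
u_gpu, s_gpu, vh_gpu = linalg.svd(a_gpu, 's', 's')       # reduced-size SVD
a_rec = np.dot(u_gpu.get(), np.dot(np.diag(s_gpu.get()), vh_gpu.get()))
assert np.allclose(a, a_rec, atol=1e-5)
# -----------------------------------------------------------------------------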
s.addTest(test_linalg('test_dot_vector_float32')) s.addTest(test_linalg('test_dot_vector_complex64')) s.addTest(test_linalg('test_mdot_matrix_float32')) s.addTest(test_linalg('test_mdot_matrix_complex64')) s.addTest(test_linalg('test_dot_diag_float32')) s.addTest(test_linalg('test_dot_diag_complex64')) s.addTest(test_linalg('test_dot_diag_t_float32')) s.addTest(test_linalg('test_dot_diag_t_complex64')) s.addTest(test_linalg('test_transpose_float32')) s.addTest(test_linalg('test_transpose_complex64')) s.addTest(test_linalg('test_hermitian_float32')) s.addTest(test_linalg('test_hermitian_complex64')) s.addTest(test_linalg('test_conj_complex64')) s.addTest(test_linalg('test_diag_1d_float32')) s.addTest(test_linalg('test_diag_2d_wide_float32')) s.addTest(test_linalg('test_diag_2d_tall_float32')) s.addTest(test_linalg('test_diag_1d_complex64')) s.addTest(test_linalg('test_diag_2d_wide_complex64')) s.addTest(test_linalg('test_diag_2d_tall_complex64')) s.addTest(test_linalg('test_eye_float32')) s.addTest(test_linalg('test_eye_complex64')) s.addTest(test_linalg('test_pinv_float32')) s.addTest(test_linalg('test_pinv_complex64')) s.addTest(test_linalg('test_tril_float32')) s.addTest(test_linalg('test_tril_complex64')) s.addTest(test_linalg('test_triu_float32')) s.addTest(test_linalg('test_triu_complex64')) s.addTest(test_linalg('test_multiply_float32')) s.addTest(test_linalg('test_multiply_complex64')) s.addTest(test_linalg('test_cho_factor_float32')) s.addTest(test_linalg('test_cho_solve_float32')) s.addTest(test_linalg('test_inv_float32')) s.addTest(test_linalg('test_inv_complex64')) s.addTest(test_linalg('test_add_diag_float32')) s.addTest(test_linalg('test_add_diag_complex64')) s.addTest(test_linalg('test_inv_exceptions')) s.addTest(test_linalg('test_eye_large_float32')) s.addTest(test_linalg('test_trace_float32')) s.addTest(test_linalg('test_trace_complex64')) s.addTest(test_linalg('test_add_dot_matrix_float32')) s.addTest(test_linalg('test_add_dot_matrix_complex64')) s.addTest(test_linalg('test_dot_strided_float32')) s.addTest(test_linalg('test_dot_strided_complex64')) s.addTest(test_linalg('test_det_float32')) s.addTest(test_linalg('test_det_complex64')) s.addTest(test_linalg('test_qr_reduced_float32')) s.addTest(test_linalg('test_qr_reduced_float64')) s.addTest(test_linalg('test_qr_reduced_complex64')) s.addTest(test_linalg('test_qr_reduced_complex128')) s.addTest(test_linalg('test_eig_float32')) s.addTest(test_linalg('test_eig_float64')) s.addTest(test_linalg('test_eig_complex64')) s.addTest(test_linalg('test_eig_complex128')) s.addTest(test_linalg('test_vander_float32')) s.addTest(test_linalg('test_vander_float64')) s.addTest(test_linalg('test_vander_complex64')) s.addTest(test_linalg('test_vander_complex128')) s.addTest(test_linalg('test_dmd_float32')) s.addTest(test_linalg('test_dmd_float64')) s.addTest(test_linalg('test_dmd_complex64')) s.addTest(test_linalg('test_dmd_complex128')) if misc.get_compute_capability(pycuda.autoinit.device) >= 1.3: s.addTest(test_linalg('test_svd_ss_float64')) s.addTest(test_linalg('test_svd_ss_complex128')) s.addTest(test_linalg('test_svd_so_float64')) s.addTest(test_linalg('test_svd_so_complex128')) s.addTest(test_linalg('test_dot_matrix_float64')) s.addTest(test_linalg('test_dot_matrix_complex128')) s.addTest(test_linalg('test_dot_matrix_h_complex128')) s.addTest(test_linalg('test_dot_vector_float64')) s.addTest(test_linalg('test_dot_vector_complex128')) s.addTest(test_linalg('test_mdot_matrix_float64')) 
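# --- Illustrative note (not part of the original test file) -----------------
# The addTest calls just above and below sit inside the
# `if misc.get_compute_capability(pycuda.autoinit.device) >= 1.3:` branch, so
# float64/complex128 cases are only scheduled on devices that support double
# precision (compute capability 1.3 or newer).  The same check can be made
# directly, e.g.:
#
#     import pycuda.autoinit
#     import skcuda.misc as misc
#     misc.get_compute_capability(pycuda.autoinit.device) >= 1.3
# -----------------------------------------------------------------------------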
        s.addTest(test_linalg('test_mdot_matrix_complex128'))
        s.addTest(test_linalg('test_dot_diag_t_float64'))
        s.addTest(test_linalg('test_dot_diag_t_complex128'))
        s.addTest(test_linalg('test_transpose_float64'))
        s.addTest(test_linalg('test_transpose_complex128'))
        s.addTest(test_linalg('test_hermitian_float64'))
        s.addTest(test_linalg('test_hermitian_complex64'))
        s.addTest(test_linalg('test_conj_complex128'))
        s.addTest(test_linalg('test_diag_1d_float64'))
        s.addTest(test_linalg('test_diag_2d_wide_float64'))
        s.addTest(test_linalg('test_diag_2d_tall_float64'))
        s.addTest(test_linalg('test_diag_1d_complex128'))
        s.addTest(test_linalg('test_diag_2d_wide_complex128'))
        s.addTest(test_linalg('test_diag_2d_tall_complex128'))
        s.addTest(test_linalg('test_eye_float64'))
        s.addTest(test_linalg('test_eye_complex128'))
        s.addTest(test_linalg('test_pinv_float64'))
        s.addTest(test_linalg('test_pinv_complex128'))
        s.addTest(test_linalg('test_tril_float64'))
        s.addTest(test_linalg('test_tril_complex128'))
        s.addTest(test_linalg('test_triu_float32'))
        s.addTest(test_linalg('test_triu_complex64'))
        s.addTest(test_linalg('test_multiply_float64'))
        s.addTest(test_linalg('test_multiply_complex128'))
        s.addTest(test_linalg('test_inv_float64'))
        s.addTest(test_linalg('test_inv_complex128'))
        s.addTest(test_linalg('test_add_diag_float64'))
        s.addTest(test_linalg('test_add_diag_complex128'))
        s.addTest(test_linalg('test_trace_float64'))
        s.addTest(test_linalg('test_trace_complex128'))
        s.addTest(test_linalg('test_add_dot_matrix_float64'))
        s.addTest(test_linalg('test_add_dot_matrix_complex128'))
        s.addTest(test_linalg('test_dot_strided_float64'))
        s.addTest(test_linalg('test_dot_strided_complex128'))
        s.addTest(test_linalg('test_det_float64'))
        s.addTest(test_linalg('test_det_complex128'))
        s.addTest(test_linalg('test_qr_reduced_float32'))
        s.addTest(test_linalg('test_qr_reduced_float64'))
        s.addTest(test_linalg('test_qr_reduced_complex64'))
        s.addTest(test_linalg('test_qr_reduced_complex128'))
        s.addTest(test_linalg('test_eig_float32'))
        s.addTest(test_linalg('test_eig_float64'))
        s.addTest(test_linalg('test_eig_complex64'))
        s.addTest(test_linalg('test_eig_complex128'))
        s.addTest(test_linalg('test_vander_float32'))
        s.addTest(test_linalg('test_vander_float64'))
        s.addTest(test_linalg('test_vander_complex64'))
        s.addTest(test_linalg('test_vander_complex128'))
        s.addTest(test_linalg('test_dmd_float32'))
        s.addTest(test_linalg('test_dmd_float64'))
        s.addTest(test_linalg('test_dmd_complex64'))
        s.addTest(test_linalg('test_dmd_complex128'))
    return s

if __name__ == '__main__':
    main(defaultTest = 'suite')

scikit-cuda-0.5.1/tests/test_misc.py

#!/usr/bin/env python

"""
Unit tests for scikits.cuda.misc
"""

from unittest import main, TestCase, TestSuite

import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
from numpy.testing import assert_raises

import skcuda.misc as misc

class test_misc(TestCase):
    def setUp(self):
        np.random.seed(0)
        misc.init()

    def test_maxabs_float32(self):
        x = np.array([-1, 2, -3], np.float32)
        x_gpu = gpuarray.to_gpu(x)
        m_gpu = misc.maxabs(x_gpu)
        assert np.allclose(m_gpu.get(), np.max(np.abs(x)))

    def test_maxabs_float64(self):
        x = np.array([-1, 2, -3], np.float64)
        x_gpu = gpuarray.to_gpu(x)
        m_gpu = misc.maxabs(x_gpu)
        assert np.allclose(m_gpu.get(), np.max(np.abs(x)))

    def test_maxabs_complex64(self):
        x = np.array([-1j, 2, -3j], np.complex64)
        x_gpu = gpuarray.to_gpu(x)
        m_gpu = misc.maxabs(x_gpu)
        assert np.allclose(m_gpu.get(), np.max(np.abs(x)))

    def test_maxabs_complex128(self):
        x = np.array([-1j, 2, -3j], np.complex128)
        x_gpu = gpuarray.to_gpu(x)
        m_gpu = misc.maxabs(x_gpu)
        assert np.allclose(m_gpu.get(), np.max(np.abs(x)))

    def test_cumsum_float32(self):
        x = np.array([1, 4, 3, 2, 8], np.float32)
        x_gpu = gpuarray.to_gpu(x)
        c_gpu = misc.cumsum(x_gpu)
        assert np.allclose(c_gpu.get(), np.cumsum(x))

    def test_cumsum_float64(self):
        x = np.array([1, 4, 3, 2, 8], np.float64)
        x_gpu = gpuarray.to_gpu(x)
        c_gpu = misc.cumsum(x_gpu)
        assert np.allclose(c_gpu.get(), np.cumsum(x))

    def test_cumsum_complex64(self):
        x = np.array([1, 4j, 3, 2j, 8], np.complex64)
        x_gpu = gpuarray.to_gpu(x)
        c_gpu = misc.cumsum(x_gpu)
        assert np.allclose(c_gpu.get(), np.cumsum(x))

    def test_cumsum_complex128(self):
        x = np.array([1, 4j, 3, 2j, 8], np.complex128)
        x_gpu = gpuarray.to_gpu(x)
        c_gpu = misc.cumsum(x_gpu)
        assert np.allclose(c_gpu.get(), np.cumsum(x))

    def test_diff_float32(self):
        x = np.array([1.3, 2.7, 4.9, 5.1], np.float32)
        x_gpu = gpuarray.to_gpu(x)
        y_gpu = misc.diff(x_gpu)
        assert np.allclose(y_gpu.get(), np.diff(x))

    def test_diff_float64(self):
        x = np.array([1.3, 2.7, 4.9, 5.1], np.float64)
        x_gpu = gpuarray.to_gpu(x)
        y_gpu = misc.diff(x_gpu)
        assert np.allclose(y_gpu.get(), np.diff(x))

    def test_diff_complex64(self):
        x = np.array([1.3+2.0j, 2.7-3.9j, 4.9+1.0j, 5.1-9.0j], np.complex64)
        x_gpu = gpuarray.to_gpu(x)
        y_gpu = misc.diff(x_gpu)
        assert np.allclose(y_gpu.get(), np.diff(x))

    def test_diff_complex128(self):
        x = np.array([1.3+2.0j, 2.7-3.9j, 4.9+1.0j, 5.1-9.0j], np.complex128)
        x_gpu = gpuarray.to_gpu(x)
        y_gpu = misc.diff(x_gpu)
        assert np.allclose(y_gpu.get(), np.diff(x))

    def test_get_by_index_float32(self):
        src = np.random.rand(5).astype(np.float32)
        src_gpu = gpuarray.to_gpu(src)
        ind = gpuarray.to_gpu(np.array([0, 2, 4]))
        res_gpu = misc.get_by_index(src_gpu, ind)
        assert np.allclose(res_gpu.get(), src[[0, 2, 4]])

        ind = gpuarray.to_gpu(np.array([], np.int64))
        res_gpu = misc.get_by_index(src_gpu, ind)
        assert len(res_gpu) == 0

    def test_get_by_index_float64(self):
        src = np.random.rand(5).astype(np.float64)
        src_gpu = gpuarray.to_gpu(src)
        ind = gpuarray.to_gpu(np.array([0, 2, 4]))
        res_gpu = misc.get_by_index(src_gpu, ind)
        assert np.allclose(res_gpu.get(), src[[0, 2, 4]])

        ind = gpuarray.to_gpu(np.array([], np.int64))
        res_gpu = misc.get_by_index(src_gpu, ind)
        assert len(res_gpu) == 0

    def test_set_by_index_dest_float32(self):
        dest_gpu = gpuarray.to_gpu(np.arange(5, dtype=np.float32))
        ind = gpuarray.to_gpu(np.array([0, 2, 4]))
        src_gpu = gpuarray.to_gpu(np.array([1, 1, 1], dtype=np.float32))
        misc.set_by_index(dest_gpu, ind, src_gpu, 'dest')
        assert np.allclose(dest_gpu.get(), np.array([1, 1, 1, 3, 1], dtype=np.float32))

        dest_gpu = gpuarray.to_gpu(np.arange(5, dtype=np.float32))
        ind = gpuarray.to_gpu(np.array([], np.int64))
        src_gpu = gpuarray.to_gpu(np.array([1, 1, 1], dtype=np.float32))
        misc.set_by_index(dest_gpu, ind, src_gpu, 'dest')
        assert np.allclose(dest_gpu.get(), np.arange(5, dtype=np.float32))

    def test_set_by_index_dest_float64(self):
        dest_gpu = gpuarray.to_gpu(np.arange(5, dtype=np.double))
        ind = gpuarray.to_gpu(np.array([0, 2, 4]))
        src_gpu = gpuarray.to_gpu(np.array([1, 1, 1], dtype=np.double))
        misc.set_by_index(dest_gpu, ind, src_gpu, 'dest')
        assert np.allclose(dest_gpu.get(), np.array([1, 1, 1, 3, 1], dtype=np.double))

        dest_gpu = gpuarray.to_gpu(np.arange(5, dtype=np.double))
        ind = gpuarray.to_gpu(np.array([], np.int64))
        src_gpu = gpuarray.to_gpu(np.array([1, 1, 1], dtype=np.double))
        misc.set_by_index(dest_gpu, ind, src_gpu, 'dest')
        assert np.allclose(dest_gpu.get(), np.arange(5, dtype=np.double))
    def test_set_by_index_src_float32(self):
        dest_gpu = gpuarray.to_gpu(np.zeros(3, dtype=np.float32))
        ind = gpuarray.to_gpu(np.array([0, 2, 4]))
        src_gpu = gpuarray.to_gpu(np.arange(5, dtype=np.float32))
        misc.set_by_index(dest_gpu, ind, src_gpu, 'src')
        assert np.allclose(dest_gpu.get(), np.array([0, 2, 4], dtype=np.float32))

        dest_gpu = gpuarray.to_gpu(np.arange(5, dtype=np.float32))
        ind = gpuarray.to_gpu(np.array([], np.int64))
        src_gpu = gpuarray.to_gpu(np.array([1, 1, 1], dtype=np.float32))
        misc.set_by_index(dest_gpu, ind, src_gpu, 'src')
        assert np.allclose(dest_gpu.get(), np.arange(5, dtype=np.float32))

    def test_set_by_index_src_float64(self):
        dest_gpu = gpuarray.to_gpu(np.zeros(3, dtype=np.double))
        ind = gpuarray.to_gpu(np.array([0, 2, 4]))
        src_gpu = gpuarray.to_gpu(np.arange(5, dtype=np.double))
        misc.set_by_index(dest_gpu, ind, src_gpu, 'src')
        assert np.allclose(dest_gpu.get(), np.array([0, 2, 4], dtype=np.double))

        dest_gpu = gpuarray.to_gpu(np.arange(5, dtype=np.double))
        ind = gpuarray.to_gpu(np.array([], np.int64))
        src_gpu = gpuarray.to_gpu(np.array([1, 1, 1], dtype=np.double))
        misc.set_by_index(dest_gpu, ind, src_gpu, 'src')
        assert np.allclose(dest_gpu.get(), np.arange(5, dtype=np.double))

    def impl_test_binaryop_matvec(self, dtype):
        x = np.random.normal(scale=5.0, size=(3, 5)).astype(dtype)
        a = np.random.normal(scale=5.0, size=(1, 5)).astype(dtype)
        b = np.random.normal(scale=5.0, size=(3, 1)).astype(dtype)
        # the following two test correct broadcasting on 0D vectors
        c = np.random.normal(scale=5.0, size=(5, )).astype(dtype)
        d = np.random.normal(scale=5.0, size=(3, )).astype(dtype)
        x_gpu = gpuarray.to_gpu(x)
        a_gpu = gpuarray.to_gpu(a)
        b_gpu = gpuarray.to_gpu(b)
        c_gpu = gpuarray.to_gpu(c)
        d_gpu = gpuarray.to_gpu(d)
        out = gpuarray.empty(x.shape, dtype=dtype)
        # addition
        res = misc.add_matvec(x_gpu, a_gpu, out=out).get()
        assert np.allclose(res, x+a)
        assert np.allclose(misc.add_matvec(x_gpu, b_gpu).get(), x+b)
        assert np.allclose(misc.add_matvec(x_gpu, c_gpu).get(), x+c)
        assert_raises(ValueError, misc.add_matvec, x_gpu, d_gpu)
        # multiplication
        res = misc.mult_matvec(x_gpu, a_gpu, out=out).get()
        assert np.allclose(res, x*a)
        assert np.allclose(misc.mult_matvec(x_gpu, b_gpu).get(), x*b)
        assert np.allclose(misc.mult_matvec(x_gpu, c_gpu).get(), x*c)
        assert_raises(ValueError, misc.mult_matvec, x_gpu, d_gpu)
        # division
        res = misc.div_matvec(x_gpu, a_gpu, out=out).get()
        assert np.allclose(res, x/a)
        assert np.allclose(misc.div_matvec(x_gpu, b_gpu).get(), x/b)
        assert np.allclose(misc.div_matvec(x_gpu, c_gpu).get(), x/c)
        assert_raises(ValueError, misc.div_matvec, x_gpu, d_gpu)

    def test_binaryop_matvec_float32(self):
        self.impl_test_binaryop_matvec(np.float32)

    def test_binaryop_matvec_float64(self):
        self.impl_test_binaryop_matvec(np.float64)

    def test_binaryop_matvec_complex64(self):
        self.impl_test_binaryop_matvec(np.complex64)

    def test_binaryop_matvec_complex128(self):
        self.impl_test_binaryop_matvec(np.complex128)

    def impl_test_sum(self, dtype):
        x = np.random.normal(scale=5.0, size=(3, 5))
        x = x.astype(dtype=dtype, order='C')
        x_gpu = gpuarray.to_gpu(x)
        assert np.allclose(misc.sum(x_gpu).get(), x.sum())
        assert np.allclose(misc.sum(x_gpu, axis=0).get(), x.sum(axis=0))
        assert np.allclose(misc.sum(x_gpu, axis=1).get(), x.sum(axis=1))
        x = x.astype(dtype=dtype, order='F')
        x_gpu = gpuarray.to_gpu(x)
        assert np.allclose(misc.sum(x_gpu).get(), x.sum())
        assert np.allclose(misc.sum(x_gpu, axis=0).get(), x.sum(axis=0))
        assert np.allclose(misc.sum(x_gpu, axis=1).get(), x.sum(axis=1))

    def test_sum_float32(self):
        self.impl_test_sum(np.float32)

    def test_sum_float64(self):
        self.impl_test_sum(np.float64)

    def test_sum_complex64(self):
        self.impl_test_sum(np.complex64)

    def test_sum_complex128(self):
        self.impl_test_sum(np.complex128)

    def impl_test_mean(self, dtype):
        x = np.random.normal(scale=5.0, size=(3, 5))
        x = x.astype(dtype=dtype, order='C')
        x_gpu = gpuarray.to_gpu(x)
        assert np.allclose(misc.mean(x_gpu).get(), x.mean())
        assert np.allclose(misc.mean(x_gpu, axis=0).get(), x.mean(axis=0))
        assert np.allclose(misc.mean(x_gpu, axis=1).get(), x.mean(axis=1))
        x = x.astype(dtype=dtype, order='F')
        x_gpu = gpuarray.to_gpu(x)
        assert np.allclose(misc.mean(x_gpu).get(), x.mean())
        assert np.allclose(misc.mean(x_gpu, axis=-1).get(), x.mean(axis=-1))
        assert np.allclose(misc.mean(x_gpu, axis=-2).get(), x.mean(axis=-2))

    def test_mean_float32(self):
        self.impl_test_mean(np.float32)

    def test_mean_float64(self):
        self.impl_test_mean(np.float64)

    def test_mean_complex64(self):
        self.impl_test_mean(np.complex64)

    def test_mean_complex128(self):
        self.impl_test_mean(np.complex128)

    def impl_test_var(self, dtype):
        x = np.random.normal(scale=5.0, size=(3, 5))
        x = x.astype(dtype=dtype, order='C')
        x_gpu = gpuarray.to_gpu(x)
        assert np.allclose(misc.var(x_gpu).get(), x.var())
        assert np.allclose(misc.var(x_gpu, axis=0).get(), x.var(axis=0))
        assert np.allclose(misc.var(x_gpu, axis=1).get(), x.var(axis=1))
        assert np.allclose(misc.var(x_gpu, ddof=1).get(), x.var(ddof=1))
        assert np.allclose(misc.var(x_gpu, ddof=1, axis=0).get(), x.var(ddof=1, axis=0))
        assert np.allclose(misc.var(x_gpu, ddof=1, axis=1).get(), x.var(ddof=1, axis=1))
        # Currently not working due to a bug in PyCUDA, see Issue #92
        #x = x.astype(dtype=dtype, order='F')
        #x_gpu = gpuarray.to_gpu(x)
        #assert np.allclose(misc.var(x_gpu).get(), x.var())
        #assert np.allclose(misc.var(x_gpu, axis=-1).get(), x.var(axis=-1))
        #assert np.allclose(misc.var(x_gpu, axis=-2).get(), x.var(axis=-2))

    def test_var_float32(self):
        self.impl_test_var(np.float32)

    def test_var_float64(self):
        self.impl_test_var(np.float64)

    def test_var_complex64(self):
        self.impl_test_var(np.complex64)

    def test_var_complex128(self):
        self.impl_test_var(np.complex128)

    def impl_test_std(self, dtype):
        x = np.random.normal(scale=5.0, size=(3, 5))
        x = x.astype(dtype=dtype, order='C')
        x_gpu = gpuarray.to_gpu(x)
        assert np.allclose(misc.std(x_gpu).get(), x.std())
        assert np.allclose(misc.std(x_gpu, axis=0).get(), x.std(axis=0))
        assert np.allclose(misc.std(x_gpu, axis=1).get(), x.std(axis=1))
        assert np.allclose(misc.std(x_gpu, ddof=1).get(), x.std(ddof=1))
        assert np.allclose(misc.std(x_gpu, ddof=1, axis=0).get(), x.std(ddof=1, axis=0))
        assert np.allclose(misc.std(x_gpu, ddof=1, axis=1).get(), x.std(ddof=1, axis=1))
        # Currently not working due to a bug in PyCUDA, see Issue #92
        #x = x.astype(dtype=dtype, order='F')
        #x_gpu = gpuarray.to_gpu(x)
        #assert np.allclose(misc.std(x_gpu).get(), x.std())
        #assert np.allclose(misc.std(x_gpu, axis=-1).get(), x.std(axis=-1))
        #assert np.allclose(misc.std(x_gpu, axis=-2).get(), x.std(axis=-2))

    def test_std_float32(self):
        self.impl_test_std(np.float32)

    def test_std_float64(self):
        self.impl_test_std(np.float64)

    def test_std_complex64(self):
        self.impl_test_std(np.complex64)

    def test_std_complex128(self):
        self.impl_test_std(np.complex128)

    def _impl_test_minmax(self, dtype):
        x = np.random.normal(scale=5.0, size=(3, 5))
        x = x.astype(dtype=dtype, order='C')
        x_gpu = gpuarray.to_gpu(x)
        assert np.allclose(misc.max(x_gpu, axis=0).get(), x.max(axis=0))
        assert np.allclose(misc.max(x_gpu, axis=1).get(), x.max(axis=1))
        assert np.allclose(misc.min(x_gpu, axis=0).get(), x.min(axis=0))
        assert np.allclose(misc.min(x_gpu, axis=1).get(), x.min(axis=1))
        x = x.astype(dtype=dtype, order='F')
        x_gpu = gpuarray.to_gpu(x)
        assert np.allclose(misc.max(x_gpu, axis=0).get(), x.max(axis=0))
        assert np.allclose(misc.max(x_gpu, axis=1).get(), x.max(axis=1))
        assert np.allclose(misc.min(x_gpu, axis=0).get(), x.min(axis=0))
        assert np.allclose(misc.min(x_gpu, axis=1).get(), x.min(axis=1))

    def test_minmax_float32(self):
        self._impl_test_minmax(np.float32)

    def test_minmax_float64(self):
        self._impl_test_minmax(np.float64)

    def _impl_test_argminmax(self, dtype):
        x = np.random.normal(scale=5.0, size=(3, 5))
        x = x.astype(dtype=dtype, order='C')
        x_gpu = gpuarray.to_gpu(x)
        assert np.allclose(misc.argmax(x_gpu, axis=0).get(), x.argmax(axis=0))
        assert np.allclose(misc.argmax(x_gpu, axis=1).get(), x.argmax(axis=1))
        assert np.allclose(misc.argmin(x_gpu, axis=0).get(), x.argmin(axis=0))
        assert np.allclose(misc.argmin(x_gpu, axis=1).get(), x.argmin(axis=1))
        x = x.astype(dtype=dtype, order='F')
        x_gpu = gpuarray.to_gpu(x)
        assert np.allclose(misc.argmax(x_gpu, axis=0).get(), x.argmax(axis=0))
        assert np.allclose(misc.argmax(x_gpu, axis=1).get(), x.argmax(axis=1))
        assert np.allclose(misc.argmin(x_gpu, axis=0).get(), x.argmin(axis=0))
        assert np.allclose(misc.argmin(x_gpu, axis=1).get(), x.argmin(axis=1))

    def test_argminmax_float32(self):
        self._impl_test_argminmax(np.float32)

    def test_argminmax_float64(self):
        self._impl_test_argminmax(np.float64)


def suite():
    s = TestSuite()
    s.addTest(test_misc('test_maxabs_float32'))
    s.addTest(test_misc('test_maxabs_complex64'))
    s.addTest(test_misc('test_cumsum_float32'))
    s.addTest(test_misc('test_cumsum_complex64'))
    s.addTest(test_misc('test_diff_float32'))
    s.addTest(test_misc('test_diff_complex64'))
    s.addTest(test_misc('test_get_by_index_float32'))
    s.addTest(test_misc('test_set_by_index_dest_float32'))
    s.addTest(test_misc('test_set_by_index_src_float32'))
    s.addTest(test_misc('test_binaryop_matvec_float32'))
    s.addTest(test_misc('test_binaryop_matvec_complex64'))
    s.addTest(test_misc('test_sum_float32'))
    s.addTest(test_misc('test_sum_complex64'))
    s.addTest(test_misc('test_mean_float32'))
    s.addTest(test_misc('test_mean_complex64'))
    s.addTest(test_misc('test_var_float32'))
    s.addTest(test_misc('test_var_complex64'))
    s.addTest(test_misc('test_std_float32'))
    s.addTest(test_misc('test_std_complex64'))
    s.addTest(test_misc('test_minmax_float32'))
    s.addTest(test_misc('test_argminmax_float32'))
    if misc.get_compute_capability(pycuda.autoinit.device) >= 1.3:
        s.addTest(test_misc('test_maxabs_float64'))
        s.addTest(test_misc('test_maxabs_complex128'))
        s.addTest(test_misc('test_cumsum_float64'))
        s.addTest(test_misc('test_cumsum_complex128'))
        s.addTest(test_misc('test_diff_float64'))
        s.addTest(test_misc('test_diff_complex128'))
        s.addTest(test_misc('test_get_by_index_float32'))
        s.addTest(test_misc('test_set_by_index_dest_float64'))
        s.addTest(test_misc('test_set_by_index_src_float64'))
        s.addTest(test_misc('test_sum_float64'))
        s.addTest(test_misc('test_sum_complex128'))
        s.addTest(test_misc('test_mean_float64'))
        s.addTest(test_misc('test_mean_complex128'))
        s.addTest(test_misc('test_binaryop_matvec_float64'))
        s.addTest(test_misc('test_binaryop_matvec_complex128'))
        s.addTest(test_misc('test_var_float64'))
        s.addTest(test_misc('test_var_complex128'))
        s.addTest(test_misc('test_std_float64'))
        s.addTest(test_misc('test_std_complex128'))
        s.addTest(test_misc('test_minmax_float64'))
        s.addTest(test_misc('test_argminmax_float64'))
    return s

if __name__ == '__main__':
    main(defaultTest = 'suite')

scikit-cuda-0.5.1/tests/test_rlinalg.py

#!/usr/bin/env python

"""
Unit tests for scikits.cuda.rlinalg
"""

from unittest import main, makeSuite, TestCase, TestSuite

import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
from numpy.testing import assert_raises

import skcuda.linalg as linalg
import skcuda.rlinalg as rlinalg
import skcuda.misc as misc

atol_float32 = 1e-4
atol_float64 = 1e-8

class test_rlinalg(TestCase):
    def setUp(self):
        np.random.seed(0)
        linalg.init()
        rlinalg.init()

    def test_rsvd_float32(self):
        m, n = 5, 4
        a = np.array(np.random.randn(m, n), np.float32, order='F')
        a_gpu = gpuarray.to_gpu(a)
        U, s, Vt = rlinalg.rsvd(a_gpu, k=n, p=0, q=2, method='standard')
        assert np.allclose(a, np.dot(U.get(), np.dot(np.diag(s.get()), Vt.get())), atol_float32)

    def test_rsvd_float64(self):
        m, n = 5, 4
        a = np.array(np.random.randn(m, n), np.float64, order='F')
        a_gpu = gpuarray.to_gpu(a)
        U, s, Vt = rlinalg.rsvd(a_gpu, k=n, p=0, q=2, method='standard')
        assert np.allclose(a, np.dot(U.get(), np.dot(np.diag(s.get()), Vt.get())), atol_float64)

    def test_rsvd_complex64(self):
        m, n = 5, 4
        a = np.array(np.random.randn(m, n) + 1j*np.random.randn(m, n), np.complex64, order='F')
        a_gpu = gpuarray.to_gpu(a)
        U, s, Vt = rlinalg.rsvd(a_gpu, k=n, p=0, q=2, method='standard')
        assert np.allclose(a, np.dot(U.get(), np.dot(np.diag(s.get()), Vt.get())), atol_float32)

    def test_rsvd_complex128(self):
        m, n = 5, 4
        a = np.array(np.random.randn(m, n) + 1j*np.random.randn(m, n), np.complex128, order='F')
        a_gpu = gpuarray.to_gpu(a)
        U, s, Vt = rlinalg.rsvd(a_gpu, k=n, p=0, q=2, method='standard')
        assert np.allclose(a, np.dot(U.get(), np.dot(np.diag(s.get()), Vt.get())), atol_float64)

    def test_rsvdf_float32(self):
        m, n = 5, 4
        a = np.array(np.random.randn(m, n), np.float32, order='F')
        a_gpu = gpuarray.to_gpu(a)
        U, s, Vt = rlinalg.rsvd(a_gpu, k=n, p=0, q=2, method='fast')
        assert np.allclose(a, np.dot(U.get(), np.dot(np.diag(s.get()), Vt.get())), atol_float32)

    def test_rsvdf_float64(self):
        m, n = 5, 4
        a = np.array(np.random.randn(m, n), np.float64, order='F')
        a_gpu = gpuarray.to_gpu(a)
        U, s, Vt = rlinalg.rsvd(a_gpu, k=n, p=0, q=2, method='fast')
        assert np.allclose(a, np.dot(U.get(), np.dot(np.diag(s.get()), Vt.get())), atol_float64)

    def test_rsvdf_complex64(self):
        m, n = 5, 4
        a = np.array(np.random.randn(m, n) + 1j*np.random.randn(m, n), np.complex64, order='F')
        a_gpu = gpuarray.to_gpu(a)
        U, s, Vt = rlinalg.rsvd(a_gpu, k=n, p=0, q=2, method='fast')
        assert np.allclose(a, np.dot(U.get(), np.dot(np.diag(s.get()), Vt.get())), atol_float32)

    def test_rsvdf_complex128(self):
        m, n = 5, 4
        a = np.array(np.random.randn(m, n) + 1j*np.random.randn(m, n), np.complex128, order='F')
        a_gpu = gpuarray.to_gpu(a)
        U, s, Vt = rlinalg.rsvd(a_gpu, k=n, p=0, q=2, method='fast')
        assert np.allclose(a, np.dot(U.get(), np.dot(np.diag(s.get()), Vt.get())), atol_float64)

    def test_rdmd_float32(self):
        m, n = 6, 4
        a = np.array(np.fliplr(np.vander(np.random.rand(m)+1, n)), np.float32, order='F')
        a_gpu = gpuarray.to_gpu(a)
        f_gpu, b_gpu, v_gpu = rlinalg.rdmd(a_gpu, k=(n-1), p=0, q=2, modes='standard')
        assert np.allclose(a[:,:(n-1)], np.dot(f_gpu.get(), np.dot(np.diag(b_gpu.get()), v_gpu.get())), atol_float32)

    def test_rdmd_float64(self):
        m, n = 9, 7
        a = np.array(np.fliplr(np.vander(np.random.rand(m)+1, n)), np.float64, order='F')
        a_gpu = gpuarray.to_gpu(a)
        f_gpu, b_gpu, v_gpu = rlinalg.rdmd(a_gpu, k=(n-1), p=0, q=1, modes='standard')
        assert np.allclose(a[:,:(n-1)], np.dot(f_gpu.get(), np.dot(np.diag(b_gpu.get()), v_gpu.get())), atol_float64)

    def test_rdmd_complex64(self):
        m, n = 9, 7
        a = np.array(np.fliplr(np.vander(np.random.rand(m)+1, n)) + 1j*np.fliplr(np.vander(np.random.rand(m)+1, n)), np.complex64, order='F')
        a_gpu = gpuarray.to_gpu(a)
        f_gpu, b_gpu, v_gpu = rlinalg.rdmd(a_gpu, k=(n-1), p=0, q=1, modes='standard')
        assert np.allclose(a[:,:(n-1)], np.dot(f_gpu.get(), np.dot(np.diag(b_gpu.get()), v_gpu.get())), atol_float32)

    def test_rdmd_complex128(self):
        m, n = 9, 7
        a = np.array(np.fliplr(np.vander(np.random.rand(m)+1, n)) + 1j*np.fliplr(np.vander(np.random.rand(m)+1, n)), np.complex128, order='F')
        a_gpu = gpuarray.to_gpu(a)
        f_gpu, b_gpu, v_gpu = rlinalg.rdmd(a_gpu, k=(n-1), p=0, q=1, modes='standard')
        assert np.allclose(a[:,:(n-1)], np.dot(f_gpu.get(), np.dot(np.diag(b_gpu.get()), v_gpu.get())), atol_float64)


def suite():
    # Note: the tests below belong to the test_rlinalg class defined in this
    # file; the original suite() mistakenly referenced test_linalg, which is
    # not defined here and would raise a NameError.
    s = TestSuite()
    s.addTest(test_rlinalg('test_rsvd_float32'))
    s.addTest(test_rlinalg('test_rsvd_float64'))
    s.addTest(test_rlinalg('test_rsvd_complex64'))
    s.addTest(test_rlinalg('test_rsvd_complex128'))
    s.addTest(test_rlinalg('test_rsvdf_float32'))
    s.addTest(test_rlinalg('test_rsvdf_float64'))
    s.addTest(test_rlinalg('test_rsvdf_complex64'))
    s.addTest(test_rlinalg('test_rsvdf_complex128'))
    s.addTest(test_rlinalg('test_rdmd_float32'))
    s.addTest(test_rlinalg('test_rdmd_float64'))
    s.addTest(test_rlinalg('test_rdmd_complex64'))
    s.addTest(test_rlinalg('test_rdmd_complex128'))
    if misc.get_compute_capability(pycuda.autoinit.device) >= 1.3:
        s.addTest(test_rlinalg('test_rsvd_float32'))
        s.addTest(test_rlinalg('test_rsvd_float64'))
        s.addTest(test_rlinalg('test_rsvd_complex64'))
        s.addTest(test_rlinalg('test_rsvd_complex128'))
        s.addTest(test_rlinalg('test_rsvdf_float32'))
        s.addTest(test_rlinalg('test_rsvdf_float64'))
        s.addTest(test_rlinalg('test_rsvdf_complex64'))
        s.addTest(test_rlinalg('test_rsvdf_complex128'))
        s.addTest(test_rlinalg('test_rdmd_float32'))
        s.addTest(test_rlinalg('test_rdmd_float64'))
        s.addTest(test_rlinalg('test_rdmd_complex64'))
        s.addTest(test_rlinalg('test_rdmd_complex128'))
    return s

if __name__ == '__main__':
    main(defaultTest = 'suite')

scikit-cuda-0.5.1/tests/test_special.py

#!/usr/bin/env python

"""
Unit tests for scikits.cuda.special
"""

from unittest import main, makeSuite, TestCase, TestSuite

import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import scipy as sp
import scipy.special

import skcuda.linalg as linalg
import skcuda.misc as misc
import skcuda.special as special

class test_special(TestCase):
    def setUp(self):
        np.random.seed(0)
        linalg.init()

    def test_sici_float32(self):
        x = np.array([[1, 2], [3, 4]], np.float32)
        x_gpu = gpuarray.to_gpu(x)
        (si_gpu, ci_gpu) = special.sici(x_gpu)
        (si, ci) = scipy.special.sici(x)
        assert np.allclose(si, si_gpu.get())
        assert np.allclose(ci, ci_gpu.get())

    def test_sici_float64(self):
        x = np.array([[1, 2], [3, 4]], np.float64)
        x_gpu = gpuarray.to_gpu(x)
        (si_gpu, ci_gpu) = special.sici(x_gpu)
        (si, ci) = scipy.special.sici(x)
        assert np.allclose(si, si_gpu.get())
        assert np.allclose(ci, ci_gpu.get())

    def test_exp1_complex64(self):
        z = np.asarray(np.random.rand(4, 4) + 1j*np.random.rand(4, 4), np.complex64)
        z_gpu = gpuarray.to_gpu(z)
        e_gpu = special.exp1(z_gpu)
        assert np.allclose(sp.special.exp1(z), e_gpu.get())

    def test_exp1_complex128(self):
        z = np.asarray(np.random.rand(4, 4) + 1j*np.random.rand(4, 4), np.complex128)
        z_gpu = gpuarray.to_gpu(z)
        e_gpu = special.exp1(z_gpu)
        assert np.allclose(sp.special.exp1(z), e_gpu.get())

    def test_expi_complex64(self):
        z = np.asarray(np.random.rand(4, 4) + 1j*np.random.rand(4, 4), np.complex64)
        z_gpu = gpuarray.to_gpu(z)
        e_gpu = special.expi(z_gpu)
        assert np.allclose(sp.special.expi(z), e_gpu.get())

    def test_expi_complex128(self):
        z = np.asarray(np.random.rand(4, 4) + 1j*np.random.rand(4, 4), np.complex128)
        z_gpu = gpuarray.to_gpu(z)
        e_gpu = special.expi(z_gpu)
        assert np.allclose(sp.special.expi(z), e_gpu.get())


def suite():
    s = TestSuite()
    s.addTest(test_special('test_sici_float32'))
    s.addTest(test_special('test_exp1_complex64'))
    s.addTest(test_special('test_expi_complex64'))
    if misc.get_compute_capability(pycuda.autoinit.device) >= 1.3:
        s.addTest(test_special('test_sici_float64'))
        s.addTest(test_special('test_exp1_complex128'))
        s.addTest(test_special('test_expi_complex128'))
    return s

if __name__ == '__main__':
    main(defaultTest = 'suite')

scikit-cuda-0.5.1/tox.ini

[tox]
envlist = py27, py33, py34
[testenv]
deps = nose
commands = nosetests
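The tox configuration above drives the suites through nose. As a reference only, the short sketch below shows one way the same per-module suite() functions could be aggregated and run with the stock unittest runner instead; the bare test_* import names and the combined-suite approach are assumptions for illustration and are not part of the package.

#!/usr/bin/env python
# Minimal sketch (assumption, not shipped with scikit-cuda): collect the
# suite() functions exposed by each test module and run them with the
# standard unittest text runner, bypassing nose/tox.
from unittest import TestSuite, TextTestRunner

# Assumed import paths: run from within the tests/ directory so the modules
# are importable by their bare names.
import test_linalg
import test_misc
import test_rlinalg
import test_special

if __name__ == '__main__':
    combined = TestSuite()
    for mod in (test_linalg, test_misc, test_rlinalg, test_special):
        # Each module defines a suite() function that honors the device's
        # compute capability when selecting double-precision tests.
        combined.addTest(mod.suite())
    TextTestRunner(verbosity=2).run(combined)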