pyopencl

python でも OpenCL が手軽に実行できます。　　

C 言語で、OpenCL が利用できない場合には、まず、C 言語で OpenCL が利用できる環境を整えてください。　　

その上で、以下のように、pyopencl をインストールすると、python から OpenCL が扱えるようになります。

これは、Google Colaboratory でも実行できます(ランタイムタイプを GPU にして利用して下さい)。

!pip install pyopencl

デバイスと環境の調査

正常に pyopencl が利用できる環境で以下のプログラムを実行すると clinfo のような環境情報が表示されます。

import pyopencl as cl

for platform in cl.get_platforms():
	print("===================================================")
	print("Platform Name               ", platform.name)
	print("Platform Vendor             ", platform.vendor)
	print("Platform Version            ", platform.version)
	print("Platform Profile            ", platform.profile)
	print("Platform Extensions         ", platform.extensions)
	for device in platform.get_devices():
		print("---------------------------------------------------")
		print("Device Name                 ", device.name)
		print("Device Vendor               ", device.vendor)
		print("Device Version              ", device.version)
		print("Driver Version              ", device.driver_version)
		print("Device OpenCL C Version     ", device.opencl_c_version)
		print("Device Type                 ", cl.device_type.to_string(device.type))
		print("Device Profile              ", device.profile)
		print("Max compute units           ", device.max_compute_units)
		print("Max clock frequency         ", device.max_clock_frequency, 'MHz')
		print("Compute Capability          ", device.compute_capability_major_nv, '.', device.compute_capability_minor_nv)
		print("Device memory               ", device.global_mem_size//1024//1024, 'MB')
		print("Max work item dimensions    ", device.max_work_item_dimensions)
		print("Max work item size          ", device.max_work_item_sizes)
		print("Max work group size         ", device.max_work_group_size)
		print("Warp size (NV)              ", device.warp_size_nv)

===================================================
Platform Name                NVIDIA CUDA
Platform Vendor              NVIDIA Corporation
Platform Version             OpenCL 1.1 CUDA 6.5.51
Platform Profile             FULL_PROFILE
Platform Extensions          cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll cl_nv_copy_opts 
---------------------------------------------------
Device Name                  GeForce 9400M
Device Vendor                NVIDIA Corporation
Device Version               OpenCL 1.0 CUDA
Driver Version               340.108
Device OpenCL C Version      OpenCL C 1.0 
Device Type                  ALL | GPU
Device Profile               FULL_PROFILE
Max compute units            2
Max clock frequency          1100 MHz
Compute Capability           1 . 1
Device memory                253 MB
Max work item dimensions     3
Max work item size           [512, 512, 64]
Max work group size          512
Warp size (NV)               32

行列積の実行

以下のプログラムで、numpy と OpenCL の両方で、整数と実数の場合の行列積を計算します。

python のヒアドキュメントの機能により、OpenCL のカーネルプログラムがベタで記述できるのは良いです。

import numpy as np
import pyopencl as cl
import time

size = 1024

src = '''//CL//
__kernel void matmul(
	__global const float* a,
	__global const float* b,
	__global float* c,
			 const int n)
{
	const int i = get_global_id(0);
	const int j = get_global_id(1);
	float tmp = 0.0;

	for(int k = 0; k < n; k++) {
		tmp += a[j * n + k] * b[k * n + i];
	}
	c[j * n + i] = tmp; 
}
'''

print('Matrix multiplication float size : ', size)

a = np.random.rand(size, size).astype(np.float32)
b = np.random.rand(size, size).astype(np.float32)
a = 2.0 * a - 1.0
b = 2.0 * b - 1.0

tmst = time.time()
c = np.matmul(a, b)
tmed = time.time()

print('Numpy  time : ', tmed - tmst)

contx = cl.create_some_context(interactive=False)
queue = cl.CommandQueue(contx)
prgrm = cl.Program(contx, src).build()
mf = cl.mem_flags
abuf = cl.Buffer(contx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
bbuf = cl.Buffer(contx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
cbuf = cl.Buffer(contx, mf.WRITE_ONLY, c.nbytes)
n = np.int32(size)

tmst = time.time()
event = prgrm.matmul(queue, c.shape, None, abuf, bbuf, cbuf, n)
event.wait()
tmed = time.time()

cout = np.empty_like(c)
cl.enqueue_copy(queue, cout, cbuf)

print('OpenCL time : ', tmed - tmst)
print('Max Min result diff : ', np.max(cout - c), ',', np.min(cout - c))



src = '''//CL//
__kernel void matmul(
	__global const int* a,
	__global const int* b,
	__global int* c,
			 const int n)
{
	const int i = get_global_id(0);
	const int j = get_global_id(1);
	int tmp = 0.0;

	for(int k = 0; k < n; k++) {
		tmp += a[j * n + k] * b[k * n + i];
	}
	c[j * n + i] = tmp; 
}
'''

print('Matrix multiplication integer size : ', size)

a = np.random.randint(-256, 256, (size, size)).astype(np.int32)
b = np.random.randint(-256, 256, (size, size)).astype(np.int32)

tmst = time.time()
c = np.matmul(a, b)
tmed = time.time()

print('Numpy  time : ', tmed - tmst)

contx = cl.create_some_context(interactive=False)
queue = cl.CommandQueue(contx)
prgrm = cl.Program(contx, src).build()
mf = cl.mem_flags
abuf = cl.Buffer(contx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
bbuf = cl.Buffer(contx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
cbuf = cl.Buffer(contx, mf.WRITE_ONLY, c.nbytes)
n = np.int32(size)

tmst = time.time()
event = prgrm.matmul(queue, c.shape, None, abuf, bbuf, cbuf, n)
event.wait()
tmed = time.time()

cout = np.empty_like(c)
cl.enqueue_copy(queue, cout, cbuf)

print('OpenCL time : ', tmed - tmst)
print('Max Min result diff : ', np.max(cout - c), ',', np.min(cout - c))

Matrix multiplication float size :  1024
Numpy  time :  0.11262202262878418
OpenCL time :  1.0116124153137207
Max Min result diff :  8.392334e-05 , -6.67572e-05
Matrix multiplication integer size :  1024
Numpy  time :  17.078879594802856
OpenCL time :  0.9658496379852295
Max Min result diff :  0 , 0

ご利用の計算機環境によりますが、numpy は整数の計算が遅いというのは言えそうです。

マンデルブロ集合とジュリア集合

マンデルブロ集合とジュリア集合を OpenCL を使って計算し、表示してみます。

import numpy as np
import pyopencl as cl
import time
from PIL import Image

src = '''//CL
__kernel void mandel(
	__global uchar* out,
	const float zr,
	const float zi,
	const float zd,
	const float cr, 
	const float ci, 
	const float cd 
)
{
	const float threshold = 4.0;
	const int max = 256;
	const int i = get_global_id(1);
	const int j = get_global_id(0);
	float wr = zr + zd * i;
	float wi = zi + zd * j;
	float ar = cr + cd * i;
	float ai = ci + cd * j;
	float wr2, wi2, wri;
	int k;
	for(k = 0; k < max; k++) {
		wri = wr * wi;
		wr2 = wr * wr;
		wi2 = wi * wi;
		wr  = wr2 - wi2 + ar;
		wi  = wri + wri + ai;
		if(wr2 + wi2 > threshold) {
			break;
		}
	}
	if(k > 255){
		k = 0;
	}
	const int idx = (j * get_global_size(1) + i) * 3;
	const int im = 8;
	out[idx + 2] =  (k % im) * 256 / im;
	out[idx + 1] = ((k / im) % im) * 256 / im;
	out[idx + 0] = ((k / im  / im) % im) * 256 / im;
}
'''

width = 640
height = 360
zr  = np.float32(0.0)
zi  = np.float32(0.0)
zd  = np.float32(0.0)
cr  = np.float32(-2.5)
ci  = np.float32(-1.1)
cd  = np.float32(0.006)
#zr  = np.float32(-2.0)
#zi  = np.float32(-1.1)
#zd  = np.float32(0.006)
#cr  = np.float32(-0.4)
#ci  = np.float32(-0.6)
#cd  = np.float32(0.0)
out = np.empty((height, width, 3), np.uint8)

contx = cl.create_some_context(interactive=False)
queue = cl.CommandQueue(contx)
prgrm = cl.Program(contx, src).build()
obuf  = cl.Buffer(contx, cl.mem_flags.WRITE_ONLY, out.nbytes)
timst = time.time()
event = prgrm.mandel(queue, (height, width), None, obuf, zr, zi, zd, cr, ci, cd)
event.wait()
timed = time.time()
cl.enqueue_copy(queue, out, obuf)
print('time : ', timed - timst)
image = Image.fromarray(out)
image.show()
image

time :  0.020096540451049805

pygame でアニメーション　【番外編】

前述のように、OpenCL を使うと高速で計算できることがわかりました。
そこで、pygame を使って、キーボードで押されたキーによってパラメータを変更して、アニメーションを表示するようにしました。
利用するキーは、カーソルキー、a, s, d, w, z, x, n, m, ESCキーです。

※　これは、番外編のため、jupyter lab や notebook では、表示できません。また、pygame のインストールも必要です。

import numpy as np
import pyopencl as cl
from PIL import Image
import pygame as pg

src = '''//CL
__kernel void mandel(
	__global uchar* out,
	const float zr,
	const float zi,
	const float zd,
	const float cr, 
	const float ci, 
	const float cd 
)
{
	const float threshold = 4.0;
	const int max = 256;
	const int i = get_global_id(1);
	const int j = get_global_id(0);
	float wr = zr + zd * i;
	float wi = zi + zd * j;
	float ar = cr + cd * i;
	float ai = ci + cd * j;
	float wr2, wi2, wri;
	int k;
	for(k = 0; k < max; k++) {
		wri = wr * wi;
		wr2 = wr * wr;
		wi2 = wi * wi;
		wr  = wr2 - wi2 + ar;
		wi  = wri + wri + ai;
		if(wr2 + wi2 > threshold) {
			break;
		}
	}
	if(k > 255){
		k = 0;
	}
	const int idx = (j * get_global_size(1) + i) * 3;
	const int im = 8;
	out[idx + 2] =  (k % im) * 256 / im;
	out[idx + 1] = ((k / im) % im) * 256 / im;
	out[idx + 0] = ((k / im  / im) % im) * 256 / im;
}
'''

def main():
	width = 640
	height = 360
	zr  = np.float32(0.0)
	zi  = np.float32(0.0)
	zd  = np.float32(0.0)
	cr  = np.float32(-2.5)
	ci  = np.float32(-1.1)
	cd  = np.float32(0.006)
#	zr  = np.float32(-2.0)
#	zi  = np.float32(-1.1)
#	zd  = np.float32(0.006)
#	cr  = np.float32(-0.4)
#	ci  = np.float32(-0.6)
#	cd  = np.float32(0.0)
	out = np.empty((height, width, 3), np.uint8)

	contx = cl.create_some_context(interactive=False)
	queue = cl.CommandQueue(contx)
	prgrm = cl.Program(contx, src).build()
	obuf  = cl.Buffer(contx, cl.mem_flags.WRITE_ONLY, out.nbytes)
	event = prgrm.mandel(queue, (height, width), None, obuf, zr, zi, zd, cr, ci, cd)
	event.wait()
	cl.enqueue_copy(queue, out, obuf)
	image = Image.fromarray(out)
	#image
	#image.show()
	pg.init()
	pg.key.set_repeat(50, 50)
	screen = pg.display.set_mode((width, height))
	raw_str = image.tobytes("raw", 'RGB')
	surface = pg.image.fromstring(raw_str, (width, height), 'RGB')
	screen.blit(surface, (0,0))
	pg.display.update()
	update = False
	while True:
		for event in pg.event.get():
			if event.type == pg.QUIT:
				pg.quit()
				return
			if event.type == pg.KEYDOWN:
				if event.key == pg.K_ESCAPE:
					pg.quit()
					return
				update = True
				if event.key == pg.K_z:
					zd = np.float32(zd + 0.0001)
				if event.key == pg.K_x:
					zd = np.float32(zd - 0.0001)
				if event.key == pg.K_w:
					zi = np.float32(zi + 0.01)
				if event.key == pg.K_s:
					zi = np.float32(zi - 0.01)
				if event.key == pg.K_a:
					zr = np.float32(zr + 0.01)
				if event.key == pg.K_d:
					zr = np.float32(zr - 0.01)
				if event.key == pg.K_n:
					cd = np.float32(cd + 0.0001)
				if event.key == pg.K_m:
					cd = np.float32(cd - 0.0001)
				if event.key == pg.K_UP:
					ci = np.float32(ci + 0.01)
				if event.key == pg.K_DOWN:
					ci = np.float32(ci - 0.01)
				if event.key == pg.K_LEFT:
					cr = np.float32(cr + 0.01)
				if event.key == pg.K_RIGHT:
					cr = np.float32(cr - 0.01)
		if update :
			update = False
			event = prgrm.mandel(queue, (height, width), None, obuf, zr, zi, zd, cr, ci, cd)
			event.wait()
			cl.enqueue_copy(queue, out, obuf)
			image = Image.fromarray(out)
			raw_str = image.tobytes("raw", 'RGB')
			surface = pg.image.fromstring(raw_str, (width, height), 'RGB')
			screen.blit(surface, (0,0))
			pg.display.update()
			print(zr, zi, zd, cr, ci, cd)
	pg.quit()

if __name__ == '__main__':
	main()

アニメーションのテスト実行のビデオです。
マンデルブロ集合からジュリア集合へパラメータを徐々に変えました。

pyopencl

デバイスと環境の調査

行列積の実行

マンデルブロ集合とジュリア集合

pygame でアニメーション 【番外編】

pygame でアニメーション　【番外編】