PyOpenCLでBitonicSort(備忘録)
Connected-Component Labelingの実装中にソートが必要だったのでPyOpenCL環境で記述.
pyopencl.algorithm下にあるっぽいのですが, どの道最終的にHLSLに移植する際には自分で記述しないといけないので書きました.
Connected-Component Labelingの実装の途中に記述してあるのを無理やり引っこ抜いてきたので謎の変数名が散見されますが, 備忘録ということでひとつ.
※注意※
実装方法に問題があるため基数ソートが安定ソートになっていません
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pyopencl as cl | |
import numpy as np | |
ctx = cl.create_some_context() | |
queue = cl.CommandQueue(ctx) | |
length = np.int32(256) | |
labelElements = np.zeros(length * 2, dtype=np.int32) | |
length_Buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, length.nbytes, hostbuf=length) | |
sortedElements_Buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, labelElements.nbytes) | |
sequence = np.empty(2, dtype=np.int32) | |
sequence_Buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, sequence.nbytes) | |
code_bitonicSort = ''' | |
#define GROUP_SIZE_X 16 | |
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE_X, 1, 1))) | |
void firstSort( | |
__constant int *input, | |
__constant int *length, | |
__global int *result, | |
__global int *sequence | |
){ | |
int thid = get_global_id(0); | |
int grid = get_local_id(0); | |
int gridx = get_group_id(0); | |
__local int local_data[(GROUP_SIZE_X * 2) * 2]; | |
local_data[(2 * grid) * 2] = input[(2 * thid) * 2]; //index | |
local_data[(2 * grid) * 2 + 1] = input[(2 * thid) * 2 + 1]; //key | |
local_data[(2 * grid + 1) * 2] = input[(2 * thid + 1) * 2]; //index | |
local_data[(2 * grid + 1) * 2 + 1] = input[(2 * thid + 1) * 2 + 1];//key | |
barrier(CLK_LOCAL_MEM_FENCE); | |
__local int temp[GROUP_SIZE_X * 2]; | |
//========================================================================= | |
// Radix Sort | |
//========================================================================= | |
for(int k=0; k<32; k++){ | |
int bit1 = (local_data[(2 * grid) * 2 + 1] >> k) & 1; | |
int bit2 = (local_data[(2 * grid + 1) * 2 + 1] >> k) & 1; | |
temp[2 * grid] = bit1 ^ 1; //0と1を反転 | |
temp[2 * grid + 1] = bit2 ^ 1; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
int total = temp[(2 * GROUP_SIZE_X) - 1]; | |
//===================================================================== | |
// Scan | |
//===================================================================== | |
int offset = 1; | |
for(int i=(2 * GROUP_SIZE_X)>>1; i>0; i>>=1){ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if(grid < i){ | |
int ai = offset*(2*grid+1)-1; | |
int bi = offset*(2*grid+2)-1; | |
temp[bi]+=temp[ai]; | |
} | |
offset <<= 1; | |
} | |
if(grid==0) temp[(2 * GROUP_SIZE_X) - 1] = 0; | |
for(int i=1; i<(2 * GROUP_SIZE_X); i<<=1){ | |
offset >>= 1; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if(grid < i){ | |
int ai = offset*(2*grid+1)-1; | |
int bi = offset*(2*grid+2)-1; | |
int t = temp[ai]; | |
temp[ai] = temp[bi]; | |
temp[bi] += t; | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
total += temp[(2 * GROUP_SIZE_X) - 1]; | |
//===================================================================== | |
// Sort | |
//===================================================================== | |
int t1 = (2 * grid) - temp[(2 * grid)] + total; | |
int t2 = (2 * grid + 1) - temp[(2 * grid + 1)] + total; | |
temp[(2 * grid)] = bit1 ? t1 : temp[(2 * grid)]; | |
temp[(2 * grid + 1)] = bit2 ? t2 : temp[(2 * grid + 1)]; | |
int2 d1 = (int2)(local_data[(2 * grid) * 2], local_data[(2 * grid) * 2 + 1]); | |
int2 d2 = (int2)(local_data[(2 * grid + 1) * 2], local_data[(2 * grid + 1) * 2 + 1]); | |
barrier(CLK_LOCAL_MEM_FENCE); | |
local_data[temp[(2 * grid)] * 2] = d1.x; | |
local_data[temp[(2 * grid)] * 2 + 1] = d1.y; | |
local_data[temp[(2 * grid + 1)] * 2] = d2.x; | |
local_data[temp[(2 * grid + 1)] * 2 + 1] = d2.y; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
if(gridx%2==0){ | |
result[(2 * thid) * 2] = local_data[(2 * grid) * 2]; | |
result[(2 * thid) * 2 + 1] = local_data[(2 * grid) * 2 + 1]; | |
result[(2 * thid + 1) * 2] = local_data[(2 * grid + 1) * 2]; | |
result[(2 * thid + 1) * 2 + 1] = local_data[(2 * grid + 1) * 2 + 1]; | |
} | |
else{ | |
result[(2 * thid) * 2] = local_data[(2 * ((GROUP_SIZE_X - 1) - grid) + 1) * 2]; | |
result[(2 * thid) * 2 + 1] = local_data[(2 * ((GROUP_SIZE_X - 1) - grid) + 1) * 2 + 1]; | |
result[(2 * thid + 1) * 2] = local_data[(2 * ((GROUP_SIZE_X - 1) - grid)) * 2]; | |
result[(2 * thid + 1) * 2 + 1] = local_data[(2 * ((GROUP_SIZE_X - 1) - grid)) * 2 + 1]; | |
} | |
// シーケンスを設定 | |
if(thid==0){ | |
sequence[0] = 4; | |
sequence[1] = sequence[0] >> 1; | |
} | |
} | |
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE_X, 1, 1))) | |
void bitonicSort( | |
/*__constant int *input, */ | |
__constant int *length, | |
__global int *result, | |
__global int *sequence | |
){ | |
int thid = get_global_id(0); | |
int grid = get_local_id(0); | |
int gridx = get_group_id(0); | |
int dir = 1;//ソートの方向. 1で小->大, 0で大->小 | |
__local int local_data[(GROUP_SIZE_X * 2) * 2]; | |
__local int temp[GROUP_SIZE_X * 2]; | |
//========================================================================= | |
// ソートするブロックと方向を指定 | |
//========================================================================= | |
// 方向 | |
dir = ((int)(gridx / (sequence[0] >> 1)) & 1) ^ 1;// 1グループあたり2列を担当することに注意 | |
int seq[2] = {sequence[0], sequence[1]}; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
// 一つ目のブロック | |
local_data[grid * 2] | |
= result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2]; //index | |
local_data[grid * 2 + 1] | |
= result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2 + 1]; //key | |
// 二つ目のブロック | |
local_data[(grid + GROUP_SIZE_X) * 2] | |
= result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2]; //index | |
local_data[(grid + GROUP_SIZE_X) * 2 + 1] | |
= result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2 + 1];//key | |
barrier(CLK_LOCAL_MEM_FENCE); | |
for(int k=0; k<32; k++){ | |
int bit1 = (local_data[(2 * grid) * 2 + 1] >> k) & 1; | |
int bit2 = (local_data[(2 * grid + 1) * 2 + 1] >> k) & 1; | |
temp[2 * grid] = bit1 ^ 1; | |
temp[2 * grid + 1] = bit2 ^ 1; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
int total = temp[(2 * GROUP_SIZE_X) - 1]; | |
//===================================================================== | |
// Scan | |
//===================================================================== | |
int offset = 1; | |
for(int i=(2 * GROUP_SIZE_X)>>1; i>0; i>>=1){ | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if(grid < i){ | |
int ai = offset*(2*grid+1)-1; | |
int bi = offset*(2*grid+2)-1; | |
temp[bi]+=temp[ai]; | |
} | |
offset <<= 1; | |
} | |
if(grid==0) temp[(2 * GROUP_SIZE_X) - 1] = 0; | |
for(int i=1; i<(2 * GROUP_SIZE_X); i<<=1){ | |
offset >>= 1; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if(grid < i){ | |
int ai = offset*(2*grid+1)-1; | |
int bi = offset*(2*grid+2)-1; | |
int t = temp[ai]; | |
temp[ai] = temp[bi]; | |
temp[bi] += t; | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
total += temp[(2 * GROUP_SIZE_X) - 1]; | |
//===================================================================== | |
// Sort | |
//===================================================================== | |
int t1 = (2 * grid) - temp[(2 * grid)] + total; | |
int t2 = (2 * grid + 1) - temp[(2 * grid + 1)] + total; | |
temp[(2 * grid)] = bit1 ? t1 : temp[(2 * grid)]; | |
temp[(2 * grid + 1)] = bit2 ? t2 : temp[(2 * grid + 1)]; | |
int2 d1 = (int2)(local_data[(2 * grid) * 2], local_data[(2 * grid) * 2 + 1]); | |
int2 d2 = (int2)(local_data[(2 * grid + 1) * 2], local_data[(2 * grid + 1) * 2 + 1]); | |
barrier(CLK_LOCAL_MEM_FENCE); | |
local_data[temp[(2 * grid)] * 2] = d1.x; | |
local_data[temp[(2 * grid)] * 2 + 1] = d1.y; | |
local_data[temp[(2 * grid + 1)] * 2] = d2.x; | |
local_data[temp[(2 * grid + 1)] * 2 + 1] = d2.y; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
//========================================================================= | |
// 結果を格納 | |
//========================================================================= | |
if(dir){ | |
// 一つ目のブロック | |
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2] | |
= local_data[grid * 2]; //index | |
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2 + 1] | |
= local_data[grid * 2 + 1]; //key | |
// 二つ目のブロック | |
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2] | |
= local_data[(grid + GROUP_SIZE_X) * 2]; //index | |
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2 + 1] | |
= local_data[(grid + GROUP_SIZE_X) * 2 + 1]; //key | |
} | |
else{ | |
// 上と同様(逆順) | |
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2] | |
= local_data[(2 * GROUP_SIZE_X - 1 - grid) * 2]; | |
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1])) * GROUP_SIZE_X + grid) * 2 + 1] | |
= local_data[(2 * GROUP_SIZE_X - 1 - grid) * 2 + 1]; | |
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2] | |
= local_data[(GROUP_SIZE_X - 1 - grid) * 2]; | |
result[(((int)(gridx / seq[1]) * (seq[1] * 2) + (gridx % seq[1]) + seq[1]) * GROUP_SIZE_X + grid) * 2 + 1] | |
= local_data[(GROUP_SIZE_X - 1 - grid) * 2 + 1]; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
// シーケンスの更新 | |
if(thid == 0){ | |
if(sequence[1] > 1){ | |
sequence[1] >>= 1; | |
} | |
else{ | |
sequence[0] <<= 1; | |
sequence[1] = sequence[0] >> 1; | |
} | |
} | |
} | |
''' | |
prg_bitonicSort = cl.Program(ctx, code_bitonicSort).build() | |
randomArray = np.empty(2*length, dtype=np.int32) | |
randomArray[0:length*2:2] = np.arange(1, length+1, 1) | |
randomArray[1:length*2+1:2] = np.random.randint(0, 1000, length) | |
print(randomArray) | |
randomArray_Buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, randomArray.nbytes, hostbuf=randomArray) | |
event_firstSort = prg_bitonicSort.firstSort(queue, | |
[length // 2], [16], | |
randomArray_Buf, length_Buf, sortedElements_Buf, sequence_Buf) | |
event_firstSort.wait() | |
sortedElements = np.empty(labelElements.shape, dtype=np.int32) | |
cl.enqueue_copy(queue, sortedElements, sortedElements_Buf) | |
print(sortedElements) | |
for i in range(sum(range(2, int(np.log2(length//16)) + 1))): | |
event_bitonicSort = prg_bitonicSort.bitonicSort(queue, | |
[length // 2], [16], | |
length_Buf, sortedElements_Buf, sequence_Buf) | |
event_bitonicSort.wait() | |
cl.enqueue_copy(queue, sortedElements, sortedElements_Buf) | |
print(sortedElements) |
Unityだと文字を出力するタイプのプログラムのデバッグはやや面倒(というよりは冗長?)かと思ったのでPyOpenCL(Python+OpenCL)を使ってみました.
デバイス側のコードは結局CライクなのでHLSL触っていれば導入コストは低いと思います.
ドキュメンテーションがしっかりしているのも良い点.
PyOpenCLの方は, 私の検索が甘かったのかもしれませんが, あまり実装例が見当たらなかった印象.
Pythonを初めて触りましたが, Numpyがやや面倒でした.
参考
http://t-pot.com/program/90_BitonicSort/index.html (2016/09/26)
http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html (2016/09/26)