From 8501d5649e3b139b48c66df063e790ad0ab95a3d Mon Sep 17 00:00:00 2001 From: Joe Groff Date: Fri, 16 Apr 2010 15:16:17 -0700 Subject: [PATCH] cuda: add a single-block prefix sum sample kernel (so doug has a more substantive kernel to test with) --- extra/cuda/prefix-sum.cu | 103 ++++++++++++++++++ extra/cuda/prefix-sum.ptx | 222 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 325 insertions(+) create mode 100644 extra/cuda/prefix-sum.cu create mode 100644 extra/cuda/prefix-sum.ptx diff --git a/extra/cuda/prefix-sum.cu b/extra/cuda/prefix-sum.cu new file mode 100644 index 0000000000..a77a67f035 --- /dev/null +++ b/extra/cuda/prefix-sum.cu @@ -0,0 +1,103 @@ +#include +#include +#include + +static const int LOG_BANK_COUNT = 4; + +static inline __device__ __host__ unsigned shared_offset(unsigned i) +{ + return i + (i >> LOG_BANK_COUNT); +} + +static inline __device__ __host__ unsigned offset_a(unsigned offset, unsigned i) +{ + return shared_offset(offset * (2*i + 1) - 1); +} + +static inline __device__ __host__ unsigned offset_b(unsigned offset, unsigned i) +{ + return shared_offset(offset * (2*i + 2) - 1); +} + +static inline __device__ __host__ unsigned lpot(unsigned x) +{ + --x; x |= x>>1; x|=x>>2; x|=x>>4; x|=x>>8; x|=x>>16; return ++x; +} + +template +__global__ void prefix_sum_block(T *in, T *out, unsigned n) +{ + extern __shared__ T temp[]; + + int idx = threadIdx.x; + int blocksize = blockDim.x; + + temp[shared_offset(idx )] = (idx < n) ? in[idx ] : 0; + temp[shared_offset(idx + blocksize)] = (idx + blocksize < n) ? in[idx + blocksize] : 0; + + int offset, d; + for (offset = 1, d = blocksize; d > 0; d >>= 1, offset <<= 1) { + __syncthreads(); + if (idx < d) { + unsigned a = offset_a(offset, idx), b = offset_b(offset, idx); + temp[b] += temp[a]; + } + } + + if (idx == 0) temp[shared_offset(blocksize*2 - 1)] = 0; + + for (d = 1; d <= blocksize; d <<= 1) { + offset >>= 1; + __syncthreads(); + + if (idx < d) { + unsigned a = offset_a(offset, idx), b = offset_b(offset, idx); + unsigned t = temp[a]; + temp[a] = temp[b]; + temp[b] += t; + } + } + __syncthreads(); + + if (idx < n) out[idx ] = temp[shared_offset(idx )]; + if (idx + blocksize < n) out[idx + blocksize] = temp[shared_offset(idx + blocksize)]; +} + +template +void prefix_sum(T *in, T *out, unsigned n) +{ + char *device_values; + unsigned n_lpot = lpot(n); + size_t n_pitch; + + cudaError_t error = cudaMallocPitch((void**)&device_values, &n_pitch, sizeof(T)*n, 2); + if (error != 0) { + printf("error %u allocating width %lu height %u\n", error, sizeof(T)*n, 2); + exit(1); + } + + cudaMemcpy(device_values, in, sizeof(T)*n, cudaMemcpyHostToDevice); + + prefix_sum_block<<<1, n_lpot/2, shared_offset(n_lpot)*sizeof(T)>>> + ((T*)device_values, (T*)(device_values + n_pitch), n); + + cudaMemcpy(out, device_values + n_pitch, sizeof(T)*n, cudaMemcpyDeviceToHost); + cudaFree(device_values); +} + +int main() +{ + sranddev(); + + static unsigned in_values[1024], out_values[1024]; + + for (int i = 0; i < 1024; ++i) + in_values[i] = rand() >> 21; + + prefix_sum(in_values, out_values, 1024); + + for (int i = 0; i < 1024; ++i) + printf("%5d => %5d\n", in_values[i], out_values[i]); + + return 0; +} diff --git a/extra/cuda/prefix-sum.ptx b/extra/cuda/prefix-sum.ptx new file mode 100644 index 0000000000..d18917965d --- /dev/null +++ b/extra/cuda/prefix-sum.ptx @@ -0,0 +1,222 @@ + .version 1.4 + .target sm_10, map_f64_to_f32 + // compiled with /usr/local/cuda/bin/../open64/lib//be + // nvopencc 3.0 built on 2010-03-11 + + //----------------------------------------------------------- + // Compiling /tmp/tmpxft_00000236_00000000-7_prefix-sum.cpp3.i (/var/folders/K6/K6oI14wZ2RWhSE+BYqTjA++++TI/-Tmp-/ccBI#.0ATpGM) + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Options: + //----------------------------------------------------------- + // Target:ptx, ISA:sm_10, Endian:little, Pointer Size:32 + // -O3 (Optimization level) + // -g0 (Debug level) + // -m2 (Report advisories) + //----------------------------------------------------------- + + .file 1 "" + .file 2 "/tmp/tmpxft_00000236_00000000-6_prefix-sum.cudafe2.gpu" + .file 3 "/usr/lib/gcc/i686-apple-darwin10/4.2.1/include/stddef.h" + .file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h" + .file 5 "/usr/local/cuda/bin/../include/host_defines.h" + .file 6 "/usr/local/cuda/bin/../include/builtin_types.h" + .file 7 "/usr/local/cuda/bin/../include/device_types.h" + .file 8 "/usr/local/cuda/bin/../include/driver_types.h" + .file 9 "/usr/local/cuda/bin/../include/texture_types.h" + .file 10 "/usr/local/cuda/bin/../include/vector_types.h" + .file 11 "/usr/local/cuda/bin/../include/device_launch_parameters.h" + .file 12 "/usr/local/cuda/bin/../include/crt/storage_class.h" + .file 13 "/usr/include/i386/_types.h" + .file 14 "/usr/include/time.h" + .file 15 "prefix-sum.cu" + .file 16 "/usr/local/cuda/bin/../include/common_functions.h" + .file 17 "/usr/local/cuda/bin/../include/crt/func_macro.h" + .file 18 "/usr/local/cuda/bin/../include/math_functions.h" + .file 19 "/usr/local/cuda/bin/../include/device_functions.h" + .file 20 "/usr/local/cuda/bin/../include/math_constants.h" + .file 21 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h" + .file 22 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h" + .file 23 "/usr/local/cuda/bin/../include/sm_13_double_functions.h" + .file 24 "/usr/local/cuda/bin/../include/common_types.h" + .file 25 "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h" + .file 26 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h" + .file 27 "/usr/local/cuda/bin/../include/texture_fetch_functions.h" + .file 28 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h" + + .extern .shared .align 4 .b8 temp[]; + + .entry _Z16prefix_sum_blockIjEvPT_S1_j ( + .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in, + .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out, + .param .u32 __cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n) + { + .reg .u32 %r<81>; + .reg .pred %p<11>; + .loc 15 28 0 +$LBB1__Z16prefix_sum_blockIjEvPT_S1_j: + ld.param.u32 %r1, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n]; + cvt.s32.u16 %r2, %tid.x; + setp.lt.u32 %p1, %r2, %r1; + @!%p1 bra $Lt_0_7938; + .loc 15 35 0 + ld.param.u32 %r3, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in]; + mul24.lo.u32 %r4, %r2, 4; + add.u32 %r5, %r3, %r4; + ld.global.u32 %r6, [%r5+0]; + bra.uni $Lt_0_7682; +$Lt_0_7938: + mov.u32 %r6, 0; +$Lt_0_7682: + mov.u32 %r7, temp; + shr.u32 %r8, %r2, 4; + add.u32 %r9, %r2, %r8; + mul.lo.u32 %r10, %r9, 4; + add.u32 %r11, %r10, %r7; + st.shared.u32 [%r11+0], %r6; + cvt.s32.u16 %r12, %ntid.x; + add.s32 %r13, %r12, %r2; + .loc 15 28 0 + ld.param.u32 %r1, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_n]; + .loc 15 35 0 + setp.lt.u32 %p2, %r13, %r1; + @!%p2 bra $Lt_0_8450; + .loc 15 36 0 + ld.param.u32 %r14, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_in]; + mul.lo.u32 %r15, %r13, 4; + add.u32 %r16, %r14, %r15; + ld.global.u32 %r17, [%r16+0]; + bra.uni $Lt_0_8194; +$Lt_0_8450: + mov.u32 %r17, 0; +$Lt_0_8194: + shr.u32 %r18, %r13, 4; + add.u32 %r19, %r13, %r18; + mul.lo.u32 %r20, %r19, 4; + add.u32 %r21, %r20, %r7; + st.shared.u32 [%r21+0], %r17; + .loc 15 39 0 + mov.s32 %r22, %r12; + mov.u32 %r23, 0; + setp.le.s32 %p3, %r12, %r23; + mov.s32 %r24, 1; + @%p3 bra $Lt_0_13314; +$Lt_0_9218: + // Loop body line 39, nesting depth: 1, estimated iterations: unknown + .loc 15 40 0 + bar.sync 0; + setp.le.s32 %p4, %r22, %r2; + @%p4 bra $Lt_0_9474; + // Part of loop body line 39, head labeled $Lt_0_9218 + .loc 15 43 0 + mul24.lo.u32 %r25, %r2, 2; + add.u32 %r26, %r25, 1; + add.u32 %r27, %r25, 2; + mul.lo.u32 %r28, %r24, %r26; + mul.lo.u32 %r29, %r24, %r27; + sub.u32 %r30, %r29, 1; + shr.u32 %r31, %r30, 4; + add.u32 %r32, %r29, %r31; + mul.lo.u32 %r33, %r32, 4; + add.u32 %r34, %r33, %r7; + ld.shared.u32 %r35, [%r34+-4]; + sub.u32 %r36, %r28, 1; + shr.u32 %r37, %r36, 4; + add.u32 %r38, %r28, %r37; + mul.lo.u32 %r39, %r38, 4; + add.u32 %r40, %r7, %r39; + ld.shared.u32 %r41, [%r40+-4]; + add.u32 %r42, %r35, %r41; + st.shared.u32 [%r34+-4], %r42; +$Lt_0_9474: + // Part of loop body line 39, head labeled $Lt_0_9218 + .loc 15 39 0 + shr.s32 %r22, %r22, 1; + shl.b32 %r24, %r24, 1; + mov.u32 %r43, 0; + setp.gt.s32 %p5, %r22, %r43; + @%p5 bra $Lt_0_9218; + bra.uni $Lt_0_8706; +$Lt_0_13314: +$Lt_0_8706: + mov.u32 %r44, 0; + setp.ne.s32 %p6, %r2, %r44; + @%p6 bra $Lt_0_10242; + .loc 15 47 0 + mul24.lo.s32 %r45, %r12, 2; + mov.u32 %r46, 0; + sub.u32 %r47, %r45, 1; + shr.u32 %r48, %r47, 4; + add.u32 %r49, %r45, %r48; + mul.lo.u32 %r50, %r49, 4; + add.u32 %r51, %r7, %r50; + st.shared.u32 [%r51+-4], %r46; +$Lt_0_10242: + mov.u32 %r52, 1; + setp.lt.s32 %p7, %r12, %r52; + @%p7 bra $Lt_0_10754; + mov.s32 %r22, 1; +$Lt_0_11266: + // Loop body line 47, nesting depth: 1, estimated iterations: unknown + .loc 15 50 0 + shr.s32 %r24, %r24, 1; + .loc 15 51 0 + bar.sync 0; + setp.le.s32 %p8, %r22, %r2; + @%p8 bra $Lt_0_11522; + // Part of loop body line 47, head labeled $Lt_0_11266 + .loc 15 55 0 + mul24.lo.u32 %r53, %r2, 2; + add.u32 %r54, %r53, 1; + mul.lo.u32 %r55, %r24, %r54; + sub.u32 %r56, %r55, 1; + shr.u32 %r57, %r56, 4; + add.u32 %r58, %r55, %r57; + mul.lo.u32 %r59, %r58, 4; + add.u32 %r60, %r59, %r7; + ld.shared.u32 %r61, [%r60+-4]; + .loc 15 56 0 + add.u32 %r62, %r53, 2; + mul.lo.u32 %r63, %r24, %r62; + sub.u32 %r64, %r63, 1; + shr.u32 %r65, %r64, 4; + add.u32 %r66, %r63, %r65; + mul.lo.u32 %r67, %r66, 4; + add.u32 %r68, %r67, %r7; + ld.shared.u32 %r69, [%r68+-4]; + st.shared.u32 [%r60+-4], %r69; + .loc 15 57 0 + ld.shared.u32 %r70, [%r68+-4]; + add.u32 %r71, %r70, %r61; + st.shared.u32 [%r68+-4], %r71; +$Lt_0_11522: + // Part of loop body line 47, head labeled $Lt_0_11266 + .loc 15 49 0 + shl.b32 %r22, %r22, 1; + setp.le.s32 %p9, %r22, %r12; + @%p9 bra $Lt_0_11266; +$Lt_0_10754: + .loc 15 60 0 + bar.sync 0; + @!%p1 bra $Lt_0_12290; + .loc 15 62 0 + ld.shared.u32 %r72, [%r11+0]; + ld.param.u32 %r73, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out]; + mul24.lo.u32 %r74, %r2, 4; + add.u32 %r75, %r73, %r74; + st.global.u32 [%r75+0], %r72; +$Lt_0_12290: + @!%p2 bra $Lt_0_12802; + .loc 15 63 0 + ld.shared.u32 %r76, [%r21+0]; + ld.param.u32 %r77, [__cudaparm__Z16prefix_sum_blockIjEvPT_S1_j_out]; + mul.lo.u32 %r78, %r13, 4; + add.u32 %r79, %r77, %r78; + st.global.u32 [%r79+0], %r76; +$Lt_0_12802: + .loc 15 64 0 + exit; +$LDWend__Z16prefix_sum_blockIjEvPT_S1_j: + } // _Z16prefix_sum_blockIjEvPT_S1_j +