From c392ff27188d89e72025840396836111296e6ee9 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Wed, 14 Apr 2010 21:09:16 -0500 Subject: [PATCH] Working on a CUDA api. Add a hello world program. --- extra/cuda/cuda.factor | 303 +++++++++++++++++++++++++++++++++++----- extra/cuda/ffi/tags.txt | 1 + extra/cuda/hello.cu | 65 +++++++++ extra/cuda/hello.ptx | 71 ++++++++++ 4 files changed, 404 insertions(+), 36 deletions(-) create mode 100644 extra/cuda/hello.cu create mode 100644 extra/cuda/hello.ptx diff --git a/extra/cuda/cuda.factor b/extra/cuda/cuda.factor index 887740d542..6b343fb1cc 100644 --- a/extra/cuda/cuda.factor +++ b/extra/cuda/cuda.factor @@ -1,17 +1,24 @@ ! Copyright (C) 2010 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. -USING: alien.c-types alien.data assocs classes.struct -combinators continuations cuda.ffi fry io.backend kernel -sequences ; +USING: accessors alien alien.c-types alien.data alien.parser +alien.strings arrays assocs byte-arrays classes.struct +combinators continuations cuda.ffi destructors fry io +io.backend io.encodings.string io.encodings.utf8 kernel lexer +locals math math.parser namespaces opengl.gl.extensions +prettyprint quotations sequences ; IN: cuda +SYMBOL: cuda-device +SYMBOL: cuda-context +SYMBOL: cuda-module +SYMBOL: cuda-function +SYMBOL: cuda-launcher +SYMBOL: cuda-memory-hashtable + ERROR: throw-cuda-error n ; : cuda-error ( n -- ) - { - { CUDA_SUCCESS [ ] } - [ throw-cuda-error ] - } case ; + dup CUDA_SUCCESS = [ drop ] [ throw-cuda-error ] if ; : cuda-version ( -- n ) int [ cuDriverGetVersion cuda-error ] keep *int ; @@ -19,32 +26,10 @@ ERROR: throw-cuda-error n ; : init-cuda ( -- ) 0 cuInit cuda-error ; -: with-cuda ( quot -- ) - init-cuda [ ] [ ] cleanup ; inline - - [ cuDeviceGetCount cuda-error ] keep *int ; - -: n>cuda-device ( n -- device ) - [ CUdevice ] dip [ cuDeviceGet cuda-error ] 2keep drop *int ; - -: enumerate-cuda-devices ( -- devices ) - #cuda-devices iota [ n>cuda-device ] map ; - -: cuda-device>properties ( device -- properties ) - [ CUdevprop ] dip - [ cuDeviceGetProperties cuda-error ] 2keep drop - CUdevprop memory>struct ; - -: cuda-device-properties ( -- properties ) - enumerate-cuda-devices [ cuda-device>properties ] map ; - -PRIVATE> - -: cuda-devices ( -- assoc ) - enumerate-cuda-devices [ dup cuda-device>properties ] { } map>assoc ; +TUPLE: launcher +{ device integer initial: 0 } +{ device-flags initial: 0 } +path block-shape shared-size grid ; : with-cuda-context ( flags device quot -- ) [ @@ -65,13 +50,259 @@ PRIVATE> [ drop '[ _ cuModuleUnload cuda-error ] ] 2bi [ ] cleanup ; inline -: get-cuda-function ( module string -- function ) +: with-cuda-program ( flags device path quot -- ) + [ dup cuda-device set ] 2dip + '[ + cuda-context set + _ [ + cuda-module set + _ call + ] with-cuda-module + ] with-cuda-context ; inline + +: with-cuda ( launcher quot -- ) + [ + init-cuda + H{ } clone cuda-memory-hashtable + ] 2dip '[ + _ + [ cuda-launcher set ] + [ [ device>> ] [ device-flags>> ] [ path>> ] tri ] bi + _ with-cuda-program + ] with-variable ; inline + + [ cuDeviceGetCount cuda-error ] keep *int ; + +: n>cuda-device ( n -- device ) + [ CUdevice ] dip [ cuDeviceGet cuda-error ] 2keep drop *int ; + +: enumerate-cuda-devices ( -- devices ) + #cuda-devices iota [ n>cuda-device ] map ; + +: cuda-device-properties ( device -- properties ) + [ CUdevprop ] dip + [ cuDeviceGetProperties cuda-error ] 2keep drop + CUdevprop memory>struct ; + +PRIVATE> + +: cuda-devices ( -- assoc ) + enumerate-cuda-devices [ dup cuda-device-properties ] { } map>assoc ; + +: cuda-device-name ( n -- string ) + [ 256 [ ] keep ] dip + [ cuDeviceGetName cuda-error ] + [ 2drop utf8 alien>string ] 3bi ; + +: cuda-device-capability ( n -- pair ) + [ int int ] dip + [ cuDeviceComputeCapability cuda-error ] + [ drop [ *int ] bi@ ] 3bi 2array ; + +: cuda-device-memory ( n -- bytes ) + [ uint ] dip + [ cuDeviceTotalMem cuda-error ] + [ drop *uint ] 2bi ; + +: get-cuda-function* ( module string -- function ) [ CUfunction ] 2dip [ cuModuleGetFunction cuda-error ] 3keep 2drop *void* ; +: get-cuda-function ( string -- function ) + [ cuda-module get ] dip get-cuda-function* ; + +: with-cuda-function ( string quot -- ) + [ + get-cuda-function cuda-function set + ] dip call ; inline + +: launch-function* ( function -- ) cuLaunch cuda-error ; + +: launch-function ( -- ) cuda-function get cuLaunch cuda-error ; + +: launch-function-grid* ( function width height -- ) + cuLaunchGrid cuda-error ; + +: launch-function-grid ( width height -- ) + [ cuda-function get ] 2dip + cuLaunchGrid cuda-error ; + +TUPLE: cuda-memory < disposable ptr length ; + +: ( ptr length -- obj ) + cuda-memory new-disposable + swap >>length + swap >>ptr ; + +: add-cuda-memory ( obj -- obj ) + dup dup ptr>> cuda-memory-hashtable get set-at ; + +: delete-cuda-memory ( obj -- ) + cuda-memory-hashtable delete-at ; + +ERROR: invalid-cuda-memory ptr ; + +: cuda-memory-length ( cuda-memory -- n ) + ptr>> cuda-memory-hashtable get ?at [ + length>> + ] [ + invalid-cuda-memory + ] if ; + +M: cuda-memory byte-length length>> ; + : cuda-malloc ( n -- ptr ) [ CUdeviceptr ] dip - [ cuMemAlloc cuda-error ] 2keep drop *int ; + [ cuMemAlloc cuda-error ] 2keep + [ *int ] dip add-cuda-memory ; -: cuda-free ( ptr -- ) +: cuda-free* ( ptr -- ) cuMemFree cuda-error ; + +M: cuda-memory dispose ( ptr -- ) + ptr>> cuda-free* ; + +: host>device ( dest-ptr src-ptr -- ) + [ ptr>> ] dip dup length cuMemcpyHtoD cuda-error ; + +:: device>host ( ptr -- seq ) + ptr byte-length + [ ptr [ ptr>> ] [ byte-length ] bi cuMemcpyDtoH cuda-error ] keep ; + +: memcpy-device>device ( dest-ptr src-ptr count -- ) + cuMemcpyDtoD cuda-error ; + +: memcpy-device>array ( dest-array dest-index src-ptr count -- ) + cuMemcpyDtoA cuda-error ; + +: memcpy-array>device ( dest-ptr src-array src-index count -- ) + cuMemcpyAtoD cuda-error ; + +: memcpy-array>host ( dest-ptr src-array src-index count -- ) + cuMemcpyAtoH cuda-error ; + +: memcpy-host>array ( dest-array dest-index src-ptr count -- ) + cuMemcpyHtoA cuda-error ; + +: memcpy-array>array ( dest-array dest-index src-array src-ptr count -- ) + cuMemcpyAtoA cuda-error ; + +: cuda-int* ( function offset value -- ) + cuParamSeti cuda-error ; + +: cuda-int ( offset value -- ) + [ cuda-function get ] 2dip cuda-int* ; + +: cuda-float* ( function offset value -- ) + cuParamSetf cuda-error ; + +: cuda-float ( offset value -- ) + [ cuda-function get ] 2dip cuda-float* ; + +: cuda-vector* ( function offset ptr n -- ) + cuParamSetv cuda-error ; + +: cuda-vector ( offset ptr n -- ) + [ cuda-function get ] 3dip cuda-vector* ; + +: param-size* ( function n -- ) + cuParamSetSize cuda-error ; + +: param-size ( n -- ) + [ cuda-function get ] dip param-size* ; + +: malloc-device-string ( string -- n ) + utf8 encode + [ length cuda-malloc ] keep + [ host>device ] [ drop ] 2bi ; + +ERROR: bad-cuda-parameter parameter ; + +:: set-parameters ( seq -- ) + cuda-function get :> function + 0 :> offset! + seq [ + [ offset ] dip + { + { [ dup cuda-memory? ] [ ptr>> cuda-int ] } + { [ dup float? ] [ cuda-float ] } + { [ dup integer? ] [ cuda-int ] } + [ bad-cuda-parameter ] + } cond + offset 4 + offset! + ] each + offset param-size ; + +: cuda-device-attribute ( attribute dev -- n ) + [ int ] 2dip + [ cuDeviceGetAttribute cuda-error ] + [ 2drop *int ] 3bi ; + +: function-block-shape* ( function x y z -- ) + cuFuncSetBlockShape cuda-error ; + +: function-block-shape ( x y z -- ) + [ cuda-function get ] 3dip + cuFuncSetBlockShape cuda-error ; + +: function-shared-size* ( function n -- ) + cuFuncSetSharedSize cuda-error ; + +: function-shared-size ( n -- ) + [ cuda-function get ] dip + cuFuncSetSharedSize cuda-error ; + +: launch ( -- ) + cuda-launcher get { + [ block-shape>> first3 function-block-shape ] + [ shared-size>> function-shared-size ] + [ + grid>> [ + launch-function + ] [ + first2 launch-function-grid + ] if-empty + ] + } cleave ; + +: cuda-device. ( n -- ) + { + [ "Device: " write number>string print ] + [ "Name: " write cuda-device-name print ] + [ "Memory: " write cuda-device-memory number>string print ] + [ + "Capability: " write + cuda-device-capability [ number>string ] map " " join print + ] + [ "Properties: " write cuda-device-properties . ] + [ + "CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: " write + CU_DEVICE_ATTRIBUTE_GPU_OVERLAP swap + cuda-device-attribute number>string print + ] + } cleave ; + +: cuda. ( -- ) + "CUDA Version: " write cuda-version number>string print nl + #cuda-devices iota [ nl ] [ cuda-device. ] interleave ; + + +: test-cuda0 ( -- ) + T{ launcher + { path "vocab:cuda/hello.ptx" } + { block-shape { 6 6 6 } } + { shared-size 2 } + { grid { 2 6 } } + } [ + "helloWorld" [ + "Hello World!" [ - ] map-index + malloc-device-string &dispose + + [ 1array set-parameters ] + [ drop launch ] + [ device>host utf8 alien>string . ] tri + ] with-cuda-function + ] with-cuda ; diff --git a/extra/cuda/ffi/tags.txt b/extra/cuda/ffi/tags.txt index 700f0dc9a5..f74dbeec64 100644 --- a/extra/cuda/ffi/tags.txt +++ b/extra/cuda/ffi/tags.txt @@ -1 +1,2 @@ not tested +bindings diff --git a/extra/cuda/hello.cu b/extra/cuda/hello.cu new file mode 100644 index 0000000000..1f3cd677f9 --- /dev/null +++ b/extra/cuda/hello.cu @@ -0,0 +1,65 @@ +/* + World using CUDA +** +** The string "Hello World!" is mangled then restored using a common CUDA idiom +** +** Byron Galbraith +** 2009-02-18 +*/ +#include +#include + +// Prototypes +extern "C" __global__ void helloWorld(char*); + +// Host function +int +main(int argc, char** argv) +{ + int i; + + // desired output + char str[] = "Hello World!"; + + // mangle contents of output + // the null character is left intact for simplicity + for(i = 0; i < 12; i++) + str[i] -= i; + + // allocate memory on the device + char *d_str; + size_t size = sizeof(str); + cudaMalloc((void**)&d_str, size); + + // copy the string to the device + cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice); + + // set the grid and block sizes + dim3 dimGrid(2); // one block per word + dim3 dimBlock(6); // one thread per character + + // invoke the kernel + helloWorld<<< dimGrid, dimBlock >>>(d_str); + + // retrieve the results from the device + cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost); + + // free up the allocated memory on the device + cudaFree(d_str); + + // everyone's favorite part + printf("%s\n", str); + + return 0; +} + +// Device kernel +__global__ void +helloWorld(char* str) +{ + // determine where in the thread grid we are + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + // unmangle output + str[idx] += idx; +} diff --git a/extra/cuda/hello.ptx b/extra/cuda/hello.ptx new file mode 100644 index 0000000000..049bb5e9a5 --- /dev/null +++ b/extra/cuda/hello.ptx @@ -0,0 +1,71 @@ + .version 1.4 + .target sm_10, map_f64_to_f32 + // compiled with /usr/local/cuda/bin/../open64/lib//be + // nvopencc 3.0 built on 2010-03-11 + + //----------------------------------------------------------- + // Compiling /tmp/tmpxft_00000eab_00000000-7_hello.cpp3.i (/var/folders/KD/KDnx4D80Eh0fsORqNrFWBE+++TI/-Tmp-/ccBI#.AYqbdQ) + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Options: + //----------------------------------------------------------- + // Target:ptx, ISA:sm_10, Endian:little, Pointer Size:32 + // -O3 (Optimization level) + // -g0 (Debug level) + // -m2 (Report advisories) + //----------------------------------------------------------- + + .file 1 "" + .file 2 "/tmp/tmpxft_00000eab_00000000-6_hello.cudafe2.gpu" + .file 3 "/usr/lib/gcc/i686-apple-darwin10/4.2.1/include/stddef.h" + .file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h" + .file 5 "/usr/local/cuda/bin/../include/host_defines.h" + .file 6 "/usr/local/cuda/bin/../include/builtin_types.h" + .file 7 "/usr/local/cuda/bin/../include/device_types.h" + .file 8 "/usr/local/cuda/bin/../include/driver_types.h" + .file 9 "/usr/local/cuda/bin/../include/texture_types.h" + .file 10 "/usr/local/cuda/bin/../include/vector_types.h" + .file 11 "/usr/local/cuda/bin/../include/device_launch_parameters.h" + .file 12 "/usr/local/cuda/bin/../include/crt/storage_class.h" + .file 13 "/usr/include/i386/_types.h" + .file 14 "/usr/include/time.h" + .file 15 "/usr/local/cuda/bin/../include/texture_fetch_functions.h" + .file 16 "/usr/local/cuda/bin/../include/common_functions.h" + .file 17 "/usr/local/cuda/bin/../include/crt/func_macro.h" + .file 18 "/usr/local/cuda/bin/../include/math_functions.h" + .file 19 "/usr/local/cuda/bin/../include/device_functions.h" + .file 20 "/usr/local/cuda/bin/../include/math_constants.h" + .file 21 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h" + .file 22 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h" + .file 23 "/usr/local/cuda/bin/../include/sm_13_double_functions.h" + .file 24 "/usr/local/cuda/bin/../include/common_types.h" + .file 25 "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h" + .file 26 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h" + .file 27 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h" + .file 28 "hello.cu" + + + .entry helloWorld ( + .param .u32 __cudaparm_helloWorld_str) + { + .reg .u16 %rh<4>; + .reg .u32 %r<9>; + .loc 28 58 0 +$LBB1_helloWorld: + .loc 28 64 0 + mov.u16 %rh1, %ctaid.x; + mov.u16 %rh2, %ntid.x; + mul.wide.u16 %r1, %rh1, %rh2; + cvt.u32.u16 %r2, %tid.x; + add.u32 %r3, %r2, %r1; + ld.param.u32 %r4, [__cudaparm_helloWorld_str]; + add.u32 %r5, %r4, %r3; + ld.global.s8 %r6, [%r5+0]; + add.s32 %r7, %r6, %r3; + st.global.s8 [%r5+0], %r7; + .loc 28 65 0 + exit; +$LDWend_helloWorld: + } // helloWorld +