Working on a CUDA api. Add a hello world program.
parent
b7164ee4c1
commit
c392ff2718
|
@ -1,17 +1,24 @@
|
|||
! Copyright (C) 2010 Doug Coleman.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: alien.c-types alien.data assocs classes.struct
|
||||
combinators continuations cuda.ffi fry io.backend kernel
|
||||
sequences ;
|
||||
USING: accessors alien alien.c-types alien.data alien.parser
|
||||
alien.strings arrays assocs byte-arrays classes.struct
|
||||
combinators continuations cuda.ffi destructors fry io
|
||||
io.backend io.encodings.string io.encodings.utf8 kernel lexer
|
||||
locals math math.parser namespaces opengl.gl.extensions
|
||||
prettyprint quotations sequences ;
|
||||
IN: cuda
|
||||
|
||||
SYMBOL: cuda-device
|
||||
SYMBOL: cuda-context
|
||||
SYMBOL: cuda-module
|
||||
SYMBOL: cuda-function
|
||||
SYMBOL: cuda-launcher
|
||||
SYMBOL: cuda-memory-hashtable
|
||||
|
||||
ERROR: throw-cuda-error n ;
|
||||
|
||||
: cuda-error ( n -- )
|
||||
{
|
||||
{ CUDA_SUCCESS [ ] }
|
||||
[ throw-cuda-error ]
|
||||
} case ;
|
||||
dup CUDA_SUCCESS = [ drop ] [ throw-cuda-error ] if ;
|
||||
|
||||
: cuda-version ( -- n )
|
||||
int <c-object> [ cuDriverGetVersion cuda-error ] keep *int ;
|
||||
|
@ -19,32 +26,10 @@ ERROR: throw-cuda-error n ;
|
|||
: init-cuda ( -- )
|
||||
0 cuInit cuda-error ;
|
||||
|
||||
: with-cuda ( quot -- )
|
||||
init-cuda [ ] [ ] cleanup ; inline
|
||||
|
||||
<PRIVATE
|
||||
|
||||
: #cuda-devices ( -- n )
|
||||
int <c-object> [ cuDeviceGetCount cuda-error ] keep *int ;
|
||||
|
||||
: n>cuda-device ( n -- device )
|
||||
[ CUdevice <c-object> ] dip [ cuDeviceGet cuda-error ] 2keep drop *int ;
|
||||
|
||||
: enumerate-cuda-devices ( -- devices )
|
||||
#cuda-devices iota [ n>cuda-device ] map ;
|
||||
|
||||
: cuda-device>properties ( device -- properties )
|
||||
[ CUdevprop <c-object> ] dip
|
||||
[ cuDeviceGetProperties cuda-error ] 2keep drop
|
||||
CUdevprop memory>struct ;
|
||||
|
||||
: cuda-device-properties ( -- properties )
|
||||
enumerate-cuda-devices [ cuda-device>properties ] map ;
|
||||
|
||||
PRIVATE>
|
||||
|
||||
: cuda-devices ( -- assoc )
|
||||
enumerate-cuda-devices [ dup cuda-device>properties ] { } map>assoc ;
|
||||
TUPLE: launcher
|
||||
{ device integer initial: 0 }
|
||||
{ device-flags initial: 0 }
|
||||
path block-shape shared-size grid ;
|
||||
|
||||
: with-cuda-context ( flags device quot -- )
|
||||
[
|
||||
|
@ -65,13 +50,259 @@ PRIVATE>
|
|||
[ drop '[ _ cuModuleUnload cuda-error ] ] 2bi
|
||||
[ ] cleanup ; inline
|
||||
|
||||
: get-cuda-function ( module string -- function )
|
||||
: with-cuda-program ( flags device path quot -- )
|
||||
[ dup cuda-device set ] 2dip
|
||||
'[
|
||||
cuda-context set
|
||||
_ [
|
||||
cuda-module set
|
||||
_ call
|
||||
] with-cuda-module
|
||||
] with-cuda-context ; inline
|
||||
|
||||
: with-cuda ( launcher quot -- )
|
||||
[
|
||||
init-cuda
|
||||
H{ } clone cuda-memory-hashtable
|
||||
] 2dip '[
|
||||
_
|
||||
[ cuda-launcher set ]
|
||||
[ [ device>> ] [ device-flags>> ] [ path>> ] tri ] bi
|
||||
_ with-cuda-program
|
||||
] with-variable ; inline
|
||||
|
||||
<PRIVATE
|
||||
|
||||
: #cuda-devices ( -- n )
|
||||
int <c-object> [ cuDeviceGetCount cuda-error ] keep *int ;
|
||||
|
||||
: n>cuda-device ( n -- device )
|
||||
[ CUdevice <c-object> ] dip [ cuDeviceGet cuda-error ] 2keep drop *int ;
|
||||
|
||||
: enumerate-cuda-devices ( -- devices )
|
||||
#cuda-devices iota [ n>cuda-device ] map ;
|
||||
|
||||
: cuda-device-properties ( device -- properties )
|
||||
[ CUdevprop <c-object> ] dip
|
||||
[ cuDeviceGetProperties cuda-error ] 2keep drop
|
||||
CUdevprop memory>struct ;
|
||||
|
||||
PRIVATE>
|
||||
|
||||
: cuda-devices ( -- assoc )
|
||||
enumerate-cuda-devices [ dup cuda-device-properties ] { } map>assoc ;
|
||||
|
||||
: cuda-device-name ( n -- string )
|
||||
[ 256 [ <byte-array> ] keep ] dip
|
||||
[ cuDeviceGetName cuda-error ]
|
||||
[ 2drop utf8 alien>string ] 3bi ;
|
||||
|
||||
: cuda-device-capability ( n -- pair )
|
||||
[ int <c-object> int <c-object> ] dip
|
||||
[ cuDeviceComputeCapability cuda-error ]
|
||||
[ drop [ *int ] bi@ ] 3bi 2array ;
|
||||
|
||||
: cuda-device-memory ( n -- bytes )
|
||||
[ uint <c-object> ] dip
|
||||
[ cuDeviceTotalMem cuda-error ]
|
||||
[ drop *uint ] 2bi ;
|
||||
|
||||
: get-cuda-function* ( module string -- function )
|
||||
[ CUfunction <c-object> ] 2dip
|
||||
[ cuModuleGetFunction cuda-error ] 3keep 2drop *void* ;
|
||||
|
||||
: get-cuda-function ( string -- function )
|
||||
[ cuda-module get ] dip get-cuda-function* ;
|
||||
|
||||
: with-cuda-function ( string quot -- )
|
||||
[
|
||||
get-cuda-function cuda-function set
|
||||
] dip call ; inline
|
||||
|
||||
: launch-function* ( function -- ) cuLaunch cuda-error ;
|
||||
|
||||
: launch-function ( -- ) cuda-function get cuLaunch cuda-error ;
|
||||
|
||||
: launch-function-grid* ( function width height -- )
|
||||
cuLaunchGrid cuda-error ;
|
||||
|
||||
: launch-function-grid ( width height -- )
|
||||
[ cuda-function get ] 2dip
|
||||
cuLaunchGrid cuda-error ;
|
||||
|
||||
TUPLE: cuda-memory < disposable ptr length ;
|
||||
|
||||
: <cuda-memory> ( ptr length -- obj )
|
||||
cuda-memory new-disposable
|
||||
swap >>length
|
||||
swap >>ptr ;
|
||||
|
||||
: add-cuda-memory ( obj -- obj )
|
||||
dup dup ptr>> cuda-memory-hashtable get set-at ;
|
||||
|
||||
: delete-cuda-memory ( obj -- )
|
||||
cuda-memory-hashtable delete-at ;
|
||||
|
||||
ERROR: invalid-cuda-memory ptr ;
|
||||
|
||||
: cuda-memory-length ( cuda-memory -- n )
|
||||
ptr>> cuda-memory-hashtable get ?at [
|
||||
length>>
|
||||
] [
|
||||
invalid-cuda-memory
|
||||
] if ;
|
||||
|
||||
M: cuda-memory byte-length length>> ;
|
||||
|
||||
: cuda-malloc ( n -- ptr )
|
||||
[ CUdeviceptr <c-object> ] dip
|
||||
[ cuMemAlloc cuda-error ] 2keep drop *int ;
|
||||
[ cuMemAlloc cuda-error ] 2keep
|
||||
[ *int ] dip <cuda-memory> add-cuda-memory ;
|
||||
|
||||
: cuda-free ( ptr -- )
|
||||
: cuda-free* ( ptr -- )
|
||||
cuMemFree cuda-error ;
|
||||
|
||||
M: cuda-memory dispose ( ptr -- )
|
||||
ptr>> cuda-free* ;
|
||||
|
||||
: host>device ( dest-ptr src-ptr -- )
|
||||
[ ptr>> ] dip dup length cuMemcpyHtoD cuda-error ;
|
||||
|
||||
:: device>host ( ptr -- seq )
|
||||
ptr byte-length <byte-array>
|
||||
[ ptr [ ptr>> ] [ byte-length ] bi cuMemcpyDtoH cuda-error ] keep ;
|
||||
|
||||
: memcpy-device>device ( dest-ptr src-ptr count -- )
|
||||
cuMemcpyDtoD cuda-error ;
|
||||
|
||||
: memcpy-device>array ( dest-array dest-index src-ptr count -- )
|
||||
cuMemcpyDtoA cuda-error ;
|
||||
|
||||
: memcpy-array>device ( dest-ptr src-array src-index count -- )
|
||||
cuMemcpyAtoD cuda-error ;
|
||||
|
||||
: memcpy-array>host ( dest-ptr src-array src-index count -- )
|
||||
cuMemcpyAtoH cuda-error ;
|
||||
|
||||
: memcpy-host>array ( dest-array dest-index src-ptr count -- )
|
||||
cuMemcpyHtoA cuda-error ;
|
||||
|
||||
: memcpy-array>array ( dest-array dest-index src-array src-ptr count -- )
|
||||
cuMemcpyAtoA cuda-error ;
|
||||
|
||||
: cuda-int* ( function offset value -- )
|
||||
cuParamSeti cuda-error ;
|
||||
|
||||
: cuda-int ( offset value -- )
|
||||
[ cuda-function get ] 2dip cuda-int* ;
|
||||
|
||||
: cuda-float* ( function offset value -- )
|
||||
cuParamSetf cuda-error ;
|
||||
|
||||
: cuda-float ( offset value -- )
|
||||
[ cuda-function get ] 2dip cuda-float* ;
|
||||
|
||||
: cuda-vector* ( function offset ptr n -- )
|
||||
cuParamSetv cuda-error ;
|
||||
|
||||
: cuda-vector ( offset ptr n -- )
|
||||
[ cuda-function get ] 3dip cuda-vector* ;
|
||||
|
||||
: param-size* ( function n -- )
|
||||
cuParamSetSize cuda-error ;
|
||||
|
||||
: param-size ( n -- )
|
||||
[ cuda-function get ] dip param-size* ;
|
||||
|
||||
: malloc-device-string ( string -- n )
|
||||
utf8 encode
|
||||
[ length cuda-malloc ] keep
|
||||
[ host>device ] [ drop ] 2bi ;
|
||||
|
||||
ERROR: bad-cuda-parameter parameter ;
|
||||
|
||||
:: set-parameters ( seq -- )
|
||||
cuda-function get :> function
|
||||
0 :> offset!
|
||||
seq [
|
||||
[ offset ] dip
|
||||
{
|
||||
{ [ dup cuda-memory? ] [ ptr>> cuda-int ] }
|
||||
{ [ dup float? ] [ cuda-float ] }
|
||||
{ [ dup integer? ] [ cuda-int ] }
|
||||
[ bad-cuda-parameter ]
|
||||
} cond
|
||||
offset 4 + offset!
|
||||
] each
|
||||
offset param-size ;
|
||||
|
||||
: cuda-device-attribute ( attribute dev -- n )
|
||||
[ int <c-object> ] 2dip
|
||||
[ cuDeviceGetAttribute cuda-error ]
|
||||
[ 2drop *int ] 3bi ;
|
||||
|
||||
: function-block-shape* ( function x y z -- )
|
||||
cuFuncSetBlockShape cuda-error ;
|
||||
|
||||
: function-block-shape ( x y z -- )
|
||||
[ cuda-function get ] 3dip
|
||||
cuFuncSetBlockShape cuda-error ;
|
||||
|
||||
: function-shared-size* ( function n -- )
|
||||
cuFuncSetSharedSize cuda-error ;
|
||||
|
||||
: function-shared-size ( n -- )
|
||||
[ cuda-function get ] dip
|
||||
cuFuncSetSharedSize cuda-error ;
|
||||
|
||||
: launch ( -- )
|
||||
cuda-launcher get {
|
||||
[ block-shape>> first3 function-block-shape ]
|
||||
[ shared-size>> function-shared-size ]
|
||||
[
|
||||
grid>> [
|
||||
launch-function
|
||||
] [
|
||||
first2 launch-function-grid
|
||||
] if-empty
|
||||
]
|
||||
} cleave ;
|
||||
|
||||
: cuda-device. ( n -- )
|
||||
{
|
||||
[ "Device: " write number>string print ]
|
||||
[ "Name: " write cuda-device-name print ]
|
||||
[ "Memory: " write cuda-device-memory number>string print ]
|
||||
[
|
||||
"Capability: " write
|
||||
cuda-device-capability [ number>string ] map " " join print
|
||||
]
|
||||
[ "Properties: " write cuda-device-properties . ]
|
||||
[
|
||||
"CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: " write
|
||||
CU_DEVICE_ATTRIBUTE_GPU_OVERLAP swap
|
||||
cuda-device-attribute number>string print
|
||||
]
|
||||
} cleave ;
|
||||
|
||||
: cuda. ( -- )
|
||||
"CUDA Version: " write cuda-version number>string print nl
|
||||
#cuda-devices iota [ nl ] [ cuda-device. ] interleave ;
|
||||
|
||||
|
||||
: test-cuda0 ( -- )
|
||||
T{ launcher
|
||||
{ path "vocab:cuda/hello.ptx" }
|
||||
{ block-shape { 6 6 6 } }
|
||||
{ shared-size 2 }
|
||||
{ grid { 2 6 } }
|
||||
} [
|
||||
"helloWorld" [
|
||||
"Hello World!" [ - ] map-index
|
||||
malloc-device-string &dispose
|
||||
|
||||
[ 1array set-parameters ]
|
||||
[ drop launch ]
|
||||
[ device>host utf8 alien>string . ] tri
|
||||
] with-cuda-function
|
||||
] with-cuda ;
|
||||
|
|
|
@ -1 +1,2 @@
|
|||
not tested
|
||||
bindings
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
/*
|
||||
World using CUDA
|
||||
**
|
||||
** The string "Hello World!" is mangled then restored using a common CUDA idiom
|
||||
**
|
||||
** Byron Galbraith
|
||||
** 2009-02-18
|
||||
*/
|
||||
#include <cuda.h>
|
||||
#include <stdio.h>
|
||||
|
||||
// Prototypes
|
||||
extern "C" __global__ void helloWorld(char*);
|
||||
|
||||
// Host function
|
||||
int
|
||||
main(int argc, char** argv)
|
||||
{
|
||||
int i;
|
||||
|
||||
// desired output
|
||||
char str[] = "Hello World!";
|
||||
|
||||
// mangle contents of output
|
||||
// the null character is left intact for simplicity
|
||||
for(i = 0; i < 12; i++)
|
||||
str[i] -= i;
|
||||
|
||||
// allocate memory on the device
|
||||
char *d_str;
|
||||
size_t size = sizeof(str);
|
||||
cudaMalloc((void**)&d_str, size);
|
||||
|
||||
// copy the string to the device
|
||||
cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice);
|
||||
|
||||
// set the grid and block sizes
|
||||
dim3 dimGrid(2); // one block per word
|
||||
dim3 dimBlock(6); // one thread per character
|
||||
|
||||
// invoke the kernel
|
||||
helloWorld<<< dimGrid, dimBlock >>>(d_str);
|
||||
|
||||
// retrieve the results from the device
|
||||
cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost);
|
||||
|
||||
// free up the allocated memory on the device
|
||||
cudaFree(d_str);
|
||||
|
||||
// everyone's favorite part
|
||||
printf("%s\n", str);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Device kernel
|
||||
__global__ void
|
||||
helloWorld(char* str)
|
||||
{
|
||||
// determine where in the thread grid we are
|
||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
// unmangle output
|
||||
str[idx] += idx;
|
||||
}
|
|
@ -0,0 +1,71 @@
|
|||
.version 1.4
|
||||
.target sm_10, map_f64_to_f32
|
||||
// compiled with /usr/local/cuda/bin/../open64/lib//be
|
||||
// nvopencc 3.0 built on 2010-03-11
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_00000eab_00000000-7_hello.cpp3.i (/var/folders/KD/KDnx4D80Eh0fsORqNrFWBE+++TI/-Tmp-/ccBI#.AYqbdQ)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_10, Endian:little, Pointer Size:32
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_00000eab_00000000-6_hello.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/i686-apple-darwin10/4.2.1/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/bin/../include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/bin/../include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/bin/../include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/bin/../include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/bin/../include/texture_types.h"
|
||||
.file 10 "/usr/local/cuda/bin/../include/vector_types.h"
|
||||
.file 11 "/usr/local/cuda/bin/../include/device_launch_parameters.h"
|
||||
.file 12 "/usr/local/cuda/bin/../include/crt/storage_class.h"
|
||||
.file 13 "/usr/include/i386/_types.h"
|
||||
.file 14 "/usr/include/time.h"
|
||||
.file 15 "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
|
||||
.file 16 "/usr/local/cuda/bin/../include/common_functions.h"
|
||||
.file 17 "/usr/local/cuda/bin/../include/crt/func_macro.h"
|
||||
.file 18 "/usr/local/cuda/bin/../include/math_functions.h"
|
||||
.file 19 "/usr/local/cuda/bin/../include/device_functions.h"
|
||||
.file 20 "/usr/local/cuda/bin/../include/math_constants.h"
|
||||
.file 21 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
|
||||
.file 22 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/bin/../include/common_types.h"
|
||||
.file 25 "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
|
||||
.file 26 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
|
||||
.file 27 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
|
||||
.file 28 "hello.cu"
|
||||
|
||||
|
||||
.entry helloWorld (
|
||||
.param .u32 __cudaparm_helloWorld_str)
|
||||
{
|
||||
.reg .u16 %rh<4>;
|
||||
.reg .u32 %r<9>;
|
||||
.loc 28 58 0
|
||||
$LBB1_helloWorld:
|
||||
.loc 28 64 0
|
||||
mov.u16 %rh1, %ctaid.x;
|
||||
mov.u16 %rh2, %ntid.x;
|
||||
mul.wide.u16 %r1, %rh1, %rh2;
|
||||
cvt.u32.u16 %r2, %tid.x;
|
||||
add.u32 %r3, %r2, %r1;
|
||||
ld.param.u32 %r4, [__cudaparm_helloWorld_str];
|
||||
add.u32 %r5, %r4, %r3;
|
||||
ld.global.s8 %r6, [%r5+0];
|
||||
add.s32 %r7, %r6, %r3;
|
||||
st.global.s8 [%r5+0], %r7;
|
||||
.loc 28 65 0
|
||||
exit;
|
||||
$LDWend_helloWorld:
|
||||
} // helloWorld
|
||||
|
Loading…
Reference in New Issue