From ea2bcd69c71882d545fb57548ff0cda4ac009ce9 Mon Sep 17 00:00:00 2001 From: Slava Pestov Date: Sun, 20 Sep 2009 02:08:32 -0500 Subject: [PATCH] math.vectors.simd: redesign to be more flexible, integer SIMD work in progress --- basis/alien/c-types/c-types.factor | 8 + basis/cpu/architecture/architecture.factor | 34 +++- basis/cpu/x86/features/features.factor | 49 ++--- basis/cpu/x86/x86.factor | 115 ++++++++++- basis/math/floats/env/x86/x86.factor | 4 +- .../math/vectors/simd/functor/functor.factor | 134 +++++++++++-- .../simd/intrinsics/intrinsics-tests.factor | 18 ++ .../vectors/simd/intrinsics/intrinsics.factor | 54 +++++- basis/math/vectors/simd/simd-docs.factor | 84 -------- basis/math/vectors/simd/simd.factor | 180 +----------------- basis/math/vectors/simd/summary.txt | 1 + 11 files changed, 363 insertions(+), 318 deletions(-) create mode 100644 basis/math/vectors/simd/intrinsics/intrinsics-tests.factor create mode 100644 basis/math/vectors/simd/summary.txt diff --git a/basis/alien/c-types/c-types.factor b/basis/alien/c-types/c-types.factor index fa27e29c04..afbb664fed 100755 --- a/basis/alien/c-types/c-types.factor +++ b/basis/alien/c-types/c-types.factor @@ -472,3 +472,11 @@ SYMBOLS: \ ulong \ size_t typedef ] with-compilation-unit +M: char-16-rep rep-component-type drop char ; +M: uchar-16-rep rep-component-type drop uchar ; +M: short-8-rep rep-component-type drop short ; +M: ushort-8-rep rep-component-type drop ushort ; +M: int-4-rep rep-component-type drop int ; +M: uint-4-rep rep-component-type drop uint ; +M: float-4-rep rep-component-type drop float ; +M: double-2-rep rep-component-type drop double ; diff --git a/basis/cpu/architecture/architecture.factor b/basis/cpu/architecture/architecture.factor index d6611c3384..61e4e2df37 100644 --- a/basis/cpu/architecture/architecture.factor +++ b/basis/cpu/architecture/architecture.factor @@ -22,8 +22,6 @@ SINGLETONS: float-rep double-rep ; ! On x86, floating point registers are really vector registers SINGLETONS: -float-4-rep -double-2-rep char-16-rep uchar-16-rep short-8-rep @@ -31,9 +29,11 @@ ushort-8-rep int-4-rep uint-4-rep ; -UNION: vector-rep +SINGLETONS: float-4-rep -double-2-rep +double-2-rep ; + +UNION: int-vector-rep char-16-rep uchar-16-rep short-8-rep @@ -41,6 +41,14 @@ ushort-8-rep int-4-rep uint-4-rep ; +UNION: float-vector-rep +float-4-rep +double-2-rep ; + +UNION: vector-rep +int-vector-rep +float-vector-rep ; + UNION: representation any-rep tagged-rep @@ -76,10 +84,15 @@ M: double-rep rep-size drop 8 ; M: stack-params rep-size drop cell ; M: vector-rep rep-size drop 16 ; +GENERIC: rep-component-type ( rep -- n ) + +! Methods defined in alien.c-types + GENERIC: scalar-rep-of ( rep -- rep' ) M: float-4-rep scalar-rep-of drop float-rep ; M: double-2-rep scalar-rep-of drop double-rep ; +M: int-vector-rep scalar-rep-of drop int-rep ; ! Mapping from register class to machine registers HOOK: machine-registers cpu ( -- assoc ) @@ -167,7 +180,6 @@ HOOK: %unbox-vector cpu ( dst src rep -- ) HOOK: %broadcast-vector cpu ( dst src rep -- ) HOOK: %gather-vector-2 cpu ( dst src1 src2 rep -- ) HOOK: %gather-vector-4 cpu ( dst src1 src2 src3 src4 rep -- ) - HOOK: %add-vector cpu ( dst src1 src2 rep -- ) HOOK: %sub-vector cpu ( dst src1 src2 rep -- ) HOOK: %mul-vector cpu ( dst src1 src2 rep -- ) @@ -177,6 +189,18 @@ HOOK: %max-vector cpu ( dst src1 src2 rep -- ) HOOK: %sqrt-vector cpu ( dst src rep -- ) HOOK: %horizontal-add-vector cpu ( dst src rep -- ) +HOOK: %broadcast-vector-reps cpu ( -- reps ) +HOOK: %gather-vector-2-reps cpu ( -- reps ) +HOOK: %gather-vector-4-reps cpu ( -- reps ) +HOOK: %add-vector-reps cpu ( -- reps ) +HOOK: %sub-vector-reps cpu ( -- reps ) +HOOK: %mul-vector-reps cpu ( -- reps ) +HOOK: %div-vector-reps cpu ( -- reps ) +HOOK: %min-vector-reps cpu ( -- reps ) +HOOK: %max-vector-reps cpu ( -- reps ) +HOOK: %sqrt-vector-reps cpu ( -- reps ) +HOOK: %horizontal-add-vector-reps cpu ( -- reps ) + HOOK: %unbox-alien cpu ( dst src -- ) HOOK: %unbox-any-c-ptr cpu ( dst src temp -- ) HOOK: %box-alien cpu ( dst src temp -- ) diff --git a/basis/cpu/x86/features/features.factor b/basis/cpu/x86/features/features.factor index c5cf2d470a..5fad4e802c 100644 --- a/basis/cpu/x86/features/features.factor +++ b/basis/cpu/x86/features/features.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2009 Slava Pestov. ! See http://factorcode.org/license.txt for BSD license. -USING: system kernel math math.order math.parser namespaces -alien.c-types alien.syntax combinators locals init io cpu.x86 +USING: system kernel memoize math math.order math.parser +namespaces alien.c-types alien.syntax combinators locals init io compiler compiler.units accessors ; IN: cpu.x86.features @@ -13,7 +13,16 @@ FUNCTION: longlong read_timestamp_counter ( ) ; PRIVATE> -ALIAS: sse-version sse_version +: sse-version ( -- n ) + sse_version + "sse-version" get string>number [ min ] when* ; foldable + +: sse? ( -- ? ) sse-version 10 >= ; foldable +: sse2? ( -- ? ) sse-version 20 >= ; foldable +: sse3? ( -- ? ) sse-version 30 >= ; foldable +: ssse3? ( -- ? ) sse-version 33 >= ; foldable +: sse4.1? ( -- ? ) sse-version 41 >= ; foldable +: sse4.2? ( -- ? ) sse-version 42 >= ; foldable : sse-string ( version -- string ) { @@ -32,37 +41,3 @@ M: x86 instruction-count read_timestamp_counter ; : count-instructions ( quot -- n ) instruction-count [ call ] dip instruction-count swap - ; inline - -USING: cpu.x86.features cpu.x86.features.private ; - -:: install-sse-check ( version -- ) - [ - sse-version version < [ - "This image was built to use " write - version sse-string write - " but your CPU only supports " write - sse-version sse-string write "." print - "You will need to bootstrap Factor again." print - flush - 1 exit - ] when - ] "cpu.x86" add-init-hook ; - -: enable-sse ( version -- ) - { - { 00 [ ] } - { 10 [ ] } - { 20 [ enable-sse2 ] } - { 30 [ enable-sse3 ] } - { 33 [ enable-sse3 ] } - { 41 [ enable-sse3 ] } - { 42 [ enable-sse3 ] } - } case ; - -[ { sse_version } compile ] with-optimizer - -"Checking for multimedia extensions: " write sse-version -"sse-version" get [ string>number min ] when* -[ sse-string write " detected" print ] -[ install-sse-check ] -[ enable-sse ] tri diff --git a/basis/cpu/x86/x86.factor b/basis/cpu/x86/x86.factor index 04b5308836..4d80862ed3 100644 --- a/basis/cpu/x86/x86.factor +++ b/basis/cpu/x86/x86.factor @@ -4,7 +4,8 @@ USING: accessors assocs alien alien.c-types arrays strings cpu.x86.assembler cpu.x86.assembler.private cpu.x86.assembler.operands cpu.architecture kernel kernel.private math memory namespaces make sequences words system layouts combinators math.order fry locals -compiler.constants byte-arrays +compiler.constants byte-arrays io macros quotations cpu.x86.features +cpu.x86.features.private compiler compiler.units init compiler.cfg.registers compiler.cfg.instructions compiler.cfg.intrinsics @@ -250,12 +251,26 @@ M:: x86 %unbox-vector ( dst src rep -- ) dst src byte-array-offset [+] rep copy-register ; +MACRO: available-reps ( alist -- ) + ! Each SSE version adds new representations and supports + ! all old ones + unzip { } [ append ] accumulate rest swap suffix + [ [ 1quotation ] map ] bi@ zip + reverse [ { } ] suffix + '[ _ cond ] ; + M: x86 %broadcast-vector ( dst src rep -- ) { { float-4-rep [ [ MOVSS ] [ drop dup 0 SHUFPS ] 2bi ] } { double-2-rep [ [ MOVSD ] [ drop dup UNPCKLPD ] 2bi ] } } case ; +M: x86 %broadcast-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep } } + } available-reps ; + M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- ) rep { { @@ -269,6 +284,11 @@ M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- ) } } case ; +M: x86 %gather-vector-4-reps + { + { sse? { float-4-rep } } + } available-reps ; + M:: x86 %gather-vector-2 ( dst src1 src2 rep -- ) rep { { @@ -280,6 +300,11 @@ M:: x86 %gather-vector-2 ( dst src1 src2 rep -- ) } } case ; +M: x86 %gather-vector-2-reps + { + { sse2? { double-2-rep } } + } available-reps ; + M: x86 %add-vector ( dst src1 src2 rep -- ) { { float-4-rep [ ADDPS ] } @@ -292,6 +317,12 @@ M: x86 %add-vector ( dst src1 src2 rep -- ) { uint-4-rep [ PADDD ] } } case drop ; +M: x86 %add-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } } + } available-reps ; + M: x86 %sub-vector ( dst src1 src2 rep -- ) { { float-4-rep [ SUBPS ] } @@ -304,43 +335,92 @@ M: x86 %sub-vector ( dst src1 src2 rep -- ) { uint-4-rep [ PSUBD ] } } case drop ; +M: x86 %sub-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } } + } available-reps ; + M: x86 %mul-vector ( dst src1 src2 rep -- ) { { float-4-rep [ MULPS ] } { double-2-rep [ MULPD ] } - { int-4-rep [ PMULLW ] } + { short-8-rep [ PMULLW ] } + { ushort-8-rep [ PMULLW ] } + { int-4-rep [ PMULLD ] } + { uint-4-rep [ PMULLD ] } } case drop ; +M: x86 %mul-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } } + } available-reps ; + M: x86 %div-vector ( dst src1 src2 rep -- ) { { float-4-rep [ DIVPS ] } { double-2-rep [ DIVPD ] } } case drop ; +M: x86 %div-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep } } + } available-reps ; + M: x86 %min-vector ( dst src1 src2 rep -- ) { { float-4-rep [ MINPS ] } { double-2-rep [ MINPD ] } + { uchar-16-rep [ PMINUB ] } + { short-8-rep [ PMINSW ] } } case drop ; +M: x86 %min-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep short-8-rep uchar-16-rep } } + } available-reps ; + M: x86 %max-vector ( dst src1 src2 rep -- ) { { float-4-rep [ MAXPS ] } { double-2-rep [ MAXPD ] } + { uchar-16-rep [ PMAXUB ] } + { short-8-rep [ PMAXSW ] } } case drop ; +M: x86 %max-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep short-8-rep uchar-16-rep } } + } available-reps ; + M: x86 %sqrt-vector ( dst src rep -- ) { { float-4-rep [ SQRTPS ] } { double-2-rep [ SQRTPD ] } } case ; +M: x86 %sqrt-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep } } + } available-reps ; + M: x86 %horizontal-add-vector ( dst src rep -- ) { { float-4-rep [ [ MOVAPS ] [ HADDPS ] [ HADDPS ] 2tri ] } { double-2-rep [ [ MOVAPD ] [ HADDPD ] 2bi ] } } case ; +M: x86 %horizontal-add-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep short-8-rep uchar-16-rep } } + } available-reps ; + M: x86 %unbox-alien ( dst src -- ) alien-offset [+] MOV ; @@ -775,3 +855,34 @@ M: x86 small-enough? ( n -- ? ) enable-sse3-simd ; enable-min/max + +:: install-sse-check ( version -- ) + [ + sse-version version < [ + "This image was built to use " write + version sse-string write + " but your CPU only supports " write + sse-version sse-string write "." print + "You will need to bootstrap Factor again." print + flush + 1 exit + ] when + ] "cpu.x86" add-init-hook ; + +: enable-sse ( version -- ) + { + { 00 [ ] } + { 10 [ ] } + { 20 [ enable-sse2 ] } + { 30 [ enable-sse3 ] } + { 33 [ enable-sse3 ] } + { 41 [ enable-sse3 ] } + { 42 [ enable-sse3 ] } + } case ; + +[ { sse_version } compile ] with-optimizer + +"Checking for multimedia extensions: " write sse-version 30 min +[ sse-string write " detected" print ] +[ install-sse-check ] +[ enable-sse ] tri diff --git a/basis/math/floats/env/x86/x86.factor b/basis/math/floats/env/x86/x86.factor index e91fc4eda9..e9120567aa 100644 --- a/basis/math/floats/env/x86/x86.factor +++ b/basis/math/floats/env/x86/x86.factor @@ -31,9 +31,7 @@ M: x87-env (set-fp-env-register) set_x87_env ; M: x86 (fp-env-registers) - sse-version 20 >= - [ 2array ] - [ 1array ] if ; + sse2? [ 2array ] [ 1array ] if ; CONSTANT: sse-exception-flag-bits HEX: 3f CONSTANT: sse-exception-flag>bit diff --git a/basis/math/vectors/simd/functor/functor.factor b/basis/math/vectors/simd/functor/functor.factor index 641585a5d7..a97dc192be 100644 --- a/basis/math/vectors/simd/functor/functor.factor +++ b/basis/math/vectors/simd/functor/functor.factor @@ -1,24 +1,94 @@ ! Copyright (C) 2009 Slava Pestov. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors alien.c-types byte-arrays classes functors -kernel math parser prettyprint.custom sequences -sequences.private literals ; +USING: accessors alien.c-types assocs byte-arrays classes +effects fry functors generalizations kernel literals locals +math math.functions math.vectors math.vectors.simd.intrinsics +math.vectors.specialization parser prettyprint.custom sequences +sequences.private strings words definitions macros cpu.architecture ; IN: math.vectors.simd.functor ERROR: bad-length got expected ; +MACRO: simd-boa ( rep class -- simd-array ) + [ rep-components ] [ new ] bi* '[ _ _ nsequence ] ; + +:: define-boa-custom-inlining ( word rep class -- ) + word [ + drop + rep rep rep-gather-word supported-simd-op? [ + [ rep (simd-boa) class boa ] + ] [ word def>> ] if + ] "custom-inlining" set-word-prop ; + +: simd-with ( rep class x -- simd-array ) + [ rep-components ] [ new ] [ '[ _ ] ] tri* swap replicate-as ; inline + +:: define-with-custom-inlining ( word rep class -- ) + word [ + drop + rep \ (simd-broadcast) supported-simd-op? [ + [ rep rep-coerce rep (simd-broadcast) class boa ] + ] [ word def>> ] if + ] "custom-inlining" set-word-prop ; + +: boa-effect ( rep n -- effect ) + [ rep-components ] dip * + [ CHAR: a + 1string ] map + { "simd-vector" } ; + +: supported-simd-ops ( assoc rep -- assoc' ) + [ + { + { v+ (simd-v+) } + { v- (simd-v-) } + { v* (simd-v*) } + { v/ (simd-v/) } + { vmin (simd-vmin) } + { vmax (simd-vmax) } + { sum (simd-sum) } + } + ] dip + '[ nip _ swap supported-simd-op? ] assoc-filter + '[ drop _ key? ] assoc-filter ; + +:: high-level-ops ( ctor -- assoc ) + ! Some SIMD operations are defined in terms of others. + { + { vneg [ [ dup v- ] keep v- ] } + { v. [ v* sum ] } + { n+v [ [ ctor execute ] dip v+ ] } + { v+n [ ctor execute v+ ] } + { n-v [ [ ctor execute ] dip v- ] } + { v-n [ ctor execute v- ] } + { n*v [ [ ctor execute ] dip v* ] } + { v*n [ ctor execute v* ] } + { n/v [ [ ctor execute ] dip v/ ] } + { v/n [ ctor execute v/ ] } + { norm-sq [ dup v. assert-positive ] } + { norm [ norm-sq sqrt ] } + { normalize [ dup norm v/n ] } + { distance [ v- norm ] } + } ; + +:: simd-vector-words ( class ctor rep assoc -- ) + class + rep rep-component-type c-type-boxed-class + assoc rep supported-simd-ops + ctor high-level-ops assoc-union + specialize-vector-words ; + FUNCTOR: define-simd-128 ( T -- ) -T-TYPE IS ${T} - -N [ 16 T-TYPE heap-size /i ] +N [ 16 T heap-size /i ] A DEFINES-CLASS ${T}-${N} +A-boa DEFINES ${A}-boa +A-with DEFINES ${A}-with >A DEFINES >${A} A{ DEFINES ${A}{ -NTH [ T-TYPE dup c-type-getter-boxer array-accessor ] -SET-NTH [ T-TYPE dup c-setter array-accessor ] +NTH [ T dup c-type-getter-boxer array-accessor ] +SET-NTH [ T dup c-setter array-accessor ] A-rep IS ${A}-rep A-vv->v-op DEFINES-PRIVATE ${A}-vv->v-op @@ -59,6 +129,16 @@ M: A pprint* pprint-object ; SYNTAX: A{ \ } [ >A ] parse-literal ; +: A-with ( x -- simd-array ) [ A-rep A ] dip simd-with ; + +\ A-with \ A-rep \ A define-with-custom-inlining + +\ A-boa [ \ A-rep \ A simd-boa ] \ A-rep 1 boa-effect define-declared + +\ A-rep rep-gather-word [ + \ A-boa \ A-rep \ A define-boa-custom-inlining +] when + INSTANCE: A sequence n-op ( v quot -- n ) [ underlying>> A-rep ] dip call ; inline +\ A \ A-with \ A-rep H{ + { v+ [ [ (simd-v+) ] \ A-vv->v-op execute ] } + { v- [ [ (simd-v-) ] \ A-vv->v-op execute ] } + { v* [ [ (simd-v*) ] \ A-vv->v-op execute ] } + { v/ [ [ (simd-v/) ] \ A-vv->v-op execute ] } + { vmin [ [ (simd-vmin) ] \ A-vv->v-op execute ] } + { vmax [ [ (simd-vmax) ] \ A-vv->v-op execute ] } + { sum [ [ (simd-sum) ] \ A-v->n-op execute ] } +} simd-vector-words + PRIVATE> ;FUNCTOR @@ -76,14 +166,16 @@ PRIVATE> ! Synthesize 256-bit vectors from a pair of 128-bit vectors FUNCTOR: define-simd-256 ( T -- ) -T-TYPE IS ${T} - -N [ 32 T-TYPE heap-size /i ] +N [ 32 T heap-size /i ] N/2 [ N 2 / ] A/2 IS ${T}-${N/2} +A/2-boa IS ${A/2}-boa +A/2-with IS ${A/2}-with A DEFINES-CLASS ${T}-${N} +A-boa DEFINES ${A}-boa +A-with DEFINES ${A}-with >A DEFINES >${A} A{ DEFINES ${A}{ @@ -137,6 +229,16 @@ M: A >pprint-sequence ; M: A pprint* pprint-object ; +: A-with ( x -- simd-array ) + [ A/2-with ] [ A/2-with ] bi [ underlying>> ] bi@ + \ A boa ; inline + +: A-boa ( ... -- simd-array ) + [ A/2-boa ] N/2 ndip A/2-boa [ underlying>> ] bi@ + \ A boa ; + +\ A-rep 2 boa-effect \ A-boa set-stack-effect + INSTANCE: A sequence : A-vv->v-op ( v1 v2 quot -- v3 ) @@ -148,4 +250,14 @@ INSTANCE: A sequence [ [ [ underlying1>> ] [ underlying2>> ] bi A-rep ] dip call A-rep ] dip call ; inline +\ A \ A-with \ A-rep H{ + { v+ [ [ (simd-v+) ] \ A-vv->v-op execute ] } + { v- [ [ (simd-v-) ] \ A-vv->v-op execute ] } + { v* [ [ (simd-v*) ] \ A-vv->v-op execute ] } + { v/ [ [ (simd-v/) ] \ A-vv->v-op execute ] } + { vmin [ [ (simd-vmin) ] \ A-vv->v-op execute ] } + { vmax [ [ (simd-vmax) ] \ A-vv->v-op execute ] } + { sum [ [ (simd-v+) ] [ (simd-sum) ] \ A-v->n-op execute ] } +} simd-vector-words + ;FUNCTOR diff --git a/basis/math/vectors/simd/intrinsics/intrinsics-tests.factor b/basis/math/vectors/simd/intrinsics/intrinsics-tests.factor new file mode 100644 index 0000000000..84eee935a0 --- /dev/null +++ b/basis/math/vectors/simd/intrinsics/intrinsics-tests.factor @@ -0,0 +1,18 @@ +IN: math.vectors.simd.intrinsics.tests +USING: math.vectors.simd.intrinsics cpu.architecture tools.test ; + +[ 16 ] [ uchar-16-rep rep-components ] unit-test +[ 16 ] [ char-16-rep rep-components ] unit-test +[ 8 ] [ ushort-8-rep rep-components ] unit-test +[ 8 ] [ short-8-rep rep-components ] unit-test +[ 4 ] [ uint-4-rep rep-components ] unit-test +[ 4 ] [ int-4-rep rep-components ] unit-test +[ 4 ] [ float-4-rep rep-components ] unit-test +[ 2 ] [ double-2-rep rep-components ] unit-test + +{ 4 1 } [ uint-4-rep (simd-boa) ] must-infer-as +{ 4 1 } [ int-4-rep (simd-boa) ] must-infer-as +{ 4 1 } [ float-4-rep (simd-boa) ] must-infer-as +{ 2 1 } [ double-2-rep (simd-boa) ] must-infer-as + + diff --git a/basis/math/vectors/simd/intrinsics/intrinsics.factor b/basis/math/vectors/simd/intrinsics/intrinsics.factor index 914d1ef169..a7d019af81 100644 --- a/basis/math/vectors/simd/intrinsics/intrinsics.factor +++ b/basis/math/vectors/simd/intrinsics/intrinsics.factor @@ -1,6 +1,8 @@ ! Copyright (C) 2009 Slava Pestov. ! See http://factorcode.org/license.txt for BSD license. -USING: kernel alien alien.data cpu.architecture libc ; +USING: alien alien.c-types alien.data assocs combinators +cpu.architecture fry generalizations kernel libc macros math +sequences ; IN: math.vectors.simd.intrinsics ERROR: bad-simd-call ; @@ -26,3 +28,53 @@ ERROR: bad-simd-call ; ! Inefficient version for when intrinsics are missing [ swap swap ] dip rep-size memcpy ; +<< + +: rep-components ( rep -- n ) + 16 swap rep-component-type heap-size /i ; foldable + +: rep-coercer ( rep -- quot ) + { + { [ dup int-vector-rep? ] [ [ >fixnum ] ] } + { [ dup float-vector-rep? ] [ [ >float ] ] } + } cond nip ; foldable + +: rep-coerce ( value rep -- value' ) + rep-coercer call( value -- value' ) ; inline + +CONSTANT: rep-gather-words + { + { 2 (simd-gather-2) } + { 4 (simd-gather-4) } + } + +: rep-gather-word ( rep -- word ) + rep-components rep-gather-words at ; + +>> + +MACRO: (simd-boa) ( rep -- quot ) + { + [ rep-coercer ] + [ rep-components ] + [ ] + [ rep-gather-word ] + } cleave + '[ _ _ napply _ _ execute ] ; + +GENERIC# supported-simd-op? 1 ( rep intrinsic -- ? ) + +M: vector-rep supported-simd-op? + { + { \ (simd-v+) [ %add-vector-reps ] } + { \ (simd-v-) [ %sub-vector-reps ] } + { \ (simd-v*) [ %mul-vector-reps ] } + { \ (simd-v/) [ %div-vector-reps ] } + { \ (simd-vmin) [ %min-vector-reps ] } + { \ (simd-vmax) [ %max-vector-reps ] } + { \ (simd-vsqrt) [ %sqrt-vector-reps ] } + { \ (simd-sum) [ %horizontal-add-vector-reps ] } + { \ (simd-broadcast) [ %broadcast-vector-reps ] } + { \ (simd-gather-2) [ %gather-vector-2-reps ] } + { \ (simd-gather-4) [ %gather-vector-4-reps ] } + } case member? ; diff --git a/basis/math/vectors/simd/simd-docs.factor b/basis/math/vectors/simd/simd-docs.factor index b110de1de8..d6131b3a71 100644 --- a/basis/math/vectors/simd/simd-docs.factor +++ b/basis/math/vectors/simd/simd-docs.factor @@ -43,22 +43,6 @@ $nl } "The " { $link float-4 } " and " { $link double-2 } " types correspond to 128-bit vector registers. The " { $link float-8 } " and " { $link double-4 } " types are not directly supported in hardware, and instead unbox to a pair of 128-bit vector registers." $nl -"Operations on " { $link float-4 } " instances:" -{ $subsection float-4-with } -{ $subsection float-4-boa } -{ $subsection POSTPONE: float-4{ } -"Operations on " { $link double-2 } " instances:" -{ $subsection double-2-with } -{ $subsection double-2-boa } -{ $subsection POSTPONE: double-2{ } -"Operations on " { $link float-8 } " instances:" -{ $subsection float-8-with } -{ $subsection float-8-boa } -{ $subsection POSTPONE: float-8{ } -"Operations on " { $link double-4 } " instances:" -{ $subsection double-4-with } -{ $subsection double-4-boa } -{ $subsection POSTPONE: double-4{ } "To actually perform vector arithmetic on SIMD vectors, use " { $link "math-vectors" } " words." { $see-also "c-types-specs" } ; @@ -184,72 +168,4 @@ ARTICLE: "math.vectors.simd" "Hardware vector arithmetic (SIMD)" { $subsection "math.vectors.simd.alien" } { $subsection "math.vectors.simd.intrinsics" } ; -! ! ! float-4 - -HELP: float-4 -{ $class-description "A sequence of four single-precision floating point values. New instances can be created with " { $link float-4-with } " or " { $link float-4-boa } "." } ; - -HELP: float-4-with -{ $values { "x" float } { "simd-array" float-4 } } -{ $description "Creates a new vector with all four components equal to a scalar." } ; - -HELP: float-4-boa -{ $values { "a" float } { "b" float } { "c" float } { "d" float } { "simd-array" float-4 } } -{ $description "Creates a new vector from four scalar components." } ; - -HELP: float-4{ -{ $syntax "float-4{ a b c d }" } -{ $description "Literal syntax for a " { $link float-4 } "." } ; - -! ! ! double-2 - -HELP: double-2 -{ $class-description "A sequence of two double-precision floating point values. New instances can be created with " { $link double-2-with } " or " { $link double-2-boa } "." } ; - -HELP: double-2-with -{ $values { "x" float } { "simd-array" double-2 } } -{ $description "Creates a new vector with both components equal to a scalar." } ; - -HELP: double-2-boa -{ $values { "a" float } { "b" float } { "simd-array" double-2 } } -{ $description "Creates a new vector from two scalar components." } ; - -HELP: double-2{ -{ $syntax "double-2{ a b }" } -{ $description "Literal syntax for a " { $link double-2 } "." } ; - -! ! ! float-8 - -HELP: float-8 -{ $class-description "A sequence of eight single-precision floating point values. New instances can be created with " { $link float-8-with } " or " { $link float-8-boa } "." } ; - -HELP: float-8-with -{ $values { "x" float } { "simd-array" float-8 } } -{ $description "Creates a new vector with all eight components equal to a scalar." } ; - -HELP: float-8-boa -{ $values { "a" float } { "b" float } { "c" float } { "d" float } { "e" float } { "f" float } { "g" float } { "h" float } { "simd-array" float-8 } } -{ $description "Creates a new vector from eight scalar components." } ; - -HELP: float-8{ -{ $syntax "float-8{ a b c d e f g h }" } -{ $description "Literal syntax for a " { $link float-8 } "." } ; - -! ! ! double-4 - -HELP: double-4 -{ $class-description "A sequence of four double-precision floating point values. New instances can be created with " { $link double-4-with } " or " { $link double-4-boa } "." } ; - -HELP: double-4-with -{ $values { "x" float } { "simd-array" double-4 } } -{ $description "Creates a new vector with all four components equal to a scalar." } ; - -HELP: double-4-boa -{ $values { "a" float } { "b" float } { "c" float } { "d" float } { "simd-array" double-4 } } -{ $description "Creates a new vector from four scalar components." } ; - -HELP: double-4{ -{ $syntax "double-4{ a b c d }" } -{ $description "Literal syntax for a " { $link double-4 } "." } ; - ABOUT: "math.vectors.simd" diff --git a/basis/math/vectors/simd/simd.factor b/basis/math/vectors/simd/simd.factor index a3c99ae217..c5e7d6f75d 100644 --- a/basis/math/vectors/simd/simd.factor +++ b/basis/math/vectors/simd/simd.factor @@ -1,185 +1,15 @@ ! Copyright (C) 2009 Slava Pestov. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors alien.c-types byte-arrays cpu.architecture -kernel math math.functions math.vectors -math.vectors.simd.functor math.vectors.simd.intrinsics -math.vectors.specialization parser prettyprint.custom sequences -sequences.private locals assocs words fry ; -FROM: alien.c-types => float ; -QUALIFIED-WITH: math m +USING: alien.c-types cpu.architecture kernel +math.vectors.simd.functor vocabs.loader ; +FROM: sequences => each ; IN: math.vectors.simd << -DEFER: float-4 -DEFER: double-2 -DEFER: float-8 -DEFER: double-4 - -"double" define-simd-128 -"float" define-simd-128 -"double" define-simd-256 -"float" define-simd-256 +{ double float char uchar short ushort int uint } +[ [ define-simd-128 ] [ define-simd-256 ] bi ] each >> -: float-4-with ( x -- simd-array ) - [ 4 ] dip >float '[ _ ] \ float-4 new replicate-as ; - -: float-4-boa ( a b c d -- simd-array ) - \ float-4 new 4sequence ; - -: double-2-with ( x -- simd-array ) - [ 2 ] dip >float '[ _ ] \ double-2 new replicate-as ; - -: double-2-boa ( a b -- simd-array ) - \ double-2 new 2sequence ; - -! More efficient expansions for the above, used when SIMD is -! actually available. - -<< - -\ float-4-with [ - drop - \ (simd-broadcast) "intrinsic" word-prop [ - [ >float float-4-rep (simd-broadcast) \ float-4 boa ] - ] [ \ float-4-with def>> ] if -] "custom-inlining" set-word-prop - -\ float-4-boa [ - drop - \ (simd-gather-4) "intrinsic" word-prop [ - [| a b c d | - a >float b >float c >float d >float - float-4-rep (simd-gather-4) \ float-4 boa - ] - ] [ \ float-4-boa def>> ] if -] "custom-inlining" set-word-prop - -\ double-2-with [ - drop - \ (simd-broadcast) "intrinsic" word-prop [ - [ >float double-2-rep (simd-broadcast) \ double-2 boa ] - ] [ \ double-2-with def>> ] if -] "custom-inlining" set-word-prop - -\ double-2-boa [ - drop - \ (simd-gather-4) "intrinsic" word-prop [ - [ [ >float ] bi@ double-2-rep (simd-gather-2) \ double-2 boa ] - ] [ \ double-2-boa def>> ] if -] "custom-inlining" set-word-prop - ->> - -: float-8-with ( x -- simd-array ) - [ float-4-with ] [ float-4-with ] bi [ underlying>> ] bi@ - \ float-8 boa ; inline - -:: float-8-boa ( a b c d e f g h -- simd-array ) - a b c d float-4-boa - e f g h float-4-boa - [ underlying>> ] bi@ - \ float-8 boa ; inline - -: double-4-with ( x -- simd-array ) - [ double-2-with ] [ double-2-with ] bi [ underlying>> ] bi@ - \ double-4 boa ; inline - -:: double-4-boa ( a b c d -- simd-array ) - a b double-2-boa - c d double-2-boa - [ underlying>> ] bi@ - \ double-4 boa ; inline - -<< - - - -\ float-4 \ float-4-with m:float H{ - { v+ [ [ (simd-v+) ] float-4-vv->v-op ] } - { v- [ [ (simd-v-) ] float-4-vv->v-op ] } - { v* [ [ (simd-v*) ] float-4-vv->v-op ] } - { v/ [ [ (simd-v/) ] float-4-vv->v-op ] } - { vmin [ [ (simd-vmin) ] float-4-vv->v-op ] } - { vmax [ [ (simd-vmax) ] float-4-vv->v-op ] } - { sum [ [ (simd-sum) ] float-4-v->n-op ] } -} simd-vector-words - -\ double-2 \ double-2-with m:float H{ - { v+ [ [ (simd-v+) ] double-2-vv->v-op ] } - { v- [ [ (simd-v-) ] double-2-vv->v-op ] } - { v* [ [ (simd-v*) ] double-2-vv->v-op ] } - { v/ [ [ (simd-v/) ] double-2-vv->v-op ] } - { vmin [ [ (simd-vmin) ] double-2-vv->v-op ] } - { vmax [ [ (simd-vmax) ] double-2-vv->v-op ] } - { sum [ [ (simd-sum) ] double-2-v->n-op ] } -} simd-vector-words - -\ float-8 \ float-8-with m:float H{ - { v+ [ [ (simd-v+) ] float-8-vv->v-op ] } - { v- [ [ (simd-v-) ] float-8-vv->v-op ] } - { v* [ [ (simd-v*) ] float-8-vv->v-op ] } - { v/ [ [ (simd-v/) ] float-8-vv->v-op ] } - { vmin [ [ (simd-vmin) ] float-8-vv->v-op ] } - { vmax [ [ (simd-vmax) ] float-8-vv->v-op ] } - { sum [ [ (simd-sum) ] [ + ] float-8-v->n-op ] } -} simd-vector-words - -\ double-4 \ double-4-with m:float H{ - { v+ [ [ (simd-v+) ] double-4-vv->v-op ] } - { v- [ [ (simd-v-) ] double-4-vv->v-op ] } - { v* [ [ (simd-v*) ] double-4-vv->v-op ] } - { v/ [ [ (simd-v/) ] double-4-vv->v-op ] } - { vmin [ [ (simd-vmin) ] double-4-vv->v-op ] } - { vmax [ [ (simd-vmax) ] double-4-vv->v-op ] } - { sum [ [ (simd-v+) ] [ (simd-sum) ] double-4-v->n-op ] } -} simd-vector-words - ->> - -USE: vocabs.loader - "math.vectors.simd.alien" require diff --git a/basis/math/vectors/simd/summary.txt b/basis/math/vectors/simd/summary.txt new file mode 100644 index 0000000000..22593f1286 --- /dev/null +++ b/basis/math/vectors/simd/summary.txt @@ -0,0 +1 @@ +Single-instruction-multiple-data parallel vector operations