From f395d83379310c50cd25f6a6f27a18c4e68bbaf2 Mon Sep 17 00:00:00 2001 From: Slava Pestov Date: Tue, 29 Sep 2009 04:46:38 -0500 Subject: [PATCH] math.vectors.simd: add fast intrinsic for 'nth', replace broadcast primitive with shuffles --- .../cfg/instructions/instructions.factor | 22 +- .../compiler/cfg/intrinsics/intrinsics.factor | 2 +- .../compiler/cfg/intrinsics/simd/simd.factor | 21 +- basis/compiler/codegen/codegen.factor | 4 +- .../tree/propagation/simd/simd.factor | 3 +- basis/cpu/architecture/architecture.factor | 6 +- basis/cpu/ppc/ppc.factor | 9 +- basis/cpu/x86/x86.factor | 80 +++---- .../math/vectors/simd/functor/functor.factor | 89 ++++++-- .../vectors/simd/intrinsics/intrinsics.factor | 2 - basis/math/vectors/simd/simd-docs.factor | 6 +- basis/math/vectors/simd/simd-tests.factor | 205 ++++++++++-------- basis/math/vectors/simd/simd.factor | 10 +- 13 files changed, 279 insertions(+), 180 deletions(-) diff --git a/basis/compiler/cfg/instructions/instructions.factor b/basis/compiler/cfg/instructions/instructions.factor index 2e9d2863c3..716ae46592 100644 --- a/basis/compiler/cfg/instructions/instructions.factor +++ b/basis/compiler/cfg/instructions/instructions.factor @@ -278,11 +278,6 @@ PURE-INSN: ##zero-vector def: dst literal: rep ; -PURE-INSN: ##broadcast-vector -def: dst -use: src/scalar-rep -literal: rep ; - PURE-INSN: ##gather-vector-2 def: dst use: src1/scalar-rep src2/scalar-rep @@ -298,11 +293,6 @@ def: dst use: src literal: shuffle rep ; -PURE-INSN: ##select-vector -def: dst -use: src -literal: n rep ; - PURE-INSN: ##add-vector def: dst use: src1 src2 @@ -418,7 +408,7 @@ def: dst use: src1 src2/scalar-rep literal: rep ; -! Scalar/integer conversion +! Scalar/vector conversion PURE-INSN: ##scalar>integer def: dst/int-rep use: src @@ -429,6 +419,16 @@ def: dst use: src/int-rep literal: rep ; +PURE-INSN: ##vector>scalar +def: dst/scalar-rep +use: src +literal: rep ; + +PURE-INSN: ##scalar>vector +def: dst +use: src/scalar-rep +literal: rep ; + ! Boxing and unboxing aliens PURE-INSN: ##box-alien def: dst/int-rep diff --git a/basis/compiler/cfg/intrinsics/intrinsics.factor b/basis/compiler/cfg/intrinsics/intrinsics.factor index 9784855b6d..c9cbc71a8c 100644 --- a/basis/compiler/cfg/intrinsics/intrinsics.factor +++ b/basis/compiler/cfg/intrinsics/intrinsics.factor @@ -175,7 +175,7 @@ IN: compiler.cfg.intrinsics { math.vectors.simd.intrinsics:(simd-vrshift) [ [ ^^shr-vector ] emit-binary-vector-op ] } { math.vectors.simd.intrinsics:(simd-hlshift) [ [ ^^horizontal-shl-vector ] emit-horizontal-shift ] } { math.vectors.simd.intrinsics:(simd-hrshift) [ [ ^^horizontal-shr-vector ] emit-horizontal-shift ] } - { math.vectors.simd.intrinsics:(simd-broadcast) [ [ ^^broadcast-vector ] emit-unary-vector-op ] } + { math.vectors.simd.intrinsics:(simd-broadcast) [ emit-broadcast-vector ] } { math.vectors.simd.intrinsics:(simd-gather-2) [ emit-gather-vector-2 ] } { math.vectors.simd.intrinsics:(simd-gather-4) [ emit-gather-vector-4 ] } { math.vectors.simd.intrinsics:(simd-vshuffle) [ emit-shuffle-vector ] } diff --git a/basis/compiler/cfg/intrinsics/simd/simd.factor b/basis/compiler/cfg/intrinsics/simd/simd.factor index 7f393fdc83..07ee55fc3c 100644 --- a/basis/compiler/cfg/intrinsics/simd/simd.factor +++ b/basis/compiler/cfg/intrinsics/simd/simd.factor @@ -1,10 +1,10 @@ ! Copyright (C) 2009 Slava Pestov. ! See http://factorcode.org/license.txt for BSD license. USING: accessors byte-arrays fry cpu.architecture kernel math -sequences macros generalizations combinators -combinators.short-circuit arrays compiler.tree.propagation.info -compiler.cfg.builder.blocks compiler.cfg.stacks -compiler.cfg.stacks.local compiler.cfg.hats +sequences math.vectors.simd.intrinsics macros generalizations +combinators combinators.short-circuit arrays +compiler.tree.propagation.info compiler.cfg.builder.blocks +compiler.cfg.stacks compiler.cfg.stacks.local compiler.cfg.hats compiler.cfg.instructions compiler.cfg.registers compiler.cfg.intrinsics.alien ; IN: compiler.cfg.intrinsics.simd @@ -70,6 +70,19 @@ MACRO: if-literals-match ( quots -- ) [ [ -2 inc-d ds-pop ] 2dip ^^shuffle-vector ds-push ] { [ shuffle? ] [ representation? ] } if-literals-match ; inline +: ^^broadcast-vector ( src rep -- dst ) + [ ^^scalar>vector ] keep + [ rep-components 0 ] keep + ^^shuffle-vector ; + +: emit-broadcast-vector ( node -- ) + [ ^^broadcast-vector ] emit-unary-vector-op ; + +: ^^select-vector ( src n rep -- dst ) + [ rep-components swap ] keep + [ ^^shuffle-vector ] keep + ^^vector>scalar ; + : emit-select-vector ( node -- ) [ [ -2 inc-d ds-pop ] 2dip ^^select-vector ds-push ] { [ integer? ] [ representation? ] } if-literals-match ; inline diff --git a/basis/compiler/codegen/codegen.factor b/basis/compiler/codegen/codegen.factor index 12cf303b4c..8e99f79b36 100755 --- a/basis/compiler/codegen/codegen.factor +++ b/basis/compiler/codegen/codegen.factor @@ -162,11 +162,9 @@ CODEGEN: ##integer>float %integer>float CODEGEN: ##float>integer %float>integer CODEGEN: ##unbox-vector %unbox-vector CODEGEN: ##zero-vector %zero-vector -CODEGEN: ##broadcast-vector %broadcast-vector CODEGEN: ##gather-vector-2 %gather-vector-2 CODEGEN: ##gather-vector-4 %gather-vector-4 CODEGEN: ##shuffle-vector %shuffle-vector -CODEGEN: ##select-vector %select-vector CODEGEN: ##box-vector %box-vector CODEGEN: ##add-vector %add-vector CODEGEN: ##saturated-add-vector %saturated-add-vector @@ -193,6 +191,8 @@ CODEGEN: ##shl-vector %shl-vector CODEGEN: ##shr-vector %shr-vector CODEGEN: ##integer>scalar %integer>scalar CODEGEN: ##scalar>integer %scalar>integer +CODEGEN: ##vector>scalar %vector>scalar +CODEGEN: ##scalar>vector %scalar>vector CODEGEN: ##box-alien %box-alien CODEGEN: ##box-displaced-alien %box-displaced-alien CODEGEN: ##unbox-alien %unbox-alien diff --git a/basis/compiler/tree/propagation/simd/simd.factor b/basis/compiler/tree/propagation/simd/simd.factor index 552ab799ba..f29b534deb 100644 --- a/basis/compiler/tree/propagation/simd/simd.factor +++ b/basis/compiler/tree/propagation/simd/simd.factor @@ -28,7 +28,6 @@ IN: compiler.tree.propagation.simd (simd-broadcast) (simd-gather-2) (simd-gather-4) - (simd-select) alien-vector } [ { byte-array } "default-output-classes" set-word-prop ] each @@ -46,6 +45,8 @@ IN: compiler.tree.propagation.simd \ (simd-v.) [ 2nip scalar-output-class ] "outputs" set-word-prop +\ (simd-select) [ 2nip scalar-output-class ] "outputs" set-word-prop + \ assert-positive [ real [0,inf] value-info-intersect ] "outputs" set-word-prop diff --git a/basis/cpu/architecture/architecture.factor b/basis/cpu/architecture/architecture.factor index 9222fcc17c..9d6f8fd662 100644 --- a/basis/cpu/architecture/architecture.factor +++ b/basis/cpu/architecture/architecture.factor @@ -212,11 +212,9 @@ HOOK: %box-vector cpu ( dst src temp rep -- ) HOOK: %unbox-vector cpu ( dst src rep -- ) HOOK: %zero-vector cpu ( dst rep -- ) -HOOK: %broadcast-vector cpu ( dst src rep -- ) HOOK: %gather-vector-2 cpu ( dst src1 src2 rep -- ) HOOK: %gather-vector-4 cpu ( dst src1 src2 src3 src4 rep -- ) HOOK: %shuffle-vector cpu ( dst src shuffle rep -- ) -HOOK: %select-vector cpu ( dst src n rep -- ) HOOK: %add-vector cpu ( dst src1 src2 rep -- ) HOOK: %saturated-add-vector cpu ( dst src1 src2 rep -- ) HOOK: %add-sub-vector cpu ( dst src1 src2 rep -- ) @@ -243,13 +241,13 @@ HOOK: %horizontal-shr-vector cpu ( dst src1 src2 rep -- ) HOOK: %integer>scalar cpu ( dst src rep -- ) HOOK: %scalar>integer cpu ( dst src rep -- ) +HOOK: %vector>scalar cpu ( dst src rep -- ) +HOOK: %scalar>vector cpu ( dst src rep -- ) HOOK: %zero-vector-reps cpu ( -- reps ) -HOOK: %broadcast-vector-reps cpu ( -- reps ) HOOK: %gather-vector-2-reps cpu ( -- reps ) HOOK: %gather-vector-4-reps cpu ( -- reps ) HOOK: %shuffle-vector-reps cpu ( -- reps ) -HOOK: %select-vector-reps cpu ( -- reps ) HOOK: %add-vector-reps cpu ( -- reps ) HOOK: %saturated-add-vector-reps cpu ( -- reps ) HOOK: %add-sub-vector-reps cpu ( -- reps ) diff --git a/basis/cpu/ppc/ppc.factor b/basis/cpu/ppc/ppc.factor index d5ee166486..006d38f384 100644 --- a/basis/cpu/ppc/ppc.factor +++ b/basis/cpu/ppc/ppc.factor @@ -184,6 +184,7 @@ M: ppc %shr-imm swapd SRWI ; M: ppc %sar SRAW ; M: ppc %sar-imm SRAWI ; M: ppc %not NOT ; +M: ppc %neg NEG ; :: overflow-template ( label dst src1 src2 insn -- ) 0 0 LI @@ -262,9 +263,10 @@ M: ppc %single>double-float double-rep %copy ; M: ppc %double>single-float double-rep %copy ; ! VMX/AltiVec not supported yet -M: ppc %broadcast-vector-reps { } ; +M: ppc %zero-vector-reps { } ; M: ppc %gather-vector-2-reps { } ; M: ppc %gather-vector-4-reps { } ; +M: ppc %shuffle-vector-reps { } ; M: ppc %add-vector-reps { } ; M: ppc %saturated-add-vector-reps { } ; M: ppc %add-sub-vector-reps { } ; @@ -275,14 +277,19 @@ M: ppc %saturated-mul-vector-reps { } ; M: ppc %div-vector-reps { } ; M: ppc %min-vector-reps { } ; M: ppc %max-vector-reps { } ; +M: ppc %dot-vector-reps { } ; M: ppc %sqrt-vector-reps { } ; M: ppc %horizontal-add-vector-reps { } ; +M: ppc %horizontal-sub-vector-reps { } ; M: ppc %abs-vector-reps { } ; M: ppc %and-vector-reps { } ; +M: ppc %andn-vector-reps { } ; M: ppc %or-vector-reps { } ; M: ppc %xor-vector-reps { } ; M: ppc %shl-vector-reps { } ; M: ppc %shr-vector-reps { } ; +M: ppc %horizontal-shl-vector-reps { } ; +M: ppc %horizontal-shr-vector-reps { } ; M: ppc %unbox-alien ( dst src -- ) alien-offset LWZ ; diff --git a/basis/cpu/x86/x86.factor b/basis/cpu/x86/x86.factor index 397a26a464..14a382d6cf 100644 --- a/basis/cpu/x86/x86.factor +++ b/basis/cpu/x86/x86.factor @@ -600,42 +600,42 @@ M: x86 %zero-vector-reps { uchar-16-rep char-16-rep } } ?at drop ; -M:: x86 %broadcast-vector ( dst src rep -- ) - rep unsign-rep { - { float-4-rep [ - dst src float-4-rep %copy - dst dst { 0 0 0 0 } SHUFPS - ] } - { double-2-rep [ - dst src MOVDDUP - ] } - { longlong-2-rep [ - dst src = - [ dst dst PUNPCKLQDQ ] - [ dst src { 0 1 0 1 } PSHUFD ] - if - ] } - { int-4-rep [ - dst src { 0 0 0 0 } PSHUFD - ] } - { short-8-rep [ - dst src { 0 0 0 0 } PSHUFLW - dst dst PUNPCKLQDQ - ] } - { char-16-rep [ - dst src char-16-rep %copy - dst dst PUNPCKLBW - dst dst { 0 0 0 0 } PSHUFLW - dst dst PUNPCKLQDQ - ] } - } case ; - -M: x86 %broadcast-vector-reps - { - ! Can't do this with sse1 since it will want to unbox - ! a double-precision float and convert to single precision - { sse2? { float-4-rep double-2-rep longlong-2-rep ulonglong-2-rep int-4-rep uint-4-rep short-8-rep ushort-8-rep char-16-rep uchar-16-rep } } - } available-reps ; +! M:: x86 %broadcast-vector ( dst src rep -- ) +! rep unsign-rep { +! { float-4-rep [ +! dst src float-4-rep %copy +! dst dst { 0 0 0 0 } SHUFPS +! ] } +! { double-2-rep [ +! dst src MOVDDUP +! ] } +! { longlong-2-rep [ +! dst src = +! [ dst dst PUNPCKLQDQ ] +! [ dst src { 0 1 0 1 } PSHUFD ] +! if +! ] } +! { int-4-rep [ +! dst src { 0 0 0 0 } PSHUFD +! ] } +! { short-8-rep [ +! dst src { 0 0 0 0 } PSHUFLW +! dst dst PUNPCKLQDQ +! ] } +! { char-16-rep [ +! dst src char-16-rep %copy +! dst dst PUNPCKLBW +! dst dst { 0 0 0 0 } PSHUFLW +! dst dst PUNPCKLQDQ +! ] } +! } case ; +! +! M: x86 %broadcast-vector-reps +! { +! ! Can't do this with sse1 since it will want to unbox +! ! a double-precision float and convert to single precision +! { sse2? { float-4-rep double-2-rep longlong-2-rep ulonglong-2-rep int-4-rep uint-4-rep short-8-rep ushort-8-rep char-16-rep uchar-16-rep } } +! } available-reps ; M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- ) rep unsign-rep { @@ -721,11 +721,10 @@ M:: x86 %shuffle-vector ( dst src shuffle rep -- ) M: x86 %shuffle-vector-reps { - { sse2? { double-2-rep float-4-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + { sse? { float-4-rep } } + { sse2? { double-2-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } } available-reps ; -M: x86 %select-vector-reps { } ; - M: x86 %add-vector ( dst src1 src2 rep -- ) [ two-operand ] keep { @@ -1044,8 +1043,9 @@ M: x86 %shr-vector-reps } available-reps ; M: x86 %integer>scalar drop MOVD ; - M: x86 %scalar>integer drop MOVD ; +M: x86 %vector>scalar %copy ; +M: x86 %scalar>vector %copy ; M:: x86 %spill ( src rep dst -- ) dst src rep %copy ; M:: x86 %reload ( dst rep src -- ) dst src rep %copy ; diff --git a/basis/math/vectors/simd/functor/functor.factor b/basis/math/vectors/simd/functor/functor.factor index ba045cda60..89ab073845 100644 --- a/basis/math/vectors/simd/functor/functor.factor +++ b/basis/math/vectors/simd/functor/functor.factor @@ -5,7 +5,7 @@ functors generalizations kernel literals locals math math.functions math.vectors math.vectors.private math.vectors.simd.intrinsics math.vectors.specialization parser prettyprint.custom sequences sequences.private strings words definitions macros cpu.architecture -namespaces arrays quotations ; +namespaces arrays quotations combinators sets ; QUALIFIED-WITH: math m IN: math.vectors.simd.functor @@ -28,11 +28,23 @@ MACRO: simd-boa ( rep class -- simd-array ) :: define-with-custom-inlining ( word rep class -- ) word [ drop - rep \ (simd-broadcast) supported-simd-op? [ + rep \ (simd-vshuffle) supported-simd-op? [ [ rep rep-coerce rep (simd-broadcast) class boa ] ] [ word def>> ] if ] "custom-inlining" set-word-prop ; +: simd-nth-fast ( rep -- quot ) + [ rep-components ] keep + '[ swap _ '[ _ _ (simd-select) ] 2array ] map-index + '[ swap >fixnum _ case ] ; + +: simd-nth-slow ( rep -- quot ) + rep-component-type dup c-type-getter-boxer array-accessor ; + +MACRO: simd-nth ( rep -- x ) + dup \ (simd-vshuffle) supported-simd-op? + [ simd-nth-fast ] [ simd-nth-slow ] if ; + : boa-effect ( rep n -- effect ) [ rep-components ] dip * [ CHAR: a + 1string ] map @@ -45,8 +57,8 @@ MACRO: simd-boa ( rep class -- simd-array ) ERROR: bad-schema schema ; -: low-level-ops ( box-quot: ( inputs... simd-op -- outputs... ) -- alist ) - [ simd-ops get ] dip '[ +: low-level-ops ( simd-ops alist -- alist' ) + '[ 1quotation over word-schema _ ?at [ bad-schema ] unless [ ] 2sequence @@ -73,21 +85,17 @@ ERROR: bad-schema schema ; ! in the general case. elt-class m:float = [ { distance [ v- norm ] } suffix ] when ; -:: simd-vector-words ( class ctor rep vv->v vn->v vv->n v->v v->n -- ) - rep rep-component-type c-type-boxed-class :> elt-class - class - elt-class +TUPLE: simd class elt-class ops wrappers ctor rep ; + +: define-simd ( simd -- ) + dup rep>> rep-component-type c-type-boxed-class >>elt-class { - { { +vector+ +vector+ -> +vector+ } vv->v } - { { +vector+ +scalar+ -> +vector+ } vn->v } - { { +vector+ +literal+ -> +vector+ } vn->v } - { { +vector+ +vector+ -> +scalar+ } vv->n } - { { +vector+ -> +vector+ } v->v } - { { +vector+ -> +scalar+ } v->n } - { { +vector+ -> +nonnegative+ } v->n } - } low-level-ops - rep supported-simd-ops - ctor elt-class high-level-ops assoc-union + [ class>> ] + [ elt-class>> ] + [ [ ops>> ] [ wrappers>> ] bi low-level-ops ] + [ rep>> supported-simd-ops ] + [ [ ctor>> ] [ elt-class>> ] bi high-level-ops assoc-union ] + } cleave specialize-vector-words ; :: define-simd-128-type ( class rep -- ) @@ -101,6 +109,11 @@ ERROR: bad-schema schema ; rep >>rep class typedef ; +: (define-simd-128) ( simd -- ) + simd-ops get >>ops + [ define-simd ] + [ [ class>> ] [ rep>> ] bi define-simd-128-type ] bi ; + FUNCTOR: define-simd-128 ( T -- ) N [ 16 T heap-size /i ] @@ -112,7 +125,6 @@ A-cast DEFINES ${A}-cast >A DEFINES >${A} A{ DEFINES ${A}{ -NTH [ T dup c-type-getter-boxer array-accessor ] SET-NTH [ T dup c-setter array-accessor ] A-rep [ A name>> "-rep" append "cpu.architecture" lookup ] @@ -131,7 +143,7 @@ M: A clone underlying>> clone \ A boa ; inline M: A length drop N ; inline -M: A nth-unsafe underlying>> NTH call ; inline +M: A nth-unsafe underlying>> A-rep simd-nth ; inline M: A set-nth-unsafe underlying>> SET-NTH call ; inline @@ -193,8 +205,20 @@ INSTANCE: A sequence : A-v->n-op ( v quot -- n ) [ underlying>> A-rep ] dip call ; inline -\ A \ A-with \ A-rep \ A-vv->v-op \ A-vn->v-op \ A-vv->n-op \ A-v->v-op \ A-v->n-op simd-vector-words -\ A \ A-rep define-simd-128-type +simd new + \ A >>class + \ A-with >>ctor + \ A-rep >>rep + { + { { +vector+ +vector+ -> +vector+ } A-vv->v-op } + { { +vector+ +scalar+ -> +vector+ } A-vn->v-op } + { { +vector+ +literal+ -> +vector+ } A-vn->v-op } + { { +vector+ +vector+ -> +scalar+ } A-vv->n-op } + { { +vector+ -> +vector+ } A-v->v-op } + { { +vector+ -> +scalar+ } A-v->n-op } + { { +vector+ -> +nonnegative+ } A-v->n-op } + } >>wrappers +(define-simd-128) PRIVATE> @@ -223,6 +247,11 @@ SLOT: underlying2 rep >>rep class typedef ; +: (define-simd-256) ( simd -- ) + simd-ops get { vshuffle hlshift hrshift } unique assoc-diff >>ops + [ define-simd ] + [ [ class>> ] [ rep>> ] bi define-simd-256-type ] bi ; + FUNCTOR: define-simd-256 ( T -- ) N [ 32 T heap-size /i ] @@ -332,7 +361,19 @@ INSTANCE: A sequence : A-v->n-op ( v1 combine-quot -- v2 ) [ [ underlying1>> ] [ underlying2>> ] bi A-rep (simd-v+) A-rep ] dip call ; inline -\ A \ A-with \ A-rep \ A-vv->v-op \ A-vn->v-op \ A-vv->n-op \ A-v->v-op \ A-v->n-op simd-vector-words -\ A \ A-rep define-simd-256-type +simd new + \ A >>class + \ A-with >>ctor + \ A-rep >>rep + { + { { +vector+ +vector+ -> +vector+ } A-vv->v-op } + { { +vector+ +scalar+ -> +vector+ } A-vn->v-op } + { { +vector+ +literal+ -> +vector+ } A-vn->v-op } + { { +vector+ +vector+ -> +scalar+ } A-vv->n-op } + { { +vector+ -> +vector+ } A-v->v-op } + { { +vector+ -> +scalar+ } A-v->n-op } + { { +vector+ -> +nonnegative+ } A-v->n-op } + } >>wrappers +(define-simd-256) ;FUNCTOR diff --git a/basis/math/vectors/simd/intrinsics/intrinsics.factor b/basis/math/vectors/simd/intrinsics/intrinsics.factor index 78301ae3a5..522488d804 100644 --- a/basis/math/vectors/simd/intrinsics/intrinsics.factor +++ b/basis/math/vectors/simd/intrinsics/intrinsics.factor @@ -126,8 +126,6 @@ M: vector-rep supported-simd-op? { \ (simd-hlshift) [ %horizontal-shl-vector-reps ] } { \ (simd-hrshift) [ %horizontal-shr-vector-reps ] } { \ (simd-vshuffle) [ %shuffle-vector-reps ] } - { \ (simd-broadcast) [ %broadcast-vector-reps ] } { \ (simd-gather-2) [ %gather-vector-2-reps ] } { \ (simd-gather-4) [ %gather-vector-4-reps ] } - { \ (simd-select) [ %select-vector-reps ] } } case member? ; diff --git a/basis/math/vectors/simd/simd-docs.factor b/basis/math/vectors/simd/simd-docs.factor index 7a3ba727e0..541e5b5c22 100644 --- a/basis/math/vectors/simd/simd-docs.factor +++ b/basis/math/vectors/simd/simd-docs.factor @@ -21,13 +21,13 @@ ARTICLE: "math.vectors.simd.support" "Supported SIMD instruction sets and operat $nl "SSE1 only supports single-precision SIMD (" { $snippet "float-4" } " and " { $snippet "float-8" } ")." $nl -"SSE2 introduces double-precision SIMD (" { $snippet "double-2" } " and " { $snippet "double-4" } ") and integer SIMD (all types). Integer SIMD in missing a few features, in particular the " { $link vmin } " and " { $link vmax } " operations only work on " { $snippet "uchar-16" } " and " { $snippet "short-8" } "." +"SSE2 introduces double-precision SIMD (" { $snippet "double-2" } " and " { $snippet "double-4" } ") and integer SIMD (all types). Integer SIMD is missing a few features, in particular the " { $link vmin } " and " { $link vmax } " operations only work on " { $snippet "uchar-16" } " and " { $snippet "short-8" } "." $nl "SSE3 introduces horizontal adds (summing all components of a single vector register), which is useful for computing dot products. Where available, SSE3 operations are used to speed up " { $link sum } ", " { $link v. } ", " { $link norm-sq } ", " { $link norm } ", and " { $link distance } "." $nl "SSSE3 introduces " { $link vabs } " for " { $snippet "char-16" } ", " { $snippet "short-8" } " and " { $snippet "int-4" } "." $nl -"SSE4.1 introduces " { $link vmin } " and " { $link vmax } " for all remaining integer types." +"SSE4.1 introduces " { $link vmin } " and " { $link vmax } " for all remaining integer types, a faster instruction for " { $link v. } ", and a few other things." $nl "On PowerPC, or older x86 chips without SSE, software fallbacks are used for all high-level vector operations. SIMD code can run with no loss in functionality, just decreased performance." $nl @@ -183,7 +183,7 @@ $nl ARTICLE: "math.vectors.simd.accuracy" "Numerical accuracy of SIMD primitives" "No guarantees are made that " { $vocab-link "math.vectors.simd" } " words will give identical results on different SSE versions, or between the hardware intrinsics and the software fallbacks." $nl -"In particular, horizontal operations on " { $snippet "float-4" } " and " { $snippet "float-8" } " are affected by this. They are computed with lower precision in intrinsics than the software fallback. Horizontal opeartions include anything involving adding together the components of a vector, such as " { $link sum } " or " { $link normalize } "." ; +"In particular, horizontal operations on " { $snippet "float-4" } " and " { $snippet "float-8" } " are affected by this. They are computed with lower precision in intrinsics than the software fallback. Horizontal operations include anything involving adding together the components of a vector, such as " { $link sum } " or " { $link normalize } "." ; ARTICLE: "math.vectors.simd" "Hardware vector arithmetic (SIMD)" "The " { $vocab-link "math.vectors.simd" } " vocabulary extends the " { $vocab-link "math.vectors" } " vocabulary to support efficient vector arithmetic on small, fixed-size vectors." diff --git a/basis/math/vectors/simd/simd-tests.factor b/basis/math/vectors/simd/simd-tests.factor index 588ef83816..3f43a21c10 100644 --- a/basis/math/vectors/simd/simd-tests.factor +++ b/basis/math/vectors/simd/simd-tests.factor @@ -5,7 +5,8 @@ math.vectors.simd.private prettyprint random sequences system tools.test vocabs assocs compiler.cfg.debugger words locals math.vectors.specialization combinators cpu.architecture math.vectors.simd.intrinsics namespaces byte-arrays alien -specialized-arrays classes.struct eval ; +specialized-arrays classes.struct eval classes.algebra sets +quotations ; QUALIFIED-WITH: alien.c-types c SPECIALIZED-ARRAY: c:float SIMD: c:char @@ -34,6 +35,20 @@ IN: math.vectors.simd.tests [ V{ float-4 } ] [ [ { float-4 float-4 } declare v+ ] final-classes ] unit-test +[ V{ float } ] [ [ { float-4 } declare second ] final-classes ] unit-test + +[ V{ int-4 } ] [ [ { int-4 int-4 } declare v+ ] final-classes ] unit-test + +[ t ] [ [ { int-4 } declare second ] final-classes first integer class<= ] unit-test + +[ V{ longlong-2 } ] [ [ { longlong-2 longlong-2 } declare v+ ] final-classes ] unit-test + +[ V{ integer } ] [ [ { longlong-2 } declare second ] final-classes ] unit-test + +[ V{ int-8 } ] [ [ { int-8 int-8 } declare v+ ] final-classes ] unit-test + +[ t ] [ [ { int-8 } declare second ] final-classes first integer class<= ] unit-test + ! Test puns; only on x86 cpu x86? [ [ double-2{ 4 1024 } ] [ @@ -78,9 +93,10 @@ CONSTANT: simd-classes : boa-ctors ( -- seq ) simd-classes [ [ name>> "-boa" append ] [ vocabulary>> ] bi lookup ] map ; -: check-optimizer ( seq inputs quot eq-quot -- ) +: check-optimizer ( seq quot eq-quot -- failures ) '[ @ + [ dup [ class ] { } map-as ] dip '[ _ declare @ ] { [ "print-mr" get [ nip test-mr mr. ] [ 2drop ] if ] [ "print-checks" get [ [ . ] bi@ ] [ 2drop ] if ] @@ -104,7 +120,7 @@ CONSTANT: simd-classes [ { } ] [ with-ctors [ - [ 1000 random '[ _ ] ] dip '[ { fixnum } declare _ execute ] + [ 1000 random '[ _ ] ] dip '[ _ execute ] ] [ = ] check-optimizer ] unit-test @@ -112,10 +128,8 @@ CONSTANT: simd-classes [ { } ] [ boa-ctors [ - dup stack-effect in>> length - [ nip [ 1000 random ] [ ] replicate-as ] - [ fixnum swap '[ _ declare _ execute ] ] - 2bi + [ stack-effect in>> length [ 1000 random ] [ ] replicate-as ] keep + '[ _ execute ] ] [ = ] check-optimizer ] unit-test @@ -126,31 +140,22 @@ CONSTANT: simd-classes :: check-vector-op ( word inputs class elt-class -- inputs quot ) inputs [ - [ - { - { +vector+ [ class random-vector ] } - { +scalar+ [ 1000 random elt-class float = [ >float ] when ] } - } case - ] [ ] map-as - ] [ - [ - { - { +vector+ [ class ] } - { +scalar+ [ elt-class ] } - } case - ] map - ] bi - word '[ _ declare _ execute ] ; + { + { +vector+ [ class random-vector ] } + { +scalar+ [ 1000 random elt-class float = [ >float ] when ] } + } case + ] [ ] map-as + word '[ _ execute ] ; : remove-float-words ( alist -- alist' ) - [ drop { vsqrt n/v v/n v/ normalize } member? not ] assoc-filter ; + { vsqrt n/v v/n v/ normalize } unique assoc-diff ; : remove-integer-words ( alist -- alist' ) - [ drop { vlshift vrshift } member? not ] assoc-filter ; + { vlshift vrshift } unique assoc-diff ; : remove-special-words ( alist -- alist' ) ! These have their own tests later - [ drop { hlshift hrshift vshuffle } member? not ] assoc-filter ; + { hlshift hrshift vshuffle } unique assoc-diff ; : ops-to-check ( elt-class -- alist ) [ vector-words >alist ] dip @@ -189,13 +194,89 @@ simd-classes&reps [ [ [ { } ] ] dip first3 '[ _ _ _ check-vector-ops ] unit-test ] each -! Other regressions -[ 8000000 ] [ - int-8{ 1000 1000 1000 1000 1000 1000 1000 1000 } - [ { int-8 } declare dup [ * ] [ + ] 2map-reduce ] compile-call -] unit-test +"== Checking shifts and permutations" print + +[ int-4{ 256 512 1024 2048 } ] +[ int-4{ 1 2 4 8 } 1 hlshift ] unit-test + +[ int-4{ 256 512 1024 2048 } ] +[ int-4{ 1 2 4 8 } [ { int-4 } declare 1 hlshift ] compile-call ] unit-test + +[ int-4{ 1 2 4 8 } ] +[ int-4{ 256 512 1024 2048 } 1 hrshift ] unit-test + +[ int-4{ 1 2 4 8 } ] +[ int-4{ 256 512 1024 2048 } [ { int-4 } declare 1 hrshift ] compile-call ] unit-test + +! Shuffles +: shuffles-for ( n -- shuffles ) + { + { 2 [ + { + { 0 1 } + { 1 1 } + { 1 0 } + { 0 0 } + } + ] } + { 4 [ + { + { 1 2 3 0 } + { 0 1 2 3 } + { 1 1 2 2 } + { 0 0 1 1 } + { 2 2 3 3 } + { 0 1 0 1 } + { 2 3 2 3 } + { 0 0 2 2 } + { 1 1 3 3 } + { 0 1 0 1 } + { 2 2 3 3 } + } + ] } + { 8 [ + 4 shuffles-for + 4 shuffles-for + [ [ 4 + ] map ] map + [ append ] 2map + ] } + [ dup '[ _ random ] replicate 1array ] + } case ; + +simd-classes [ + [ [ { } ] ] dip + [ new length shuffles-for ] keep + '[ + _ [ [ _ new [ length iota ] keep like 1quotation ] dip '[ _ vshuffle ] ] + [ = ] check-optimizer + ] unit-test +] each + +"== Checking element access" print + +! Test element access -- it should box bignums for int-4 on x86 +: test-accesses ( seq -- failures ) + [ length >array ] keep + '[ [ _ 1quotation ] dip '[ _ swap nth ] ] [ = ] check-optimizer ; inline + +[ { } ] [ float-4{ 1.0 2.0 3.0 4.0 } test-accesses ] unit-test +[ { } ] [ int-4{ HEX: 7fffffff 3 4 -8 } test-accesses ] unit-test +[ { } ] [ uint-4{ HEX: ffffffff 2 3 4 } test-accesses ] unit-test + +[ { } ] [ double-2{ 1.0 2.0 } test-accesses ] unit-test +[ { } ] [ longlong-2{ 1 2 } test-accesses ] unit-test +[ { } ] [ ulonglong-2{ 1 2 } test-accesses ] unit-test + +[ { } ] [ float-8{ 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 } test-accesses ] unit-test +[ { } ] [ int-8{ 1 2 3 4 5 6 7 8 } test-accesses ] unit-test +[ { } ] [ uint-8{ 1 2 3 4 5 6 7 8 } test-accesses ] unit-test + +[ { } ] [ double-4{ 1.0 2.0 3.0 4.0 } test-accesses ] unit-test +[ { } ] [ longlong-4{ 1 2 3 4 } test-accesses ] unit-test +[ { } ] [ ulonglong-4{ 1 2 3 4 } test-accesses ] unit-test + +"== Checking alien operations" print -! Vector alien intrinsics [ float-4{ 1 2 3 4 } ] [ [ float-4{ 1 2 3 4 } @@ -259,60 +340,12 @@ STRUCT: simd-struct ] compile-call ] unit-test +"== Misc tests" print + [ ] [ char-16 new 1array stack. ] unit-test -[ int-4{ 256 512 1024 2048 } ] -[ int-4{ 1 2 4 8 } 1 hlshift ] unit-test - -[ int-4{ 256 512 1024 2048 } ] -[ int-4{ 1 2 4 8 } [ { int-4 } declare 1 hlshift ] compile-call ] unit-test - -[ int-4{ 1 2 4 8 } ] -[ int-4{ 256 512 1024 2048 } 1 hrshift ] unit-test - -[ int-4{ 1 2 4 8 } ] -[ int-4{ 256 512 1024 2048 } [ { int-4 } declare 1 hrshift ] compile-call ] unit-test - -! Shuffles -: test-shuffle ( input shuffle -- failures ) - [ dup class 1array ] dip - '[ _ declare _ vshuffle ] - [ call ] [ compile-call ] 2bi = not ; inline - -: shuffles-for ( seq -- shuffles ) - length { - { 2 [ - { - { 0 1 } - { 1 1 } - { 1 0 } - { 0 0 } - } - ] } - { 4 [ - { - { 1 2 3 0 } - { 0 1 2 3 } - { 1 1 2 2 } - { 0 0 1 1 } - { 2 2 3 3 } - { 0 1 0 1 } - { 2 3 2 3 } - { 0 0 2 2 } - { 1 1 3 3 } - { 0 1 0 1 } - { 2 2 3 3 } - } - ] } - } case ; - -: test-shuffles ( input -- failures ) - dup shuffles-for [ test-shuffle ] with filter ; inline - -[ { } ] [ float-4{ 1.0 2.0 3.0 4.0 } test-shuffles ] unit-test -[ { } ] [ int-4{ 1 2 3 4 } test-shuffles ] unit-test -[ { } ] [ uint-4{ 1 2 3 4 } test-shuffles ] unit-test - -[ { } ] [ double-2{ 1.0 2.0 } test-shuffles ] unit-test -[ { } ] [ longlong-2{ 1 2 } test-shuffles ] unit-test -[ { } ] [ ulonglong-2{ 1 2 } test-shuffles ] unit-test +! Other regressions +[ 8000000 ] [ + int-8{ 1000 1000 1000 1000 1000 1000 1000 1000 } + [ { int-8 } declare dup [ * ] [ + ] 2map-reduce ] compile-call +] unit-test diff --git a/basis/math/vectors/simd/simd.factor b/basis/math/vectors/simd/simd.factor index 230f43029a..af04e283f0 100644 --- a/basis/math/vectors/simd/simd.factor +++ b/basis/math/vectors/simd/simd.factor @@ -2,7 +2,8 @@ ! See http://factorcode.org/license.txt for BSD license. USING: alien.c-types combinators fry kernel parser math math.parser math.vectors.simd.functor sequences splitting vocabs.generated -vocabs.loader vocabs.parser words accessors ; +vocabs.loader vocabs.parser words accessors vocabs compiler.units +definitions ; QUALIFIED-WITH: alien.c-types c IN: math.vectors.simd @@ -17,6 +18,12 @@ ERROR: bad-base-type type ; dup { c:char c:uchar c:short c:ushort c:int c:uint c:longlong c:ulonglong c:float c:double } memq? [ bad-base-type ] unless ; +: forget-instances ( -- ) + [ + "math.vectors.simd.instances" child-vocabs + [ forget-vocab ] each + ] with-compilation-unit ; + PRIVATE> : define-simd-vocab ( type -- vocab ) @@ -29,3 +36,4 @@ PRIVATE> SYNTAX: SIMD: scan-word define-simd-vocab use-vocab ; +