math.vectors.simd: add fast intrinsic for 'nth', replace broadcast primitive with shuffles

db4
Slava Pestov 2009-09-29 04:46:38 -05:00
parent 6382aaabd5
commit f395d83379
13 changed files with 279 additions and 180 deletions

View File

@ -278,11 +278,6 @@ PURE-INSN: ##zero-vector
def: dst def: dst
literal: rep ; literal: rep ;
PURE-INSN: ##broadcast-vector
def: dst
use: src/scalar-rep
literal: rep ;
PURE-INSN: ##gather-vector-2 PURE-INSN: ##gather-vector-2
def: dst def: dst
use: src1/scalar-rep src2/scalar-rep use: src1/scalar-rep src2/scalar-rep
@ -298,11 +293,6 @@ def: dst
use: src use: src
literal: shuffle rep ; literal: shuffle rep ;
PURE-INSN: ##select-vector
def: dst
use: src
literal: n rep ;
PURE-INSN: ##add-vector PURE-INSN: ##add-vector
def: dst def: dst
use: src1 src2 use: src1 src2
@ -418,7 +408,7 @@ def: dst
use: src1 src2/scalar-rep use: src1 src2/scalar-rep
literal: rep ; literal: rep ;
! Scalar/integer conversion ! Scalar/vector conversion
PURE-INSN: ##scalar>integer PURE-INSN: ##scalar>integer
def: dst/int-rep def: dst/int-rep
use: src use: src
@ -429,6 +419,16 @@ def: dst
use: src/int-rep use: src/int-rep
literal: rep ; literal: rep ;
PURE-INSN: ##vector>scalar
def: dst/scalar-rep
use: src
literal: rep ;
PURE-INSN: ##scalar>vector
def: dst
use: src/scalar-rep
literal: rep ;
! Boxing and unboxing aliens ! Boxing and unboxing aliens
PURE-INSN: ##box-alien PURE-INSN: ##box-alien
def: dst/int-rep def: dst/int-rep

View File

@ -175,7 +175,7 @@ IN: compiler.cfg.intrinsics
{ math.vectors.simd.intrinsics:(simd-vrshift) [ [ ^^shr-vector ] emit-binary-vector-op ] } { math.vectors.simd.intrinsics:(simd-vrshift) [ [ ^^shr-vector ] emit-binary-vector-op ] }
{ math.vectors.simd.intrinsics:(simd-hlshift) [ [ ^^horizontal-shl-vector ] emit-horizontal-shift ] } { math.vectors.simd.intrinsics:(simd-hlshift) [ [ ^^horizontal-shl-vector ] emit-horizontal-shift ] }
{ math.vectors.simd.intrinsics:(simd-hrshift) [ [ ^^horizontal-shr-vector ] emit-horizontal-shift ] } { math.vectors.simd.intrinsics:(simd-hrshift) [ [ ^^horizontal-shr-vector ] emit-horizontal-shift ] }
{ math.vectors.simd.intrinsics:(simd-broadcast) [ [ ^^broadcast-vector ] emit-unary-vector-op ] } { math.vectors.simd.intrinsics:(simd-broadcast) [ emit-broadcast-vector ] }
{ math.vectors.simd.intrinsics:(simd-gather-2) [ emit-gather-vector-2 ] } { math.vectors.simd.intrinsics:(simd-gather-2) [ emit-gather-vector-2 ] }
{ math.vectors.simd.intrinsics:(simd-gather-4) [ emit-gather-vector-4 ] } { math.vectors.simd.intrinsics:(simd-gather-4) [ emit-gather-vector-4 ] }
{ math.vectors.simd.intrinsics:(simd-vshuffle) [ emit-shuffle-vector ] } { math.vectors.simd.intrinsics:(simd-vshuffle) [ emit-shuffle-vector ] }

View File

@ -1,10 +1,10 @@
! Copyright (C) 2009 Slava Pestov. ! Copyright (C) 2009 Slava Pestov.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors byte-arrays fry cpu.architecture kernel math USING: accessors byte-arrays fry cpu.architecture kernel math
sequences macros generalizations combinators sequences math.vectors.simd.intrinsics macros generalizations
combinators.short-circuit arrays compiler.tree.propagation.info combinators combinators.short-circuit arrays
compiler.cfg.builder.blocks compiler.cfg.stacks compiler.tree.propagation.info compiler.cfg.builder.blocks
compiler.cfg.stacks.local compiler.cfg.hats compiler.cfg.stacks compiler.cfg.stacks.local compiler.cfg.hats
compiler.cfg.instructions compiler.cfg.registers compiler.cfg.instructions compiler.cfg.registers
compiler.cfg.intrinsics.alien ; compiler.cfg.intrinsics.alien ;
IN: compiler.cfg.intrinsics.simd IN: compiler.cfg.intrinsics.simd
@ -70,6 +70,19 @@ MACRO: if-literals-match ( quots -- )
[ [ -2 inc-d ds-pop ] 2dip ^^shuffle-vector ds-push ] [ [ -2 inc-d ds-pop ] 2dip ^^shuffle-vector ds-push ]
{ [ shuffle? ] [ representation? ] } if-literals-match ; inline { [ shuffle? ] [ representation? ] } if-literals-match ; inline
: ^^broadcast-vector ( src rep -- dst )
[ ^^scalar>vector ] keep
[ rep-components 0 <array> ] keep
^^shuffle-vector ;
: emit-broadcast-vector ( node -- )
[ ^^broadcast-vector ] emit-unary-vector-op ;
: ^^select-vector ( src n rep -- dst )
[ rep-components swap <array> ] keep
[ ^^shuffle-vector ] keep
^^vector>scalar ;
: emit-select-vector ( node -- ) : emit-select-vector ( node -- )
[ [ -2 inc-d ds-pop ] 2dip ^^select-vector ds-push ] [ [ -2 inc-d ds-pop ] 2dip ^^select-vector ds-push ]
{ [ integer? ] [ representation? ] } if-literals-match ; inline { [ integer? ] [ representation? ] } if-literals-match ; inline

View File

@ -162,11 +162,9 @@ CODEGEN: ##integer>float %integer>float
CODEGEN: ##float>integer %float>integer CODEGEN: ##float>integer %float>integer
CODEGEN: ##unbox-vector %unbox-vector CODEGEN: ##unbox-vector %unbox-vector
CODEGEN: ##zero-vector %zero-vector CODEGEN: ##zero-vector %zero-vector
CODEGEN: ##broadcast-vector %broadcast-vector
CODEGEN: ##gather-vector-2 %gather-vector-2 CODEGEN: ##gather-vector-2 %gather-vector-2
CODEGEN: ##gather-vector-4 %gather-vector-4 CODEGEN: ##gather-vector-4 %gather-vector-4
CODEGEN: ##shuffle-vector %shuffle-vector CODEGEN: ##shuffle-vector %shuffle-vector
CODEGEN: ##select-vector %select-vector
CODEGEN: ##box-vector %box-vector CODEGEN: ##box-vector %box-vector
CODEGEN: ##add-vector %add-vector CODEGEN: ##add-vector %add-vector
CODEGEN: ##saturated-add-vector %saturated-add-vector CODEGEN: ##saturated-add-vector %saturated-add-vector
@ -193,6 +191,8 @@ CODEGEN: ##shl-vector %shl-vector
CODEGEN: ##shr-vector %shr-vector CODEGEN: ##shr-vector %shr-vector
CODEGEN: ##integer>scalar %integer>scalar CODEGEN: ##integer>scalar %integer>scalar
CODEGEN: ##scalar>integer %scalar>integer CODEGEN: ##scalar>integer %scalar>integer
CODEGEN: ##vector>scalar %vector>scalar
CODEGEN: ##scalar>vector %scalar>vector
CODEGEN: ##box-alien %box-alien CODEGEN: ##box-alien %box-alien
CODEGEN: ##box-displaced-alien %box-displaced-alien CODEGEN: ##box-displaced-alien %box-displaced-alien
CODEGEN: ##unbox-alien %unbox-alien CODEGEN: ##unbox-alien %unbox-alien

View File

@ -28,7 +28,6 @@ IN: compiler.tree.propagation.simd
(simd-broadcast) (simd-broadcast)
(simd-gather-2) (simd-gather-2)
(simd-gather-4) (simd-gather-4)
(simd-select)
alien-vector alien-vector
} [ { byte-array } "default-output-classes" set-word-prop ] each } [ { byte-array } "default-output-classes" set-word-prop ] each
@ -46,6 +45,8 @@ IN: compiler.tree.propagation.simd
\ (simd-v.) [ 2nip scalar-output-class ] "outputs" set-word-prop \ (simd-v.) [ 2nip scalar-output-class ] "outputs" set-word-prop
\ (simd-select) [ 2nip scalar-output-class ] "outputs" set-word-prop
\ assert-positive [ \ assert-positive [
real [0,inf] <class/interval-info> value-info-intersect real [0,inf] <class/interval-info> value-info-intersect
] "outputs" set-word-prop ] "outputs" set-word-prop

View File

@ -212,11 +212,9 @@ HOOK: %box-vector cpu ( dst src temp rep -- )
HOOK: %unbox-vector cpu ( dst src rep -- ) HOOK: %unbox-vector cpu ( dst src rep -- )
HOOK: %zero-vector cpu ( dst rep -- ) HOOK: %zero-vector cpu ( dst rep -- )
HOOK: %broadcast-vector cpu ( dst src rep -- )
HOOK: %gather-vector-2 cpu ( dst src1 src2 rep -- ) HOOK: %gather-vector-2 cpu ( dst src1 src2 rep -- )
HOOK: %gather-vector-4 cpu ( dst src1 src2 src3 src4 rep -- ) HOOK: %gather-vector-4 cpu ( dst src1 src2 src3 src4 rep -- )
HOOK: %shuffle-vector cpu ( dst src shuffle rep -- ) HOOK: %shuffle-vector cpu ( dst src shuffle rep -- )
HOOK: %select-vector cpu ( dst src n rep -- )
HOOK: %add-vector cpu ( dst src1 src2 rep -- ) HOOK: %add-vector cpu ( dst src1 src2 rep -- )
HOOK: %saturated-add-vector cpu ( dst src1 src2 rep -- ) HOOK: %saturated-add-vector cpu ( dst src1 src2 rep -- )
HOOK: %add-sub-vector cpu ( dst src1 src2 rep -- ) HOOK: %add-sub-vector cpu ( dst src1 src2 rep -- )
@ -243,13 +241,13 @@ HOOK: %horizontal-shr-vector cpu ( dst src1 src2 rep -- )
HOOK: %integer>scalar cpu ( dst src rep -- ) HOOK: %integer>scalar cpu ( dst src rep -- )
HOOK: %scalar>integer cpu ( dst src rep -- ) HOOK: %scalar>integer cpu ( dst src rep -- )
HOOK: %vector>scalar cpu ( dst src rep -- )
HOOK: %scalar>vector cpu ( dst src rep -- )
HOOK: %zero-vector-reps cpu ( -- reps ) HOOK: %zero-vector-reps cpu ( -- reps )
HOOK: %broadcast-vector-reps cpu ( -- reps )
HOOK: %gather-vector-2-reps cpu ( -- reps ) HOOK: %gather-vector-2-reps cpu ( -- reps )
HOOK: %gather-vector-4-reps cpu ( -- reps ) HOOK: %gather-vector-4-reps cpu ( -- reps )
HOOK: %shuffle-vector-reps cpu ( -- reps ) HOOK: %shuffle-vector-reps cpu ( -- reps )
HOOK: %select-vector-reps cpu ( -- reps )
HOOK: %add-vector-reps cpu ( -- reps ) HOOK: %add-vector-reps cpu ( -- reps )
HOOK: %saturated-add-vector-reps cpu ( -- reps ) HOOK: %saturated-add-vector-reps cpu ( -- reps )
HOOK: %add-sub-vector-reps cpu ( -- reps ) HOOK: %add-sub-vector-reps cpu ( -- reps )

View File

@ -184,6 +184,7 @@ M: ppc %shr-imm swapd SRWI ;
M: ppc %sar SRAW ; M: ppc %sar SRAW ;
M: ppc %sar-imm SRAWI ; M: ppc %sar-imm SRAWI ;
M: ppc %not NOT ; M: ppc %not NOT ;
M: ppc %neg NEG ;
:: overflow-template ( label dst src1 src2 insn -- ) :: overflow-template ( label dst src1 src2 insn -- )
0 0 LI 0 0 LI
@ -262,9 +263,10 @@ M: ppc %single>double-float double-rep %copy ;
M: ppc %double>single-float double-rep %copy ; M: ppc %double>single-float double-rep %copy ;
! VMX/AltiVec not supported yet ! VMX/AltiVec not supported yet
M: ppc %broadcast-vector-reps { } ; M: ppc %zero-vector-reps { } ;
M: ppc %gather-vector-2-reps { } ; M: ppc %gather-vector-2-reps { } ;
M: ppc %gather-vector-4-reps { } ; M: ppc %gather-vector-4-reps { } ;
M: ppc %shuffle-vector-reps { } ;
M: ppc %add-vector-reps { } ; M: ppc %add-vector-reps { } ;
M: ppc %saturated-add-vector-reps { } ; M: ppc %saturated-add-vector-reps { } ;
M: ppc %add-sub-vector-reps { } ; M: ppc %add-sub-vector-reps { } ;
@ -275,14 +277,19 @@ M: ppc %saturated-mul-vector-reps { } ;
M: ppc %div-vector-reps { } ; M: ppc %div-vector-reps { } ;
M: ppc %min-vector-reps { } ; M: ppc %min-vector-reps { } ;
M: ppc %max-vector-reps { } ; M: ppc %max-vector-reps { } ;
M: ppc %dot-vector-reps { } ;
M: ppc %sqrt-vector-reps { } ; M: ppc %sqrt-vector-reps { } ;
M: ppc %horizontal-add-vector-reps { } ; M: ppc %horizontal-add-vector-reps { } ;
M: ppc %horizontal-sub-vector-reps { } ;
M: ppc %abs-vector-reps { } ; M: ppc %abs-vector-reps { } ;
M: ppc %and-vector-reps { } ; M: ppc %and-vector-reps { } ;
M: ppc %andn-vector-reps { } ;
M: ppc %or-vector-reps { } ; M: ppc %or-vector-reps { } ;
M: ppc %xor-vector-reps { } ; M: ppc %xor-vector-reps { } ;
M: ppc %shl-vector-reps { } ; M: ppc %shl-vector-reps { } ;
M: ppc %shr-vector-reps { } ; M: ppc %shr-vector-reps { } ;
M: ppc %horizontal-shl-vector-reps { } ;
M: ppc %horizontal-shr-vector-reps { } ;
M: ppc %unbox-alien ( dst src -- ) M: ppc %unbox-alien ( dst src -- )
alien-offset LWZ ; alien-offset LWZ ;

View File

@ -600,42 +600,42 @@ M: x86 %zero-vector-reps
{ uchar-16-rep char-16-rep } { uchar-16-rep char-16-rep }
} ?at drop ; } ?at drop ;
M:: x86 %broadcast-vector ( dst src rep -- ) ! M:: x86 %broadcast-vector ( dst src rep -- )
rep unsign-rep { ! rep unsign-rep {
{ float-4-rep [ ! { float-4-rep [
dst src float-4-rep %copy ! dst src float-4-rep %copy
dst dst { 0 0 0 0 } SHUFPS ! dst dst { 0 0 0 0 } SHUFPS
] } ! ] }
{ double-2-rep [ ! { double-2-rep [
dst src MOVDDUP ! dst src MOVDDUP
] } ! ] }
{ longlong-2-rep [ ! { longlong-2-rep [
dst src = ! dst src =
[ dst dst PUNPCKLQDQ ] ! [ dst dst PUNPCKLQDQ ]
[ dst src { 0 1 0 1 } PSHUFD ] ! [ dst src { 0 1 0 1 } PSHUFD ]
if ! if
] } ! ] }
{ int-4-rep [ ! { int-4-rep [
dst src { 0 0 0 0 } PSHUFD ! dst src { 0 0 0 0 } PSHUFD
] } ! ] }
{ short-8-rep [ ! { short-8-rep [
dst src { 0 0 0 0 } PSHUFLW ! dst src { 0 0 0 0 } PSHUFLW
dst dst PUNPCKLQDQ ! dst dst PUNPCKLQDQ
] } ! ] }
{ char-16-rep [ ! { char-16-rep [
dst src char-16-rep %copy ! dst src char-16-rep %copy
dst dst PUNPCKLBW ! dst dst PUNPCKLBW
dst dst { 0 0 0 0 } PSHUFLW ! dst dst { 0 0 0 0 } PSHUFLW
dst dst PUNPCKLQDQ ! dst dst PUNPCKLQDQ
] } ! ] }
} case ; ! } case ;
!
M: x86 %broadcast-vector-reps ! M: x86 %broadcast-vector-reps
{ ! {
! Can't do this with sse1 since it will want to unbox ! ! Can't do this with sse1 since it will want to unbox
! a double-precision float and convert to single precision ! ! a double-precision float and convert to single precision
{ sse2? { float-4-rep double-2-rep longlong-2-rep ulonglong-2-rep int-4-rep uint-4-rep short-8-rep ushort-8-rep char-16-rep uchar-16-rep } } ! { sse2? { float-4-rep double-2-rep longlong-2-rep ulonglong-2-rep int-4-rep uint-4-rep short-8-rep ushort-8-rep char-16-rep uchar-16-rep } }
} available-reps ; ! } available-reps ;
M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- ) M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- )
rep unsign-rep { rep unsign-rep {
@ -721,11 +721,10 @@ M:: x86 %shuffle-vector ( dst src shuffle rep -- )
M: x86 %shuffle-vector-reps M: x86 %shuffle-vector-reps
{ {
{ sse2? { double-2-rep float-4-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } { sse? { float-4-rep } }
{ sse2? { double-2-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
} available-reps ; } available-reps ;
M: x86 %select-vector-reps { } ;
M: x86 %add-vector ( dst src1 src2 rep -- ) M: x86 %add-vector ( dst src1 src2 rep -- )
[ two-operand ] keep [ two-operand ] keep
{ {
@ -1044,8 +1043,9 @@ M: x86 %shr-vector-reps
} available-reps ; } available-reps ;
M: x86 %integer>scalar drop MOVD ; M: x86 %integer>scalar drop MOVD ;
M: x86 %scalar>integer drop MOVD ; M: x86 %scalar>integer drop MOVD ;
M: x86 %vector>scalar %copy ;
M: x86 %scalar>vector %copy ;
M:: x86 %spill ( src rep dst -- ) dst src rep %copy ; M:: x86 %spill ( src rep dst -- ) dst src rep %copy ;
M:: x86 %reload ( dst rep src -- ) dst src rep %copy ; M:: x86 %reload ( dst rep src -- ) dst src rep %copy ;

View File

@ -5,7 +5,7 @@ functors generalizations kernel literals locals math math.functions
math.vectors math.vectors.private math.vectors.simd.intrinsics math.vectors math.vectors.private math.vectors.simd.intrinsics
math.vectors.specialization parser prettyprint.custom sequences math.vectors.specialization parser prettyprint.custom sequences
sequences.private strings words definitions macros cpu.architecture sequences.private strings words definitions macros cpu.architecture
namespaces arrays quotations ; namespaces arrays quotations combinators sets ;
QUALIFIED-WITH: math m QUALIFIED-WITH: math m
IN: math.vectors.simd.functor IN: math.vectors.simd.functor
@ -28,11 +28,23 @@ MACRO: simd-boa ( rep class -- simd-array )
:: define-with-custom-inlining ( word rep class -- ) :: define-with-custom-inlining ( word rep class -- )
word [ word [
drop drop
rep \ (simd-broadcast) supported-simd-op? [ rep \ (simd-vshuffle) supported-simd-op? [
[ rep rep-coerce rep (simd-broadcast) class boa ] [ rep rep-coerce rep (simd-broadcast) class boa ]
] [ word def>> ] if ] [ word def>> ] if
] "custom-inlining" set-word-prop ; ] "custom-inlining" set-word-prop ;
: simd-nth-fast ( rep -- quot )
[ rep-components ] keep
'[ swap _ '[ _ _ (simd-select) ] 2array ] map-index
'[ swap >fixnum _ case ] ;
: simd-nth-slow ( rep -- quot )
rep-component-type dup c-type-getter-boxer array-accessor ;
MACRO: simd-nth ( rep -- x )
dup \ (simd-vshuffle) supported-simd-op?
[ simd-nth-fast ] [ simd-nth-slow ] if ;
: boa-effect ( rep n -- effect ) : boa-effect ( rep n -- effect )
[ rep-components ] dip * [ rep-components ] dip *
[ CHAR: a + 1string ] map [ CHAR: a + 1string ] map
@ -45,8 +57,8 @@ MACRO: simd-boa ( rep class -- simd-array )
ERROR: bad-schema schema ; ERROR: bad-schema schema ;
: low-level-ops ( box-quot: ( inputs... simd-op -- outputs... ) -- alist ) : low-level-ops ( simd-ops alist -- alist' )
[ simd-ops get ] dip '[ '[
1quotation 1quotation
over word-schema _ ?at [ bad-schema ] unless over word-schema _ ?at [ bad-schema ] unless
[ ] 2sequence [ ] 2sequence
@ -73,21 +85,17 @@ ERROR: bad-schema schema ;
! in the general case. ! in the general case.
elt-class m:float = [ { distance [ v- norm ] } suffix ] when ; elt-class m:float = [ { distance [ v- norm ] } suffix ] when ;
:: simd-vector-words ( class ctor rep vv->v vn->v vv->n v->v v->n -- ) TUPLE: simd class elt-class ops wrappers ctor rep ;
rep rep-component-type c-type-boxed-class :> elt-class
class : define-simd ( simd -- )
elt-class dup rep>> rep-component-type c-type-boxed-class >>elt-class
{ {
{ { +vector+ +vector+ -> +vector+ } vv->v } [ class>> ]
{ { +vector+ +scalar+ -> +vector+ } vn->v } [ elt-class>> ]
{ { +vector+ +literal+ -> +vector+ } vn->v } [ [ ops>> ] [ wrappers>> ] bi low-level-ops ]
{ { +vector+ +vector+ -> +scalar+ } vv->n } [ rep>> supported-simd-ops ]
{ { +vector+ -> +vector+ } v->v } [ [ ctor>> ] [ elt-class>> ] bi high-level-ops assoc-union ]
{ { +vector+ -> +scalar+ } v->n } } cleave
{ { +vector+ -> +nonnegative+ } v->n }
} low-level-ops
rep supported-simd-ops
ctor elt-class high-level-ops assoc-union
specialize-vector-words ; specialize-vector-words ;
:: define-simd-128-type ( class rep -- ) :: define-simd-128-type ( class rep -- )
@ -101,6 +109,11 @@ ERROR: bad-schema schema ;
rep >>rep rep >>rep
class typedef ; class typedef ;
: (define-simd-128) ( simd -- )
simd-ops get >>ops
[ define-simd ]
[ [ class>> ] [ rep>> ] bi define-simd-128-type ] bi ;
FUNCTOR: define-simd-128 ( T -- ) FUNCTOR: define-simd-128 ( T -- )
N [ 16 T heap-size /i ] N [ 16 T heap-size /i ]
@ -112,7 +125,6 @@ A-cast DEFINES ${A}-cast
>A DEFINES >${A} >A DEFINES >${A}
A{ DEFINES ${A}{ A{ DEFINES ${A}{
NTH [ T dup c-type-getter-boxer array-accessor ]
SET-NTH [ T dup c-setter array-accessor ] SET-NTH [ T dup c-setter array-accessor ]
A-rep [ A name>> "-rep" append "cpu.architecture" lookup ] A-rep [ A name>> "-rep" append "cpu.architecture" lookup ]
@ -131,7 +143,7 @@ M: A clone underlying>> clone \ A boa ; inline
M: A length drop N ; inline M: A length drop N ; inline
M: A nth-unsafe underlying>> NTH call ; inline M: A nth-unsafe underlying>> A-rep simd-nth ; inline
M: A set-nth-unsafe underlying>> SET-NTH call ; inline M: A set-nth-unsafe underlying>> SET-NTH call ; inline
@ -193,8 +205,20 @@ INSTANCE: A sequence
: A-v->n-op ( v quot -- n ) : A-v->n-op ( v quot -- n )
[ underlying>> A-rep ] dip call ; inline [ underlying>> A-rep ] dip call ; inline
\ A \ A-with \ A-rep \ A-vv->v-op \ A-vn->v-op \ A-vv->n-op \ A-v->v-op \ A-v->n-op simd-vector-words simd new
\ A \ A-rep define-simd-128-type \ A >>class
\ A-with >>ctor
\ A-rep >>rep
{
{ { +vector+ +vector+ -> +vector+ } A-vv->v-op }
{ { +vector+ +scalar+ -> +vector+ } A-vn->v-op }
{ { +vector+ +literal+ -> +vector+ } A-vn->v-op }
{ { +vector+ +vector+ -> +scalar+ } A-vv->n-op }
{ { +vector+ -> +vector+ } A-v->v-op }
{ { +vector+ -> +scalar+ } A-v->n-op }
{ { +vector+ -> +nonnegative+ } A-v->n-op }
} >>wrappers
(define-simd-128)
PRIVATE> PRIVATE>
@ -223,6 +247,11 @@ SLOT: underlying2
rep >>rep rep >>rep
class typedef ; class typedef ;
: (define-simd-256) ( simd -- )
simd-ops get { vshuffle hlshift hrshift } unique assoc-diff >>ops
[ define-simd ]
[ [ class>> ] [ rep>> ] bi define-simd-256-type ] bi ;
FUNCTOR: define-simd-256 ( T -- ) FUNCTOR: define-simd-256 ( T -- )
N [ 32 T heap-size /i ] N [ 32 T heap-size /i ]
@ -332,7 +361,19 @@ INSTANCE: A sequence
: A-v->n-op ( v1 combine-quot -- v2 ) : A-v->n-op ( v1 combine-quot -- v2 )
[ [ underlying1>> ] [ underlying2>> ] bi A-rep (simd-v+) A-rep ] dip call ; inline [ [ underlying1>> ] [ underlying2>> ] bi A-rep (simd-v+) A-rep ] dip call ; inline
\ A \ A-with \ A-rep \ A-vv->v-op \ A-vn->v-op \ A-vv->n-op \ A-v->v-op \ A-v->n-op simd-vector-words simd new
\ A \ A-rep define-simd-256-type \ A >>class
\ A-with >>ctor
\ A-rep >>rep
{
{ { +vector+ +vector+ -> +vector+ } A-vv->v-op }
{ { +vector+ +scalar+ -> +vector+ } A-vn->v-op }
{ { +vector+ +literal+ -> +vector+ } A-vn->v-op }
{ { +vector+ +vector+ -> +scalar+ } A-vv->n-op }
{ { +vector+ -> +vector+ } A-v->v-op }
{ { +vector+ -> +scalar+ } A-v->n-op }
{ { +vector+ -> +nonnegative+ } A-v->n-op }
} >>wrappers
(define-simd-256)
;FUNCTOR ;FUNCTOR

View File

@ -126,8 +126,6 @@ M: vector-rep supported-simd-op?
{ \ (simd-hlshift) [ %horizontal-shl-vector-reps ] } { \ (simd-hlshift) [ %horizontal-shl-vector-reps ] }
{ \ (simd-hrshift) [ %horizontal-shr-vector-reps ] } { \ (simd-hrshift) [ %horizontal-shr-vector-reps ] }
{ \ (simd-vshuffle) [ %shuffle-vector-reps ] } { \ (simd-vshuffle) [ %shuffle-vector-reps ] }
{ \ (simd-broadcast) [ %broadcast-vector-reps ] }
{ \ (simd-gather-2) [ %gather-vector-2-reps ] } { \ (simd-gather-2) [ %gather-vector-2-reps ] }
{ \ (simd-gather-4) [ %gather-vector-4-reps ] } { \ (simd-gather-4) [ %gather-vector-4-reps ] }
{ \ (simd-select) [ %select-vector-reps ] }
} case member? ; } case member? ;

View File

@ -21,13 +21,13 @@ ARTICLE: "math.vectors.simd.support" "Supported SIMD instruction sets and operat
$nl $nl
"SSE1 only supports single-precision SIMD (" { $snippet "float-4" } " and " { $snippet "float-8" } ")." "SSE1 only supports single-precision SIMD (" { $snippet "float-4" } " and " { $snippet "float-8" } ")."
$nl $nl
"SSE2 introduces double-precision SIMD (" { $snippet "double-2" } " and " { $snippet "double-4" } ") and integer SIMD (all types). Integer SIMD in missing a few features, in particular the " { $link vmin } " and " { $link vmax } " operations only work on " { $snippet "uchar-16" } " and " { $snippet "short-8" } "." "SSE2 introduces double-precision SIMD (" { $snippet "double-2" } " and " { $snippet "double-4" } ") and integer SIMD (all types). Integer SIMD is missing a few features, in particular the " { $link vmin } " and " { $link vmax } " operations only work on " { $snippet "uchar-16" } " and " { $snippet "short-8" } "."
$nl $nl
"SSE3 introduces horizontal adds (summing all components of a single vector register), which is useful for computing dot products. Where available, SSE3 operations are used to speed up " { $link sum } ", " { $link v. } ", " { $link norm-sq } ", " { $link norm } ", and " { $link distance } "." "SSE3 introduces horizontal adds (summing all components of a single vector register), which is useful for computing dot products. Where available, SSE3 operations are used to speed up " { $link sum } ", " { $link v. } ", " { $link norm-sq } ", " { $link norm } ", and " { $link distance } "."
$nl $nl
"SSSE3 introduces " { $link vabs } " for " { $snippet "char-16" } ", " { $snippet "short-8" } " and " { $snippet "int-4" } "." "SSSE3 introduces " { $link vabs } " for " { $snippet "char-16" } ", " { $snippet "short-8" } " and " { $snippet "int-4" } "."
$nl $nl
"SSE4.1 introduces " { $link vmin } " and " { $link vmax } " for all remaining integer types." "SSE4.1 introduces " { $link vmin } " and " { $link vmax } " for all remaining integer types, a faster instruction for " { $link v. } ", and a few other things."
$nl $nl
"On PowerPC, or older x86 chips without SSE, software fallbacks are used for all high-level vector operations. SIMD code can run with no loss in functionality, just decreased performance." "On PowerPC, or older x86 chips without SSE, software fallbacks are used for all high-level vector operations. SIMD code can run with no loss in functionality, just decreased performance."
$nl $nl
@ -183,7 +183,7 @@ $nl
ARTICLE: "math.vectors.simd.accuracy" "Numerical accuracy of SIMD primitives" ARTICLE: "math.vectors.simd.accuracy" "Numerical accuracy of SIMD primitives"
"No guarantees are made that " { $vocab-link "math.vectors.simd" } " words will give identical results on different SSE versions, or between the hardware intrinsics and the software fallbacks." "No guarantees are made that " { $vocab-link "math.vectors.simd" } " words will give identical results on different SSE versions, or between the hardware intrinsics and the software fallbacks."
$nl $nl
"In particular, horizontal operations on " { $snippet "float-4" } " and " { $snippet "float-8" } " are affected by this. They are computed with lower precision in intrinsics than the software fallback. Horizontal opeartions include anything involving adding together the components of a vector, such as " { $link sum } " or " { $link normalize } "." ; "In particular, horizontal operations on " { $snippet "float-4" } " and " { $snippet "float-8" } " are affected by this. They are computed with lower precision in intrinsics than the software fallback. Horizontal operations include anything involving adding together the components of a vector, such as " { $link sum } " or " { $link normalize } "." ;
ARTICLE: "math.vectors.simd" "Hardware vector arithmetic (SIMD)" ARTICLE: "math.vectors.simd" "Hardware vector arithmetic (SIMD)"
"The " { $vocab-link "math.vectors.simd" } " vocabulary extends the " { $vocab-link "math.vectors" } " vocabulary to support efficient vector arithmetic on small, fixed-size vectors." "The " { $vocab-link "math.vectors.simd" } " vocabulary extends the " { $vocab-link "math.vectors" } " vocabulary to support efficient vector arithmetic on small, fixed-size vectors."

View File

@ -5,7 +5,8 @@ math.vectors.simd.private prettyprint random sequences system
tools.test vocabs assocs compiler.cfg.debugger words tools.test vocabs assocs compiler.cfg.debugger words
locals math.vectors.specialization combinators cpu.architecture locals math.vectors.specialization combinators cpu.architecture
math.vectors.simd.intrinsics namespaces byte-arrays alien math.vectors.simd.intrinsics namespaces byte-arrays alien
specialized-arrays classes.struct eval ; specialized-arrays classes.struct eval classes.algebra sets
quotations ;
QUALIFIED-WITH: alien.c-types c QUALIFIED-WITH: alien.c-types c
SPECIALIZED-ARRAY: c:float SPECIALIZED-ARRAY: c:float
SIMD: c:char SIMD: c:char
@ -34,6 +35,20 @@ IN: math.vectors.simd.tests
[ V{ float-4 } ] [ [ { float-4 float-4 } declare v+ ] final-classes ] unit-test [ V{ float-4 } ] [ [ { float-4 float-4 } declare v+ ] final-classes ] unit-test
[ V{ float } ] [ [ { float-4 } declare second ] final-classes ] unit-test
[ V{ int-4 } ] [ [ { int-4 int-4 } declare v+ ] final-classes ] unit-test
[ t ] [ [ { int-4 } declare second ] final-classes first integer class<= ] unit-test
[ V{ longlong-2 } ] [ [ { longlong-2 longlong-2 } declare v+ ] final-classes ] unit-test
[ V{ integer } ] [ [ { longlong-2 } declare second ] final-classes ] unit-test
[ V{ int-8 } ] [ [ { int-8 int-8 } declare v+ ] final-classes ] unit-test
[ t ] [ [ { int-8 } declare second ] final-classes first integer class<= ] unit-test
! Test puns; only on x86 ! Test puns; only on x86
cpu x86? [ cpu x86? [
[ double-2{ 4 1024 } ] [ [ double-2{ 4 1024 } ] [
@ -78,9 +93,10 @@ CONSTANT: simd-classes
: boa-ctors ( -- seq ) : boa-ctors ( -- seq )
simd-classes [ [ name>> "-boa" append ] [ vocabulary>> ] bi lookup ] map ; simd-classes [ [ name>> "-boa" append ] [ vocabulary>> ] bi lookup ] map ;
: check-optimizer ( seq inputs quot eq-quot -- ) : check-optimizer ( seq quot eq-quot -- failures )
'[ '[
@ @
[ dup [ class ] { } map-as ] dip '[ _ declare @ ]
{ {
[ "print-mr" get [ nip test-mr mr. ] [ 2drop ] if ] [ "print-mr" get [ nip test-mr mr. ] [ 2drop ] if ]
[ "print-checks" get [ [ . ] bi@ ] [ 2drop ] if ] [ "print-checks" get [ [ . ] bi@ ] [ 2drop ] if ]
@ -104,7 +120,7 @@ CONSTANT: simd-classes
[ { } ] [ [ { } ] [
with-ctors [ with-ctors [
[ 1000 random '[ _ ] ] dip '[ { fixnum } declare _ execute ] [ 1000 random '[ _ ] ] dip '[ _ execute ]
] [ = ] check-optimizer ] [ = ] check-optimizer
] unit-test ] unit-test
@ -112,10 +128,8 @@ CONSTANT: simd-classes
[ { } ] [ [ { } ] [
boa-ctors [ boa-ctors [
dup stack-effect in>> length [ stack-effect in>> length [ 1000 random ] [ ] replicate-as ] keep
[ nip [ 1000 random ] [ ] replicate-as ] '[ _ execute ]
[ fixnum <array> swap '[ _ declare _ execute ] ]
2bi
] [ = ] check-optimizer ] [ = ] check-optimizer
] unit-test ] unit-test
@ -126,31 +140,22 @@ CONSTANT: simd-classes
:: check-vector-op ( word inputs class elt-class -- inputs quot ) :: check-vector-op ( word inputs class elt-class -- inputs quot )
inputs [ inputs [
[
{ {
{ +vector+ [ class random-vector ] } { +vector+ [ class random-vector ] }
{ +scalar+ [ 1000 random elt-class float = [ >float ] when ] } { +scalar+ [ 1000 random elt-class float = [ >float ] when ] }
} case } case
] [ ] map-as ] [ ] map-as
] [ word '[ _ execute ] ;
[
{
{ +vector+ [ class ] }
{ +scalar+ [ elt-class ] }
} case
] map
] bi
word '[ _ declare _ execute ] ;
: remove-float-words ( alist -- alist' ) : remove-float-words ( alist -- alist' )
[ drop { vsqrt n/v v/n v/ normalize } member? not ] assoc-filter ; { vsqrt n/v v/n v/ normalize } unique assoc-diff ;
: remove-integer-words ( alist -- alist' ) : remove-integer-words ( alist -- alist' )
[ drop { vlshift vrshift } member? not ] assoc-filter ; { vlshift vrshift } unique assoc-diff ;
: remove-special-words ( alist -- alist' ) : remove-special-words ( alist -- alist' )
! These have their own tests later ! These have their own tests later
[ drop { hlshift hrshift vshuffle } member? not ] assoc-filter ; { hlshift hrshift vshuffle } unique assoc-diff ;
: ops-to-check ( elt-class -- alist ) : ops-to-check ( elt-class -- alist )
[ vector-words >alist ] dip [ vector-words >alist ] dip
@ -189,13 +194,89 @@ simd-classes&reps [
[ [ { } ] ] dip first3 '[ _ _ _ check-vector-ops ] unit-test [ [ { } ] ] dip first3 '[ _ _ _ check-vector-ops ] unit-test
] each ] each
! Other regressions "== Checking shifts and permutations" print
[ 8000000 ] [
int-8{ 1000 1000 1000 1000 1000 1000 1000 1000 } [ int-4{ 256 512 1024 2048 } ]
[ { int-8 } declare dup [ * ] [ + ] 2map-reduce ] compile-call [ int-4{ 1 2 4 8 } 1 hlshift ] unit-test
] unit-test
[ int-4{ 256 512 1024 2048 } ]
[ int-4{ 1 2 4 8 } [ { int-4 } declare 1 hlshift ] compile-call ] unit-test
[ int-4{ 1 2 4 8 } ]
[ int-4{ 256 512 1024 2048 } 1 hrshift ] unit-test
[ int-4{ 1 2 4 8 } ]
[ int-4{ 256 512 1024 2048 } [ { int-4 } declare 1 hrshift ] compile-call ] unit-test
! Shuffles
: shuffles-for ( n -- shuffles )
{
{ 2 [
{
{ 0 1 }
{ 1 1 }
{ 1 0 }
{ 0 0 }
}
] }
{ 4 [
{
{ 1 2 3 0 }
{ 0 1 2 3 }
{ 1 1 2 2 }
{ 0 0 1 1 }
{ 2 2 3 3 }
{ 0 1 0 1 }
{ 2 3 2 3 }
{ 0 0 2 2 }
{ 1 1 3 3 }
{ 0 1 0 1 }
{ 2 2 3 3 }
}
] }
{ 8 [
4 shuffles-for
4 shuffles-for
[ [ 4 + ] map ] map
[ append ] 2map
] }
[ dup '[ _ random ] replicate 1array ]
} case ;
simd-classes [
[ [ { } ] ] dip
[ new length shuffles-for ] keep
'[
_ [ [ _ new [ length iota ] keep like 1quotation ] dip '[ _ vshuffle ] ]
[ = ] check-optimizer
] unit-test
] each
"== Checking element access" print
! Test element access -- it should box bignums for int-4 on x86
: test-accesses ( seq -- failures )
[ length >array ] keep
'[ [ _ 1quotation ] dip '[ _ swap nth ] ] [ = ] check-optimizer ; inline
[ { } ] [ float-4{ 1.0 2.0 3.0 4.0 } test-accesses ] unit-test
[ { } ] [ int-4{ HEX: 7fffffff 3 4 -8 } test-accesses ] unit-test
[ { } ] [ uint-4{ HEX: ffffffff 2 3 4 } test-accesses ] unit-test
[ { } ] [ double-2{ 1.0 2.0 } test-accesses ] unit-test
[ { } ] [ longlong-2{ 1 2 } test-accesses ] unit-test
[ { } ] [ ulonglong-2{ 1 2 } test-accesses ] unit-test
[ { } ] [ float-8{ 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 } test-accesses ] unit-test
[ { } ] [ int-8{ 1 2 3 4 5 6 7 8 } test-accesses ] unit-test
[ { } ] [ uint-8{ 1 2 3 4 5 6 7 8 } test-accesses ] unit-test
[ { } ] [ double-4{ 1.0 2.0 3.0 4.0 } test-accesses ] unit-test
[ { } ] [ longlong-4{ 1 2 3 4 } test-accesses ] unit-test
[ { } ] [ ulonglong-4{ 1 2 3 4 } test-accesses ] unit-test
"== Checking alien operations" print
! Vector alien intrinsics
[ float-4{ 1 2 3 4 } ] [ [ float-4{ 1 2 3 4 } ] [
[ [
float-4{ 1 2 3 4 } float-4{ 1 2 3 4 }
@ -259,60 +340,12 @@ STRUCT: simd-struct
] compile-call ] compile-call
] unit-test ] unit-test
"== Misc tests" print
[ ] [ char-16 new 1array stack. ] unit-test [ ] [ char-16 new 1array stack. ] unit-test
[ int-4{ 256 512 1024 2048 } ] ! Other regressions
[ int-4{ 1 2 4 8 } 1 hlshift ] unit-test [ 8000000 ] [
int-8{ 1000 1000 1000 1000 1000 1000 1000 1000 }
[ int-4{ 256 512 1024 2048 } ] [ { int-8 } declare dup [ * ] [ + ] 2map-reduce ] compile-call
[ int-4{ 1 2 4 8 } [ { int-4 } declare 1 hlshift ] compile-call ] unit-test ] unit-test
[ int-4{ 1 2 4 8 } ]
[ int-4{ 256 512 1024 2048 } 1 hrshift ] unit-test
[ int-4{ 1 2 4 8 } ]
[ int-4{ 256 512 1024 2048 } [ { int-4 } declare 1 hrshift ] compile-call ] unit-test
! Shuffles
: test-shuffle ( input shuffle -- failures )
[ dup class 1array ] dip
'[ _ declare _ vshuffle ]
[ call ] [ compile-call ] 2bi = not ; inline
: shuffles-for ( seq -- shuffles )
length {
{ 2 [
{
{ 0 1 }
{ 1 1 }
{ 1 0 }
{ 0 0 }
}
] }
{ 4 [
{
{ 1 2 3 0 }
{ 0 1 2 3 }
{ 1 1 2 2 }
{ 0 0 1 1 }
{ 2 2 3 3 }
{ 0 1 0 1 }
{ 2 3 2 3 }
{ 0 0 2 2 }
{ 1 1 3 3 }
{ 0 1 0 1 }
{ 2 2 3 3 }
}
] }
} case ;
: test-shuffles ( input -- failures )
dup shuffles-for [ test-shuffle ] with filter ; inline
[ { } ] [ float-4{ 1.0 2.0 3.0 4.0 } test-shuffles ] unit-test
[ { } ] [ int-4{ 1 2 3 4 } test-shuffles ] unit-test
[ { } ] [ uint-4{ 1 2 3 4 } test-shuffles ] unit-test
[ { } ] [ double-2{ 1.0 2.0 } test-shuffles ] unit-test
[ { } ] [ longlong-2{ 1 2 } test-shuffles ] unit-test
[ { } ] [ ulonglong-2{ 1 2 } test-shuffles ] unit-test

View File

@ -2,7 +2,8 @@
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: alien.c-types combinators fry kernel parser math math.parser USING: alien.c-types combinators fry kernel parser math math.parser
math.vectors.simd.functor sequences splitting vocabs.generated math.vectors.simd.functor sequences splitting vocabs.generated
vocabs.loader vocabs.parser words accessors ; vocabs.loader vocabs.parser words accessors vocabs compiler.units
definitions ;
QUALIFIED-WITH: alien.c-types c QUALIFIED-WITH: alien.c-types c
IN: math.vectors.simd IN: math.vectors.simd
@ -17,6 +18,12 @@ ERROR: bad-base-type type ;
dup { c:char c:uchar c:short c:ushort c:int c:uint c:longlong c:ulonglong c:float c:double } memq? dup { c:char c:uchar c:short c:ushort c:int c:uint c:longlong c:ulonglong c:float c:double } memq?
[ bad-base-type ] unless ; [ bad-base-type ] unless ;
: forget-instances ( -- )
[
"math.vectors.simd.instances" child-vocabs
[ forget-vocab ] each
] with-compilation-unit ;
PRIVATE> PRIVATE>
: define-simd-vocab ( type -- vocab ) : define-simd-vocab ( type -- vocab )
@ -29,3 +36,4 @@ PRIVATE>
SYNTAX: SIMD: SYNTAX: SIMD:
scan-word define-simd-vocab use-vocab ; scan-word define-simd-vocab use-vocab ;