math.vectors.simd: redesign to be more flexible, integer SIMD work in progress

Slava Pestov 2009-09-20 02:08:32 -05:00
parent e5af501429
commit ea2bcd69c7
11 changed files with 363 additions and 318 deletions

View File

@ -472,3 +472,11 @@ SYMBOLS:
\ ulong \ size_t typedef \ ulong \ size_t typedef
] with-compilation-unit ] with-compilation-unit
M: char-16-rep rep-component-type drop char ;
M: uchar-16-rep rep-component-type drop uchar ;
M: short-8-rep rep-component-type drop short ;
M: ushort-8-rep rep-component-type drop ushort ;
M: int-4-rep rep-component-type drop int ;
M: uint-4-rep rep-component-type drop uint ;
M: float-4-rep rep-component-type drop float ;
M: double-2-rep rep-component-type drop double ;

View File

@ -22,8 +22,6 @@ SINGLETONS: float-rep double-rep ;
! On x86, floating point registers are really vector registers ! On x86, floating point registers are really vector registers
SINGLETONS: SINGLETONS:
float-4-rep
double-2-rep
char-16-rep char-16-rep
uchar-16-rep uchar-16-rep
short-8-rep short-8-rep
@ -31,9 +29,11 @@ ushort-8-rep
int-4-rep int-4-rep
uint-4-rep ; uint-4-rep ;
UNION: vector-rep SINGLETONS:
float-4-rep float-4-rep
double-2-rep double-2-rep ;
UNION: int-vector-rep
char-16-rep char-16-rep
uchar-16-rep uchar-16-rep
short-8-rep short-8-rep
@ -41,6 +41,14 @@ ushort-8-rep
int-4-rep int-4-rep
uint-4-rep ; uint-4-rep ;
UNION: float-vector-rep
float-4-rep
double-2-rep ;
UNION: vector-rep
int-vector-rep
float-vector-rep ;
UNION: representation UNION: representation
any-rep any-rep
tagged-rep tagged-rep
@ -76,10 +84,15 @@ M: double-rep rep-size drop 8 ;
M: stack-params rep-size drop cell ; M: stack-params rep-size drop cell ;
M: vector-rep rep-size drop 16 ; M: vector-rep rep-size drop 16 ;
GENERIC: rep-component-type ( rep -- n )
! Methods defined in alien.c-types
GENERIC: scalar-rep-of ( rep -- rep' ) GENERIC: scalar-rep-of ( rep -- rep' )
M: float-4-rep scalar-rep-of drop float-rep ; M: float-4-rep scalar-rep-of drop float-rep ;
M: double-2-rep scalar-rep-of drop double-rep ; M: double-2-rep scalar-rep-of drop double-rep ;
M: int-vector-rep scalar-rep-of drop int-rep ;
! Mapping from register class to machine registers ! Mapping from register class to machine registers
HOOK: machine-registers cpu ( -- assoc ) HOOK: machine-registers cpu ( -- assoc )
@ -167,7 +180,6 @@ HOOK: %unbox-vector cpu ( dst src rep -- )
HOOK: %broadcast-vector cpu ( dst src rep -- ) HOOK: %broadcast-vector cpu ( dst src rep -- )
HOOK: %gather-vector-2 cpu ( dst src1 src2 rep -- ) HOOK: %gather-vector-2 cpu ( dst src1 src2 rep -- )
HOOK: %gather-vector-4 cpu ( dst src1 src2 src3 src4 rep -- ) HOOK: %gather-vector-4 cpu ( dst src1 src2 src3 src4 rep -- )
HOOK: %add-vector cpu ( dst src1 src2 rep -- ) HOOK: %add-vector cpu ( dst src1 src2 rep -- )
HOOK: %sub-vector cpu ( dst src1 src2 rep -- ) HOOK: %sub-vector cpu ( dst src1 src2 rep -- )
HOOK: %mul-vector cpu ( dst src1 src2 rep -- ) HOOK: %mul-vector cpu ( dst src1 src2 rep -- )
@ -177,6 +189,18 @@ HOOK: %max-vector cpu ( dst src1 src2 rep -- )
HOOK: %sqrt-vector cpu ( dst src rep -- ) HOOK: %sqrt-vector cpu ( dst src rep -- )
HOOK: %horizontal-add-vector cpu ( dst src rep -- ) HOOK: %horizontal-add-vector cpu ( dst src rep -- )
HOOK: %broadcast-vector-reps cpu ( -- reps )
HOOK: %gather-vector-2-reps cpu ( -- reps )
HOOK: %gather-vector-4-reps cpu ( -- reps )
HOOK: %add-vector-reps cpu ( -- reps )
HOOK: %sub-vector-reps cpu ( -- reps )
HOOK: %mul-vector-reps cpu ( -- reps )
HOOK: %div-vector-reps cpu ( -- reps )
HOOK: %min-vector-reps cpu ( -- reps )
HOOK: %max-vector-reps cpu ( -- reps )
HOOK: %sqrt-vector-reps cpu ( -- reps )
HOOK: %horizontal-add-vector-reps cpu ( -- reps )
HOOK: %unbox-alien cpu ( dst src -- ) HOOK: %unbox-alien cpu ( dst src -- )
HOOK: %unbox-any-c-ptr cpu ( dst src temp -- ) HOOK: %unbox-any-c-ptr cpu ( dst src temp -- )
HOOK: %box-alien cpu ( dst src temp -- ) HOOK: %box-alien cpu ( dst src temp -- )

View File

@ -1,7 +1,7 @@
! Copyright (C) 2009 Slava Pestov. ! Copyright (C) 2009 Slava Pestov.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: system kernel math math.order math.parser namespaces USING: system kernel memoize math math.order math.parser
alien.c-types alien.syntax combinators locals init io cpu.x86 namespaces alien.c-types alien.syntax combinators locals init io
compiler compiler.units accessors ; compiler compiler.units accessors ;
IN: cpu.x86.features IN: cpu.x86.features
@ -13,7 +13,16 @@ FUNCTION: longlong read_timestamp_counter ( ) ;
PRIVATE> PRIVATE>
ALIAS: sse-version sse_version : sse-version ( -- n )
sse_version
"sse-version" get string>number [ min ] when* ; foldable
: sse? ( -- ? ) sse-version 10 >= ; foldable
: sse2? ( -- ? ) sse-version 20 >= ; foldable
: sse3? ( -- ? ) sse-version 30 >= ; foldable
: ssse3? ( -- ? ) sse-version 33 >= ; foldable
: sse4.1? ( -- ? ) sse-version 41 >= ; foldable
: sse4.2? ( -- ? ) sse-version 42 >= ; foldable
: sse-string ( version -- string ) : sse-string ( version -- string )
{ {
@ -32,37 +41,3 @@ M: x86 instruction-count read_timestamp_counter ;
: count-instructions ( quot -- n ) : count-instructions ( quot -- n )
instruction-count [ call ] dip instruction-count swap - ; inline instruction-count [ call ] dip instruction-count swap - ; inline
USING: cpu.x86.features cpu.x86.features.private ;
:: install-sse-check ( version -- )
[
sse-version version < [
"This image was built to use " write
version sse-string write
" but your CPU only supports " write
sse-version sse-string write "." print
"You will need to bootstrap Factor again." print
flush
1 exit
] when
] "cpu.x86" add-init-hook ;
: enable-sse ( version -- )
{
{ 00 [ ] }
{ 10 [ ] }
{ 20 [ enable-sse2 ] }
{ 30 [ enable-sse3 ] }
{ 33 [ enable-sse3 ] }
{ 41 [ enable-sse3 ] }
{ 42 [ enable-sse3 ] }
} case ;
[ { sse_version } compile ] with-optimizer
"Checking for multimedia extensions: " write sse-version
"sse-version" get [ string>number min ] when*
[ sse-string write " detected" print ]
[ install-sse-check ]
[ enable-sse ] tri

View File

@ -4,7 +4,8 @@ USING: accessors assocs alien alien.c-types arrays strings
cpu.x86.assembler cpu.x86.assembler.private cpu.x86.assembler.operands cpu.x86.assembler cpu.x86.assembler.private cpu.x86.assembler.operands
cpu.architecture kernel kernel.private math memory namespaces make cpu.architecture kernel kernel.private math memory namespaces make
sequences words system layouts combinators math.order fry locals sequences words system layouts combinators math.order fry locals
compiler.constants byte-arrays compiler.constants byte-arrays io macros quotations cpu.x86.features
cpu.x86.features.private compiler compiler.units init
compiler.cfg.registers compiler.cfg.registers
compiler.cfg.instructions compiler.cfg.instructions
compiler.cfg.intrinsics compiler.cfg.intrinsics
@ -250,12 +251,26 @@ M:: x86 %unbox-vector ( dst src rep -- )
dst src byte-array-offset [+] dst src byte-array-offset [+]
rep copy-register ; rep copy-register ;
MACRO: available-reps ( alist -- )
! Each SSE version adds new representations and supports
! all old ones
unzip { } [ append ] accumulate rest swap suffix
[ [ 1quotation ] map ] bi@ zip
reverse [ { } ] suffix
'[ _ cond ] ;
M: x86 %broadcast-vector ( dst src rep -- ) M: x86 %broadcast-vector ( dst src rep -- )
{ {
{ float-4-rep [ [ MOVSS ] [ drop dup 0 SHUFPS ] 2bi ] } { float-4-rep [ [ MOVSS ] [ drop dup 0 SHUFPS ] 2bi ] }
{ double-2-rep [ [ MOVSD ] [ drop dup UNPCKLPD ] 2bi ] } { double-2-rep [ [ MOVSD ] [ drop dup UNPCKLPD ] 2bi ] }
} case ; } case ;
M: x86 %broadcast-vector-reps
{
{ sse? { float-4-rep } }
{ sse2? { double-2-rep } }
} available-reps ;
M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- ) M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- )
rep { rep {
{ {
@ -269,6 +284,11 @@ M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- )
} }
} case ; } case ;
M: x86 %gather-vector-4-reps
{
{ sse? { float-4-rep } }
} available-reps ;
M:: x86 %gather-vector-2 ( dst src1 src2 rep -- ) M:: x86 %gather-vector-2 ( dst src1 src2 rep -- )
rep { rep {
{ {
@ -280,6 +300,11 @@ M:: x86 %gather-vector-2 ( dst src1 src2 rep -- )
} }
} case ; } case ;
M: x86 %gather-vector-2-reps
{
{ sse2? { double-2-rep } }
} available-reps ;
M: x86 %add-vector ( dst src1 src2 rep -- ) M: x86 %add-vector ( dst src1 src2 rep -- )
{ {
{ float-4-rep [ ADDPS ] } { float-4-rep [ ADDPS ] }
@ -292,6 +317,12 @@ M: x86 %add-vector ( dst src1 src2 rep -- )
{ uint-4-rep [ PADDD ] } { uint-4-rep [ PADDD ] }
} case drop ; } case drop ;
M: x86 %add-vector-reps
{
{ sse? { float-4-rep } }
{ sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
} available-reps ;
M: x86 %sub-vector ( dst src1 src2 rep -- ) M: x86 %sub-vector ( dst src1 src2 rep -- )
{ {
{ float-4-rep [ SUBPS ] } { float-4-rep [ SUBPS ] }
@ -304,43 +335,92 @@ M: x86 %sub-vector ( dst src1 src2 rep -- )
{ uint-4-rep [ PSUBD ] } { uint-4-rep [ PSUBD ] }
} case drop ; } case drop ;
M: x86 %sub-vector-reps
{
{ sse? { float-4-rep } }
{ sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
} available-reps ;
M: x86 %mul-vector ( dst src1 src2 rep -- ) M: x86 %mul-vector ( dst src1 src2 rep -- )
{ {
{ float-4-rep [ MULPS ] } { float-4-rep [ MULPS ] }
{ double-2-rep [ MULPD ] } { double-2-rep [ MULPD ] }
{ int-4-rep [ PMULLW ] } { short-8-rep [ PMULLW ] }
{ ushort-8-rep [ PMULLW ] }
{ int-4-rep [ PMULLD ] }
{ uint-4-rep [ PMULLD ] }
} case drop ; } case drop ;
M: x86 %mul-vector-reps
{
{ sse? { float-4-rep } }
{ sse2? { double-2-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
} available-reps ;
M: x86 %div-vector ( dst src1 src2 rep -- ) M: x86 %div-vector ( dst src1 src2 rep -- )
{ {
{ float-4-rep [ DIVPS ] } { float-4-rep [ DIVPS ] }
{ double-2-rep [ DIVPD ] } { double-2-rep [ DIVPD ] }
} case drop ; } case drop ;
M: x86 %div-vector-reps
{
{ sse? { float-4-rep } }
{ sse2? { double-2-rep } }
} available-reps ;
M: x86 %min-vector ( dst src1 src2 rep -- ) M: x86 %min-vector ( dst src1 src2 rep -- )
{ {
{ float-4-rep [ MINPS ] } { float-4-rep [ MINPS ] }
{ double-2-rep [ MINPD ] } { double-2-rep [ MINPD ] }
{ uchar-16-rep [ PMINUB ] }
{ short-8-rep [ PMINSW ] }
} case drop ; } case drop ;
M: x86 %min-vector-reps
{
{ sse? { float-4-rep } }
{ sse2? { double-2-rep short-8-rep uchar-16-rep } }
} available-reps ;
M: x86 %max-vector ( dst src1 src2 rep -- ) M: x86 %max-vector ( dst src1 src2 rep -- )
{ {
{ float-4-rep [ MAXPS ] } { float-4-rep [ MAXPS ] }
{ double-2-rep [ MAXPD ] } { double-2-rep [ MAXPD ] }
{ uchar-16-rep [ PMAXUB ] }
{ short-8-rep [ PMAXSW ] }
} case drop ; } case drop ;
M: x86 %max-vector-reps
{
{ sse? { float-4-rep } }
{ sse2? { double-2-rep short-8-rep uchar-16-rep } }
} available-reps ;
M: x86 %sqrt-vector ( dst src rep -- ) M: x86 %sqrt-vector ( dst src rep -- )
{ {
{ float-4-rep [ SQRTPS ] } { float-4-rep [ SQRTPS ] }
{ double-2-rep [ SQRTPD ] } { double-2-rep [ SQRTPD ] }
} case ; } case ;
M: x86 %sqrt-vector-reps
{
{ sse? { float-4-rep } }
{ sse2? { double-2-rep } }
} available-reps ;
M: x86 %horizontal-add-vector ( dst src rep -- ) M: x86 %horizontal-add-vector ( dst src rep -- )
{ {
{ float-4-rep [ [ MOVAPS ] [ HADDPS ] [ HADDPS ] 2tri ] } { float-4-rep [ [ MOVAPS ] [ HADDPS ] [ HADDPS ] 2tri ] }
{ double-2-rep [ [ MOVAPD ] [ HADDPD ] 2bi ] } { double-2-rep [ [ MOVAPD ] [ HADDPD ] 2bi ] }
} case ; } case ;
M: x86 %horizontal-add-vector-reps
{
{ sse? { float-4-rep } }
{ sse2? { double-2-rep short-8-rep uchar-16-rep } }
} available-reps ;
M: x86 %unbox-alien ( dst src -- ) M: x86 %unbox-alien ( dst src -- )
alien-offset [+] MOV ; alien-offset [+] MOV ;
@ -775,3 +855,34 @@ M: x86 small-enough? ( n -- ? )
enable-sse3-simd ; enable-sse3-simd ;
enable-min/max enable-min/max
:: install-sse-check ( version -- )
[
sse-version version < [
"This image was built to use " write
version sse-string write
" but your CPU only supports " write
sse-version sse-string write "." print
"You will need to bootstrap Factor again." print
flush
1 exit
] when
] "cpu.x86" add-init-hook ;
: enable-sse ( version -- )
{
{ 00 [ ] }
{ 10 [ ] }
{ 20 [ enable-sse2 ] }
{ 30 [ enable-sse3 ] }
{ 33 [ enable-sse3 ] }
{ 41 [ enable-sse3 ] }
{ 42 [ enable-sse3 ] }
} case ;
[ { sse_version } compile ] with-optimizer
"Checking for multimedia extensions: " write sse-version 30 min
[ sse-string write " detected" print ]
[ install-sse-check ]
[ enable-sse ] tri

View File

@ -31,9 +31,7 @@ M: x87-env (set-fp-env-register)
set_x87_env ; set_x87_env ;
M: x86 (fp-env-registers) M: x86 (fp-env-registers)
sse-version 20 >= sse2? [ <sse-env> <x87-env> 2array ] [ <x87-env> 1array ] if ;
[ <sse-env> <x87-env> 2array ]
[ <x87-env> 1array ] if ;
CONSTANT: sse-exception-flag-bits HEX: 3f CONSTANT: sse-exception-flag-bits HEX: 3f
CONSTANT: sse-exception-flag>bit CONSTANT: sse-exception-flag>bit

View File

@ -1,24 +1,94 @@
! Copyright (C) 2009 Slava Pestov. ! Copyright (C) 2009 Slava Pestov.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors alien.c-types byte-arrays classes functors USING: accessors alien.c-types assocs byte-arrays classes
kernel math parser prettyprint.custom sequences effects fry functors generalizations kernel literals locals
sequences.private literals ; math math.functions math.vectors math.vectors.simd.intrinsics
math.vectors.specialization parser prettyprint.custom sequences
sequences.private strings words definitions macros cpu.architecture ;
IN: math.vectors.simd.functor IN: math.vectors.simd.functor
ERROR: bad-length got expected ; ERROR: bad-length got expected ;
MACRO: simd-boa ( rep class -- simd-array )
[ rep-components ] [ new ] bi* '[ _ _ nsequence ] ;
:: define-boa-custom-inlining ( word rep class -- )
word [
drop
rep rep rep-gather-word supported-simd-op? [
[ rep (simd-boa) class boa ]
] [ word def>> ] if
] "custom-inlining" set-word-prop ;
: simd-with ( rep class x -- simd-array )
[ rep-components ] [ new ] [ '[ _ ] ] tri* swap replicate-as ; inline
:: define-with-custom-inlining ( word rep class -- )
word [
drop
rep \ (simd-broadcast) supported-simd-op? [
[ rep rep-coerce rep (simd-broadcast) class boa ]
] [ word def>> ] if
] "custom-inlining" set-word-prop ;
: boa-effect ( rep n -- effect )
[ rep-components ] dip *
[ CHAR: a + 1string ] map
{ "simd-vector" } <effect> ;
: supported-simd-ops ( assoc rep -- assoc' )
[
{
{ v+ (simd-v+) }
{ v- (simd-v-) }
{ v* (simd-v*) }
{ v/ (simd-v/) }
{ vmin (simd-vmin) }
{ vmax (simd-vmax) }
{ sum (simd-sum) }
}
] dip
'[ nip _ swap supported-simd-op? ] assoc-filter
'[ drop _ key? ] assoc-filter ;
:: high-level-ops ( ctor -- assoc )
! Some SIMD operations are defined in terms of others.
{
{ vneg [ [ dup v- ] keep v- ] }
{ v. [ v* sum ] }
{ n+v [ [ ctor execute ] dip v+ ] }
{ v+n [ ctor execute v+ ] }
{ n-v [ [ ctor execute ] dip v- ] }
{ v-n [ ctor execute v- ] }
{ n*v [ [ ctor execute ] dip v* ] }
{ v*n [ ctor execute v* ] }
{ n/v [ [ ctor execute ] dip v/ ] }
{ v/n [ ctor execute v/ ] }
{ norm-sq [ dup v. assert-positive ] }
{ norm [ norm-sq sqrt ] }
{ normalize [ dup norm v/n ] }
{ distance [ v- norm ] }
} ;
:: simd-vector-words ( class ctor rep assoc -- )
class
rep rep-component-type c-type-boxed-class
assoc rep supported-simd-ops
ctor high-level-ops assoc-union
specialize-vector-words ;
FUNCTOR: define-simd-128 ( T -- ) FUNCTOR: define-simd-128 ( T -- )
T-TYPE IS ${T} N [ 16 T heap-size /i ]
N [ 16 T-TYPE heap-size /i ]
A DEFINES-CLASS ${T}-${N} A DEFINES-CLASS ${T}-${N}
A-boa DEFINES ${A}-boa
A-with DEFINES ${A}-with
>A DEFINES >${A} >A DEFINES >${A}
A{ DEFINES ${A}{ A{ DEFINES ${A}{
NTH [ T-TYPE dup c-type-getter-boxer array-accessor ] NTH [ T dup c-type-getter-boxer array-accessor ]
SET-NTH [ T-TYPE dup c-setter array-accessor ] SET-NTH [ T dup c-setter array-accessor ]
A-rep IS ${A}-rep A-rep IS ${A}-rep
A-vv->v-op DEFINES-PRIVATE ${A}-vv->v-op A-vv->v-op DEFINES-PRIVATE ${A}-vv->v-op
@ -59,6 +129,16 @@ M: A pprint* pprint-object ;
SYNTAX: A{ \ } [ >A ] parse-literal ; SYNTAX: A{ \ } [ >A ] parse-literal ;
: A-with ( x -- simd-array ) [ A-rep A ] dip simd-with ;
\ A-with \ A-rep \ A define-with-custom-inlining
\ A-boa [ \ A-rep \ A simd-boa ] \ A-rep 1 boa-effect define-declared
\ A-rep rep-gather-word [
\ A-boa \ A-rep \ A define-boa-custom-inlining
] when
INSTANCE: A sequence INSTANCE: A sequence
<PRIVATE <PRIVATE
@ -69,6 +149,16 @@ INSTANCE: A sequence
: A-v->n-op ( v quot -- n ) : A-v->n-op ( v quot -- n )
[ underlying>> A-rep ] dip call ; inline [ underlying>> A-rep ] dip call ; inline
\ A \ A-with \ A-rep H{
{ v+ [ [ (simd-v+) ] \ A-vv->v-op execute ] }
{ v- [ [ (simd-v-) ] \ A-vv->v-op execute ] }
{ v* [ [ (simd-v*) ] \ A-vv->v-op execute ] }
{ v/ [ [ (simd-v/) ] \ A-vv->v-op execute ] }
{ vmin [ [ (simd-vmin) ] \ A-vv->v-op execute ] }
{ vmax [ [ (simd-vmax) ] \ A-vv->v-op execute ] }
{ sum [ [ (simd-sum) ] \ A-v->n-op execute ] }
} simd-vector-words
PRIVATE> PRIVATE>
;FUNCTOR ;FUNCTOR
@ -76,14 +166,16 @@ PRIVATE>
! Synthesize 256-bit vectors from a pair of 128-bit vectors ! Synthesize 256-bit vectors from a pair of 128-bit vectors
FUNCTOR: define-simd-256 ( T -- ) FUNCTOR: define-simd-256 ( T -- )
T-TYPE IS ${T} N [ 32 T heap-size /i ]
N [ 32 T-TYPE heap-size /i ]
N/2 [ N 2 / ] N/2 [ N 2 / ]
A/2 IS ${T}-${N/2} A/2 IS ${T}-${N/2}
A/2-boa IS ${A/2}-boa
A/2-with IS ${A/2}-with
A DEFINES-CLASS ${T}-${N} A DEFINES-CLASS ${T}-${N}
A-boa DEFINES ${A}-boa
A-with DEFINES ${A}-with
>A DEFINES >${A} >A DEFINES >${A}
A{ DEFINES ${A}{ A{ DEFINES ${A}{
@ -137,6 +229,16 @@ M: A >pprint-sequence ;
M: A pprint* pprint-object ; M: A pprint* pprint-object ;
: A-with ( x -- simd-array )
[ A/2-with ] [ A/2-with ] bi [ underlying>> ] bi@
\ A boa ; inline
: A-boa ( ... -- simd-array )
[ A/2-boa ] N/2 ndip A/2-boa [ underlying>> ] bi@
\ A boa ;
\ A-rep 2 boa-effect \ A-boa set-stack-effect
INSTANCE: A sequence INSTANCE: A sequence
: A-vv->v-op ( v1 v2 quot -- v3 ) : A-vv->v-op ( v1 v2 quot -- v3 )
@ -148,4 +250,14 @@ INSTANCE: A sequence
[ [ [ underlying1>> ] [ underlying2>> ] bi A-rep ] dip call A-rep ] [ [ [ underlying1>> ] [ underlying2>> ] bi A-rep ] dip call A-rep ]
dip call ; inline dip call ; inline
\ A \ A-with \ A-rep H{
{ v+ [ [ (simd-v+) ] \ A-vv->v-op execute ] }
{ v- [ [ (simd-v-) ] \ A-vv->v-op execute ] }
{ v* [ [ (simd-v*) ] \ A-vv->v-op execute ] }
{ v/ [ [ (simd-v/) ] \ A-vv->v-op execute ] }
{ vmin [ [ (simd-vmin) ] \ A-vv->v-op execute ] }
{ vmax [ [ (simd-vmax) ] \ A-vv->v-op execute ] }
{ sum [ [ (simd-v+) ] [ (simd-sum) ] \ A-v->n-op execute ] }
} simd-vector-words
;FUNCTOR ;FUNCTOR

View File

@ -0,0 +1,18 @@
IN: math.vectors.simd.intrinsics.tests
USING: math.vectors.simd.intrinsics cpu.architecture tools.test ;
[ 16 ] [ uchar-16-rep rep-components ] unit-test
[ 16 ] [ char-16-rep rep-components ] unit-test
[ 8 ] [ ushort-8-rep rep-components ] unit-test
[ 8 ] [ short-8-rep rep-components ] unit-test
[ 4 ] [ uint-4-rep rep-components ] unit-test
[ 4 ] [ int-4-rep rep-components ] unit-test
[ 4 ] [ float-4-rep rep-components ] unit-test
[ 2 ] [ double-2-rep rep-components ] unit-test
{ 4 1 } [ uint-4-rep (simd-boa) ] must-infer-as
{ 4 1 } [ int-4-rep (simd-boa) ] must-infer-as
{ 4 1 } [ float-4-rep (simd-boa) ] must-infer-as
{ 2 1 } [ double-2-rep (simd-boa) ] must-infer-as

View File

@ -1,6 +1,8 @@
! Copyright (C) 2009 Slava Pestov. ! Copyright (C) 2009 Slava Pestov.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: kernel alien alien.data cpu.architecture libc ; USING: alien alien.c-types alien.data assocs combinators
cpu.architecture fry generalizations kernel libc macros math
sequences ;
IN: math.vectors.simd.intrinsics IN: math.vectors.simd.intrinsics
ERROR: bad-simd-call ; ERROR: bad-simd-call ;
@ -26,3 +28,53 @@ ERROR: bad-simd-call ;
! Inefficient version for when intrinsics are missing ! Inefficient version for when intrinsics are missing
[ swap <displaced-alien> swap ] dip rep-size memcpy ; [ swap <displaced-alien> swap ] dip rep-size memcpy ;
<<
: rep-components ( rep -- n )
16 swap rep-component-type heap-size /i ; foldable
: rep-coercer ( rep -- quot )
{
{ [ dup int-vector-rep? ] [ [ >fixnum ] ] }
{ [ dup float-vector-rep? ] [ [ >float ] ] }
} cond nip ; foldable
: rep-coerce ( value rep -- value' )
rep-coercer call( value -- value' ) ; inline
CONSTANT: rep-gather-words
{
{ 2 (simd-gather-2) }
{ 4 (simd-gather-4) }
}
: rep-gather-word ( rep -- word )
rep-components rep-gather-words at ;
>>
MACRO: (simd-boa) ( rep -- quot )
{
[ rep-coercer ]
[ rep-components ]
[ ]
[ rep-gather-word ]
} cleave
'[ _ _ napply _ _ execute ] ;
GENERIC# supported-simd-op? 1 ( rep intrinsic -- ? )
M: vector-rep supported-simd-op?
{
{ \ (simd-v+) [ %add-vector-reps ] }
{ \ (simd-v-) [ %sub-vector-reps ] }
{ \ (simd-v*) [ %mul-vector-reps ] }
{ \ (simd-v/) [ %div-vector-reps ] }
{ \ (simd-vmin) [ %min-vector-reps ] }
{ \ (simd-vmax) [ %max-vector-reps ] }
{ \ (simd-vsqrt) [ %sqrt-vector-reps ] }
{ \ (simd-sum) [ %horizontal-add-vector-reps ] }
{ \ (simd-broadcast) [ %broadcast-vector-reps ] }
{ \ (simd-gather-2) [ %gather-vector-2-reps ] }
{ \ (simd-gather-4) [ %gather-vector-4-reps ] }
} case member? ;

View File

@ -43,22 +43,6 @@ $nl
} }
"The " { $link float-4 } " and " { $link double-2 } " types correspond to 128-bit vector registers. The " { $link float-8 } " and " { $link double-4 } " types are not directly supported in hardware, and instead unbox to a pair of 128-bit vector registers." "The " { $link float-4 } " and " { $link double-2 } " types correspond to 128-bit vector registers. The " { $link float-8 } " and " { $link double-4 } " types are not directly supported in hardware, and instead unbox to a pair of 128-bit vector registers."
$nl $nl
"Operations on " { $link float-4 } " instances:"
{ $subsection float-4-with }
{ $subsection float-4-boa }
{ $subsection POSTPONE: float-4{ }
"Operations on " { $link double-2 } " instances:"
{ $subsection double-2-with }
{ $subsection double-2-boa }
{ $subsection POSTPONE: double-2{ }
"Operations on " { $link float-8 } " instances:"
{ $subsection float-8-with }
{ $subsection float-8-boa }
{ $subsection POSTPONE: float-8{ }
"Operations on " { $link double-4 } " instances:"
{ $subsection double-4-with }
{ $subsection double-4-boa }
{ $subsection POSTPONE: double-4{ }
"To actually perform vector arithmetic on SIMD vectors, use " { $link "math-vectors" } " words." "To actually perform vector arithmetic on SIMD vectors, use " { $link "math-vectors" } " words."
{ $see-also "c-types-specs" } ; { $see-also "c-types-specs" } ;
@ -184,72 +168,4 @@ ARTICLE: "math.vectors.simd" "Hardware vector arithmetic (SIMD)"
{ $subsection "math.vectors.simd.alien" } { $subsection "math.vectors.simd.alien" }
{ $subsection "math.vectors.simd.intrinsics" } ; { $subsection "math.vectors.simd.intrinsics" } ;
! ! ! float-4
HELP: float-4
{ $class-description "A sequence of four single-precision floating point values. New instances can be created with " { $link float-4-with } " or " { $link float-4-boa } "." } ;
HELP: float-4-with
{ $values { "x" float } { "simd-array" float-4 } }
{ $description "Creates a new vector with all four components equal to a scalar." } ;
HELP: float-4-boa
{ $values { "a" float } { "b" float } { "c" float } { "d" float } { "simd-array" float-4 } }
{ $description "Creates a new vector from four scalar components." } ;
HELP: float-4{
{ $syntax "float-4{ a b c d }" }
{ $description "Literal syntax for a " { $link float-4 } "." } ;
! ! ! double-2
HELP: double-2
{ $class-description "A sequence of two double-precision floating point values. New instances can be created with " { $link double-2-with } " or " { $link double-2-boa } "." } ;
HELP: double-2-with
{ $values { "x" float } { "simd-array" double-2 } }
{ $description "Creates a new vector with both components equal to a scalar." } ;
HELP: double-2-boa
{ $values { "a" float } { "b" float } { "simd-array" double-2 } }
{ $description "Creates a new vector from two scalar components." } ;
HELP: double-2{
{ $syntax "double-2{ a b }" }
{ $description "Literal syntax for a " { $link double-2 } "." } ;
! ! ! float-8
HELP: float-8
{ $class-description "A sequence of eight single-precision floating point values. New instances can be created with " { $link float-8-with } " or " { $link float-8-boa } "." } ;
HELP: float-8-with
{ $values { "x" float } { "simd-array" float-8 } }
{ $description "Creates a new vector with all eight components equal to a scalar." } ;
HELP: float-8-boa
{ $values { "a" float } { "b" float } { "c" float } { "d" float } { "e" float } { "f" float } { "g" float } { "h" float } { "simd-array" float-8 } }
{ $description "Creates a new vector from eight scalar components." } ;
HELP: float-8{
{ $syntax "float-8{ a b c d e f g h }" }
{ $description "Literal syntax for a " { $link float-8 } "." } ;
! ! ! double-4
HELP: double-4
{ $class-description "A sequence of four double-precision floating point values. New instances can be created with " { $link double-4-with } " or " { $link double-4-boa } "." } ;
HELP: double-4-with
{ $values { "x" float } { "simd-array" double-4 } }
{ $description "Creates a new vector with all four components equal to a scalar." } ;
HELP: double-4-boa
{ $values { "a" float } { "b" float } { "c" float } { "d" float } { "simd-array" double-4 } }
{ $description "Creates a new vector from four scalar components." } ;
HELP: double-4{
{ $syntax "double-4{ a b c d }" }
{ $description "Literal syntax for a " { $link double-4 } "." } ;
ABOUT: "math.vectors.simd" ABOUT: "math.vectors.simd"

View File

@ -1,185 +1,15 @@
! Copyright (C) 2009 Slava Pestov. ! Copyright (C) 2009 Slava Pestov.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors alien.c-types byte-arrays cpu.architecture USING: alien.c-types cpu.architecture kernel
kernel math math.functions math.vectors math.vectors.simd.functor vocabs.loader ;
math.vectors.simd.functor math.vectors.simd.intrinsics FROM: sequences => each ;
math.vectors.specialization parser prettyprint.custom sequences
sequences.private locals assocs words fry ;
FROM: alien.c-types => float ;
QUALIFIED-WITH: math m
IN: math.vectors.simd IN: math.vectors.simd
<< <<
DEFER: float-4 { double float char uchar short ushort int uint }
DEFER: double-2 [ [ define-simd-128 ] [ define-simd-256 ] bi ] each
DEFER: float-8
DEFER: double-4
"double" define-simd-128
"float" define-simd-128
"double" define-simd-256
"float" define-simd-256
>> >>
: float-4-with ( x -- simd-array )
[ 4 ] dip >float '[ _ ] \ float-4 new replicate-as ;
: float-4-boa ( a b c d -- simd-array )
\ float-4 new 4sequence ;
: double-2-with ( x -- simd-array )
[ 2 ] dip >float '[ _ ] \ double-2 new replicate-as ;
: double-2-boa ( a b -- simd-array )
\ double-2 new 2sequence ;
! More efficient expansions for the above, used when SIMD is
! actually available.
<<
\ float-4-with [
drop
\ (simd-broadcast) "intrinsic" word-prop [
[ >float float-4-rep (simd-broadcast) \ float-4 boa ]
] [ \ float-4-with def>> ] if
] "custom-inlining" set-word-prop
\ float-4-boa [
drop
\ (simd-gather-4) "intrinsic" word-prop [
[| a b c d |
a >float b >float c >float d >float
float-4-rep (simd-gather-4) \ float-4 boa
]
] [ \ float-4-boa def>> ] if
] "custom-inlining" set-word-prop
\ double-2-with [
drop
\ (simd-broadcast) "intrinsic" word-prop [
[ >float double-2-rep (simd-broadcast) \ double-2 boa ]
] [ \ double-2-with def>> ] if
] "custom-inlining" set-word-prop
\ double-2-boa [
drop
\ (simd-gather-4) "intrinsic" word-prop [
[ [ >float ] bi@ double-2-rep (simd-gather-2) \ double-2 boa ]
] [ \ double-2-boa def>> ] if
] "custom-inlining" set-word-prop
>>
: float-8-with ( x -- simd-array )
[ float-4-with ] [ float-4-with ] bi [ underlying>> ] bi@
\ float-8 boa ; inline
:: float-8-boa ( a b c d e f g h -- simd-array )
a b c d float-4-boa
e f g h float-4-boa
[ underlying>> ] bi@
\ float-8 boa ; inline
: double-4-with ( x -- simd-array )
[ double-2-with ] [ double-2-with ] bi [ underlying>> ] bi@
\ double-4 boa ; inline
:: double-4-boa ( a b c d -- simd-array )
a b double-2-boa
c d double-2-boa
[ underlying>> ] bi@
\ double-4 boa ; inline
<<
<PRIVATE
! Filter out operations that are not available, eg horizontal adds
! on SSE2. Fallback code in math.vectors is used in that case.
: supported-simd-ops ( assoc -- assoc' )
{
{ v+ (simd-v+) }
{ v- (simd-v-) }
{ v* (simd-v*) }
{ v/ (simd-v/) }
{ vmin (simd-vmin) }
{ vmax (simd-vmax) }
{ sum (simd-sum) }
} [ nip "intrinsic" word-prop ] assoc-filter
'[ drop _ key? ] assoc-filter ;
! Some SIMD operations are defined in terms of others.
:: high-level-ops ( ctor -- assoc )
{
{ vneg [ [ dup v- ] keep v- ] }
{ v. [ v* sum ] }
{ n+v [ [ ctor execute ] dip v+ ] }
{ v+n [ ctor execute v+ ] }
{ n-v [ [ ctor execute ] dip v- ] }
{ v-n [ ctor execute v- ] }
{ n*v [ [ ctor execute ] dip v* ] }
{ v*n [ ctor execute v* ] }
{ n/v [ [ ctor execute ] dip v/ ] }
{ v/n [ ctor execute v/ ] }
{ norm-sq [ dup v. assert-positive ] }
{ norm [ norm-sq sqrt ] }
{ normalize [ dup norm v/n ] }
{ distance [ v- norm ] }
} ;
:: simd-vector-words ( class ctor elt-type assoc -- )
class elt-type assoc supported-simd-ops ctor high-level-ops assoc-union
specialize-vector-words ;
PRIVATE>
\ float-4 \ float-4-with m:float H{
{ v+ [ [ (simd-v+) ] float-4-vv->v-op ] }
{ v- [ [ (simd-v-) ] float-4-vv->v-op ] }
{ v* [ [ (simd-v*) ] float-4-vv->v-op ] }
{ v/ [ [ (simd-v/) ] float-4-vv->v-op ] }
{ vmin [ [ (simd-vmin) ] float-4-vv->v-op ] }
{ vmax [ [ (simd-vmax) ] float-4-vv->v-op ] }
{ sum [ [ (simd-sum) ] float-4-v->n-op ] }
} simd-vector-words
\ double-2 \ double-2-with m:float H{
{ v+ [ [ (simd-v+) ] double-2-vv->v-op ] }
{ v- [ [ (simd-v-) ] double-2-vv->v-op ] }
{ v* [ [ (simd-v*) ] double-2-vv->v-op ] }
{ v/ [ [ (simd-v/) ] double-2-vv->v-op ] }
{ vmin [ [ (simd-vmin) ] double-2-vv->v-op ] }
{ vmax [ [ (simd-vmax) ] double-2-vv->v-op ] }
{ sum [ [ (simd-sum) ] double-2-v->n-op ] }
} simd-vector-words
\ float-8 \ float-8-with m:float H{
{ v+ [ [ (simd-v+) ] float-8-vv->v-op ] }
{ v- [ [ (simd-v-) ] float-8-vv->v-op ] }
{ v* [ [ (simd-v*) ] float-8-vv->v-op ] }
{ v/ [ [ (simd-v/) ] float-8-vv->v-op ] }
{ vmin [ [ (simd-vmin) ] float-8-vv->v-op ] }
{ vmax [ [ (simd-vmax) ] float-8-vv->v-op ] }
{ sum [ [ (simd-sum) ] [ + ] float-8-v->n-op ] }
} simd-vector-words
\ double-4 \ double-4-with m:float H{
{ v+ [ [ (simd-v+) ] double-4-vv->v-op ] }
{ v- [ [ (simd-v-) ] double-4-vv->v-op ] }
{ v* [ [ (simd-v*) ] double-4-vv->v-op ] }
{ v/ [ [ (simd-v/) ] double-4-vv->v-op ] }
{ vmin [ [ (simd-vmin) ] double-4-vv->v-op ] }
{ vmax [ [ (simd-vmax) ] double-4-vv->v-op ] }
{ sum [ [ (simd-v+) ] [ (simd-sum) ] double-4-v->n-op ] }
} simd-vector-words
>>
USE: vocabs.loader
"math.vectors.simd.alien" require "math.vectors.simd.alien" require

View File

@ -0,0 +1 @@
Single-instruction-multiple-data parallel vector operations