math.vectors.simd: redesign to be more flexible, integer SIMD work in progress

2009-09-20 02:08:32 -05:00 · 2009-09-20 02:08:32 -05:00 · ea2bcd69c7
parent e5af501429
commit ea2bcd69c7
11 changed files with 363 additions and 318 deletions
--- a/basis/alien/c-types/c-types.factor
+++ b/basis/alien/c-types/c-types.factor
@ -472,3 +472,11 @@ SYMBOLS:
    \ ulong \ size_t typedef
 ] with-compilation-unit
 M: char-16-rep rep-component-type drop char ;
 M: uchar-16-rep rep-component-type drop uchar ;
 M: short-8-rep rep-component-type drop short ;
 M: ushort-8-rep rep-component-type drop ushort ;
 M: int-4-rep rep-component-type drop int ;
 M: uint-4-rep rep-component-type drop uint ;
 M: float-4-rep rep-component-type drop float ;
 M: double-2-rep rep-component-type drop double ;
--- a/basis/cpu/architecture/architecture.factor
+++ b/basis/cpu/architecture/architecture.factor
@ -22,8 +22,6 @@ SINGLETONS: float-rep double-rep ;
 ! On x86, floating point registers are really vector registers
 SINGLETONS:
 float-4-rep
 double-2-rep
 char-16-rep
 uchar-16-rep
 short-8-rep
@ -31,9 +29,11 @@ ushort-8-rep
 int-4-rep
 uint-4-rep ;
-UNION: vector-rep
+SINGLETONS:
 float-4-rep
-double-2-rep
+double-2-rep ;
 UNION: int-vector-rep
 char-16-rep
 uchar-16-rep
 short-8-rep
@ -41,6 +41,14 @@ ushort-8-rep
 int-4-rep
 uint-4-rep ;
 UNION: float-vector-rep
 float-4-rep
 double-2-rep ;
 UNION: vector-rep
 int-vector-rep
 float-vector-rep ;
 UNION: representation
 any-rep
 tagged-rep
@ -76,10 +84,15 @@ M: double-rep rep-size drop 8 ;
 M: stack-params rep-size drop cell ;
 M: vector-rep rep-size drop 16 ;
 GENERIC: rep-component-type ( rep -- n )
 ! Methods defined in alien.c-types
 GENERIC: scalar-rep-of ( rep -- rep' )
 M: float-4-rep scalar-rep-of drop float-rep ;
 M: double-2-rep scalar-rep-of drop double-rep ;
 M: int-vector-rep scalar-rep-of drop int-rep ;
 ! Mapping from register class to machine registers
 HOOK: machine-registers cpu ( -- assoc )
@ -167,7 +180,6 @@ HOOK: %unbox-vector cpu ( dst src rep -- )
 HOOK: %broadcast-vector cpu ( dst src rep -- )
 HOOK: %gather-vector-2 cpu ( dst src1 src2 rep -- )
 HOOK: %gather-vector-4 cpu ( dst src1 src2 src3 src4 rep -- )
 HOOK: %add-vector cpu ( dst src1 src2 rep -- )
 HOOK: %sub-vector cpu ( dst src1 src2 rep -- )
 HOOK: %mul-vector cpu ( dst src1 src2 rep -- )
@ -177,6 +189,18 @@ HOOK: %max-vector cpu ( dst src1 src2 rep -- )
 HOOK: %sqrt-vector cpu ( dst src rep -- )
 HOOK: %horizontal-add-vector cpu ( dst src rep -- )
 HOOK: %broadcast-vector-reps cpu ( -- reps )
 HOOK: %gather-vector-2-reps cpu ( -- reps )
 HOOK: %gather-vector-4-reps cpu ( -- reps )
 HOOK: %add-vector-reps cpu ( -- reps )
 HOOK: %sub-vector-reps cpu ( -- reps )
 HOOK: %mul-vector-reps cpu ( -- reps )
 HOOK: %div-vector-reps cpu ( -- reps )
 HOOK: %min-vector-reps cpu ( -- reps )
 HOOK: %max-vector-reps cpu ( -- reps )
 HOOK: %sqrt-vector-reps cpu ( -- reps )
 HOOK: %horizontal-add-vector-reps cpu ( -- reps )
 HOOK: %unbox-alien cpu ( dst src -- )
 HOOK: %unbox-any-c-ptr cpu ( dst src temp -- )
 HOOK: %box-alien cpu ( dst src temp -- )
--- a/basis/cpu/x86/features/features.factor
+++ b/basis/cpu/x86/features/features.factor
@ -1,7 +1,7 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: system kernel math math.order math.parser namespaces
+USING: system kernel memoize math math.order math.parser
-alien.c-types alien.syntax combinators locals init io cpu.x86
+namespaces alien.c-types alien.syntax combinators locals init io
 compiler compiler.units accessors ;
 IN: cpu.x86.features
@ -13,7 +13,16 @@ FUNCTION: longlong read_timestamp_counter ( ) ;
 PRIVATE>
-ALIAS: sse-version sse_version
+: sse-version ( -- n )
    sse_version
    "sse-version" get string>number [ min ] when* ; foldable
 : sse? ( -- ? ) sse-version 10 >= ; foldable
 : sse2? ( -- ? ) sse-version 20 >= ; foldable
 : sse3? ( -- ? ) sse-version 30 >= ; foldable
 : ssse3? ( -- ? ) sse-version 33 >= ; foldable
 : sse4.1? ( -- ? ) sse-version 41 >= ; foldable
 : sse4.2? ( -- ? ) sse-version 42 >= ; foldable
 : sse-string ( version -- string )
    {
@ -32,37 +41,3 @@ M: x86 instruction-count read_timestamp_counter ;
 : count-instructions ( quot -- n )
    instruction-count [ call ] dip instruction-count swap - ; inline
 USING: cpu.x86.features cpu.x86.features.private ;
 :: install-sse-check ( version -- )
    [
        sse-version version < [
            "This image was built to use " write
            version sse-string write
            " but your CPU only supports " write
            sse-version sse-string write "." print
            "You will need to bootstrap Factor again." print
            flush
            1 exit
        ] when
    ] "cpu.x86" add-init-hook ;
 : enable-sse ( version -- )
    {
        { 00 [ ] }
        { 10 [ ] }
        { 20 [ enable-sse2 ] }
        { 30 [ enable-sse3 ] }
        { 33 [ enable-sse3 ] }
        { 41 [ enable-sse3 ] }
        { 42 [ enable-sse3 ] }
    } case ;
 [ { sse_version } compile ] with-optimizer
 "Checking for multimedia extensions: " write sse-version
 "sse-version" get [ string>number min ] when*
 [ sse-string write " detected" print ]
 [ install-sse-check ]
 [ enable-sse ] tri
--- a/basis/cpu/x86/x86.factor
+++ b/basis/cpu/x86/x86.factor
@ -4,7 +4,8 @@ USING: accessors assocs alien alien.c-types arrays strings
 cpu.x86.assembler cpu.x86.assembler.private cpu.x86.assembler.operands
 cpu.architecture kernel kernel.private math memory namespaces make
 sequences words system layouts combinators math.order fry locals
-compiler.constants byte-arrays
+compiler.constants byte-arrays io macros quotations cpu.x86.features
 cpu.x86.features.private compiler compiler.units init
 compiler.cfg.registers
 compiler.cfg.instructions
 compiler.cfg.intrinsics
@ -250,12 +251,26 @@ M:: x86 %unbox-vector ( dst src rep -- )
    dst src byte-array-offset [+]
    rep copy-register ;
 MACRO: available-reps ( alist -- )
    ! Each SSE version adds new representations and supports
    ! all old ones
    unzip { } [ append ] accumulate rest swap suffix
    [ [ 1quotation ] map ] bi@ zip
    reverse [ { } ] suffix
    '[ _ cond ] ;
 M: x86 %broadcast-vector ( dst src rep -- )
    {
        { float-4-rep [ [ MOVSS ] [ drop dup 0 SHUFPS ] 2bi ] }
        { double-2-rep [ [ MOVSD ] [ drop dup UNPCKLPD ] 2bi ] }
    } case ;
 M: x86 %broadcast-vector-reps
    {
        { sse? { float-4-rep } }
        { sse2? { double-2-rep } }
    } available-reps ;
 M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- )
    rep {
        {
@ -269,6 +284,11 @@ M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- )
        }
    } case ;
 M: x86 %gather-vector-4-reps
    {
        { sse? { float-4-rep } }
    } available-reps ;
 M:: x86 %gather-vector-2 ( dst src1 src2 rep -- )
    rep {
        {
@ -280,6 +300,11 @@ M:: x86 %gather-vector-2 ( dst src1 src2 rep -- )
        }
    } case ;
 M: x86 %gather-vector-2-reps
    {
        { sse2? { double-2-rep } }
    } available-reps ;
 M: x86 %add-vector ( dst src1 src2 rep -- )
    {
        { float-4-rep [ ADDPS ] }
@ -292,6 +317,12 @@ M: x86 %add-vector ( dst src1 src2 rep -- )
        { uint-4-rep [ PADDD ] }
    } case drop ;
 M: x86 %add-vector-reps
    {
        { sse? { float-4-rep } }
        { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
    } available-reps ;
 M: x86 %sub-vector ( dst src1 src2 rep -- )
    {
        { float-4-rep [ SUBPS ] }
@ -304,43 +335,92 @@ M: x86 %sub-vector ( dst src1 src2 rep -- )
        { uint-4-rep [ PSUBD ] }
    } case drop ;
 M: x86 %sub-vector-reps
    {
        { sse? { float-4-rep } }
        { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
    } available-reps ;
 M: x86 %mul-vector ( dst src1 src2 rep -- )
    {
        { float-4-rep [ MULPS ] }
        { double-2-rep [ MULPD ] }
-        { int-4-rep [ PMULLW ] }
+        { short-8-rep [ PMULLW ] }
        { ushort-8-rep [ PMULLW ] }
        { int-4-rep [ PMULLD ] }
        { uint-4-rep [ PMULLD ] }
    } case drop ;
 M: x86 %mul-vector-reps
    {
        { sse? { float-4-rep } }
        { sse2? { double-2-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
    } available-reps ;
 M: x86 %div-vector ( dst src1 src2 rep -- )
    {
        { float-4-rep [ DIVPS ] }
        { double-2-rep [ DIVPD ] }
    } case drop ;
 M: x86 %div-vector-reps
    {
        { sse? { float-4-rep } }
        { sse2? { double-2-rep } }
    } available-reps ;
 M: x86 %min-vector ( dst src1 src2 rep -- )
    {
        { float-4-rep [ MINPS ] }
        { double-2-rep [ MINPD ] }
        { uchar-16-rep [ PMINUB ] }
        { short-8-rep [ PMINSW ] }
    } case drop ;
 M: x86 %min-vector-reps
    {
        { sse? { float-4-rep } }
        { sse2? { double-2-rep short-8-rep uchar-16-rep } }
    } available-reps ;
 M: x86 %max-vector ( dst src1 src2 rep -- )
    {
        { float-4-rep [ MAXPS ] }
        { double-2-rep [ MAXPD ] }
        { uchar-16-rep [ PMAXUB ] }
        { short-8-rep [ PMAXSW ] }
    } case drop ;
 M: x86 %max-vector-reps
    {
        { sse? { float-4-rep } }
        { sse2? { double-2-rep short-8-rep uchar-16-rep } }
    } available-reps ;
 M: x86 %sqrt-vector ( dst src rep -- )
    {
        { float-4-rep [ SQRTPS ] }
        { double-2-rep [ SQRTPD ] }
    } case ;
 M: x86 %sqrt-vector-reps
    {
        { sse? { float-4-rep } }
        { sse2? { double-2-rep } }
    } available-reps ;
 M: x86 %horizontal-add-vector ( dst src rep -- )
    {
        { float-4-rep [ [ MOVAPS ] [ HADDPS ] [ HADDPS ] 2tri ] }
        { double-2-rep [ [ MOVAPD ] [ HADDPD ] 2bi ] }
    } case ;
 M: x86 %horizontal-add-vector-reps
    {
        { sse? { float-4-rep } }
        { sse2? { double-2-rep short-8-rep uchar-16-rep } }
    } available-reps ;
 M: x86 %unbox-alien ( dst src -- )
    alien-offset [+] MOV ;
@ -775,3 +855,34 @@ M: x86 small-enough? ( n -- ? )
    enable-sse3-simd ;
 enable-min/max
 :: install-sse-check ( version -- )
    [
        sse-version version < [
            "This image was built to use " write
            version sse-string write
            " but your CPU only supports " write
            sse-version sse-string write "." print
            "You will need to bootstrap Factor again." print
            flush
            1 exit
        ] when
    ] "cpu.x86" add-init-hook ;
 : enable-sse ( version -- )
    {
        { 00 [ ] }
        { 10 [ ] }
        { 20 [ enable-sse2 ] }
        { 30 [ enable-sse3 ] }
        { 33 [ enable-sse3 ] }
        { 41 [ enable-sse3 ] }
        { 42 [ enable-sse3 ] }
    } case ;
 [ { sse_version } compile ] with-optimizer
 "Checking for multimedia extensions: " write sse-version 30 min
 [ sse-string write " detected" print ]
 [ install-sse-check ]
 [ enable-sse ] tri
--- a/basis/math/floats/env/x86/x86.factor
+++ b/basis/math/floats/env/x86/x86.factor
@ -31,9 +31,7 @@ M: x87-env (set-fp-env-register)
    set_x87_env ;
 M: x86 (fp-env-registers)
-    sse-version 20 >=
+    sse2? [ <sse-env> <x87-env> 2array ] [ <x87-env> 1array ] if ;
    [ <sse-env> <x87-env> 2array ]
    [ <x87-env> 1array ] if ;
 CONSTANT: sse-exception-flag-bits HEX: 3f
 CONSTANT: sse-exception-flag>bit
--- a/basis/math/vectors/simd/functor/functor.factor
+++ b/basis/math/vectors/simd/functor/functor.factor
@ -1,24 +1,94 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors alien.c-types byte-arrays classes functors
+USING: accessors alien.c-types assocs byte-arrays classes
-kernel math parser prettyprint.custom sequences
+effects fry functors generalizations kernel literals locals
-sequences.private literals ;
+math math.functions math.vectors math.vectors.simd.intrinsics
 math.vectors.specialization parser prettyprint.custom sequences
 sequences.private strings words definitions macros cpu.architecture ;
 IN: math.vectors.simd.functor
 ERROR: bad-length got expected ;
 MACRO: simd-boa ( rep class -- simd-array )
    [ rep-components ] [ new ] bi* '[ _ _ nsequence ] ;
 :: define-boa-custom-inlining ( word rep class -- )
    word [
        drop
        rep rep rep-gather-word supported-simd-op? [
            [ rep (simd-boa) class boa ]
        ] [ word def>> ] if
    ] "custom-inlining" set-word-prop ;
 : simd-with ( rep class x -- simd-array )
    [ rep-components ] [ new ] [ '[ _ ] ] tri* swap replicate-as ; inline
 :: define-with-custom-inlining ( word rep class -- )
    word [
        drop
        rep \ (simd-broadcast) supported-simd-op? [
            [ rep rep-coerce rep (simd-broadcast) class boa ]
        ] [ word def>> ] if
    ] "custom-inlining" set-word-prop ;
 : boa-effect ( rep n -- effect )
    [ rep-components ] dip *
    [ CHAR: a + 1string ] map
    { "simd-vector" } <effect> ;
 : supported-simd-ops ( assoc rep -- assoc' )
    [
        {
            { v+ (simd-v+) }
            { v- (simd-v-) }
            { v* (simd-v*) }
            { v/ (simd-v/) }
            { vmin (simd-vmin) }
            { vmax (simd-vmax) }
            { sum (simd-sum) }
        }
    ] dip 
    '[ nip _ swap supported-simd-op? ] assoc-filter
    '[ drop _ key? ] assoc-filter ;
 :: high-level-ops ( ctor -- assoc )
    ! Some SIMD operations are defined in terms of others.
    {
        { vneg [ [ dup v- ] keep v- ] }
        { v. [ v* sum ] }
        { n+v [ [ ctor execute ] dip v+ ] }
        { v+n [ ctor execute v+ ] }
        { n-v [ [ ctor execute ] dip v- ] }
        { v-n [ ctor execute v- ] }
        { n*v [ [ ctor execute ] dip v* ] }
        { v*n [ ctor execute v* ] }
        { n/v [ [ ctor execute ] dip v/ ] }
        { v/n [ ctor execute v/ ] }
        { norm-sq [ dup v. assert-positive ] }
        { norm [ norm-sq sqrt ] }
        { normalize [ dup norm v/n ] }
        { distance [ v- norm ] }
    } ;
 :: simd-vector-words ( class ctor rep assoc -- )
    class
    rep rep-component-type c-type-boxed-class
    assoc rep supported-simd-ops
    ctor high-level-ops assoc-union
    specialize-vector-words ;
 FUNCTOR: define-simd-128 ( T -- )
-T-TYPE       IS ${T}
+N            [ 16 T heap-size /i ]
 N            [ 16 T-TYPE heap-size /i ]
 A            DEFINES-CLASS ${T}-${N}
 A-boa        DEFINES ${A}-boa
 A-with       DEFINES ${A}-with
 >A           DEFINES >${A}
 A{           DEFINES ${A}{
-NTH          [ T-TYPE dup c-type-getter-boxer array-accessor ]
+NTH          [ T dup c-type-getter-boxer array-accessor ]
-SET-NTH      [ T-TYPE dup c-setter array-accessor ]
+SET-NTH      [ T dup c-setter array-accessor ]
 A-rep        IS ${A}-rep
 A-vv->v-op   DEFINES-PRIVATE ${A}-vv->v-op
@ -59,6 +129,16 @@ M: A pprint* pprint-object ;
 SYNTAX: A{ \ } [ >A ] parse-literal ;
 : A-with ( x -- simd-array ) [ A-rep A ] dip simd-with ;
 \ A-with \ A-rep \ A define-with-custom-inlining
 \ A-boa [ \ A-rep \ A simd-boa ] \ A-rep 1 boa-effect define-declared
 \ A-rep rep-gather-word [
    \ A-boa \ A-rep \ A define-boa-custom-inlining
 ] when
 INSTANCE: A sequence
 <PRIVATE
@ -69,6 +149,16 @@ INSTANCE: A sequence
 : A-v->n-op ( v quot -- n )
    [ underlying>> A-rep ] dip call ; inline
 \ A \ A-with \ A-rep H{
    { v+ [ [ (simd-v+) ] \ A-vv->v-op execute ] }
    { v- [ [ (simd-v-) ] \ A-vv->v-op execute ] }
    { v* [ [ (simd-v*) ] \ A-vv->v-op execute ] }
    { v/ [ [ (simd-v/) ] \ A-vv->v-op execute ] }
    { vmin [ [ (simd-vmin) ] \ A-vv->v-op execute ] }
    { vmax [ [ (simd-vmax) ] \ A-vv->v-op execute ] }
    { sum [ [ (simd-sum) ] \ A-v->n-op execute ] }
 } simd-vector-words
 PRIVATE>
 ;FUNCTOR
@ -76,14 +166,16 @@ PRIVATE>
 ! Synthesize 256-bit vectors from a pair of 128-bit vectors
 FUNCTOR: define-simd-256 ( T -- )
-T-TYPE       IS ${T}
+N            [ 32 T heap-size /i ]
 N            [ 32 T-TYPE heap-size /i ]
 N/2          [ N 2 / ]
 A/2          IS ${T}-${N/2}
 A/2-boa      IS ${A/2}-boa
 A/2-with     IS ${A/2}-with
 A            DEFINES-CLASS ${T}-${N}
 A-boa        DEFINES ${A}-boa
 A-with       DEFINES ${A}-with
 >A           DEFINES >${A}
 A{           DEFINES ${A}{
@ -137,6 +229,16 @@ M: A >pprint-sequence ;
 M: A pprint* pprint-object ;
 : A-with ( x -- simd-array )
    [ A/2-with ] [ A/2-with ] bi [ underlying>> ] bi@
    \ A boa ; inline
 : A-boa ( ... -- simd-array )
    [ A/2-boa ] N/2 ndip A/2-boa [ underlying>> ] bi@
    \ A boa ;
 \ A-rep 2 boa-effect \ A-boa set-stack-effect
 INSTANCE: A sequence
 : A-vv->v-op ( v1 v2 quot -- v3 )
@ -148,4 +250,14 @@ INSTANCE: A sequence
    [ [ [ underlying1>> ] [ underlying2>> ] bi A-rep ] dip call A-rep ]
    dip call ; inline
 \ A \ A-with \ A-rep H{
    { v+ [ [ (simd-v+) ] \ A-vv->v-op execute ] }
    { v- [ [ (simd-v-) ] \ A-vv->v-op execute ] }
    { v* [ [ (simd-v*) ] \ A-vv->v-op execute ] }
    { v/ [ [ (simd-v/) ] \ A-vv->v-op execute ] }
    { vmin [ [ (simd-vmin) ] \ A-vv->v-op execute ] }
    { vmax [ [ (simd-vmax) ] \ A-vv->v-op execute ] }
    { sum [ [ (simd-v+) ] [ (simd-sum) ] \ A-v->n-op execute ] }
 } simd-vector-words
 ;FUNCTOR
--- a/basis/math/vectors/simd/intrinsics/intrinsics-tests.factor
+++ b/basis/math/vectors/simd/intrinsics/intrinsics-tests.factor
@ -0,0 +1,18 @@
 IN: math.vectors.simd.intrinsics.tests
 USING: math.vectors.simd.intrinsics cpu.architecture tools.test ;
 [ 16 ] [ uchar-16-rep rep-components ] unit-test
 [ 16 ] [ char-16-rep rep-components ] unit-test
 [ 8 ] [ ushort-8-rep rep-components ] unit-test
 [ 8 ] [ short-8-rep rep-components ] unit-test
 [ 4 ] [ uint-4-rep rep-components ] unit-test
 [ 4 ] [ int-4-rep rep-components ] unit-test
 [ 4 ] [ float-4-rep rep-components ] unit-test
 [ 2 ] [ double-2-rep rep-components ] unit-test
 { 4 1 } [ uint-4-rep (simd-boa) ] must-infer-as
 { 4 1 } [ int-4-rep (simd-boa) ] must-infer-as
 { 4 1 } [ float-4-rep (simd-boa) ] must-infer-as
 { 2 1 } [ double-2-rep (simd-boa) ] must-infer-as
--- a/basis/math/vectors/simd/intrinsics/intrinsics.factor
+++ b/basis/math/vectors/simd/intrinsics/intrinsics.factor
@ -1,6 +1,8 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: kernel alien alien.data cpu.architecture libc ;
+USING: alien alien.c-types alien.data assocs combinators
 cpu.architecture fry generalizations kernel libc macros math
 sequences ;
 IN: math.vectors.simd.intrinsics
 ERROR: bad-simd-call ;
@ -26,3 +28,53 @@ ERROR: bad-simd-call ;
    ! Inefficient version for when intrinsics are missing
    [ swap <displaced-alien> swap ] dip rep-size memcpy ;
 <<
 : rep-components ( rep -- n )
    16 swap rep-component-type heap-size /i ; foldable
 : rep-coercer ( rep -- quot )
    {
        { [ dup int-vector-rep? ] [ [ >fixnum ] ] }
        { [ dup float-vector-rep? ] [ [ >float ] ] }
    } cond nip ; foldable
 : rep-coerce ( value rep -- value' )
    rep-coercer call( value -- value' ) ; inline
 CONSTANT: rep-gather-words
    {
        { 2 (simd-gather-2) }
        { 4 (simd-gather-4) }
    }
 : rep-gather-word ( rep -- word )
    rep-components rep-gather-words at ;
 >>
 MACRO: (simd-boa) ( rep -- quot )
    {
        [ rep-coercer ]
        [ rep-components ]
        [ ]
        [ rep-gather-word ]
    } cleave
    '[ _ _ napply _ _ execute ] ;
 GENERIC# supported-simd-op? 1 ( rep intrinsic -- ? )
 M: vector-rep supported-simd-op?
    {
        { \ (simd-v+)        [ %add-vector-reps            ] }
        { \ (simd-v-)        [ %sub-vector-reps            ] }
        { \ (simd-v*)        [ %mul-vector-reps            ] }
        { \ (simd-v/)        [ %div-vector-reps            ] }
        { \ (simd-vmin)      [ %min-vector-reps            ] }
        { \ (simd-vmax)      [ %max-vector-reps            ] }
        { \ (simd-vsqrt)     [ %sqrt-vector-reps           ] }
        { \ (simd-sum)       [ %horizontal-add-vector-reps ] }
        { \ (simd-broadcast) [ %broadcast-vector-reps      ] }
        { \ (simd-gather-2)  [ %gather-vector-2-reps       ] }
        { \ (simd-gather-4)  [ %gather-vector-4-reps       ] }
    } case member? ;
--- a/basis/math/vectors/simd/simd-docs.factor
+++ b/basis/math/vectors/simd/simd-docs.factor
@ -43,22 +43,6 @@ $nl
 }
 "The " { $link float-4 } " and " { $link double-2 } " types correspond to 128-bit vector registers. The " { $link float-8 } " and " { $link double-4 } " types are not directly supported in hardware, and instead unbox to a pair of 128-bit vector registers."
 $nl
 "Operations on " { $link float-4 } " instances:"
 { $subsection float-4-with }
 { $subsection float-4-boa }
 { $subsection POSTPONE: float-4{ }
 "Operations on " { $link double-2 } " instances:"
 { $subsection double-2-with }
 { $subsection double-2-boa }
 { $subsection POSTPONE: double-2{ }
 "Operations on " { $link float-8 } " instances:"
 { $subsection float-8-with }
 { $subsection float-8-boa }
 { $subsection POSTPONE: float-8{ }
 "Operations on " { $link double-4 } " instances:"
 { $subsection double-4-with }
 { $subsection double-4-boa }
 { $subsection POSTPONE: double-4{ }
 "To actually perform vector arithmetic on SIMD vectors, use " { $link "math-vectors" } " words."
 { $see-also "c-types-specs" } ;
@ -184,72 +168,4 @@ ARTICLE: "math.vectors.simd" "Hardware vector arithmetic (SIMD)"
 { $subsection "math.vectors.simd.alien" }
 { $subsection "math.vectors.simd.intrinsics" } ;
 ! ! ! float-4
 HELP: float-4
 { $class-description "A sequence of four single-precision floating point values. New instances can be created with " { $link float-4-with } " or " { $link float-4-boa } "." } ;
 HELP: float-4-with
 { $values { "x" float } { "simd-array" float-4 } }
 { $description "Creates a new vector with all four components equal to a scalar." } ;
 HELP: float-4-boa
 { $values { "a" float } { "b" float } { "c" float } { "d" float } { "simd-array" float-4 } }
 { $description "Creates a new vector from four scalar components." } ;
 HELP: float-4{
 { $syntax "float-4{ a b c d }" }
 { $description "Literal syntax for a " { $link float-4 } "." } ;
 ! ! ! double-2
 HELP: double-2
 { $class-description "A sequence of two double-precision floating point values. New instances can be created with " { $link double-2-with } " or " { $link double-2-boa } "." } ;
 HELP: double-2-with
 { $values { "x" float } { "simd-array" double-2 } }
 { $description "Creates a new vector with both components equal to a scalar." } ;
 HELP: double-2-boa
 { $values { "a" float } { "b" float } { "simd-array" double-2 } }
 { $description "Creates a new vector from two scalar components." } ;
 HELP: double-2{
 { $syntax "double-2{ a b }" }
 { $description "Literal syntax for a " { $link double-2 } "." } ;
 ! ! ! float-8
 HELP: float-8
 { $class-description "A sequence of eight single-precision floating point values. New instances can be created with " { $link float-8-with } " or " { $link float-8-boa } "." } ;
 HELP: float-8-with
 { $values { "x" float } { "simd-array" float-8 } }
 { $description "Creates a new vector with all eight components equal to a scalar." } ;
 HELP: float-8-boa
 { $values { "a" float } { "b" float } { "c" float } { "d" float } { "e" float } { "f" float } { "g" float } { "h" float } { "simd-array" float-8 } }
 { $description "Creates a new vector from eight scalar components." } ;
 HELP: float-8{
 { $syntax "float-8{ a b c d e f g h }" }
 { $description "Literal syntax for a " { $link float-8 } "." } ;
 ! ! ! double-4
 HELP: double-4
 { $class-description "A sequence of four double-precision floating point values. New instances can be created with " { $link double-4-with } " or " { $link double-4-boa } "." } ;
 HELP: double-4-with
 { $values { "x" float } { "simd-array" double-4 } }
 { $description "Creates a new vector with all four components equal to a scalar." } ;
 HELP: double-4-boa
 { $values { "a" float } { "b" float } { "c" float } { "d" float } { "simd-array" double-4 } }
 { $description "Creates a new vector from four scalar components." } ;
 HELP: double-4{
 { $syntax "double-4{ a b c d }" }
 { $description "Literal syntax for a " { $link double-4 } "." } ;
 ABOUT: "math.vectors.simd"
--- a/basis/math/vectors/simd/simd.factor
+++ b/basis/math/vectors/simd/simd.factor
@ -1,185 +1,15 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors alien.c-types byte-arrays cpu.architecture
+USING: alien.c-types cpu.architecture kernel
-kernel math math.functions math.vectors
+math.vectors.simd.functor vocabs.loader ;
-math.vectors.simd.functor math.vectors.simd.intrinsics
+FROM: sequences => each ;
 math.vectors.specialization parser prettyprint.custom sequences
 sequences.private locals assocs words fry ;
 FROM: alien.c-types => float ;
 QUALIFIED-WITH: math m
 IN: math.vectors.simd
 <<
-DEFER: float-4
+{ double float char uchar short ushort int uint }
-DEFER: double-2
+[ [ define-simd-128 ] [ define-simd-256 ] bi ] each
 DEFER: float-8
 DEFER: double-4
 "double" define-simd-128
 "float"  define-simd-128
 "double" define-simd-256
 "float"  define-simd-256
 >>
 : float-4-with ( x -- simd-array )
    [ 4 ] dip >float '[ _ ] \ float-4 new replicate-as ;
 : float-4-boa ( a b c d -- simd-array )
    \ float-4 new 4sequence ;
 : double-2-with ( x -- simd-array )
    [ 2 ] dip >float '[ _ ] \ double-2 new replicate-as ;
 : double-2-boa ( a b -- simd-array )
    \ double-2 new 2sequence ;
 ! More efficient expansions for the above, used when SIMD is
 ! actually available.
 <<
 \ float-4-with [
    drop
    \ (simd-broadcast) "intrinsic" word-prop [
        [ >float float-4-rep (simd-broadcast) \ float-4 boa ]
    ] [ \ float-4-with def>> ] if
 ] "custom-inlining" set-word-prop
 \ float-4-boa [
    drop
    \ (simd-gather-4) "intrinsic" word-prop [
        [| a b c d |
            a >float b >float c >float d >float
            float-4-rep (simd-gather-4) \ float-4 boa
        ]
    ] [ \ float-4-boa def>> ] if
 ] "custom-inlining" set-word-prop
 \ double-2-with [
    drop
    \ (simd-broadcast) "intrinsic" word-prop [
        [ >float double-2-rep (simd-broadcast) \ double-2 boa ]
    ] [ \ double-2-with def>> ] if
 ] "custom-inlining" set-word-prop
 \ double-2-boa [
    drop
    \ (simd-gather-4) "intrinsic" word-prop [
        [ [ >float ] bi@ double-2-rep (simd-gather-2) \ double-2 boa ]
    ] [ \ double-2-boa def>> ] if
 ] "custom-inlining" set-word-prop
 >>
 : float-8-with ( x -- simd-array )
    [ float-4-with ] [ float-4-with ] bi [ underlying>> ] bi@
    \ float-8 boa ; inline
 :: float-8-boa ( a b c d e f g h -- simd-array )
    a b c d float-4-boa
    e f g h float-4-boa
    [ underlying>> ] bi@
    \ float-8 boa ; inline
 : double-4-with ( x -- simd-array )
    [ double-2-with ] [ double-2-with ] bi [ underlying>> ] bi@
    \ double-4 boa ; inline
 :: double-4-boa ( a b c d -- simd-array )
    a b double-2-boa
    c d double-2-boa
    [ underlying>> ] bi@
    \ double-4 boa ; inline
 <<
 <PRIVATE
 ! Filter out operations that are not available, eg horizontal adds
 ! on SSE2. Fallback code in math.vectors is used in that case.
 : supported-simd-ops ( assoc -- assoc' )
    {
        { v+ (simd-v+) }
        { v- (simd-v-) }
        { v* (simd-v*) }
        { v/ (simd-v/) }
        { vmin (simd-vmin) }
        { vmax (simd-vmax) }
        { sum (simd-sum) }
    } [ nip "intrinsic" word-prop ] assoc-filter
    '[ drop _ key? ] assoc-filter ;
 ! Some SIMD operations are defined in terms of others.
 :: high-level-ops ( ctor -- assoc )
    {
        { vneg [ [ dup v- ] keep v- ] }
        { v. [ v* sum ] }
        { n+v [ [ ctor execute ] dip v+ ] }
        { v+n [ ctor execute v+ ] }
        { n-v [ [ ctor execute ] dip v- ] }
        { v-n [ ctor execute v- ] }
        { n*v [ [ ctor execute ] dip v* ] }
        { v*n [ ctor execute v* ] }
        { n/v [ [ ctor execute ] dip v/ ] }
        { v/n [ ctor execute v/ ] }
        { norm-sq [ dup v. assert-positive ] }
        { norm [ norm-sq sqrt ] }
        { normalize [ dup norm v/n ] }
        { distance [ v- norm ] }
    } ;
 :: simd-vector-words ( class ctor elt-type assoc -- )
    class elt-type assoc supported-simd-ops ctor high-level-ops assoc-union
    specialize-vector-words ;
 PRIVATE>
 \ float-4 \ float-4-with m:float H{
    { v+ [ [ (simd-v+) ] float-4-vv->v-op ] }
    { v- [ [ (simd-v-) ] float-4-vv->v-op ] }
    { v* [ [ (simd-v*) ] float-4-vv->v-op ] }
    { v/ [ [ (simd-v/) ] float-4-vv->v-op ] }
    { vmin [ [ (simd-vmin) ] float-4-vv->v-op ] }
    { vmax [ [ (simd-vmax) ] float-4-vv->v-op ] }
    { sum [ [ (simd-sum) ] float-4-v->n-op ] }
 } simd-vector-words
 \ double-2 \ double-2-with m:float H{
    { v+ [ [ (simd-v+) ] double-2-vv->v-op ] }
    { v- [ [ (simd-v-) ] double-2-vv->v-op ] }
    { v* [ [ (simd-v*) ] double-2-vv->v-op ] }
    { v/ [ [ (simd-v/) ] double-2-vv->v-op ] }
    { vmin [ [ (simd-vmin) ] double-2-vv->v-op ] }
    { vmax [ [ (simd-vmax) ] double-2-vv->v-op ] }
    { sum [ [ (simd-sum) ] double-2-v->n-op ] }
 } simd-vector-words
 \ float-8 \ float-8-with m:float H{
    { v+ [ [ (simd-v+) ] float-8-vv->v-op ] }
    { v- [ [ (simd-v-) ] float-8-vv->v-op ] }
    { v* [ [ (simd-v*) ] float-8-vv->v-op ] }
    { v/ [ [ (simd-v/) ] float-8-vv->v-op ] }
    { vmin [ [ (simd-vmin) ] float-8-vv->v-op ] }
    { vmax [ [ (simd-vmax) ] float-8-vv->v-op ] }
    { sum [ [ (simd-sum) ] [ + ] float-8-v->n-op ] }
 } simd-vector-words
 \ double-4 \ double-4-with m:float H{
    { v+ [ [ (simd-v+) ] double-4-vv->v-op ] }
    { v- [ [ (simd-v-) ] double-4-vv->v-op ] }
    { v* [ [ (simd-v*) ] double-4-vv->v-op ] }
    { v/ [ [ (simd-v/) ] double-4-vv->v-op ] }
    { vmin [ [ (simd-vmin) ] double-4-vv->v-op ] }
    { vmax [ [ (simd-vmax) ] double-4-vv->v-op ] }
    { sum [ [ (simd-v+) ] [ (simd-sum) ] double-4-v->n-op ] }
 } simd-vector-words
 >>
 USE: vocabs.loader
 "math.vectors.simd.alien" require
--- a/basis/math/vectors/simd/summary.txt
+++ b/basis/math/vectors/simd/summary.txt
@ -0,0 +1 @@
 Single-instruction-multiple-data parallel vector operations
		`@ -0,0 +1 @@`
							`Single-instruction-multiple-data parallel vector operations`