Initial implementation of SSE vector intrinsics:

- cpu.architecture: add SSE vector representations - compiler.cfg.intrinsics.alien: remove an attempt at optimization that value numbering handles now - compiler.cfg.representations: support instructions where the representation is set in the 'rep' slot, and support conversions between single and double floats - alien-float, set-alien-float now use the single float representation, and the conversion is implicit; this fixes a long-standing bug where a register could get clobbered because of how %set-alien-float was defined on x86 - math.vectors.specialization: add support for SIMD specialization (where the vector word's body is replaced by another quotation), also specialize the 'sum' word - math.vectors.simd: 4float-array, 2double-array, 4double-array types, and specializers for the math.vectors words
2009-09-03 02:33:07 -05:00 · 2009-09-03 02:33:07 -05:00 · ff8c70dbe0
parent c719b92363
commit ff8c70dbe0
23 changed files with 1235 additions and 101 deletions
--- a/basis/compiler/cfg/instructions/instructions.factor
+++ b/basis/compiler/cfg/instructions/instructions.factor
@ -91,6 +91,11 @@ INSN: ##set-string-nth-fast
 use: src/int-rep obj/int-rep index/int-rep
 temp: temp/int-rep ;

+PURE-INSN: ##copy
+def: dst
+use: src
+literal: rep ;
+
 ! Integer arithmetic
 PURE-INSN: ##add
 def: dst/int-rep
@ -201,6 +206,15 @@ use: src/int-rep
 temp: temp/int-rep ;

 ! Float arithmetic
+PURE-INSN: ##unbox-float
+def: dst/double-float-rep
+use: src/int-rep ;
+
+PURE-INSN: ##box-float
+def: dst/int-rep
+use: src/double-float-rep
+temp: temp/int-rep ;
+
 PURE-INSN: ##add-float
 def: dst/double-float-rep
 use: src1/double-float-rep src2/double-float-rep ;
@ -240,6 +254,15 @@ def: dst/double-float-rep
 use: src1/double-float-rep src2/double-float-rep
 literal: func ;

+! Single/double float conversion
+PURE-INSN: ##single>double-float
+def: dst/double-float-rep
+use: src/single-float-rep ;
+
+PURE-INSN: ##double>single-float
+def: dst/single-float-rep
+use: src/double-float-rep ;
+
 ! Float/integer conversion
 PURE-INSN: ##float>integer
 def: dst/int-rep
@ -249,26 +272,80 @@ PURE-INSN: ##integer>float
 def: dst/double-float-rep
 use: src/int-rep ;

-! Boxing and unboxing
-PURE-INSN: ##copy
+! SIMD operations
+
+INSN: ##box-vector
+def: dst/int-rep
+use: src
+literal: rep
+temp: temp/int-rep ;
+
+INSN: ##unbox-vector
+def: dst
+use: src/int-rep
+literal: rep ;
+
+INSN: ##broadcast-vector
+def: dst
+use: src/scalar-rep
+literal: rep ;
+
+INSN: ##gather-vector-2
+def: dst
+use: src1/scalar-rep src2/scalar-rep
+literal: rep ;
+
+INSN: ##gather-vector-4
+def: dst
+use: src1/scalar-rep src2/scalar-rep src3/scalar-rep src4/scalar-rep
+literal: rep ;
+
+INSN: ##add-vector
+def: dst
+use: src1 src2
+literal: rep ;
+
+INSN: ##sub-vector
+def: dst
+use: src1 src2
+literal: rep ;
+
+INSN: ##mul-vector
+def: dst
+use: src1 src2
+literal: rep ;
+
+INSN: ##div-vector
+def: dst
+use: src1 src2
+literal: rep ;
+
+INSN: ##min-vector
+def: dst
+use: src1 src2
+literal: rep ;
+
+INSN: ##max-vector
+def: dst
+use: src1 src2
+literal: rep ;
+
+INSN: ##sqrt-vector
 def: dst
 use: src
 literal: rep ;

-PURE-INSN: ##unbox-float
-def: dst/double-float-rep
-use: src/int-rep ;
+INSN: ##horizontal-add-vector
+def: dst/scalar-rep
+use: src
+literal: rep ;

+! Boxing and unboxing aliens
 PURE-INSN: ##unbox-any-c-ptr
 def: dst/int-rep
 use: src/int-rep
 temp: temp/int-rep ;

-PURE-INSN: ##box-float
-def: dst/int-rep
-use: src/double-float-rep
-temp: temp/int-rep ;
-
 PURE-INSN: ##box-alien
 def: dst/int-rep
 use: src/int-rep
@ -322,13 +399,18 @@ def: dst/int-rep
 use: src/int-rep ;

 INSN: ##alien-float
-def: dst/double-float-rep
+def: dst/single-float-rep
 use: src/int-rep ;

 INSN: ##alien-double
 def: dst/double-float-rep
 use: src/int-rep ;

+INSN: ##alien-vector
+def: dst
+use: src/int-rep
+literal: rep ;
+
 INSN: ##set-alien-integer-1
 use: src/int-rep value/int-rep ;

@ -342,11 +424,15 @@ INSN: ##set-alien-cell
 use: src/int-rep value/int-rep ;

 INSN: ##set-alien-float
-use: src/int-rep value/double-float-rep ;
+use: src/int-rep value/single-float-rep ;

 INSN: ##set-alien-double
 use: src/int-rep value/double-float-rep ;

+INSN: ##set-alien-vector
+use: src/int-rep value
+literal: rep ;
+
 ! Memory allocation
 INSN: ##allot
 def: dst/int-rep
@ -510,6 +596,7 @@ literal: n ;
 UNION: ##allocation
 ##allot
 ##box-float
+##box-vector
 ##box-alien
 ##box-displaced-alien
 ##integer>bignum ;
--- a/basis/compiler/cfg/instructions/syntax/syntax.factor
+++ b/basis/compiler/cfg/instructions/syntax/syntax.factor
@ -7,11 +7,20 @@ IN: compiler.cfg.instructions.syntax

 SYMBOLS: def use temp literal constant ;

+SYMBOL: scalar-rep
+
 TUPLE: insn-slot-spec type name rep ;

+: parse-rep ( str/f -- rep )
+    {
+        { [ dup not ] [ ] }
+        { [ dup "scalar-rep" = ] [ drop scalar-rep ] }
+        [ "cpu.architecture" lookup ]
+    } cond ;
+
 : parse-insn-slot-spec ( type string -- spec )
    over [ "Missing type" throw ] unless
-    "/" split1 dup [ "cpu.architecture" lookup ] when
+    "/" split1 parse-rep
    insn-slot-spec boa ;

 : parse-insn-slot-specs ( seq -- specs )
--- a/basis/compiler/cfg/intrinsics/alien/alien.factor
+++ b/basis/compiler/cfg/intrinsics/alien/alien.factor
@ -20,22 +20,14 @@ IN: compiler.cfg.intrinsics.alien
        ^^box-displaced-alien ds-push
    ] [ emit-primitive ] if ;

-: (prepare-alien-accessor-imm) ( class offset -- offset-vreg )
-    ds-drop [ ds-pop swap ^^unbox-c-ptr ] dip ^^add-imm ;
-
-: (prepare-alien-accessor) ( class -- offset-vreg )
-    [ 2inputs ^^untag-fixnum swap ] dip ^^unbox-c-ptr ^^add ;
-
 : prepare-alien-accessor ( infos -- offset-vreg )
-    <reversed> [ second class>> ] [ first ] bi
-    dup value-info-small-fixnum? [
-        literal>> (prepare-alien-accessor-imm)
-    ] [ drop (prepare-alien-accessor) ] if ;
+    <reversed> second class>>
+    [ 2inputs ^^untag-fixnum swap ] dip ^^unbox-c-ptr ^^add ;

 :: inline-alien ( node quot test -- )
    [let | infos [ node node-input-infos ] |
        infos test call
-        [ infos prepare-alien-accessor quot call ]
+        [ infos quot call ]
        [ node emit-primitive ]
        if
    ] ; inline
@ -46,7 +38,7 @@ IN: compiler.cfg.intrinsics.alien
    bi and ;

 : inline-alien-getter ( node quot -- )
-    '[ @ ds-push ]
+    '[ prepare-alien-accessor @ ds-push ]
    [ inline-alien-getter? ] inline-alien ; inline

 : inline-alien-setter? ( infos class -- ? )
@ -56,18 +48,17 @@ IN: compiler.cfg.intrinsics.alien
    tri and and ;

 : inline-alien-integer-setter ( node quot -- )
-    '[ ds-pop ^^untag-fixnum @ ]
+    '[ prepare-alien-accessor ds-pop ^^untag-fixnum @ ]
    [ fixnum inline-alien-setter? ]
    inline-alien ; inline

 : inline-alien-cell-setter ( node quot -- )
-    [ dup node-input-infos first class>> ] dip
-    '[ ds-pop _ ^^unbox-c-ptr @ ]
+    '[ [ prepare-alien-accessor ds-pop ] [ first class>> ] bi ^^unbox-c-ptr @ ]
    [ pinned-c-ptr inline-alien-setter? ]
    inline-alien ; inline

 : inline-alien-float-setter ( node quot -- )
-    '[ ds-pop @ ]
+    '[ prepare-alien-accessor ds-pop @ ]
    [ float inline-alien-setter? ]
    inline-alien ; inline

--- a/basis/compiler/cfg/intrinsics/intrinsics.factor
+++ b/basis/compiler/cfg/intrinsics/intrinsics.factor
@ -7,6 +7,7 @@ compiler.cfg.intrinsics.alien
 compiler.cfg.intrinsics.allot
 compiler.cfg.intrinsics.fixnum
 compiler.cfg.intrinsics.float
+compiler.cfg.intrinsics.simd
 compiler.cfg.intrinsics.slots
 compiler.cfg.intrinsics.misc
 compiler.cfg.comparisons ;
@ -22,6 +23,9 @@ QUALIFIED: classes.tuple.private
 QUALIFIED: math.private
 QUALIFIED: math.integers.private
 QUALIFIED: math.floats.private
+QUALIFIED: math.vectors.simd
+QUALIFIED: math.vectors.simd.private
+QUALIFIED: math.vectors.simd.alien
 QUALIFIED: math.libm
 IN: compiler.cfg.intrinsics

@ -142,5 +146,27 @@ IN: compiler.cfg.intrinsics
        { math.integers.private:fixnum-log2 [ drop emit-fixnum-log2 ] }
    } enable-intrinsics ;

+: enable-sse2-simd ( -- )
+    {
+        { math.vectors.simd.private:assert-positive [ drop ] }
+        { math.vectors.simd.private:(simd-v+) [ [ ^^add-vector ] emit-binary-vector-op ] }
+        { math.vectors.simd.private:(simd-v-) [ [ ^^sub-vector ] emit-binary-vector-op ] }
+        { math.vectors.simd.private:(simd-v*) [ [ ^^mul-vector ] emit-binary-vector-op ] }
+        { math.vectors.simd.private:(simd-v/) [ [ ^^div-vector ] emit-binary-vector-op ] }
+        { math.vectors.simd.private:(simd-vmin) [ [ ^^min-vector ] emit-binary-vector-op ] }
+        { math.vectors.simd.private:(simd-vmax) [ [ ^^max-vector ] emit-binary-vector-op ] }
+        { math.vectors.simd.private:(simd-vsqrt) [ [ ^^sqrt-vector ] emit-binary-vector-op ] }
+        { math.vectors.simd.private:(simd-broadcast) [ [ ^^broadcast-vector ] emit-unary-vector-op ] }
+        { math.vectors.simd.private:(simd-gather-2) [ emit-gather-vector-2 ] }
+        { math.vectors.simd.private:(simd-gather-4) [ emit-gather-vector-4 ] }
+        { math.vectors.simd.alien:alien-vector [ emit-alien-vector ] }
+        { math.vectors.simd.alien:set-alien-vector [ emit-set-alien-vector ] }
+    } enable-intrinsics ;
+
+: enable-sse3-simd ( -- )
+    {
+        { math.vectors.simd.private:(simd-sum) [ [ ^^horizontal-add-vector ] emit-unary-vector-op ] }
+    } enable-intrinsics ;
+
 : emit-intrinsic ( node word -- )
    "intrinsic" word-prop call( node -- ) ;
--- a/basis/compiler/cfg/intrinsics/simd/authors.txt
+++ b/basis/compiler/cfg/intrinsics/simd/authors.txt
@ -0,0 +1 @@
+Slava Pestov
--- a/basis/compiler/cfg/intrinsics/simd/simd.factor
+++ b/basis/compiler/cfg/intrinsics/simd/simd.factor
@ -0,0 +1,57 @@
+! Copyright (C) 2009 Slava Pestov.
+! See http://factorcode.org/license.txt for BSD license.
+USING: accessors byte-arrays fry cpu.architecture kernel
+sequences compiler.tree.propagation.info
+compiler.cfg.builder.blocks compiler.cfg.stacks
+compiler.cfg.stacks.local compiler.cfg.hats
+compiler.cfg.instructions compiler.cfg.registers
+compiler.cfg.intrinsics.alien ;
+IN: compiler.cfg.intrinsics.simd
+
+: emit-vector-op ( node quot: ( rep -- ) -- )
+    [ dup node-input-infos last literal>> ] dip over representation?
+    [ [ drop ] 2dip call ] [ 2drop emit-primitive ] if ; inline
+
+: emit-binary-vector-op ( node quot -- )
+    '[ [ ds-drop 2inputs ] dip @ ds-push ] emit-vector-op ; inline
+
+: emit-unary-vector-op ( node quot -- )
+    '[ [ ds-drop ds-pop ] dip @ ds-push ] emit-vector-op ; inline
+
+: emit-gather-vector-2 ( node -- )
+    [ ^^gather-vector-2 ] emit-binary-vector-op ;
+
+: emit-gather-vector-4 ( node -- )
+    [
+        ds-drop
+        [
+            D 3 peek-loc
+            D 2 peek-loc
+            D 1 peek-loc
+            D 0 peek-loc
+            -4 inc-d
+        ] dip
+        ^^gather-vector-4
+        ds-push
+    ] emit-vector-op ;
+
+: inline-alien-vector-setter ( node quot -- )
+    '[ ds-drop prepare-alien-accessor ds-pop @ ]
+    [ byte-array inline-alien-setter? ]
+    inline-alien ; inline
+
+: emit-alien-vector ( node -- )
+    dup [
+        '[
+            ds-drop prepare-alien-accessor
+            _ ^^alien-vector ds-push
+        ]
+        [ inline-alien-getter? ] inline-alien
+    ] with emit-vector-op ;
+
+: emit-set-alien-vector ( node -- )
+    dup [
+        '[
+            _ ##set-alien-vector
+        ] inline-alien-vector-setter
+    ] with emit-vector-op ;
--- a/basis/compiler/cfg/representations/preferred/preferred.factor
+++ b/basis/compiler/cfg/representations/preferred/preferred.factor
@ -1,7 +1,7 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: kernel accessors sequences arrays fry namespaces generic
-words sets cpu.architecture compiler.units
+words sets combinators generalizations cpu.architecture compiler.units
 compiler.cfg.utilities compiler.cfg compiler.cfg.rpo
 compiler.cfg.instructions compiler.cfg.instructions.syntax
 compiler.cfg.def-use ;
@ -13,35 +13,41 @@ GENERIC: uses-vreg-reps ( insn -- reps )

 <PRIVATE

+: rep-getter-quot ( rep -- quot )
+    {
+        { f [ [ rep>> ] ] }
+        { scalar-rep [ [ rep>> scalar-rep-of ] ] }
+        [ '[ _ nip ] ]
+    } case ;
+
 : define-defs-vreg-rep-method ( insn -- )
    [ \ defs-vreg-rep create-method ]
-    [ insn-def-slot dup [ rep>> ] when '[ drop _ ] ] bi
-    define ;
+    [ insn-def-slot [ rep>> rep-getter-quot ] [ [ drop f ] ] if* ]
+    bi define ;
+
+: reps-getter-quot ( reps -- quot )
+    [ rep>> rep-getter-quot ] map dup length '[ _ cleave _ narray ] ;

 : define-uses-vreg-reps-method ( insn -- )
    [ \ uses-vreg-reps create-method ]
-    [ insn-use-slots [ rep>> ] map '[ drop _ ] ] bi
-    define ;
+    [ insn-use-slots reps-getter-quot ]
+    bi define ;

 : define-temp-vreg-reps-method ( insn -- )
    [ \ temp-vreg-reps create-method ]
-    [ insn-temp-slots [ rep>> ] map '[ drop _ ] ] bi
-    define ;
+    [ insn-temp-slots reps-getter-quot ]
+    bi define ;

 PRIVATE>

 [
    insn-classes get
-    [ { ##copy } diff [ define-defs-vreg-rep-method ] each ]
-    [ { ##copy ##phi } diff [ define-uses-vreg-reps-method ] each ]
+    [ [ define-defs-vreg-rep-method ] each ]
+    [ { ##phi } diff [ define-uses-vreg-reps-method ] each ]
    [ [ define-temp-vreg-reps-method ] each ]
    tri
 ] with-compilation-unit

-M: ##copy defs-vreg-rep rep>> ;
-
-M: ##copy uses-vreg-reps rep>> 1array ;
-
 : each-def-rep ( insn vreg-quot: ( vreg rep -- ) -- )
    [ [ defs-vreg ] [ defs-vreg-rep ] bi ] dip with when* ; inline

--- a/basis/compiler/cfg/representations/representations.factor
+++ b/basis/compiler/cfg/representations/representations.factor
@ -5,6 +5,7 @@ arrays combinators make locals deques dlists
 cpu.architecture compiler.utilities
 compiler.cfg
 compiler.cfg.rpo
+compiler.cfg.hats
 compiler.cfg.registers
 compiler.cfg.instructions
 compiler.cfg.def-use
@ -16,13 +17,47 @@ IN: compiler.cfg.representations

 ! Virtual register representation selection.

+ERROR: bad-conversion dst src dst-rep src-rep ;
+
+GENERIC: emit-box ( dst src rep -- )
+GENERIC: emit-unbox ( dst src rep -- )
+
+M: single-float-rep emit-box
+    drop
+    [ double-float-rep next-vreg-rep dup ] dip ##single>double-float
+    int-rep next-vreg-rep ##box-float ;
+
+M: single-float-rep emit-unbox
+    drop
+    [ double-float-rep next-vreg-rep dup ] dip ##unbox-float
+    ##double>single-float ;
+
+M: double-float-rep emit-box
+    drop
+    int-rep next-vreg-rep ##box-float ;
+
+M: double-float-rep emit-unbox
+    drop ##unbox-float ;
+
+M: vector-rep emit-box
+    int-rep next-vreg-rep ##box-vector ;
+
+M: vector-rep emit-unbox
+    ##unbox-vector ;
+
 : emit-conversion ( dst src dst-rep src-rep -- )
-    2array {
-        { { int-rep int-rep } [ int-rep ##copy ] }
-        { { double-float-rep double-float-rep } [ double-float-rep ##copy ] }
-        { { double-float-rep int-rep } [ ##unbox-float ] }
-        { { int-rep double-float-rep } [ int-rep next-vreg-rep ##box-float ] }
-    } case ;
+    {
+        { [ 2dup eq? ] [ drop ##copy ] }
+        { [ dup int-rep eq? ] [ drop emit-unbox ] }
+        { [ over int-rep eq? ] [ nip emit-box ] }
+        [
+            2array {
+                { { double-float-rep single-float-rep } [ ##single>double-float ] }
+                { { single-float-rep double-float-rep } [ ##double>single-float ] }
+                [ first2 bad-conversion ]
+            } case
+        ]
+    } cond ;

 <PRIVATE

--- a/basis/compiler/cfg/two-operand/two-operand.factor
+++ b/basis/compiler/cfg/two-operand/two-operand.factor
@ -45,7 +45,13 @@ UNION: two-operand-insn
    ##mul-float
    ##div-float
    ##min-float
-    ##max-float ;
+    ##max-float
+    ##add-vector
+    ##sub-vector
+    ##mul-vector
+    ##div-vector
+    ##min-vector
+    ##max-vector ;

 GENERIC: convert-two-operand* ( insn -- )

--- a/basis/compiler/codegen/codegen.factor
+++ b/basis/compiler/codegen/codegen.factor
@ -144,8 +144,11 @@ CODEGEN: ##min %min
 CODEGEN: ##max %max
 CODEGEN: ##not %not
 CODEGEN: ##log2 %log2
+CODEGEN: ##copy %copy
 CODEGEN: ##integer>bignum %integer>bignum
 CODEGEN: ##bignum>integer %bignum>integer
+CODEGEN: ##unbox-float %unbox-float
+CODEGEN: ##box-float %box-float
 CODEGEN: ##add-float %add-float
 CODEGEN: ##sub-float %sub-float
 CODEGEN: ##mul-float %mul-float
@ -155,12 +158,24 @@ CODEGEN: ##max-float %max-float
 CODEGEN: ##sqrt %sqrt
 CODEGEN: ##unary-float-function %unary-float-function
 CODEGEN: ##binary-float-function %binary-float-function
+CODEGEN: ##single>double-float %single>double-float
+CODEGEN: ##double>single-float %double>single-float
 CODEGEN: ##integer>float %integer>float
 CODEGEN: ##float>integer %float>integer
-CODEGEN: ##copy %copy
-CODEGEN: ##unbox-float %unbox-float
+CODEGEN: ##unbox-vector %unbox-vector
+CODEGEN: ##broadcast-vector %broadcast-vector
+CODEGEN: ##gather-vector-2 %gather-vector-2
+CODEGEN: ##gather-vector-4 %gather-vector-4
+CODEGEN: ##box-vector %box-vector
+CODEGEN: ##add-vector %add-vector
+CODEGEN: ##sub-vector %sub-vector
+CODEGEN: ##mul-vector %mul-vector
+CODEGEN: ##div-vector %div-vector
+CODEGEN: ##min-vector %min-vector
+CODEGEN: ##max-vector %max-vector
+CODEGEN: ##sqrt-vector %sqrt-vector
+CODEGEN: ##horizontal-add-vector %horizontal-add-vector
 CODEGEN: ##unbox-any-c-ptr %unbox-any-c-ptr
-CODEGEN: ##box-float %box-float
 CODEGEN: ##box-alien %box-alien
 CODEGEN: ##box-displaced-alien %box-displaced-alien
 CODEGEN: ##alien-unsigned-1 %alien-unsigned-1
@ -172,12 +187,14 @@ CODEGEN: ##alien-signed-4 %alien-signed-4
 CODEGEN: ##alien-cell %alien-cell
 CODEGEN: ##alien-float %alien-float
 CODEGEN: ##alien-double %alien-double
+CODEGEN: ##alien-vector %alien-vector
 CODEGEN: ##set-alien-integer-1 %set-alien-integer-1
 CODEGEN: ##set-alien-integer-2 %set-alien-integer-2
 CODEGEN: ##set-alien-integer-4 %set-alien-integer-4
 CODEGEN: ##set-alien-cell %set-alien-cell
 CODEGEN: ##set-alien-float %set-alien-float
 CODEGEN: ##set-alien-double %set-alien-double
+CODEGEN: ##set-alien-vector %set-alien-vector
 CODEGEN: ##allot %allot
 CODEGEN: ##write-barrier %write-barrier
 CODEGEN: ##compare %compare
--- a/basis/compiler/tests/codegen.factor
+++ b/basis/compiler/tests/codegen.factor
@ -412,4 +412,6 @@ cell 4 = [
 [ 1 "0.169967142900241" ] [ 1.4 [ 1 swap fcos ] compile-call number>string ] unit-test
 [ 1 "0.169967142900241" ] [ 1.4 1 [ swap fcos ] compile-call number>string ] unit-test
 [ "0.169967142900241" "0.9854497299884601" ] [ 1.4 [ [ fcos ] [ fsin ] bi ] compile-call [ number>string ] bi@ ] unit-test
-[ 1 "0.169967142900241" "0.9854497299884601" ] [ 1.4 1 [ swap >float [ fcos ] [ fsin ] bi ] compile-call [ number>string ] bi@ ] unit-test
+[ 1 "0.169967142900241" "0.9854497299884601" ] [ 1.4 1 [ swap >float [ fcos ] [ fsin ] bi ] compile-call [ number>string ] bi@ ] unit-test
+
+[ 6.0 ] [ 1.0 [ >float 3.0 + [ B{ 0 0 0 0 } 0 set-alien-float ] [ 2.0 + ] bi ] compile-call ] unit-test
--- a/basis/compiler/tree/propagation/simd/simd.factor
+++ b/basis/compiler/tree/propagation/simd/simd.factor
@ -0,0 +1,43 @@
+! Copyright (C) 2009 Slava Pestov.
+! See http://factorcode.org/license.txt for BSD license.
+USING: accessors byte-arrays combinators
+compiler.tree.propagation.info cpu.architecture kernel words math
+math.intervals math.vectors.simd math.vectors.simd.private
+math.vectors.simd.alien ;
+IN: compiler.tree.propagation.simd
+
+\ (simd-v+) { byte-array } "default-output-classes" set-word-prop
+
+\ (simd-v-) { byte-array } "default-output-classes" set-word-prop
+
+\ (simd-v*) { byte-array } "default-output-classes" set-word-prop
+
+\ (simd-v/) { byte-array } "default-output-classes" set-word-prop
+
+\ (simd-vmin) { byte-array } "default-output-classes" set-word-prop
+
+\ (simd-vmax) { byte-array } "default-output-classes" set-word-prop
+
+\ (simd-vsqrt) { byte-array } "default-output-classes" set-word-prop
+
+\ (simd-sum) [
+    nip dup literal?>> [
+        literal>> scalar-rep-of {
+            { single-float-rep [ float ] }
+            { double-float-rep [ float ] }
+        } case
+    ] [ drop real ] if
+    <class-info>
+] "outputs" set-word-prop
+
+\ (simd-broadcast) { byte-array } "default-output-classes" set-word-prop
+
+\ (simd-gather-2) { byte-array } "default-output-classes" set-word-prop
+
+\ (simd-gather-4) { byte-array } "default-output-classes" set-word-prop
+
+\ assert-positive [
+    real [0,inf] <class/interval-info> value-info-intersect
+] "outputs" set-word-prop
+
+\ alien-vector { byte-array } "default-output-classes" set-word-prop
--- a/basis/cpu/architecture/architecture.factor
+++ b/basis/cpu/architecture/architecture.factor
@ -20,7 +20,33 @@ SINGLETONS: tagged-rep int-rep ;
 ! one of these representations
 SINGLETONS: single-float-rep double-float-rep ;

-UNION: representation any-rep tagged-rep int-rep single-float-rep double-float-rep ;
+SINGLETONS:
+4float-array-rep
+2double-array-rep
+16char-array-rep
+16uchar-array-rep
+8short-array-rep
+8ushort-array-rep
+4int-array-rep
+4uint-array-rep ;
+
+UNION: vector-rep
+4float-array-rep
+2double-array-rep
+16char-array-rep
+16uchar-array-rep
+8short-array-rep
+8ushort-array-rep
+4int-array-rep
+4uint-array-rep ;
+
+UNION: representation
+any-rep
+tagged-rep
+int-rep
+single-float-rep
+double-float-rep
+vector-rep ;

 ! Register classes
 SINGLETONS: int-regs float-regs ;
@ -31,23 +57,28 @@ CONSTANT: reg-classes { int-regs float-regs }
 ! A pseudo-register class for parameters spilled on the stack
 SINGLETON: stack-params

-: reg-class-of ( rep -- reg-class )
-    {
-        { tagged-rep [ int-regs ] }
-        { int-rep [ int-regs ] }
-        { single-float-rep [ float-regs ] }
-        { double-float-rep [ float-regs ] }
-        { stack-params [ stack-params ] }
-    } case ;
+GENERIC: reg-class-of ( rep -- reg-class )

-: rep-size ( rep -- n )
-    {
-        { tagged-rep [ cell ] }
-        { int-rep [ cell ] }
-        { single-float-rep [ 4 ] }
-        { double-float-rep [ 8 ] }
-        { stack-params [ cell ] }
-    } case ;
+M: tagged-rep reg-class-of drop int-regs ;
+M: int-rep reg-class-of drop int-regs ;
+M: single-float-rep reg-class-of drop float-regs ;
+M: double-float-rep reg-class-of drop float-regs ;
+M: vector-rep reg-class-of drop float-regs ;
+M: stack-params reg-class-of drop stack-params ;
+
+GENERIC: rep-size ( rep -- n )
+
+M: tagged-rep rep-size drop cell ;
+M: int-rep rep-size drop cell ;
+M: single-float-rep rep-size drop 4 ;
+M: double-float-rep rep-size drop 8 ;
+M: stack-params rep-size drop cell ;
+M: vector-rep rep-size drop 16 ;
+
+GENERIC: scalar-rep-of ( rep -- rep' )
+
+M: 4float-array-rep scalar-rep-of drop single-float-rep ;
+M: 2double-array-rep scalar-rep-of drop double-float-rep ;

 ! Mapping from register class to machine registers
 HOOK: machine-registers cpu ( -- assoc )
@ -101,6 +132,8 @@ HOOK: %max     cpu ( dst src1 src2 -- )
 HOOK: %not     cpu ( dst src -- )
 HOOK: %log2    cpu ( dst src -- )

+HOOK: %copy cpu ( dst src rep -- )
+
 HOOK: %fixnum-add cpu ( label dst src1 src2 -- )
 HOOK: %fixnum-sub cpu ( label dst src1 src2 -- )
 HOOK: %fixnum-mul cpu ( label dst src1 src2 -- )
@ -108,6 +141,9 @@ HOOK: %fixnum-mul cpu ( label dst src1 src2 -- )
 HOOK: %integer>bignum cpu ( dst src temp -- )
 HOOK: %bignum>integer cpu ( dst src temp -- )

+HOOK: %unbox-float cpu ( dst src -- )
+HOOK: %box-float cpu ( dst src temp -- )
+
 HOOK: %add-float cpu ( dst src1 src2 -- )
 HOOK: %sub-float cpu ( dst src1 src2 -- )
 HOOK: %mul-float cpu ( dst src1 src2 -- )
@ -118,13 +154,29 @@ HOOK: %sqrt cpu ( dst src -- )
 HOOK: %unary-float-function cpu ( dst src func -- )
 HOOK: %binary-float-function cpu ( dst src1 src2 func -- )

+HOOK: %single>double-float cpu ( dst src -- )
+HOOK: %double>single-float cpu ( dst src -- )
+
 HOOK: %integer>float cpu ( dst src -- )
 HOOK: %float>integer cpu ( dst src -- )

-HOOK: %copy cpu ( dst src rep -- )
-HOOK: %unbox-float cpu ( dst src -- )
+HOOK: %box-vector cpu ( dst src temp rep -- )
+HOOK: %unbox-vector cpu ( dst src rep -- )
+
+HOOK: %broadcast-vector cpu ( dst src rep -- )
+HOOK: %gather-vector-2 cpu ( dst src1 src2 rep -- )
+HOOK: %gather-vector-4 cpu ( dst src1 src2 src3 src4 rep -- )
+
+HOOK: %add-vector cpu ( dst src1 src2 rep -- )
+HOOK: %sub-vector cpu ( dst src1 src2 rep -- )
+HOOK: %mul-vector cpu ( dst src1 src2 rep -- )
+HOOK: %div-vector cpu ( dst src1 src2 rep -- )
+HOOK: %min-vector cpu ( dst src1 src2 rep -- )
+HOOK: %max-vector cpu ( dst src1 src2 rep -- )
+HOOK: %sqrt-vector cpu ( dst src rep -- )
+HOOK: %horizontal-add-vector cpu ( dst src rep -- )
+
 HOOK: %unbox-any-c-ptr cpu ( dst src temp -- )
-HOOK: %box-float cpu ( dst src temp -- )
 HOOK: %box-alien cpu ( dst src temp -- )
 HOOK: %box-displaced-alien cpu ( dst displacement base temp1 temp2 base-class -- )

@ -137,6 +189,7 @@ HOOK: %alien-signed-4   cpu ( dst src -- )
 HOOK: %alien-cell       cpu ( dst src -- )
 HOOK: %alien-float      cpu ( dst src -- )
 HOOK: %alien-double     cpu ( dst src -- )
+HOOK: %alien-vector     cpu ( dst src rep -- )

 HOOK: %set-alien-integer-1 cpu ( ptr value -- )
 HOOK: %set-alien-integer-2 cpu ( ptr value -- )
@ -144,6 +197,7 @@ HOOK: %set-alien-integer-4 cpu ( ptr value -- )
 HOOK: %set-alien-cell      cpu ( ptr value -- )
 HOOK: %set-alien-float     cpu ( ptr value -- )
 HOOK: %set-alien-double    cpu ( ptr value -- )
+HOOK: %set-alien-vector    cpu ( ptr value rep -- )

 HOOK: %alien-global cpu ( dst symbol library -- )

--- a/basis/cpu/x86/x86.factor
+++ b/basis/cpu/x86/x86.factor
@ -4,7 +4,7 @@ USING: accessors assocs alien alien.c-types arrays strings
 cpu.x86.assembler cpu.x86.assembler.private cpu.x86.assembler.operands
 cpu.architecture kernel kernel.private math memory namespaces make
 sequences words system layouts combinators math.order fry locals
-compiler.constants
+compiler.constants byte-arrays
 compiler.cfg.registers
 compiler.cfg.instructions
 compiler.cfg.intrinsics
@ -130,6 +130,21 @@ M: x86 %max     nip [ CMP ] [ CMOVL ] 2bi ;
 M: x86 %not     drop NOT ;
 M: x86 %log2    BSR ;

+GENERIC: copy-register* ( dst src rep -- )
+
+M: int-rep copy-register* drop MOV ;
+M: tagged-rep copy-register* drop MOV ;
+M: single-float-rep copy-register* drop MOVSS ;
+M: double-float-rep copy-register* drop MOVSD ;
+M: 4float-array-rep copy-register* drop MOVUPS ;
+M: 2double-array-rep copy-register* drop MOVUPD ;
+M: vector-rep copy-register* drop MOVDQU ;
+
+: copy-register ( dst src rep -- )
+    2over eq? [ 3drop ] [ copy-register* ] if ;
+
+M: x86 %copy ( dst src rep -- ) copy-register ;
+
 :: overflow-template ( label dst src1 src2 insn -- )
    src1 src2 insn call
    label JO ; inline
@ -211,24 +226,120 @@ M: x86 %min-float nip MINSD ;
 M: x86 %max-float nip MAXSD ;
 M: x86 %sqrt SQRTSD ;

+M: x86 %single>double-float CVTSS2SD ;
+M: x86 %double>single-float CVTSD2SS ;
+
 M: x86 %integer>float CVTSI2SD ;
 M: x86 %float>integer CVTTSD2SI ;

-GENERIC: copy-register* ( dst src rep -- )
-
-M: int-rep copy-register* drop MOV ;
-M: tagged-rep copy-register* drop MOV ;
-M: single-float-rep copy-register* drop MOVSS ;
-M: double-float-rep copy-register* drop MOVSD ;
-
-: copy-register ( dst src rep -- )
-    2over eq? [ 3drop ] [ copy-register* ] if ;
-
-M: x86 %copy ( dst src rep -- ) copy-register ;
-
 M: x86 %unbox-float ( dst src -- )
    float-offset [+] MOVSD ;

+M:: x86 %box-float ( dst src temp -- )
+    dst 16 float temp %allot
+    dst float-offset [+] src MOVSD ;
+
+M:: x86 %box-vector ( dst src rep temp -- )
+    dst rep rep-size 2 cells + byte-array temp %allot
+    16 tag-fixnum dst 1 byte-array tag-number %set-slot-imm
+    dst byte-array-offset [+]
+    src rep copy-register ;
+
+M:: x86 %unbox-vector ( dst src rep -- )
+    dst src byte-array-offset [+]
+    rep copy-register ;
+
+M: x86 %broadcast-vector ( dst src rep -- )
+    {
+        { 4float-array-rep [ [ MOVAPS ] [ drop dup 0 SHUFPS ] 2bi ] }
+        { 2double-array-rep [ [ MOVAPD ] [ drop dup 0 SHUFPD ] 2bi ] }
+    } case ;
+
+M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- )
+    rep {
+        {
+            4float-array-rep
+            [
+                dst src1 MOVSS
+                dst src2 UNPCKLPS
+                src3 src4 UNPCKLPS
+                dst src3 HEX: 44 SHUFPS
+            ]
+        }
+    } case ;
+
+M:: x86 %gather-vector-2 ( dst src1 src2 rep -- )
+    rep {
+        {
+            2double-array-rep
+            [
+                dst src1 MOVAPD
+                dst src2 0 SHUFPD
+            ]
+        }
+    } case ;
+
+M: x86 %add-vector ( dst src1 src2 rep -- )
+    {
+        { 4float-array-rep [ ADDPS ] }
+        { 2double-array-rep [ ADDPD ] }
+        { 16char-array-rep [ PADDB ] }
+        { 16uchar-array-rep [ PADDB ] }
+        { 8short-array-rep [ PADDW ] }
+        { 8ushort-array-rep [ PADDW ] }
+        { 4int-array-rep [ PADDD ] }
+        { 4uint-array-rep [ PADDD ] }
+    } case drop ;
+
+M: x86 %sub-vector ( dst src1 src2 rep -- )
+    {
+        { 4float-array-rep [ SUBPS ] }
+        { 2double-array-rep [ SUBPD ] }
+        { 16char-array-rep [ PSUBB ] }
+        { 16uchar-array-rep [ PSUBB ] }
+        { 8short-array-rep [ PSUBW ] }
+        { 8ushort-array-rep [ PSUBW ] }
+        { 4int-array-rep [ PSUBD ] }
+        { 4uint-array-rep [ PSUBD ] }
+    } case drop ;
+
+M: x86 %mul-vector ( dst src1 src2 rep -- )
+    {
+        { 4float-array-rep [ MULPS ] }
+        { 2double-array-rep [ MULPD ] }
+        { 4int-array-rep [ PMULLW ] }
+    } case drop ;
+
+M: x86 %div-vector ( dst src1 src2 rep -- )
+    {
+        { 4float-array-rep [ DIVPS ] }
+        { 2double-array-rep [ DIVPD ] }
+    } case drop ;
+
+M: x86 %min-vector ( dst src1 src2 rep -- )
+    {
+        { 4float-array-rep [ MINPS ] }
+        { 2double-array-rep [ MINPD ] }
+    } case drop ;
+
+M: x86 %max-vector ( dst src1 src2 rep -- )
+    {
+        { 4float-array-rep [ MAXPS ] }
+        { 2double-array-rep [ MAXPD ] }
+    } case drop ;
+
+M: x86 %sqrt-vector ( dst src rep -- )
+    {
+        { 4float-array-rep [ SQRTPS ] }
+        { 2double-array-rep [ SQRTPD ] }
+    } case ;
+
+M: x86 %horizontal-add-vector ( dst src rep -- )
+    {
+        { 4float-array-rep [ [ MOVAPS ] [ HADDPS ] [ HADDPS ] 2tri ] }
+        { 2double-array-rep [ [ MOVAPD ] [ HADDPD ] 2bi ] }
+    } case ;
+
 M:: x86 %unbox-any-c-ptr ( dst src temp -- )
    [
        { "is-byte-array" "end" "start" } [ define-label ] each
@ -255,10 +366,6 @@ M:: x86 %unbox-any-c-ptr ( dst src temp -- )
        "end" resolve-label
    ] with-scope ;

-M:: x86 %box-float ( dst src temp -- )
-    dst 16 float temp %allot
-    dst float-offset [+] src MOVSD ;
-
 : alien@ ( reg n -- op ) cells alien tag-number - [+] ;

 :: %allot-alien ( dst displacement base temp -- )
@ -405,8 +512,9 @@ M: x86 %alien-signed-2 16 %alien-signed-getter ;
 M: x86 %alien-signed-4 32 %alien-signed-getter ;

 M: x86 %alien-cell [] MOV ;
-M: x86 %alien-float dupd [] MOVSS dup CVTSS2SD ;
+M: x86 %alien-float [] MOVSS ;
 M: x86 %alien-double [] MOVSD ;
+M: x86 %alien-vector [ [] ] dip copy-register ;

 :: %alien-integer-setter ( ptr value size -- )
    value { ptr } size [| new-value |
@ -418,8 +526,9 @@ M: x86 %set-alien-integer-1 8 %alien-integer-setter ;
 M: x86 %set-alien-integer-2 16 %alien-integer-setter ;
 M: x86 %set-alien-integer-4 32 %alien-integer-setter ;
 M: x86 %set-alien-cell [ [] ] dip MOV ;
-M: x86 %set-alien-float dup dup CVTSD2SS [ [] ] dip MOVSS ;
+M: x86 %set-alien-float [ [] ] dip MOVSS ;
 M: x86 %set-alien-double [ [] ] dip MOVSD ;
+M: x86 %set-alien-vector [ [] ] 2dip copy-register ;

 : shift-count? ( reg -- ? ) { ECX RCX } memq? ;

--- a/basis/math/vectors/simd/alien/alien-tests.factor
+++ b/basis/math/vectors/simd/alien/alien-tests.factor
@ -0,0 +1,56 @@
+IN: math.vectors.simd.alien.tests
+USING: cpu.architecture math.vectors.simd accessors
+math.vectors.simd.alien kernel classes.struct tools.test
+compiler sequences byte-arrays alien math kernel.private
+specialized-arrays.float ;
+
+! Vector alien intrinsics
+[ 4float-array{ 1 2 3 4 } ] [
+    [
+        4float-array{ 1 2 3 4 }
+        underlying>> 0 4float-array-rep alien-vector
+    ] compile-call 4float-array boa
+] unit-test
+
+[ B{ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 } ] [
+    16 [ 1 ] B{ } replicate-as 16 <byte-array>
+    [
+        0 [
+            { byte-array c-ptr fixnum } declare
+            4float-array-rep set-alien-vector
+        ] compile-call
+    ] keep
+] unit-test
+
+[ float-array{ 1 2 3 4 } ] [
+    [
+        float-array{ 1 2 3 4 } underlying>>
+        float-array{ 4 3 2 1 } clone
+        [ underlying>> 0 4float-array-rep set-alien-vector ] keep
+    ] compile-call
+] unit-test
+
+STRUCT: simd-struct
+{ x 4float-array }
+{ y 2double-array }
+{ z 4double-array } ;
+
+[ t ] [ [ simd-struct <struct> ] compile-call >c-ptr [ 0 = ] all? ] unit-test
+
+[ 4float-array{ 1 2 3 4 } 2double-array{ 2 1 } 4double-array{ 4 3 2 1 } ] [
+    simd-struct <struct>
+    4float-array{ 1 2 3 4 } >>x
+    2double-array{ 2 1 } >>y
+    4double-array{ 4 3 2 1 } >>z
+    [ x>> ] [ y>> ] [ z>> ] tri
+] unit-test
+
+[ 4float-array{ 1 2 3 4 } 2double-array{ 2 1 } 4double-array{ 4 3 2 1 } ] [
+    [
+        simd-struct <struct>
+        4float-array{ 1 2 3 4 } >>x
+        2double-array{ 2 1 } >>y
+        4double-array{ 4 3 2 1 } >>z
+        [ x>> ] [ y>> ] [ z>> ] tri
+    ] compile-call
+] unit-test
--- a/basis/math/vectors/simd/alien/alien.factor
+++ b/basis/math/vectors/simd/alien/alien.factor
@ -0,0 +1,51 @@
+! Copyright (C) 2009 Slava Pestov.
+! See http://factorcode.org/license.txt for BSD license.
+USING: alien accessors alien.c-types byte-arrays compiler.units
+cpu.architecture libc locals kernel math math.vectors.simd
+math.vectors.simd.private ;
+IN: math.vectors.simd.alien
+
+: alien-vector ( c-ptr n rep -- value )
+    ! Inefficient version for when intrinsics are missing
+    [ swap <displaced-alien> ] dip rep-size memory>byte-array ;
+
+: set-alien-vector ( value c-ptr n rep -- )
+    ! Inefficient version for when intrinsics are missing
+    [ swap <displaced-alien> swap ] dip rep-size memcpy ;
+
+:: define-simd-type ( class rep -- )
+    <c-type>
+        byte-array >>class
+        class >>boxed-class
+        [ rep alien-vector ] >>getter
+        [ [ underlying>> ] 2dip rep set-alien-vector ] >>setter
+        16 >>size
+        8 >>align
+        rep >>rep
+        [ class boa ] >>boxer-quot
+        [ underlying>> ] >>unboxer-quot
+    class name>> typedef ;
+
+: define-4double-array-type ( -- )
+    <c-type>
+        4double-array >>class
+        4double-array >>boxed-class
+        [
+            [ 2double-array-rep alien-vector ]
+            [ 16 + >fixnum 2double-array-rep alien-vector ] 2bi
+            4double-array boa
+        ] >>getter
+        [
+            [ [ underlying1>> ] 2dip 2double-array-rep set-alien-vector ]
+            [ [ underlying2>> ] 2dip 16 + >fixnum 2double-array-rep set-alien-vector ]
+            3bi
+        ] >>setter
+        32 >>size
+        8 >>align
+        2double-array-rep >>rep
+    "4double-array" typedef ;
+[
+    4float-array 4float-array-rep define-simd-type
+    2double-array 2double-array-rep define-simd-type
+    define-4double-array-type
+] with-compilation-unit
--- a/basis/math/vectors/simd/alien/authors.txt
+++ b/basis/math/vectors/simd/alien/authors.txt
@ -0,0 +1 @@
+Slava Pestov
--- a/basis/math/vectors/simd/authors.txt
+++ b/basis/math/vectors/simd/authors.txt
@ -0,0 +1 @@
+Slava Pestov
--- a/basis/math/vectors/simd/functor/authors.txt
+++ b/basis/math/vectors/simd/functor/authors.txt
@ -0,0 +1 @@
+Slava Pestov
--- a/basis/math/vectors/simd/functor/functor.factor
+++ b/basis/math/vectors/simd/functor/functor.factor
@ -0,0 +1,61 @@
+! Copyright (C) 2009 Slava Pestov.
+! See http://factorcode.org/license.txt for BSD license.
+USING: accessors alien.c-types byte-arrays classes functors
+kernel math parser prettyprint.custom sequences
+sequences.private ;
+IN: math.vectors.simd.functor
+
+ERROR: bad-length got expected ;
+
+FUNCTOR: define-simd-type ( T N -- )
+
+A            DEFINES-CLASS ${N}${T}-array
+<A>          DEFINES <${A}>
+(A)          DEFINES (${A})
+>A           DEFINES >${A}
+A{           DEFINES ${A}{
+
+NTH          [ T dup c-type-getter-boxer array-accessor ]
+SET-NTH      [ T dup c-setter array-accessor ]
+
+BYTES        [ T heap-size N * ]
+INITIAL      [ BYTES <byte-array> ]
+
+WHERE
+
+TUPLE: A
+{ underlying byte-array read-only initial: INITIAL } ;
+
+: <A> ( -- simd-array ) BYTES <byte-array> A boa ; inline
+
+: (A) ( -- simd-array ) BYTES (byte-array) A boa ; inline
+
+M: A clone underlying>> clone \ A boa ; inline
+
+M: A length drop N ; inline
+
+M: A nth-unsafe underlying>> NTH call ; inline
+
+M: A set-nth-unsafe underlying>> SET-NTH call ; inline
+
+: >A ( seq -- simd-array ) \ A new clone-like ;
+
+M: A like drop dup \ A instance? [ >A ] unless ; inline
+
+M: A new-sequence drop dup N = [ drop (A) ] [ N bad-length ] if ; inline
+
+M: A equal? over \ A instance? [ sequence= ] [ 2drop f ] if ;
+
+M: A byte-length underlying>> length ; inline
+
+M: A pprint-delims drop \ A{ \ } ;
+
+M: A >pprint-sequence ;
+
+M: A pprint* pprint-object ;
+
+SYNTAX: A{ \ } [ >A ] parse-literal ;
+
+INSTANCE: A sequence
+
+;FUNCTOR
--- a/basis/math/vectors/simd/simd-tests.factor
+++ b/basis/math/vectors/simd/simd-tests.factor
@ -0,0 +1,301 @@
+IN: math.vectors.simd.tests
+USING: math math.vectors.simd math.vectors.simd.private
+math.vectors math.functions kernel.private compiler sequences
+tools.test compiler.tree.debugger accessors kernel ;
+
+[ 4float-array{ 0 0 0 0 } ] [ 4float-array new ] unit-test
+
+[ V{ float } ] [ [ { 4float-array } declare norm-sq ] final-classes ] unit-test
+
+[ V{ float } ] [ [ { 4float-array } declare norm ] final-classes ] unit-test
+
+[ 4float-array{ 12 12 12 12 } ] [
+    12 [ 4float-array-with ] compile-call
+] unit-test
+
+[ 4float-array{ 1 2 3 4 } ] [
+    1 2 3 4 [ 4float-array-boa ] compile-call
+] unit-test
+
+[ 4float-array{ 11 22 33 44 } ] [
+    4float-array{ 1 2 3 4 } 4float-array{ 10 20 30 40 }
+    [ { 4float-array 4float-array } declare v+ ] compile-call
+] unit-test
+
+[ 4float-array{ -9 -18 -27 -36 } ] [
+    4float-array{ 1 2 3 4 } 4float-array{ 10 20 30 40 }
+    [ { 4float-array 4float-array } declare v- ] compile-call
+] unit-test
+
+[ 4float-array{ 10 40 90 160 } ] [
+    4float-array{ 1 2 3 4 } 4float-array{ 10 20 30 40 }
+    [ { 4float-array 4float-array } declare v* ] compile-call
+] unit-test
+
+[ 4float-array{ 10 100 1000 10000 } ] [
+    4float-array{ 100 2000 30000 400000 } 4float-array{ 10 20 30 40 }
+    [ { 4float-array 4float-array } declare v/ ] compile-call
+] unit-test
+
+[ 4float-array{ -10 -20 -30 -40 } ] [
+    4float-array{ -10 20 -30 40 } 4float-array{ 10 -20 30 -40 }
+    [ { 4float-array 4float-array } declare vmin ] compile-call
+] unit-test
+
+[ 4float-array{ 10 20 30 40 } ] [
+    4float-array{ -10 20 -30 40 } 4float-array{ 10 -20 30 -40 }
+    [ { 4float-array 4float-array } declare vmax ] compile-call
+] unit-test
+
+[ 10.0 ] [
+    4float-array{ 1 2 3 4 }
+    [ { 4float-array } declare sum ] compile-call
+] unit-test
+
+[ 13.0 ] [
+    4float-array{ 1 2 3 4 }
+    [ { 4float-array } declare sum 3.0 + ] compile-call
+] unit-test
+
+[ 8.0 ] [
+    4float-array{ 1 2 3 4 } 4float-array{ 2 0 2 0 }
+    [ { 4float-array 4float-array } declare v. ] compile-call
+] unit-test
+
+[ 4float-array{ 5 10 15 20 } ] [
+    5.0 4float-array{ 1 2 3 4 }
+    [ { float 4float-array } declare n*v ] compile-call
+] unit-test
+
+[ 4float-array{ 5 10 15 20 } ] [
+    4float-array{ 1 2 3 4 } 5.0
+    [ { float 4float-array } declare v*n ] compile-call
+] unit-test
+
+[ 4float-array{ 10 5 2 5 } ] [
+    10.0 4float-array{ 1 2 5 2 }
+    [ { float 4float-array } declare n/v ] compile-call
+] unit-test
+
+[ 4float-array{ 0.5 1 1.5 2 } ] [
+    4float-array{ 1 2 3 4 } 2
+    [ { float 4float-array } declare v/n ] compile-call
+] unit-test
+
+[ 4float-array{ 1 0 0 0 } ] [
+    4float-array{ 10 0 0 0 }
+    [ { 4float-array } declare normalize ] compile-call
+] unit-test
+
+[ 30.0 ] [
+    4float-array{ 1 2 3 4 }
+    [ { 4float-array } declare norm-sq ] compile-call
+] unit-test
+
+[ t ] [
+    4float-array{ 1 0 0 0 }
+    4float-array{ 0 1 0 0 }
+    [ { 4float-array 4float-array } declare distance ] compile-call
+    2 sqrt 1.0e-6 ~
+] unit-test
+
+[ 2double-array{ 12 12 } ] [
+    12 [ 2double-array-with ] compile-call
+] unit-test
+
+[ 2double-array{ 1 2 } ] [
+    1 2 [ 2double-array-boa ] compile-call
+] unit-test
+
+[ 2double-array{ 11 22 } ] [
+    2double-array{ 1 2 } 2double-array{ 10 20 }
+    [ { 2double-array 2double-array } declare v+ ] compile-call
+] unit-test
+
+[ 2double-array{ -9 -18 } ] [
+    2double-array{ 1 2 } 2double-array{ 10 20 }
+    [ { 2double-array 2double-array } declare v- ] compile-call
+] unit-test
+
+[ 2double-array{ 10 40 } ] [
+    2double-array{ 1 2 } 2double-array{ 10 20 }
+    [ { 2double-array 2double-array } declare v* ] compile-call
+] unit-test
+
+[ 2double-array{ 10 100 } ] [
+    2double-array{ 100 2000 } 2double-array{ 10 20 }
+    [ { 2double-array 2double-array } declare v/ ] compile-call
+] unit-test
+
+[ 2double-array{ -10 -20 } ] [
+    2double-array{ -10 20 } 2double-array{ 10 -20 }
+    [ { 2double-array 2double-array } declare vmin ] compile-call
+] unit-test
+
+[ 2double-array{ 10 20 } ] [
+    2double-array{ -10 20 } 2double-array{ 10 -20 }
+    [ { 2double-array 2double-array } declare vmax ] compile-call
+] unit-test
+
+[ 3.0 ] [
+    2double-array{ 1 2 }
+    [ { 2double-array } declare sum ] compile-call
+] unit-test
+
+[ 7.0 ] [
+    2double-array{ 1 2 }
+    [ { 2double-array } declare sum 4.0 + ] compile-call
+] unit-test
+
+[ 16.0 ] [
+    2double-array{ 1 2 } 2double-array{ 2 7 }
+    [ { 2double-array 2double-array } declare v. ] compile-call
+] unit-test
+
+[ 2double-array{ 5 10 } ] [
+    5.0 2double-array{ 1 2 }
+    [ { float 2double-array } declare n*v ] compile-call
+] unit-test
+
+[ 2double-array{ 5 10 } ] [
+    2double-array{ 1 2 } 5.0
+    [ { float 2double-array } declare v*n ] compile-call
+] unit-test
+
+[ 2double-array{ 10 5 } ] [
+    10.0 2double-array{ 1 2 }
+    [ { float 2double-array } declare n/v ] compile-call
+] unit-test
+
+[ 2double-array{ 0.5 1 } ] [
+    2double-array{ 1 2 } 2
+    [ { float 2double-array } declare v/n ] compile-call
+] unit-test
+
+[ 2double-array{ 0 0 } ] [ 2double-array new ] unit-test
+
+[ 2double-array{ 1 0 } ] [
+    2double-array{ 10 0 }
+    [ { 2double-array } declare normalize ] compile-call
+] unit-test
+
+[ 5.0 ] [
+    2double-array{ 1 2 }
+    [ { 2double-array } declare norm-sq ] compile-call
+] unit-test
+
+[ t ] [
+    2double-array{ 1 0 }
+    2double-array{ 0 1 }
+    [ { 2double-array 2double-array } declare distance ] compile-call
+    2 sqrt 1.0e-6 ~
+] unit-test
+
+[ 4double-array{ 0 0 0 0 } ] [ 4double-array new ] unit-test
+
+[ 4double-array{ 1 2 3 4 } ] [
+    1 2 3 4 4double-array-boa
+] unit-test
+
+[ 4double-array{ 1 1 1 1 } ] [
+    1 4double-array-with
+] unit-test
+
+[ 4double-array{ 0 1 2 3 } ] [
+    1 4double-array-with [ * ] map-index
+] unit-test
+
+[ V{ float } ] [ [ { 4double-array } declare norm-sq ] final-classes ] unit-test
+
+[ V{ float } ] [ [ { 4double-array } declare norm ] final-classes ] unit-test
+
+[ 4double-array{ 12 12 12 12 } ] [
+    12 [ 4double-array-with ] compile-call
+] unit-test
+
+[ 4double-array{ 1 2 3 4 } ] [
+    1 2 3 4 [ 4double-array-boa ] compile-call
+] unit-test
+
+[ 4double-array{ 11 22 33 44 } ] [
+    4double-array{ 1 2 3 4 } 4double-array{ 10 20 30 40 }
+    [ { 4double-array 4double-array } declare v+ ] compile-call
+] unit-test
+
+[ 4double-array{ -9 -18 -27 -36 } ] [
+    4double-array{ 1 2 3 4 } 4double-array{ 10 20 30 40 }
+    [ { 4double-array 4double-array } declare v- ] compile-call
+] unit-test
+
+[ 4double-array{ 10 40 90 160 } ] [
+    4double-array{ 1 2 3 4 } 4double-array{ 10 20 30 40 }
+    [ { 4double-array 4double-array } declare v* ] compile-call
+] unit-test
+
+[ 4double-array{ 10 100 1000 10000 } ] [
+    4double-array{ 100 2000 30000 400000 } 4double-array{ 10 20 30 40 }
+    [ { 4double-array 4double-array } declare v/ ] compile-call
+] unit-test
+
+[ 4double-array{ -10 -20 -30 -40 } ] [
+    4double-array{ -10 20 -30 40 } 4double-array{ 10 -20 30 -40 }
+    [ { 4double-array 4double-array } declare vmin ] compile-call
+] unit-test
+
+[ 4double-array{ 10 20 30 40 } ] [
+    4double-array{ -10 20 -30 40 } 4double-array{ 10 -20 30 -40 }
+    [ { 4double-array 4double-array } declare vmax ] compile-call
+] unit-test
+
+[ 10.0 ] [
+    4double-array{ 1 2 3 4 }
+    [ { 4double-array } declare sum ] compile-call
+] unit-test
+
+[ 13.0 ] [
+    4double-array{ 1 2 3 4 }
+    [ { 4double-array } declare sum 3.0 + ] compile-call
+] unit-test
+
+[ 8.0 ] [
+    4double-array{ 1 2 3 4 } 4double-array{ 2 0 2 0 }
+    [ { 4double-array 4double-array } declare v. ] compile-call
+] unit-test
+
+[ 4double-array{ 5 10 15 20 } ] [
+    5.0 4double-array{ 1 2 3 4 }
+    [ { float 4double-array } declare n*v ] compile-call
+] unit-test
+
+[ 4double-array{ 5 10 15 20 } ] [
+    4double-array{ 1 2 3 4 } 5.0
+    [ { float 4double-array } declare v*n ] compile-call
+] unit-test
+
+[ 4double-array{ 10 5 2 5 } ] [
+    10.0 4double-array{ 1 2 5 2 }
+    [ { float 4double-array } declare n/v ] compile-call
+] unit-test
+
+[ 4double-array{ 0.5 1 1.5 2 } ] [
+    4double-array{ 1 2 3 4 } 2
+    [ { float 4double-array } declare v/n ] compile-call
+] unit-test
+
+[ 4double-array{ 1 0 0 0 } ] [
+    4double-array{ 10 0 0 0 }
+    [ { 4double-array } declare normalize ] compile-call
+] unit-test
+
+[ 30.0 ] [
+    4double-array{ 1 2 3 4 }
+    [ { 4double-array } declare norm-sq ] compile-call
+] unit-test
+
+[ t ] [
+    4double-array{ 1 0 0 0 }
+    4double-array{ 0 1 0 0 }
+    [ { 4double-array 4double-array } declare distance ] compile-call
+    2 sqrt 1.0e-6 ~
+] unit-test
+
--- a/basis/math/vectors/simd/simd.factor
+++ b/basis/math/vectors/simd/simd.factor
@ -0,0 +1,214 @@
+! Copyright (C) 2009 Slava Pestov.
+! See http://factorcode.org/license.txt for BSD license.
+USING: accessors alien.c-types byte-arrays cpu.architecture
+generalizations kernel math math.functions math.vectors
+math.vectors.simd.functor math.vectors.specialization parser
+prettyprint.custom sequences sequences.private
+specialized-arrays.double locals assocs literals ;
+IN: math.vectors.simd
+
+<PRIVATE
+
+ERROR: bad-simd-call ;
+
+: (simd-v+) ( v1 v2 rep -- v3 ) bad-simd-call ;
+: (simd-v-) ( v1 v2 rep -- v3 ) bad-simd-call ;
+: (simd-v*) ( v1 v2 rep -- v3 ) bad-simd-call ;
+: (simd-v/) ( v1 v2 rep -- v3 ) bad-simd-call ;
+: (simd-vmin) ( v1 v2 rep -- v3 ) bad-simd-call ;
+: (simd-vmax) ( v1 v2 rep -- v3 ) bad-simd-call ;
+: (simd-vsqrt) ( v1 v2 rep -- v3 ) bad-simd-call ;
+: (simd-sum) ( v1 rep -- v2 ) bad-simd-call ;
+: (simd-broadcast) ( x rep -- v ) bad-simd-call ;
+: (simd-gather-2) ( a b rep -- v ) bad-simd-call ;
+: (simd-gather-4) ( a b c d rep -- v ) bad-simd-call ;
+: assert-positive ( x -- y ) ;
+
+PRIVATE>
+
+<<
+
+DEFER: 4float-array
+DEFER: 2double-array
+
+"double" 2 define-simd-type
+"float" 4 define-simd-type
+
+>>
+
+! Constructors
+: 4float-array-with ( x -- simd-array )
+    >float 4float-array-rep (simd-broadcast) 4float-array boa ; inline
+
+: 4float-array-boa ( a b c d -- simd-array )
+    [ >float ] 4 napply 4float-array-rep (simd-gather-4) 4float-array boa ; inline
+
+: 2double-array-with ( x -- simd-array )
+    >float 2double-array-rep (simd-broadcast) 2double-array boa ; inline
+
+: 2double-array-boa ( a b -- simd-array )
+    [ >float ] bi@ 2double-array-rep (simd-gather-2) 2double-array boa ; inline
+
+<PRIVATE
+
+: 4float-array-vv->v-op ( v1 v2 quot -- v3 )
+    [ [ underlying>> ] bi@ 4float-array-rep ] dip call 4float-array boa ; inline
+
+: 4float-array-v->n-op ( v1 quot -- v2 )
+    [ underlying>> 4float-array-rep ] dip call ; inline
+
+: 2double-array-vv->v-op ( v1 v2 quot -- v3 )
+    [ [ underlying>> ] bi@ 2double-array-rep ] dip call 2double-array boa ; inline
+
+: 2double-array-v->n-op ( v1 quot -- v2 )
+    [ underlying>> 2double-array-rep ] dip call ; inline
+
+PRIVATE>
+
+<<
+
+<PRIVATE
+
+:: simd-vector-words ( class ctor elt-type assoc -- )
+    class elt-type assoc {
+        { vneg [ [ dup v- ] keep v- ] }
+        { v. [ v* sum ] }
+        { n+v [ [ ctor execute ] dip v+ ] }
+        { v+n [ ctor execute v+ ] }
+        { n-v [ [ ctor execute ] dip v- ] }
+        { v-n [ ctor execute v- ] }
+        { n*v [ [ ctor execute ] dip v* ] }
+        { v*n [ ctor execute v* ] }
+        { n/v [ [ ctor execute ] dip v/ ] }
+        { v/n [ ctor execute v/ ] }
+        { norm-sq [ dup v. assert-positive ] }
+        { norm [ norm-sq sqrt ] }
+        { normalize [ dup norm v/n ] }
+        { distance [ v- norm ] }
+    } assoc-union
+    specialize-vector-words ;
+
+PRIVATE>
+
+\ 4float-array \ 4float-array-with float H{
+    { v+ [ [ (simd-v+) ] 4float-array-vv->v-op ] }
+    { v- [ [ (simd-v-) ] 4float-array-vv->v-op ] }
+    { v* [ [ (simd-v*) ] 4float-array-vv->v-op ] }
+    { v/ [ [ (simd-v/) ] 4float-array-vv->v-op ] }
+    { vmin [ [ (simd-vmin) ] 4float-array-vv->v-op ] }
+    { vmax [ [ (simd-vmax) ] 4float-array-vv->v-op ] }
+    { sum [ [ (simd-sum) ] 4float-array-v->n-op ] }
+} simd-vector-words
+
+\ 2double-array \ 2double-array-with float H{
+    { v+ [ [ (simd-v+) ] 2double-array-vv->v-op ] }
+    { v- [ [ (simd-v-) ] 2double-array-vv->v-op ] }
+    { v* [ [ (simd-v*) ] 2double-array-vv->v-op ] }
+    { v/ [ [ (simd-v/) ] 2double-array-vv->v-op ] }
+    { vmin [ [ (simd-vmin) ] 2double-array-vv->v-op ] }
+    { vmax [ [ (simd-vmax) ] 2double-array-vv->v-op ] }
+    { sum [ [ (simd-sum) ] 2double-array-v->n-op ] }
+} simd-vector-words
+
+>>
+
+! Synthesize 256-bit vectors from a pair of 128-bit vectors
+! Functorize this later so that we can do it for integers, etc
+TUPLE: 4double-array
+{ underlying1 byte-array initial: $[ 16 <byte-array> ] read-only }
+{ underlying2 byte-array initial: $[ 16 <byte-array> ] read-only } ;
+
+: <4double-array> ( -- simd-array )
+    16 <byte-array> 16 <byte-array> 4double-array boa ; inline
+
+: (4double-array) ( -- simd-array )
+    16 (byte-array) 16 (byte-array) 4double-array boa ; inline
+
+M: 4double-array clone
+    [ underlying1>> clone ] [ underlying2>> clone ] bi
+    4double-array boa ; inline
+
+M: 4double-array length drop 4 ; inline
+
+<PRIVATE
+
+: 4double-array-deref ( n seq -- n' seq' )
+    over 2 < [ underlying1>> ] [ [ 2 - ] dip underlying2>> ] if
+    2 swap double-array boa ; inline
+
+PRIVATE>
+
+M: 4double-array nth-unsafe
+    4double-array-deref nth-unsafe ; inline
+
+M: 4double-array set-nth-unsafe
+    4double-array-deref set-nth-unsafe ; inline
+
+: >4double-array ( seq -- simd-array )
+    4double-array new clone-like ;
+
+M: 4double-array like
+    drop dup 4double-array? [ >4double-array ] unless ; inline
+
+M: 4double-array new-sequence
+    drop dup 4 = [ drop (4double-array) ] [ 4 bad-length ] if ; inline
+
+M: 4double-array equal?
+    over 4double-array? [ sequence= ] [ 2drop f ] if ;
+
+M: 4double-array byte-length drop 32 ; inline
+
+SYNTAX: 4double-array{
+    \ } [ >4double-array ] parse-literal ;
+
+M: 4double-array pprint-delims
+    drop \ 4double-array{ \ } ;
+
+M: 4double-array >pprint-sequence ;
+
+M: 4double-array pprint* pprint-object ;
+
+INSTANCE: 4double-array sequence
+
+: 4double-array-with ( x -- simd-array )
+    dup [ >float 2double-array-rep (simd-broadcast) ] bi@
+    4double-array boa ; inline
+
+: 4double-array-boa ( a b c d -- simd-array )
+    [ >float ] 4 napply [ 2double-array-rep (simd-gather-2) ] 2bi@
+    4double-array boa ; inline
+
+! SIMD operations on 4double-arrays
+
+<PRIVATE
+
+: 4double-array-vv->v-op ( v1 v2 quot -- v3 )
+    [ [ [ underlying1>> ] bi@ 2double-array-rep ] dip call ]
+    [ [ [ underlying2>> ] bi@ 2double-array-rep ] dip call ] 3bi
+    4double-array boa ; inline
+
+: 4double-array-v->n-op ( v1 quot scalar-quot -- v2 )
+    [
+        [ [ underlying1>> 2double-array-rep ] dip call ]
+        [ [ underlying2>> 2double-array-rep ] dip call ] 2bi
+    ] dip call ; inline
+
+PRIVATE>
+
+<<
+
+\ 4double-array \ 4double-array-with float H{
+    { v+ [ [ (simd-v+) ] 4double-array-vv->v-op ] }
+    { v- [ [ (simd-v-) ] 4double-array-vv->v-op ] }
+    { v* [ [ (simd-v*) ] 4double-array-vv->v-op ] }
+    { v/ [ [ (simd-v/) ] 4double-array-vv->v-op ] }
+    { vmin [ [ (simd-vmin) ] 4double-array-vv->v-op ] }
+    { vmax [ [ (simd-vmax) ] 4double-array-vv->v-op ] }
+    { sum [ [ (simd-sum) ] [ + ] 4double-array-v->n-op ] }
+} simd-vector-words
+
+>>
+
+USE: vocabs.loader
+
+"math.vectors.simd.alien" require
--- a/basis/math/vectors/specialization/specialization.factor
+++ b/basis/math/vectors/specialization/specialization.factor
@ -2,7 +2,7 @@
 ! See http://factorcode.org/license.txt for BSD license.
 USING: words kernel make sequences effects kernel.private accessors
 combinators math math.intervals math.vectors namespaces assocs fry
-splitting classes.algebra generalizations
+splitting classes.algebra generalizations locals
 compiler.tree.propagation.info ;
 IN: math.vectors.specialization

@ -67,6 +67,7 @@ H{
    { vmin { +vector+ +vector+ -> +vector+ } }
    { vneg { +vector+ -> +vector+ } }
    { vtruncate { +vector+ -> +vector+ } }
+    { sum { +vector+ -> +scalar+ } }
 }

 SYMBOL: specializations
@ -82,19 +83,23 @@ specializations [ vector-words keys [ V{ } clone ] H{ } map>assoc ] initialize

 : outputs ( schema -- seq ) { -> } split second ;

-: specialize-vector-word ( word array-type elt-type -- word' )
+: loop-vector-op ( word array-type elt-type -- word' )
    pick word-schema
    [ inputs (specialize-vector-word) ]
    [ outputs record-output-signature ] 3bi ;

-: input-signature ( word -- signature ) def>> first ;
+:: specialize-vector-word ( word array-type elt-type simd -- word/quot' )
+    word simd key? [ word simd at ] [ word array-type elt-type loop-vector-op ] if ;

-: specialize-vector-words ( array-type elt-type -- )
-    [ vector-words keys ] 2dip
-    '[
-        [ _ _ specialize-vector-word ] keep
-        [ dup input-signature ] dip
-        add-specialization
+:: input-signature ( word array-type elt-type -- signature )
+    array-type elt-type word word-schema inputs signature-for-schema ;
+
+:: specialize-vector-words ( array-type elt-type simd -- )
+    vector-words keys [
+        [ array-type elt-type simd specialize-vector-word ]
+        [ array-type elt-type input-signature ]
+        [ ]
+        tri add-specialization
    ] each ;

 : find-specialization ( classes word -- word/f )