Initial implementation of SSE vector intrinsics:

- cpu.architecture: add SSE vector representations - compiler.cfg.intrinsics.alien: remove an attempt at optimization that value numbering handles now - compiler.cfg.representations: support instructions where the representation is set in the 'rep' slot, and support conversions between single and double floats - alien-float, set-alien-float now use the single float representation, and the conversion is implicit; this fixes a long-standing bug where a register could get clobbered because of how %set-alien-float was defined on x86 - math.vectors.specialization: add support for SIMD specialization (where the vector word's body is replaced by another quotation), also specialize the 'sum' word - math.vectors.simd: 4float-array, 2double-array, 4double-array types, and specializers for the math.vectors words
2009-09-03 02:33:07 -05:00 · 2009-09-03 02:33:07 -05:00 · ff8c70dbe0
parent c719b92363
commit ff8c70dbe0
23 changed files with 1235 additions and 101 deletions
--- a/basis/compiler/cfg/instructions/instructions.factor
+++ b/basis/compiler/cfg/instructions/instructions.factor
@ -91,6 +91,11 @@ INSN: ##set-string-nth-fast
 use: src/int-rep obj/int-rep index/int-rep
 temp: temp/int-rep ;
 PURE-INSN: ##copy
 def: dst
 use: src
 literal: rep ;
 ! Integer arithmetic
 PURE-INSN: ##add
 def: dst/int-rep
@ -201,6 +206,15 @@ use: src/int-rep
 temp: temp/int-rep ;
 ! Float arithmetic
 PURE-INSN: ##unbox-float
 def: dst/double-float-rep
 use: src/int-rep ;
 PURE-INSN: ##box-float
 def: dst/int-rep
 use: src/double-float-rep
 temp: temp/int-rep ;
 PURE-INSN: ##add-float
 def: dst/double-float-rep
 use: src1/double-float-rep src2/double-float-rep ;
@ -240,6 +254,15 @@ def: dst/double-float-rep
 use: src1/double-float-rep src2/double-float-rep
 literal: func ;
 ! Single/double float conversion
 PURE-INSN: ##single>double-float
 def: dst/double-float-rep
 use: src/single-float-rep ;
 PURE-INSN: ##double>single-float
 def: dst/single-float-rep
 use: src/double-float-rep ;
 ! Float/integer conversion
 PURE-INSN: ##float>integer
 def: dst/int-rep
@ -249,26 +272,80 @@ PURE-INSN: ##integer>float
 def: dst/double-float-rep
 use: src/int-rep ;
-! Boxing and unboxing
+! SIMD operations
-PURE-INSN: ##copy
+
 INSN: ##box-vector
 def: dst/int-rep
 use: src
 literal: rep
 temp: temp/int-rep ;
 INSN: ##unbox-vector
 def: dst
 use: src/int-rep
 literal: rep ;
 INSN: ##broadcast-vector
 def: dst
 use: src/scalar-rep
 literal: rep ;
 INSN: ##gather-vector-2
 def: dst
 use: src1/scalar-rep src2/scalar-rep
 literal: rep ;
 INSN: ##gather-vector-4
 def: dst
 use: src1/scalar-rep src2/scalar-rep src3/scalar-rep src4/scalar-rep
 literal: rep ;
 INSN: ##add-vector
 def: dst
 use: src1 src2
 literal: rep ;
 INSN: ##sub-vector
 def: dst
 use: src1 src2
 literal: rep ;
 INSN: ##mul-vector
 def: dst
 use: src1 src2
 literal: rep ;
 INSN: ##div-vector
 def: dst
 use: src1 src2
 literal: rep ;
 INSN: ##min-vector
 def: dst
 use: src1 src2
 literal: rep ;
 INSN: ##max-vector
 def: dst
 use: src1 src2
 literal: rep ;
 INSN: ##sqrt-vector
 def: dst
 use: src
 literal: rep ;
-PURE-INSN: ##unbox-float
+INSN: ##horizontal-add-vector
-def: dst/double-float-rep
+def: dst/scalar-rep
-use: src/int-rep ;
+use: src
 literal: rep ;
 ! Boxing and unboxing aliens
 PURE-INSN: ##unbox-any-c-ptr
 def: dst/int-rep
 use: src/int-rep
 temp: temp/int-rep ;
 PURE-INSN: ##box-float
 def: dst/int-rep
 use: src/double-float-rep
 temp: temp/int-rep ;
 PURE-INSN: ##box-alien
 def: dst/int-rep
 use: src/int-rep
@ -322,13 +399,18 @@ def: dst/int-rep
 use: src/int-rep ;
 INSN: ##alien-float
-def: dst/double-float-rep
+def: dst/single-float-rep
 use: src/int-rep ;
 INSN: ##alien-double
 def: dst/double-float-rep
 use: src/int-rep ;
 INSN: ##alien-vector
 def: dst
 use: src/int-rep
 literal: rep ;
 INSN: ##set-alien-integer-1
 use: src/int-rep value/int-rep ;
@ -342,11 +424,15 @@ INSN: ##set-alien-cell
 use: src/int-rep value/int-rep ;
 INSN: ##set-alien-float
-use: src/int-rep value/double-float-rep ;
+use: src/int-rep value/single-float-rep ;
 INSN: ##set-alien-double
 use: src/int-rep value/double-float-rep ;
 INSN: ##set-alien-vector
 use: src/int-rep value
 literal: rep ;
 ! Memory allocation
 INSN: ##allot
 def: dst/int-rep
@ -510,6 +596,7 @@ literal: n ;
 UNION: ##allocation
 ##allot
 ##box-float
 ##box-vector
 ##box-alien
 ##box-displaced-alien
 ##integer>bignum ;
--- a/basis/compiler/cfg/instructions/syntax/syntax.factor
+++ b/basis/compiler/cfg/instructions/syntax/syntax.factor
@ -7,11 +7,20 @@ IN: compiler.cfg.instructions.syntax
 SYMBOLS: def use temp literal constant ;
 SYMBOL: scalar-rep
 TUPLE: insn-slot-spec type name rep ;
 : parse-rep ( str/f -- rep )
    {
        { [ dup not ] [ ] }
        { [ dup "scalar-rep" = ] [ drop scalar-rep ] }
        [ "cpu.architecture" lookup ]
    } cond ;
 : parse-insn-slot-spec ( type string -- spec )
    over [ "Missing type" throw ] unless
-    "/" split1 dup [ "cpu.architecture" lookup ] when
+    "/" split1 parse-rep
    insn-slot-spec boa ;
 : parse-insn-slot-specs ( seq -- specs )
--- a/basis/compiler/cfg/intrinsics/alien/alien.factor
+++ b/basis/compiler/cfg/intrinsics/alien/alien.factor
@ -20,22 +20,14 @@ IN: compiler.cfg.intrinsics.alien
        ^^box-displaced-alien ds-push
    ] [ emit-primitive ] if ;
 : (prepare-alien-accessor-imm) ( class offset -- offset-vreg )
    ds-drop [ ds-pop swap ^^unbox-c-ptr ] dip ^^add-imm ;
 : (prepare-alien-accessor) ( class -- offset-vreg )
    [ 2inputs ^^untag-fixnum swap ] dip ^^unbox-c-ptr ^^add ;
 : prepare-alien-accessor ( infos -- offset-vreg )
-    <reversed> [ second class>> ] [ first ] bi
+    <reversed> second class>>
-    dup value-info-small-fixnum? [
+    [ 2inputs ^^untag-fixnum swap ] dip ^^unbox-c-ptr ^^add ;
        literal>> (prepare-alien-accessor-imm)
    ] [ drop (prepare-alien-accessor) ] if ;
 :: inline-alien ( node quot test -- )
    [let | infos [ node node-input-infos ] |
        infos test call
-        [ infos prepare-alien-accessor quot call ]
+        [ infos quot call ]
        [ node emit-primitive ]
        if
    ] ; inline
@ -46,7 +38,7 @@ IN: compiler.cfg.intrinsics.alien
    bi and ;
 : inline-alien-getter ( node quot -- )
-    '[ @ ds-push ]
+    '[ prepare-alien-accessor @ ds-push ]
    [ inline-alien-getter? ] inline-alien ; inline
 : inline-alien-setter? ( infos class -- ? )
@ -56,18 +48,17 @@ IN: compiler.cfg.intrinsics.alien
    tri and and ;
 : inline-alien-integer-setter ( node quot -- )
-    '[ ds-pop ^^untag-fixnum @ ]
+    '[ prepare-alien-accessor ds-pop ^^untag-fixnum @ ]
    [ fixnum inline-alien-setter? ]
    inline-alien ; inline
 : inline-alien-cell-setter ( node quot -- )
-    [ dup node-input-infos first class>> ] dip
+    '[ [ prepare-alien-accessor ds-pop ] [ first class>> ] bi ^^unbox-c-ptr @ ]
    '[ ds-pop _ ^^unbox-c-ptr @ ]
    [ pinned-c-ptr inline-alien-setter? ]
    inline-alien ; inline
 : inline-alien-float-setter ( node quot -- )
-    '[ ds-pop @ ]
+    '[ prepare-alien-accessor ds-pop @ ]
    [ float inline-alien-setter? ]
    inline-alien ; inline
--- a/basis/compiler/cfg/intrinsics/intrinsics.factor
+++ b/basis/compiler/cfg/intrinsics/intrinsics.factor
@ -7,6 +7,7 @@ compiler.cfg.intrinsics.alien
 compiler.cfg.intrinsics.allot
 compiler.cfg.intrinsics.fixnum
 compiler.cfg.intrinsics.float
 compiler.cfg.intrinsics.simd
 compiler.cfg.intrinsics.slots
 compiler.cfg.intrinsics.misc
 compiler.cfg.comparisons ;
@ -22,6 +23,9 @@ QUALIFIED: classes.tuple.private
 QUALIFIED: math.private
 QUALIFIED: math.integers.private
 QUALIFIED: math.floats.private
 QUALIFIED: math.vectors.simd
 QUALIFIED: math.vectors.simd.private
 QUALIFIED: math.vectors.simd.alien
 QUALIFIED: math.libm
 IN: compiler.cfg.intrinsics
@ -142,5 +146,27 @@ IN: compiler.cfg.intrinsics
        { math.integers.private:fixnum-log2 [ drop emit-fixnum-log2 ] }
    } enable-intrinsics ;
 : enable-sse2-simd ( -- )
    {
        { math.vectors.simd.private:assert-positive [ drop ] }
        { math.vectors.simd.private:(simd-v+) [ [ ^^add-vector ] emit-binary-vector-op ] }
        { math.vectors.simd.private:(simd-v-) [ [ ^^sub-vector ] emit-binary-vector-op ] }
        { math.vectors.simd.private:(simd-v*) [ [ ^^mul-vector ] emit-binary-vector-op ] }
        { math.vectors.simd.private:(simd-v/) [ [ ^^div-vector ] emit-binary-vector-op ] }
        { math.vectors.simd.private:(simd-vmin) [ [ ^^min-vector ] emit-binary-vector-op ] }
        { math.vectors.simd.private:(simd-vmax) [ [ ^^max-vector ] emit-binary-vector-op ] }
        { math.vectors.simd.private:(simd-vsqrt) [ [ ^^sqrt-vector ] emit-binary-vector-op ] }
        { math.vectors.simd.private:(simd-broadcast) [ [ ^^broadcast-vector ] emit-unary-vector-op ] }
        { math.vectors.simd.private:(simd-gather-2) [ emit-gather-vector-2 ] }
        { math.vectors.simd.private:(simd-gather-4) [ emit-gather-vector-4 ] }
        { math.vectors.simd.alien:alien-vector [ emit-alien-vector ] }
        { math.vectors.simd.alien:set-alien-vector [ emit-set-alien-vector ] }
    } enable-intrinsics ;
 : enable-sse3-simd ( -- )
    {
        { math.vectors.simd.private:(simd-sum) [ [ ^^horizontal-add-vector ] emit-unary-vector-op ] }
    } enable-intrinsics ;
 : emit-intrinsic ( node word -- )
    "intrinsic" word-prop call( node -- ) ;
--- a/basis/compiler/cfg/intrinsics/simd/authors.txt
+++ b/basis/compiler/cfg/intrinsics/simd/authors.txt
@ -0,0 +1 @@
 Slava Pestov
--- a/basis/compiler/cfg/intrinsics/simd/simd.factor
+++ b/basis/compiler/cfg/intrinsics/simd/simd.factor
@ -0,0 +1,57 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors byte-arrays fry cpu.architecture kernel
 sequences compiler.tree.propagation.info
 compiler.cfg.builder.blocks compiler.cfg.stacks
 compiler.cfg.stacks.local compiler.cfg.hats
 compiler.cfg.instructions compiler.cfg.registers
 compiler.cfg.intrinsics.alien ;
 IN: compiler.cfg.intrinsics.simd
 : emit-vector-op ( node quot: ( rep -- ) -- )
    [ dup node-input-infos last literal>> ] dip over representation?
    [ [ drop ] 2dip call ] [ 2drop emit-primitive ] if ; inline
 : emit-binary-vector-op ( node quot -- )
    '[ [ ds-drop 2inputs ] dip @ ds-push ] emit-vector-op ; inline
 : emit-unary-vector-op ( node quot -- )
    '[ [ ds-drop ds-pop ] dip @ ds-push ] emit-vector-op ; inline
 : emit-gather-vector-2 ( node -- )
    [ ^^gather-vector-2 ] emit-binary-vector-op ;
 : emit-gather-vector-4 ( node -- )
    [
        ds-drop
        [
            D 3 peek-loc
            D 2 peek-loc
            D 1 peek-loc
            D 0 peek-loc
            -4 inc-d
        ] dip
        ^^gather-vector-4
        ds-push
    ] emit-vector-op ;
 : inline-alien-vector-setter ( node quot -- )
    '[ ds-drop prepare-alien-accessor ds-pop @ ]
    [ byte-array inline-alien-setter? ]
    inline-alien ; inline
 : emit-alien-vector ( node -- )
    dup [
        '[
            ds-drop prepare-alien-accessor
            _ ^^alien-vector ds-push
        ]
        [ inline-alien-getter? ] inline-alien
    ] with emit-vector-op ;
 : emit-set-alien-vector ( node -- )
    dup [
        '[
            _ ##set-alien-vector
        ] inline-alien-vector-setter
    ] with emit-vector-op ;
--- a/basis/compiler/cfg/representations/preferred/preferred.factor
+++ b/basis/compiler/cfg/representations/preferred/preferred.factor
@ -1,7 +1,7 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: kernel accessors sequences arrays fry namespaces generic
-words sets cpu.architecture compiler.units
+words sets combinators generalizations cpu.architecture compiler.units
 compiler.cfg.utilities compiler.cfg compiler.cfg.rpo
 compiler.cfg.instructions compiler.cfg.instructions.syntax
 compiler.cfg.def-use ;
@ -13,35 +13,41 @@ GENERIC: uses-vreg-reps ( insn -- reps )
 <PRIVATE
 : rep-getter-quot ( rep -- quot )
    {
        { f [ [ rep>> ] ] }
        { scalar-rep [ [ rep>> scalar-rep-of ] ] }
        [ '[ _ nip ] ]
    } case ;
 : define-defs-vreg-rep-method ( insn -- )
    [ \ defs-vreg-rep create-method ]
-    [ insn-def-slot dup [ rep>> ] when '[ drop _ ] ] bi
+    [ insn-def-slot [ rep>> rep-getter-quot ] [ [ drop f ] ] if* ]
-    define ;
+    bi define ;
 : reps-getter-quot ( reps -- quot )
    [ rep>> rep-getter-quot ] map dup length '[ _ cleave _ narray ] ;
 : define-uses-vreg-reps-method ( insn -- )
    [ \ uses-vreg-reps create-method ]
-    [ insn-use-slots [ rep>> ] map '[ drop _ ] ] bi
+    [ insn-use-slots reps-getter-quot ]
-    define ;
+    bi define ;
 : define-temp-vreg-reps-method ( insn -- )
    [ \ temp-vreg-reps create-method ]
-    [ insn-temp-slots [ rep>> ] map '[ drop _ ] ] bi
+    [ insn-temp-slots reps-getter-quot ]
-    define ;
+    bi define ;
 PRIVATE>
 [
    insn-classes get
-    [ { ##copy } diff [ define-defs-vreg-rep-method ] each ]
+    [ [ define-defs-vreg-rep-method ] each ]
-    [ { ##copy ##phi } diff [ define-uses-vreg-reps-method ] each ]
+    [ { ##phi } diff [ define-uses-vreg-reps-method ] each ]
    [ [ define-temp-vreg-reps-method ] each ]
    tri
 ] with-compilation-unit
 M: ##copy defs-vreg-rep rep>> ;
 M: ##copy uses-vreg-reps rep>> 1array ;
 : each-def-rep ( insn vreg-quot: ( vreg rep -- ) -- )
    [ [ defs-vreg ] [ defs-vreg-rep ] bi ] dip with when* ; inline
--- a/basis/compiler/cfg/representations/representations.factor
+++ b/basis/compiler/cfg/representations/representations.factor
@ -5,6 +5,7 @@ arrays combinators make locals deques dlists
 cpu.architecture compiler.utilities
 compiler.cfg
 compiler.cfg.rpo
 compiler.cfg.hats
 compiler.cfg.registers
 compiler.cfg.instructions
 compiler.cfg.def-use
@ -16,13 +17,47 @@ IN: compiler.cfg.representations
 ! Virtual register representation selection.
 ERROR: bad-conversion dst src dst-rep src-rep ;
 GENERIC: emit-box ( dst src rep -- )
 GENERIC: emit-unbox ( dst src rep -- )
 M: single-float-rep emit-box
    drop
    [ double-float-rep next-vreg-rep dup ] dip ##single>double-float
    int-rep next-vreg-rep ##box-float ;
 M: single-float-rep emit-unbox
    drop
    [ double-float-rep next-vreg-rep dup ] dip ##unbox-float
    ##double>single-float ;
 M: double-float-rep emit-box
    drop
    int-rep next-vreg-rep ##box-float ;
 M: double-float-rep emit-unbox
    drop ##unbox-float ;
 M: vector-rep emit-box
    int-rep next-vreg-rep ##box-vector ;
 M: vector-rep emit-unbox
    ##unbox-vector ;
 : emit-conversion ( dst src dst-rep src-rep -- )
-    2array {
+    {
-        { { int-rep int-rep } [ int-rep ##copy ] }
+        { [ 2dup eq? ] [ drop ##copy ] }
-        { { double-float-rep double-float-rep } [ double-float-rep ##copy ] }
+        { [ dup int-rep eq? ] [ drop emit-unbox ] }
-        { { double-float-rep int-rep } [ ##unbox-float ] }
+        { [ over int-rep eq? ] [ nip emit-box ] }
-        { { int-rep double-float-rep } [ int-rep next-vreg-rep ##box-float ] }
+        [
-    } case ;
+            2array {
                { { double-float-rep single-float-rep } [ ##single>double-float ] }
                { { single-float-rep double-float-rep } [ ##double>single-float ] }
                [ first2 bad-conversion ]
            } case
        ]
    } cond ;
 <PRIVATE
--- a/basis/compiler/cfg/two-operand/two-operand.factor
+++ b/basis/compiler/cfg/two-operand/two-operand.factor
@ -45,7 +45,13 @@ UNION: two-operand-insn
    ##mul-float
    ##div-float
    ##min-float
-    ##max-float ;
+    ##max-float
    ##add-vector
    ##sub-vector
    ##mul-vector
    ##div-vector
    ##min-vector
    ##max-vector ;
 GENERIC: convert-two-operand* ( insn -- )
--- a/basis/compiler/codegen/codegen.factor
+++ b/basis/compiler/codegen/codegen.factor
@ -144,8 +144,11 @@ CODEGEN: ##min %min
 CODEGEN: ##max %max
 CODEGEN: ##not %not
 CODEGEN: ##log2 %log2
 CODEGEN: ##copy %copy
 CODEGEN: ##integer>bignum %integer>bignum
 CODEGEN: ##bignum>integer %bignum>integer
 CODEGEN: ##unbox-float %unbox-float
 CODEGEN: ##box-float %box-float
 CODEGEN: ##add-float %add-float
 CODEGEN: ##sub-float %sub-float
 CODEGEN: ##mul-float %mul-float
@ -155,12 +158,24 @@ CODEGEN: ##max-float %max-float
 CODEGEN: ##sqrt %sqrt
 CODEGEN: ##unary-float-function %unary-float-function
 CODEGEN: ##binary-float-function %binary-float-function
 CODEGEN: ##single>double-float %single>double-float
 CODEGEN: ##double>single-float %double>single-float
 CODEGEN: ##integer>float %integer>float
 CODEGEN: ##float>integer %float>integer
-CODEGEN: ##copy %copy
+CODEGEN: ##unbox-vector %unbox-vector
-CODEGEN: ##unbox-float %unbox-float
+CODEGEN: ##broadcast-vector %broadcast-vector
 CODEGEN: ##gather-vector-2 %gather-vector-2
 CODEGEN: ##gather-vector-4 %gather-vector-4
 CODEGEN: ##box-vector %box-vector
 CODEGEN: ##add-vector %add-vector
 CODEGEN: ##sub-vector %sub-vector
 CODEGEN: ##mul-vector %mul-vector
 CODEGEN: ##div-vector %div-vector
 CODEGEN: ##min-vector %min-vector
 CODEGEN: ##max-vector %max-vector
 CODEGEN: ##sqrt-vector %sqrt-vector
 CODEGEN: ##horizontal-add-vector %horizontal-add-vector
 CODEGEN: ##unbox-any-c-ptr %unbox-any-c-ptr
 CODEGEN: ##box-float %box-float
 CODEGEN: ##box-alien %box-alien
 CODEGEN: ##box-displaced-alien %box-displaced-alien
 CODEGEN: ##alien-unsigned-1 %alien-unsigned-1
@ -172,12 +187,14 @@ CODEGEN: ##alien-signed-4 %alien-signed-4
 CODEGEN: ##alien-cell %alien-cell
 CODEGEN: ##alien-float %alien-float
 CODEGEN: ##alien-double %alien-double
 CODEGEN: ##alien-vector %alien-vector
 CODEGEN: ##set-alien-integer-1 %set-alien-integer-1
 CODEGEN: ##set-alien-integer-2 %set-alien-integer-2
 CODEGEN: ##set-alien-integer-4 %set-alien-integer-4
 CODEGEN: ##set-alien-cell %set-alien-cell
 CODEGEN: ##set-alien-float %set-alien-float
 CODEGEN: ##set-alien-double %set-alien-double
 CODEGEN: ##set-alien-vector %set-alien-vector
 CODEGEN: ##allot %allot
 CODEGEN: ##write-barrier %write-barrier
 CODEGEN: ##compare %compare
--- a/basis/compiler/tests/codegen.factor
+++ b/basis/compiler/tests/codegen.factor
@ -413,3 +413,5 @@ cell 4 = [
 [ 1 "0.169967142900241" ] [ 1.4 1 [ swap fcos ] compile-call number>string ] unit-test
 [ "0.169967142900241" "0.9854497299884601" ] [ 1.4 [ [ fcos ] [ fsin ] bi ] compile-call [ number>string ] bi@ ] unit-test
 [ 1 "0.169967142900241" "0.9854497299884601" ] [ 1.4 1 [ swap >float [ fcos ] [ fsin ] bi ] compile-call [ number>string ] bi@ ] unit-test
 [ 6.0 ] [ 1.0 [ >float 3.0 + [ B{ 0 0 0 0 } 0 set-alien-float ] [ 2.0 + ] bi ] compile-call ] unit-test
--- a/basis/compiler/tree/propagation/simd/simd.factor
+++ b/basis/compiler/tree/propagation/simd/simd.factor
@ -0,0 +1,43 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors byte-arrays combinators
 compiler.tree.propagation.info cpu.architecture kernel words math
 math.intervals math.vectors.simd math.vectors.simd.private
 math.vectors.simd.alien ;
 IN: compiler.tree.propagation.simd
 \ (simd-v+) { byte-array } "default-output-classes" set-word-prop
 \ (simd-v-) { byte-array } "default-output-classes" set-word-prop
 \ (simd-v*) { byte-array } "default-output-classes" set-word-prop
 \ (simd-v/) { byte-array } "default-output-classes" set-word-prop
 \ (simd-vmin) { byte-array } "default-output-classes" set-word-prop
 \ (simd-vmax) { byte-array } "default-output-classes" set-word-prop
 \ (simd-vsqrt) { byte-array } "default-output-classes" set-word-prop
 \ (simd-sum) [
    nip dup literal?>> [
        literal>> scalar-rep-of {
            { single-float-rep [ float ] }
            { double-float-rep [ float ] }
        } case
    ] [ drop real ] if
    <class-info>
 ] "outputs" set-word-prop
 \ (simd-broadcast) { byte-array } "default-output-classes" set-word-prop
 \ (simd-gather-2) { byte-array } "default-output-classes" set-word-prop
 \ (simd-gather-4) { byte-array } "default-output-classes" set-word-prop
 \ assert-positive [
    real [0,inf] <class/interval-info> value-info-intersect
 ] "outputs" set-word-prop
 \ alien-vector { byte-array } "default-output-classes" set-word-prop
--- a/basis/cpu/architecture/architecture.factor
+++ b/basis/cpu/architecture/architecture.factor
@ -20,7 +20,33 @@ SINGLETONS: tagged-rep int-rep ;
 ! one of these representations
 SINGLETONS: single-float-rep double-float-rep ;
-UNION: representation any-rep tagged-rep int-rep single-float-rep double-float-rep ;
+SINGLETONS:
 4float-array-rep
 2double-array-rep
 16char-array-rep
 16uchar-array-rep
 8short-array-rep
 8ushort-array-rep
 4int-array-rep
 4uint-array-rep ;
 UNION: vector-rep
 4float-array-rep
 2double-array-rep
 16char-array-rep
 16uchar-array-rep
 8short-array-rep
 8ushort-array-rep
 4int-array-rep
 4uint-array-rep ;
 UNION: representation
 any-rep
 tagged-rep
 int-rep
 single-float-rep
 double-float-rep
 vector-rep ;
 ! Register classes
 SINGLETONS: int-regs float-regs ;
@ -31,23 +57,28 @@ CONSTANT: reg-classes { int-regs float-regs }
 ! A pseudo-register class for parameters spilled on the stack
 SINGLETON: stack-params
-: reg-class-of ( rep -- reg-class )
+GENERIC: reg-class-of ( rep -- reg-class )
    {
        { tagged-rep [ int-regs ] }
        { int-rep [ int-regs ] }
        { single-float-rep [ float-regs ] }
        { double-float-rep [ float-regs ] }
        { stack-params [ stack-params ] }
    } case ;
-: rep-size ( rep -- n )
+M: tagged-rep reg-class-of drop int-regs ;
-    {
+M: int-rep reg-class-of drop int-regs ;
-        { tagged-rep [ cell ] }
+M: single-float-rep reg-class-of drop float-regs ;
-        { int-rep [ cell ] }
+M: double-float-rep reg-class-of drop float-regs ;
-        { single-float-rep [ 4 ] }
+M: vector-rep reg-class-of drop float-regs ;
-        { double-float-rep [ 8 ] }
+M: stack-params reg-class-of drop stack-params ;
-        { stack-params [ cell ] }
+
-    } case ;
+GENERIC: rep-size ( rep -- n )
 M: tagged-rep rep-size drop cell ;
 M: int-rep rep-size drop cell ;
 M: single-float-rep rep-size drop 4 ;
 M: double-float-rep rep-size drop 8 ;
 M: stack-params rep-size drop cell ;
 M: vector-rep rep-size drop 16 ;
 GENERIC: scalar-rep-of ( rep -- rep' )
 M: 4float-array-rep scalar-rep-of drop single-float-rep ;
 M: 2double-array-rep scalar-rep-of drop double-float-rep ;
 ! Mapping from register class to machine registers
 HOOK: machine-registers cpu ( -- assoc )
@ -101,6 +132,8 @@ HOOK: %max     cpu ( dst src1 src2 -- )
 HOOK: %not     cpu ( dst src -- )
 HOOK: %log2    cpu ( dst src -- )
 HOOK: %copy cpu ( dst src rep -- )
 HOOK: %fixnum-add cpu ( label dst src1 src2 -- )
 HOOK: %fixnum-sub cpu ( label dst src1 src2 -- )
 HOOK: %fixnum-mul cpu ( label dst src1 src2 -- )
@ -108,6 +141,9 @@ HOOK: %fixnum-mul cpu ( label dst src1 src2 -- )
 HOOK: %integer>bignum cpu ( dst src temp -- )
 HOOK: %bignum>integer cpu ( dst src temp -- )
 HOOK: %unbox-float cpu ( dst src -- )
 HOOK: %box-float cpu ( dst src temp -- )
 HOOK: %add-float cpu ( dst src1 src2 -- )
 HOOK: %sub-float cpu ( dst src1 src2 -- )
 HOOK: %mul-float cpu ( dst src1 src2 -- )
@ -118,13 +154,29 @@ HOOK: %sqrt cpu ( dst src -- )
 HOOK: %unary-float-function cpu ( dst src func -- )
 HOOK: %binary-float-function cpu ( dst src1 src2 func -- )
 HOOK: %single>double-float cpu ( dst src -- )
 HOOK: %double>single-float cpu ( dst src -- )
 HOOK: %integer>float cpu ( dst src -- )
 HOOK: %float>integer cpu ( dst src -- )
-HOOK: %copy cpu ( dst src rep -- )
+HOOK: %box-vector cpu ( dst src temp rep -- )
-HOOK: %unbox-float cpu ( dst src -- )
+HOOK: %unbox-vector cpu ( dst src rep -- )
 HOOK: %broadcast-vector cpu ( dst src rep -- )
 HOOK: %gather-vector-2 cpu ( dst src1 src2 rep -- )
 HOOK: %gather-vector-4 cpu ( dst src1 src2 src3 src4 rep -- )
 HOOK: %add-vector cpu ( dst src1 src2 rep -- )
 HOOK: %sub-vector cpu ( dst src1 src2 rep -- )
 HOOK: %mul-vector cpu ( dst src1 src2 rep -- )
 HOOK: %div-vector cpu ( dst src1 src2 rep -- )
 HOOK: %min-vector cpu ( dst src1 src2 rep -- )
 HOOK: %max-vector cpu ( dst src1 src2 rep -- )
 HOOK: %sqrt-vector cpu ( dst src rep -- )
 HOOK: %horizontal-add-vector cpu ( dst src rep -- )
 HOOK: %unbox-any-c-ptr cpu ( dst src temp -- )
 HOOK: %box-float cpu ( dst src temp -- )
 HOOK: %box-alien cpu ( dst src temp -- )
 HOOK: %box-displaced-alien cpu ( dst displacement base temp1 temp2 base-class -- )
@ -137,6 +189,7 @@ HOOK: %alien-signed-4   cpu ( dst src -- )
 HOOK: %alien-cell       cpu ( dst src -- )
 HOOK: %alien-float      cpu ( dst src -- )
 HOOK: %alien-double     cpu ( dst src -- )
 HOOK: %alien-vector     cpu ( dst src rep -- )
 HOOK: %set-alien-integer-1 cpu ( ptr value -- )
 HOOK: %set-alien-integer-2 cpu ( ptr value -- )
@ -144,6 +197,7 @@ HOOK: %set-alien-integer-4 cpu ( ptr value -- )
 HOOK: %set-alien-cell      cpu ( ptr value -- )
 HOOK: %set-alien-float     cpu ( ptr value -- )
 HOOK: %set-alien-double    cpu ( ptr value -- )
 HOOK: %set-alien-vector    cpu ( ptr value rep -- )
 HOOK: %alien-global cpu ( dst symbol library -- )
--- a/basis/cpu/x86/x86.factor
+++ b/basis/cpu/x86/x86.factor
@ -4,7 +4,7 @@ USING: accessors assocs alien alien.c-types arrays strings
 cpu.x86.assembler cpu.x86.assembler.private cpu.x86.assembler.operands
 cpu.architecture kernel kernel.private math memory namespaces make
 sequences words system layouts combinators math.order fry locals
-compiler.constants
+compiler.constants byte-arrays
 compiler.cfg.registers
 compiler.cfg.instructions
 compiler.cfg.intrinsics
@ -130,6 +130,21 @@ M: x86 %max     nip [ CMP ] [ CMOVL ] 2bi ;
 M: x86 %not     drop NOT ;
 M: x86 %log2    BSR ;
 GENERIC: copy-register* ( dst src rep -- )
 M: int-rep copy-register* drop MOV ;
 M: tagged-rep copy-register* drop MOV ;
 M: single-float-rep copy-register* drop MOVSS ;
 M: double-float-rep copy-register* drop MOVSD ;
 M: 4float-array-rep copy-register* drop MOVUPS ;
 M: 2double-array-rep copy-register* drop MOVUPD ;
 M: vector-rep copy-register* drop MOVDQU ;
 : copy-register ( dst src rep -- )
    2over eq? [ 3drop ] [ copy-register* ] if ;
 M: x86 %copy ( dst src rep -- ) copy-register ;
 :: overflow-template ( label dst src1 src2 insn -- )
    src1 src2 insn call
    label JO ; inline
@ -211,24 +226,120 @@ M: x86 %min-float nip MINSD ;
 M: x86 %max-float nip MAXSD ;
 M: x86 %sqrt SQRTSD ;
 M: x86 %single>double-float CVTSS2SD ;
 M: x86 %double>single-float CVTSD2SS ;
 M: x86 %integer>float CVTSI2SD ;
 M: x86 %float>integer CVTTSD2SI ;
 GENERIC: copy-register* ( dst src rep -- )
 M: int-rep copy-register* drop MOV ;
 M: tagged-rep copy-register* drop MOV ;
 M: single-float-rep copy-register* drop MOVSS ;
 M: double-float-rep copy-register* drop MOVSD ;
 : copy-register ( dst src rep -- )
    2over eq? [ 3drop ] [ copy-register* ] if ;
 M: x86 %copy ( dst src rep -- ) copy-register ;
 M: x86 %unbox-float ( dst src -- )
    float-offset [+] MOVSD ;
 M:: x86 %box-float ( dst src temp -- )
    dst 16 float temp %allot
    dst float-offset [+] src MOVSD ;
 M:: x86 %box-vector ( dst src rep temp -- )
    dst rep rep-size 2 cells + byte-array temp %allot
    16 tag-fixnum dst 1 byte-array tag-number %set-slot-imm
    dst byte-array-offset [+]
    src rep copy-register ;
 M:: x86 %unbox-vector ( dst src rep -- )
    dst src byte-array-offset [+]
    rep copy-register ;
 M: x86 %broadcast-vector ( dst src rep -- )
    {
        { 4float-array-rep [ [ MOVAPS ] [ drop dup 0 SHUFPS ] 2bi ] }
        { 2double-array-rep [ [ MOVAPD ] [ drop dup 0 SHUFPD ] 2bi ] }
    } case ;
 M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- )
    rep {
        {
            4float-array-rep
            [
                dst src1 MOVSS
                dst src2 UNPCKLPS
                src3 src4 UNPCKLPS
                dst src3 HEX: 44 SHUFPS
            ]
        }
    } case ;
 M:: x86 %gather-vector-2 ( dst src1 src2 rep -- )
    rep {
        {
            2double-array-rep
            [
                dst src1 MOVAPD
                dst src2 0 SHUFPD
            ]
        }
    } case ;
 M: x86 %add-vector ( dst src1 src2 rep -- )
    {
        { 4float-array-rep [ ADDPS ] }
        { 2double-array-rep [ ADDPD ] }
        { 16char-array-rep [ PADDB ] }
        { 16uchar-array-rep [ PADDB ] }
        { 8short-array-rep [ PADDW ] }
        { 8ushort-array-rep [ PADDW ] }
        { 4int-array-rep [ PADDD ] }
        { 4uint-array-rep [ PADDD ] }
    } case drop ;
 M: x86 %sub-vector ( dst src1 src2 rep -- )
    {
        { 4float-array-rep [ SUBPS ] }
        { 2double-array-rep [ SUBPD ] }
        { 16char-array-rep [ PSUBB ] }
        { 16uchar-array-rep [ PSUBB ] }
        { 8short-array-rep [ PSUBW ] }
        { 8ushort-array-rep [ PSUBW ] }
        { 4int-array-rep [ PSUBD ] }
        { 4uint-array-rep [ PSUBD ] }
    } case drop ;
 M: x86 %mul-vector ( dst src1 src2 rep -- )
    {
        { 4float-array-rep [ MULPS ] }
        { 2double-array-rep [ MULPD ] }
        { 4int-array-rep [ PMULLW ] }
    } case drop ;
 M: x86 %div-vector ( dst src1 src2 rep -- )
    {
        { 4float-array-rep [ DIVPS ] }
        { 2double-array-rep [ DIVPD ] }
    } case drop ;
 M: x86 %min-vector ( dst src1 src2 rep -- )
    {
        { 4float-array-rep [ MINPS ] }
        { 2double-array-rep [ MINPD ] }
    } case drop ;
 M: x86 %max-vector ( dst src1 src2 rep -- )
    {
        { 4float-array-rep [ MAXPS ] }
        { 2double-array-rep [ MAXPD ] }
    } case drop ;
 M: x86 %sqrt-vector ( dst src rep -- )
    {
        { 4float-array-rep [ SQRTPS ] }
        { 2double-array-rep [ SQRTPD ] }
    } case ;
 M: x86 %horizontal-add-vector ( dst src rep -- )
    {
        { 4float-array-rep [ [ MOVAPS ] [ HADDPS ] [ HADDPS ] 2tri ] }
        { 2double-array-rep [ [ MOVAPD ] [ HADDPD ] 2bi ] }
    } case ;
 M:: x86 %unbox-any-c-ptr ( dst src temp -- )
    [
        { "is-byte-array" "end" "start" } [ define-label ] each
@ -255,10 +366,6 @@ M:: x86 %unbox-any-c-ptr ( dst src temp -- )
        "end" resolve-label
    ] with-scope ;
 M:: x86 %box-float ( dst src temp -- )
    dst 16 float temp %allot
    dst float-offset [+] src MOVSD ;
 : alien@ ( reg n -- op ) cells alien tag-number - [+] ;
 :: %allot-alien ( dst displacement base temp -- )
@ -405,8 +512,9 @@ M: x86 %alien-signed-2 16 %alien-signed-getter ;
 M: x86 %alien-signed-4 32 %alien-signed-getter ;
 M: x86 %alien-cell [] MOV ;
-M: x86 %alien-float dupd [] MOVSS dup CVTSS2SD ;
+M: x86 %alien-float [] MOVSS ;
 M: x86 %alien-double [] MOVSD ;
 M: x86 %alien-vector [ [] ] dip copy-register ;
 :: %alien-integer-setter ( ptr value size -- )
    value { ptr } size [| new-value |
@ -418,8 +526,9 @@ M: x86 %set-alien-integer-1 8 %alien-integer-setter ;
 M: x86 %set-alien-integer-2 16 %alien-integer-setter ;
 M: x86 %set-alien-integer-4 32 %alien-integer-setter ;
 M: x86 %set-alien-cell [ [] ] dip MOV ;
-M: x86 %set-alien-float dup dup CVTSD2SS [ [] ] dip MOVSS ;
+M: x86 %set-alien-float [ [] ] dip MOVSS ;
 M: x86 %set-alien-double [ [] ] dip MOVSD ;
 M: x86 %set-alien-vector [ [] ] 2dip copy-register ;
 : shift-count? ( reg -- ? ) { ECX RCX } memq? ;
--- a/basis/math/vectors/simd/alien/alien-tests.factor
+++ b/basis/math/vectors/simd/alien/alien-tests.factor
@ -0,0 +1,56 @@
 IN: math.vectors.simd.alien.tests
 USING: cpu.architecture math.vectors.simd accessors
 math.vectors.simd.alien kernel classes.struct tools.test
 compiler sequences byte-arrays alien math kernel.private
 specialized-arrays.float ;
 ! Vector alien intrinsics
 [ 4float-array{ 1 2 3 4 } ] [
    [
        4float-array{ 1 2 3 4 }
        underlying>> 0 4float-array-rep alien-vector
    ] compile-call 4float-array boa
 ] unit-test
 [ B{ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 } ] [
    16 [ 1 ] B{ } replicate-as 16 <byte-array>
    [
        0 [
            { byte-array c-ptr fixnum } declare
            4float-array-rep set-alien-vector
        ] compile-call
    ] keep
 ] unit-test
 [ float-array{ 1 2 3 4 } ] [
    [
        float-array{ 1 2 3 4 } underlying>>
        float-array{ 4 3 2 1 } clone
        [ underlying>> 0 4float-array-rep set-alien-vector ] keep
    ] compile-call
 ] unit-test
 STRUCT: simd-struct
 { x 4float-array }
 { y 2double-array }
 { z 4double-array } ;
 [ t ] [ [ simd-struct <struct> ] compile-call >c-ptr [ 0 = ] all? ] unit-test
 [ 4float-array{ 1 2 3 4 } 2double-array{ 2 1 } 4double-array{ 4 3 2 1 } ] [
    simd-struct <struct>
    4float-array{ 1 2 3 4 } >>x
    2double-array{ 2 1 } >>y
    4double-array{ 4 3 2 1 } >>z
    [ x>> ] [ y>> ] [ z>> ] tri
 ] unit-test
 [ 4float-array{ 1 2 3 4 } 2double-array{ 2 1 } 4double-array{ 4 3 2 1 } ] [
    [
        simd-struct <struct>
        4float-array{ 1 2 3 4 } >>x
        2double-array{ 2 1 } >>y
        4double-array{ 4 3 2 1 } >>z
        [ x>> ] [ y>> ] [ z>> ] tri
    ] compile-call
 ] unit-test
--- a/basis/math/vectors/simd/alien/alien.factor
+++ b/basis/math/vectors/simd/alien/alien.factor
@ -0,0 +1,51 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: alien accessors alien.c-types byte-arrays compiler.units
 cpu.architecture libc locals kernel math math.vectors.simd
 math.vectors.simd.private ;
 IN: math.vectors.simd.alien
 : alien-vector ( c-ptr n rep -- value )
    ! Inefficient version for when intrinsics are missing
    [ swap <displaced-alien> ] dip rep-size memory>byte-array ;
 : set-alien-vector ( value c-ptr n rep -- )
    ! Inefficient version for when intrinsics are missing
    [ swap <displaced-alien> swap ] dip rep-size memcpy ;
 :: define-simd-type ( class rep -- )
    <c-type>
        byte-array >>class
        class >>boxed-class
        [ rep alien-vector ] >>getter
        [ [ underlying>> ] 2dip rep set-alien-vector ] >>setter
        16 >>size
        8 >>align
        rep >>rep
        [ class boa ] >>boxer-quot
        [ underlying>> ] >>unboxer-quot
    class name>> typedef ;
 : define-4double-array-type ( -- )
    <c-type>
        4double-array >>class
        4double-array >>boxed-class
        [
            [ 2double-array-rep alien-vector ]
            [ 16 + >fixnum 2double-array-rep alien-vector ] 2bi
            4double-array boa
        ] >>getter
        [
            [ [ underlying1>> ] 2dip 2double-array-rep set-alien-vector ]
            [ [ underlying2>> ] 2dip 16 + >fixnum 2double-array-rep set-alien-vector ]
            3bi
        ] >>setter
        32 >>size
        8 >>align
        2double-array-rep >>rep
    "4double-array" typedef ;
 [
    4float-array 4float-array-rep define-simd-type
    2double-array 2double-array-rep define-simd-type
    define-4double-array-type
 ] with-compilation-unit
--- a/basis/math/vectors/simd/alien/authors.txt
+++ b/basis/math/vectors/simd/alien/authors.txt
@ -0,0 +1 @@
 Slava Pestov
--- a/basis/math/vectors/simd/authors.txt
+++ b/basis/math/vectors/simd/authors.txt
@ -0,0 +1 @@
 Slava Pestov
--- a/basis/math/vectors/simd/functor/authors.txt
+++ b/basis/math/vectors/simd/functor/authors.txt
@ -0,0 +1 @@
 Slava Pestov
--- a/basis/math/vectors/simd/functor/functor.factor
+++ b/basis/math/vectors/simd/functor/functor.factor
@ -0,0 +1,61 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors alien.c-types byte-arrays classes functors
 kernel math parser prettyprint.custom sequences
 sequences.private ;
 IN: math.vectors.simd.functor
 ERROR: bad-length got expected ;
 FUNCTOR: define-simd-type ( T N -- )
 A            DEFINES-CLASS ${N}${T}-array
 <A>          DEFINES <${A}>
 (A)          DEFINES (${A})
 >A           DEFINES >${A}
 A{           DEFINES ${A}{
 NTH          [ T dup c-type-getter-boxer array-accessor ]
 SET-NTH      [ T dup c-setter array-accessor ]
 BYTES        [ T heap-size N * ]
 INITIAL      [ BYTES <byte-array> ]
 WHERE
 TUPLE: A
 { underlying byte-array read-only initial: INITIAL } ;
 : <A> ( -- simd-array ) BYTES <byte-array> A boa ; inline
 : (A) ( -- simd-array ) BYTES (byte-array) A boa ; inline
 M: A clone underlying>> clone \ A boa ; inline
 M: A length drop N ; inline
 M: A nth-unsafe underlying>> NTH call ; inline
 M: A set-nth-unsafe underlying>> SET-NTH call ; inline
 : >A ( seq -- simd-array ) \ A new clone-like ;
 M: A like drop dup \ A instance? [ >A ] unless ; inline
 M: A new-sequence drop dup N = [ drop (A) ] [ N bad-length ] if ; inline
 M: A equal? over \ A instance? [ sequence= ] [ 2drop f ] if ;
 M: A byte-length underlying>> length ; inline
 M: A pprint-delims drop \ A{ \ } ;
 M: A >pprint-sequence ;
 M: A pprint* pprint-object ;
 SYNTAX: A{ \ } [ >A ] parse-literal ;
 INSTANCE: A sequence
 ;FUNCTOR
--- a/basis/math/vectors/simd/simd-tests.factor
+++ b/basis/math/vectors/simd/simd-tests.factor
@ -0,0 +1,301 @@
 IN: math.vectors.simd.tests
 USING: math math.vectors.simd math.vectors.simd.private
 math.vectors math.functions kernel.private compiler sequences
 tools.test compiler.tree.debugger accessors kernel ;
 [ 4float-array{ 0 0 0 0 } ] [ 4float-array new ] unit-test
 [ V{ float } ] [ [ { 4float-array } declare norm-sq ] final-classes ] unit-test
 [ V{ float } ] [ [ { 4float-array } declare norm ] final-classes ] unit-test
 [ 4float-array{ 12 12 12 12 } ] [
    12 [ 4float-array-with ] compile-call
 ] unit-test
 [ 4float-array{ 1 2 3 4 } ] [
    1 2 3 4 [ 4float-array-boa ] compile-call
 ] unit-test
 [ 4float-array{ 11 22 33 44 } ] [
    4float-array{ 1 2 3 4 } 4float-array{ 10 20 30 40 }
    [ { 4float-array 4float-array } declare v+ ] compile-call
 ] unit-test
 [ 4float-array{ -9 -18 -27 -36 } ] [
    4float-array{ 1 2 3 4 } 4float-array{ 10 20 30 40 }
    [ { 4float-array 4float-array } declare v- ] compile-call
 ] unit-test
 [ 4float-array{ 10 40 90 160 } ] [
    4float-array{ 1 2 3 4 } 4float-array{ 10 20 30 40 }
    [ { 4float-array 4float-array } declare v* ] compile-call
 ] unit-test
 [ 4float-array{ 10 100 1000 10000 } ] [
    4float-array{ 100 2000 30000 400000 } 4float-array{ 10 20 30 40 }
    [ { 4float-array 4float-array } declare v/ ] compile-call
 ] unit-test
 [ 4float-array{ -10 -20 -30 -40 } ] [
    4float-array{ -10 20 -30 40 } 4float-array{ 10 -20 30 -40 }
    [ { 4float-array 4float-array } declare vmin ] compile-call
 ] unit-test
 [ 4float-array{ 10 20 30 40 } ] [
    4float-array{ -10 20 -30 40 } 4float-array{ 10 -20 30 -40 }
    [ { 4float-array 4float-array } declare vmax ] compile-call
 ] unit-test
 [ 10.0 ] [
    4float-array{ 1 2 3 4 }
    [ { 4float-array } declare sum ] compile-call
 ] unit-test
 [ 13.0 ] [
    4float-array{ 1 2 3 4 }
    [ { 4float-array } declare sum 3.0 + ] compile-call
 ] unit-test
 [ 8.0 ] [
    4float-array{ 1 2 3 4 } 4float-array{ 2 0 2 0 }
    [ { 4float-array 4float-array } declare v. ] compile-call
 ] unit-test
 [ 4float-array{ 5 10 15 20 } ] [
    5.0 4float-array{ 1 2 3 4 }
    [ { float 4float-array } declare n*v ] compile-call
 ] unit-test
 [ 4float-array{ 5 10 15 20 } ] [
    4float-array{ 1 2 3 4 } 5.0
    [ { float 4float-array } declare v*n ] compile-call
 ] unit-test
 [ 4float-array{ 10 5 2 5 } ] [
    10.0 4float-array{ 1 2 5 2 }
    [ { float 4float-array } declare n/v ] compile-call
 ] unit-test
 [ 4float-array{ 0.5 1 1.5 2 } ] [
    4float-array{ 1 2 3 4 } 2
    [ { float 4float-array } declare v/n ] compile-call
 ] unit-test
 [ 4float-array{ 1 0 0 0 } ] [
    4float-array{ 10 0 0 0 }
    [ { 4float-array } declare normalize ] compile-call
 ] unit-test
 [ 30.0 ] [
    4float-array{ 1 2 3 4 }
    [ { 4float-array } declare norm-sq ] compile-call
 ] unit-test
 [ t ] [
    4float-array{ 1 0 0 0 }
    4float-array{ 0 1 0 0 }
    [ { 4float-array 4float-array } declare distance ] compile-call
    2 sqrt 1.0e-6 ~
 ] unit-test
 [ 2double-array{ 12 12 } ] [
    12 [ 2double-array-with ] compile-call
 ] unit-test
 [ 2double-array{ 1 2 } ] [
    1 2 [ 2double-array-boa ] compile-call
 ] unit-test
 [ 2double-array{ 11 22 } ] [
    2double-array{ 1 2 } 2double-array{ 10 20 }
    [ { 2double-array 2double-array } declare v+ ] compile-call
 ] unit-test
 [ 2double-array{ -9 -18 } ] [
    2double-array{ 1 2 } 2double-array{ 10 20 }
    [ { 2double-array 2double-array } declare v- ] compile-call
 ] unit-test
 [ 2double-array{ 10 40 } ] [
    2double-array{ 1 2 } 2double-array{ 10 20 }
    [ { 2double-array 2double-array } declare v* ] compile-call
 ] unit-test
 [ 2double-array{ 10 100 } ] [
    2double-array{ 100 2000 } 2double-array{ 10 20 }
    [ { 2double-array 2double-array } declare v/ ] compile-call
 ] unit-test
 [ 2double-array{ -10 -20 } ] [
    2double-array{ -10 20 } 2double-array{ 10 -20 }
    [ { 2double-array 2double-array } declare vmin ] compile-call
 ] unit-test
 [ 2double-array{ 10 20 } ] [
    2double-array{ -10 20 } 2double-array{ 10 -20 }
    [ { 2double-array 2double-array } declare vmax ] compile-call
 ] unit-test
 [ 3.0 ] [
    2double-array{ 1 2 }
    [ { 2double-array } declare sum ] compile-call
 ] unit-test
 [ 7.0 ] [
    2double-array{ 1 2 }
    [ { 2double-array } declare sum 4.0 + ] compile-call
 ] unit-test
 [ 16.0 ] [
    2double-array{ 1 2 } 2double-array{ 2 7 }
    [ { 2double-array 2double-array } declare v. ] compile-call
 ] unit-test
 [ 2double-array{ 5 10 } ] [
    5.0 2double-array{ 1 2 }
    [ { float 2double-array } declare n*v ] compile-call
 ] unit-test
 [ 2double-array{ 5 10 } ] [
    2double-array{ 1 2 } 5.0
    [ { float 2double-array } declare v*n ] compile-call
 ] unit-test
 [ 2double-array{ 10 5 } ] [
    10.0 2double-array{ 1 2 }
    [ { float 2double-array } declare n/v ] compile-call
 ] unit-test
 [ 2double-array{ 0.5 1 } ] [
    2double-array{ 1 2 } 2
    [ { float 2double-array } declare v/n ] compile-call
 ] unit-test
 [ 2double-array{ 0 0 } ] [ 2double-array new ] unit-test
 [ 2double-array{ 1 0 } ] [
    2double-array{ 10 0 }
    [ { 2double-array } declare normalize ] compile-call
 ] unit-test
 [ 5.0 ] [
    2double-array{ 1 2 }
    [ { 2double-array } declare norm-sq ] compile-call
 ] unit-test
 [ t ] [
    2double-array{ 1 0 }
    2double-array{ 0 1 }
    [ { 2double-array 2double-array } declare distance ] compile-call
    2 sqrt 1.0e-6 ~
 ] unit-test
 [ 4double-array{ 0 0 0 0 } ] [ 4double-array new ] unit-test
 [ 4double-array{ 1 2 3 4 } ] [
    1 2 3 4 4double-array-boa
 ] unit-test
 [ 4double-array{ 1 1 1 1 } ] [
    1 4double-array-with
 ] unit-test
 [ 4double-array{ 0 1 2 3 } ] [
    1 4double-array-with [ * ] map-index
 ] unit-test
 [ V{ float } ] [ [ { 4double-array } declare norm-sq ] final-classes ] unit-test
 [ V{ float } ] [ [ { 4double-array } declare norm ] final-classes ] unit-test
 [ 4double-array{ 12 12 12 12 } ] [
    12 [ 4double-array-with ] compile-call
 ] unit-test
 [ 4double-array{ 1 2 3 4 } ] [
    1 2 3 4 [ 4double-array-boa ] compile-call
 ] unit-test
 [ 4double-array{ 11 22 33 44 } ] [
    4double-array{ 1 2 3 4 } 4double-array{ 10 20 30 40 }
    [ { 4double-array 4double-array } declare v+ ] compile-call
 ] unit-test
 [ 4double-array{ -9 -18 -27 -36 } ] [
    4double-array{ 1 2 3 4 } 4double-array{ 10 20 30 40 }
    [ { 4double-array 4double-array } declare v- ] compile-call
 ] unit-test
 [ 4double-array{ 10 40 90 160 } ] [
    4double-array{ 1 2 3 4 } 4double-array{ 10 20 30 40 }
    [ { 4double-array 4double-array } declare v* ] compile-call
 ] unit-test
 [ 4double-array{ 10 100 1000 10000 } ] [
    4double-array{ 100 2000 30000 400000 } 4double-array{ 10 20 30 40 }
    [ { 4double-array 4double-array } declare v/ ] compile-call
 ] unit-test
 [ 4double-array{ -10 -20 -30 -40 } ] [
    4double-array{ -10 20 -30 40 } 4double-array{ 10 -20 30 -40 }
    [ { 4double-array 4double-array } declare vmin ] compile-call
 ] unit-test
 [ 4double-array{ 10 20 30 40 } ] [
    4double-array{ -10 20 -30 40 } 4double-array{ 10 -20 30 -40 }
    [ { 4double-array 4double-array } declare vmax ] compile-call
 ] unit-test
 [ 10.0 ] [
    4double-array{ 1 2 3 4 }
    [ { 4double-array } declare sum ] compile-call
 ] unit-test
 [ 13.0 ] [
    4double-array{ 1 2 3 4 }
    [ { 4double-array } declare sum 3.0 + ] compile-call
 ] unit-test
 [ 8.0 ] [
    4double-array{ 1 2 3 4 } 4double-array{ 2 0 2 0 }
    [ { 4double-array 4double-array } declare v. ] compile-call
 ] unit-test
 [ 4double-array{ 5 10 15 20 } ] [
    5.0 4double-array{ 1 2 3 4 }
    [ { float 4double-array } declare n*v ] compile-call
 ] unit-test
 [ 4double-array{ 5 10 15 20 } ] [
    4double-array{ 1 2 3 4 } 5.0
    [ { float 4double-array } declare v*n ] compile-call
 ] unit-test
 [ 4double-array{ 10 5 2 5 } ] [
    10.0 4double-array{ 1 2 5 2 }
    [ { float 4double-array } declare n/v ] compile-call
 ] unit-test
 [ 4double-array{ 0.5 1 1.5 2 } ] [
    4double-array{ 1 2 3 4 } 2
    [ { float 4double-array } declare v/n ] compile-call
 ] unit-test
 [ 4double-array{ 1 0 0 0 } ] [
    4double-array{ 10 0 0 0 }
    [ { 4double-array } declare normalize ] compile-call
 ] unit-test
 [ 30.0 ] [
    4double-array{ 1 2 3 4 }
    [ { 4double-array } declare norm-sq ] compile-call
 ] unit-test
 [ t ] [
    4double-array{ 1 0 0 0 }
    4double-array{ 0 1 0 0 }
    [ { 4double-array 4double-array } declare distance ] compile-call
    2 sqrt 1.0e-6 ~
 ] unit-test
--- a/basis/math/vectors/simd/simd.factor
+++ b/basis/math/vectors/simd/simd.factor
@ -0,0 +1,214 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors alien.c-types byte-arrays cpu.architecture
 generalizations kernel math math.functions math.vectors
 math.vectors.simd.functor math.vectors.specialization parser
 prettyprint.custom sequences sequences.private
 specialized-arrays.double locals assocs literals ;
 IN: math.vectors.simd
 <PRIVATE
 ERROR: bad-simd-call ;
 : (simd-v+) ( v1 v2 rep -- v3 ) bad-simd-call ;
 : (simd-v-) ( v1 v2 rep -- v3 ) bad-simd-call ;
 : (simd-v*) ( v1 v2 rep -- v3 ) bad-simd-call ;
 : (simd-v/) ( v1 v2 rep -- v3 ) bad-simd-call ;
 : (simd-vmin) ( v1 v2 rep -- v3 ) bad-simd-call ;
 : (simd-vmax) ( v1 v2 rep -- v3 ) bad-simd-call ;
 : (simd-vsqrt) ( v1 v2 rep -- v3 ) bad-simd-call ;
 : (simd-sum) ( v1 rep -- v2 ) bad-simd-call ;
 : (simd-broadcast) ( x rep -- v ) bad-simd-call ;
 : (simd-gather-2) ( a b rep -- v ) bad-simd-call ;
 : (simd-gather-4) ( a b c d rep -- v ) bad-simd-call ;
 : assert-positive ( x -- y ) ;
 PRIVATE>
 <<
 DEFER: 4float-array
 DEFER: 2double-array
 "double" 2 define-simd-type
 "float" 4 define-simd-type
 >>
 ! Constructors
 : 4float-array-with ( x -- simd-array )
    >float 4float-array-rep (simd-broadcast) 4float-array boa ; inline
 : 4float-array-boa ( a b c d -- simd-array )
    [ >float ] 4 napply 4float-array-rep (simd-gather-4) 4float-array boa ; inline
 : 2double-array-with ( x -- simd-array )
    >float 2double-array-rep (simd-broadcast) 2double-array boa ; inline
 : 2double-array-boa ( a b -- simd-array )
    [ >float ] bi@ 2double-array-rep (simd-gather-2) 2double-array boa ; inline
 <PRIVATE
 : 4float-array-vv->v-op ( v1 v2 quot -- v3 )
    [ [ underlying>> ] bi@ 4float-array-rep ] dip call 4float-array boa ; inline
 : 4float-array-v->n-op ( v1 quot -- v2 )
    [ underlying>> 4float-array-rep ] dip call ; inline
 : 2double-array-vv->v-op ( v1 v2 quot -- v3 )
    [ [ underlying>> ] bi@ 2double-array-rep ] dip call 2double-array boa ; inline
 : 2double-array-v->n-op ( v1 quot -- v2 )
    [ underlying>> 2double-array-rep ] dip call ; inline
 PRIVATE>
 <<
 <PRIVATE
 :: simd-vector-words ( class ctor elt-type assoc -- )
    class elt-type assoc {
        { vneg [ [ dup v- ] keep v- ] }
        { v. [ v* sum ] }
        { n+v [ [ ctor execute ] dip v+ ] }
        { v+n [ ctor execute v+ ] }
        { n-v [ [ ctor execute ] dip v- ] }
        { v-n [ ctor execute v- ] }
        { n*v [ [ ctor execute ] dip v* ] }
        { v*n [ ctor execute v* ] }
        { n/v [ [ ctor execute ] dip v/ ] }
        { v/n [ ctor execute v/ ] }
        { norm-sq [ dup v. assert-positive ] }
        { norm [ norm-sq sqrt ] }
        { normalize [ dup norm v/n ] }
        { distance [ v- norm ] }
    } assoc-union
    specialize-vector-words ;
 PRIVATE>
 \ 4float-array \ 4float-array-with float H{
    { v+ [ [ (simd-v+) ] 4float-array-vv->v-op ] }
    { v- [ [ (simd-v-) ] 4float-array-vv->v-op ] }
    { v* [ [ (simd-v*) ] 4float-array-vv->v-op ] }
    { v/ [ [ (simd-v/) ] 4float-array-vv->v-op ] }
    { vmin [ [ (simd-vmin) ] 4float-array-vv->v-op ] }
    { vmax [ [ (simd-vmax) ] 4float-array-vv->v-op ] }
    { sum [ [ (simd-sum) ] 4float-array-v->n-op ] }
 } simd-vector-words
 \ 2double-array \ 2double-array-with float H{
    { v+ [ [ (simd-v+) ] 2double-array-vv->v-op ] }
    { v- [ [ (simd-v-) ] 2double-array-vv->v-op ] }
    { v* [ [ (simd-v*) ] 2double-array-vv->v-op ] }
    { v/ [ [ (simd-v/) ] 2double-array-vv->v-op ] }
    { vmin [ [ (simd-vmin) ] 2double-array-vv->v-op ] }
    { vmax [ [ (simd-vmax) ] 2double-array-vv->v-op ] }
    { sum [ [ (simd-sum) ] 2double-array-v->n-op ] }
 } simd-vector-words
 >>
 ! Synthesize 256-bit vectors from a pair of 128-bit vectors
 ! Functorize this later so that we can do it for integers, etc
 TUPLE: 4double-array
 { underlying1 byte-array initial: $[ 16 <byte-array> ] read-only }
 { underlying2 byte-array initial: $[ 16 <byte-array> ] read-only } ;
 : <4double-array> ( -- simd-array )
    16 <byte-array> 16 <byte-array> 4double-array boa ; inline
 : (4double-array) ( -- simd-array )
    16 (byte-array) 16 (byte-array) 4double-array boa ; inline
 M: 4double-array clone
    [ underlying1>> clone ] [ underlying2>> clone ] bi
    4double-array boa ; inline
 M: 4double-array length drop 4 ; inline
 <PRIVATE
 : 4double-array-deref ( n seq -- n' seq' )
    over 2 < [ underlying1>> ] [ [ 2 - ] dip underlying2>> ] if
    2 swap double-array boa ; inline
 PRIVATE>
 M: 4double-array nth-unsafe
    4double-array-deref nth-unsafe ; inline
 M: 4double-array set-nth-unsafe
    4double-array-deref set-nth-unsafe ; inline
 : >4double-array ( seq -- simd-array )
    4double-array new clone-like ;
 M: 4double-array like
    drop dup 4double-array? [ >4double-array ] unless ; inline
 M: 4double-array new-sequence
    drop dup 4 = [ drop (4double-array) ] [ 4 bad-length ] if ; inline
 M: 4double-array equal?
    over 4double-array? [ sequence= ] [ 2drop f ] if ;
 M: 4double-array byte-length drop 32 ; inline
 SYNTAX: 4double-array{
    \ } [ >4double-array ] parse-literal ;
 M: 4double-array pprint-delims
    drop \ 4double-array{ \ } ;
 M: 4double-array >pprint-sequence ;
 M: 4double-array pprint* pprint-object ;
 INSTANCE: 4double-array sequence
 : 4double-array-with ( x -- simd-array )
    dup [ >float 2double-array-rep (simd-broadcast) ] bi@
    4double-array boa ; inline
 : 4double-array-boa ( a b c d -- simd-array )
    [ >float ] 4 napply [ 2double-array-rep (simd-gather-2) ] 2bi@
    4double-array boa ; inline
 ! SIMD operations on 4double-arrays
 <PRIVATE
 : 4double-array-vv->v-op ( v1 v2 quot -- v3 )
    [ [ [ underlying1>> ] bi@ 2double-array-rep ] dip call ]
    [ [ [ underlying2>> ] bi@ 2double-array-rep ] dip call ] 3bi
    4double-array boa ; inline
 : 4double-array-v->n-op ( v1 quot scalar-quot -- v2 )
    [
        [ [ underlying1>> 2double-array-rep ] dip call ]
        [ [ underlying2>> 2double-array-rep ] dip call ] 2bi
    ] dip call ; inline
 PRIVATE>
 <<
 \ 4double-array \ 4double-array-with float H{
    { v+ [ [ (simd-v+) ] 4double-array-vv->v-op ] }
    { v- [ [ (simd-v-) ] 4double-array-vv->v-op ] }
    { v* [ [ (simd-v*) ] 4double-array-vv->v-op ] }
    { v/ [ [ (simd-v/) ] 4double-array-vv->v-op ] }
    { vmin [ [ (simd-vmin) ] 4double-array-vv->v-op ] }
    { vmax [ [ (simd-vmax) ] 4double-array-vv->v-op ] }
    { sum [ [ (simd-sum) ] [ + ] 4double-array-v->n-op ] }
 } simd-vector-words
 >>
 USE: vocabs.loader
 "math.vectors.simd.alien" require
--- a/basis/math/vectors/specialization/specialization.factor
+++ b/basis/math/vectors/specialization/specialization.factor
@ -2,7 +2,7 @@
 ! See http://factorcode.org/license.txt for BSD license.
 USING: words kernel make sequences effects kernel.private accessors
 combinators math math.intervals math.vectors namespaces assocs fry
-splitting classes.algebra generalizations
+splitting classes.algebra generalizations locals
 compiler.tree.propagation.info ;
 IN: math.vectors.specialization
@ -67,6 +67,7 @@ H{
    { vmin { +vector+ +vector+ -> +vector+ } }
    { vneg { +vector+ -> +vector+ } }
    { vtruncate { +vector+ -> +vector+ } }
    { sum { +vector+ -> +scalar+ } }
 }
 SYMBOL: specializations
@ -82,19 +83,23 @@ specializations [ vector-words keys [ V{ } clone ] H{ } map>assoc ] initialize
 : outputs ( schema -- seq ) { -> } split second ;
-: specialize-vector-word ( word array-type elt-type -- word' )
+: loop-vector-op ( word array-type elt-type -- word' )
    pick word-schema
    [ inputs (specialize-vector-word) ]
    [ outputs record-output-signature ] 3bi ;
-: input-signature ( word -- signature ) def>> first ;
+:: specialize-vector-word ( word array-type elt-type simd -- word/quot' )
    word simd key? [ word simd at ] [ word array-type elt-type loop-vector-op ] if ;
-: specialize-vector-words ( array-type elt-type -- )
+:: input-signature ( word array-type elt-type -- signature )
-    [ vector-words keys ] 2dip
+    array-type elt-type word word-schema inputs signature-for-schema ;
-    '[
+
-        [ _ _ specialize-vector-word ] keep
+:: specialize-vector-words ( array-type elt-type simd -- )
-        [ dup input-signature ] dip
+    vector-words keys [
-        add-specialization
+        [ array-type elt-type simd specialize-vector-word ]
        [ array-type elt-type input-signature ]
        [ ]
        tri add-specialization
    ] each ;
 : find-specialization ( classes word -- word/f )