From ea2bcd69c71882d545fb57548ff0cda4ac009ce9 Mon Sep 17 00:00:00 2001
From: Slava Pestov <slava@slava-pestovs-macbook-pro.local>
Date: Sun, 20 Sep 2009 02:08:32 -0500
Subject: [PATCH] math.vectors.simd: redesign to be more flexible, integer SIMD
 work in progress

---
 basis/alien/c-types/c-types.factor            |   8 +
 basis/cpu/architecture/architecture.factor    |  34 +++-
 basis/cpu/x86/features/features.factor        |  49 ++---
 basis/cpu/x86/x86.factor                      | 115 ++++++++++-
 basis/math/floats/env/x86/x86.factor          |   4 +-
 .../math/vectors/simd/functor/functor.factor  | 134 +++++++++++--
 .../simd/intrinsics/intrinsics-tests.factor   |  18 ++
 .../vectors/simd/intrinsics/intrinsics.factor |  54 +++++-
 basis/math/vectors/simd/simd-docs.factor      |  84 --------
 basis/math/vectors/simd/simd.factor           | 180 +-----------------
 basis/math/vectors/simd/summary.txt           |   1 +
 11 files changed, 363 insertions(+), 318 deletions(-)
 create mode 100644 basis/math/vectors/simd/intrinsics/intrinsics-tests.factor
 create mode 100644 basis/math/vectors/simd/summary.txt

diff --git a/basis/alien/c-types/c-types.factor b/basis/alien/c-types/c-types.factor
index fa27e29c04..afbb664fed 100755
--- a/basis/alien/c-types/c-types.factor
+++ b/basis/alien/c-types/c-types.factor
@@ -472,3 +472,11 @@ SYMBOLS:
     \ ulong \ size_t typedef
 ] with-compilation-unit
 
+M: char-16-rep rep-component-type drop char ;
+M: uchar-16-rep rep-component-type drop uchar ;
+M: short-8-rep rep-component-type drop short ;
+M: ushort-8-rep rep-component-type drop ushort ;
+M: int-4-rep rep-component-type drop int ;
+M: uint-4-rep rep-component-type drop uint ;
+M: float-4-rep rep-component-type drop float ;
+M: double-2-rep rep-component-type drop double ;
diff --git a/basis/cpu/architecture/architecture.factor b/basis/cpu/architecture/architecture.factor
index d6611c3384..61e4e2df37 100644
--- a/basis/cpu/architecture/architecture.factor
+++ b/basis/cpu/architecture/architecture.factor
@@ -22,8 +22,6 @@ SINGLETONS: float-rep double-rep ;
 
 ! On x86, floating point registers are really vector registers
 SINGLETONS:
-float-4-rep
-double-2-rep
 char-16-rep
 uchar-16-rep
 short-8-rep
@@ -31,9 +29,11 @@ ushort-8-rep
 int-4-rep
 uint-4-rep ;
 
-UNION: vector-rep
+SINGLETONS:
 float-4-rep
-double-2-rep
+double-2-rep ;
+
+UNION: int-vector-rep
 char-16-rep
 uchar-16-rep
 short-8-rep
@@ -41,6 +41,14 @@ ushort-8-rep
 int-4-rep
 uint-4-rep ;
 
+UNION: float-vector-rep
+float-4-rep
+double-2-rep ;
+
+UNION: vector-rep
+int-vector-rep
+float-vector-rep ;
+
 UNION: representation
 any-rep
 tagged-rep
@@ -76,10 +84,15 @@ M: double-rep rep-size drop 8 ;
 M: stack-params rep-size drop cell ;
 M: vector-rep rep-size drop 16 ;
 
+GENERIC: rep-component-type ( rep -- n )
+
+! Methods defined in alien.c-types
+
 GENERIC: scalar-rep-of ( rep -- rep' )
 
 M: float-4-rep scalar-rep-of drop float-rep ;
 M: double-2-rep scalar-rep-of drop double-rep ;
+M: int-vector-rep scalar-rep-of drop int-rep ;
 
 ! Mapping from register class to machine registers
 HOOK: machine-registers cpu ( -- assoc )
@@ -167,7 +180,6 @@ HOOK: %unbox-vector cpu ( dst src rep -- )
 HOOK: %broadcast-vector cpu ( dst src rep -- )
 HOOK: %gather-vector-2 cpu ( dst src1 src2 rep -- )
 HOOK: %gather-vector-4 cpu ( dst src1 src2 src3 src4 rep -- )
-
 HOOK: %add-vector cpu ( dst src1 src2 rep -- )
 HOOK: %sub-vector cpu ( dst src1 src2 rep -- )
 HOOK: %mul-vector cpu ( dst src1 src2 rep -- )
@@ -177,6 +189,18 @@ HOOK: %max-vector cpu ( dst src1 src2 rep -- )
 HOOK: %sqrt-vector cpu ( dst src rep -- )
 HOOK: %horizontal-add-vector cpu ( dst src rep -- )
 
+HOOK: %broadcast-vector-reps cpu ( -- reps )
+HOOK: %gather-vector-2-reps cpu ( -- reps )
+HOOK: %gather-vector-4-reps cpu ( -- reps )
+HOOK: %add-vector-reps cpu ( -- reps )
+HOOK: %sub-vector-reps cpu ( -- reps )
+HOOK: %mul-vector-reps cpu ( -- reps )
+HOOK: %div-vector-reps cpu ( -- reps )
+HOOK: %min-vector-reps cpu ( -- reps )
+HOOK: %max-vector-reps cpu ( -- reps )
+HOOK: %sqrt-vector-reps cpu ( -- reps )
+HOOK: %horizontal-add-vector-reps cpu ( -- reps )
+
 HOOK: %unbox-alien cpu ( dst src -- )
 HOOK: %unbox-any-c-ptr cpu ( dst src temp -- )
 HOOK: %box-alien cpu ( dst src temp -- )
diff --git a/basis/cpu/x86/features/features.factor b/basis/cpu/x86/features/features.factor
index c5cf2d470a..5fad4e802c 100644
--- a/basis/cpu/x86/features/features.factor
+++ b/basis/cpu/x86/features/features.factor
@@ -1,7 +1,7 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: system kernel math math.order math.parser namespaces
-alien.c-types alien.syntax combinators locals init io cpu.x86
+USING: system kernel memoize math math.order math.parser
+namespaces alien.c-types alien.syntax combinators locals init io
 compiler compiler.units accessors ;
 IN: cpu.x86.features
 
@@ -13,7 +13,16 @@ FUNCTION: longlong read_timestamp_counter ( ) ;
 
 PRIVATE>
 
-ALIAS: sse-version sse_version
+: sse-version ( -- n )
+    sse_version
+    "sse-version" get string>number [ min ] when* ; foldable
+
+: sse? ( -- ? ) sse-version 10 >= ; foldable
+: sse2? ( -- ? ) sse-version 20 >= ; foldable
+: sse3? ( -- ? ) sse-version 30 >= ; foldable
+: ssse3? ( -- ? ) sse-version 33 >= ; foldable
+: sse4.1? ( -- ? ) sse-version 41 >= ; foldable
+: sse4.2? ( -- ? ) sse-version 42 >= ; foldable
 
 : sse-string ( version -- string )
     {
@@ -32,37 +41,3 @@ M: x86 instruction-count read_timestamp_counter ;
 
 : count-instructions ( quot -- n )
     instruction-count [ call ] dip instruction-count swap - ; inline
-
-USING: cpu.x86.features cpu.x86.features.private ;
-
-:: install-sse-check ( version -- )
-    [
-        sse-version version < [
-            "This image was built to use " write
-            version sse-string write
-            " but your CPU only supports " write
-            sse-version sse-string write "." print
-            "You will need to bootstrap Factor again." print
-            flush
-            1 exit
-        ] when
-    ] "cpu.x86" add-init-hook ;
-
-: enable-sse ( version -- )
-    {
-        { 00 [ ] }
-        { 10 [ ] }
-        { 20 [ enable-sse2 ] }
-        { 30 [ enable-sse3 ] }
-        { 33 [ enable-sse3 ] }
-        { 41 [ enable-sse3 ] }
-        { 42 [ enable-sse3 ] }
-    } case ;
-
-[ { sse_version } compile ] with-optimizer
-
-"Checking for multimedia extensions: " write sse-version
-"sse-version" get [ string>number min ] when*
-[ sse-string write " detected" print ]
-[ install-sse-check ]
-[ enable-sse ] tri
diff --git a/basis/cpu/x86/x86.factor b/basis/cpu/x86/x86.factor
index 04b5308836..4d80862ed3 100644
--- a/basis/cpu/x86/x86.factor
+++ b/basis/cpu/x86/x86.factor
@@ -4,7 +4,8 @@ USING: accessors assocs alien alien.c-types arrays strings
 cpu.x86.assembler cpu.x86.assembler.private cpu.x86.assembler.operands
 cpu.architecture kernel kernel.private math memory namespaces make
 sequences words system layouts combinators math.order fry locals
-compiler.constants byte-arrays
+compiler.constants byte-arrays io macros quotations cpu.x86.features
+cpu.x86.features.private compiler compiler.units init
 compiler.cfg.registers
 compiler.cfg.instructions
 compiler.cfg.intrinsics
@@ -250,12 +251,26 @@ M:: x86 %unbox-vector ( dst src rep -- )
     dst src byte-array-offset [+]
     rep copy-register ;
 
+MACRO: available-reps ( alist -- )
+    ! Each SSE version adds new representations and supports
+    ! all old ones
+    unzip { } [ append ] accumulate rest swap suffix
+    [ [ 1quotation ] map ] bi@ zip
+    reverse [ { } ] suffix
+    '[ _ cond ] ;
+
 M: x86 %broadcast-vector ( dst src rep -- )
     {
         { float-4-rep [ [ MOVSS ] [ drop dup 0 SHUFPS ] 2bi ] }
         { double-2-rep [ [ MOVSD ] [ drop dup UNPCKLPD ] 2bi ] }
     } case ;
 
+M: x86 %broadcast-vector-reps
+    {
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep } }
+    } available-reps ;
+
 M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- )
     rep {
         {
@@ -269,6 +284,11 @@ M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- )
         }
     } case ;
 
+M: x86 %gather-vector-4-reps
+    {
+        { sse? { float-4-rep } }
+    } available-reps ;
+
 M:: x86 %gather-vector-2 ( dst src1 src2 rep -- )
     rep {
         {
@@ -280,6 +300,11 @@ M:: x86 %gather-vector-2 ( dst src1 src2 rep -- )
         }
     } case ;
 
+M: x86 %gather-vector-2-reps
+    {
+        { sse2? { double-2-rep } }
+    } available-reps ;
+
 M: x86 %add-vector ( dst src1 src2 rep -- )
     {
         { float-4-rep [ ADDPS ] }
@@ -292,6 +317,12 @@ M: x86 %add-vector ( dst src1 src2 rep -- )
         { uint-4-rep [ PADDD ] }
     } case drop ;
 
+M: x86 %add-vector-reps
+    {
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
+    } available-reps ;
+
 M: x86 %sub-vector ( dst src1 src2 rep -- )
     {
         { float-4-rep [ SUBPS ] }
@@ -304,43 +335,92 @@ M: x86 %sub-vector ( dst src1 src2 rep -- )
         { uint-4-rep [ PSUBD ] }
     } case drop ;
 
+M: x86 %sub-vector-reps
+    {
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
+    } available-reps ;
+
 M: x86 %mul-vector ( dst src1 src2 rep -- )
     {
         { float-4-rep [ MULPS ] }
         { double-2-rep [ MULPD ] }
-        { int-4-rep [ PMULLW ] }
+        { short-8-rep [ PMULLW ] }
+        { ushort-8-rep [ PMULLW ] }
+        { int-4-rep [ PMULLD ] }
+        { uint-4-rep [ PMULLD ] }
     } case drop ;
 
+M: x86 %mul-vector-reps
+    {
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
+    } available-reps ;
+
 M: x86 %div-vector ( dst src1 src2 rep -- )
     {
         { float-4-rep [ DIVPS ] }
         { double-2-rep [ DIVPD ] }
     } case drop ;
 
+M: x86 %div-vector-reps
+    {
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep } }
+    } available-reps ;
+
 M: x86 %min-vector ( dst src1 src2 rep -- )
     {
         { float-4-rep [ MINPS ] }
         { double-2-rep [ MINPD ] }
+        { uchar-16-rep [ PMINUB ] }
+        { short-8-rep [ PMINSW ] }
     } case drop ;
 
+M: x86 %min-vector-reps
+    {
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep short-8-rep uchar-16-rep } }
+    } available-reps ;
+
 M: x86 %max-vector ( dst src1 src2 rep -- )
     {
         { float-4-rep [ MAXPS ] }
         { double-2-rep [ MAXPD ] }
+        { uchar-16-rep [ PMAXUB ] }
+        { short-8-rep [ PMAXSW ] }
     } case drop ;
 
+M: x86 %max-vector-reps
+    {
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep short-8-rep uchar-16-rep } }
+    } available-reps ;
+
 M: x86 %sqrt-vector ( dst src rep -- )
     {
         { float-4-rep [ SQRTPS ] }
         { double-2-rep [ SQRTPD ] }
     } case ;
 
+M: x86 %sqrt-vector-reps
+    {
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep } }
+    } available-reps ;
+
 M: x86 %horizontal-add-vector ( dst src rep -- )
     {
         { float-4-rep [ [ MOVAPS ] [ HADDPS ] [ HADDPS ] 2tri ] }
         { double-2-rep [ [ MOVAPD ] [ HADDPD ] 2bi ] }
     } case ;
 
+M: x86 %horizontal-add-vector-reps
+    {
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep short-8-rep uchar-16-rep } }
+    } available-reps ;
+
 M: x86 %unbox-alien ( dst src -- )
     alien-offset [+] MOV ;
 
@@ -775,3 +855,34 @@ M: x86 small-enough? ( n -- ? )
     enable-sse3-simd ;
 
 enable-min/max
+
+:: install-sse-check ( version -- )
+    [
+        sse-version version < [
+            "This image was built to use " write
+            version sse-string write
+            " but your CPU only supports " write
+            sse-version sse-string write "." print
+            "You will need to bootstrap Factor again." print
+            flush
+            1 exit
+        ] when
+    ] "cpu.x86" add-init-hook ;
+
+: enable-sse ( version -- )
+    {
+        { 00 [ ] }
+        { 10 [ ] }
+        { 20 [ enable-sse2 ] }
+        { 30 [ enable-sse3 ] }
+        { 33 [ enable-sse3 ] }
+        { 41 [ enable-sse3 ] }
+        { 42 [ enable-sse3 ] }
+    } case ;
+
+[ { sse_version } compile ] with-optimizer
+
+"Checking for multimedia extensions: " write sse-version 30 min
+[ sse-string write " detected" print ]
+[ install-sse-check ]
+[ enable-sse ] tri
diff --git a/basis/math/floats/env/x86/x86.factor b/basis/math/floats/env/x86/x86.factor
index e91fc4eda9..e9120567aa 100644
--- a/basis/math/floats/env/x86/x86.factor
+++ b/basis/math/floats/env/x86/x86.factor
@@ -31,9 +31,7 @@ M: x87-env (set-fp-env-register)
     set_x87_env ;
 
 M: x86 (fp-env-registers)
-    sse-version 20 >=
-    [ <sse-env> <x87-env> 2array ]
-    [ <x87-env> 1array ] if ;
+    sse2? [ <sse-env> <x87-env> 2array ] [ <x87-env> 1array ] if ;
 
 CONSTANT: sse-exception-flag-bits HEX: 3f
 CONSTANT: sse-exception-flag>bit
diff --git a/basis/math/vectors/simd/functor/functor.factor b/basis/math/vectors/simd/functor/functor.factor
index 641585a5d7..a97dc192be 100644
--- a/basis/math/vectors/simd/functor/functor.factor
+++ b/basis/math/vectors/simd/functor/functor.factor
@@ -1,24 +1,94 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors alien.c-types byte-arrays classes functors
-kernel math parser prettyprint.custom sequences
-sequences.private literals ;
+USING: accessors alien.c-types assocs byte-arrays classes
+effects fry functors generalizations kernel literals locals
+math math.functions math.vectors math.vectors.simd.intrinsics
+math.vectors.specialization parser prettyprint.custom sequences
+sequences.private strings words definitions macros cpu.architecture ;
 IN: math.vectors.simd.functor
 
 ERROR: bad-length got expected ;
 
+MACRO: simd-boa ( rep class -- simd-array )
+    [ rep-components ] [ new ] bi* '[ _ _ nsequence ] ;
+
+:: define-boa-custom-inlining ( word rep class -- )
+    word [
+        drop
+        rep rep rep-gather-word supported-simd-op? [
+            [ rep (simd-boa) class boa ]
+        ] [ word def>> ] if
+    ] "custom-inlining" set-word-prop ;
+
+: simd-with ( rep class x -- simd-array )
+    [ rep-components ] [ new ] [ '[ _ ] ] tri* swap replicate-as ; inline
+
+:: define-with-custom-inlining ( word rep class -- )
+    word [
+        drop
+        rep \ (simd-broadcast) supported-simd-op? [
+            [ rep rep-coerce rep (simd-broadcast) class boa ]
+        ] [ word def>> ] if
+    ] "custom-inlining" set-word-prop ;
+
+: boa-effect ( rep n -- effect )
+    [ rep-components ] dip *
+    [ CHAR: a + 1string ] map
+    { "simd-vector" } <effect> ;
+
+: supported-simd-ops ( assoc rep -- assoc' )
+    [
+        {
+            { v+ (simd-v+) }
+            { v- (simd-v-) }
+            { v* (simd-v*) }
+            { v/ (simd-v/) }
+            { vmin (simd-vmin) }
+            { vmax (simd-vmax) }
+            { sum (simd-sum) }
+        }
+    ] dip 
+    '[ nip _ swap supported-simd-op? ] assoc-filter
+    '[ drop _ key? ] assoc-filter ;
+
+:: high-level-ops ( ctor -- assoc )
+    ! Some SIMD operations are defined in terms of others.
+    {
+        { vneg [ [ dup v- ] keep v- ] }
+        { v. [ v* sum ] }
+        { n+v [ [ ctor execute ] dip v+ ] }
+        { v+n [ ctor execute v+ ] }
+        { n-v [ [ ctor execute ] dip v- ] }
+        { v-n [ ctor execute v- ] }
+        { n*v [ [ ctor execute ] dip v* ] }
+        { v*n [ ctor execute v* ] }
+        { n/v [ [ ctor execute ] dip v/ ] }
+        { v/n [ ctor execute v/ ] }
+        { norm-sq [ dup v. assert-positive ] }
+        { norm [ norm-sq sqrt ] }
+        { normalize [ dup norm v/n ] }
+        { distance [ v- norm ] }
+    } ;
+
+:: simd-vector-words ( class ctor rep assoc -- )
+    class
+    rep rep-component-type c-type-boxed-class
+    assoc rep supported-simd-ops
+    ctor high-level-ops assoc-union
+    specialize-vector-words ;
+
 FUNCTOR: define-simd-128 ( T -- )
 
-T-TYPE       IS ${T}
-
-N            [ 16 T-TYPE heap-size /i ]
+N            [ 16 T heap-size /i ]
 
 A            DEFINES-CLASS ${T}-${N}
+A-boa        DEFINES ${A}-boa
+A-with       DEFINES ${A}-with
 >A           DEFINES >${A}
 A{           DEFINES ${A}{
 
-NTH          [ T-TYPE dup c-type-getter-boxer array-accessor ]
-SET-NTH      [ T-TYPE dup c-setter array-accessor ]
+NTH          [ T dup c-type-getter-boxer array-accessor ]
+SET-NTH      [ T dup c-setter array-accessor ]
 
 A-rep        IS ${A}-rep
 A-vv->v-op   DEFINES-PRIVATE ${A}-vv->v-op
@@ -59,6 +129,16 @@ M: A pprint* pprint-object ;
 
 SYNTAX: A{ \ } [ >A ] parse-literal ;
 
+: A-with ( x -- simd-array ) [ A-rep A ] dip simd-with ;
+
+\ A-with \ A-rep \ A define-with-custom-inlining
+
+\ A-boa [ \ A-rep \ A simd-boa ] \ A-rep 1 boa-effect define-declared
+
+\ A-rep rep-gather-word [
+    \ A-boa \ A-rep \ A define-boa-custom-inlining
+] when
+
 INSTANCE: A sequence
 
 <PRIVATE
@@ -69,6 +149,16 @@ INSTANCE: A sequence
 : A-v->n-op ( v quot -- n )
     [ underlying>> A-rep ] dip call ; inline
 
+\ A \ A-with \ A-rep H{
+    { v+ [ [ (simd-v+) ] \ A-vv->v-op execute ] }
+    { v- [ [ (simd-v-) ] \ A-vv->v-op execute ] }
+    { v* [ [ (simd-v*) ] \ A-vv->v-op execute ] }
+    { v/ [ [ (simd-v/) ] \ A-vv->v-op execute ] }
+    { vmin [ [ (simd-vmin) ] \ A-vv->v-op execute ] }
+    { vmax [ [ (simd-vmax) ] \ A-vv->v-op execute ] }
+    { sum [ [ (simd-sum) ] \ A-v->n-op execute ] }
+} simd-vector-words
+
 PRIVATE>
 
 ;FUNCTOR
@@ -76,14 +166,16 @@ PRIVATE>
 ! Synthesize 256-bit vectors from a pair of 128-bit vectors
 FUNCTOR: define-simd-256 ( T -- )
 
-T-TYPE       IS ${T}
-
-N            [ 32 T-TYPE heap-size /i ]
+N            [ 32 T heap-size /i ]
 
 N/2          [ N 2 / ]
 A/2          IS ${T}-${N/2}
+A/2-boa      IS ${A/2}-boa
+A/2-with     IS ${A/2}-with
 
 A            DEFINES-CLASS ${T}-${N}
+A-boa        DEFINES ${A}-boa
+A-with       DEFINES ${A}-with
 >A           DEFINES >${A}
 A{           DEFINES ${A}{
 
@@ -137,6 +229,16 @@ M: A >pprint-sequence ;
 
 M: A pprint* pprint-object ;
 
+: A-with ( x -- simd-array )
+    [ A/2-with ] [ A/2-with ] bi [ underlying>> ] bi@
+    \ A boa ; inline
+
+: A-boa ( ... -- simd-array )
+    [ A/2-boa ] N/2 ndip A/2-boa [ underlying>> ] bi@
+    \ A boa ;
+
+\ A-rep 2 boa-effect \ A-boa set-stack-effect
+
 INSTANCE: A sequence
 
 : A-vv->v-op ( v1 v2 quot -- v3 )
@@ -148,4 +250,14 @@ INSTANCE: A sequence
     [ [ [ underlying1>> ] [ underlying2>> ] bi A-rep ] dip call A-rep ]
     dip call ; inline
 
+\ A \ A-with \ A-rep H{
+    { v+ [ [ (simd-v+) ] \ A-vv->v-op execute ] }
+    { v- [ [ (simd-v-) ] \ A-vv->v-op execute ] }
+    { v* [ [ (simd-v*) ] \ A-vv->v-op execute ] }
+    { v/ [ [ (simd-v/) ] \ A-vv->v-op execute ] }
+    { vmin [ [ (simd-vmin) ] \ A-vv->v-op execute ] }
+    { vmax [ [ (simd-vmax) ] \ A-vv->v-op execute ] }
+    { sum [ [ (simd-v+) ] [ (simd-sum) ] \ A-v->n-op execute ] }
+} simd-vector-words
+
 ;FUNCTOR
diff --git a/basis/math/vectors/simd/intrinsics/intrinsics-tests.factor b/basis/math/vectors/simd/intrinsics/intrinsics-tests.factor
new file mode 100644
index 0000000000..84eee935a0
--- /dev/null
+++ b/basis/math/vectors/simd/intrinsics/intrinsics-tests.factor
@@ -0,0 +1,18 @@
+IN: math.vectors.simd.intrinsics.tests
+USING: math.vectors.simd.intrinsics cpu.architecture tools.test ;
+
+[ 16 ] [ uchar-16-rep rep-components ] unit-test
+[ 16 ] [ char-16-rep rep-components ] unit-test
+[ 8 ] [ ushort-8-rep rep-components ] unit-test
+[ 8 ] [ short-8-rep rep-components ] unit-test
+[ 4 ] [ uint-4-rep rep-components ] unit-test
+[ 4 ] [ int-4-rep rep-components ] unit-test
+[ 4 ] [ float-4-rep rep-components ] unit-test
+[ 2 ] [ double-2-rep rep-components ] unit-test
+
+{ 4 1 } [ uint-4-rep (simd-boa) ] must-infer-as
+{ 4 1 } [ int-4-rep (simd-boa) ] must-infer-as
+{ 4 1 } [ float-4-rep (simd-boa) ] must-infer-as
+{ 2 1 } [ double-2-rep (simd-boa) ] must-infer-as
+
+
diff --git a/basis/math/vectors/simd/intrinsics/intrinsics.factor b/basis/math/vectors/simd/intrinsics/intrinsics.factor
index 914d1ef169..a7d019af81 100644
--- a/basis/math/vectors/simd/intrinsics/intrinsics.factor
+++ b/basis/math/vectors/simd/intrinsics/intrinsics.factor
@@ -1,6 +1,8 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: kernel alien alien.data cpu.architecture libc ;
+USING: alien alien.c-types alien.data assocs combinators
+cpu.architecture fry generalizations kernel libc macros math
+sequences ;
 IN: math.vectors.simd.intrinsics
 
 ERROR: bad-simd-call ;
@@ -26,3 +28,53 @@ ERROR: bad-simd-call ;
     ! Inefficient version for when intrinsics are missing
     [ swap <displaced-alien> swap ] dip rep-size memcpy ;
 
+<<
+
+: rep-components ( rep -- n )
+    16 swap rep-component-type heap-size /i ; foldable
+
+: rep-coercer ( rep -- quot )
+    {
+        { [ dup int-vector-rep? ] [ [ >fixnum ] ] }
+        { [ dup float-vector-rep? ] [ [ >float ] ] }
+    } cond nip ; foldable
+
+: rep-coerce ( value rep -- value' )
+    rep-coercer call( value -- value' ) ; inline
+
+CONSTANT: rep-gather-words
+    {
+        { 2 (simd-gather-2) }
+        { 4 (simd-gather-4) }
+    }
+
+: rep-gather-word ( rep -- word )
+    rep-components rep-gather-words at ;
+
+>>
+
+MACRO: (simd-boa) ( rep -- quot )
+    {
+        [ rep-coercer ]
+        [ rep-components ]
+        [ ]
+        [ rep-gather-word ]
+    } cleave
+    '[ _ _ napply _ _ execute ] ;
+
+GENERIC# supported-simd-op? 1 ( rep intrinsic -- ? )
+
+M: vector-rep supported-simd-op?
+    {
+        { \ (simd-v+)        [ %add-vector-reps            ] }
+        { \ (simd-v-)        [ %sub-vector-reps            ] }
+        { \ (simd-v*)        [ %mul-vector-reps            ] }
+        { \ (simd-v/)        [ %div-vector-reps            ] }
+        { \ (simd-vmin)      [ %min-vector-reps            ] }
+        { \ (simd-vmax)      [ %max-vector-reps            ] }
+        { \ (simd-vsqrt)     [ %sqrt-vector-reps           ] }
+        { \ (simd-sum)       [ %horizontal-add-vector-reps ] }
+        { \ (simd-broadcast) [ %broadcast-vector-reps      ] }
+        { \ (simd-gather-2)  [ %gather-vector-2-reps       ] }
+        { \ (simd-gather-4)  [ %gather-vector-4-reps       ] }
+    } case member? ;
diff --git a/basis/math/vectors/simd/simd-docs.factor b/basis/math/vectors/simd/simd-docs.factor
index b110de1de8..d6131b3a71 100644
--- a/basis/math/vectors/simd/simd-docs.factor
+++ b/basis/math/vectors/simd/simd-docs.factor
@@ -43,22 +43,6 @@ $nl
 }
 "The " { $link float-4 } " and " { $link double-2 } " types correspond to 128-bit vector registers. The " { $link float-8 } " and " { $link double-4 } " types are not directly supported in hardware, and instead unbox to a pair of 128-bit vector registers."
 $nl
-"Operations on " { $link float-4 } " instances:"
-{ $subsection float-4-with }
-{ $subsection float-4-boa }
-{ $subsection POSTPONE: float-4{ }
-"Operations on " { $link double-2 } " instances:"
-{ $subsection double-2-with }
-{ $subsection double-2-boa }
-{ $subsection POSTPONE: double-2{ }
-"Operations on " { $link float-8 } " instances:"
-{ $subsection float-8-with }
-{ $subsection float-8-boa }
-{ $subsection POSTPONE: float-8{ }
-"Operations on " { $link double-4 } " instances:"
-{ $subsection double-4-with }
-{ $subsection double-4-boa }
-{ $subsection POSTPONE: double-4{ }
 "To actually perform vector arithmetic on SIMD vectors, use " { $link "math-vectors" } " words."
 { $see-also "c-types-specs" } ;
 
@@ -184,72 +168,4 @@ ARTICLE: "math.vectors.simd" "Hardware vector arithmetic (SIMD)"
 { $subsection "math.vectors.simd.alien" }
 { $subsection "math.vectors.simd.intrinsics" } ;
 
-! ! ! float-4
-
-HELP: float-4
-{ $class-description "A sequence of four single-precision floating point values. New instances can be created with " { $link float-4-with } " or " { $link float-4-boa } "." } ;
-
-HELP: float-4-with
-{ $values { "x" float } { "simd-array" float-4 } }
-{ $description "Creates a new vector with all four components equal to a scalar." } ;
-
-HELP: float-4-boa
-{ $values { "a" float } { "b" float } { "c" float } { "d" float } { "simd-array" float-4 } }
-{ $description "Creates a new vector from four scalar components." } ;
-
-HELP: float-4{
-{ $syntax "float-4{ a b c d }" }
-{ $description "Literal syntax for a " { $link float-4 } "." } ;
-
-! ! ! double-2
-
-HELP: double-2
-{ $class-description "A sequence of two double-precision floating point values. New instances can be created with " { $link double-2-with } " or " { $link double-2-boa } "." } ;
-
-HELP: double-2-with
-{ $values { "x" float } { "simd-array" double-2 } }
-{ $description "Creates a new vector with both components equal to a scalar." } ;
-
-HELP: double-2-boa
-{ $values { "a" float } { "b" float } { "simd-array" double-2 } }
-{ $description "Creates a new vector from two scalar components." } ;
-
-HELP: double-2{
-{ $syntax "double-2{ a b }" }
-{ $description "Literal syntax for a " { $link double-2 } "." } ;
-
-! ! ! float-8
-
-HELP: float-8
-{ $class-description "A sequence of eight single-precision floating point values. New instances can be created with " { $link float-8-with } " or " { $link float-8-boa } "." } ;
-
-HELP: float-8-with
-{ $values { "x" float } { "simd-array" float-8 } }
-{ $description "Creates a new vector with all eight components equal to a scalar." } ;
-
-HELP: float-8-boa
-{ $values { "a" float } { "b" float } { "c" float } { "d" float } { "e" float } { "f" float } { "g" float } { "h" float } { "simd-array" float-8 } }
-{ $description "Creates a new vector from eight scalar components." } ;
-
-HELP: float-8{
-{ $syntax "float-8{ a b c d e f g h }" }
-{ $description "Literal syntax for a " { $link float-8 } "." } ;
-
-! ! ! double-4
-
-HELP: double-4
-{ $class-description "A sequence of four double-precision floating point values. New instances can be created with " { $link double-4-with } " or " { $link double-4-boa } "." } ;
-
-HELP: double-4-with
-{ $values { "x" float } { "simd-array" double-4 } }
-{ $description "Creates a new vector with all four components equal to a scalar." } ;
-
-HELP: double-4-boa
-{ $values { "a" float } { "b" float } { "c" float } { "d" float } { "simd-array" double-4 } }
-{ $description "Creates a new vector from four scalar components." } ;
-
-HELP: double-4{
-{ $syntax "double-4{ a b c d }" }
-{ $description "Literal syntax for a " { $link double-4 } "." } ;
-
 ABOUT: "math.vectors.simd"
diff --git a/basis/math/vectors/simd/simd.factor b/basis/math/vectors/simd/simd.factor
index a3c99ae217..c5e7d6f75d 100644
--- a/basis/math/vectors/simd/simd.factor
+++ b/basis/math/vectors/simd/simd.factor
@@ -1,185 +1,15 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors alien.c-types byte-arrays cpu.architecture
-kernel math math.functions math.vectors
-math.vectors.simd.functor math.vectors.simd.intrinsics
-math.vectors.specialization parser prettyprint.custom sequences
-sequences.private locals assocs words fry ;
-FROM: alien.c-types => float ;
-QUALIFIED-WITH: math m
+USING: alien.c-types cpu.architecture kernel
+math.vectors.simd.functor vocabs.loader ;
+FROM: sequences => each ;
 IN: math.vectors.simd
 
 <<
 
-DEFER: float-4
-DEFER: double-2
-DEFER: float-8
-DEFER: double-4
-
-"double" define-simd-128
-"float"  define-simd-128
-"double" define-simd-256
-"float"  define-simd-256
+{ double float char uchar short ushort int uint }
+[ [ define-simd-128 ] [ define-simd-256 ] bi ] each
 
 >>
 
-: float-4-with ( x -- simd-array )
-    [ 4 ] dip >float '[ _ ] \ float-4 new replicate-as ;
-
-: float-4-boa ( a b c d -- simd-array )
-    \ float-4 new 4sequence ;
-
-: double-2-with ( x -- simd-array )
-    [ 2 ] dip >float '[ _ ] \ double-2 new replicate-as ;
-
-: double-2-boa ( a b -- simd-array )
-    \ double-2 new 2sequence ;
-
-! More efficient expansions for the above, used when SIMD is
-! actually available.
-
-<<
-
-\ float-4-with [
-    drop
-    \ (simd-broadcast) "intrinsic" word-prop [
-        [ >float float-4-rep (simd-broadcast) \ float-4 boa ]
-    ] [ \ float-4-with def>> ] if
-] "custom-inlining" set-word-prop
-
-\ float-4-boa [
-    drop
-    \ (simd-gather-4) "intrinsic" word-prop [
-        [| a b c d |
-            a >float b >float c >float d >float
-            float-4-rep (simd-gather-4) \ float-4 boa
-        ]
-    ] [ \ float-4-boa def>> ] if
-] "custom-inlining" set-word-prop
-
-\ double-2-with [
-    drop
-    \ (simd-broadcast) "intrinsic" word-prop [
-        [ >float double-2-rep (simd-broadcast) \ double-2 boa ]
-    ] [ \ double-2-with def>> ] if
-] "custom-inlining" set-word-prop
-
-\ double-2-boa [
-    drop
-    \ (simd-gather-4) "intrinsic" word-prop [
-        [ [ >float ] bi@ double-2-rep (simd-gather-2) \ double-2 boa ]
-    ] [ \ double-2-boa def>> ] if
-] "custom-inlining" set-word-prop
-
->>
-
-: float-8-with ( x -- simd-array )
-    [ float-4-with ] [ float-4-with ] bi [ underlying>> ] bi@
-    \ float-8 boa ; inline
-
-:: float-8-boa ( a b c d e f g h -- simd-array )
-    a b c d float-4-boa
-    e f g h float-4-boa
-    [ underlying>> ] bi@
-    \ float-8 boa ; inline
-
-: double-4-with ( x -- simd-array )
-    [ double-2-with ] [ double-2-with ] bi [ underlying>> ] bi@
-    \ double-4 boa ; inline
-
-:: double-4-boa ( a b c d -- simd-array )
-    a b double-2-boa
-    c d double-2-boa
-    [ underlying>> ] bi@
-    \ double-4 boa ; inline
-
-<<
-
-<PRIVATE
-
-! Filter out operations that are not available, eg horizontal adds
-! on SSE2. Fallback code in math.vectors is used in that case.
-
-: supported-simd-ops ( assoc -- assoc' )
-    {
-        { v+ (simd-v+) }
-        { v- (simd-v-) }
-        { v* (simd-v*) }
-        { v/ (simd-v/) }
-        { vmin (simd-vmin) }
-        { vmax (simd-vmax) }
-        { sum (simd-sum) }
-    } [ nip "intrinsic" word-prop ] assoc-filter
-    '[ drop _ key? ] assoc-filter ;
-
-! Some SIMD operations are defined in terms of others.
-
-:: high-level-ops ( ctor -- assoc )
-    {
-        { vneg [ [ dup v- ] keep v- ] }
-        { v. [ v* sum ] }
-        { n+v [ [ ctor execute ] dip v+ ] }
-        { v+n [ ctor execute v+ ] }
-        { n-v [ [ ctor execute ] dip v- ] }
-        { v-n [ ctor execute v- ] }
-        { n*v [ [ ctor execute ] dip v* ] }
-        { v*n [ ctor execute v* ] }
-        { n/v [ [ ctor execute ] dip v/ ] }
-        { v/n [ ctor execute v/ ] }
-        { norm-sq [ dup v. assert-positive ] }
-        { norm [ norm-sq sqrt ] }
-        { normalize [ dup norm v/n ] }
-        { distance [ v- norm ] }
-    } ;
-
-:: simd-vector-words ( class ctor elt-type assoc -- )
-    class elt-type assoc supported-simd-ops ctor high-level-ops assoc-union
-    specialize-vector-words ;
-
-PRIVATE>
-
-\ float-4 \ float-4-with m:float H{
-    { v+ [ [ (simd-v+) ] float-4-vv->v-op ] }
-    { v- [ [ (simd-v-) ] float-4-vv->v-op ] }
-    { v* [ [ (simd-v*) ] float-4-vv->v-op ] }
-    { v/ [ [ (simd-v/) ] float-4-vv->v-op ] }
-    { vmin [ [ (simd-vmin) ] float-4-vv->v-op ] }
-    { vmax [ [ (simd-vmax) ] float-4-vv->v-op ] }
-    { sum [ [ (simd-sum) ] float-4-v->n-op ] }
-} simd-vector-words
-
-\ double-2 \ double-2-with m:float H{
-    { v+ [ [ (simd-v+) ] double-2-vv->v-op ] }
-    { v- [ [ (simd-v-) ] double-2-vv->v-op ] }
-    { v* [ [ (simd-v*) ] double-2-vv->v-op ] }
-    { v/ [ [ (simd-v/) ] double-2-vv->v-op ] }
-    { vmin [ [ (simd-vmin) ] double-2-vv->v-op ] }
-    { vmax [ [ (simd-vmax) ] double-2-vv->v-op ] }
-    { sum [ [ (simd-sum) ] double-2-v->n-op ] }
-} simd-vector-words
-
-\ float-8 \ float-8-with m:float H{
-    { v+ [ [ (simd-v+) ] float-8-vv->v-op ] }
-    { v- [ [ (simd-v-) ] float-8-vv->v-op ] }
-    { v* [ [ (simd-v*) ] float-8-vv->v-op ] }
-    { v/ [ [ (simd-v/) ] float-8-vv->v-op ] }
-    { vmin [ [ (simd-vmin) ] float-8-vv->v-op ] }
-    { vmax [ [ (simd-vmax) ] float-8-vv->v-op ] }
-    { sum [ [ (simd-sum) ] [ + ] float-8-v->n-op ] }
-} simd-vector-words
-
-\ double-4 \ double-4-with m:float H{
-    { v+ [ [ (simd-v+) ] double-4-vv->v-op ] }
-    { v- [ [ (simd-v-) ] double-4-vv->v-op ] }
-    { v* [ [ (simd-v*) ] double-4-vv->v-op ] }
-    { v/ [ [ (simd-v/) ] double-4-vv->v-op ] }
-    { vmin [ [ (simd-vmin) ] double-4-vv->v-op ] }
-    { vmax [ [ (simd-vmax) ] double-4-vv->v-op ] }
-    { sum [ [ (simd-v+) ] [ (simd-sum) ] double-4-v->n-op ] }
-} simd-vector-words
-
->>
-
-USE: vocabs.loader
-
 "math.vectors.simd.alien" require
diff --git a/basis/math/vectors/simd/summary.txt b/basis/math/vectors/simd/summary.txt
new file mode 100644
index 0000000000..22593f1286
--- /dev/null
+++ b/basis/math/vectors/simd/summary.txt
@@ -0,0 +1 @@
+Single-instruction-multiple-data parallel vector operations