From e77341b90cb47978046c12bdbba1ead2de20a337 Mon Sep 17 00:00:00 2001
From: Slava Pestov <slava@slava-pestovs-macbook-pro.local>
Date: Sun, 20 Sep 2009 02:08:32 -0500
Subject: [PATCH 01/15] math.vectors.simd: redesign to be more flexible,
 integer SIMD work in progress

---
 basis/alien/c-types/c-types.factor            |   8 +
 basis/cpu/architecture/architecture.factor    |  34 +++-
 basis/cpu/x86/features/features.factor        |  49 ++---
 basis/cpu/x86/x86.factor                      | 115 ++++++++++-
 basis/math/floats/env/x86/x86.factor          |   4 +-
 .../math/vectors/simd/functor/functor.factor  | 134 +++++++++++--
 .../simd/intrinsics/intrinsics-tests.factor   |  18 ++
 .../vectors/simd/intrinsics/intrinsics.factor |  54 +++++-
 basis/math/vectors/simd/simd-docs.factor      |  84 --------
 basis/math/vectors/simd/simd.factor           | 180 +-----------------
 basis/math/vectors/simd/summary.txt           |   1 +
 11 files changed, 363 insertions(+), 318 deletions(-)
 create mode 100644 basis/math/vectors/simd/intrinsics/intrinsics-tests.factor
 create mode 100644 basis/math/vectors/simd/summary.txt

diff --git a/basis/alien/c-types/c-types.factor b/basis/alien/c-types/c-types.factor
index fa27e29c04..afbb664fed 100755
--- a/basis/alien/c-types/c-types.factor
+++ b/basis/alien/c-types/c-types.factor
@@ -472,3 +472,11 @@ SYMBOLS:
     \ ulong \ size_t typedef
 ] with-compilation-unit
 
+M: char-16-rep rep-component-type drop char ;
+M: uchar-16-rep rep-component-type drop uchar ;
+M: short-8-rep rep-component-type drop short ;
+M: ushort-8-rep rep-component-type drop ushort ;
+M: int-4-rep rep-component-type drop int ;
+M: uint-4-rep rep-component-type drop uint ;
+M: float-4-rep rep-component-type drop float ;
+M: double-2-rep rep-component-type drop double ;
diff --git a/basis/cpu/architecture/architecture.factor b/basis/cpu/architecture/architecture.factor
index d6611c3384..61e4e2df37 100644
--- a/basis/cpu/architecture/architecture.factor
+++ b/basis/cpu/architecture/architecture.factor
@@ -22,8 +22,6 @@ SINGLETONS: float-rep double-rep ;
 
 ! On x86, floating point registers are really vector registers
 SINGLETONS:
-float-4-rep
-double-2-rep
 char-16-rep
 uchar-16-rep
 short-8-rep
@@ -31,9 +29,11 @@ ushort-8-rep
 int-4-rep
 uint-4-rep ;
 
-UNION: vector-rep
+SINGLETONS:
 float-4-rep
-double-2-rep
+double-2-rep ;
+
+UNION: int-vector-rep
 char-16-rep
 uchar-16-rep
 short-8-rep
@@ -41,6 +41,14 @@ ushort-8-rep
 int-4-rep
 uint-4-rep ;
 
+UNION: float-vector-rep
+float-4-rep
+double-2-rep ;
+
+UNION: vector-rep
+int-vector-rep
+float-vector-rep ;
+
 UNION: representation
 any-rep
 tagged-rep
@@ -76,10 +84,15 @@ M: double-rep rep-size drop 8 ;
 M: stack-params rep-size drop cell ;
 M: vector-rep rep-size drop 16 ;
 
+GENERIC: rep-component-type ( rep -- n )
+
+! Methods defined in alien.c-types
+
 GENERIC: scalar-rep-of ( rep -- rep' )
 
 M: float-4-rep scalar-rep-of drop float-rep ;
 M: double-2-rep scalar-rep-of drop double-rep ;
+M: int-vector-rep scalar-rep-of drop int-rep ;
 
 ! Mapping from register class to machine registers
 HOOK: machine-registers cpu ( -- assoc )
@@ -167,7 +180,6 @@ HOOK: %unbox-vector cpu ( dst src rep -- )
 HOOK: %broadcast-vector cpu ( dst src rep -- )
 HOOK: %gather-vector-2 cpu ( dst src1 src2 rep -- )
 HOOK: %gather-vector-4 cpu ( dst src1 src2 src3 src4 rep -- )
-
 HOOK: %add-vector cpu ( dst src1 src2 rep -- )
 HOOK: %sub-vector cpu ( dst src1 src2 rep -- )
 HOOK: %mul-vector cpu ( dst src1 src2 rep -- )
@@ -177,6 +189,18 @@ HOOK: %max-vector cpu ( dst src1 src2 rep -- )
 HOOK: %sqrt-vector cpu ( dst src rep -- )
 HOOK: %horizontal-add-vector cpu ( dst src rep -- )
 
+HOOK: %broadcast-vector-reps cpu ( -- reps )
+HOOK: %gather-vector-2-reps cpu ( -- reps )
+HOOK: %gather-vector-4-reps cpu ( -- reps )
+HOOK: %add-vector-reps cpu ( -- reps )
+HOOK: %sub-vector-reps cpu ( -- reps )
+HOOK: %mul-vector-reps cpu ( -- reps )
+HOOK: %div-vector-reps cpu ( -- reps )
+HOOK: %min-vector-reps cpu ( -- reps )
+HOOK: %max-vector-reps cpu ( -- reps )
+HOOK: %sqrt-vector-reps cpu ( -- reps )
+HOOK: %horizontal-add-vector-reps cpu ( -- reps )
+
 HOOK: %unbox-alien cpu ( dst src -- )
 HOOK: %unbox-any-c-ptr cpu ( dst src temp -- )
 HOOK: %box-alien cpu ( dst src temp -- )
diff --git a/basis/cpu/x86/features/features.factor b/basis/cpu/x86/features/features.factor
index c5cf2d470a..5fad4e802c 100644
--- a/basis/cpu/x86/features/features.factor
+++ b/basis/cpu/x86/features/features.factor
@@ -1,7 +1,7 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: system kernel math math.order math.parser namespaces
-alien.c-types alien.syntax combinators locals init io cpu.x86
+USING: system kernel memoize math math.order math.parser
+namespaces alien.c-types alien.syntax combinators locals init io
 compiler compiler.units accessors ;
 IN: cpu.x86.features
 
@@ -13,7 +13,16 @@ FUNCTION: longlong read_timestamp_counter ( ) ;
 
 PRIVATE>
 
-ALIAS: sse-version sse_version
+: sse-version ( -- n )
+    sse_version
+    "sse-version" get string>number [ min ] when* ; foldable
+
+: sse? ( -- ? ) sse-version 10 >= ; foldable
+: sse2? ( -- ? ) sse-version 20 >= ; foldable
+: sse3? ( -- ? ) sse-version 30 >= ; foldable
+: ssse3? ( -- ? ) sse-version 33 >= ; foldable
+: sse4.1? ( -- ? ) sse-version 41 >= ; foldable
+: sse4.2? ( -- ? ) sse-version 42 >= ; foldable
 
 : sse-string ( version -- string )
     {
@@ -32,37 +41,3 @@ M: x86 instruction-count read_timestamp_counter ;
 
 : count-instructions ( quot -- n )
     instruction-count [ call ] dip instruction-count swap - ; inline
-
-USING: cpu.x86.features cpu.x86.features.private ;
-
-:: install-sse-check ( version -- )
-    [
-        sse-version version < [
-            "This image was built to use " write
-            version sse-string write
-            " but your CPU only supports " write
-            sse-version sse-string write "." print
-            "You will need to bootstrap Factor again." print
-            flush
-            1 exit
-        ] when
-    ] "cpu.x86" add-init-hook ;
-
-: enable-sse ( version -- )
-    {
-        { 00 [ ] }
-        { 10 [ ] }
-        { 20 [ enable-sse2 ] }
-        { 30 [ enable-sse3 ] }
-        { 33 [ enable-sse3 ] }
-        { 41 [ enable-sse3 ] }
-        { 42 [ enable-sse3 ] }
-    } case ;
-
-[ { sse_version } compile ] with-optimizer
-
-"Checking for multimedia extensions: " write sse-version
-"sse-version" get [ string>number min ] when*
-[ sse-string write " detected" print ]
-[ install-sse-check ]
-[ enable-sse ] tri
diff --git a/basis/cpu/x86/x86.factor b/basis/cpu/x86/x86.factor
index 04b5308836..4d80862ed3 100644
--- a/basis/cpu/x86/x86.factor
+++ b/basis/cpu/x86/x86.factor
@@ -4,7 +4,8 @@ USING: accessors assocs alien alien.c-types arrays strings
 cpu.x86.assembler cpu.x86.assembler.private cpu.x86.assembler.operands
 cpu.architecture kernel kernel.private math memory namespaces make
 sequences words system layouts combinators math.order fry locals
-compiler.constants byte-arrays
+compiler.constants byte-arrays io macros quotations cpu.x86.features
+cpu.x86.features.private compiler compiler.units init
 compiler.cfg.registers
 compiler.cfg.instructions
 compiler.cfg.intrinsics
@@ -250,12 +251,26 @@ M:: x86 %unbox-vector ( dst src rep -- )
     dst src byte-array-offset [+]
     rep copy-register ;
 
+MACRO: available-reps ( alist -- )
+    ! Each SSE version adds new representations and supports
+    ! all old ones
+    unzip { } [ append ] accumulate rest swap suffix
+    [ [ 1quotation ] map ] bi@ zip
+    reverse [ { } ] suffix
+    '[ _ cond ] ;
+
 M: x86 %broadcast-vector ( dst src rep -- )
     {
         { float-4-rep [ [ MOVSS ] [ drop dup 0 SHUFPS ] 2bi ] }
         { double-2-rep [ [ MOVSD ] [ drop dup UNPCKLPD ] 2bi ] }
     } case ;
 
+M: x86 %broadcast-vector-reps
+    {
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep } }
+    } available-reps ;
+
 M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- )
     rep {
         {
@@ -269,6 +284,11 @@ M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- )
         }
     } case ;
 
+M: x86 %gather-vector-4-reps
+    {
+        { sse? { float-4-rep } }
+    } available-reps ;
+
 M:: x86 %gather-vector-2 ( dst src1 src2 rep -- )
     rep {
         {
@@ -280,6 +300,11 @@ M:: x86 %gather-vector-2 ( dst src1 src2 rep -- )
         }
     } case ;
 
+M: x86 %gather-vector-2-reps
+    {
+        { sse2? { double-2-rep } }
+    } available-reps ;
+
 M: x86 %add-vector ( dst src1 src2 rep -- )
     {
         { float-4-rep [ ADDPS ] }
@@ -292,6 +317,12 @@ M: x86 %add-vector ( dst src1 src2 rep -- )
         { uint-4-rep [ PADDD ] }
     } case drop ;
 
+M: x86 %add-vector-reps
+    {
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
+    } available-reps ;
+
 M: x86 %sub-vector ( dst src1 src2 rep -- )
     {
         { float-4-rep [ SUBPS ] }
@@ -304,43 +335,92 @@ M: x86 %sub-vector ( dst src1 src2 rep -- )
         { uint-4-rep [ PSUBD ] }
     } case drop ;
 
+M: x86 %sub-vector-reps
+    {
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
+    } available-reps ;
+
 M: x86 %mul-vector ( dst src1 src2 rep -- )
     {
         { float-4-rep [ MULPS ] }
         { double-2-rep [ MULPD ] }
-        { int-4-rep [ PMULLW ] }
+        { short-8-rep [ PMULLW ] }
+        { ushort-8-rep [ PMULLW ] }
+        { int-4-rep [ PMULLD ] }
+        { uint-4-rep [ PMULLD ] }
     } case drop ;
 
+M: x86 %mul-vector-reps
+    {
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
+    } available-reps ;
+
 M: x86 %div-vector ( dst src1 src2 rep -- )
     {
         { float-4-rep [ DIVPS ] }
         { double-2-rep [ DIVPD ] }
     } case drop ;
 
+M: x86 %div-vector-reps
+    {
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep } }
+    } available-reps ;
+
 M: x86 %min-vector ( dst src1 src2 rep -- )
     {
         { float-4-rep [ MINPS ] }
         { double-2-rep [ MINPD ] }
+        { uchar-16-rep [ PMINUB ] }
+        { short-8-rep [ PMINSW ] }
     } case drop ;
 
+M: x86 %min-vector-reps
+    {
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep short-8-rep uchar-16-rep } }
+    } available-reps ;
+
 M: x86 %max-vector ( dst src1 src2 rep -- )
     {
         { float-4-rep [ MAXPS ] }
         { double-2-rep [ MAXPD ] }
+        { uchar-16-rep [ PMAXUB ] }
+        { short-8-rep [ PMAXSW ] }
     } case drop ;
 
+M: x86 %max-vector-reps
+    {
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep short-8-rep uchar-16-rep } }
+    } available-reps ;
+
 M: x86 %sqrt-vector ( dst src rep -- )
     {
         { float-4-rep [ SQRTPS ] }
         { double-2-rep [ SQRTPD ] }
     } case ;
 
+M: x86 %sqrt-vector-reps
+    {
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep } }
+    } available-reps ;
+
 M: x86 %horizontal-add-vector ( dst src rep -- )
     {
         { float-4-rep [ [ MOVAPS ] [ HADDPS ] [ HADDPS ] 2tri ] }
         { double-2-rep [ [ MOVAPD ] [ HADDPD ] 2bi ] }
     } case ;
 
+M: x86 %horizontal-add-vector-reps
+    {
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep short-8-rep uchar-16-rep } }
+    } available-reps ;
+
 M: x86 %unbox-alien ( dst src -- )
     alien-offset [+] MOV ;
 
@@ -775,3 +855,34 @@ M: x86 small-enough? ( n -- ? )
     enable-sse3-simd ;
 
 enable-min/max
+
+:: install-sse-check ( version -- )
+    [
+        sse-version version < [
+            "This image was built to use " write
+            version sse-string write
+            " but your CPU only supports " write
+            sse-version sse-string write "." print
+            "You will need to bootstrap Factor again." print
+            flush
+            1 exit
+        ] when
+    ] "cpu.x86" add-init-hook ;
+
+: enable-sse ( version -- )
+    {
+        { 00 [ ] }
+        { 10 [ ] }
+        { 20 [ enable-sse2 ] }
+        { 30 [ enable-sse3 ] }
+        { 33 [ enable-sse3 ] }
+        { 41 [ enable-sse3 ] }
+        { 42 [ enable-sse3 ] }
+    } case ;
+
+[ { sse_version } compile ] with-optimizer
+
+"Checking for multimedia extensions: " write sse-version 30 min
+[ sse-string write " detected" print ]
+[ install-sse-check ]
+[ enable-sse ] tri
diff --git a/basis/math/floats/env/x86/x86.factor b/basis/math/floats/env/x86/x86.factor
index e91fc4eda9..e9120567aa 100644
--- a/basis/math/floats/env/x86/x86.factor
+++ b/basis/math/floats/env/x86/x86.factor
@@ -31,9 +31,7 @@ M: x87-env (set-fp-env-register)
     set_x87_env ;
 
 M: x86 (fp-env-registers)
-    sse-version 20 >=
-    [ <sse-env> <x87-env> 2array ]
-    [ <x87-env> 1array ] if ;
+    sse2? [ <sse-env> <x87-env> 2array ] [ <x87-env> 1array ] if ;
 
 CONSTANT: sse-exception-flag-bits HEX: 3f
 CONSTANT: sse-exception-flag>bit
diff --git a/basis/math/vectors/simd/functor/functor.factor b/basis/math/vectors/simd/functor/functor.factor
index 641585a5d7..a97dc192be 100644
--- a/basis/math/vectors/simd/functor/functor.factor
+++ b/basis/math/vectors/simd/functor/functor.factor
@@ -1,24 +1,94 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors alien.c-types byte-arrays classes functors
-kernel math parser prettyprint.custom sequences
-sequences.private literals ;
+USING: accessors alien.c-types assocs byte-arrays classes
+effects fry functors generalizations kernel literals locals
+math math.functions math.vectors math.vectors.simd.intrinsics
+math.vectors.specialization parser prettyprint.custom sequences
+sequences.private strings words definitions macros cpu.architecture ;
 IN: math.vectors.simd.functor
 
 ERROR: bad-length got expected ;
 
+MACRO: simd-boa ( rep class -- simd-array )
+    [ rep-components ] [ new ] bi* '[ _ _ nsequence ] ;
+
+:: define-boa-custom-inlining ( word rep class -- )
+    word [
+        drop
+        rep rep rep-gather-word supported-simd-op? [
+            [ rep (simd-boa) class boa ]
+        ] [ word def>> ] if
+    ] "custom-inlining" set-word-prop ;
+
+: simd-with ( rep class x -- simd-array )
+    [ rep-components ] [ new ] [ '[ _ ] ] tri* swap replicate-as ; inline
+
+:: define-with-custom-inlining ( word rep class -- )
+    word [
+        drop
+        rep \ (simd-broadcast) supported-simd-op? [
+            [ rep rep-coerce rep (simd-broadcast) class boa ]
+        ] [ word def>> ] if
+    ] "custom-inlining" set-word-prop ;
+
+: boa-effect ( rep n -- effect )
+    [ rep-components ] dip *
+    [ CHAR: a + 1string ] map
+    { "simd-vector" } <effect> ;
+
+: supported-simd-ops ( assoc rep -- assoc' )
+    [
+        {
+            { v+ (simd-v+) }
+            { v- (simd-v-) }
+            { v* (simd-v*) }
+            { v/ (simd-v/) }
+            { vmin (simd-vmin) }
+            { vmax (simd-vmax) }
+            { sum (simd-sum) }
+        }
+    ] dip 
+    '[ nip _ swap supported-simd-op? ] assoc-filter
+    '[ drop _ key? ] assoc-filter ;
+
+:: high-level-ops ( ctor -- assoc )
+    ! Some SIMD operations are defined in terms of others.
+    {
+        { vneg [ [ dup v- ] keep v- ] }
+        { v. [ v* sum ] }
+        { n+v [ [ ctor execute ] dip v+ ] }
+        { v+n [ ctor execute v+ ] }
+        { n-v [ [ ctor execute ] dip v- ] }
+        { v-n [ ctor execute v- ] }
+        { n*v [ [ ctor execute ] dip v* ] }
+        { v*n [ ctor execute v* ] }
+        { n/v [ [ ctor execute ] dip v/ ] }
+        { v/n [ ctor execute v/ ] }
+        { norm-sq [ dup v. assert-positive ] }
+        { norm [ norm-sq sqrt ] }
+        { normalize [ dup norm v/n ] }
+        { distance [ v- norm ] }
+    } ;
+
+:: simd-vector-words ( class ctor rep assoc -- )
+    class
+    rep rep-component-type c-type-boxed-class
+    assoc rep supported-simd-ops
+    ctor high-level-ops assoc-union
+    specialize-vector-words ;
+
 FUNCTOR: define-simd-128 ( T -- )
 
-T-TYPE       IS ${T}
-
-N            [ 16 T-TYPE heap-size /i ]
+N            [ 16 T heap-size /i ]
 
 A            DEFINES-CLASS ${T}-${N}
+A-boa        DEFINES ${A}-boa
+A-with       DEFINES ${A}-with
 >A           DEFINES >${A}
 A{           DEFINES ${A}{
 
-NTH          [ T-TYPE dup c-type-getter-boxer array-accessor ]
-SET-NTH      [ T-TYPE dup c-setter array-accessor ]
+NTH          [ T dup c-type-getter-boxer array-accessor ]
+SET-NTH      [ T dup c-setter array-accessor ]
 
 A-rep        IS ${A}-rep
 A-vv->v-op   DEFINES-PRIVATE ${A}-vv->v-op
@@ -59,6 +129,16 @@ M: A pprint* pprint-object ;
 
 SYNTAX: A{ \ } [ >A ] parse-literal ;
 
+: A-with ( x -- simd-array ) [ A-rep A ] dip simd-with ;
+
+\ A-with \ A-rep \ A define-with-custom-inlining
+
+\ A-boa [ \ A-rep \ A simd-boa ] \ A-rep 1 boa-effect define-declared
+
+\ A-rep rep-gather-word [
+    \ A-boa \ A-rep \ A define-boa-custom-inlining
+] when
+
 INSTANCE: A sequence
 
 <PRIVATE
@@ -69,6 +149,16 @@ INSTANCE: A sequence
 : A-v->n-op ( v quot -- n )
     [ underlying>> A-rep ] dip call ; inline
 
+\ A \ A-with \ A-rep H{
+    { v+ [ [ (simd-v+) ] \ A-vv->v-op execute ] }
+    { v- [ [ (simd-v-) ] \ A-vv->v-op execute ] }
+    { v* [ [ (simd-v*) ] \ A-vv->v-op execute ] }
+    { v/ [ [ (simd-v/) ] \ A-vv->v-op execute ] }
+    { vmin [ [ (simd-vmin) ] \ A-vv->v-op execute ] }
+    { vmax [ [ (simd-vmax) ] \ A-vv->v-op execute ] }
+    { sum [ [ (simd-sum) ] \ A-v->n-op execute ] }
+} simd-vector-words
+
 PRIVATE>
 
 ;FUNCTOR
@@ -76,14 +166,16 @@ PRIVATE>
 ! Synthesize 256-bit vectors from a pair of 128-bit vectors
 FUNCTOR: define-simd-256 ( T -- )
 
-T-TYPE       IS ${T}
-
-N            [ 32 T-TYPE heap-size /i ]
+N            [ 32 T heap-size /i ]
 
 N/2          [ N 2 / ]
 A/2          IS ${T}-${N/2}
+A/2-boa      IS ${A/2}-boa
+A/2-with     IS ${A/2}-with
 
 A            DEFINES-CLASS ${T}-${N}
+A-boa        DEFINES ${A}-boa
+A-with       DEFINES ${A}-with
 >A           DEFINES >${A}
 A{           DEFINES ${A}{
 
@@ -137,6 +229,16 @@ M: A >pprint-sequence ;
 
 M: A pprint* pprint-object ;
 
+: A-with ( x -- simd-array )
+    [ A/2-with ] [ A/2-with ] bi [ underlying>> ] bi@
+    \ A boa ; inline
+
+: A-boa ( ... -- simd-array )
+    [ A/2-boa ] N/2 ndip A/2-boa [ underlying>> ] bi@
+    \ A boa ;
+
+\ A-rep 2 boa-effect \ A-boa set-stack-effect
+
 INSTANCE: A sequence
 
 : A-vv->v-op ( v1 v2 quot -- v3 )
@@ -148,4 +250,14 @@ INSTANCE: A sequence
     [ [ [ underlying1>> ] [ underlying2>> ] bi A-rep ] dip call A-rep ]
     dip call ; inline
 
+\ A \ A-with \ A-rep H{
+    { v+ [ [ (simd-v+) ] \ A-vv->v-op execute ] }
+    { v- [ [ (simd-v-) ] \ A-vv->v-op execute ] }
+    { v* [ [ (simd-v*) ] \ A-vv->v-op execute ] }
+    { v/ [ [ (simd-v/) ] \ A-vv->v-op execute ] }
+    { vmin [ [ (simd-vmin) ] \ A-vv->v-op execute ] }
+    { vmax [ [ (simd-vmax) ] \ A-vv->v-op execute ] }
+    { sum [ [ (simd-v+) ] [ (simd-sum) ] \ A-v->n-op execute ] }
+} simd-vector-words
+
 ;FUNCTOR
diff --git a/basis/math/vectors/simd/intrinsics/intrinsics-tests.factor b/basis/math/vectors/simd/intrinsics/intrinsics-tests.factor
new file mode 100644
index 0000000000..84eee935a0
--- /dev/null
+++ b/basis/math/vectors/simd/intrinsics/intrinsics-tests.factor
@@ -0,0 +1,18 @@
+IN: math.vectors.simd.intrinsics.tests
+USING: math.vectors.simd.intrinsics cpu.architecture tools.test ;
+
+[ 16 ] [ uchar-16-rep rep-components ] unit-test
+[ 16 ] [ char-16-rep rep-components ] unit-test
+[ 8 ] [ ushort-8-rep rep-components ] unit-test
+[ 8 ] [ short-8-rep rep-components ] unit-test
+[ 4 ] [ uint-4-rep rep-components ] unit-test
+[ 4 ] [ int-4-rep rep-components ] unit-test
+[ 4 ] [ float-4-rep rep-components ] unit-test
+[ 2 ] [ double-2-rep rep-components ] unit-test
+
+{ 4 1 } [ uint-4-rep (simd-boa) ] must-infer-as
+{ 4 1 } [ int-4-rep (simd-boa) ] must-infer-as
+{ 4 1 } [ float-4-rep (simd-boa) ] must-infer-as
+{ 2 1 } [ double-2-rep (simd-boa) ] must-infer-as
+
+
diff --git a/basis/math/vectors/simd/intrinsics/intrinsics.factor b/basis/math/vectors/simd/intrinsics/intrinsics.factor
index 914d1ef169..a7d019af81 100644
--- a/basis/math/vectors/simd/intrinsics/intrinsics.factor
+++ b/basis/math/vectors/simd/intrinsics/intrinsics.factor
@@ -1,6 +1,8 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: kernel alien alien.data cpu.architecture libc ;
+USING: alien alien.c-types alien.data assocs combinators
+cpu.architecture fry generalizations kernel libc macros math
+sequences ;
 IN: math.vectors.simd.intrinsics
 
 ERROR: bad-simd-call ;
@@ -26,3 +28,53 @@ ERROR: bad-simd-call ;
     ! Inefficient version for when intrinsics are missing
     [ swap <displaced-alien> swap ] dip rep-size memcpy ;
 
+<<
+
+: rep-components ( rep -- n )
+    16 swap rep-component-type heap-size /i ; foldable
+
+: rep-coercer ( rep -- quot )
+    {
+        { [ dup int-vector-rep? ] [ [ >fixnum ] ] }
+        { [ dup float-vector-rep? ] [ [ >float ] ] }
+    } cond nip ; foldable
+
+: rep-coerce ( value rep -- value' )
+    rep-coercer call( value -- value' ) ; inline
+
+CONSTANT: rep-gather-words
+    {
+        { 2 (simd-gather-2) }
+        { 4 (simd-gather-4) }
+    }
+
+: rep-gather-word ( rep -- word )
+    rep-components rep-gather-words at ;
+
+>>
+
+MACRO: (simd-boa) ( rep -- quot )
+    {
+        [ rep-coercer ]
+        [ rep-components ]
+        [ ]
+        [ rep-gather-word ]
+    } cleave
+    '[ _ _ napply _ _ execute ] ;
+
+GENERIC# supported-simd-op? 1 ( rep intrinsic -- ? )
+
+M: vector-rep supported-simd-op?
+    {
+        { \ (simd-v+)        [ %add-vector-reps            ] }
+        { \ (simd-v-)        [ %sub-vector-reps            ] }
+        { \ (simd-v*)        [ %mul-vector-reps            ] }
+        { \ (simd-v/)        [ %div-vector-reps            ] }
+        { \ (simd-vmin)      [ %min-vector-reps            ] }
+        { \ (simd-vmax)      [ %max-vector-reps            ] }
+        { \ (simd-vsqrt)     [ %sqrt-vector-reps           ] }
+        { \ (simd-sum)       [ %horizontal-add-vector-reps ] }
+        { \ (simd-broadcast) [ %broadcast-vector-reps      ] }
+        { \ (simd-gather-2)  [ %gather-vector-2-reps       ] }
+        { \ (simd-gather-4)  [ %gather-vector-4-reps       ] }
+    } case member? ;
diff --git a/basis/math/vectors/simd/simd-docs.factor b/basis/math/vectors/simd/simd-docs.factor
index b110de1de8..d6131b3a71 100644
--- a/basis/math/vectors/simd/simd-docs.factor
+++ b/basis/math/vectors/simd/simd-docs.factor
@@ -43,22 +43,6 @@ $nl
 }
 "The " { $link float-4 } " and " { $link double-2 } " types correspond to 128-bit vector registers. The " { $link float-8 } " and " { $link double-4 } " types are not directly supported in hardware, and instead unbox to a pair of 128-bit vector registers."
 $nl
-"Operations on " { $link float-4 } " instances:"
-{ $subsection float-4-with }
-{ $subsection float-4-boa }
-{ $subsection POSTPONE: float-4{ }
-"Operations on " { $link double-2 } " instances:"
-{ $subsection double-2-with }
-{ $subsection double-2-boa }
-{ $subsection POSTPONE: double-2{ }
-"Operations on " { $link float-8 } " instances:"
-{ $subsection float-8-with }
-{ $subsection float-8-boa }
-{ $subsection POSTPONE: float-8{ }
-"Operations on " { $link double-4 } " instances:"
-{ $subsection double-4-with }
-{ $subsection double-4-boa }
-{ $subsection POSTPONE: double-4{ }
 "To actually perform vector arithmetic on SIMD vectors, use " { $link "math-vectors" } " words."
 { $see-also "c-types-specs" } ;
 
@@ -184,72 +168,4 @@ ARTICLE: "math.vectors.simd" "Hardware vector arithmetic (SIMD)"
 { $subsection "math.vectors.simd.alien" }
 { $subsection "math.vectors.simd.intrinsics" } ;
 
-! ! ! float-4
-
-HELP: float-4
-{ $class-description "A sequence of four single-precision floating point values. New instances can be created with " { $link float-4-with } " or " { $link float-4-boa } "." } ;
-
-HELP: float-4-with
-{ $values { "x" float } { "simd-array" float-4 } }
-{ $description "Creates a new vector with all four components equal to a scalar." } ;
-
-HELP: float-4-boa
-{ $values { "a" float } { "b" float } { "c" float } { "d" float } { "simd-array" float-4 } }
-{ $description "Creates a new vector from four scalar components." } ;
-
-HELP: float-4{
-{ $syntax "float-4{ a b c d }" }
-{ $description "Literal syntax for a " { $link float-4 } "." } ;
-
-! ! ! double-2
-
-HELP: double-2
-{ $class-description "A sequence of two double-precision floating point values. New instances can be created with " { $link double-2-with } " or " { $link double-2-boa } "." } ;
-
-HELP: double-2-with
-{ $values { "x" float } { "simd-array" double-2 } }
-{ $description "Creates a new vector with both components equal to a scalar." } ;
-
-HELP: double-2-boa
-{ $values { "a" float } { "b" float } { "simd-array" double-2 } }
-{ $description "Creates a new vector from two scalar components." } ;
-
-HELP: double-2{
-{ $syntax "double-2{ a b }" }
-{ $description "Literal syntax for a " { $link double-2 } "." } ;
-
-! ! ! float-8
-
-HELP: float-8
-{ $class-description "A sequence of eight single-precision floating point values. New instances can be created with " { $link float-8-with } " or " { $link float-8-boa } "." } ;
-
-HELP: float-8-with
-{ $values { "x" float } { "simd-array" float-8 } }
-{ $description "Creates a new vector with all eight components equal to a scalar." } ;
-
-HELP: float-8-boa
-{ $values { "a" float } { "b" float } { "c" float } { "d" float } { "e" float } { "f" float } { "g" float } { "h" float } { "simd-array" float-8 } }
-{ $description "Creates a new vector from eight scalar components." } ;
-
-HELP: float-8{
-{ $syntax "float-8{ a b c d e f g h }" }
-{ $description "Literal syntax for a " { $link float-8 } "." } ;
-
-! ! ! double-4
-
-HELP: double-4
-{ $class-description "A sequence of four double-precision floating point values. New instances can be created with " { $link double-4-with } " or " { $link double-4-boa } "." } ;
-
-HELP: double-4-with
-{ $values { "x" float } { "simd-array" double-4 } }
-{ $description "Creates a new vector with all four components equal to a scalar." } ;
-
-HELP: double-4-boa
-{ $values { "a" float } { "b" float } { "c" float } { "d" float } { "simd-array" double-4 } }
-{ $description "Creates a new vector from four scalar components." } ;
-
-HELP: double-4{
-{ $syntax "double-4{ a b c d }" }
-{ $description "Literal syntax for a " { $link double-4 } "." } ;
-
 ABOUT: "math.vectors.simd"
diff --git a/basis/math/vectors/simd/simd.factor b/basis/math/vectors/simd/simd.factor
index a3c99ae217..c5e7d6f75d 100644
--- a/basis/math/vectors/simd/simd.factor
+++ b/basis/math/vectors/simd/simd.factor
@@ -1,185 +1,15 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors alien.c-types byte-arrays cpu.architecture
-kernel math math.functions math.vectors
-math.vectors.simd.functor math.vectors.simd.intrinsics
-math.vectors.specialization parser prettyprint.custom sequences
-sequences.private locals assocs words fry ;
-FROM: alien.c-types => float ;
-QUALIFIED-WITH: math m
+USING: alien.c-types cpu.architecture kernel
+math.vectors.simd.functor vocabs.loader ;
+FROM: sequences => each ;
 IN: math.vectors.simd
 
 <<
 
-DEFER: float-4
-DEFER: double-2
-DEFER: float-8
-DEFER: double-4
-
-"double" define-simd-128
-"float"  define-simd-128
-"double" define-simd-256
-"float"  define-simd-256
+{ double float char uchar short ushort int uint }
+[ [ define-simd-128 ] [ define-simd-256 ] bi ] each
 
 >>
 
-: float-4-with ( x -- simd-array )
-    [ 4 ] dip >float '[ _ ] \ float-4 new replicate-as ;
-
-: float-4-boa ( a b c d -- simd-array )
-    \ float-4 new 4sequence ;
-
-: double-2-with ( x -- simd-array )
-    [ 2 ] dip >float '[ _ ] \ double-2 new replicate-as ;
-
-: double-2-boa ( a b -- simd-array )
-    \ double-2 new 2sequence ;
-
-! More efficient expansions for the above, used when SIMD is
-! actually available.
-
-<<
-
-\ float-4-with [
-    drop
-    \ (simd-broadcast) "intrinsic" word-prop [
-        [ >float float-4-rep (simd-broadcast) \ float-4 boa ]
-    ] [ \ float-4-with def>> ] if
-] "custom-inlining" set-word-prop
-
-\ float-4-boa [
-    drop
-    \ (simd-gather-4) "intrinsic" word-prop [
-        [| a b c d |
-            a >float b >float c >float d >float
-            float-4-rep (simd-gather-4) \ float-4 boa
-        ]
-    ] [ \ float-4-boa def>> ] if
-] "custom-inlining" set-word-prop
-
-\ double-2-with [
-    drop
-    \ (simd-broadcast) "intrinsic" word-prop [
-        [ >float double-2-rep (simd-broadcast) \ double-2 boa ]
-    ] [ \ double-2-with def>> ] if
-] "custom-inlining" set-word-prop
-
-\ double-2-boa [
-    drop
-    \ (simd-gather-4) "intrinsic" word-prop [
-        [ [ >float ] bi@ double-2-rep (simd-gather-2) \ double-2 boa ]
-    ] [ \ double-2-boa def>> ] if
-] "custom-inlining" set-word-prop
-
->>
-
-: float-8-with ( x -- simd-array )
-    [ float-4-with ] [ float-4-with ] bi [ underlying>> ] bi@
-    \ float-8 boa ; inline
-
-:: float-8-boa ( a b c d e f g h -- simd-array )
-    a b c d float-4-boa
-    e f g h float-4-boa
-    [ underlying>> ] bi@
-    \ float-8 boa ; inline
-
-: double-4-with ( x -- simd-array )
-    [ double-2-with ] [ double-2-with ] bi [ underlying>> ] bi@
-    \ double-4 boa ; inline
-
-:: double-4-boa ( a b c d -- simd-array )
-    a b double-2-boa
-    c d double-2-boa
-    [ underlying>> ] bi@
-    \ double-4 boa ; inline
-
-<<
-
-<PRIVATE
-
-! Filter out operations that are not available, eg horizontal adds
-! on SSE2. Fallback code in math.vectors is used in that case.
-
-: supported-simd-ops ( assoc -- assoc' )
-    {
-        { v+ (simd-v+) }
-        { v- (simd-v-) }
-        { v* (simd-v*) }
-        { v/ (simd-v/) }
-        { vmin (simd-vmin) }
-        { vmax (simd-vmax) }
-        { sum (simd-sum) }
-    } [ nip "intrinsic" word-prop ] assoc-filter
-    '[ drop _ key? ] assoc-filter ;
-
-! Some SIMD operations are defined in terms of others.
-
-:: high-level-ops ( ctor -- assoc )
-    {
-        { vneg [ [ dup v- ] keep v- ] }
-        { v. [ v* sum ] }
-        { n+v [ [ ctor execute ] dip v+ ] }
-        { v+n [ ctor execute v+ ] }
-        { n-v [ [ ctor execute ] dip v- ] }
-        { v-n [ ctor execute v- ] }
-        { n*v [ [ ctor execute ] dip v* ] }
-        { v*n [ ctor execute v* ] }
-        { n/v [ [ ctor execute ] dip v/ ] }
-        { v/n [ ctor execute v/ ] }
-        { norm-sq [ dup v. assert-positive ] }
-        { norm [ norm-sq sqrt ] }
-        { normalize [ dup norm v/n ] }
-        { distance [ v- norm ] }
-    } ;
-
-:: simd-vector-words ( class ctor elt-type assoc -- )
-    class elt-type assoc supported-simd-ops ctor high-level-ops assoc-union
-    specialize-vector-words ;
-
-PRIVATE>
-
-\ float-4 \ float-4-with m:float H{
-    { v+ [ [ (simd-v+) ] float-4-vv->v-op ] }
-    { v- [ [ (simd-v-) ] float-4-vv->v-op ] }
-    { v* [ [ (simd-v*) ] float-4-vv->v-op ] }
-    { v/ [ [ (simd-v/) ] float-4-vv->v-op ] }
-    { vmin [ [ (simd-vmin) ] float-4-vv->v-op ] }
-    { vmax [ [ (simd-vmax) ] float-4-vv->v-op ] }
-    { sum [ [ (simd-sum) ] float-4-v->n-op ] }
-} simd-vector-words
-
-\ double-2 \ double-2-with m:float H{
-    { v+ [ [ (simd-v+) ] double-2-vv->v-op ] }
-    { v- [ [ (simd-v-) ] double-2-vv->v-op ] }
-    { v* [ [ (simd-v*) ] double-2-vv->v-op ] }
-    { v/ [ [ (simd-v/) ] double-2-vv->v-op ] }
-    { vmin [ [ (simd-vmin) ] double-2-vv->v-op ] }
-    { vmax [ [ (simd-vmax) ] double-2-vv->v-op ] }
-    { sum [ [ (simd-sum) ] double-2-v->n-op ] }
-} simd-vector-words
-
-\ float-8 \ float-8-with m:float H{
-    { v+ [ [ (simd-v+) ] float-8-vv->v-op ] }
-    { v- [ [ (simd-v-) ] float-8-vv->v-op ] }
-    { v* [ [ (simd-v*) ] float-8-vv->v-op ] }
-    { v/ [ [ (simd-v/) ] float-8-vv->v-op ] }
-    { vmin [ [ (simd-vmin) ] float-8-vv->v-op ] }
-    { vmax [ [ (simd-vmax) ] float-8-vv->v-op ] }
-    { sum [ [ (simd-sum) ] [ + ] float-8-v->n-op ] }
-} simd-vector-words
-
-\ double-4 \ double-4-with m:float H{
-    { v+ [ [ (simd-v+) ] double-4-vv->v-op ] }
-    { v- [ [ (simd-v-) ] double-4-vv->v-op ] }
-    { v* [ [ (simd-v*) ] double-4-vv->v-op ] }
-    { v/ [ [ (simd-v/) ] double-4-vv->v-op ] }
-    { vmin [ [ (simd-vmin) ] double-4-vv->v-op ] }
-    { vmax [ [ (simd-vmax) ] double-4-vv->v-op ] }
-    { sum [ [ (simd-v+) ] [ (simd-sum) ] double-4-v->n-op ] }
-} simd-vector-words
-
->>
-
-USE: vocabs.loader
-
 "math.vectors.simd.alien" require
diff --git a/basis/math/vectors/simd/summary.txt b/basis/math/vectors/simd/summary.txt
new file mode 100644
index 0000000000..22593f1286
--- /dev/null
+++ b/basis/math/vectors/simd/summary.txt
@@ -0,0 +1 @@
+Single-instruction-multiple-data parallel vector operations

From 47d87633406c48f1e7af4f2b5f6cdf21896ea35b Mon Sep 17 00:00:00 2001
From: Slava Pestov <slava@slava-pestovs-macbook-pro.local>
Date: Sun, 20 Sep 2009 16:48:17 -0500
Subject: [PATCH 02/15] More integer SIMD work - move generated vocab support
 from specialized-arrays to vocabs.generated - add fuzz testing to
 math.vectors.simd - add alien type support for integer SIMD vectors - SIMD:
 parsing word generates a SIMD type, instead of pre-generating them all in
 math.vectors.simd

---
 .../tree/propagation/simd/simd.factor         |   1 +
 basis/cpu/x86/x86.factor                      |   3 +-
 .../vectors/simd/alien/alien-tests.factor     |  70 ---
 basis/math/vectors/simd/alien/alien.factor    |  42 --
 .../math/vectors/simd/functor/functor.factor  |  60 +-
 basis/math/vectors/simd/simd-docs.factor      |  65 ++-
 basis/math/vectors/simd/simd-tests.factor     | 544 +++++++-----------
 basis/math/vectors/simd/simd.factor           |  33 +-
 .../specialized-arrays.factor                 |  11 +-
 .../specialized-vectors.factor                |   2 +-
 .../alien => vocabs/generated}/authors.txt    |   0
 basis/vocabs/generated/generated.factor       |  13 +
 extra/benchmark/nbody-simd/nbody-simd.factor  |   1 +
 .../raytracer-simd/raytracer-simd.factor      |   1 +
 extra/benchmark/simd-1/simd-1.factor          |   1 +
 15 files changed, 346 insertions(+), 501 deletions(-)
 delete mode 100644 basis/math/vectors/simd/alien/alien-tests.factor
 delete mode 100644 basis/math/vectors/simd/alien/alien.factor
 rename basis/{math/vectors/simd/alien => vocabs/generated}/authors.txt (100%)
 create mode 100644 basis/vocabs/generated/generated.factor

diff --git a/basis/compiler/tree/propagation/simd/simd.factor b/basis/compiler/tree/propagation/simd/simd.factor
index 3baa7cdcbf..42c1f35617 100644
--- a/basis/compiler/tree/propagation/simd/simd.factor
+++ b/basis/compiler/tree/propagation/simd/simd.factor
@@ -24,6 +24,7 @@ IN: compiler.tree.propagation.simd
         literal>> scalar-rep-of {
             { float-rep [ float ] }
             { double-rep [ float ] }
+            { int-rep [ integer ] }
         } case
     ] [ drop real ] if
     <class-info>
diff --git a/basis/cpu/x86/x86.factor b/basis/cpu/x86/x86.factor
index 4d80862ed3..322b123d99 100644
--- a/basis/cpu/x86/x86.factor
+++ b/basis/cpu/x86/x86.factor
@@ -417,8 +417,7 @@ M: x86 %horizontal-add-vector ( dst src rep -- )
 
 M: x86 %horizontal-add-vector-reps
     {
-        { sse? { float-4-rep } }
-        { sse2? { double-2-rep short-8-rep uchar-16-rep } }
+        { sse3? { float-4-rep double-2-rep } }
     } available-reps ;
 
 M: x86 %unbox-alien ( dst src -- )
diff --git a/basis/math/vectors/simd/alien/alien-tests.factor b/basis/math/vectors/simd/alien/alien-tests.factor
deleted file mode 100644
index 87540dd9a5..0000000000
--- a/basis/math/vectors/simd/alien/alien-tests.factor
+++ /dev/null
@@ -1,70 +0,0 @@
-USING: cpu.architecture math.vectors.simd
-math.vectors.simd.intrinsics accessors math.vectors.simd.alien
-kernel classes.struct tools.test compiler sequences byte-arrays
-alien math kernel.private specialized-arrays combinators ;
-SPECIALIZED-ARRAY: float
-IN: math.vectors.simd.alien.tests
-
-! Vector alien intrinsics
-[ float-4{ 1 2 3 4 } ] [
-    [
-        float-4{ 1 2 3 4 }
-        underlying>> 0 float-4-rep alien-vector
-    ] compile-call float-4 boa
-] unit-test
-
-[ B{ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 } ] [
-    16 [ 1 ] B{ } replicate-as 16 <byte-array>
-    [
-        0 [
-            { byte-array c-ptr fixnum } declare
-            float-4-rep set-alien-vector
-        ] compile-call
-    ] keep
-] unit-test
-
-[ float-array{ 1 2 3 4 } ] [
-    [
-        float-array{ 1 2 3 4 } underlying>>
-        float-array{ 4 3 2 1 } clone
-        [ underlying>> 0 float-4-rep set-alien-vector ] keep
-    ] compile-call
-] unit-test
-
-STRUCT: simd-struct
-{ x float-4 }
-{ y double-2 }
-{ z double-4 }
-{ w float-8 } ;
-
-[ t ] [ [ simd-struct <struct> ] compile-call >c-ptr [ 0 = ] all? ] unit-test
-
-[
-    float-4{ 1 2 3 4 }
-    double-2{ 2 1 }
-    double-4{ 4 3 2 1 }
-    float-8{ 1 2 3 4 5 6 7 8 }
-] [
-    simd-struct <struct>
-    float-4{ 1 2 3 4 } >>x
-    double-2{ 2 1 } >>y
-    double-4{ 4 3 2 1 } >>z
-    float-8{ 1 2 3 4 5 6 7 8 } >>w
-    { [ x>> ] [ y>> ] [ z>> ] [ w>> ] } cleave
-] unit-test
-
-[
-    float-4{ 1 2 3 4 }
-    double-2{ 2 1 }
-    double-4{ 4 3 2 1 }
-    float-8{ 1 2 3 4 5 6 7 8 }
-] [
-    [
-        simd-struct <struct>
-        float-4{ 1 2 3 4 } >>x
-        double-2{ 2 1 } >>y
-        double-4{ 4 3 2 1 } >>z
-        float-8{ 1 2 3 4 5 6 7 8 } >>w
-        { [ x>> ] [ y>> ] [ z>> ] [ w>> ] } cleave
-    ] compile-call
-] unit-test
diff --git a/basis/math/vectors/simd/alien/alien.factor b/basis/math/vectors/simd/alien/alien.factor
deleted file mode 100644
index 1486f6d0af..0000000000
--- a/basis/math/vectors/simd/alien/alien.factor
+++ /dev/null
@@ -1,42 +0,0 @@
-! Copyright (C) 2009 Slava Pestov.
-! See http://factorcode.org/license.txt for BSD license.
-USING: alien accessors alien.c-types byte-arrays compiler.units
-cpu.architecture locals kernel math math.vectors.simd
-math.vectors.simd.intrinsics ;
-IN: math.vectors.simd.alien
-
-:: define-simd-128-type ( class rep -- )
-    <c-type>
-        byte-array >>class
-        class >>boxed-class
-        [ rep alien-vector class boa ] >>getter
-        [ [ underlying>> ] 2dip rep set-alien-vector ] >>setter
-        16 >>size
-        8 >>align
-        rep >>rep
-    class name>> typedef ;
-
-:: define-simd-256-type ( class rep -- )
-    <c-type>
-        class >>class
-        class >>boxed-class
-        [
-            [ rep alien-vector ]
-            [ 16 + >fixnum rep alien-vector ] 2bi
-            class boa
-        ] >>getter
-        [
-            [ [ underlying1>> ] 2dip rep set-alien-vector ]
-            [ [ underlying2>> ] 2dip 16 + >fixnum rep set-alien-vector ]
-            3bi
-        ] >>setter
-        32 >>size
-        8 >>align
-        rep >>rep
-    class name>> typedef ;
-[
-    float-4 float-4-rep define-simd-128-type
-    double-2 double-2-rep define-simd-128-type
-    float-8 float-4-rep define-simd-256-type
-    double-4 double-2-rep define-simd-256-type
-] with-compilation-unit
diff --git a/basis/math/vectors/simd/functor/functor.factor b/basis/math/vectors/simd/functor/functor.factor
index a97dc192be..57126f1bf8 100644
--- a/basis/math/vectors/simd/functor/functor.factor
+++ b/basis/math/vectors/simd/functor/functor.factor
@@ -5,6 +5,7 @@ effects fry functors generalizations kernel literals locals
 math math.functions math.vectors math.vectors.simd.intrinsics
 math.vectors.specialization parser prettyprint.custom sequences
 sequences.private strings words definitions macros cpu.architecture ;
+QUALIFIED-WITH: math m
 IN: math.vectors.simd.functor
 
 ERROR: bad-length got expected ;
@@ -51,11 +52,10 @@ MACRO: simd-boa ( rep class -- simd-array )
     '[ nip _ swap supported-simd-op? ] assoc-filter
     '[ drop _ key? ] assoc-filter ;
 
-:: high-level-ops ( ctor -- assoc )
+:: high-level-ops ( ctor elt-class -- assoc )
     ! Some SIMD operations are defined in terms of others.
     {
         { vneg [ [ dup v- ] keep v- ] }
-        { v. [ v* sum ] }
         { n+v [ [ ctor execute ] dip v+ ] }
         { v+n [ ctor execute v+ ] }
         { n-v [ [ ctor execute ] dip v- ] }
@@ -67,16 +67,36 @@ MACRO: simd-boa ( rep class -- simd-array )
         { norm-sq [ dup v. assert-positive ] }
         { norm [ norm-sq sqrt ] }
         { normalize [ dup norm v/n ] }
-        { distance [ v- norm ] }
-    } ;
+    }
+    ! To compute dot product and distance with integer vectors, we
+    ! have to do things less efficiently, with integer overflow checks,
+    ! in the general case.
+    elt-class m:float = [
+        {
+            { distance [ v- norm ] }
+            { v. [ v* sum ] }
+        } append
+    ] when ;
 
 :: simd-vector-words ( class ctor rep assoc -- )
+    rep rep-component-type c-type-boxed-class :> elt-class
     class
-    rep rep-component-type c-type-boxed-class
+    elt-class
     assoc rep supported-simd-ops
-    ctor high-level-ops assoc-union
+    ctor elt-class high-level-ops assoc-union
     specialize-vector-words ;
 
+:: define-simd-128-type ( class rep -- )
+    <c-type>
+        byte-array >>class
+        class >>boxed-class
+        [ rep alien-vector class boa ] >>getter
+        [ [ underlying>> ] 2dip rep set-alien-vector ] >>setter
+        16 >>size
+        8 >>align
+        rep >>rep
+    class typedef ;
+
 FUNCTOR: define-simd-128 ( T -- )
 
 N            [ 16 T heap-size /i ]
@@ -159,11 +179,35 @@ INSTANCE: A sequence
     { sum [ [ (simd-sum) ] \ A-v->n-op execute ] }
 } simd-vector-words
 
+\ A \ A-rep define-simd-128-type
+
 PRIVATE>
 
 ;FUNCTOR
 
 ! Synthesize 256-bit vectors from a pair of 128-bit vectors
+SLOT: underlying1
+SLOT: underlying2
+
+:: define-simd-256-type ( class rep -- )
+    <c-type>
+        class >>class
+        class >>boxed-class
+        [
+            [ rep alien-vector ]
+            [ 16 + >fixnum rep alien-vector ] 2bi
+            class boa
+        ] >>getter
+        [
+            [ [ underlying1>> ] 2dip rep set-alien-vector ]
+            [ [ underlying2>> ] 2dip 16 + >fixnum rep set-alien-vector ]
+            3bi
+        ] >>setter
+        32 >>size
+        8 >>align
+        rep >>rep
+    class typedef ;
+
 FUNCTOR: define-simd-256 ( T -- )
 
 N            [ 32 T heap-size /i ]
@@ -235,7 +279,7 @@ M: A pprint* pprint-object ;
 
 : A-boa ( ... -- simd-array )
     [ A/2-boa ] N/2 ndip A/2-boa [ underlying>> ] bi@
-    \ A boa ;
+    \ A boa ; inline
 
 \ A-rep 2 boa-effect \ A-boa set-stack-effect
 
@@ -260,4 +304,6 @@ INSTANCE: A sequence
     { sum [ [ (simd-v+) ] [ (simd-sum) ] \ A-v->n-op execute ] }
 } simd-vector-words
 
+\ A \ A-rep define-simd-256-type
+
 ;FUNCTOR
diff --git a/basis/math/vectors/simd/simd-docs.factor b/basis/math/vectors/simd/simd-docs.factor
index d6131b3a71..42512feb6f 100644
--- a/basis/math/vectors/simd/simd-docs.factor
+++ b/basis/math/vectors/simd/simd-docs.factor
@@ -17,23 +17,45 @@ $nl
 "There should never be any reason to use " { $link "math.vectors.simd.intrinsics" } " directly, but they too have a straightforward, but lower-level, interface." ;
 
 ARTICLE: "math.vectors.simd.support" "Supported SIMD instruction sets and operations"
-"At present, the SIMD support makes use of SSE2 and a few SSE3 instructions on x86 CPUs."
+"At present, the SIMD support makes use of SSE, SSE2 and a few SSE3 instructions on x86 CPUs."
 $nl
-"SSE3 introduces horizontal adds (summing all components of a single vector register), which is useful for computing dot products. Where available, SSE3 operations are used to speed up " { $link sum } ", " { $link v. } ", " { $link norm-sq } ", " { $link norm } ", and " { $link distance } ". If SSE3 is not available, software fallbacks are used for " { $link sum } " and related words, decreasing performance."
+"SSE1 only supports single-precision SIMD (" { $snippet "float-4" } " and " { $snippet "float-8" } ")."
 $nl
-"On PowerPC, or older x86 chips without SSE2, software fallbacks are used for all high-level vector operations. SIMD code can run with no loss in functionality, just decreased performance."
+"SSE2 introduces double-precision and integer SIMD."
+$nl
+"SSE3 introduces horizontal adds (summing all components of a single vector register), which is useful for computing dot products. Where available, SSE3 operations are used to speed up " { $link sum } ", " { $link v. } ", " { $link norm-sq } ", " { $link norm } ", and " { $link distance } ". If SSE3 is not available, software fallbacks are used for " { $link sum } " and related words."
+$nl
+"On PowerPC, or older x86 chips without SSE, software fallbacks are used for all high-level vector operations. SIMD code can run with no loss in functionality, just decreased performance."
 $nl
 "The primities in the " { $vocab-link "math.vectors.simd.intrinsics" } " vocabulary do not have software fallbacks, but they should not be called directly in any case." ;
 
 ARTICLE: "math.vectors.simd.types" "SIMD vector types"
-"Each SIMD vector type is named " { $snippet "scalar-count" } ", where " { $snippet "scalar" } " is a scalar C type such as " { $snippet "float" } " or " { $snippet "double" } ", and " { $snippet "count" } " is a vector dimension, such as 2, 4, or 8."
+"Each SIMD vector type is named " { $snippet "scalar-count" } ", where " { $snippet "scalar" } " is a scalar C type and " { $snippet "count" } " is a vector dimension."
 $nl
-"The following vector types are defined:"
-{ $subsection float-4 }
-{ $subsection double-2 }
-{ $subsection float-8 }
-{ $subsection double-4 }
-"For each vector type, several words are defined:"
+"To use a SIMD vector type, a parsing word is used to generate the relevant code and bring it into the vocabulary search path; this is the same idea as with " { $link "specialized-arrays" } ":"
+{ $subsection POSTPONE: SIMD: }
+"The following vector types are supported:"
+{ $code
+    "char-16"
+    "uchar-16"
+    "char-32"
+    "uchar-32"
+    "short-8"
+    "ushort-8"
+    "short-16"
+    "ushort-16"
+    "int-4"
+    "uint-4"
+    "int-8"
+    "uint-8"
+    "float-4"
+    "float-8"
+    "double-2"
+    "double-4"
+} ;
+
+ARTICLE: "math.vectors.simd.words" "SIMD vector words"
+"For each SIMD vector type, several words are defined:"
 { $table
     { "Word" "Stack effect" "Description" }
     { { $snippet "type-with" } { $snippet "( x -- simd-array )" } "creates a new instance where all components are set to a single scalar" }
@@ -41,8 +63,6 @@ $nl
     { { $snippet ">type" } { $snippet "( seq -- simd-array )" } "creates a new instance initialized with the elements of an existing sequence, which must have the correct length" }
     { { $snippet "type{" } { $snippet "type{ elements... }" } "parsing word defining literal syntax for an SIMD vector; the correct number of elements must be given" }
 }
-"The " { $link float-4 } " and " { $link double-2 } " types correspond to 128-bit vector registers. The " { $link float-8 } " and " { $link double-4 } " types are not directly supported in hardware, and instead unbox to a pair of 128-bit vector registers."
-$nl
 "To actually perform vector arithmetic on SIMD vectors, use " { $link "math-vectors" } " words."
 { $see-also "c-types-specs" } ;
 
@@ -68,6 +88,8 @@ SYMBOLS: x y ;
 { $code
 <" USING: compiler.tree.debugger kernel.private
 math.vectors math.vectors.simd ;
+SIMD: float-4
+IN: simd-demo
 
 : interpolate ( v a b -- w )
     { float-4 float-4 float-4 } declare
@@ -80,6 +102,8 @@ $nl
 { $code
 <" USING: compiler.tree.debugger hints
 math.vectors math.vectors.simd ;
+SIMD: float-4
+IN: simd-demo
 
 : interpolate ( v a b -- w )
     [ v* ] [ [ 1.0 ] dip n-v v* ] bi-curry* bi v+ ;
@@ -94,6 +118,7 @@ $nl
 "In the " { $snippet "interpolate" } " word, there is still a call to the " { $link <tuple-boa> } " primitive, because the return value at the end is being boxed on the heap. In the next example, no memory allocation occurs at all because the SIMD vectors are stored inside a struct class (see " { $link "classes.struct" } "); also note the use of inlining:"
 { $code
 <" USING: compiler.tree.debugger math.vectors math.vectors.simd ;
+SIMD: float-4
 IN: simd-demo
 
 STRUCT: actor
@@ -151,21 +176,23 @@ $nl
 "For the most part, the above primitives correspond directly to vector arithmetic words. They take a representation parameter, which is one of the singleton members of the " { $link vector-rep } " union in the " { $vocab-link "cpu.architecture" } " vocabulary." ;
 
 ARTICLE: "math.vectors.simd.alien" "SIMD data in struct classes"
-"Struct classes may contain fields which store SIMD data; use one of the following C type names:"
-{ $code
-<" float-4
-double-2
-float-8
-double-4"> }
-"Passing SIMD data as function parameters is not yet supported." ;
+"Struct classes may contain fields which store SIMD data; for each SIMD vector type listed in " { $snippet "math.vectors.simd.types" } " there is a C type with the same name."
+$nl
+"Only SIMD struct fields are allowed at the moment; passing SIMD data as function parameters is not yet supported." ;
 
 ARTICLE: "math.vectors.simd" "Hardware vector arithmetic (SIMD)"
 "The " { $vocab-link "math.vectors.simd" } " vocabulary extends the " { $vocab-link "math.vectors" } " vocabulary to support efficient vector arithmetic on small, fixed-size vectors."
 { $subsection "math.vectors.simd.intro" }
 { $subsection "math.vectors.simd.types" }
+{ $subsection "math.vectors.simd.words" }
 { $subsection "math.vectors.simd.support" }
 { $subsection "math.vectors.simd.efficiency" }
 { $subsection "math.vectors.simd.alien" }
 { $subsection "math.vectors.simd.intrinsics" } ;
 
+HELP: SIMD:
+{ $syntax "SIMD: type-length" }
+{ $values { "type" "a scalar C type" } { "length" "a vector dimension" } }
+{ $description "Brings a SIMD array for holding " { $snippet "length" } " values of " { $snippet "type" } " into the vocabulary search path. The possible type/length combinations are listed in " { $link "math.vectors.simd.types" } " and the generated words are documented in " { $link "math.vectors.simd.words" } "." } ;
+
 ABOUT: "math.vectors.simd"
diff --git a/basis/math/vectors/simd/simd-tests.factor b/basis/math/vectors/simd/simd-tests.factor
index f5318c341f..39afe3cb03 100644
--- a/basis/math/vectors/simd/simd-tests.factor
+++ b/basis/math/vectors/simd/simd-tests.factor
@@ -1,8 +1,30 @@
+USING: accessors arrays classes compiler compiler.tree.debugger
+effects fry io kernel kernel.private math math.functions
+math.private math.vectors math.vectors.simd
+math.vectors.simd.private prettyprint random sequences system
+tools.test vocabs assocs compiler.cfg.debugger words
+locals math.vectors.specialization combinators cpu.architecture
+math.vectors.simd.intrinsics namespaces byte-arrays alien
+specialized-arrays classes.struct ;
+FROM: alien.c-types => c-type-boxed-class ;
+SPECIALIZED-ARRAY: float
+SIMD: char-16
+SIMD: uchar-16
+SIMD: char-32
+SIMD: uchar-32
+SIMD: short-8
+SIMD: ushort-8
+SIMD: short-16
+SIMD: ushort-16
+SIMD: int-4
+SIMD: uint-4
+SIMD: int-8
+SIMD: uint-8
+SIMD: float-4
+SIMD: float-8
+SIMD: double-2
+SIMD: double-4
 IN: math.vectors.simd.tests
-USING: math math.vectors.simd math.vectors.simd.private
-math.vectors math.functions math.private kernel.private compiler
-sequences tools.test compiler.tree.debugger accessors kernel
-system ;
 
 [ float-4{ 0 0 0 0 } ] [ float-4 new ] unit-test
 
@@ -12,344 +34,6 @@ system ;
 
 [ V{ float } ] [ [ { float-4 } declare norm ] final-classes ] unit-test
 
-[ float-4{ 12 12 12 12 } ] [
-    12 [ float-4-with ] compile-call
-] unit-test
-
-[ float-4{ 1 2 3 4 } ] [
-    1 2 3 4 [ float-4-boa ] compile-call
-] unit-test
-
-[ float-4{ 11 22 33 44 } ] [
-    float-4{ 1 2 3 4 } float-4{ 10 20 30 40 }
-    [ { float-4 float-4 } declare v+ ] compile-call
-] unit-test
-
-[ float-4{ -9 -18 -27 -36 } ] [
-    float-4{ 1 2 3 4 } float-4{ 10 20 30 40 }
-    [ { float-4 float-4 } declare v- ] compile-call
-] unit-test
-
-[ float-4{ 10 40 90 160 } ] [
-    float-4{ 1 2 3 4 } float-4{ 10 20 30 40 }
-    [ { float-4 float-4 } declare v* ] compile-call
-] unit-test
-
-[ float-4{ 10 100 1000 10000 } ] [
-    float-4{ 100 2000 30000 400000 } float-4{ 10 20 30 40 }
-    [ { float-4 float-4 } declare v/ ] compile-call
-] unit-test
-
-[ float-4{ -10 -20 -30 -40 } ] [
-    float-4{ -10 20 -30 40 } float-4{ 10 -20 30 -40 }
-    [ { float-4 float-4 } declare vmin ] compile-call
-] unit-test
-
-[ float-4{ 10 20 30 40 } ] [
-    float-4{ -10 20 -30 40 } float-4{ 10 -20 30 -40 }
-    [ { float-4 float-4 } declare vmax ] compile-call
-] unit-test
-
-[ 10.0 ] [
-    float-4{ 1 2 3 4 }
-    [ { float-4 } declare sum ] compile-call
-] unit-test
-
-[ 13.0 ] [
-    float-4{ 1 2 3 4 }
-    [ { float-4 } declare sum 3.0 + ] compile-call
-] unit-test
-
-[ 8.0 ] [
-    float-4{ 1 2 3 4 } float-4{ 2 0 2 0 }
-    [ { float-4 float-4 } declare v. ] compile-call
-] unit-test
-
-[ float-4{ 5 10 15 20 } ] [
-    5.0 float-4{ 1 2 3 4 }
-    [ { float float-4 } declare n*v ] compile-call
-] unit-test
-
-[ float-4{ 5 10 15 20 } ] [
-    float-4{ 1 2 3 4 } 5.0
-    [ { float float-4 } declare v*n ] compile-call
-] unit-test
-
-[ float-4{ 10 5 2 5 } ] [
-    10.0 float-4{ 1 2 5 2 }
-    [ { float float-4 } declare n/v ] compile-call
-] unit-test
-
-[ float-4{ 0.5 1 1.5 2 } ] [
-    float-4{ 1 2 3 4 } 2
-    [ { float float-4 } declare v/n ] compile-call
-] unit-test
-
-[ float-4{ 1 0 0 0 } ] [
-    float-4{ 10 0 0 0 }
-    [ { float-4 } declare normalize ] compile-call
-] unit-test
-
-[ 30.0 ] [
-    float-4{ 1 2 3 4 }
-    [ { float-4 } declare norm-sq ] compile-call
-] unit-test
-
-[ t ] [
-    float-4{ 1 0 0 0 }
-    float-4{ 0 1 0 0 }
-    [ { float-4 float-4 } declare distance ] compile-call
-    2 sqrt 1.0e-6 ~
-] unit-test
-
-[ double-2{ 12 12 } ] [
-    12 [ double-2-with ] compile-call
-] unit-test
-
-[ double-2{ 1 2 } ] [
-    1 2 [ double-2-boa ] compile-call
-] unit-test
-
-[ double-2{ 11 22 } ] [
-    double-2{ 1 2 } double-2{ 10 20 }
-    [ { double-2 double-2 } declare v+ ] compile-call
-] unit-test
-
-[ double-2{ -9 -18 } ] [
-    double-2{ 1 2 } double-2{ 10 20 }
-    [ { double-2 double-2 } declare v- ] compile-call
-] unit-test
-
-[ double-2{ 10 40 } ] [
-    double-2{ 1 2 } double-2{ 10 20 }
-    [ { double-2 double-2 } declare v* ] compile-call
-] unit-test
-
-[ double-2{ 10 100 } ] [
-    double-2{ 100 2000 } double-2{ 10 20 }
-    [ { double-2 double-2 } declare v/ ] compile-call
-] unit-test
-
-[ double-2{ -10 -20 } ] [
-    double-2{ -10 20 } double-2{ 10 -20 }
-    [ { double-2 double-2 } declare vmin ] compile-call
-] unit-test
-
-[ double-2{ 10 20 } ] [
-    double-2{ -10 20 } double-2{ 10 -20 }
-    [ { double-2 double-2 } declare vmax ] compile-call
-] unit-test
-
-[ 3.0 ] [
-    double-2{ 1 2 }
-    [ { double-2 } declare sum ] compile-call
-] unit-test
-
-[ 7.0 ] [
-    double-2{ 1 2 }
-    [ { double-2 } declare sum 4.0 + ] compile-call
-] unit-test
-
-[ 16.0 ] [
-    double-2{ 1 2 } double-2{ 2 7 }
-    [ { double-2 double-2 } declare v. ] compile-call
-] unit-test
-
-[ double-2{ 5 10 } ] [
-    5.0 double-2{ 1 2 }
-    [ { float double-2 } declare n*v ] compile-call
-] unit-test
-
-[ double-2{ 5 10 } ] [
-    double-2{ 1 2 } 5.0
-    [ { float double-2 } declare v*n ] compile-call
-] unit-test
-
-[ double-2{ 10 5 } ] [
-    10.0 double-2{ 1 2 }
-    [ { float double-2 } declare n/v ] compile-call
-] unit-test
-
-[ double-2{ 0.5 1 } ] [
-    double-2{ 1 2 } 2
-    [ { float double-2 } declare v/n ] compile-call
-] unit-test
-
-[ double-2{ 0 0 } ] [ double-2 new ] unit-test
-
-[ double-2{ 1 0 } ] [
-    double-2{ 10 0 }
-    [ { double-2 } declare normalize ] compile-call
-] unit-test
-
-[ 5.0 ] [
-    double-2{ 1 2 }
-    [ { double-2 } declare norm-sq ] compile-call
-] unit-test
-
-[ t ] [
-    double-2{ 1 0 }
-    double-2{ 0 1 }
-    [ { double-2 double-2 } declare distance ] compile-call
-    2 sqrt 1.0e-6 ~
-] unit-test
-
-[ double-4{ 0 0 0 0 } ] [ double-4 new ] unit-test
-
-[ double-4{ 1 2 3 4 } ] [
-    1 2 3 4 double-4-boa
-] unit-test
-
-[ double-4{ 1 1 1 1 } ] [
-    1 double-4-with
-] unit-test
-
-[ double-4{ 0 1 2 3 } ] [
-    1 double-4-with [ * ] map-index
-] unit-test
-
-[ V{ float } ] [ [ { double-4 } declare norm-sq ] final-classes ] unit-test
-
-[ V{ float } ] [ [ { double-4 } declare norm ] final-classes ] unit-test
-
-[ double-4{ 12 12 12 12 } ] [
-    12 [ double-4-with ] compile-call
-] unit-test
-
-[ double-4{ 1 2 3 4 } ] [
-    1 2 3 4 [ double-4-boa ] compile-call
-] unit-test
-
-[ double-4{ 11 22 33 44 } ] [
-    double-4{ 1 2 3 4 } double-4{ 10 20 30 40 }
-    [ { double-4 double-4 } declare v+ ] compile-call
-] unit-test
-
-[ double-4{ -9 -18 -27 -36 } ] [
-    double-4{ 1 2 3 4 } double-4{ 10 20 30 40 }
-    [ { double-4 double-4 } declare v- ] compile-call
-] unit-test
-
-[ double-4{ 10 40 90 160 } ] [
-    double-4{ 1 2 3 4 } double-4{ 10 20 30 40 }
-    [ { double-4 double-4 } declare v* ] compile-call
-] unit-test
-
-[ double-4{ 10 100 1000 10000 } ] [
-    double-4{ 100 2000 30000 400000 } double-4{ 10 20 30 40 }
-    [ { double-4 double-4 } declare v/ ] compile-call
-] unit-test
-
-[ double-4{ -10 -20 -30 -40 } ] [
-    double-4{ -10 20 -30 40 } double-4{ 10 -20 30 -40 }
-    [ { double-4 double-4 } declare vmin ] compile-call
-] unit-test
-
-[ double-4{ 10 20 30 40 } ] [
-    double-4{ -10 20 -30 40 } double-4{ 10 -20 30 -40 }
-    [ { double-4 double-4 } declare vmax ] compile-call
-] unit-test
-
-[ 10.0 ] [
-    double-4{ 1 2 3 4 }
-    [ { double-4 } declare sum ] compile-call
-] unit-test
-
-[ 13.0 ] [
-    double-4{ 1 2 3 4 }
-    [ { double-4 } declare sum 3.0 + ] compile-call
-] unit-test
-
-[ 8.0 ] [
-    double-4{ 1 2 3 4 } double-4{ 2 0 2 0 }
-    [ { double-4 double-4 } declare v. ] compile-call
-] unit-test
-
-[ double-4{ 5 10 15 20 } ] [
-    5.0 double-4{ 1 2 3 4 }
-    [ { float double-4 } declare n*v ] compile-call
-] unit-test
-
-[ double-4{ 5 10 15 20 } ] [
-    double-4{ 1 2 3 4 } 5.0
-    [ { float double-4 } declare v*n ] compile-call
-] unit-test
-
-[ double-4{ 10 5 2 5 } ] [
-    10.0 double-4{ 1 2 5 2 }
-    [ { float double-4 } declare n/v ] compile-call
-] unit-test
-
-[ double-4{ 0.5 1 1.5 2 } ] [
-    double-4{ 1 2 3 4 } 2
-    [ { float double-4 } declare v/n ] compile-call
-] unit-test
-
-[ double-4{ 1 0 0 0 } ] [
-    double-4{ 10 0 0 0 }
-    [ { double-4 } declare normalize ] compile-call
-] unit-test
-
-[ 30.0 ] [
-    double-4{ 1 2 3 4 }
-    [ { double-4 } declare norm-sq ] compile-call
-] unit-test
-
-[ t ] [
-    double-4{ 1 0 0 0 }
-    double-4{ 0 1 0 0 }
-    [ { double-4 double-4 } declare distance ] compile-call
-    2 sqrt 1.0e-6 ~
-] unit-test
-
-[ float-8{ 0 0 0 0 0 0 0 0 } ] [ float-8 new ] unit-test
-
-[ float-8{ 0 0 0 0 0 0 0 0 } ] [ [ float-8 new ] compile-call ] unit-test
-
-[ float-8{ 1 1 1 1 1 1 1 1 } ] [ 1 float-8-with ] unit-test
-
-[ float-8{ 1 1 1 1 1 1 1 1 } ] [ [ 1 float-8-with ] compile-call ] unit-test
-
-[ float-8{ 1 2 3 4 5 6 7 8 } ] [ 1 2 3 4 5 6 7 8 float-8-boa ] unit-test
-
-[ float-8{ 1 2 3 4 5 6 7 8 } ] [ [ 1 2 3 4 5 6 7 8 float-8-boa ] compile-call ] unit-test
-
-[ float-8{ 3 6 9 12 15 18 21 24 } ] [
-    float-8{ 1 2 3 4 5 6 7 8 }
-    float-8{ 2 4 6 8 10 12 14 16 }
-    [ { float-8 float-8 } declare v+ ] compile-call
-] unit-test
-
-[ float-8{ -1 -2 -3 -4 -5 -6 -7 -8 } ] [
-    float-8{ 1 2 3 4 5 6 7 8 }
-    float-8{ 2 4 6 8 10 12 14 16 }
-    [ { float-8 float-8 } declare v- ] compile-call
-] unit-test
-
-[ float-8{ -1 -2 -3 -4 -5 -6 -7 -8 } ] [
-    -0.5
-    float-8{ 2 4 6 8 10 12 14 16 }
-    [ { float float-8 } declare n*v ] compile-call
-] unit-test
-
-[ float-8{ -1 -2 -3 -4 -5 -6 -7 -8 } ] [
-    float-8{ 2 4 6 8 10 12 14 16 }
-    -0.5
-    [ { float-8 float } declare v*n ] compile-call
-] unit-test
-
-[ float-8{ 256 128 64 32 16 8 4 2 } ] [
-    256.0
-    float-8{ 1 2 4 8 16 32 64 128 }
-    [ { float float-8 } declare n/v ] compile-call
-] unit-test
-
-[ float-8{ -1 -2 -3 -4 -5 -6 -7 -8 } ] [
-    float-8{ 2 4 6 8 10 12 14 16 }
-    -2.0
-    [ { float-8 float } declare v/n ] compile-call
-] unit-test
-
 ! Test puns; only on x86
 cpu x86? [
     [ double-2{ 4 1024 } ] [
@@ -362,3 +46,179 @@ cpu x86? [
         [ { double-2 double-2 } declare v+ underlying>> 3.0 float* ] compile-call
     ] unit-test
 ] when
+
+! Fuzz testing
+CONSTANT: simd-classes
+    {
+        char-16
+        uchar-16
+        char-32
+        uchar-32
+        short-8
+        ushort-8
+        short-16
+        ushort-16
+        int-4
+        uint-4
+        int-8
+        uint-8
+        float-4
+        float-8
+        double-2
+        double-4
+    }
+
+: with-ctors ( -- seq )
+    simd-classes [ [ name>> "-with" append ] [ vocabulary>> ] bi lookup ] map ;
+
+: boa-ctors ( -- seq )
+    simd-classes [ [ name>> "-boa" append ] [ vocabulary>> ] bi lookup ] map ;
+
+: check-optimizer ( seq inputs quot -- )
+    [
+        [ "print-mr" get [ nip test-mr mr. ] [ 2drop ] if ]
+        [ [ call ] dip call ]
+        [ [ call ] dip compile-call ] 2tri = not
+    ] compose filter ; inline
+
+"== Checking -new constructors" print
+
+[ { } ] [
+    simd-classes [ [ [ ] ] dip '[ _ new ] ] check-optimizer
+] unit-test
+
+[ { } ] [
+    simd-classes [ '[ _ new ] compile-call [ zero? ] all? not ] filter
+] unit-test
+
+"== Checking -with constructors" print
+
+[ { } ] [
+    with-ctors [
+        [ 1000 random '[ _ ] ] dip '[ { fixnum } declare _ execute ]
+    ] check-optimizer
+] unit-test
+
+"== Checking -boa constructors" print
+
+[ { } ] [
+    boa-ctors [
+        dup stack-effect in>> length
+        [ nip [ 1000 random ] [ ] replicate-as ]
+        [ fixnum <array> swap '[ _ declare _ execute ] ]
+        2bi
+    ] check-optimizer
+] unit-test
+
+"== Checking vector operations" print
+
+: random-vector ( class -- vec )
+    new [ drop 1000 random ] map ;
+
+:: check-vector-op ( word inputs class elt-class -- inputs quot )
+    inputs [
+        [
+            {
+                { +vector+ [ class random-vector ] }
+                { +scalar+ [ 1000 random elt-class float = [ >float ] when ] }
+            } case
+        ] [ ] map-as
+    ] [
+        [
+            {
+                { +vector+ [ class ] }
+                { +scalar+ [ elt-class ] }
+            } case
+        ] map
+    ] bi
+    word '[ _ declare _ execute ] ;
+
+: ops-to-check ( elt-class -- alist )
+    [ vector-words >alist ] dip float = [
+        [ drop { n/v v/n v/ normalize } member? not ] assoc-filter
+    ] unless ;
+
+: check-vector-ops ( class elt-class -- )
+    [ nip ops-to-check ] 2keep
+    '[ first2 inputs _ _ check-vector-op ] check-optimizer ; inline
+
+: simd-classes&reps ( -- alist )
+    simd-classes [
+        dup name>> [ "float" head? ] [ "double" head? ] bi or
+        float fixnum ?
+    ] { } map>assoc ;
+
+simd-classes&reps [
+    [ [ { } ] ] 2dip '[ _ _ check-vector-ops ] unit-test
+] assoc-each
+
+! Other regressions
+[ 8000000 ] [
+    int-8{ 1000 1000 1000 1000 1000 1000 1000 1000 }
+    [ { int-8 } declare dup [ * ] [ + ] 2map-reduce ] compile-call
+] unit-test
+
+
+! Vector alien intrinsics
+[ float-4{ 1 2 3 4 } ] [
+    [
+        float-4{ 1 2 3 4 }
+        underlying>> 0 float-4-rep alien-vector
+    ] compile-call float-4 boa
+] unit-test
+
+[ B{ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 } ] [
+    16 [ 1 ] B{ } replicate-as 16 <byte-array>
+    [
+        0 [
+            { byte-array c-ptr fixnum } declare
+            float-4-rep set-alien-vector
+        ] compile-call
+    ] keep
+] unit-test
+
+[ float-array{ 1 2 3 4 } ] [
+    [
+        float-array{ 1 2 3 4 } underlying>>
+        float-array{ 4 3 2 1 } clone
+        [ underlying>> 0 float-4-rep set-alien-vector ] keep
+    ] compile-call
+] unit-test
+
+STRUCT: simd-struct
+{ x float-4 }
+{ y double-2 }
+{ z double-4 }
+{ w float-8 } ;
+
+[ t ] [ [ simd-struct <struct> ] compile-call >c-ptr [ 0 = ] all? ] unit-test
+
+[
+    float-4{ 1 2 3 4 }
+    double-2{ 2 1 }
+    double-4{ 4 3 2 1 }
+    float-8{ 1 2 3 4 5 6 7 8 }
+] [
+    simd-struct <struct>
+    float-4{ 1 2 3 4 } >>x
+    double-2{ 2 1 } >>y
+    double-4{ 4 3 2 1 } >>z
+    float-8{ 1 2 3 4 5 6 7 8 } >>w
+    { [ x>> ] [ y>> ] [ z>> ] [ w>> ] } cleave
+] unit-test
+
+[
+    float-4{ 1 2 3 4 }
+    double-2{ 2 1 }
+    double-4{ 4 3 2 1 }
+    float-8{ 1 2 3 4 5 6 7 8 }
+] [
+    [
+        simd-struct <struct>
+        float-4{ 1 2 3 4 } >>x
+        double-2{ 2 1 } >>y
+        double-4{ 4 3 2 1 } >>z
+        float-8{ 1 2 3 4 5 6 7 8 } >>w
+        { [ x>> ] [ y>> ] [ z>> ] [ w>> ] } cleave
+    ] compile-call
+] unit-test
diff --git a/basis/math/vectors/simd/simd.factor b/basis/math/vectors/simd/simd.factor
index c5e7d6f75d..fe043032b8 100644
--- a/basis/math/vectors/simd/simd.factor
+++ b/basis/math/vectors/simd/simd.factor
@@ -1,15 +1,32 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: alien.c-types cpu.architecture kernel
-math.vectors.simd.functor vocabs.loader ;
-FROM: sequences => each ;
+USING: alien.c-types combinators fry kernel lexer math math.parser
+math.vectors.simd.functor sequences splitting vocabs.generated
+vocabs.loader vocabs.parser words ;
 IN: math.vectors.simd
 
-<<
+ERROR: bad-vector-size bits ;
 
-{ double float char uchar short ushort int uint }
-[ [ define-simd-128 ] [ define-simd-256 ] bi ] each
+<PRIVATE
 
->>
+: simd-vocab ( type -- vocab )
+    "math.vectors.simd.instances." prepend ;
 
-"math.vectors.simd.alien" require
+: parse-simd-name ( string -- c-type quot )
+    "-" split1
+    [ "alien.c-types" lookup dup heap-size ] [ string>number ] bi*
+    * 8 * {
+        { 128 [ [ define-simd-128 ] ] }
+        { 256 [ [ define-simd-256 ] ] }
+        [ bad-vector-size ]
+    } case ;
+
+PRIVATE>
+
+: define-simd-vocab ( type -- vocab )
+    [ simd-vocab ]
+    [ '[ _ parse-simd-name call( type -- ) ] ] bi
+    generate-vocab ;
+
+SYNTAX: SIMD:
+    scan define-simd-vocab use-vocab ;
diff --git a/basis/specialized-arrays/specialized-arrays.factor b/basis/specialized-arrays/specialized-arrays.factor
index 6931c83677..a64d052fd1 100755
--- a/basis/specialized-arrays/specialized-arrays.factor
+++ b/basis/specialized-arrays/specialized-arrays.factor
@@ -4,7 +4,7 @@ USING: accessors alien alien.c-types alien.data alien.parser assocs
 byte-arrays classes compiler.units functors kernel lexer libc math
 math.vectors.specialization namespaces parser prettyprint.custom
 sequences sequences.private strings summary vocabs vocabs.loader
-vocabs.parser words fry combinators ;
+vocabs.parser vocabs.generated words fry combinators ;
 IN: specialized-arrays
 
 MIXIN: specialized-array
@@ -123,15 +123,6 @@ M: word (underlying-type) "c-type" word-prop ;
 
 PRIVATE>
 
-: generate-vocab ( vocab-name quot -- vocab )
-    [ dup vocab [ ] ] dip '[
-        [
-            [
-                 _ with-current-vocab
-            ] with-compilation-unit
-        ] keep
-    ] ?if ; inline
-
 : define-array-vocab ( type -- vocab )
     underlying-type-name
     [ specialized-array-vocab ] [ '[ _ define-array ] ] bi
diff --git a/basis/specialized-vectors/specialized-vectors.factor b/basis/specialized-vectors/specialized-vectors.factor
index 58fb97764b..7cda026cb3 100644
--- a/basis/specialized-vectors/specialized-vectors.factor
+++ b/basis/specialized-vectors/specialized-vectors.factor
@@ -3,7 +3,7 @@
 USING: accessors alien.c-types assocs compiler.units functors
 growable kernel lexer namespaces parser prettyprint.custom
 sequences specialized-arrays specialized-arrays.private strings
-vocabs vocabs.parser fry ;
+vocabs vocabs.parser vocabs.generated fry ;
 QUALIFIED: vectors.functor
 IN: specialized-vectors
 
diff --git a/basis/math/vectors/simd/alien/authors.txt b/basis/vocabs/generated/authors.txt
similarity index 100%
rename from basis/math/vectors/simd/alien/authors.txt
rename to basis/vocabs/generated/authors.txt
diff --git a/basis/vocabs/generated/generated.factor b/basis/vocabs/generated/generated.factor
new file mode 100644
index 0000000000..1ddcc73db2
--- /dev/null
+++ b/basis/vocabs/generated/generated.factor
@@ -0,0 +1,13 @@
+! Copyright (C) 2009 Slava Pestov.
+! See http://factorcode.org/license.txt for BSD license.
+USING: compiler.units fry kernel vocabs vocabs.parser ;
+IN: vocabs.generated
+
+: generate-vocab ( vocab-name quot -- vocab )
+    [ dup vocab [ ] ] dip '[
+        [
+            [
+                 _ with-current-vocab
+            ] with-compilation-unit
+        ] keep
+    ] ?if ; inline
diff --git a/extra/benchmark/nbody-simd/nbody-simd.factor b/extra/benchmark/nbody-simd/nbody-simd.factor
index e8bef58923..3aedffed91 100644
--- a/extra/benchmark/nbody-simd/nbody-simd.factor
+++ b/extra/benchmark/nbody-simd/nbody-simd.factor
@@ -4,6 +4,7 @@ USING: accessors fry kernel locals math math.constants
 math.functions math.vectors math.vectors.simd prettyprint
 combinators.smart sequences hints classes.struct
 specialized-arrays ;
+SIMD: double-4
 IN: benchmark.nbody-simd
 
 : solar-mass ( -- x ) 4 pi sq * ; inline
diff --git a/extra/benchmark/raytracer-simd/raytracer-simd.factor b/extra/benchmark/raytracer-simd/raytracer-simd.factor
index 3712972862..2d16c8cd1f 100644
--- a/extra/benchmark/raytracer-simd/raytracer-simd.factor
+++ b/extra/benchmark/raytracer-simd/raytracer-simd.factor
@@ -5,6 +5,7 @@ USING: arrays accessors io io.files io.files.temp
 io.encodings.binary kernel math math.constants math.functions
 math.vectors math.vectors.simd math.parser make sequences
 sequences.private words hints classes.struct ;
+SIMD: double-4
 IN: benchmark.raytracer-simd
 
 ! parameters
diff --git a/extra/benchmark/simd-1/simd-1.factor b/extra/benchmark/simd-1/simd-1.factor
index 4f57cca0bb..1e753a331d 100644
--- a/extra/benchmark/simd-1/simd-1.factor
+++ b/extra/benchmark/simd-1/simd-1.factor
@@ -2,6 +2,7 @@
 ! See http://factorcode.org/license.txt for BSD license.
 USING: kernel io math math.functions math.parser math.vectors
 math.vectors.simd sequences specialized-arrays ;
+SIMD: float-4
 SPECIALIZED-ARRAY: float-4
 IN: benchmark.simd-1
 

From acea55c692ce994311287d8c3fe6466d70d23bb3 Mon Sep 17 00:00:00 2001
From: Slava Pestov <slava@slava-pestovs-macbook-pro.local>
Date: Sun, 20 Sep 2009 17:43:16 -0500
Subject: [PATCH 03/15] math.vectors: add v+- word which is accelerated by SSE3

---
 .../cfg/instructions/instructions.factor      |  5 +++
 .../compiler/cfg/intrinsics/intrinsics.factor |  1 +
 .../cfg/two-operand/two-operand.factor        |  1 +
 basis/compiler/codegen/codegen.factor         |  1 +
 .../tree/propagation/simd/simd.factor         | 37 ++++++++-----------
 basis/cpu/architecture/architecture.factor    |  2 +
 basis/cpu/x86/32/32.factor                    |  2 +-
 basis/cpu/x86/64/64.factor                    |  2 +-
 basis/cpu/x86/x86.factor                      | 22 ++++++++---
 .../math/vectors/simd/functor/functor.factor  |  3 ++
 .../vectors/simd/intrinsics/intrinsics.factor |  2 +
 basis/math/vectors/simd/simd-docs.factor      |  1 +
 .../specialization/specialization.factor      |  1 +
 basis/math/vectors/vectors-docs.factor        | 12 ++++++
 basis/math/vectors/vectors-tests.factor       |  4 +-
 basis/math/vectors/vectors.factor             |  5 +++
 16 files changed, 71 insertions(+), 30 deletions(-)

diff --git a/basis/compiler/cfg/instructions/instructions.factor b/basis/compiler/cfg/instructions/instructions.factor
index 32e5d46c61..63297b9bdf 100644
--- a/basis/compiler/cfg/instructions/instructions.factor
+++ b/basis/compiler/cfg/instructions/instructions.factor
@@ -310,6 +310,11 @@ def: dst
 use: src1 src2
 literal: rep ;
 
+PURE-INSN: ##add-sub-vector
+def: dst
+use: src1 src2
+literal: rep ;
+
 PURE-INSN: ##mul-vector
 def: dst
 use: src1 src2
diff --git a/basis/compiler/cfg/intrinsics/intrinsics.factor b/basis/compiler/cfg/intrinsics/intrinsics.factor
index 0daab82395..5b3fd1b324 100644
--- a/basis/compiler/cfg/intrinsics/intrinsics.factor
+++ b/basis/compiler/cfg/intrinsics/intrinsics.factor
@@ -155,6 +155,7 @@ IN: compiler.cfg.intrinsics
     {
         { math.vectors.simd.intrinsics:assert-positive [ drop ] }
         { math.vectors.simd.intrinsics:(simd-v+) [ [ ^^add-vector ] emit-binary-vector-op ] }
+        { math.vectors.simd.intrinsics:(simd-v+-) [ [ ^^add-sub-vector ] emit-binary-vector-op ] }
         { math.vectors.simd.intrinsics:(simd-v-) [ [ ^^sub-vector ] emit-binary-vector-op ] }
         { math.vectors.simd.intrinsics:(simd-v*) [ [ ^^mul-vector ] emit-binary-vector-op ] }
         { math.vectors.simd.intrinsics:(simd-v/) [ [ ^^div-vector ] emit-binary-vector-op ] }
diff --git a/basis/compiler/cfg/two-operand/two-operand.factor b/basis/compiler/cfg/two-operand/two-operand.factor
index 20fa1d0b18..c275756046 100644
--- a/basis/compiler/cfg/two-operand/two-operand.factor
+++ b/basis/compiler/cfg/two-operand/two-operand.factor
@@ -48,6 +48,7 @@ UNION: two-operand-insn
     ##max-float
     ##add-vector
     ##sub-vector
+    ##add-sub-vector
     ##mul-vector
     ##div-vector
     ##min-vector
diff --git a/basis/compiler/codegen/codegen.factor b/basis/compiler/codegen/codegen.factor
index ddf5aa0e02..14246a3fbf 100755
--- a/basis/compiler/codegen/codegen.factor
+++ b/basis/compiler/codegen/codegen.factor
@@ -170,6 +170,7 @@ CODEGEN: ##gather-vector-4 %gather-vector-4
 CODEGEN: ##box-vector %box-vector
 CODEGEN: ##add-vector %add-vector
 CODEGEN: ##sub-vector %sub-vector
+CODEGEN: ##add-sub-vector %add-sub-vector
 CODEGEN: ##mul-vector %mul-vector
 CODEGEN: ##div-vector %div-vector
 CODEGEN: ##min-vector %min-vector
diff --git a/basis/compiler/tree/propagation/simd/simd.factor b/basis/compiler/tree/propagation/simd/simd.factor
index 42c1f35617..db39985c94 100644
--- a/basis/compiler/tree/propagation/simd/simd.factor
+++ b/basis/compiler/tree/propagation/simd/simd.factor
@@ -1,23 +1,24 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors byte-arrays combinators fry
+USING: accessors byte-arrays combinators fry sequences
 compiler.tree.propagation.info cpu.architecture kernel words math
 math.intervals math.vectors.simd.intrinsics ;
 IN: compiler.tree.propagation.simd
 
-\ (simd-v+) { byte-array } "default-output-classes" set-word-prop
-
-\ (simd-v-) { byte-array } "default-output-classes" set-word-prop
-
-\ (simd-v*) { byte-array } "default-output-classes" set-word-prop
-
-\ (simd-v/) { byte-array } "default-output-classes" set-word-prop
-
-\ (simd-vmin) { byte-array } "default-output-classes" set-word-prop
-
-\ (simd-vmax) { byte-array } "default-output-classes" set-word-prop
-
-\ (simd-vsqrt) { byte-array } "default-output-classes" set-word-prop
+{
+    (simd-v+)
+    (simd-v-)
+    (simd-v+-)
+    (simd-v*)
+    (simd-v/)
+    (simd-vmin)
+    (simd-vmax)
+    (simd-vsqrt)
+    (simd-broadcast)
+    (simd-gather-2)
+    (simd-gather-4)
+    alien-vector
+} [ { byte-array } "default-output-classes" set-word-prop ] each
 
 \ (simd-sum) [
     nip dup literal?>> [
@@ -30,18 +31,10 @@ IN: compiler.tree.propagation.simd
     <class-info>
 ] "outputs" set-word-prop
 
-\ (simd-broadcast) { byte-array } "default-output-classes" set-word-prop
-
-\ (simd-gather-2) { byte-array } "default-output-classes" set-word-prop
-
-\ (simd-gather-4) { byte-array } "default-output-classes" set-word-prop
-
 \ assert-positive [
     real [0,inf] <class/interval-info> value-info-intersect
 ] "outputs" set-word-prop
 
-\ alien-vector { byte-array } "default-output-classes" set-word-prop
-
 ! If SIMD is not available, inline alien-vector and set-alien-vector
 ! to get a speedup
 : inline-unless-intrinsic ( word -- )
diff --git a/basis/cpu/architecture/architecture.factor b/basis/cpu/architecture/architecture.factor
index 61e4e2df37..331d459adf 100644
--- a/basis/cpu/architecture/architecture.factor
+++ b/basis/cpu/architecture/architecture.factor
@@ -182,6 +182,7 @@ HOOK: %gather-vector-2 cpu ( dst src1 src2 rep -- )
 HOOK: %gather-vector-4 cpu ( dst src1 src2 src3 src4 rep -- )
 HOOK: %add-vector cpu ( dst src1 src2 rep -- )
 HOOK: %sub-vector cpu ( dst src1 src2 rep -- )
+HOOK: %add-sub-vector cpu ( dst src1 src2 rep -- )
 HOOK: %mul-vector cpu ( dst src1 src2 rep -- )
 HOOK: %div-vector cpu ( dst src1 src2 rep -- )
 HOOK: %min-vector cpu ( dst src1 src2 rep -- )
@@ -194,6 +195,7 @@ HOOK: %gather-vector-2-reps cpu ( -- reps )
 HOOK: %gather-vector-4-reps cpu ( -- reps )
 HOOK: %add-vector-reps cpu ( -- reps )
 HOOK: %sub-vector-reps cpu ( -- reps )
+HOOK: %add-sub-vector-reps cpu ( -- reps )
 HOOK: %mul-vector-reps cpu ( -- reps )
 HOOK: %div-vector-reps cpu ( -- reps )
 HOOK: %min-vector-reps cpu ( -- reps )
diff --git a/basis/cpu/x86/32/32.factor b/basis/cpu/x86/32/32.factor
index 9939154512..172b500cd5 100755
--- a/basis/cpu/x86/32/32.factor
+++ b/basis/cpu/x86/32/32.factor
@@ -295,4 +295,4 @@ os windows? [
     4 "double" c-type (>>align)
 ] unless
 
-"cpu.x86.features" require
+check-sse
diff --git a/basis/cpu/x86/64/64.factor b/basis/cpu/x86/64/64.factor
index f4018b1508..3958ba5ec8 100644
--- a/basis/cpu/x86/64/64.factor
+++ b/basis/cpu/x86/64/64.factor
@@ -228,4 +228,4 @@ USE: vocabs.loader
     { [ os winnt? ] [ "cpu.x86.64.winnt" require ] }
 } cond
 
-"cpu.x86.features" require
+check-sse
diff --git a/basis/cpu/x86/x86.factor b/basis/cpu/x86/x86.factor
index 322b123d99..a132947cf1 100644
--- a/basis/cpu/x86/x86.factor
+++ b/basis/cpu/x86/x86.factor
@@ -341,6 +341,17 @@ M: x86 %sub-vector-reps
         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
     } available-reps ;
 
+M: x86 %add-sub-vector ( dst src1 src2 rep -- )
+    {
+        { float-4-rep [ ADDSUBPS ] }
+        { double-2-rep [ ADDSUBPD ] }
+    } case drop ;
+
+M: x86 %add-sub-vector-reps
+    {
+        { sse3? { float-4-rep double-2-rep } }
+    } available-reps ;
+
 M: x86 %mul-vector ( dst src1 src2 rep -- )
     {
         { float-4-rep [ MULPS ] }
@@ -879,9 +890,10 @@ enable-min/max
         { 42 [ enable-sse3 ] }
     } case ;
 
-[ { sse_version } compile ] with-optimizer
+: check-sse ( -- )
+    [ { sse_version } compile ] with-optimizer
 
-"Checking for multimedia extensions: " write sse-version 30 min
-[ sse-string write " detected" print ]
-[ install-sse-check ]
-[ enable-sse ] tri
+    "Checking for multimedia extensions: " write sse-version 30 min
+    [ sse-string write " detected" print ]
+    [ install-sse-check ]
+    [ enable-sse ] tri ;
diff --git a/basis/math/vectors/simd/functor/functor.factor b/basis/math/vectors/simd/functor/functor.factor
index 57126f1bf8..2141914d1c 100644
--- a/basis/math/vectors/simd/functor/functor.factor
+++ b/basis/math/vectors/simd/functor/functor.factor
@@ -42,6 +42,7 @@ MACRO: simd-boa ( rep class -- simd-array )
         {
             { v+ (simd-v+) }
             { v- (simd-v-) }
+            { v+- (simd-v+-) }
             { v* (simd-v*) }
             { v/ (simd-v/) }
             { vmin (simd-vmin) }
@@ -171,6 +172,7 @@ INSTANCE: A sequence
 
 \ A \ A-with \ A-rep H{
     { v+ [ [ (simd-v+) ] \ A-vv->v-op execute ] }
+    { v+- [ [ (simd-v+-) ] \ A-vv->v-op execute ] }
     { v- [ [ (simd-v-) ] \ A-vv->v-op execute ] }
     { v* [ [ (simd-v*) ] \ A-vv->v-op execute ] }
     { v/ [ [ (simd-v/) ] \ A-vv->v-op execute ] }
@@ -297,6 +299,7 @@ INSTANCE: A sequence
 \ A \ A-with \ A-rep H{
     { v+ [ [ (simd-v+) ] \ A-vv->v-op execute ] }
     { v- [ [ (simd-v-) ] \ A-vv->v-op execute ] }
+    { v+- [ [ (simd-v+-) ] \ A-vv->v-op execute ] }
     { v* [ [ (simd-v*) ] \ A-vv->v-op execute ] }
     { v/ [ [ (simd-v/) ] \ A-vv->v-op execute ] }
     { vmin [ [ (simd-vmin) ] \ A-vv->v-op execute ] }
diff --git a/basis/math/vectors/simd/intrinsics/intrinsics.factor b/basis/math/vectors/simd/intrinsics/intrinsics.factor
index a7d019af81..6d39b9e70a 100644
--- a/basis/math/vectors/simd/intrinsics/intrinsics.factor
+++ b/basis/math/vectors/simd/intrinsics/intrinsics.factor
@@ -8,6 +8,7 @@ IN: math.vectors.simd.intrinsics
 ERROR: bad-simd-call ;
 
 : (simd-v+) ( v1 v2 rep -- v3 ) bad-simd-call ;
+: (simd-v+-) ( v1 v2 rep -- v3 ) bad-simd-call ;
 : (simd-v-) ( v1 v2 rep -- v3 ) bad-simd-call ;
 : (simd-v*) ( v1 v2 rep -- v3 ) bad-simd-call ;
 : (simd-v/) ( v1 v2 rep -- v3 ) bad-simd-call ;
@@ -67,6 +68,7 @@ GENERIC# supported-simd-op? 1 ( rep intrinsic -- ? )
 M: vector-rep supported-simd-op?
     {
         { \ (simd-v+)        [ %add-vector-reps            ] }
+        { \ (simd-v+-)       [ %add-sub-vector-reps        ] }
         { \ (simd-v-)        [ %sub-vector-reps            ] }
         { \ (simd-v*)        [ %mul-vector-reps            ] }
         { \ (simd-v/)        [ %div-vector-reps            ] }
diff --git a/basis/math/vectors/simd/simd-docs.factor b/basis/math/vectors/simd/simd-docs.factor
index 42512feb6f..d108d70b26 100644
--- a/basis/math/vectors/simd/simd-docs.factor
+++ b/basis/math/vectors/simd/simd-docs.factor
@@ -162,6 +162,7 @@ $nl
 "It is best to avoid calling these primitives directly. To write efficient high-level code that compiles down to primitives and avoids memory allocation, see " { $link "math.vectors.simd.efficiency" } "."
 { $subsection (simd-v+) }
 { $subsection (simd-v-) }
+{ $subsection (simd-v+-) }
 { $subsection (simd-v/) }
 { $subsection (simd-vmin) }
 { $subsection (simd-vmax) }
diff --git a/basis/math/vectors/specialization/specialization.factor b/basis/math/vectors/specialization/specialization.factor
index 21ec9f64f3..1a85f5ade7 100644
--- a/basis/math/vectors/specialization/specialization.factor
+++ b/basis/math/vectors/specialization/specialization.factor
@@ -55,6 +55,7 @@ H{
     { v* { +vector+ +vector+ -> +vector+ } }
     { v*n { +vector+ +scalar+ -> +vector+ } }
     { v+ { +vector+ +vector+ -> +vector+ } }
+    { v+- { +vector+ +vector+ -> +vector+ } }
     { v+n { +vector+ +scalar+ -> +vector+ } }
     { v- { +vector+ +vector+ -> +vector+ } }
     { v-n { +vector+ +scalar+ -> +vector+ } }
diff --git a/basis/math/vectors/vectors-docs.factor b/basis/math/vectors/vectors-docs.factor
index 7456597278..4f2f093216 100644
--- a/basis/math/vectors/vectors-docs.factor
+++ b/basis/math/vectors/vectors-docs.factor
@@ -17,6 +17,7 @@ $nl
 "Combining two vectors to form another vector with " { $link 2map } ":"
 { $subsection v+ }
 { $subsection v- }
+{ $subsection v+- }
 { $subsection v* }
 { $subsection v/ }
 { $subsection vmax }
@@ -57,6 +58,17 @@ HELP: v-
 { $values { "u" "a sequence of numbers" } { "v" "a sequence of numbers" } { "w" "a sequence of numbers" } }
 { $description "Subtracts " { $snippet "v" } " from " { $snippet "u" } " component-wise." } ;
 
+HELP: v+-
+{ $values { "u" "a sequence of numbers" } { "v" "a sequence of numbers" } { "w" "a sequence of numbers" } }
+{ $description "Adds and subtracts alternate elements of " { $snippet "v" } " and " { $snippet "u" } " component-wise." }
+{ $examples
+    { $example
+        "USING: math.vectors prettyprint ;"
+        "{ 1 2 3 } { 2 3 2 } v+- ."
+        "{ -1 5 1 }"
+    }
+} ;
+
 HELP: [v-]
 { $values { "u" "a sequence of real numbers" } { "v" "a sequence of real numbers" } { "w" "a sequence of real numbers" } }
 { $description "Subtracts " { $snippet "v" } " from " { $snippet "u" } " component-wise; any components which become negative are set to zero." } ;
diff --git a/basis/math/vectors/vectors-tests.factor b/basis/math/vectors/vectors-tests.factor
index 3e56644d3e..fc482815a9 100644
--- a/basis/math/vectors/vectors-tests.factor
+++ b/basis/math/vectors/vectors-tests.factor
@@ -17,4 +17,6 @@ USING: math.vectors tools.test ;
 
 [ 1.125 ] [ 0.0 1.0 2.0 4.0 { 0.5 0.25 } bilerp ] unit-test
 
-[ 17 ] [ 0 1 2 3 4 5 6 7 { 1 2 3 } trilerp ] unit-test
\ No newline at end of file
+[ 17 ] [ 0 1 2 3 4 5 6 7 { 1 2 3 } trilerp ] unit-test
+
+[ { 0 3 2 5 4 } ] [ { 1 2 3 4 5 } { 1 1 1 1 1 } v+- ] unit-test
\ No newline at end of file
diff --git a/basis/math/vectors/vectors.factor b/basis/math/vectors/vectors.factor
index dd48525b53..deda1dc505 100644
--- a/basis/math/vectors/vectors.factor
+++ b/basis/math/vectors/vectors.factor
@@ -24,6 +24,11 @@ IN: math.vectors
 : vmax ( u v -- w ) [ max ] 2map ;
 : vmin ( u v -- w ) [ min ] 2map ;
 
+: v+- ( u v -- w )
+    [ t ] 2dip
+    [ [ not ] 2dip pick [ + ] [ - ] if ] 2map
+    nip ;
+
 : vfloor    ( v -- _v_ ) [ floor    ] map ;
 : vceiling  ( v -- ^v^ ) [ ceiling  ] map ;
 : vtruncate ( v -- -v- ) [ truncate ] map ;

From ea44ea3522591ac1d4a4efb70a7be0bdcb940cdc Mon Sep 17 00:00:00 2001
From: Slava Pestov <slava@slava-pestovs-macbook-pro.local>
Date: Sun, 20 Sep 2009 23:16:02 -0500
Subject: [PATCH 04/15] math.vectors.simd: add saturated arithmetic operations

---
 basis/alien/c-types/c-types-tests.factor      |  9 +-
 basis/alien/c-types/c-types.factor            | 25 ++++--
 .../cfg/instructions/instructions.factor      | 17 +++-
 .../compiler/cfg/intrinsics/intrinsics.factor | 11 ++-
 .../cfg/two-operand/two-operand.factor        |  5 +-
 basis/compiler/codegen/codegen.factor         |  5 +-
 .../known-words/known-words.factor            | 13 +--
 basis/cpu/architecture/architecture.factor    | 10 ++-
 basis/cpu/x86/x86.factor                      | 89 +++++++++++--------
 .../math/vectors/simd/functor/functor.factor  | 19 +++-
 .../vectors/simd/intrinsics/intrinsics.factor |  8 +-
 basis/math/vectors/simd/simd-docs.factor      |  6 +-
 .../specialization/specialization.factor      |  3 +
 basis/math/vectors/vectors-docs.factor        | 36 +++++++-
 basis/math/vectors/vectors.factor             | 13 ++-
 .../specialized-arrays-tests.factor           |  5 +-
 .../specialized-arrays.factor                 | 31 ++++---
 vm/cpu-x86.S                                  |  4 +-
 18 files changed, 220 insertions(+), 89 deletions(-)

diff --git a/basis/alien/c-types/c-types-tests.factor b/basis/alien/c-types/c-types-tests.factor
index 792e7d416a..a893ffebe8 100644
--- a/basis/alien/c-types/c-types-tests.factor
+++ b/basis/alien/c-types/c-types-tests.factor
@@ -1,5 +1,6 @@
 USING: alien alien.syntax alien.c-types kernel tools.test
-sequences system libc alien.strings io.encodings.utf8 ;
+sequences system libc alien.strings io.encodings.utf8
+math.constants ;
 IN: alien.c-types.tests
 
 CONSTANT: xyz 123
@@ -52,3 +53,9 @@ TYPEDEF: uchar* MyLPBYTE
 os windows? cpu x86.64? and [
     [ -2147467259 ] [ 2147500037 <long> *long ] unit-test
 ] when
+
+[ 0 ] [ -10 uchar c-type-clamp ] unit-test
+[ 12 ] [ 12 uchar c-type-clamp ] unit-test
+[ -10 ] [ -10 char c-type-clamp ] unit-test
+[ 127 ] [ 230 char c-type-clamp ] unit-test
+[ t ] [ pi dup float c-type-clamp = ] unit-test
diff --git a/basis/alien/c-types/c-types.factor b/basis/alien/c-types/c-types.factor
index afbb664fed..1ad4f75a3c 100755
--- a/basis/alien/c-types/c-types.factor
+++ b/basis/alien/c-types/c-types.factor
@@ -1,11 +1,12 @@
 ! Copyright (C) 2004, 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: byte-arrays arrays assocs kernel kernel.private math
-namespaces make parser sequences strings words splitting math.parser
-cpu.architecture alien alien.accessors alien.strings quotations
-layouts system compiler.units io io.files io.encodings.binary
-io.streams.memory accessors combinators effects continuations fry
-classes vocabs vocabs.loader words.symbol ;
+math.order math.parser namespaces make parser sequences strings
+words splitting cpu.architecture alien alien.accessors
+alien.strings quotations layouts system compiler.units io
+io.files io.encodings.binary io.streams.memory accessors
+combinators effects continuations fry classes vocabs
+vocabs.loader words.symbol ;
 QUALIFIED: math
 IN: alien.c-types
 
@@ -480,3 +481,17 @@ M: int-4-rep rep-component-type drop int ;
 M: uint-4-rep rep-component-type drop uint ;
 M: float-4-rep rep-component-type drop float ;
 M: double-2-rep rep-component-type drop double ;
+
+: (unsigned-interval) ( bytes -- from to ) [ 0 ] dip 8 * 2^ 1 - ; foldable
+: unsigned-interval ( c-type -- from to ) heap-size (unsigned-interval) ; foldable
+: (signed-interval) ( bytes -- from to ) 8 * 1 - 2^ [ neg ] [ 1 - ] bi ; foldable
+: signed-interval ( c-type -- from to ) heap-size (signed-interval) ; foldable
+
+: c-type-interval ( c-type -- from to )
+    {
+        { [ dup { float double } memq? ] [ drop -1/0. 1/0. ] }
+        { [ dup { char short int long longlong } memq? ] [ signed-interval ] }
+        { [ dup { uchar ushort uint ulong ulonglong } memq? ] [ unsigned-interval ] }
+    } cond ; foldable
+
+: c-type-clamp ( value c-type -- value' ) c-type-interval clamp ; inline
diff --git a/basis/compiler/cfg/instructions/instructions.factor b/basis/compiler/cfg/instructions/instructions.factor
index 63297b9bdf..0be42a24ab 100644
--- a/basis/compiler/cfg/instructions/instructions.factor
+++ b/basis/compiler/cfg/instructions/instructions.factor
@@ -305,7 +305,7 @@ def: dst
 use: src1 src2
 literal: rep ;
 
-PURE-INSN: ##sub-vector
+PURE-INSN: ##saturated-add-vector
 def: dst
 use: src1 src2
 literal: rep ;
@@ -315,11 +315,26 @@ def: dst
 use: src1 src2
 literal: rep ;
 
+PURE-INSN: ##sub-vector
+def: dst
+use: src1 src2
+literal: rep ;
+
+PURE-INSN: ##saturated-sub-vector
+def: dst
+use: src1 src2
+literal: rep ;
+
 PURE-INSN: ##mul-vector
 def: dst
 use: src1 src2
 literal: rep ;
 
+PURE-INSN: ##saturated-mul-vector
+def: dst
+use: src1 src2
+literal: rep ;
+
 PURE-INSN: ##div-vector
 def: dst
 use: src1 src2
diff --git a/basis/compiler/cfg/intrinsics/intrinsics.factor b/basis/compiler/cfg/intrinsics/intrinsics.factor
index 5b3fd1b324..b9835827fa 100644
--- a/basis/compiler/cfg/intrinsics/intrinsics.factor
+++ b/basis/compiler/cfg/intrinsics/intrinsics.factor
@@ -151,13 +151,16 @@ IN: compiler.cfg.intrinsics
         { math.integers.private:fixnum-log2 [ drop emit-fixnum-log2 ] }
     } enable-intrinsics ;
 
-: enable-sse2-simd ( -- )
+: enable-simd ( -- )
     {
         { math.vectors.simd.intrinsics:assert-positive [ drop ] }
         { math.vectors.simd.intrinsics:(simd-v+) [ [ ^^add-vector ] emit-binary-vector-op ] }
+        { math.vectors.simd.intrinsics:(simd-vs+) [ [ ^^saturated-add-vector ] emit-binary-vector-op ] }
         { math.vectors.simd.intrinsics:(simd-v+-) [ [ ^^add-sub-vector ] emit-binary-vector-op ] }
         { math.vectors.simd.intrinsics:(simd-v-) [ [ ^^sub-vector ] emit-binary-vector-op ] }
+        { math.vectors.simd.intrinsics:(simd-vs-) [ [ ^^saturated-sub-vector ] emit-binary-vector-op ] }
         { math.vectors.simd.intrinsics:(simd-v*) [ [ ^^mul-vector ] emit-binary-vector-op ] }
+        { math.vectors.simd.intrinsics:(simd-vs*) [ [ ^^saturated-mul-vector ] emit-binary-vector-op ] }
         { math.vectors.simd.intrinsics:(simd-v/) [ [ ^^div-vector ] emit-binary-vector-op ] }
         { math.vectors.simd.intrinsics:(simd-vmin) [ [ ^^min-vector ] emit-binary-vector-op ] }
         { math.vectors.simd.intrinsics:(simd-vmax) [ [ ^^max-vector ] emit-binary-vector-op ] }
@@ -165,14 +168,10 @@ IN: compiler.cfg.intrinsics
         { math.vectors.simd.intrinsics:(simd-broadcast) [ [ ^^broadcast-vector ] emit-unary-vector-op ] }
         { math.vectors.simd.intrinsics:(simd-gather-2) [ emit-gather-vector-2 ] }
         { math.vectors.simd.intrinsics:(simd-gather-4) [ emit-gather-vector-4 ] }
+        { math.vectors.simd.intrinsics:(simd-sum) [ [ ^^horizontal-add-vector ] emit-unary-vector-op ] }
         { math.vectors.simd.intrinsics:alien-vector [ emit-alien-vector ] }
         { math.vectors.simd.intrinsics:set-alien-vector [ emit-set-alien-vector ] }
     } enable-intrinsics ;
 
-: enable-sse3-simd ( -- )
-    {
-        { math.vectors.simd.intrinsics:(simd-sum) [ [ ^^horizontal-add-vector ] emit-unary-vector-op ] }
-    } enable-intrinsics ;
-
 : emit-intrinsic ( node word -- )
     "intrinsic" word-prop call( node -- ) ;
diff --git a/basis/compiler/cfg/two-operand/two-operand.factor b/basis/compiler/cfg/two-operand/two-operand.factor
index c275756046..1186e6b41f 100644
--- a/basis/compiler/cfg/two-operand/two-operand.factor
+++ b/basis/compiler/cfg/two-operand/two-operand.factor
@@ -47,9 +47,12 @@ UNION: two-operand-insn
     ##min-float
     ##max-float
     ##add-vector
-    ##sub-vector
+    ##saturated-add-vector
     ##add-sub-vector
+    ##sub-vector
+    ##saturated-sub-vector
     ##mul-vector
+    ##saturated-mul-vector
     ##div-vector
     ##min-vector
     ##max-vector ;
diff --git a/basis/compiler/codegen/codegen.factor b/basis/compiler/codegen/codegen.factor
index 14246a3fbf..06a16663cb 100755
--- a/basis/compiler/codegen/codegen.factor
+++ b/basis/compiler/codegen/codegen.factor
@@ -169,9 +169,12 @@ CODEGEN: ##gather-vector-2 %gather-vector-2
 CODEGEN: ##gather-vector-4 %gather-vector-4
 CODEGEN: ##box-vector %box-vector
 CODEGEN: ##add-vector %add-vector
-CODEGEN: ##sub-vector %sub-vector
+CODEGEN: ##saturated-add-vector %saturated-add-vector
 CODEGEN: ##add-sub-vector %add-sub-vector
+CODEGEN: ##sub-vector %sub-vector
+CODEGEN: ##saturated-sub-vector %saturated-sub-vector
 CODEGEN: ##mul-vector %mul-vector
+CODEGEN: ##saturated-mul-vector %saturated-mul-vector
 CODEGEN: ##div-vector %div-vector
 CODEGEN: ##min-vector %min-vector
 CODEGEN: ##max-vector %max-vector
diff --git a/basis/compiler/tree/propagation/known-words/known-words.factor b/basis/compiler/tree/propagation/known-words/known-words.factor
index 621b8d082b..d4780b335b 100644
--- a/basis/compiler/tree/propagation/known-words/known-words.factor
+++ b/basis/compiler/tree/propagation/known-words/known-words.factor
@@ -18,6 +18,7 @@ compiler.tree.propagation.constraints
 compiler.tree.propagation.call-effect
 compiler.tree.propagation.transforms
 compiler.tree.propagation.simd ;
+FROM: alien.c-types => (signed-interval) (unsigned-interval) ;
 IN: compiler.tree.propagation.known-words
 
 { + - * / }
@@ -260,15 +261,9 @@ generic-comparison-ops [
     alien-unsigned-8
 } [
     dup name>> {
-        {
-            [ "alien-signed-" ?head ]
-            [ string>number 8 * 1 - 2^ dup neg swap 1 - [a,b] ]
-        }
-        {
-            [ "alien-unsigned-" ?head ]
-            [ string>number 8 * 2^ 1 - 0 swap [a,b] ]
-        }
-    } cond
+        { [ "alien-signed-" ?head ] [ string>number (signed-interval) ] }
+        { [ "alien-unsigned-" ?head ] [ string>number (unsigned-interval) ] }
+    } cond [a,b]
     [ fits-in-fixnum? fixnum integer ? ] keep <class/interval-info>
     '[ 2drop _ ] "outputs" set-word-prop
 ] each
diff --git a/basis/cpu/architecture/architecture.factor b/basis/cpu/architecture/architecture.factor
index 331d459adf..29e58cb163 100644
--- a/basis/cpu/architecture/architecture.factor
+++ b/basis/cpu/architecture/architecture.factor
@@ -181,9 +181,12 @@ HOOK: %broadcast-vector cpu ( dst src rep -- )
 HOOK: %gather-vector-2 cpu ( dst src1 src2 rep -- )
 HOOK: %gather-vector-4 cpu ( dst src1 src2 src3 src4 rep -- )
 HOOK: %add-vector cpu ( dst src1 src2 rep -- )
-HOOK: %sub-vector cpu ( dst src1 src2 rep -- )
+HOOK: %saturated-add-vector cpu ( dst src1 src2 rep -- )
 HOOK: %add-sub-vector cpu ( dst src1 src2 rep -- )
+HOOK: %sub-vector cpu ( dst src1 src2 rep -- )
+HOOK: %saturated-sub-vector cpu ( dst src1 src2 rep -- )
 HOOK: %mul-vector cpu ( dst src1 src2 rep -- )
+HOOK: %saturated-mul-vector cpu ( dst src1 src2 rep -- )
 HOOK: %div-vector cpu ( dst src1 src2 rep -- )
 HOOK: %min-vector cpu ( dst src1 src2 rep -- )
 HOOK: %max-vector cpu ( dst src1 src2 rep -- )
@@ -194,9 +197,12 @@ HOOK: %broadcast-vector-reps cpu ( -- reps )
 HOOK: %gather-vector-2-reps cpu ( -- reps )
 HOOK: %gather-vector-4-reps cpu ( -- reps )
 HOOK: %add-vector-reps cpu ( -- reps )
-HOOK: %sub-vector-reps cpu ( -- reps )
+HOOK: %saturated-add-vector-reps cpu ( -- reps )
 HOOK: %add-sub-vector-reps cpu ( -- reps )
+HOOK: %sub-vector-reps cpu ( -- reps )
+HOOK: %saturated-sub-vector-reps cpu ( -- reps )
 HOOK: %mul-vector-reps cpu ( -- reps )
+HOOK: %saturated-mul-vector-reps cpu ( -- reps )
 HOOK: %div-vector-reps cpu ( -- reps )
 HOOK: %min-vector-reps cpu ( -- reps )
 HOOK: %max-vector-reps cpu ( -- reps )
diff --git a/basis/cpu/x86/x86.factor b/basis/cpu/x86/x86.factor
index a132947cf1..62caf1ffe9 100644
--- a/basis/cpu/x86/x86.factor
+++ b/basis/cpu/x86/x86.factor
@@ -323,6 +323,30 @@ M: x86 %add-vector-reps
         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
     } available-reps ;
 
+M: x86 %saturated-add-vector ( dst src1 src2 rep -- )
+    {
+        { char-16-rep [ PADDSB ] }
+        { uchar-16-rep [ PADDUSB ] }
+        { short-8-rep [ PADDSW ] }
+        { ushort-8-rep [ PADDUSW ] }
+    } case drop ;
+
+M: x86 %saturated-add-vector-reps
+    {
+        { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } }
+    } available-reps ;
+
+M: x86 %add-sub-vector ( dst src1 src2 rep -- )
+    {
+        { float-4-rep [ ADDSUBPS ] }
+        { double-2-rep [ ADDSUBPD ] }
+    } case drop ;
+
+M: x86 %add-sub-vector-reps
+    {
+        { sse3? { float-4-rep double-2-rep } }
+    } available-reps ;
+
 M: x86 %sub-vector ( dst src1 src2 rep -- )
     {
         { float-4-rep [ SUBPS ] }
@@ -341,15 +365,17 @@ M: x86 %sub-vector-reps
         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
     } available-reps ;
 
-M: x86 %add-sub-vector ( dst src1 src2 rep -- )
+M: x86 %saturated-sub-vector ( dst src1 src2 rep -- )
     {
-        { float-4-rep [ ADDSUBPS ] }
-        { double-2-rep [ ADDSUBPD ] }
+        { char-16-rep [ PSUBSB ] }
+        { uchar-16-rep [ PSUBUSB ] }
+        { short-8-rep [ PSUBSW ] }
+        { ushort-8-rep [ PSUBUSW ] }
     } case drop ;
 
-M: x86 %add-sub-vector-reps
+M: x86 %saturated-sub-vector-reps
     {
-        { sse3? { float-4-rep double-2-rep } }
+        { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } }
     } available-reps ;
 
 M: x86 %mul-vector ( dst src1 src2 rep -- )
@@ -368,6 +394,10 @@ M: x86 %mul-vector-reps
         { sse2? { double-2-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
     } available-reps ;
 
+M: x86 %saturated-mul-vector-reps
+    ! No multiplication with saturation on x86
+    { } ;
+
 M: x86 %div-vector ( dst src1 src2 rep -- )
     {
         { float-4-rep [ DIVPS ] }
@@ -854,46 +884,29 @@ M: x86 small-enough? ( n -- ? )
     #! set up by the caller.
     stack-frame get total-size>> + stack@ ;
 
-: enable-sse2 ( -- )
-    enable-float-intrinsics
-    enable-fsqrt
-    enable-float-min/max
-    enable-sse2-simd ;
-
-: enable-sse3 ( -- )
-    enable-sse2
-    enable-sse3-simd ;
-
-enable-min/max
-
-:: install-sse-check ( version -- )
+:: install-sse2-check ( -- )
     [
-        sse-version version < [
-            "This image was built to use " write
-            version sse-string write
-            " but your CPU only supports " write
-            sse-version sse-string write "." print
+        sse-version 20 < [
+            "This image was built to use SSE2 but your CPU does not support it." print
             "You will need to bootstrap Factor again." print
             flush
             1 exit
         ] when
     ] "cpu.x86" add-init-hook ;
 
-: enable-sse ( version -- )
-    {
-        { 00 [ ] }
-        { 10 [ ] }
-        { 20 [ enable-sse2 ] }
-        { 30 [ enable-sse3 ] }
-        { 33 [ enable-sse3 ] }
-        { 41 [ enable-sse3 ] }
-        { 42 [ enable-sse3 ] }
-    } case ;
+: enable-sse2 ( version -- )
+    20 >= [
+        enable-float-intrinsics
+        enable-fsqrt
+        enable-float-min/max
+        install-sse2-check
+    ] when ;
+
+
+enable-simd
+enable-min/max
 
 : check-sse ( -- )
     [ { sse_version } compile ] with-optimizer
-
-    "Checking for multimedia extensions: " write sse-version 30 min
-    [ sse-string write " detected" print ]
-    [ install-sse-check ]
-    [ enable-sse ] tri ;
+    "Checking for multimedia extensions: " write sse-version
+    [ sse-string write " detected" print ] [ enable-sse2 ] bi ;
diff --git a/basis/math/vectors/simd/functor/functor.factor b/basis/math/vectors/simd/functor/functor.factor
index 2141914d1c..7d84b18225 100644
--- a/basis/math/vectors/simd/functor/functor.factor
+++ b/basis/math/vectors/simd/functor/functor.factor
@@ -41,9 +41,12 @@ MACRO: simd-boa ( rep class -- simd-array )
     [
         {
             { v+ (simd-v+) }
-            { v- (simd-v-) }
+            { vs+ (simd-vs+) }
             { v+- (simd-v+-) }
+            { v- (simd-v-) }
+            { vs- (simd-vs-) }
             { v* (simd-v*) }
+            { vs* (simd-vs*) }
             { v/ (simd-v/) }
             { vmin (simd-vmin) }
             { vmax (simd-vmax) }
@@ -111,7 +114,7 @@ A{           DEFINES ${A}{
 NTH          [ T dup c-type-getter-boxer array-accessor ]
 SET-NTH      [ T dup c-setter array-accessor ]
 
-A-rep        IS ${A}-rep
+A-rep        [ A name>> "-rep" append "cpu.architecture" lookup ]
 A-vv->v-op   DEFINES-PRIVATE ${A}-vv->v-op
 A-v->n-op    DEFINES-PRIVATE ${A}-v->n-op
 
@@ -142,6 +145,8 @@ M: A equal? over \ A instance? [ sequence= ] [ 2drop f ] if ;
 
 M: A byte-length underlying>> length ; inline
 
+M: A element-type drop A-rep rep-component-type ;
+
 M: A pprint-delims drop \ A{ \ } ;
 
 M: A >pprint-sequence ;
@@ -172,9 +177,12 @@ INSTANCE: A sequence
 
 \ A \ A-with \ A-rep H{
     { v+ [ [ (simd-v+) ] \ A-vv->v-op execute ] }
+    { vs+ [ [ (simd-vs+) ] \ A-vv->v-op execute ] }
     { v+- [ [ (simd-v+-) ] \ A-vv->v-op execute ] }
     { v- [ [ (simd-v-) ] \ A-vv->v-op execute ] }
+    { vs- [ [ (simd-vs-) ] \ A-vv->v-op execute ] }
     { v* [ [ (simd-v*) ] \ A-vv->v-op execute ] }
+    { vs* [ [ (simd-vs*) ] \ A-vv->v-op execute ] }
     { v/ [ [ (simd-v/) ] \ A-vv->v-op execute ] }
     { vmin [ [ (simd-vmin) ] \ A-vv->v-op execute ] }
     { vmax [ [ (simd-vmax) ] \ A-vv->v-op execute ] }
@@ -227,7 +235,7 @@ A{           DEFINES ${A}{
 
 A-deref      DEFINES-PRIVATE ${A}-deref
 
-A-rep        IS ${A/2}-rep
+A-rep        [ A/2 name>> "-rep" append "cpu.architecture" lookup ]
 A-vv->v-op   DEFINES-PRIVATE ${A}-vv->v-op
 A-v->n-op    DEFINES-PRIVATE ${A}-v->n-op
 
@@ -267,6 +275,8 @@ M: A equal? over \ A instance? [ sequence= ] [ 2drop f ] if ;
 
 M: A byte-length drop 32 ; inline
 
+M: A element-type drop A-rep rep-component-type ;
+
 SYNTAX: A{ \ } [ >A ] parse-literal ;
 
 M: A pprint-delims drop \ A{ \ } ;
@@ -298,9 +308,12 @@ INSTANCE: A sequence
 
 \ A \ A-with \ A-rep H{
     { v+ [ [ (simd-v+) ] \ A-vv->v-op execute ] }
+    { vs+ [ [ (simd-vs+) ] \ A-vv->v-op execute ] }
     { v- [ [ (simd-v-) ] \ A-vv->v-op execute ] }
+    { vs- [ [ (simd-vs-) ] \ A-vv->v-op execute ] }
     { v+- [ [ (simd-v+-) ] \ A-vv->v-op execute ] }
     { v* [ [ (simd-v*) ] \ A-vv->v-op execute ] }
+    { vs* [ [ (simd-vs*) ] \ A-vv->v-op execute ] }
     { v/ [ [ (simd-v/) ] \ A-vv->v-op execute ] }
     { vmin [ [ (simd-vmin) ] \ A-vv->v-op execute ] }
     { vmax [ [ (simd-vmax) ] \ A-vv->v-op execute ] }
diff --git a/basis/math/vectors/simd/intrinsics/intrinsics.factor b/basis/math/vectors/simd/intrinsics/intrinsics.factor
index 6d39b9e70a..63214f7da6 100644
--- a/basis/math/vectors/simd/intrinsics/intrinsics.factor
+++ b/basis/math/vectors/simd/intrinsics/intrinsics.factor
@@ -8,8 +8,11 @@ IN: math.vectors.simd.intrinsics
 ERROR: bad-simd-call ;
 
 : (simd-v+) ( v1 v2 rep -- v3 ) bad-simd-call ;
-: (simd-v+-) ( v1 v2 rep -- v3 ) bad-simd-call ;
 : (simd-v-) ( v1 v2 rep -- v3 ) bad-simd-call ;
+: (simd-v+-) ( v1 v2 rep -- v3 ) bad-simd-call ;
+: (simd-vs+) ( v1 v2 rep -- v3 ) bad-simd-call ;
+: (simd-vs-) ( v1 v2 rep -- v3 ) bad-simd-call ;
+: (simd-vs*) ( v1 v2 rep -- v3 ) bad-simd-call ;
 : (simd-v*) ( v1 v2 rep -- v3 ) bad-simd-call ;
 : (simd-v/) ( v1 v2 rep -- v3 ) bad-simd-call ;
 : (simd-vmin) ( v1 v2 rep -- v3 ) bad-simd-call ;
@@ -68,9 +71,12 @@ GENERIC# supported-simd-op? 1 ( rep intrinsic -- ? )
 M: vector-rep supported-simd-op?
     {
         { \ (simd-v+)        [ %add-vector-reps            ] }
+        { \ (simd-vs+)       [ %saturated-add-vector-reps  ] }
         { \ (simd-v+-)       [ %add-sub-vector-reps        ] }
         { \ (simd-v-)        [ %sub-vector-reps            ] }
+        { \ (simd-vs-)       [ %saturated-sub-vector-reps  ] }
         { \ (simd-v*)        [ %mul-vector-reps            ] }
+        { \ (simd-vs*)       [ %saturated-mul-vector-reps  ] }
         { \ (simd-v/)        [ %div-vector-reps            ] }
         { \ (simd-vmin)      [ %min-vector-reps            ] }
         { \ (simd-vmax)      [ %max-vector-reps            ] }
diff --git a/basis/math/vectors/simd/simd-docs.factor b/basis/math/vectors/simd/simd-docs.factor
index d108d70b26..ef625ffff0 100644
--- a/basis/math/vectors/simd/simd-docs.factor
+++ b/basis/math/vectors/simd/simd-docs.factor
@@ -161,8 +161,12 @@ ARTICLE: "math.vectors.simd.intrinsics" "Low-level SIMD primitives"
 $nl
 "It is best to avoid calling these primitives directly. To write efficient high-level code that compiles down to primitives and avoids memory allocation, see " { $link "math.vectors.simd.efficiency" } "."
 { $subsection (simd-v+) }
-{ $subsection (simd-v-) }
+{ $subsection (simd-vs+) }
 { $subsection (simd-v+-) }
+{ $subsection (simd-v-) }
+{ $subsection (simd-vs-) }
+{ $subsection (simd-v*) }
+{ $subsection (simd-vs*) }
 { $subsection (simd-v/) }
 { $subsection (simd-vmin) }
 { $subsection (simd-vmax) }
diff --git a/basis/math/vectors/specialization/specialization.factor b/basis/math/vectors/specialization/specialization.factor
index 1a85f5ade7..2fb36d428a 100644
--- a/basis/math/vectors/specialization/specialization.factor
+++ b/basis/math/vectors/specialization/specialization.factor
@@ -53,11 +53,14 @@ H{
     { norm-sq { +vector+ -> +nonnegative+ } }
     { normalize { +vector+ -> +vector+ } }
     { v* { +vector+ +vector+ -> +vector+ } }
+    { vs* { +vector+ +vector+ -> +vector+ } }
     { v*n { +vector+ +scalar+ -> +vector+ } }
     { v+ { +vector+ +vector+ -> +vector+ } }
+    { vs+ { +vector+ +vector+ -> +vector+ } }
     { v+- { +vector+ +vector+ -> +vector+ } }
     { v+n { +vector+ +scalar+ -> +vector+ } }
     { v- { +vector+ +vector+ -> +vector+ } }
+    { vs- { +vector+ +vector+ -> +vector+ } }
     { v-n { +vector+ +scalar+ -> +vector+ } }
     { v. { +vector+ +vector+ -> +scalar+ } }
     { v/ { +vector+ +vector+ -> +vector+ } }
diff --git a/basis/math/vectors/vectors-docs.factor b/basis/math/vectors/vectors-docs.factor
index 4f2f093216..ce635b6d60 100644
--- a/basis/math/vectors/vectors-docs.factor
+++ b/basis/math/vectors/vectors-docs.factor
@@ -26,7 +26,11 @@ $nl
 { $subsection v. }
 { $subsection norm }
 { $subsection norm-sq }
-{ $subsection normalize } ;
+{ $subsection normalize }
+"Saturated arithmetic may be performed on " { $link "specialized-arrays" } "; the results are clamped to the minimum and maximum bounds of the array element type, instead of wrapping around:"
+{ $subsection vs+ }
+{ $subsection vs- }
+{ $subsection vs* } ;
 
 ABOUT: "math-vectors"
 
@@ -100,6 +104,34 @@ HELP: v.
     { $snippet "0 [ conjugate * + ] 2reduce" }
 } ;
 
+HELP: vs+
+{ $values { "u" "a sequence of numbers" } { "v" "a sequence of numbers" } { "w" "a sequence of numbers" } }
+{ $description "Adds " { $snippet "u" } " and " { $snippet "v" } " component-wise with saturation." }
+{ $examples
+    "With saturation:"
+    { $example
+        "USING: math.vectors prettyprint specialized-arrays ;"
+        "SPECIALIZED-ARRAY: uchar"
+        "uchar-array{ 100 200 150 } uchar-array{ 70 70 70 } vs+ ."
+        "uchar-array{ 170 255 220 }"
+    }
+    "Without saturation:"
+    { $example
+        "USING: math.vectors prettyprint specialized-arrays ;"
+        "SPECIALIZED-ARRAY: uchar"
+        "uchar-array{ 100 200 150 } uchar-array{ 70 70 70 } v+ ."
+        "uchar-array{ 170 14 220 }"
+    }
+} ;
+
+HELP: vs-
+{ $values { "u" "a sequence of numbers" } { "v" "a sequence of numbers" } { "w" "a sequence of numbers" } }
+{ $description "Subtracts " { $snippet "v" } " from " { $snippet "u" } " component-wise with saturation." } ;
+
+HELP: vs*
+{ $values { "u" "a sequence of numbers" } { "v" "a sequence of numbers" } { "w" "a sequence of numbers" } }
+{ $description "Multiplies " { $snippet "u" } " and " { $snippet "v" } " component-wise with saturation." } ;
+
 HELP: norm-sq
 { $values { "v" "a sequence of numbers" } { "x" "a non-negative real number" } }
 { $description "Computes the squared length of a mathematical vector." } ;
@@ -120,3 +152,5 @@ HELP: set-axis
 { 2map v+ v- v* v/ } related-words
 
 { 2reduce v. } related-words
+
+{ vs+ vs- vs* } related-words
diff --git a/basis/math/vectors/vectors.factor b/basis/math/vectors/vectors.factor
index deda1dc505..3a1b2875a9 100644
--- a/basis/math/vectors/vectors.factor
+++ b/basis/math/vectors/vectors.factor
@@ -1,9 +1,11 @@
 ! Copyright (C) 2005, 2008 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: arrays kernel sequences math math.functions hints
-math.order ;
+USING: arrays alien.c-types kernel sequences math math.functions
+hints math.order fry ;
 IN: math.vectors
 
+GENERIC: element-type ( obj -- c-type )
+
 : vneg ( u -- v ) [ neg ] map ;
 
 : v+n ( u n -- v ) [ + ] curry map ;
@@ -29,6 +31,13 @@ IN: math.vectors
     [ [ not ] 2dip pick [ + ] [ - ] if ] 2map
     nip ;
 
+: 2saturate-map ( u v quot -- w )
+    pick element-type '[ @ _ c-type-clamp ] 2map ; inline
+
+: vs+ ( u v -- w ) [ + ] 2saturate-map ;
+: vs- ( u v -- w ) [ - ] 2saturate-map ;
+: vs* ( u v -- w ) [ * ] 2saturate-map ;
+
 : vfloor    ( v -- _v_ ) [ floor    ] map ;
 : vceiling  ( v -- ^v^ ) [ ceiling  ] map ;
 : vtruncate ( v -- -v- ) [ truncate ] map ;
diff --git a/basis/specialized-arrays/specialized-arrays-tests.factor b/basis/specialized-arrays/specialized-arrays-tests.factor
index 5d88f42d50..526312e0aa 100755
--- a/basis/specialized-arrays/specialized-arrays-tests.factor
+++ b/basis/specialized-arrays/specialized-arrays-tests.factor
@@ -4,7 +4,7 @@ specialized-arrays.private sequences alien.c-types accessors
 kernel arrays combinators compiler compiler.units classes.struct
 combinators.smart compiler.tree.debugger math libc destructors
 sequences.private multiline eval words vocabs namespaces
-assocs prettyprint alien.data ;
+assocs prettyprint alien.data math.vectors ;
 FROM: alien.c-types => float ;
 
 SPECIALIZED-ARRAY: int
@@ -13,6 +13,9 @@ SPECIALIZED-ARRAY: ushort
 SPECIALIZED-ARRAY: char
 SPECIALIZED-ARRAY: uint
 SPECIALIZED-ARRAY: float
+SPECIALIZED-ARRAY: ulonglong
+
+[ ulonglong ] [ ulonglong-array{ } element-type ] unit-test
 
 [ t ] [ { 1 2 3 } >int-array int-array? ] unit-test
 
diff --git a/basis/specialized-arrays/specialized-arrays.factor b/basis/specialized-arrays/specialized-arrays.factor
index a64d052fd1..9692980858 100755
--- a/basis/specialized-arrays/specialized-arrays.factor
+++ b/basis/specialized-arrays/specialized-arrays.factor
@@ -1,10 +1,11 @@
 ! Copyright (C) 2008, 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors alien alien.c-types alien.data alien.parser assocs
-byte-arrays classes compiler.units functors kernel lexer libc math
-math.vectors.specialization namespaces parser prettyprint.custom
-sequences sequences.private strings summary vocabs vocabs.loader
-vocabs.parser vocabs.generated words fry combinators ;
+USING: accessors alien alien.c-types alien.data alien.parser
+assocs byte-arrays classes compiler.units functors kernel lexer
+libc math math.vectors math.vectors.specialization namespaces
+parser prettyprint.custom sequences sequences.private strings
+summary vocabs vocabs.loader vocabs.parser vocabs.generated
+words fry combinators present ;
 IN: specialized-arrays
 
 MIXIN: specialized-array
@@ -53,14 +54,14 @@ TUPLE: A
 
 : <direct-A> ( alien len -- specialized-array ) A boa ; inline
 
-: <A> ( n -- specialized-array ) [ T <underlying> ] keep <direct-A> ; inline
+: <A> ( n -- specialized-array ) [ \ T <underlying> ] keep <direct-A> ; inline
 
-: (A) ( n -- specialized-array ) [ T (underlying) ] keep <direct-A> ; inline
+: (A) ( n -- specialized-array ) [ \ T (underlying) ] keep <direct-A> ; inline
 
-: malloc-A ( len -- specialized-array ) [ T heap-size calloc ] keep <direct-A> ; inline
+: malloc-A ( len -- specialized-array ) [ \ T heap-size calloc ] keep <direct-A> ; inline
 
 : byte-array>A ( byte-array -- specialized-array )
-    dup length T heap-size /mod 0 = [ drop T bad-byte-array-length ] unless
+    dup length \ T heap-size /mod 0 = [ drop \ T bad-byte-array-length ] unless
     <direct-A> ; inline
 
 M: A clone [ underlying>> clone ] [ length>> ] bi <direct-A> ; inline
@@ -81,12 +82,14 @@ M: A equal? over A instance? [ sequence= ] [ 2drop f ] if ;
 
 M: A resize
     [
-        [ T heap-size * ] [ underlying>> ] bi*
+        [ \ T heap-size * ] [ underlying>> ] bi*
         resize-byte-array
     ] [ drop ] 2bi
     <direct-A> ; inline
 
-M: A byte-length length T heap-size * ; inline
+M: A byte-length length \ T heap-size * ; inline
+
+M: A element-type drop \ T ; inline
 
 M: A direct-array-syntax drop \ A@ ;
 
@@ -116,15 +119,15 @@ M: word (underlying-type) "c-type" word-prop ;
     } cond ;
 
 : underlying-type-name ( c-type -- name )
-    underlying-type dup word? [ name>> ] when ;
+    underlying-type present ;
 
 : specialized-array-vocab ( c-type -- vocab )
-    "specialized-arrays.instances." prepend ;
+    present "specialized-arrays.instances." prepend ;
 
 PRIVATE>
 
 : define-array-vocab ( type -- vocab )
-    underlying-type-name
+    underlying-type
     [ specialized-array-vocab ] [ '[ _ define-array ] ] bi
     generate-vocab ;
 
diff --git a/vm/cpu-x86.S b/vm/cpu-x86.S
index d229b2cb79..fe97fe2179 100644
--- a/vm/cpu-x86.S
+++ b/vm/cpu-x86.S
@@ -77,12 +77,12 @@ DEF(F_FASTCALL void,lazy_jit_compile,(CELL quot)):
 DEF(bool,sse_version,(void)):
 	mov $0x1,RETURN_REG
 	cpuid
-	/* test $0x100000,%ecx
+	test $0x100000,%ecx
 	jnz sse_42
 	test $0x80000,%ecx
 	jnz sse_41
 	test $0x200,%ecx
-	jnz ssse_3 */
+	jnz ssse_3
 	test $0x1,%ecx
 	jnz sse_3
 	test $0x4000000,%edx

From 3de9f4c33962f2fa1e4bf5e780fb4a5f57466144 Mon Sep 17 00:00:00 2001
From: Slava Pestov <slava@slava-pestovs-macbook-pro.local>
Date: Sun, 20 Sep 2009 23:16:14 -0500
Subject: [PATCH 05/15] mason.child: don't need command line switches for SSE
 version anymore

---
 extra/mason/child/child-tests.factor | 2 +-
 extra/mason/child/child.factor       | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/extra/mason/child/child-tests.factor b/extra/mason/child/child-tests.factor
index e8e1a9e0e9..2d5a7c6635 100644
--- a/extra/mason/child/child-tests.factor
+++ b/extra/mason/child/child-tests.factor
@@ -33,7 +33,7 @@ USING: mason.child mason.config tools.test namespaces io kernel sequences ;
     ] with-scope
 ] unit-test
 
-[ { "./factor.com" "-i=boot.x86.32.image" "-no-user-init" "-sse-version=30" } ] [
+[ { "./factor.com" "-i=boot.x86.32.image" "-no-user-init" } ] [
     [
         "winnt" target-os set
         "x86.32" target-cpu set
diff --git a/extra/mason/child/child.factor b/extra/mason/child/child.factor
index b3ee6c2c76..193ac1e212 100755
--- a/extra/mason/child/child.factor
+++ b/extra/mason/child/child.factor
@@ -34,7 +34,6 @@ IN: mason.child
         factor-vm ,
         "-i=" boot-image-name append ,
         "-no-user-init" ,
-        target-cpu get { "x86.32" "x86.64" } member? [ "-sse-version=30" , ] when
     ] { } make ;
 
 : boot ( -- )

From 603a560eafd14f3b71a9faeebb79970d9cfb2f92 Mon Sep 17 00:00:00 2001
From: Slava Pestov <slava@slava-pestovs-macbook-pro.local>
Date: Sun, 20 Sep 2009 23:24:30 -0500
Subject: [PATCH 06/15] cpu.x86: fix using list

---
 basis/cpu/x86/x86.factor | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/basis/cpu/x86/x86.factor b/basis/cpu/x86/x86.factor
index d40e03cdb6..4850403908 100644
--- a/basis/cpu/x86/x86.factor
+++ b/basis/cpu/x86/x86.factor
@@ -5,8 +5,7 @@ cpu.x86.assembler cpu.x86.assembler.private cpu.x86.assembler.operands
 cpu.architecture kernel kernel.private math memory namespaces make
 sequences words system layouts combinators math.order fry locals
 compiler.constants byte-arrays io macros quotations cpu.x86.features
-cpu.x86.features.private compiler compiler.units init
-compiler.constants vm byte-arrays
+cpu.x86.features.private compiler compiler.units init vm
 compiler.cfg.registers
 compiler.cfg.instructions
 compiler.cfg.intrinsics

From 74a72ce17c31b4df3f260154d65419f130f67078 Mon Sep 17 00:00:00 2001
From: Slava Pestov <slava@factorcode.org>
Date: Tue, 22 Sep 2009 23:54:42 -0500
Subject: [PATCH 07/15] checksums.openssl: use correct API for creating
 EVP_MD_CTX, fixes crash on NetBSD

---
 basis/checksums/openssl/openssl.factor | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/basis/checksums/openssl/openssl.factor b/basis/checksums/openssl/openssl.factor
index bc70230fd0..095ab38ace 100644
--- a/basis/checksums/openssl/openssl.factor
+++ b/basis/checksums/openssl/openssl.factor
@@ -1,4 +1,4 @@
-! Copyright (C) 2008 Slava Pestov
+! copyright (C) 2008 Slava Pestov
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors byte-arrays alien.c-types alien.data kernel
 continuations destructors sequences io openssl openssl.libcrypto
@@ -23,10 +23,10 @@ TUPLE: evp-md-context < disposable handle ;
 
 : <evp-md-context> ( -- ctx )
     evp-md-context new-disposable
-    EVP_MD_CTX <struct> dup EVP_MD_CTX_init >>handle ;
+    EVP_MD_CTX_create >>handle ;
 
 M: evp-md-context dispose*
-    handle>> EVP_MD_CTX_cleanup drop ;
+    handle>> EVP_MD_CTX_destroy ;
 
 : with-evp-md-context ( quot -- )
     maybe-init-ssl [ <evp-md-context> ] dip with-disposal ; inline

From 15ba7e299b333b63e349268a832ad369c62167fd Mon Sep 17 00:00:00 2001
From: Slava Pestov <slava@slava-pestovs-macbook-pro.local>
Date: Wed, 23 Sep 2009 01:05:19 -0500
Subject: [PATCH 08/15] math.vectors.simd: fix all tests

---
 basis/math/vectors/simd/simd-docs.factor  | 28 ++++-------
 basis/math/vectors/simd/simd-tests.factor | 60 +++++++++++++++--------
 2 files changed, 50 insertions(+), 38 deletions(-)

diff --git a/basis/math/vectors/simd/simd-docs.factor b/basis/math/vectors/simd/simd-docs.factor
index ef625ffff0..6d212f8e2b 100644
--- a/basis/math/vectors/simd/simd-docs.factor
+++ b/basis/math/vectors/simd/simd-docs.factor
@@ -1,5 +1,5 @@
-USING: help.markup help.syntax sequences math math.vectors
-multiline kernel.private classes.tuple.private
+USING: help.markup help.syntax help.vocabs sequences math
+math.vectors multiline kernel.private classes.tuple.private
 math.vectors.simd.intrinsics cpu.architecture ;
 IN: math.vectors.simd
 
@@ -159,22 +159,8 @@ ARTICLE: "math.vectors.simd.intrinsics" "Low-level SIMD primitives"
 }
 "The compiler converts " { $link "math-vectors" } " into SIMD primitives automatically in cases where it is safe; this means that the input types are known to be SIMD vectors, and the CPU supports SIMD."
 $nl
-"It is best to avoid calling these primitives directly. To write efficient high-level code that compiles down to primitives and avoids memory allocation, see " { $link "math.vectors.simd.efficiency" } "."
-{ $subsection (simd-v+) }
-{ $subsection (simd-vs+) }
-{ $subsection (simd-v+-) }
-{ $subsection (simd-v-) }
-{ $subsection (simd-vs-) }
-{ $subsection (simd-v*) }
-{ $subsection (simd-vs*) }
-{ $subsection (simd-v/) }
-{ $subsection (simd-vmin) }
-{ $subsection (simd-vmax) }
-{ $subsection (simd-vsqrt) }
-{ $subsection (simd-sum) }
-{ $subsection (simd-broadcast) }
-{ $subsection (simd-gather-2) }
-{ $subsection (simd-gather-4) }
+"It is best to avoid calling SIMD primitives directly. To write efficient high-level code that compiles down to primitives and avoids memory allocation, see " { $link "math.vectors.simd.efficiency" } "."
+$nl
 "There are two primitives which are used to implement accessing SIMD vector fields of " { $link "classes.struct" } ":"
 { $subsection alien-vector }
 { $subsection set-alien-vector }
@@ -185,12 +171,18 @@ ARTICLE: "math.vectors.simd.alien" "SIMD data in struct classes"
 $nl
 "Only SIMD struct fields are allowed at the moment; passing SIMD data as function parameters is not yet supported." ;
 
+ARTICLE: "math.vectors.simd.accuracy" "Numerical accuracy of SIMD primitives"
+"No guarantees are made that " { $vocab-link "math.vectors.simd" } " words will give identical results on different SSE versions, or between the hardware intrinsics and the software fallbacks."
+$nl
+"In particular, horizontal operations on " { $snippet "float-4" } " and " { $snippet "float-8" } " are affected by this. They are computed with lower precision in intrinsics than the software fallback. Horizontal opeartions include anything involving adding together the components of a vector, such as " { $link sum } " or " { $link normalize } "." ;
+
 ARTICLE: "math.vectors.simd" "Hardware vector arithmetic (SIMD)"
 "The " { $vocab-link "math.vectors.simd" } " vocabulary extends the " { $vocab-link "math.vectors" } " vocabulary to support efficient vector arithmetic on small, fixed-size vectors."
 { $subsection "math.vectors.simd.intro" }
 { $subsection "math.vectors.simd.types" }
 { $subsection "math.vectors.simd.words" }
 { $subsection "math.vectors.simd.support" }
+{ $subsection "math.vectors.simd.accuracy" }
 { $subsection "math.vectors.simd.efficiency" }
 { $subsection "math.vectors.simd.alien" }
 { $subsection "math.vectors.simd.intrinsics" } ;
diff --git a/basis/math/vectors/simd/simd-tests.factor b/basis/math/vectors/simd/simd-tests.factor
index 39afe3cb03..007ec338cd 100644
--- a/basis/math/vectors/simd/simd-tests.factor
+++ b/basis/math/vectors/simd/simd-tests.factor
@@ -74,17 +74,18 @@ CONSTANT: simd-classes
 : boa-ctors ( -- seq )
     simd-classes [ [ name>> "-boa" append ] [ vocabulary>> ] bi lookup ] map ;
 
-: check-optimizer ( seq inputs quot -- )
-    [
+: check-optimizer ( seq inputs quot eq-quot -- )
+    '[
+        @
         [ "print-mr" get [ nip test-mr mr. ] [ 2drop ] if ]
         [ [ call ] dip call ]
-        [ [ call ] dip compile-call ] 2tri = not
-    ] compose filter ; inline
+        [ [ call ] dip compile-call ] 2tri @ not
+    ] filter ; inline
 
 "== Checking -new constructors" print
 
 [ { } ] [
-    simd-classes [ [ [ ] ] dip '[ _ new ] ] check-optimizer
+    simd-classes [ [ [ ] ] dip '[ _ new ] ] [ = ] check-optimizer
 ] unit-test
 
 [ { } ] [
@@ -96,7 +97,7 @@ CONSTANT: simd-classes
 [ { } ] [
     with-ctors [
         [ 1000 random '[ _ ] ] dip '[ { fixnum } declare _ execute ]
-    ] check-optimizer
+    ] [ = ] check-optimizer
 ] unit-test
 
 "== Checking -boa constructors" print
@@ -107,7 +108,7 @@ CONSTANT: simd-classes
         [ nip [ 1000 random ] [ ] replicate-as ]
         [ fixnum <array> swap '[ _ declare _ execute ] ]
         2bi
-    ] check-optimizer
+    ] [ = ] check-optimizer
 ] unit-test
 
 "== Checking vector operations" print
@@ -133,24 +134,44 @@ CONSTANT: simd-classes
     ] bi
     word '[ _ declare _ execute ] ;
 
-: ops-to-check ( elt-class -- alist )
-    [ vector-words >alist ] dip float = [
-        [ drop { n/v v/n v/ normalize } member? not ] assoc-filter
-    ] unless ;
+: remove-float-words ( alist -- alist' )
+    [ drop { n/v v/n v/ normalize } member? not ] assoc-filter ;
 
-: check-vector-ops ( class elt-class -- )
-    [ nip ops-to-check ] 2keep
-    '[ first2 inputs _ _ check-vector-op ] check-optimizer ; inline
+: ops-to-check ( elt-class -- alist )
+    [ vector-words >alist ] dip
+    float = [ remove-float-words ] unless ;
+
+: check-vector-ops ( class elt-class compare-quot -- )
+    [
+        [ nip ops-to-check ] 2keep
+        '[ first2 inputs _ _ check-vector-op ]
+    ] dip check-optimizer ; inline
+
+: approx= ( x y -- ? )
+    {
+        { [ 2dup [ float? ] both? ] [ -1.e8 ~ ] }
+        { [ 2dup [ sequence? ] both? ] [
+            [
+                {
+                    { [ 2dup [ fp-nan? ] both? ] [ 2drop t ] }
+                    { [ 2dup [ fp-nan? ] either? not ] [ -1.e8 ~ ] }
+                } cond
+            ] 2all?
+        ] }
+    } cond ;
 
 : simd-classes&reps ( -- alist )
     simd-classes [
-        dup name>> [ "float" head? ] [ "double" head? ] bi or
-        float fixnum ?
-    ] { } map>assoc ;
+        {
+            { [ dup name>> "float" head? ] [ float [ approx= ] ] }
+            { [ dup name>> "double" tail? ] [ float [ = ] ] }
+            [ fixnum [ = ] ]
+        } cond 3array
+    ] map ;
 
 simd-classes&reps [
-    [ [ { } ] ] 2dip '[ _ _ check-vector-ops ] unit-test
-] assoc-each
+    [ [ { } ] ] dip first3 '[ _ _ _ check-vector-ops ] unit-test
+] each
 
 ! Other regressions
 [ 8000000 ] [
@@ -158,7 +179,6 @@ simd-classes&reps [
     [ { int-8 } declare dup [ * ] [ + ] 2map-reduce ] compile-call
 ] unit-test
 
-
 ! Vector alien intrinsics
 [ float-4{ 1 2 3 4 } ] [
     [

From 43fa252af50997095d1a1ee492fc2fc92d330ada Mon Sep 17 00:00:00 2001
From: Slava Pestov <slava@slava-pestovs-macbook-pro.local>
Date: Wed, 23 Sep 2009 02:46:54 -0500
Subject: [PATCH 09/15] math.vectors.simd: new operations: vabs vsqrt vbitand
 vbitor vbitxor

---
 .../cfg/instructions/instructions.factor      |  26 ++++-
 .../compiler/cfg/intrinsics/intrinsics.factor |   6 +-
 .../cfg/two-operand/two-operand.factor        |   5 +-
 basis/compiler/codegen/codegen.factor         |   4 +
 .../tree/propagation/simd/simd.factor         |   5 +
 basis/cpu/architecture/architecture.factor    |   8 ++
 basis/cpu/x86/x86.factor                      | 100 +++++++++++++++---
 .../math/vectors/simd/functor/functor.factor  |  81 ++++++--------
 .../vectors/simd/intrinsics/intrinsics.factor |  54 +++++++---
 basis/math/vectors/simd/simd-tests.factor     |   4 +-
 .../specialization/specialization.factor      |   5 +
 basis/math/vectors/vectors-docs.factor        |   9 +-
 basis/math/vectors/vectors.factor             |  27 ++++-
 13 files changed, 252 insertions(+), 82 deletions(-)

diff --git a/basis/compiler/cfg/instructions/instructions.factor b/basis/compiler/cfg/instructions/instructions.factor
index 87e981f362..874093ed40 100644
--- a/basis/compiler/cfg/instructions/instructions.factor
+++ b/basis/compiler/cfg/instructions/instructions.factor
@@ -350,14 +350,34 @@ def: dst
 use: src1 src2
 literal: rep ;
 
+PURE-INSN: ##horizontal-add-vector
+def: dst/scalar-rep
+use: src
+literal: rep ;
+
+PURE-INSN: ##abs-vector
+def: dst
+use: src
+literal: rep ;
+
 PURE-INSN: ##sqrt-vector
 def: dst
 use: src
 literal: rep ;
 
-PURE-INSN: ##horizontal-add-vector
-def: dst/scalar-rep
-use: src
+PURE-INSN: ##and-vector
+def: dst
+use: src1 src2
+literal: rep ;
+
+PURE-INSN: ##or-vector
+def: dst
+use: src1 src2
+literal: rep ;
+
+PURE-INSN: ##xor-vector
+def: dst
+use: src1 src2
 literal: rep ;
 
 ! Boxing and unboxing aliens
diff --git a/basis/compiler/cfg/intrinsics/intrinsics.factor b/basis/compiler/cfg/intrinsics/intrinsics.factor
index b9835827fa..d2f158f06d 100644
--- a/basis/compiler/cfg/intrinsics/intrinsics.factor
+++ b/basis/compiler/cfg/intrinsics/intrinsics.factor
@@ -164,7 +164,11 @@ IN: compiler.cfg.intrinsics
         { math.vectors.simd.intrinsics:(simd-v/) [ [ ^^div-vector ] emit-binary-vector-op ] }
         { math.vectors.simd.intrinsics:(simd-vmin) [ [ ^^min-vector ] emit-binary-vector-op ] }
         { math.vectors.simd.intrinsics:(simd-vmax) [ [ ^^max-vector ] emit-binary-vector-op ] }
-        { math.vectors.simd.intrinsics:(simd-vsqrt) [ [ ^^sqrt-vector ] emit-binary-vector-op ] }
+        { math.vectors.simd.intrinsics:(simd-vabs) [ [ ^^abs-vector ] emit-unary-vector-op ] }
+        { math.vectors.simd.intrinsics:(simd-vsqrt) [ [ ^^sqrt-vector ] emit-unary-vector-op ] }
+        { math.vectors.simd.intrinsics:(simd-vbitand) [ [ ^^and-vector ] emit-binary-vector-op ] }
+        { math.vectors.simd.intrinsics:(simd-vbitor) [ [ ^^or-vector ] emit-binary-vector-op ] }
+        { math.vectors.simd.intrinsics:(simd-vbitxor) [ [ ^^xor-vector ] emit-binary-vector-op ] }
         { math.vectors.simd.intrinsics:(simd-broadcast) [ [ ^^broadcast-vector ] emit-unary-vector-op ] }
         { math.vectors.simd.intrinsics:(simd-gather-2) [ emit-gather-vector-2 ] }
         { math.vectors.simd.intrinsics:(simd-gather-4) [ emit-gather-vector-4 ] }
diff --git a/basis/compiler/cfg/two-operand/two-operand.factor b/basis/compiler/cfg/two-operand/two-operand.factor
index 1186e6b41f..45d248f8f4 100644
--- a/basis/compiler/cfg/two-operand/two-operand.factor
+++ b/basis/compiler/cfg/two-operand/two-operand.factor
@@ -55,7 +55,10 @@ UNION: two-operand-insn
     ##saturated-mul-vector
     ##div-vector
     ##min-vector
-    ##max-vector ;
+    ##max-vector
+    ##and-vector
+    ##or-vector
+    ##xor-vector ;
 
 GENERIC: convert-two-operand* ( insn -- )
 
diff --git a/basis/compiler/codegen/codegen.factor b/basis/compiler/codegen/codegen.factor
index e7a2548eff..43d11b5d4f 100755
--- a/basis/compiler/codegen/codegen.factor
+++ b/basis/compiler/codegen/codegen.factor
@@ -180,6 +180,10 @@ CODEGEN: ##min-vector %min-vector
 CODEGEN: ##max-vector %max-vector
 CODEGEN: ##sqrt-vector %sqrt-vector
 CODEGEN: ##horizontal-add-vector %horizontal-add-vector
+CODEGEN: ##abs-vector %abs-vector
+CODEGEN: ##and-vector %and-vector
+CODEGEN: ##or-vector %or-vector
+CODEGEN: ##xor-vector %xor-vector
 CODEGEN: ##box-alien %box-alien
 CODEGEN: ##box-displaced-alien %box-displaced-alien
 CODEGEN: ##unbox-alien %unbox-alien
diff --git a/basis/compiler/tree/propagation/simd/simd.factor b/basis/compiler/tree/propagation/simd/simd.factor
index db39985c94..fadb382398 100644
--- a/basis/compiler/tree/propagation/simd/simd.factor
+++ b/basis/compiler/tree/propagation/simd/simd.factor
@@ -13,7 +13,12 @@ IN: compiler.tree.propagation.simd
     (simd-v/)
     (simd-vmin)
     (simd-vmax)
+    (simd-sum)
+    (simd-vabs)
     (simd-vsqrt)
+    (simd-vbitand)
+    (simd-vbitor)
+    (simd-vbitxor)
     (simd-broadcast)
     (simd-gather-2)
     (simd-gather-4)
diff --git a/basis/cpu/architecture/architecture.factor b/basis/cpu/architecture/architecture.factor
index 6bc78836cd..2dbe724f0a 100644
--- a/basis/cpu/architecture/architecture.factor
+++ b/basis/cpu/architecture/architecture.factor
@@ -192,6 +192,10 @@ HOOK: %min-vector cpu ( dst src1 src2 rep -- )
 HOOK: %max-vector cpu ( dst src1 src2 rep -- )
 HOOK: %sqrt-vector cpu ( dst src rep -- )
 HOOK: %horizontal-add-vector cpu ( dst src rep -- )
+HOOK: %abs-vector cpu ( dst src rep -- )
+HOOK: %and-vector cpu ( dst src1 src2 rep -- )
+HOOK: %or-vector cpu ( dst src1 src2 rep -- )
+HOOK: %xor-vector cpu ( dst src1 src2 rep -- )
 
 HOOK: %broadcast-vector-reps cpu ( -- reps )
 HOOK: %gather-vector-2-reps cpu ( -- reps )
@@ -208,6 +212,10 @@ HOOK: %min-vector-reps cpu ( -- reps )
 HOOK: %max-vector-reps cpu ( -- reps )
 HOOK: %sqrt-vector-reps cpu ( -- reps )
 HOOK: %horizontal-add-vector-reps cpu ( -- reps )
+HOOK: %abs-vector-reps cpu ( -- reps )
+HOOK: %and-vector-reps cpu ( -- reps )
+HOOK: %or-vector-reps cpu ( -- reps )
+HOOK: %xor-vector-reps cpu ( -- reps )
 
 HOOK: %unbox-alien cpu ( dst src -- )
 HOOK: %unbox-any-c-ptr cpu ( dst src temp -- )
diff --git a/basis/cpu/x86/x86.factor b/basis/cpu/x86/x86.factor
index 4850403908..1a96e93c63 100644
--- a/basis/cpu/x86/x86.factor
+++ b/basis/cpu/x86/x86.factor
@@ -410,30 +410,63 @@ M: x86 %div-vector-reps
 
 M: x86 %min-vector ( dst src1 src2 rep -- )
     {
-        { float-4-rep [ MINPS ] }
-        { double-2-rep [ MINPD ] }
+        { char-16-rep [ PMINSB ] }
         { uchar-16-rep [ PMINUB ] }
         { short-8-rep [ PMINSW ] }
+        { ushort-8-rep [ PMINUW ] }
+        { int-4-rep [ PMINSD ] }
+        { uint-4-rep [ PMINUD ] }
+        { float-4-rep [ MINPS ] }
+        { double-2-rep [ MINPD ] }
     } case drop ;
 
 M: x86 %min-vector-reps
     {
         { sse? { float-4-rep } }
-        { sse2? { double-2-rep short-8-rep uchar-16-rep } }
+        { sse2? { uchar-16-rep short-8-rep double-2-rep short-8-rep uchar-16-rep } }
+        { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } }
     } available-reps ;
 
 M: x86 %max-vector ( dst src1 src2 rep -- )
     {
-        { float-4-rep [ MAXPS ] }
-        { double-2-rep [ MAXPD ] }
+        { char-16-rep [ PMAXSB ] }
         { uchar-16-rep [ PMAXUB ] }
         { short-8-rep [ PMAXSW ] }
+        { ushort-8-rep [ PMAXUW ] }
+        { int-4-rep [ PMAXSD ] }
+        { uint-4-rep [ PMAXUD ] }
+        { float-4-rep [ MAXPS ] }
+        { double-2-rep [ MAXPD ] }
     } case drop ;
 
 M: x86 %max-vector-reps
     {
         { sse? { float-4-rep } }
-        { sse2? { double-2-rep short-8-rep uchar-16-rep } }
+        { sse2? { uchar-16-rep short-8-rep double-2-rep short-8-rep uchar-16-rep } }
+        { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } }
+    } available-reps ;
+
+M: x86 %horizontal-add-vector ( dst src rep -- )
+    {
+        { float-4-rep [ [ MOVAPS ] [ HADDPS ] [ HADDPS ] 2tri ] }
+        { double-2-rep [ [ MOVAPD ] [ HADDPD ] 2bi ] }
+    } case ;
+
+M: x86 %horizontal-add-vector-reps
+    {
+        { sse3? { float-4-rep double-2-rep } }
+    } available-reps ;
+
+M: x86 %abs-vector ( dst src rep -- )
+    {
+        { char-16-rep [ PABSB ] }
+        { short-8-rep [ PABSW ] }
+        { int-4-rep [ PABSD ] }
+    } case ;
+
+M: x86 %abs-vector-reps
+    {
+        { ssse3? { char-16-rep short-8-rep int-4-rep } }
     } available-reps ;
 
 M: x86 %sqrt-vector ( dst src rep -- )
@@ -448,15 +481,58 @@ M: x86 %sqrt-vector-reps
         { sse2? { double-2-rep } }
     } available-reps ;
 
-M: x86 %horizontal-add-vector ( dst src rep -- )
+M: x86 %and-vector ( dst src1 src2 rep -- )
     {
-        { float-4-rep [ [ MOVAPS ] [ HADDPS ] [ HADDPS ] 2tri ] }
-        { double-2-rep [ [ MOVAPD ] [ HADDPD ] 2bi ] }
-    } case ;
+        { float-4-rep [ ANDPS ] }
+        { double-2-rep [ ANDPD ] }
+        { char-16-rep [ PAND ] }
+        { uchar-16-rep [ PAND ] }
+        { short-8-rep [ PAND ] }
+        { ushort-8-rep [ PAND ] }
+        { int-4-rep [ PAND ] }
+        { uint-4-rep [ PAND ] }
+    } case drop ;
 
-M: x86 %horizontal-add-vector-reps
+M: x86 %and-vector-reps
     {
-        { sse3? { float-4-rep double-2-rep } }
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
+    } available-reps ;
+
+M: x86 %or-vector ( dst src1 src2 rep -- )
+    {
+        { float-4-rep [ ORPS ] }
+        { double-2-rep [ ORPD ] }
+        { char-16-rep [ POR ] }
+        { uchar-16-rep [ POR ] }
+        { short-8-rep [ POR ] }
+        { ushort-8-rep [ POR ] }
+        { int-4-rep [ POR ] }
+        { uint-4-rep [ POR ] }
+    } case drop ;
+
+M: x86 %or-vector-reps
+    {
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
+    } available-reps ;
+
+M: x86 %xor-vector ( dst src1 src2 rep -- )
+    {
+        { float-4-rep [ XORPS ] }
+        { double-2-rep [ XORPD ] }
+        { char-16-rep [ PXOR ] }
+        { uchar-16-rep [ PXOR ] }
+        { short-8-rep [ PXOR ] }
+        { ushort-8-rep [ PXOR ] }
+        { int-4-rep [ PXOR ] }
+        { uint-4-rep [ PXOR ] }
+    } case drop ;
+
+M: x86 %xor-vector-reps
+    {
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
     } available-reps ;
 
 M: x86 %unbox-alien ( dst src -- )
diff --git a/basis/math/vectors/simd/functor/functor.factor b/basis/math/vectors/simd/functor/functor.factor
index 7d84b18225..e934a641c4 100644
--- a/basis/math/vectors/simd/functor/functor.factor
+++ b/basis/math/vectors/simd/functor/functor.factor
@@ -4,7 +4,8 @@ USING: accessors alien.c-types assocs byte-arrays classes
 effects fry functors generalizations kernel literals locals
 math math.functions math.vectors math.vectors.simd.intrinsics
 math.vectors.specialization parser prettyprint.custom sequences
-sequences.private strings words definitions macros cpu.architecture ;
+sequences.private strings words definitions macros cpu.architecture
+namespaces arrays quotations ;
 QUALIFIED-WITH: math m
 IN: math.vectors.simd.functor
 
@@ -38,24 +39,19 @@ MACRO: simd-boa ( rep class -- simd-array )
     { "simd-vector" } <effect> ;
 
 : supported-simd-ops ( assoc rep -- assoc' )
-    [
-        {
-            { v+ (simd-v+) }
-            { vs+ (simd-vs+) }
-            { v+- (simd-v+-) }
-            { v- (simd-v-) }
-            { vs- (simd-vs-) }
-            { v* (simd-v*) }
-            { vs* (simd-vs*) }
-            { v/ (simd-v/) }
-            { vmin (simd-vmin) }
-            { vmax (simd-vmax) }
-            { sum (simd-sum) }
-        }
-    ] dip 
+    [ simd-ops get ] dip 
     '[ nip _ swap supported-simd-op? ] assoc-filter
     '[ drop _ key? ] assoc-filter ;
 
+ERROR: bad-schema schema ;
+
+: low-level-ops ( box-quot: ( inputs... simd-op -- outputs... ) -- alist )
+    [ simd-ops get ] dip '[
+        1quotation
+        over word-schema _ ?at [ bad-schema ] unless
+        [ ] 2sequence
+    ] assoc-map ;
+
 :: high-level-ops ( ctor elt-class -- assoc )
     ! Some SIMD operations are defined in terms of others.
     {
@@ -82,11 +78,17 @@ MACRO: simd-boa ( rep class -- simd-array )
         } append
     ] when ;
 
-:: simd-vector-words ( class ctor rep assoc -- )
+:: simd-vector-words ( class ctor rep vv->v v->v v->n -- )
     rep rep-component-type c-type-boxed-class :> elt-class
     class
     elt-class
-    assoc rep supported-simd-ops
+    {
+        { { +vector+ +vector+ -> +vector+ } vv->v }
+        { { +vector+ -> +vector+ } v->v }
+        { { +vector+ -> +scalar+ } v->n }
+        { { +vector+ -> +nonnegative+ } v->n }
+    } low-level-ops
+    rep supported-simd-ops
     ctor elt-class high-level-ops assoc-union
     specialize-vector-words ;
 
@@ -116,6 +118,7 @@ SET-NTH      [ T dup c-setter array-accessor ]
 
 A-rep        [ A name>> "-rep" append "cpu.architecture" lookup ]
 A-vv->v-op   DEFINES-PRIVATE ${A}-vv->v-op
+A-v->v-op    DEFINES-PRIVATE ${A}-v->v-op
 A-v->n-op    DEFINES-PRIVATE ${A}-v->n-op
 
 WHERE
@@ -172,23 +175,13 @@ INSTANCE: A sequence
 : A-vv->v-op ( v1 v2 quot -- v3 )
     [ [ underlying>> ] bi@ A-rep ] dip call \ A boa ; inline
 
+: A-v->v-op ( v1 quot -- v2 )
+    [ underlying>> A-rep ] dip call \ A boa ; inline
+
 : A-v->n-op ( v quot -- n )
     [ underlying>> A-rep ] dip call ; inline
 
-\ A \ A-with \ A-rep H{
-    { v+ [ [ (simd-v+) ] \ A-vv->v-op execute ] }
-    { vs+ [ [ (simd-vs+) ] \ A-vv->v-op execute ] }
-    { v+- [ [ (simd-v+-) ] \ A-vv->v-op execute ] }
-    { v- [ [ (simd-v-) ] \ A-vv->v-op execute ] }
-    { vs- [ [ (simd-vs-) ] \ A-vv->v-op execute ] }
-    { v* [ [ (simd-v*) ] \ A-vv->v-op execute ] }
-    { vs* [ [ (simd-vs*) ] \ A-vv->v-op execute ] }
-    { v/ [ [ (simd-v/) ] \ A-vv->v-op execute ] }
-    { vmin [ [ (simd-vmin) ] \ A-vv->v-op execute ] }
-    { vmax [ [ (simd-vmax) ] \ A-vv->v-op execute ] }
-    { sum [ [ (simd-sum) ] \ A-v->n-op execute ] }
-} simd-vector-words
-
+\ A \ A-with \ A-rep \ A-vv->v-op \ A-v->v-op \ A-v->n-op simd-vector-words
 \ A \ A-rep define-simd-128-type
 
 PRIVATE>
@@ -237,6 +230,7 @@ A-deref      DEFINES-PRIVATE ${A}-deref
 
 A-rep        [ A/2 name>> "-rep" append "cpu.architecture" lookup ]
 A-vv->v-op   DEFINES-PRIVATE ${A}-vv->v-op
+A-v->v-op    DEFINES-PRIVATE ${A}-v->v-op
 A-v->n-op    DEFINES-PRIVATE ${A}-v->n-op
 
 WHERE
@@ -302,24 +296,15 @@ INSTANCE: A sequence
     [ [ [ underlying2>> ] bi@ A-rep ] dip call ] 3bi
     \ A boa ; inline
 
-: A-v->n-op ( v1 combine-quot reduce-quot -- v2 )
-    [ [ [ underlying1>> ] [ underlying2>> ] bi A-rep ] dip call A-rep ]
-    dip call ; inline
+: A-v->v-op ( v1 combine-quot -- v2 )
+    [ [ underlying1>> A-rep ] dip call ]
+    [ [ underlying2>> A-rep ] dip call ] 2bi
+    \ A boa ; inline
 
-\ A \ A-with \ A-rep H{
-    { v+ [ [ (simd-v+) ] \ A-vv->v-op execute ] }
-    { vs+ [ [ (simd-vs+) ] \ A-vv->v-op execute ] }
-    { v- [ [ (simd-v-) ] \ A-vv->v-op execute ] }
-    { vs- [ [ (simd-vs-) ] \ A-vv->v-op execute ] }
-    { v+- [ [ (simd-v+-) ] \ A-vv->v-op execute ] }
-    { v* [ [ (simd-v*) ] \ A-vv->v-op execute ] }
-    { vs* [ [ (simd-vs*) ] \ A-vv->v-op execute ] }
-    { v/ [ [ (simd-v/) ] \ A-vv->v-op execute ] }
-    { vmin [ [ (simd-vmin) ] \ A-vv->v-op execute ] }
-    { vmax [ [ (simd-vmax) ] \ A-vv->v-op execute ] }
-    { sum [ [ (simd-v+) ] [ (simd-sum) ] \ A-v->n-op execute ] }
-} simd-vector-words
+: A-v->n-op ( v1 combine-quot -- v2 )
+    [ [ underlying1>> ] [ underlying2>> ] bi A-rep (simd-v+) A-rep ] dip call ; inline
 
+\ A \ A-with \ A-rep \ A-vv->v-op \ A-v->v-op \ A-v->n-op simd-vector-words
 \ A \ A-rep define-simd-256-type
 
 ;FUNCTOR
diff --git a/basis/math/vectors/simd/intrinsics/intrinsics.factor b/basis/math/vectors/simd/intrinsics/intrinsics.factor
index 63214f7da6..2c1f76cfe1 100644
--- a/basis/math/vectors/simd/intrinsics/intrinsics.factor
+++ b/basis/math/vectors/simd/intrinsics/intrinsics.factor
@@ -2,23 +2,47 @@
 ! See http://factorcode.org/license.txt for BSD license.
 USING: alien alien.c-types alien.data assocs combinators
 cpu.architecture fry generalizations kernel libc macros math
-sequences ;
+sequences effects accessors namespaces lexer parser vocabs.parser
+words arrays math.vectors ;
 IN: math.vectors.simd.intrinsics
 
 ERROR: bad-simd-call ;
 
-: (simd-v+) ( v1 v2 rep -- v3 ) bad-simd-call ;
-: (simd-v-) ( v1 v2 rep -- v3 ) bad-simd-call ;
-: (simd-v+-) ( v1 v2 rep -- v3 ) bad-simd-call ;
-: (simd-vs+) ( v1 v2 rep -- v3 ) bad-simd-call ;
-: (simd-vs-) ( v1 v2 rep -- v3 ) bad-simd-call ;
-: (simd-vs*) ( v1 v2 rep -- v3 ) bad-simd-call ;
-: (simd-v*) ( v1 v2 rep -- v3 ) bad-simd-call ;
-: (simd-v/) ( v1 v2 rep -- v3 ) bad-simd-call ;
-: (simd-vmin) ( v1 v2 rep -- v3 ) bad-simd-call ;
-: (simd-vmax) ( v1 v2 rep -- v3 ) bad-simd-call ;
-: (simd-vsqrt) ( v1 v2 rep -- v3 ) bad-simd-call ;
-: (simd-sum) ( v1 rep -- v2 ) bad-simd-call ;
+<<
+
+: simd-effect ( word -- effect )
+    stack-effect [ in>> "rep" suffix ] [ out>> ] bi <effect> ;
+
+SYMBOL: simd-ops
+
+V{ } clone simd-ops set-global
+
+SYNTAX: SIMD-OP:
+    scan-word dup name>> "(simd-" ")" surround create-in
+    [ nip [ bad-simd-call ] define ]
+    [ [ simd-effect ] dip set-stack-effect ]
+    [ 2array simd-ops get push ]
+    2tri ;
+
+>>
+
+SIMD-OP: v+
+SIMD-OP: v-
+SIMD-OP: v+-
+SIMD-OP: vs+
+SIMD-OP: vs-
+SIMD-OP: vs*
+SIMD-OP: v*
+SIMD-OP: v/
+SIMD-OP: vmin
+SIMD-OP: vmax
+SIMD-OP: vsqrt
+SIMD-OP: sum
+SIMD-OP: vabs
+SIMD-OP: vbitand
+SIMD-OP: vbitor
+SIMD-OP: vbitxor
+
 : (simd-broadcast) ( x rep -- v ) bad-simd-call ;
 : (simd-gather-2) ( a b rep -- v ) bad-simd-call ;
 : (simd-gather-4) ( a b c d rep -- v ) bad-simd-call ;
@@ -82,6 +106,10 @@ M: vector-rep supported-simd-op?
         { \ (simd-vmax)      [ %max-vector-reps            ] }
         { \ (simd-vsqrt)     [ %sqrt-vector-reps           ] }
         { \ (simd-sum)       [ %horizontal-add-vector-reps ] }
+        { \ (simd-vabs)      [ %abs-vector-reps            ] }
+        { \ (simd-vbitand)   [ %and-vector-reps            ] }
+        { \ (simd-vbitor)    [ %or-vector-reps             ] }
+        { \ (simd-vbitxor)   [ %xor-vector-reps            ] }
         { \ (simd-broadcast) [ %broadcast-vector-reps      ] }
         { \ (simd-gather-2)  [ %gather-vector-2-reps       ] }
         { \ (simd-gather-4)  [ %gather-vector-4-reps       ] }
diff --git a/basis/math/vectors/simd/simd-tests.factor b/basis/math/vectors/simd/simd-tests.factor
index 007ec338cd..db8597fc9d 100644
--- a/basis/math/vectors/simd/simd-tests.factor
+++ b/basis/math/vectors/simd/simd-tests.factor
@@ -135,7 +135,7 @@ CONSTANT: simd-classes
     word '[ _ declare _ execute ] ;
 
 : remove-float-words ( alist -- alist' )
-    [ drop { n/v v/n v/ normalize } member? not ] assoc-filter ;
+    [ drop { vsqrt n/v v/n v/ normalize } member? not ] assoc-filter ;
 
 : ops-to-check ( elt-class -- alist )
     [ vector-words >alist ] dip
@@ -242,3 +242,5 @@ STRUCT: simd-struct
         { [ x>> ] [ y>> ] [ z>> ] [ w>> ] } cleave
     ] compile-call
 ] unit-test
+
+[ ] [ char-16 new 1array stack. ] unit-test
diff --git a/basis/math/vectors/specialization/specialization.factor b/basis/math/vectors/specialization/specialization.factor
index 2fb36d428a..bf2dac29d6 100644
--- a/basis/math/vectors/specialization/specialization.factor
+++ b/basis/math/vectors/specialization/specialization.factor
@@ -72,6 +72,11 @@ H{
     { vneg { +vector+ -> +vector+ } }
     { vtruncate { +vector+ -> +vector+ } }
     { sum { +vector+ -> +scalar+ } }
+    { vabs { +vector+ -> +vector+ } }
+    { vsqrt { +vector+ -> +vector+ } }
+    { vbitand { +vector+ +vector+ -> +vector+ } }
+    { vbitor { +vector+ +vector+ -> +vector+ } }
+    { vbitxor { +vector+ +vector+ -> +vector+ } }
 }
 
 PREDICATE: vector-word < word vector-words key? ;
diff --git a/basis/math/vectors/vectors-docs.factor b/basis/math/vectors/vectors-docs.factor
index ce635b6d60..67ac27400e 100644
--- a/basis/math/vectors/vectors-docs.factor
+++ b/basis/math/vectors/vectors-docs.factor
@@ -30,7 +30,10 @@ $nl
 "Saturated arithmetic may be performed on " { $link "specialized-arrays" } "; the results are clamped to the minimum and maximum bounds of the array element type, instead of wrapping around:"
 { $subsection vs+ }
 { $subsection vs- }
-{ $subsection vs* } ;
+{ $subsection vs* }
+"Comparing vectors:"
+{ $subsection distance }
+{ $subsection v~ } ;
 
 ABOUT: "math-vectors"
 
@@ -144,6 +147,10 @@ HELP: normalize
 { $values { "u" "a sequence of numbers, not all zero" } { "v" "a sequence of numbers" } }
 { $description "Outputs a vector with the same direction as " { $snippet "u" } " but length 1." } ;
 
+HELP: distance
+{ $values { "u" "a sequence of numbers" } { "v" "a sequence of numbers" } { "x" "a non-negative real number" } }
+{ $description "Outputs the Euclidean distance between two vectors." } ;
+
 HELP: set-axis
 { $values { "u" "a sequence of numbers" } { "v" "a sequence of numbers" } { "axis" "a sequence of 0/1" } { "w" "a sequence of numbers" } }
 { $description "Using " { $snippet "w" } " as a template, creates a new sequence containing corresponding elements from " { $snippet "u" } " in place of 0, and corresponding elements from " { $snippet "v" } " in place of 1." }
diff --git a/basis/math/vectors/vectors.factor b/basis/math/vectors/vectors.factor
index 3a1b2875a9..eb97fba663 100644
--- a/basis/math/vectors/vectors.factor
+++ b/basis/math/vectors/vectors.factor
@@ -1,7 +1,8 @@
-! Copyright (C) 2005, 2008 Slava Pestov.
+! Copyright (C) 2005, 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: arrays alien.c-types kernel sequences math math.functions
-hints math.order fry ;
+hints math.order fry combinators ;
+QUALIFIED-WITH: alien.c-types c
 IN: math.vectors
 
 GENERIC: element-type ( obj -- c-type )
@@ -31,13 +32,35 @@ GENERIC: element-type ( obj -- c-type )
     [ [ not ] 2dip pick [ + ] [ - ] if ] 2map
     nip ;
 
+<PRIVATE
+
 : 2saturate-map ( u v quot -- w )
     pick element-type '[ @ _ c-type-clamp ] 2map ; inline
 
+PRIVATE>
+
 : vs+ ( u v -- w ) [ + ] 2saturate-map ;
 : vs- ( u v -- w ) [ - ] 2saturate-map ;
 : vs* ( u v -- w ) [ * ] 2saturate-map ;
 
+: vabs ( u -- v ) [ abs ] map ;
+: vsqrt ( u -- v ) [ sqrt ] map ;
+
+<PRIVATE
+
+: fp-bitwise-op ( x y seq quot -- z )
+    swap element-type {
+        { c:double [ [ [ double>bits ] bi@ ] dip call bits>double ] }
+        { c:float [ [ [ float>bits ] bi@ ] dip call bits>float ] }
+        [ drop call ]
+    } case ; inline
+
+PRIVATE>
+
+: vbitand ( u v -- w ) over '[ _ [ bitand ] fp-bitwise-op ] 2map ;
+: vbitor ( u v -- w ) over '[ _ [ bitor ] fp-bitwise-op ] 2map ;
+: vbitxor ( u v -- w ) over '[ _ [ bitxor ] fp-bitwise-op ] 2map ;
+
 : vfloor    ( v -- _v_ ) [ floor    ] map ;
 : vceiling  ( v -- ^v^ ) [ ceiling  ] map ;
 : vtruncate ( v -- -v- ) [ truncate ] map ;

From 7d39499d86dec087d0f3c5bd7e77159959b31529 Mon Sep 17 00:00:00 2001
From: Slava Pestov <slava@slava-pestovs-macbook-pro.local>
Date: Wed, 23 Sep 2009 02:47:56 -0500
Subject: [PATCH 10/15] compiler.cfg.ssa.cssa: correctly handle overflowing
 fixnum instructions

---
 basis/compiler/cfg/ssa/cssa/cssa.factor | 18 ++++++++++++++----
 basis/compiler/tests/codegen.factor     | 20 +++++++++++++++++++-
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/basis/compiler/cfg/ssa/cssa/cssa.factor b/basis/compiler/cfg/ssa/cssa/cssa.factor
index 14287e900f..d58cebac65 100644
--- a/basis/compiler/cfg/ssa/cssa/cssa.factor
+++ b/basis/compiler/cfg/ssa/cssa/cssa.factor
@@ -1,8 +1,9 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors assocs kernel locals fry
+USING: accessors assocs kernel locals fry sequences
 cpu.architecture
 compiler.cfg.rpo
+compiler.cfg.def-use
 compiler.cfg.utilities
 compiler.cfg.registers
 compiler.cfg.instructions
@@ -13,10 +14,19 @@ IN: compiler.cfg.ssa.cssa
 ! selection, so it must keep track of representations when introducing
 ! new values.
 
+: insert-copy? ( bb vreg -- ? )
+    ! If the last instruction defines a value (which means it is
+    ! ##fixnum-add, ##fixnum-sub or ##fixnum-mul) then we don't
+    ! need to insert a copy since in fact doing so will result
+    ! in incorrect code.
+    [ instructions>> last defs-vreg ] dip eq? not ;
+
 :: insert-copy ( bb src rep -- bb dst )
-    rep next-vreg-rep :> dst
-    bb [ dst src rep src rep-of emit-conversion ] add-instructions
-    bb dst ;
+    bb src insert-copy? [
+        rep next-vreg-rep :> dst
+        bb [ dst src rep src rep-of emit-conversion ] add-instructions
+        bb dst
+    ] [ bb src ] if ;
 
 : convert-phi ( ##phi -- )
     dup dst>> rep-of '[ [ _ insert-copy ] assoc-map ] change-inputs drop ;
diff --git a/basis/compiler/tests/codegen.factor b/basis/compiler/tests/codegen.factor
index 14ed2294c7..3dbde076a6 100644
--- a/basis/compiler/tests/codegen.factor
+++ b/basis/compiler/tests/codegen.factor
@@ -3,7 +3,7 @@ math hashtables.private math.private namespaces sequences tools.test
 namespaces.private slots.private sequences.private byte-arrays alien
 alien.accessors layouts words definitions compiler.units io
 combinators vectors grouping make alien.c-types combinators.short-circuit
-math.order math.libm math.parser ;
+math.order math.libm math.parser alien.c-types ;
 FROM: math => float ;
 QUALIFIED: namespaces.private
 IN: compiler.tests.codegen
@@ -431,3 +431,21 @@ cell 4 = [
         ] curry each-integer
     ] compile-call
 ] unit-test
+
+TUPLE: myseq { underlying1 byte-array read-only } { underlying2 byte-array read-only } ;
+
+[ 2 ] [
+    little-endian?
+    T{ myseq f B{ 1 0 0 0 } B{ 1 0 0 0 } }
+    T{ myseq f B{ 0 0 0 1 } B{ 0 0 0 1 } } ?
+    [
+        { myseq } declare
+        [ 0 2 ] dip dup
+        [
+            [
+                over 1 < [ underlying1>> ] [ [ 1 - ] dip underlying2>> ] if
+                swap 4 * >fixnum alien-signed-4
+            ] bi-curry@ bi * +
+        ] 2curry each-integer
+    ] compile-call
+] unit-test

From 3d4ce094d282272265e76a767d89fc659e8c260c Mon Sep 17 00:00:00 2001
From: Slava Pestov <slava@slava-pestovs-macbook-pro.local>
Date: Wed, 23 Sep 2009 02:48:23 -0500
Subject: [PATCH 11/15] prettyprint.backend: don't bomb if printing
 length-limited sequence which doesn't like being resized itself. Instead
 convert to array of elements first

---
 basis/prettyprint/backend/backend.factor | 1 +
 1 file changed, 1 insertion(+)

diff --git a/basis/prettyprint/backend/backend.factor b/basis/prettyprint/backend/backend.factor
index cba40bbff1..fb47c50fbe 100644
--- a/basis/prettyprint/backend/backend.factor
+++ b/basis/prettyprint/backend/backend.factor
@@ -173,6 +173,7 @@ M: tuple pprint*
     ] when ;
 
 : pprint-elements ( seq -- )
+    >array
     do-length-limit
     [ [ pprint* ] each ] dip
     [ "~" swap number>string " more~" 3append text ] when* ;

From e1efb189a44f131fba0809bc1dcd4e08928df7d6 Mon Sep 17 00:00:00 2001
From: Slava Pestov <slava@slava-pestovs-macbook-pro.local>
Date: Wed, 23 Sep 2009 03:08:45 -0500
Subject: [PATCH 12/15] math and math.vectors: improved documentation

---
 basis/math/vectors/vectors-docs.factor | 96 +++++++++++++++++++++++---
 basis/math/vectors/vectors.factor      | 10 +--
 core/math/math-docs.factor             | 11 ++-
 3 files changed, 98 insertions(+), 19 deletions(-)

diff --git a/basis/math/vectors/vectors-docs.factor b/basis/math/vectors/vectors-docs.factor
index 67ac27400e..3790e38d55 100644
--- a/basis/math/vectors/vectors-docs.factor
+++ b/basis/math/vectors/vectors-docs.factor
@@ -1,4 +1,4 @@
-USING: help.markup help.syntax math sequences ;
+USING: help.markup help.syntax math math.functions sequences ;
 IN: math.vectors
 
 ARTICLE: "math-vectors" "Vector arithmetic"
@@ -14,26 +14,46 @@ $nl
 { $subsection n+v }
 { $subsection v-n }
 { $subsection n-v }
-"Combining two vectors to form another vector with " { $link 2map } ":"
+"Vector unary operations:"
+{ $subsection vneg }
+{ $subsection vabs }
+{ $subsection vsqrt }
+{ $subsection vfloor }
+{ $subsection vceiling }
+{ $subsection vtruncate }
+"Vector/vector binary operations:"
 { $subsection v+ }
 { $subsection v- }
 { $subsection v+- }
 { $subsection v* }
 { $subsection v/ }
+"Saturated arithmetic (only on " { $link "specialized-arrays" } "):"
+{ $subsection vs+ }
+{ $subsection vs- }
+{ $subsection vs* }
+"Comparisons:"
 { $subsection vmax }
 { $subsection vmin }
+"Bitwise operations:"
+{ $subsection vbitand }
+{ $subsection vbitor }
+{ $subsection vbitxor }
 "Inner product and norm:"
 { $subsection v. }
 { $subsection norm }
 { $subsection norm-sq }
 { $subsection normalize }
-"Saturated arithmetic may be performed on " { $link "specialized-arrays" } "; the results are clamped to the minimum and maximum bounds of the array element type, instead of wrapping around:"
-{ $subsection vs+ }
-{ $subsection vs- }
-{ $subsection vs* }
 "Comparing vectors:"
 { $subsection distance }
-{ $subsection v~ } ;
+{ $subsection v~ }
+"Other functions:"
+{ $subsection vsupremum }
+{ $subsection vinfimum }
+{ $subsection trilerp }
+{ $subsection bilerp }
+{ $subsection vlerp }
+{ $subsection vnlerp }
+{ $subsection vbilerp } ;
 
 ABOUT: "math-vectors"
 
@@ -41,6 +61,43 @@ HELP: vneg
 { $values { "u" "a sequence of numbers" } { "v" "a sequence of numbers" } }
 { $description "Negates each element of " { $snippet "u" } "." } ;
 
+HELP: vabs
+{ $values { "u" "a sequence of numbers" } { "v" "a sequence of non-negative real numbers" } }
+{ $description "Takes the absolute value of each element of " { $snippet "u" } "." } ;
+
+HELP: vsqrt
+{ $values { "u" "a sequence of non-negative real numbers" } { "v" "a sequence of non-negative real numbers" } }
+{ $description "Takes the square root of each element of " { $snippet "u" } "." }
+{ $warning "For performance reasons, this does not work with negative inputs, unlike " { $link sqrt } "." } ;
+
+HELP: vfloor
+{ $values { "u" "a sequence of real numbers" } { "v" "a sequence of real numbers" } }
+{ $description "Takes the " { $link floor } " of each element of " { $snippet "u" } "." } ;
+
+HELP: vceiling
+{ $values { "u" "a sequence of real numbers" } { "v" "a sequence of real numbers" } }
+{ $description "Takes the " { $link ceiling } " of each element of " { $snippet "u" } "." } ;
+
+HELP: vtruncate
+{ $values { "u" "a sequence of real numbers" } { "v" "a sequence of real numbers" } }
+{ $description "Truncates each element of " { $snippet "u" } "." } ;
+
+HELP: n+v
+{ $values { "n" "a number" } { "u" "a sequence of numbers" } { "v" "a sequence of numbers" } }
+{ $description "Adds " { $snippet "n" } " to each element of " { $snippet "u" } "." } ;
+
+HELP: v+n
+{ $values { "u" "a sequence of numbers" } { "n" "a number" } { "v" "a sequence of numbers" } }
+{ $description "Adds " { $snippet "n" } " to each element of " { $snippet "u" } "." } ;
+
+HELP: n-v
+{ $values { "n" "a number" } { "u" "a sequence of numbers" } { "v" "a sequence of numbers" } }
+{ $description "Subtracts each element of " { $snippet "u" } " from " { $snippet "n" } "." } ;
+
+HELP: v-n
+{ $values { "u" "a sequence of numbers" } { "n" "a number" } { "v" "a sequence of numbers" } }
+{ $description "Subtracts " { $snippet "n" } " from each element of " { $snippet "u" } "." } ;
+
 HELP: n*v
 { $values { "n" "a number" } { "u" "a sequence of numbers" } { "v" "a sequence of numbers" } }
 { $description "Multiplies each element of " { $snippet "u" } " by " { $snippet "n" } "." } ;
@@ -51,11 +108,13 @@ HELP: v*n
 
 HELP: n/v
 { $values { "n" "a number" } { "u" "a sequence of numbers" } { "v" "a sequence of numbers" } }
-{ $description "Divides " { $snippet "n" } " by each element of " { $snippet "u" } "." } ;
+{ $description "Divides " { $snippet "n" } " by each element of " { $snippet "u" } "." }
+{ $errors "May throw an error if a division by zero occurs; see " { $link "division-by-zero" } "." } ;
 
 HELP: v/n
 { $values { "u" "a sequence of numbers" } { "n" "a number" } { "v" "a sequence of numbers" } }
-{ $description "Divides each element of " { $snippet "u" } " by " { $snippet "n" } "." } ;
+{ $description "Divides each element of " { $snippet "u" } " by " { $snippet "n" } "." }
+{ $errors "May throw an error if a division by zero occurs; see " { $link "division-by-zero" } "." } ;
 
 HELP: v+
 { $values { "u" "a sequence of numbers" } { "v" "a sequence of numbers" } { "w" "a sequence of numbers" } }
@@ -87,7 +146,7 @@ HELP: v*
 HELP: v/
 { $values { "u" "a sequence of numbers" } { "v" "a sequence of numbers" } { "w" "a sequence of numbers" } }
 { $description "Divides " { $snippet "u" } " by " { $snippet "v" } " component-wise." }
-{ $errors "Throws an error if an integer division by zero occurs." } ;
+{ $errors "May throw an error if a division by zero occurs; see " { $link "division-by-zero" } "." } ;
 
 HELP: vmax
 { $values { "u" "a sequence of real numbers" } { "v" "a sequence of real numbers" } { "w" "a sequence of real numbers" } }
@@ -104,7 +163,7 @@ HELP: v.
 { $description "Computes the real-valued dot product." }
 { $notes
     "This word can also take complex number sequences as input, however mathematically it will compute the wrong result. The complex-valued dot product is defined differently:"
-    { $snippet "0 [ conjugate * + ] 2reduce" }
+    { $code "0 [ conjugate * + ] 2reduce" }
 } ;
 
 HELP: vs+
@@ -135,6 +194,21 @@ HELP: vs*
 { $values { "u" "a sequence of numbers" } { "v" "a sequence of numbers" } { "w" "a sequence of numbers" } }
 { $description "Multiplies " { $snippet "u" } " and " { $snippet "v" } " component-wise with saturation." } ;
 
+HELP: vbitand
+{ $values { "u" "a sequence of real numbers" } { "v" "a sequence of real numbers" } { "w" "a sequence of real numbers" } }
+{ $description "Takes the bitwise and of " { $snippet "u" } " and " { $snippet "v" } " component-wise." }
+{ $notes "Unlike " { $link bitand } ", this word may be used on a specialized array of floats or doubles, in which case the bitwise representation of the floating point numbers is operated upon." } ;
+
+HELP: vbitor
+{ $values { "u" "a sequence of real numbers" } { "v" "a sequence of real numbers" } { "w" "a sequence of real numbers" } }
+{ $description "Takes the bitwise or of " { $snippet "u" } " and " { $snippet "v" } " component-wise." }
+{ $notes "Unlike " { $link bitor } ", this word may be used on a specialized array of floats or doubles, in which case the bitwise representation of the floating point numbers is operated upon." } ;
+
+HELP: vbitxor
+{ $values { "u" "a sequence of real numbers" } { "v" "a sequence of real numbers" } { "w" "a sequence of real numbers" } }
+{ $description "Takes the bitwise exclusive or of " { $snippet "u" } " and " { $snippet "v" } " component-wise." }
+{ $notes "Unlike " { $link bitxor } ", this word may be used on a specialized array of floats or doubles, in which case the bitwise representation of the floating point numbers is operated upon." } ;
+
 HELP: norm-sq
 { $values { "v" "a sequence of numbers" } { "x" "a non-negative real number" } }
 { $description "Computes the squared length of a mathematical vector." } ;
diff --git a/basis/math/vectors/vectors.factor b/basis/math/vectors/vectors.factor
index eb97fba663..4b6f67544a 100644
--- a/basis/math/vectors/vectors.factor
+++ b/basis/math/vectors/vectors.factor
@@ -1,7 +1,7 @@
 ! Copyright (C) 2005, 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: arrays alien.c-types kernel sequences math math.functions
-hints math.order fry combinators ;
+hints math.order math.libm fry combinators ;
 QUALIFIED-WITH: alien.c-types c
 IN: math.vectors
 
@@ -44,7 +44,7 @@ PRIVATE>
 : vs* ( u v -- w ) [ * ] 2saturate-map ;
 
 : vabs ( u -- v ) [ abs ] map ;
-: vsqrt ( u -- v ) [ sqrt ] map ;
+: vsqrt ( u -- v ) [ >float fsqrt ] map ;
 
 <PRIVATE
 
@@ -61,9 +61,9 @@ PRIVATE>
 : vbitor ( u v -- w ) over '[ _ [ bitor ] fp-bitwise-op ] 2map ;
 : vbitxor ( u v -- w ) over '[ _ [ bitxor ] fp-bitwise-op ] 2map ;
 
-: vfloor    ( v -- _v_ ) [ floor    ] map ;
-: vceiling  ( v -- ^v^ ) [ ceiling  ] map ;
-: vtruncate ( v -- -v- ) [ truncate ] map ;
+: vfloor    ( u -- v ) [ floor ] map ;
+: vceiling  ( u -- v ) [ ceiling ] map ;
+: vtruncate ( u -- v ) [ truncate ] map ;
 
 : vsupremum ( seq -- vmax ) [ ] [ vmax ] map-reduce ; 
 : vinfimum ( seq -- vmin ) [ ] [ vmin ] map-reduce ; 
diff --git a/core/math/math-docs.factor b/core/math/math-docs.factor
index e5de106bbb..e6805d693b 100644
--- a/core/math/math-docs.factor
+++ b/core/math/math-docs.factor
@@ -434,11 +434,15 @@ HELP: byte-array>bignum
 { $description "Converts a byte-array, interpreted as little-endian, into a bignum integer. User code should call " { $link le> } " or " { $link be> } " instead." } ;
 
 ARTICLE: "division-by-zero" "Division by zero"
-"Floating point division never raises an error if the denominator is zero. This means that if at least one of the two inputs to " { $link / } ", " { $link /f } " or " { $link mod } " is a float, the result will be a floating point infinity or not a number value."
+"Behavior of division operations when a denominator of zero is used depends on the data types in question, as well as the platform being used."
+$nl
+"Floating point division only throws an error if the appropriate traps are enabled in the floating point environment. If traps are disabled, a Not-a-number value or an infinity is output, depending on whether the numerator is zero or non-zero."
+$nl
+"Floating point traps are disabled by default and the " { $vocab-link "math.floats.env" } " vocabulary provides words to enable them. Floating point division is performed by " { $link / } ", " { $link /f } " or " { $link mod } " if at least one of the two inputs is a float. Floating point division is always performed by " { $link /f } "."
 $nl
 "The behavior of integer division is hardware specific. On x86 processors, " { $link /i } " and " { $link mod } " raise an error if both inputs are integers and the denominator is zero. On PowerPC, integer division by zero yields a result of zero."
 $nl
-"On the other hand, the " { $link / } " word, when given integer arguments, implements a much more expensive division algorithm which always yields an exact rational answer, and this word always tests for division by zero explicitly." ;
+"The " { $link / } " word, when given integer arguments, implements a much more expensive division algorithm which always yields an exact rational answer, and this word always tests for division by zero explicitly." ;
 
 ARTICLE: "number-protocol" "Number protocol"
 "Math operations obey certain numerical upgrade rules. If one of the inputs is a bignum and the other is a fixnum, the latter is first coerced to a bignum; if one of the inputs is a float, the other is coerced to a float."
@@ -459,7 +463,8 @@ $nl
 { $subsection > }
 { $subsection >= }
 "Numbers can be compared for equality using " { $link = } ", or a less precise test which disregards types:"
-{ $subsection number= } ;
+{ $subsection number= }
+{ $see-also "math.floats.compare" } ;
 
 ARTICLE: "modular-arithmetic" "Modular arithmetic"
 { $subsection mod }

From 43acea72b994317329e215c7ff798291625c20ed Mon Sep 17 00:00:00 2001
From: Slava Pestov <slava@slava-pestovs-macbook-pro.local>
Date: Wed, 23 Sep 2009 03:08:54 -0500
Subject: [PATCH 13/15] specialized-vectors: improved documentation

---
 basis/specialized-vectors/specialized-vectors-docs.factor | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/basis/specialized-vectors/specialized-vectors-docs.factor b/basis/specialized-vectors/specialized-vectors-docs.factor
index 9c575fe73a..c773356a64 100644
--- a/basis/specialized-vectors/specialized-vectors-docs.factor
+++ b/basis/specialized-vectors/specialized-vectors-docs.factor
@@ -16,8 +16,8 @@ ARTICLE: "specialized-vector-words" "Specialized vector words"
 }
 "Behind the scenes, these words are placed in a vocabulary named " { $snippet "specialized-vectors.instances.T" } ", however this vocabulary should not be placed in a " { $link POSTPONE: USING: } " form directly. Instead, always use " { $link POSTPONE: SPECIALIZED-VECTOR: } ". This ensures that the vocabulary can get generated the first time it is needed." ;
 
-ARTICLE: "specialized-vector-c" "Passing specialized arrays to C functions"
-"Each specialized array has a " { $slot "underlying" } " slot holding a specialized array, which in turn has an " { $slot "underlying" } " slot holding a " { $link byte-array } " with the raw data. Passing a specialized vector as a parameter to a C function call will automatically extract the underlying data. To get at the underlying data directly, call the " { $link >c-ptr } " word on a specialized vector." ;
+ARTICLE: "specialized-vector-c" "Passing specialized vectors to C functions"
+"Each specialized vector has a " { $slot "underlying" } " slot holding a specialized array, which in turn has an " { $slot "underlying" } " slot holding a " { $link byte-array } " with the raw data. Passing a specialized vector as a parameter to a C function call will automatically extract the underlying data. To get at the underlying data directly, call the " { $link >c-ptr } " word on a specialized vector." ;
 
 ARTICLE: "specialized-vectors" "Specialized vectors"
 "The " { $vocab-link "specialized-vectors" } " vocabulary implements resizable sequence types for storing machine values in a space-efficient manner without boxing."

From f017f8904b46643c3d6f83ad90b3d4a0e24a9c3e Mon Sep 17 00:00:00 2001
From: Slava Pestov <slava@slava-pestovs-macbook-pro.local>
Date: Wed, 23 Sep 2009 03:14:06 -0500
Subject: [PATCH 14/15] math.vectors.simd: Improved documentation

---
 basis/math/vectors/simd/simd-docs.factor | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/basis/math/vectors/simd/simd-docs.factor b/basis/math/vectors/simd/simd-docs.factor
index 6d212f8e2b..02ed7f5ff8 100644
--- a/basis/math/vectors/simd/simd-docs.factor
+++ b/basis/math/vectors/simd/simd-docs.factor
@@ -17,13 +17,17 @@ $nl
 "There should never be any reason to use " { $link "math.vectors.simd.intrinsics" } " directly, but they too have a straightforward, but lower-level, interface." ;
 
 ARTICLE: "math.vectors.simd.support" "Supported SIMD instruction sets and operations"
-"At present, the SIMD support makes use of SSE, SSE2 and a few SSE3 instructions on x86 CPUs."
+"At present, the SIMD support makes use of a subset of SSE up to SSE4.1. The subset used depends on the current CPU type."
 $nl
 "SSE1 only supports single-precision SIMD (" { $snippet "float-4" } " and " { $snippet "float-8" } ")."
 $nl
-"SSE2 introduces double-precision and integer SIMD."
+"SSE2 introduces double-precision SIMD (" { $snippet "double-2" } " and " { $snippet "double-4" } ") and integer SIMD (all types). Integer SIMD in missing a few features, in particular the " { $link vmin } " and " { $link vmax } " operations only work on " { $snippet "uchar-16" } " and " { $snippet "short-8" } "."
 $nl
-"SSE3 introduces horizontal adds (summing all components of a single vector register), which is useful for computing dot products. Where available, SSE3 operations are used to speed up " { $link sum } ", " { $link v. } ", " { $link norm-sq } ", " { $link norm } ", and " { $link distance } ". If SSE3 is not available, software fallbacks are used for " { $link sum } " and related words."
+"SSE3 introduces horizontal adds (summing all components of a single vector register), which is useful for computing dot products. Where available, SSE3 operations are used to speed up " { $link sum } ", " { $link v. } ", " { $link norm-sq } ", " { $link norm } ", and " { $link distance } "."
+$nl
+"SSSE3 introduces " { $link vabs } " for " { $snippet "char-16" } ", " { $snippet "short-8" } " and " { $snippet "int-4" } "."
+$nl
+"SSE4.1 introduces " { $link vmin } " and " { $link vmax } " for all remaining integer types."
 $nl
 "On PowerPC, or older x86 chips without SSE, software fallbacks are used for all high-level vector operations. SIMD code can run with no loss in functionality, just decreased performance."
 $nl

From 6d4e5af088780383c1a1e6c8dfe21142deaac2b9 Mon Sep 17 00:00:00 2001
From: Slava Pestov <slava@shill.local>
Date: Wed, 23 Sep 2009 05:13:15 -0500
Subject: [PATCH 15/15] cpu.x86.features: don't fold away sse-version, instead
 memoize it and recompute on startup

---
 basis/cpu/x86/features/features.factor | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/basis/cpu/x86/features/features.factor b/basis/cpu/x86/features/features.factor
index 5fad4e802c..b21aa762d8 100644
--- a/basis/cpu/x86/features/features.factor
+++ b/basis/cpu/x86/features/features.factor
@@ -13,16 +13,18 @@ FUNCTION: longlong read_timestamp_counter ( ) ;
 
 PRIVATE>
 
-: sse-version ( -- n )
+MEMO: sse-version ( -- n )
     sse_version
-    "sse-version" get string>number [ min ] when* ; foldable
+    "sse-version" get string>number [ min ] when* ;
 
-: sse? ( -- ? ) sse-version 10 >= ; foldable
-: sse2? ( -- ? ) sse-version 20 >= ; foldable
-: sse3? ( -- ? ) sse-version 30 >= ; foldable
-: ssse3? ( -- ? ) sse-version 33 >= ; foldable
-: sse4.1? ( -- ? ) sse-version 41 >= ; foldable
-: sse4.2? ( -- ? ) sse-version 42 >= ; foldable
+[ \ sse-version reset-memoized ] "cpu.x86.features" add-init-hook
+
+: sse? ( -- ? ) sse-version 10 >= ;
+: sse2? ( -- ? ) sse-version 20 >= ;
+: sse3? ( -- ? ) sse-version 30 >= ;
+: ssse3? ( -- ? ) sse-version 33 >= ;
+: sse4.1? ( -- ? ) sse-version 41 >= ;
+: sse4.2? ( -- ? ) sse-version 42 >= ;
 
 : sse-string ( version -- string )
     {