From 3bc097f6ff9388924faa41c2a12a965bc5d2dd50 Mon Sep 17 00:00:00 2001
From: Joe Groff <arcata@gmail.com>
Date: Fri, 9 Oct 2009 20:46:52 -0500
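Subject: [PATCH 01/13] rename ##shuffle-vector to ##shuffle-vector-imm, and
 add a new ##shuffle-vector for dynamic shuffles. have vshuffle use
 ##shuffle-vector to do word and byte shuffles on x86

The immediate form, ##shuffle-vector-imm, keeps the shuffle as a literal
and is what value numbering rewrites and folds. The new ##shuffle-vector
takes the shuffle as a second register input; on x86 it is emitted as
PSHUFB and is only advertised when SSSE3 is available. A literal shuffle
whose rep is not in %shuffle-vector-imm-reps is expanded into a constant
of byte indices and goes through ##shuffle-vector instead.

This patch also carries changes to basis/cpu/x86/x86.factor that are not
related to shuffles:

* %scalar>integer gains short, ushort, char and uchar cases. The uchar
  case is missing the initial MOVD from src; the next patch in the
  series adds it.
* The x86 methods for %unpack-vector-tail-reps,
  %saturated-mul-vector-reps and %not-vector-reps, each of which
  returned { }, are removed.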
---
 .../cfg/instructions/instructions.factor      |  5 ++
 .../compiler/cfg/intrinsics/simd/simd.factor  | 47 +++++++++++++++----
 .../value-numbering/rewrite/rewrite.factor    | 18 +++----
 .../value-numbering/simplify/simplify.factor  |  2 +-
 .../value-numbering-tests.factor              | 22 ++++-----
 basis/compiler/codegen/codegen.factor         |  1 +
 basis/cpu/architecture/architecture.factor    |  3 ++
 basis/cpu/x86/x86.factor                      | 43 +++++++++++++----
 .../vectors/simd/intrinsics/intrinsics.factor |  5 +-
 9 files changed, 105 insertions(+), 41 deletions(-)
diff --git a/basis/compiler/cfg/instructions/instructions.factor b/basis/compiler/cfg/instructions/instructions.factor
index 57d88a2d86..119af6d0b1 100644
--- a/basis/compiler/cfg/instructions/instructions.factor
+++ b/basis/compiler/cfg/instructions/instructions.factor
@@ -277,6 +277,11 @@ literal: rep ;
 
 PURE-INSN: ##shuffle-vector
 def: dst
+use: src shuffle
+literal: rep ;
+
+PURE-INSN: ##shuffle-vector-imm
+def: dst
 use: src
 literal: shuffle rep ;
 
diff --git a/basis/compiler/cfg/intrinsics/simd/simd.factor b/basis/compiler/cfg/intrinsics/simd/simd.factor
index 7607d69e45..3f7530caca 100644
--- a/basis/compiler/cfg/intrinsics/simd/simd.factor
+++ b/basis/compiler/cfg/intrinsics/simd/simd.factor
@@ -1,15 +1,15 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors byte-arrays fry cpu.architecture kernel math
-sequences math.vectors.simd.intrinsics macros generalizations
-combinators combinators.short-circuit arrays locals
+USING: accessors alien byte-arrays fry cpu.architecture kernel math
+sequences math.vectors math.vectors.simd.intrinsics macros
+generalizations combinators combinators.short-circuit arrays locals
 compiler.tree.propagation.info compiler.cfg.builder.blocks
 compiler.cfg.comparisons
 compiler.cfg.stacks compiler.cfg.stacks.local compiler.cfg.hats
 compiler.cfg.instructions compiler.cfg.registers
 compiler.cfg.intrinsics.alien
 specialized-arrays ;
-FROM: alien.c-types => float double ;
+FROM: alien.c-types => heap-size char uchar float double ;
 SPECIALIZED-ARRAYS: float double ;
 IN: compiler.cfg.intrinsics.simd
 
@@ -21,7 +21,7 @@ MACRO: check-elements ( quots -- )
 
 MACRO: if-literals-match ( quots -- )
     [ length ] [ ] [ length ] tri
-    ! n quots n n
+    ! n quots n
     '[
         ! node quot
         [
@@ -75,17 +75,46 @@ MACRO: if-literals-match ( quots -- )
         ds-push
     ] emit-vector-op ;
 
-: shuffle? ( obj -- ? ) { [ array? ] [ [ integer? ] all? ] } 1&& ;
+: variable-shuffle? ( obj -- ? )
+    ! the vshuffle intrinsic current doesn't allow variable shuffles
+    drop f ;
+
+: immediate-shuffle? ( obj -- ? ) { [ array? ] [ [ integer? ] all? ] } 1&& ;
+
+: shuffle? ( obj -- ? ) { [ variable-shuffle? ] [ immediate-shuffle? ] } 1|| ;
+
+: (>variable-shuffle) ( shuffle rep -- shuffle )
+    rep-component-type heap-size
+    [ dup <repetition> >byte-array ]
+    [ iota >byte-array ] bi
+    '[ _ n*v _ v+ ] map concat ;
+
+: >variable-shuffle ( shuffle rep -- shuffle' )
+    over immediate-shuffle? [ (>variable-shuffle) ] [ drop ] if ;
+
+: generate-shuffle-vector-imm? ( shuffle rep -- ? )
+    {
+        [ drop immediate-shuffle? ]
+        [ nip %shuffle-vector-imm-reps member? ]
+    } 2&& ;
+
+: generate-shuffle-vector ( src shuffle rep -- dst )
+    2dup generate-shuffle-vector-imm?
+    [ ^^shuffle-vector-imm ]
+    [
+        [ >variable-shuffle ^^load-constant ] keep
+        ^^shuffle-vector
+    ] if ;
 
 : emit-shuffle-vector ( node -- )
-    ! Pad the permutation with zeroes if its too short, since we
+    ! Pad the permutation with zeroes if it's too short, since we
     ! can't throw an error at this point.
-    [ [ rep-components 0 pad-tail ] keep ^^shuffle-vector ] [unary/param]
+    [ [ rep-components 0 pad-tail ] keep generate-shuffle-vector ] [unary/param]
     { [ shuffle? ] [ representation? ] } if-literals-match ;
 
 : ^^broadcast-vector ( src n rep -- dst )
     [ rep-components swap <array> ] keep
-    ^^shuffle-vector ;
+    generate-shuffle-vector ;
 
 : emit-broadcast-vector ( node -- )
     [ ^^broadcast-vector ] [unary/param]
diff --git a/basis/compiler/cfg/value-numbering/rewrite/rewrite.factor b/basis/compiler/cfg/value-numbering/rewrite/rewrite.factor
index 4a63777019..3842942a3b 100755
--- a/basis/compiler/cfg/value-numbering/rewrite/rewrite.factor
+++ b/basis/compiler/cfg/value-numbering/rewrite/rewrite.factor
@@ -450,26 +450,26 @@ M: ##set-alien-vector rewrite rewrite-alien-addressing ;
 ! Some lame constant folding for SIMD intrinsics. Eventually this
 ! should be redone completely.
 
-: rewrite-shuffle-vector ( insn expr -- insn' )
+: rewrite-shuffle-vector-imm ( insn expr -- insn' )
     2dup [ rep>> ] bi@ eq? [
         [ [ dst>> ] [ src>> vn>vreg ] bi* ]
         [ [ shuffle>> ] bi@ nths ]
         [ drop rep>> ]
-        2tri \ ##shuffle-vector new-insn
+        2tri \ ##shuffle-vector-imm new-insn
     ] [ 2drop f ] if ;
 
-: (fold-shuffle-vector) ( shuffle bytes -- bytes' )
+: (fold-shuffle-vector-imm) ( shuffle bytes -- bytes' )
     2dup length swap length /i group nths concat ;
 
-: fold-shuffle-vector ( insn expr -- insn' )
+: fold-shuffle-vector-imm ( insn expr -- insn' )
     [ [ dst>> ] [ shuffle>> ] bi ] dip value>>
-    (fold-shuffle-vector) \ ##load-constant new-insn ;
+    (fold-shuffle-vector-imm) \ ##load-constant new-insn ;
 
-M: ##shuffle-vector rewrite
+M: ##shuffle-vector-imm rewrite
     dup src>> vreg>expr {
-        { [ dup shuffle-vector-expr? ] [ rewrite-shuffle-vector ] }
-        { [ dup reference-expr? ] [ fold-shuffle-vector ] }
-        { [ dup constant-expr? ] [ fold-shuffle-vector ] }
+        { [ dup shuffle-vector-imm-expr? ] [ rewrite-shuffle-vector-imm ] }
+        { [ dup reference-expr? ] [ fold-shuffle-vector-imm ] }
+        { [ dup constant-expr? ] [ fold-shuffle-vector-imm ] }
         [ 2drop f ]
     } cond ;
 
diff --git a/basis/compiler/cfg/value-numbering/simplify/simplify.factor b/basis/compiler/cfg/value-numbering/simplify/simplify.factor
index c2026a9483..df3dc6aab9 100644
--- a/basis/compiler/cfg/value-numbering/simplify/simplify.factor
+++ b/basis/compiler/cfg/value-numbering/simplify/simplify.factor
@@ -136,7 +136,7 @@ M: scalar>vector-expr simplify*
         [ drop f ]
     } cond ;
 
-M: shuffle-vector-expr simplify*
+M: shuffle-vector-imm-expr simplify*
     [ src>> ] [ shuffle>> ] [ rep>> rep-components iota ] tri
     sequence= [ drop f ] unless ;
 
diff --git a/basis/compiler/cfg/value-numbering/value-numbering-tests.factor b/basis/compiler/cfg/value-numbering/value-numbering-tests.factor
index f98824cb95..733b8cc22a 100644
--- a/basis/compiler/cfg/value-numbering/value-numbering-tests.factor
+++ b/basis/compiler/cfg/value-numbering/value-numbering-tests.factor
@@ -1215,31 +1215,31 @@ cell 8 = [
     }
 ] [
     {
-        T{ ##shuffle-vector f 1 0 { 0 1 2 3 } float-4-rep }
+        T{ ##shuffle-vector-imm f 1 0 { 0 1 2 3 } float-4-rep }
     } value-numbering-step
 ] unit-test
 
 [
     {
-        T{ ##shuffle-vector f 1 0 { 1 2 3 0 } float-4-rep }
-        T{ ##shuffle-vector f 2 0 { 0 2 3 1 } float-4-rep }
+        T{ ##shuffle-vector-imm f 1 0 { 1 2 3 0 } float-4-rep }
+        T{ ##shuffle-vector-imm f 2 0 { 0 2 3 1 } float-4-rep }
     }
 ] [
     {
-        T{ ##shuffle-vector f 1 0 { 1 2 3 0 } float-4-rep }
-        T{ ##shuffle-vector f 2 1 { 3 1 2 0 } float-4-rep }
+        T{ ##shuffle-vector-imm f 1 0 { 1 2 3 0 } float-4-rep }
+        T{ ##shuffle-vector-imm f 2 1 { 3 1 2 0 } float-4-rep }
     } value-numbering-step
 ] unit-test
 
 [
     {
-        T{ ##shuffle-vector f 1 0 { 1 2 3 0 } float-4-rep }
-        T{ ##shuffle-vector f 2 1 { 1 0 } double-2-rep }
+        T{ ##shuffle-vector-imm f 1 0 { 1 2 3 0 } float-4-rep }
+        T{ ##shuffle-vector-imm f 2 1 { 1 0 } double-2-rep }
     }
 ] [
     {
-        T{ ##shuffle-vector f 1 0 { 1 2 3 0 } float-4-rep }
-        T{ ##shuffle-vector f 2 1 { 1 0 } double-2-rep }
+        T{ ##shuffle-vector-imm f 1 0 { 1 2 3 0 } float-4-rep }
+        T{ ##shuffle-vector-imm f 2 1 { 1 0 } double-2-rep }
     } value-numbering-step
 ] unit-test
 
@@ -1253,7 +1253,7 @@ cell 8 = [
     {
         T{ ##load-constant f 0 $[ 55 tag-fixnum ] }
         T{ ##scalar>vector f 1 0 int-4-rep }
-        T{ ##shuffle-vector f 2 1 { 0 0 0 0 } float-4-rep }
+        T{ ##shuffle-vector-imm f 2 1 { 0 0 0 0 } float-4-rep }
     } value-numbering-step
 ] unit-test
 
@@ -1267,7 +1267,7 @@ cell 8 = [
     {
         T{ ##load-constant f 0 1.25 }
         T{ ##scalar>vector f 1 0 float-4-rep }
-        T{ ##shuffle-vector f 2 1 { 0 0 0 0 } float-4-rep }
+        T{ ##shuffle-vector-imm f 2 1 { 0 0 0 0 } float-4-rep }
     } value-numbering-step
 ] unit-test
 
diff --git a/basis/compiler/codegen/codegen.factor b/basis/compiler/codegen/codegen.factor
index 39dd21d893..938219af22 100755
--- a/basis/compiler/codegen/codegen.factor
+++ b/basis/compiler/codegen/codegen.factor
@@ -154,6 +154,7 @@ CODEGEN: ##zero-vector %zero-vector
 CODEGEN: ##fill-vector %fill-vector
 CODEGEN: ##gather-vector-2 %gather-vector-2
 CODEGEN: ##gather-vector-4 %gather-vector-4
+CODEGEN: ##shuffle-vector-imm %shuffle-vector-imm
 CODEGEN: ##shuffle-vector %shuffle-vector
 CODEGEN: ##tail>head-vector %tail>head-vector
 CODEGEN: ##merge-vector-head %merge-vector-head
diff --git a/basis/cpu/architecture/architecture.factor b/basis/cpu/architecture/architecture.factor
index 8bf84f6670..85a43e99fd 100644
--- a/basis/cpu/architecture/architecture.factor
+++ b/basis/cpu/architecture/architecture.factor
@@ -242,6 +242,7 @@ HOOK: %fill-vector cpu ( dst rep -- )
 HOOK: %gather-vector-2 cpu ( dst src1 src2 rep -- )
 HOOK: %gather-vector-4 cpu ( dst src1 src2 src3 src4 rep -- )
 HOOK: %shuffle-vector cpu ( dst src shuffle rep -- )
+HOOK: %shuffle-vector-imm cpu ( dst src shuffle rep -- )
 HOOK: %tail>head-vector cpu ( dst src rep -- )
 HOOK: %merge-vector-head cpu ( dst src1 src2 rep -- )
 HOOK: %merge-vector-tail cpu ( dst src1 src2 rep -- )
@@ -289,6 +290,7 @@ HOOK: %fill-vector-reps cpu ( -- reps )
 HOOK: %gather-vector-2-reps cpu ( -- reps )
 HOOK: %gather-vector-4-reps cpu ( -- reps )
 HOOK: %shuffle-vector-reps cpu ( -- reps )
+HOOK: %shuffle-vector-imm-reps cpu ( -- reps )
 HOOK: %merge-vector-reps cpu ( -- reps )
 HOOK: %signed-pack-vector-reps cpu ( -- reps )
 HOOK: %unsigned-pack-vector-reps cpu ( -- reps )
@@ -329,6 +331,7 @@ M: object %fill-vector-reps { } ;
 M: object %gather-vector-2-reps { } ;
 M: object %gather-vector-4-reps { } ;
 M: object %shuffle-vector-reps { } ;
+M: object %shuffle-vector-imm-reps { } ;
 M: object %merge-vector-reps { } ;
 M: object %signed-pack-vector-reps { } ;
 M: object %unsigned-pack-vector-reps { } ;
diff --git a/basis/cpu/x86/x86.factor b/basis/cpu/x86/x86.factor
index 13727bdc61..dbd34c774a 100644
--- a/basis/cpu/x86/x86.factor
+++ b/basis/cpu/x86/x86.factor
@@ -698,7 +698,7 @@ M: x86 %gather-vector-2-reps
 : longlong-2-shuffle ( dst shuffle -- )
     first2 [ 2 * dup 1 + ] bi@ 4array int-4-shuffle ;
 
-M:: x86 %shuffle-vector ( dst src shuffle rep -- )
+M:: x86 %shuffle-vector-imm ( dst src shuffle rep -- )
     dst src rep %copy
     dst shuffle rep unsign-rep {
         { double-2-rep [ double-2-shuffle ] }
@@ -707,12 +707,20 @@ M:: x86 %shuffle-vector ( dst src shuffle rep -- )
         { longlong-2-rep [ longlong-2-shuffle ] }
     } case ;
 
-M: x86 %shuffle-vector-reps
+M: x86 %shuffle-vector-imm-reps
     {
         { sse? { float-4-rep } }
         { sse2? { double-2-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
     } available-reps ;
 
+M: x86 %shuffle-vector ( dst src shuffle rep -- )
+    two-operand PSHUFB ;
+
+M: x86 %shuffle-vector-reps
+    {
+        { ssse3? { float-4-rep double-2-rep longlong-2-rep ulonglong-2-rep int-4-rep uint-4-rep short-8-rep ushort-8-rep char-16-rep uchar-16-rep } }
+    } available-reps ;
+
 M: x86 %merge-vector-head
     [ two-operand ] keep
     unsign-rep {
@@ -790,8 +798,6 @@ M: x86 %unpack-vector-head-reps ( -- reps )
         { sse4.1? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
     } available-reps ;
 
-M: x86 %unpack-vector-tail-reps ( -- reps ) { } ;
-
 M: x86 %integer>float-vector ( dst src rep -- )
     {
         { int-4-rep [ CVTDQ2PS ] }
@@ -1037,10 +1043,6 @@ M: x86 %mul-vector-reps
         { sse4.1? { int-4-rep uint-4-rep } }
     } available-reps ;
 
-M: x86 %saturated-mul-vector-reps
-    ! No multiplication with saturation on x86
-    { } ;
-
 M: x86 %div-vector ( dst src1 src2 rep -- )
     [ two-operand ] keep
     {
@@ -1223,8 +1225,6 @@ M: x86 %xor-vector-reps
         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
     } available-reps ;
 
-M: x86 %not-vector-reps { } ;
-
 M: x86 %shl-vector ( dst src1 src2 rep -- )
     [ two-operand ] keep
     {
@@ -1271,6 +1271,29 @@ M:: x86 %scalar>integer ( dst src rep -- )
         { uint-scalar-rep [
             dst 32-bit-version-of src MOVD
         ] }
+        { short-scalar-rep [
+            dst 32-bit-version-of src MOVD
+            dst dst 16-bit-version-of MOVSX
+        ] }
+        { ushort-scalar-rep [
+            dst 32-bit-version-of src MOVD
+            dst dst 16-bit-version-of MOVZX
+        ] }
+        { char-scalar-rep [
+            dst 32-bit-version-of src MOVD
+            dst { } 8 [| tmp-dst |
+                tmp-dst dst int-rep %copy
+                tmp-dst tmp-dst 8-bit-version-of MOVSX
+                dst tmp-dst int-rep %copy
+            ] with-small-register
+        ] }
+        { uchar-scalar-rep [
+            dst { } 8 [| tmp-dst |
+                tmp-dst dst int-rep %copy
+                tmp-dst tmp-dst 8-bit-version-of MOVZX
+                dst tmp-dst int-rep %copy
+            ] with-small-register
+        ] }
     } case ;
 
 M: x86 %vector>scalar %copy ;
diff --git a/basis/math/vectors/simd/intrinsics/intrinsics.factor b/basis/math/vectors/simd/intrinsics/intrinsics.factor
index 5a7974a75f..deb92c2944 100644
--- a/basis/math/vectors/simd/intrinsics/intrinsics.factor
+++ b/basis/math/vectors/simd/intrinsics/intrinsics.factor
@@ -148,6 +148,9 @@ GENERIC# supported-simd-op? 1 ( rep intrinsic -- ? )
     union
     { uchar-16-rep ushort-8-rep uint-4-rep ulonglong-2-rep } union ;
 
+: (%shuffle-reps) ( -- reps )
+    %shuffle-vector-reps %shuffle-vector-imm-reps union ;
+
 M: vector-rep supported-simd-op?
     {
         { \ (simd-v+)            [ %add-vector-reps            ] }
@@ -179,7 +182,7 @@ M: vector-rep supported-simd-op?
         { \ (simd-vrshift)       [ %shr-vector-reps            ] }
         { \ (simd-hlshift)       [ %horizontal-shl-vector-reps ] }
         { \ (simd-hrshift)       [ %horizontal-shr-vector-reps ] }
-        { \ (simd-vshuffle)      [ %shuffle-vector-reps        ] }
+        { \ (simd-vshuffle)      [ (%shuffle-reps)             ] }
         { \ (simd-(vmerge-head)) [ %merge-vector-reps          ] }
         { \ (simd-(vmerge-tail)) [ %merge-vector-reps          ] }
         { \ (simd-(v>float))        [ %integer>float-vector-reps ] }

From 1fa6f32790374372dc637f8529da12dae261f03e Mon Sep 17 00:00:00 2001
From: Joe Groff <arcata@gmail.com>
Date: Sat, 10 Oct 2009 10:39:23 -0500
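Subject: [PATCH 02/13] fix x86 uchar %scalar>integer

The uchar-scalar-rep case added in the previous patch zero-extended dst
without first loading the scalar from src. Add the same MOVD from src
that the char, short and ushort cases already perform.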
---
 basis/cpu/x86/x86.factor | 1 +
 1 file changed, 1 insertion(+)

diff --git a/basis/cpu/x86/x86.factor b/basis/cpu/x86/x86.factor
index dbd34c774a..dab7d9d52b 100644
--- a/basis/cpu/x86/x86.factor
+++ b/basis/cpu/x86/x86.factor
@@ -1288,6 +1288,7 @@ M:: x86 %scalar>integer ( dst src rep -- )
             ] with-small-register
         ] }
         { uchar-scalar-rep [
+            dst 32-bit-version-of src MOVD
             dst { } 8 [| tmp-dst |
                 tmp-dst dst int-rep %copy
                 tmp-dst tmp-dst 8-bit-version-of MOVZX

From d9002127fae73af40dcb79c3d64a4c3d8301de16 Mon Sep 17 00:00:00 2001
From: Joe Groff <arcata@gmail.com>
Date: Sat, 10 Oct 2009 10:40:09 -0500
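Subject: [PATCH 03/13] have vshuffle accept simd-128 variable byte shuffles

Split the (simd-vshuffle) intrinsic in two: (simd-vshuffle-elements)
takes a literal element permutation, and (simd-vshuffle-bytes) takes a
vector of byte indices. vshuffle becomes a generic word with two
methods: the array method calls vshuffle-elements and the simd-128
method calls vshuffle-bytes.

Add simd-128 and simd-256 mixins to math.vectors and make the generated
SIMD tuple classes instances of them. Both shuffle ops are excluded from
the 256-bit types. Specialized arrays gain a new-underlying method.

The variable-shuffle? and immediate-shuffle? helpers from the first
patch are removed; emit-shuffle-vector now inspects the node's input
infos to choose between the variable and the immediate code path.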
---
 .../compiler/cfg/intrinsics/intrinsics.factor |  3 +-
 .../compiler/cfg/intrinsics/simd/simd.factor  | 48 +++++++++----------
 .../tree/propagation/simd/simd.factor         |  3 +-
 .../math/vectors/simd/functor/functor.factor  |  8 +++-
 .../vectors/simd/intrinsics/intrinsics.factor |  8 ++--
 basis/math/vectors/simd/simd-tests.factor     | 19 +++++++-
 .../specialization/specialization.factor      |  5 +-
 basis/math/vectors/vectors.factor             | 18 ++++++-
 .../specialized-arrays.factor                 |  5 +-
 9 files changed, 80 insertions(+), 37 deletions(-)

diff --git a/basis/compiler/cfg/intrinsics/intrinsics.factor b/basis/compiler/cfg/intrinsics/intrinsics.factor
index 9c4447e654..3b6674efee 100644
--- a/basis/compiler/cfg/intrinsics/intrinsics.factor
+++ b/basis/compiler/cfg/intrinsics/intrinsics.factor
@@ -194,7 +194,8 @@ IN: compiler.cfg.intrinsics
         { math.vectors.simd.intrinsics:(simd-with) [ [ ^^with-vector ] emit-unary-vector-op ] }
         { math.vectors.simd.intrinsics:(simd-gather-2) [ emit-gather-vector-2 ] }
         { math.vectors.simd.intrinsics:(simd-gather-4) [ emit-gather-vector-4 ] }
-        { math.vectors.simd.intrinsics:(simd-vshuffle) [ emit-shuffle-vector ] }
+        { math.vectors.simd.intrinsics:(simd-vshuffle-elements) [ emit-shuffle-vector ] }
+        { math.vectors.simd.intrinsics:(simd-vshuffle-bytes) [ emit-shuffle-vector-var ] }
         { math.vectors.simd.intrinsics:(simd-(vmerge-head)) [ [ ^^merge-vector-head ] emit-binary-vector-op ] }
         { math.vectors.simd.intrinsics:(simd-(vmerge-tail)) [ [ ^^merge-vector-tail ] emit-binary-vector-op ] }
         { math.vectors.simd.intrinsics:(simd-(v>float)) [ [ ^^integer>float-vector ] emit-unary-vector-op ] }
diff --git a/basis/compiler/cfg/intrinsics/simd/simd.factor b/basis/compiler/cfg/intrinsics/simd/simd.factor
index 3f7530caca..e608cf999c 100644
--- a/basis/compiler/cfg/intrinsics/simd/simd.factor
+++ b/basis/compiler/cfg/intrinsics/simd/simd.factor
@@ -1,8 +1,9 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors alien byte-arrays fry cpu.architecture kernel math
-sequences math.vectors math.vectors.simd.intrinsics macros
-generalizations combinators combinators.short-circuit arrays locals
+USING: accessors alien byte-arrays fry classes.algebra
+cpu.architecture kernel math sequences math.vectors
+math.vectors.simd.intrinsics macros generalizations combinators
+combinators.short-circuit arrays locals
 compiler.tree.propagation.info compiler.cfg.builder.blocks
 compiler.cfg.comparisons
 compiler.cfg.stacks compiler.cfg.stacks.local compiler.cfg.hats
@@ -75,46 +76,43 @@ MACRO: if-literals-match ( quots -- )
         ds-push
     ] emit-vector-op ;
 
-: variable-shuffle? ( obj -- ? )
-    ! the vshuffle intrinsic current doesn't allow variable shuffles
-    drop f ;
+: shuffle? ( obj -- ? ) { [ array? ] [ [ integer? ] all? ] } 1&& ;
 
-: immediate-shuffle? ( obj -- ? ) { [ array? ] [ [ integer? ] all? ] } 1&& ;
-
-: shuffle? ( obj -- ? ) { [ variable-shuffle? ] [ immediate-shuffle? ] } 1|| ;
-
-: (>variable-shuffle) ( shuffle rep -- shuffle )
+: >variable-shuffle ( shuffle rep -- shuffle' )
     rep-component-type heap-size
     [ dup <repetition> >byte-array ]
     [ iota >byte-array ] bi
     '[ _ n*v _ v+ ] map concat ;
 
-: >variable-shuffle ( shuffle rep -- shuffle' )
-    over immediate-shuffle? [ (>variable-shuffle) ] [ drop ] if ;
-
-: generate-shuffle-vector-imm? ( shuffle rep -- ? )
-    {
-        [ drop immediate-shuffle? ]
-        [ nip %shuffle-vector-imm-reps member? ]
-    } 2&& ;
-
-: generate-shuffle-vector ( src shuffle rep -- dst )
-    2dup generate-shuffle-vector-imm?
+: generate-shuffle-vector-imm ( src shuffle rep -- dst )
+    dup %shuffle-vector-imm-reps member?
     [ ^^shuffle-vector-imm ]
     [
         [ >variable-shuffle ^^load-constant ] keep
         ^^shuffle-vector
     ] if ;
 
-: emit-shuffle-vector ( node -- )
+: emit-shuffle-vector-imm ( node -- )
     ! Pad the permutation with zeroes if it's too short, since we
     ! can't throw an error at this point.
-    [ [ rep-components 0 pad-tail ] keep generate-shuffle-vector ] [unary/param]
+    [ [ rep-components 0 pad-tail ] keep generate-shuffle-vector-imm ] [unary/param]
     { [ shuffle? ] [ representation? ] } if-literals-match ;
 
+: emit-shuffle-vector-var ( node -- )
+    [ ^^shuffle-vector ] [binary]
+    { [ %shuffle-vector-reps member? ] } if-literals-match ;
+
+: emit-shuffle-vector ( node -- )
+    dup node-input-infos {
+        [ length 3 = ]
+        [ first  class>> byte-array class<= ]
+        [ second class>> byte-array class<= ]
+        [ third  literal>> representation?  ]
+    } 1&& [ emit-shuffle-vector-var ] [ emit-shuffle-vector-imm ] if ;
+
 : ^^broadcast-vector ( src n rep -- dst )
     [ rep-components swap <array> ] keep
-    generate-shuffle-vector ;
+    generate-shuffle-vector-imm ;
 
 : emit-broadcast-vector ( node -- )
     [ ^^broadcast-vector ] [unary/param]
diff --git a/basis/compiler/tree/propagation/simd/simd.factor b/basis/compiler/tree/propagation/simd/simd.factor
index 1909a83488..1637148b88 100644
--- a/basis/compiler/tree/propagation/simd/simd.factor
+++ b/basis/compiler/tree/propagation/simd/simd.factor
@@ -31,7 +31,8 @@ IN: compiler.tree.propagation.simd
     (simd-vrshift)
     (simd-hlshift)
     (simd-hrshift)
-    (simd-vshuffle)
+    (simd-vshuffle-bytes)
+    (simd-vshuffle-elements)
     (simd-(vmerge-head))
     (simd-(vmerge-tail))
     (simd-(v>float))
diff --git a/basis/math/vectors/simd/functor/functor.factor b/basis/math/vectors/simd/functor/functor.factor
index 7f28f644e1..2ddaf2b8a5 100644
--- a/basis/math/vectors/simd/functor/functor.factor
+++ b/basis/math/vectors/simd/functor/functor.factor
@@ -60,7 +60,7 @@ MACRO: simd-boa ( rep class -- simd-array )
     [ rep-components ] [ new ] [ '[ _ ] ] tri* swap replicate-as ; inline
 
 : simd-with/nth-fast? ( rep -- ? )
-    [ \ (simd-vshuffle) supported-simd-op? ]
+    [ \ (simd-vshuffle-elements) supported-simd-op? ]
     [ rep-component-type can-be-unboxed? ]
     bi and ;
 
@@ -184,6 +184,8 @@ WHERE
 TUPLE: A
 { underlying byte-array read-only initial: $[ 16 <byte-array> ] } ;
 
+INSTANCE: A simd-128
+
 M: A clone underlying>> clone \ A boa ; inline
 
 M: A length drop N ; inline
@@ -315,7 +317,7 @@ SLOT: underlying2
     class c:typedef ;
 
 : (define-simd-256) ( simd -- )
-    simd-ops get { vshuffle hlshift hrshift } unique assoc-diff >>ops
+    simd-ops get { vshuffle-elements vshuffle-bytes hlshift hrshift } unique assoc-diff >>ops
     [ define-simd ]
     [ [ class>> ] [ rep>> ] bi define-simd-256-type ] bi ;
 
@@ -362,6 +364,8 @@ TUPLE: A
 { underlying1 byte-array initial: $[ 16 <byte-array> ] read-only }
 { underlying2 byte-array initial: $[ 16 <byte-array> ] read-only } ;
 
+INSTANCE: A simd-256
+
 M: A clone
     [ underlying1>> clone ] [ underlying2>> clone ] bi
     \ A boa ; inline
diff --git a/basis/math/vectors/simd/intrinsics/intrinsics.factor b/basis/math/vectors/simd/intrinsics/intrinsics.factor
index deb92c2944..fab55949b4 100644
--- a/basis/math/vectors/simd/intrinsics/intrinsics.factor
+++ b/basis/math/vectors/simd/intrinsics/intrinsics.factor
@@ -67,7 +67,8 @@ SIMD-OP: vlshift
 SIMD-OP: vrshift
 SIMD-OP: hlshift
 SIMD-OP: hrshift
-SIMD-OP: vshuffle
+SIMD-OP: vshuffle-elements
+SIMD-OP: vshuffle-bytes
 SIMD-OP: (vmerge-head)
 SIMD-OP: (vmerge-tail)
 SIMD-OP: v<=
@@ -148,7 +149,7 @@ GENERIC# supported-simd-op? 1 ( rep intrinsic -- ? )
     union
     { uchar-16-rep ushort-8-rep uint-4-rep ulonglong-2-rep } union ;
 
-: (%shuffle-reps) ( -- reps )
+: (%shuffle-imm-reps) ( -- reps )
     %shuffle-vector-reps %shuffle-vector-imm-reps union ;
 
 M: vector-rep supported-simd-op?
@@ -182,7 +183,8 @@ M: vector-rep supported-simd-op?
         { \ (simd-vrshift)       [ %shr-vector-reps            ] }
         { \ (simd-hlshift)       [ %horizontal-shl-vector-reps ] }
         { \ (simd-hrshift)       [ %horizontal-shr-vector-reps ] }
-        { \ (simd-vshuffle)      [ (%shuffle-reps)             ] }
+        { \ (simd-vshuffle-elements) [ (%shuffle-imm-reps)         ] }
+        { \ (simd-vshuffle-bytes)    [ %shuffle-vector-reps        ] }
         { \ (simd-(vmerge-head)) [ %merge-vector-reps          ] }
         { \ (simd-(vmerge-tail)) [ %merge-vector-reps          ] }
         { \ (simd-(v>float))        [ %integer>float-vector-reps ] }
diff --git a/basis/math/vectors/simd/simd-tests.factor b/basis/math/vectors/simd/simd-tests.factor
index 9e999ba9b7..460059809e 100644
--- a/basis/math/vectors/simd/simd-tests.factor
+++ b/basis/math/vectors/simd/simd-tests.factor
@@ -174,7 +174,7 @@ CONSTANT: simd-classes
 : remove-special-words ( alist -- alist' )
     ! These have their own tests later
     {
-        hlshift hrshift vshuffle vbroadcast
+        hlshift hrshift vshuffle-bytes vshuffle-elements vbroadcast
         vany? vall? vnone?
         (v>float) (v>integer)
         (vpack-signed) (vpack-unsigned)
@@ -360,6 +360,23 @@ simd-classes [
     ] unit-test
 ] each
 
+"== Checking variable shuffles" print
+
+: random-shift-vector ( class -- vec )
+    new [ drop 16 random ] map ;
+
+:: test-shift-vector ( class -- ? )
+    class random-int-vector :> src
+    char-16 random-shift-vector :> perm
+    { class char-16 } :> decl
+
+    src perm vshuffle
+    src perm [ decl declare vshuffle ] compile-call
+    = ; inline
+
+{ char-16 uchar-16 short-8 ushort-8 int-4 uint-4 longlong-2 ulonglong-2 }
+[ 10 swap '[ [ t ] [ _ test-shift-vector ] unit-test ] times ] each
+
 "== Checking vector tests" print
 
 :: test-vector-tests-bool ( vector declaration -- none? any? all? )
diff --git a/basis/math/vectors/specialization/specialization.factor b/basis/math/vectors/specialization/specialization.factor
index 28c3ee82c3..e51d8c4553 100644
--- a/basis/math/vectors/specialization/specialization.factor
+++ b/basis/math/vectors/specialization/specialization.factor
@@ -98,7 +98,8 @@ H{
     { vrshift { +vector+ +scalar+ -> +vector+ } }
     { hlshift { +vector+ +literal+ -> +vector+ } }
     { hrshift { +vector+ +literal+ -> +vector+ } }
-    { vshuffle { +vector+ +literal+ -> +vector+ } }
+    { vshuffle-elements { +vector+ +literal+ -> +vector+ } }
+    { vshuffle-bytes    { +vector+ +vector+  -> +vector+ } }
     { vbroadcast { +vector+ +literal+ -> +vector+ } }
     { (vmerge-head) { +vector+ +vector+ -> +vector+ } }
     { (vmerge-tail) { +vector+ +vector+ -> +vector+ } }
@@ -162,7 +163,7 @@ ERROR: bad-vector-word word ;
     } cond
     ! Don't specialize horizontal shifts, shuffles, and conversions at all, they're only for SIMD
     {
-        hlshift hrshift vshuffle vbroadcast
+        hlshift hrshift vshuffle-elements vshuffle-bytes vbroadcast
         (v>integer) (v>float)
         (vpack-signed) (vpack-unsigned)
         (vunpack-head) (vunpack-tail)
diff --git a/basis/math/vectors/vectors.factor b/basis/math/vectors/vectors.factor
index c65009950d..4cb03af44c 100644
--- a/basis/math/vectors/vectors.factor
+++ b/basis/math/vectors/vectors.factor
@@ -6,6 +6,9 @@ locals ;
 QUALIFIED-WITH: alien.c-types c
 IN: math.vectors
 
+MIXIN: simd-128
+MIXIN: simd-256
+
 GENERIC: element-type ( obj -- c-type )
 M: object element-type drop f ; inline
 
@@ -83,7 +86,20 @@ PRIVATE>
 : vbitnot ( u -- w ) dup '[ _ [ bitnot ] fp-bitwise-unary ] map ;
 
 :: vbroadcast ( u n -- v ) u length n u nth <repetition> u like ;
-: vshuffle ( u perm -- v ) swap [ '[ _ nth ] ] keep map-as ;
+
+: vshuffle-elements ( u perm -- v )
+    swap [ '[ _ nth ] ] keep map-as ;
+
+: vshuffle-bytes ( u perm -- v )
+    underlying>> [
+        swap [ '[ _ nth ] ] keep map-as
+    ] curry change-underlying ;
+
+GENERIC: vshuffle ( u perm -- v )
+M: array vshuffle ( u perm -- v )
+    vshuffle-elements ; inline
+M: simd-128 vshuffle ( u perm -- v )
+    vshuffle-bytes ; inline
 
 : vlshift ( u n -- w ) '[ _ shift ] map ;
 : vrshift ( u n -- w ) neg '[ _ shift ] map ;
diff --git a/basis/specialized-arrays/specialized-arrays.factor b/basis/specialized-arrays/specialized-arrays.factor
index a3d24c10c2..c5de95b5b5 100755
--- a/basis/specialized-arrays/specialized-arrays.factor
+++ b/basis/specialized-arrays/specialized-arrays.factor
@@ -2,7 +2,8 @@
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors alien alien.c-types alien.data alien.parser
 assocs byte-arrays classes compiler.units functors kernel lexer
-libc math math.vectors math.vectors.specialization namespaces
+libc math math.vectors math.vectors.private
+math.vectors.specialization namespaces
 parser prettyprint.custom sequences sequences.private strings
 summary vocabs vocabs.loader vocabs.parser vocabs.generated
 words fry combinators present ;
@@ -68,6 +69,8 @@ TUPLE: A
     [ drop \ T bad-byte-array-length ] unless
     <direct-A> ; inline
 
+M: A new-underlying drop byte-array>A ;
+
 M: A clone [ underlying>> clone ] [ length>> ] bi <direct-A> ; inline
 
 M: A length length>> ; inline

From a5898dffdebab1240e7e32054dcfb49171fd1b1d Mon Sep 17 00:00:00 2001
From: Joe Groff <arcata@gmail.com>
Date: Sat, 10 Oct 2009 12:00:47 -0500
Subject: [PATCH 04/13] don't use MOVSLDUP/MOVSHDUP to do specialized shuffles
 unless sse3 is available

---
 basis/cpu/x86/x86.factor | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/basis/cpu/x86/x86.factor b/basis/cpu/x86/x86.factor
index dab7d9d52b..c1acf92246 100644
--- a/basis/cpu/x86/x86.factor
+++ b/basis/cpu/x86/x86.factor
@@ -673,11 +673,9 @@ M: x86 %gather-vector-2-reps
         [ dupd SHUFPD ]
     } case ;
 
-: float-4-shuffle ( dst shuffle -- )
+: sse1-float-4-shuffle ( dst shuffle -- )
     {
         { { 0 1 2 3 } [ drop ] }
-        { { 0 0 2 2 } [ dup MOVSLDUP ] }
-        { { 1 1 3 3 } [ dup MOVSHDUP ] }
         { { 0 1 0 1 } [ dup MOVLHPS ] }
         { { 2 3 2 3 } [ dup MOVHLPS ] }
         { { 0 0 1 1 } [ dup UNPCKLPS ] }
@@ -685,6 +683,15 @@ M: x86 %gather-vector-2-reps
         [ dupd SHUFPS ]
     } case ;
 
+: float-4-shuffle ( dst shuffle -- )
+    sse3? [
+        {
+            { { 0 0 2 2 } [ dup MOVSLDUP ] }
+            { { 1 1 3 3 } [ dup MOVSHDUP ] }
+            [ sse1-float-4-shuffle ]
+        } case
+    ] [ sse1-float-4-shuffle ] if ;
+
 : int-4-shuffle ( dst shuffle -- )
     {
         { { 0 1 2 3 } [ drop ] }

From 140780439346efcb184352d018eb52bbf14652cc Mon Sep 17 00:00:00 2001
From: Joe Groff <arcata@gmail.com>
Date: Sat, 10 Oct 2009 12:23:25 -0500
Subject: [PATCH 05/13] change the simd-struct tests to cover integer vector
 slots

---
 basis/math/vectors/simd/simd-tests.factor | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/basis/math/vectors/simd/simd-tests.factor b/basis/math/vectors/simd/simd-tests.factor
index 460059809e..8766056a96 100644
--- a/basis/math/vectors/simd/simd-tests.factor
+++ b/basis/math/vectors/simd/simd-tests.factor
@@ -529,38 +529,38 @@ SYMBOL: !!inconsistent!!
 
 STRUCT: simd-struct
 { x float-4 }
-{ y double-2 }
+{ y longlong-2 }
 { z double-4 }
-{ w float-8 } ;
+{ w int-8 } ;
 
 [ t ] [ [ simd-struct <struct> ] compile-call >c-ptr [ 0 = ] all? ] unit-test
 
 [
     float-4{ 1 2 3 4 }
-    double-2{ 2 1 }
+    longlong-2{ 2 1 }
     double-4{ 4 3 2 1 }
-    float-8{ 1 2 3 4 5 6 7 8 }
+    int-8{ 1 2 3 4 5 6 7 8 }
 ] [
     simd-struct <struct>
     float-4{ 1 2 3 4 } >>x
-    double-2{ 2 1 } >>y
+    longlong-2{ 2 1 } >>y
     double-4{ 4 3 2 1 } >>z
-    float-8{ 1 2 3 4 5 6 7 8 } >>w
+    int-8{ 1 2 3 4 5 6 7 8 } >>w
     { [ x>> ] [ y>> ] [ z>> ] [ w>> ] } cleave
 ] unit-test
 
 [
     float-4{ 1 2 3 4 }
-    double-2{ 2 1 }
+    longlong-2{ 2 1 }
     double-4{ 4 3 2 1 }
-    float-8{ 1 2 3 4 5 6 7 8 }
+    int-8{ 1 2 3 4 5 6 7 8 }
 ] [
     [
         simd-struct <struct>
         float-4{ 1 2 3 4 } >>x
-        double-2{ 2 1 } >>y
+        longlong-2{ 2 1 } >>y
         double-4{ 4 3 2 1 } >>z
-        float-8{ 1 2 3 4 5 6 7 8 } >>w
+        int-8{ 1 2 3 4 5 6 7 8 } >>w
         { [ x>> ] [ y>> ] [ z>> ] [ w>> ] } cleave
     ] compile-call
 ] unit-test

From 97ab9dc4ab3bb4e6ff9a7326dca12c3d2f66dd64 Mon Sep 17 00:00:00 2001
From: Joe Groff <arcata@gmail.com>
Date: Sat, 10 Oct 2009 12:53:10 -0500
Subject: [PATCH 06/13] only emit ##alien-vector/##set-alien-vector insns if
 the rep is available

---
 basis/compiler/cfg/intrinsics/simd/simd.factor | 7 +++++--
 basis/cpu/architecture/architecture.factor     | 2 ++
 basis/cpu/x86/x86.factor                       | 6 ++++++
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/basis/compiler/cfg/intrinsics/simd/simd.factor b/basis/compiler/cfg/intrinsics/simd/simd.factor
index e608cf999c..bd851199ca 100644
--- a/basis/compiler/cfg/intrinsics/simd/simd.factor
+++ b/basis/compiler/cfg/intrinsics/simd/simd.factor
@@ -128,6 +128,9 @@ MACRO: if-literals-match ( quots -- )
     [ ^^select-vector ] [unary/param]
     { [ integer? ] [ representation? ] } if-literals-match ; inline
 
+: emit-alien-vector-op ( node quot: ( rep -- ) -- )
+    { [ %alien-vector-reps member? ] } if-literals-match ; inline ! the single predicate given to if-literals-match tests membership in %alien-vector-reps
+
 : emit-alien-vector ( node -- )
     dup [
         '[
@@ -135,7 +138,7 @@ MACRO: if-literals-match ( quots -- )
             _ ^^alien-vector ds-push
         ]
         [ inline-alien-getter? ] inline-alien
-    ] with emit-vector-op ;
+    ] with emit-alien-vector-op ;
 
 : emit-set-alien-vector ( node -- )
     dup [
@@ -145,7 +148,7 @@ MACRO: if-literals-match ( quots -- )
         ]
         [ byte-array inline-alien-setter? ]
         inline-alien
-    ] with emit-vector-op ;
+    ] with emit-alien-vector-op ;
 
 : generate-not-vector ( src rep -- dst )
     dup %not-vector-reps member?
diff --git a/basis/cpu/architecture/architecture.factor b/basis/cpu/architecture/architecture.factor
index 85a43e99fd..19b38fd8f8 100644
--- a/basis/cpu/architecture/architecture.factor
+++ b/basis/cpu/architecture/architecture.factor
@@ -289,6 +289,7 @@ HOOK: %zero-vector-reps cpu ( -- reps )
 HOOK: %fill-vector-reps cpu ( -- reps )
 HOOK: %gather-vector-2-reps cpu ( -- reps )
 HOOK: %gather-vector-4-reps cpu ( -- reps )
+HOOK: %alien-vector-reps cpu ( -- reps )
 HOOK: %shuffle-vector-reps cpu ( -- reps )
 HOOK: %shuffle-vector-imm-reps cpu ( -- reps )
 HOOK: %merge-vector-reps cpu ( -- reps )
@@ -330,6 +331,7 @@ M: object %zero-vector-reps { } ;
 M: object %fill-vector-reps { } ;
 M: object %gather-vector-2-reps { } ;
 M: object %gather-vector-4-reps { } ;
+M: object %alien-vector-reps { } ;
 M: object %shuffle-vector-reps { } ;
 M: object %shuffle-vector-imm-reps { } ;
 M: object %merge-vector-reps { } ;
diff --git a/basis/cpu/x86/x86.factor b/basis/cpu/x86/x86.factor
index c1acf92246..a163ba6482 100644
--- a/basis/cpu/x86/x86.factor
+++ b/basis/cpu/x86/x86.factor
@@ -562,6 +562,12 @@ MACRO: available-reps ( alist -- )
     reverse [ { } ] suffix
     '[ _ cond ] ;
 
+M: x86 %alien-vector-reps
+    {
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
+    } available-reps ; ! predicate/rep-list pairs: float-4-rep is listed under sse?, the double and integer reps under sse2?
+
 M: x86 %zero-vector
     {
         { double-2-rep [ dup XORPD ] }

From 588899a1b3814194febf4e4701db2aadc67a419c Mon Sep 17 00:00:00 2001
From: Joe Groff <arcata@gmail.com>
Date: Sat, 10 Oct 2009 13:01:13 -0500
Subject: [PATCH 07/13] fix fallbacks for vall?, vany?, vnone?

---
 basis/math/vectors/vectors.factor | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/basis/math/vectors/vectors.factor b/basis/math/vectors/vectors.factor
index 4cb03af44c..0a984ba2e7 100644
--- a/basis/math/vectors/vectors.factor
+++ b/basis/math/vectors/vectors.factor
@@ -123,9 +123,9 @@ M: simd-128 vshuffle ( u perm -- v )
 : vxor ( u v -- w )  over '[ [ _ element>bool ] bi@ xor ] 2map ;
 : vnot ( u -- w )    dup '[ _ element>bool not ] map ;
 
-: vall? ( v -- ? ) [ ] all? ;
-: vany? ( v -- ? ) [ ] any? ;
-: vnone? ( v -- ? ) [ not ] all? ;
+: vall? ( v -- ? ) dup '[ _ element>bool ] all? ; ! each element goes through element>bool (with v itself as the fried argument) before all?
+: vany? ( v -- ? ) dup '[ _ element>bool ] any? ; ! same conversion, tested with any?
+: vnone? ( v -- ? ) dup '[ _ element>bool not ] all? ; ! true when every converted element is false
 
 : v<  ( u v -- w ) [ <   ] 2map ;
 : v<= ( u v -- w ) [ <=  ] 2map ;

From 4e4be608840d50f152d4f1643c5f06f2b7899a12 Mon Sep 17 00:00:00 2001
From: Joe Groff <arcata@gmail.com>
Date: Sat, 10 Oct 2009 13:13:53 -0500
Subject: [PATCH 08/13] use TEST reg, reg to compare integer equality with zero

---
 basis/cpu/x86/x86.factor | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/basis/cpu/x86/x86.factor b/basis/cpu/x86/x86.factor
index a163ba6482..8612acdcff 100644
--- a/basis/cpu/x86/x86.factor
+++ b/basis/cpu/x86/x86.factor
@@ -432,8 +432,13 @@ M: x86 %epilogue ( n -- ) cell - incr-stack-reg ;
     temp 0 MOV \ t rc-absolute-cell rel-immediate
     dst temp word execute ; inline
 
+: (%compare) ( src1 src2 cc -- )
+    2over [ { cc= cc/= } member? ] [ register? ] [ 0 = ] tri* and and ! tri* consumes cc plus the copies of src1/src2, leaving src1 src2 ?
+    [ drop dup TEST ] ! cc is cc=/cc/=, src1 is a register and src2 is 0: drop src2 and TEST src1 against itself
+    [ CMP ] if ; ! any other case: plain CMP of src1 with src2
+
 M:: x86 %compare ( dst src1 src2 cc temp -- )
-    src1 src2 CMP
+    src1 src2 cc (%compare)
     cc order-cc {
         { cc<  [ dst temp \ CMOVL %boolean ] }
         { cc<= [ dst temp \ CMOVLE %boolean ] }
@@ -447,7 +452,7 @@ M: x86 %compare-imm ( dst src1 src2 cc temp -- )
     %compare ;
 
 M:: x86 %compare-branch ( label src1 src2 cc -- )
-    src1 src2 CMP
+    src1 src2 cc (%compare)
     cc order-cc {
         { cc<  [ label JL ] }
         { cc<= [ label JLE ] }

From 584cd44856bb42e5dd81ddd0041eaa16a43e2ca9 Mon Sep 17 00:00:00 2001
From: Doug Coleman <doug.coleman@gmail.com>
Date: Sat, 10 Oct 2009 14:02:35 -0500
Subject: [PATCH 09/13] fix the tuple subclassing example docs

---
 core/classes/tuple/tuple-docs.factor | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/core/classes/tuple/tuple-docs.factor b/core/classes/tuple/tuple-docs.factor
index daa275e2a7..cc67075168 100644
--- a/core/classes/tuple/tuple-docs.factor
+++ b/core/classes/tuple/tuple-docs.factor
@@ -129,17 +129,18 @@ ARTICLE: "tuple-inheritance-example" "Tuple subclassing example"
 }
 "Rectangles and parallelograms use the same algorithm for computing the area, whereas they use different algorithms for computing perimiter. Also, rectangles and parallelograms both have " { $snippet "width" } " and " { $snippet "height" } " slots. We can exploit this with subclassing:"
 { $code
+    "USING: math.constants math.functions ;"
     "GENERIC: area ( shape -- n )"
     "GENERIC: perimiter ( shape -- n )"
     ""
     "TUPLE: shape ;"
     ""
     "TUPLE: circle < shape radius ;"
-    "M: area circle radius>> sq pi * ;"
-    "M: perimiter circle radius>> 2 * pi * ;"
+    "M: circle area radius>> sq pi * ;"
+    "M: circle perimiter radius>> 2 * pi * ;"
     ""
-    "TUPLE: quad < shape width height"
-    "M: area quad [ width>> ] [ height>> ] bi * ;"
+    "TUPLE: quad < shape width height ;"
+    "M: quad area [ width>> ] [ height>> ] bi * ;"
     ""
     "TUPLE: rectangle < quad ;"
     "M: rectangle perimiter [ width>> 2 * ] [ height>> 2 * ] bi + ;"

From 50d4eb27bf2e4132952f8c1e0d4a50c59d51c156 Mon Sep 17 00:00:00 2001
From: Doug Coleman <doug.coleman@gmail.com>
Date: Sat, 10 Oct 2009 14:04:22 -0500
Subject: [PATCH 10/13] add missing usings

---
 core/classes/tuple/tuple-docs.factor | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/classes/tuple/tuple-docs.factor b/core/classes/tuple/tuple-docs.factor
index cc67075168..45d3931448 100644
--- a/core/classes/tuple/tuple-docs.factor
+++ b/core/classes/tuple/tuple-docs.factor
@@ -129,7 +129,7 @@ ARTICLE: "tuple-inheritance-example" "Tuple subclassing example"
 }
 "Rectangles and parallelograms use the same algorithm for computing the area, whereas they use different algorithms for computing perimiter. Also, rectangles and parallelograms both have " { $snippet "width" } " and " { $snippet "height" } " slots. We can exploit this with subclassing:"
 { $code
-    "USING: math.constants math.functions ;"
+    "USING: accessors kernel math math.constants math.functions ;"
     "GENERIC: area ( shape -- n )"
     "GENERIC: perimiter ( shape -- n )"
     ""

From 2a24e30a242be0bd30a6540d3f3cde4676bdfb21 Mon Sep 17 00:00:00 2001
From: Joe Groff <arcata@gmail.com>
Date: Sat, 10 Oct 2009 17:45:15 -0500
Subject: [PATCH 11/13] store math.matrices.simd matrices in column-major order
 so that m4.v, frustum, and translation construction are faster

---
 extra/math/matrices/simd/simd-tests.factor |  68 +++++-----
 extra/math/matrices/simd/simd.factor       | 142 +++++++++++----------
 2 files changed, 106 insertions(+), 104 deletions(-)

diff --git a/extra/math/matrices/simd/simd-tests.factor b/extra/math/matrices/simd/simd-tests.factor
index 3798c3e98e..965c2bddb5 100644
--- a/extra/math/matrices/simd/simd-tests.factor
+++ b/extra/math/matrices/simd/simd-tests.factor
@@ -52,10 +52,10 @@ IN: math.matrices.simd.tests
 [ 
     S{ matrix4 f
         float-4-array{
-            float-4{ 1.0 0.0 0.0 3.0 }
-            float-4{ 0.0 1.0 0.0 4.0 }
-            float-4{ 0.0 0.0 1.0 2.0 }
-            float-4{ 0.0 0.0 0.0 1.0 }
+            float-4{ 1.0 0.0 0.0 0.0 }
+            float-4{ 0.0 1.0 0.0 0.0 }
+            float-4{ 0.0 0.0 1.0 0.0 }
+            float-4{ 3.0 4.0 2.0 1.0 }
         }
     }
 ] [ float-4{ 3.0 4.0 2.0 0.0 } translation-matrix4 ] unit-test
@@ -77,9 +77,9 @@ IN: math.matrices.simd.tests
     float-4{ 0.0 1.0 0.0 1.0 } pi 1/2. * rotation-matrix4
     S{ matrix4 f
         float-4-array{
-            float-4{  0.0  0.0  1.0 0.0 }
+            float-4{  0.0  0.0 -1.0 0.0 }
             float-4{  0.0  1.0  0.0 0.0 }
-            float-4{ -1.0  0.0  0.0 0.0 }
+            float-4{  1.0  0.0  0.0 0.0 }
             float-4{  0.0  0.0  0.0 1.0 }
         }
     }
@@ -89,10 +89,10 @@ IN: math.matrices.simd.tests
 [
     S{ matrix4 f
         float-4-array{
-            float-4{ 2.0 0.0 0.0 10.0 }
-            float-4{ 0.0 3.0 0.0 18.0 }
-            float-4{ 0.0 0.0 4.0 28.0 }
-            float-4{ 0.0 0.0 0.0  1.0 }
+            float-4{  2.0  0.0  0.0  0.0 }
+            float-4{  0.0  3.0  0.0  0.0 }
+            float-4{  0.0  0.0  4.0  0.0 }
+            float-4{ 10.0 18.0 28.0  1.0 }
         }
     }
 ] [
@@ -106,10 +106,10 @@ IN: math.matrices.simd.tests
     }
     S{ matrix4 f
         float-4-array{
-            float-4{ 1.0 0.0 0.0 5.0 }
-            float-4{ 0.0 1.0 0.0 6.0 }
-            float-4{ 0.0 0.0 1.0 7.0 }
-            float-4{ 0.0 0.0 0.0 1.0 }
+            float-4{ 1.0 0.0 0.0 0.0 }
+            float-4{ 0.0 1.0 0.0 0.0 }
+            float-4{ 0.0 0.0 1.0 0.0 }
+            float-4{ 5.0 6.0 7.0 1.0 }
         }
     }
     m4.
@@ -118,10 +118,10 @@ IN: math.matrices.simd.tests
 [
     S{ matrix4 f
         float-4-array{
-            float-4{ 3.0 0.0 0.0 5.0 }
-            float-4{ 0.0 4.0 0.0 6.0 }
-            float-4{ 0.0 0.0 5.0 7.0 }
-            float-4{ 0.0 0.0 0.0 2.0 }
+            float-4{ 3.0 0.0 0.0 0.0 }
+            float-4{ 0.0 4.0 0.0 0.0 }
+            float-4{ 0.0 0.0 5.0 0.0 }
+            float-4{ 5.0 6.0 7.0 2.0 }
         }
     }
 ] [
@@ -135,10 +135,10 @@ IN: math.matrices.simd.tests
     }
     S{ matrix4 f
         float-4-array{
-            float-4{ 1.0 0.0 0.0 5.0 }
-            float-4{ 0.0 1.0 0.0 6.0 }
-            float-4{ 0.0 0.0 1.0 7.0 }
-            float-4{ 0.0 0.0 0.0 1.0 }
+            float-4{ 1.0 0.0 0.0 0.0 }
+            float-4{ 0.0 1.0 0.0 0.0 }
+            float-4{ 0.0 0.0 1.0 0.0 }
+            float-4{ 5.0 6.0 7.0 1.0 }
         }
     }
     m4+
@@ -147,10 +147,10 @@ IN: math.matrices.simd.tests
 [
     S{ matrix4 f
         float-4-array{
-            float-4{ 1.0 0.0 0.0 -5.0 }
-            float-4{ 0.0 2.0 0.0 -6.0 }
-            float-4{ 0.0 0.0 3.0 -7.0 }
-            float-4{ 0.0 0.0 0.0  0.0 }
+            float-4{  1.0  0.0  0.0 0.0 }
+            float-4{  0.0  2.0  0.0 0.0 }
+            float-4{  0.0  0.0  3.0 0.0 }
+            float-4{ -5.0 -6.0 -7.0 0.0 }
         }
     }
 ] [
@@ -164,10 +164,10 @@ IN: math.matrices.simd.tests
     }
     S{ matrix4 f
         float-4-array{
-            float-4{ 1.0 0.0 0.0 5.0 }
-            float-4{ 0.0 1.0 0.0 6.0 }
-            float-4{ 0.0 0.0 1.0 7.0 }
-            float-4{ 0.0 0.0 0.0 1.0 }
+            float-4{ 1.0 0.0 0.0 0.0 }
+            float-4{ 0.0 1.0 0.0 0.0 }
+            float-4{ 0.0 0.0 1.0 0.0 }
+            float-4{ 5.0 6.0 7.0 1.0 }
         }
     }
     m4-
@@ -219,10 +219,10 @@ IN: math.matrices.simd.tests
 [
     S{ matrix4 f
         float-4-array{
-            float-4{ 1/2. 0.0   0.0    0.0  }
-            float-4{ 0.0  1/2.  0.0    0.0  }
-            float-4{ 0.0  0.0  -6/4. -10/4. }
-            float-4{ 0.0  0.0  -1.0    0.0  }
+            float-4{ 1/2. 0.0   0.0   0.0 }
+            float-4{ 0.0  1/2.  0.0   0.0 }
+            float-4{ 0.0  0.0  -6/4. -1.0 }
+            float-4{ 0.0  0.0 -10/4.  0.0 }
         }
     }
 ] [
diff --git a/extra/math/matrices/simd/simd.factor b/extra/math/matrices/simd/simd.factor
index da1149dfec..edbe77781f 100644
--- a/extra/math/matrices/simd/simd.factor
+++ b/extra/math/matrices/simd/simd.factor
@@ -9,34 +9,34 @@ SPECIALIZED-ARRAY: float-4
 IN: math.matrices.simd
 
 STRUCT: matrix4
-    { rows float-4[4] } ;
+    { columns float-4[4] } ;
 
 INSTANCE: matrix4 immutable-sequence
 
 M: matrix4 length drop 4 ; inline
-M: matrix4 nth-unsafe rows>> nth-unsafe ; inline
+M: matrix4 nth-unsafe columns>> nth-unsafe ; inline
 M: matrix4 new-sequence 2drop matrix4 (struct) ; inline
 
 <PRIVATE
 
-: rows ( a -- a1 a2 a3 a4 )
-    rows>> 4 firstn ; inline
+: columns ( a -- a1 a2 a3 a4 )
+    columns>> 4 firstn ; inline
 
-:: set-rows ( c1 c2 c3 c4 c -- c )
-    c rows>> :> rows
-    c1 rows set-first
-    c2 rows set-second
-    c3 rows set-third
-    c4 rows set-fourth
+:: set-columns ( c1 c2 c3 c4 c -- c )
+    c columns>> :> columns
+    c1 columns set-first
+    c2 columns set-second
+    c3 columns set-third
+    c4 columns set-fourth
     c ; inline
 
 : make-matrix4 ( quot: ( -- c1 c2 c3 c4 ) -- c )
-    matrix4 (struct) swap dip set-rows ; inline
+    matrix4 (struct) swap dip set-columns ; inline
 
-:: 2map-rows ( a b quot -- c )
+:: 2map-columns ( a b quot -- c )
     [
-        a rows :> a4 :> a3 :> a2 :> a1
-        b rows :> b4 :> b3 :> b2 :> b1
+        a columns :> a4 :> a3 :> a2 :> a1
+        b columns :> b4 :> b3 :> b2 :> b1
 
         a1 b1 quot call
         a2 b2 quot call
@@ -44,57 +44,57 @@ M: matrix4 new-sequence 2drop matrix4 (struct) ; inline
         a4 b4 quot call
     ] make-matrix4 ; inline
 
-: map-rows ( a quot -- c )
-    '[ rows _ 4 napply ] make-matrix4 ; inline
+: map-columns ( a quot -- c )
+    '[ columns _ 4 napply ] make-matrix4 ; inline
     
 PRIVATE>
 
-TYPED: m4+ ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v+ ] 2map-rows ;
-TYPED: m4- ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v- ] 2map-rows ;
-TYPED: m4* ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v* ] 2map-rows ;
-TYPED: m4/ ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v/ ] 2map-rows ;
+TYPED: m4+ ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v+ ] 2map-columns ;
+TYPED: m4- ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v- ] 2map-columns ;
+TYPED: m4* ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v* ] 2map-columns ;
+TYPED: m4/ ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v/ ] 2map-columns ;
 
-TYPED: m4*n ( a: matrix4 b: float -- c: matrix4 ) [ v*n ] curry map-rows ;
-TYPED: m4/n ( a: matrix4 b: float -- c: matrix4 ) [ v/n ] curry map-rows ;
-TYPED: n*m4 ( a: float b: matrix4 -- c: matrix4 ) [ n*v ] with map-rows ;
-TYPED: n/m4 ( a: float b: matrix4 -- c: matrix4 ) [ n/v ] with map-rows ;
+TYPED: m4*n ( a: matrix4 b: float -- c: matrix4 ) [ v*n ] curry map-columns ;
+TYPED: m4/n ( a: matrix4 b: float -- c: matrix4 ) [ v/n ] curry map-columns ;
+TYPED: n*m4 ( a: float b: matrix4 -- c: matrix4 ) [ n*v ] with map-columns ;
+TYPED: n/m4 ( a: float b: matrix4 -- c: matrix4 ) [ n/v ] with map-columns ;
 
 TYPED:: m4. ( a: matrix4 b: matrix4 -- c: matrix4 )
     [
-        a rows :> a4 :> a3 :> a2 :> a1
-        b rows :> b4 :> b3 :> b2 :> b1
+        a columns :> a4 :> a3 :> a2 :> a1
+        b columns :> b4 :> b3 :> b2 :> b1
 
-        a1 first  b1 n*v :> c1a
-        a2 first  b1 n*v :> c2a
-        a3 first  b1 n*v :> c3a
-        a4 first  b1 n*v :> c4a
+        b1 first  a1 n*v :> c1a
+        b2 first  a1 n*v :> c2a
+        b3 first  a1 n*v :> c3a
+        b4 first  a1 n*v :> c4a
 
-        a1 second b2 n*v c1a v+ :> c1b 
-        a2 second b2 n*v c2a v+ :> c2b
-        a3 second b2 n*v c3a v+ :> c3b
-        a4 second b2 n*v c4a v+ :> c4b
+        b1 second a2 n*v c1a v+ :> c1b 
+        b2 second a2 n*v c2a v+ :> c2b
+        b3 second a2 n*v c3a v+ :> c3b
+        b4 second a2 n*v c4a v+ :> c4b
 
-        a1 third  b3 n*v c1b v+ :> c1c 
-        a2 third  b3 n*v c2b v+ :> c2c
-        a3 third  b3 n*v c3b v+ :> c3c
-        a4 third  b3 n*v c4b v+ :> c4c
+        b1 third  a3 n*v c1b v+ :> c1c 
+        b2 third  a3 n*v c2b v+ :> c2c
+        b3 third  a3 n*v c3b v+ :> c3c
+        b4 third  a3 n*v c4b v+ :> c4c
 
-        a1 fourth b4 n*v c1c v+
-        a2 fourth b4 n*v c2c v+
-        a3 fourth b4 n*v c3c v+
-        a4 fourth b4 n*v c4c v+
+        b1 fourth a4 n*v c1c v+
+        b2 fourth a4 n*v c2c v+
+        b3 fourth a4 n*v c3c v+
+        b4 fourth a4 n*v c4c v+
     ] make-matrix4 ;
 
-TYPED:: v.m4 ( a: float-4 b: matrix4 -- c: float-4 )
-    b rows :> b4 :> b3 :> b2 :> b1
+TYPED:: m4.v ( m: matrix4 v: float-4 -- v': float-4 )
+    m columns :> m4 :> m3 :> m2 :> m1 ! columns leaves four vectors; m4 is bound first because it is on top
     
-    a first  b1 n*v
-    a second b2 n*v v+
-    a third  b3 n*v v+
-    a fourth b4 n*v v+ ;
+    v first  m1 n*v
+    v second m2 n*v v+
+    v third  m3 n*v v+
+    v fourth m4 n*v v+ ; ! result is the sum of the four columns, each scaled by the matching element of v
 
-TYPED:: m4.v ( a: matrix4 b: float-4 -- c: float-4 )
-    a rows [ b v. ] 4 napply float-4-boa ;
+TYPED:: v.m4 ( v: float-4 m: matrix4 -- c: float-4 )
+    m columns [ v v. ] 4 napply float-4-boa ; ! one dot product of v with each of the four columns, packed into a float-4
 
 CONSTANT: identity-matrix4
     S{ matrix4 f
@@ -131,37 +131,37 @@ TYPED: diagonal-matrix4 ( diagonal: float-4 -- matrix: matrix4 )
     [ (vmerge) ] bi-curry@ bi* ; inline
 
 TYPED: transpose-matrix4 ( matrix: matrix4 -- matrix: matrix4 )
-    [ rows vmerge-transpose vmerge-transpose ] make-matrix4 ;
+    [ columns vmerge-transpose vmerge-transpose ] make-matrix4 ;
+
+: linear>homogeneous ( v -- v' )
+    [ float-4{ t t t f } ] dip float-4{ 0.0 0.0 0.0 1.0 } v? ; inline ! mask keeps the first three lanes of v and takes 1.0 for the fourth
 
 : scale-matrix4 ( factors -- matrix )
-    [ float-4{ t t t f } ] dip float-4{ 0.0 0.0 0.0 1.0 } v?
-    diagonal-matrix4 ; inline
+    linear>homogeneous diagonal-matrix4 ; inline
 
 : ortho-matrix4 ( factors -- matrix )
     float-4{ 1.0 1.0 1.0 1.0 } swap v/ scale-matrix4 ; inline
 
-TYPED:: translation-matrix4 ( offset: float-4 -- matrix: matrix4 )
+TYPED: translation-matrix4 ( offset: float-4 -- matrix: matrix4 )
     [
-        float-4{ 1.0 1.0 1.0 1.0 } :> diagonal
-
-        offset 0 float-4-with (vmerge)
-        [ 0 float-4-with swap (vmerge) ] bi@ drop :> z :> y :> x
-
-        diagonal y vmerge-diagonal*
-        [ x vmerge-diagonal* ]
-        [ z vmerge-diagonal* ] bi*
+        linear>homogeneous ! the converted offset stays on top and so becomes the fourth column
+        [ ! the three literal columns below are pushed underneath it by dip
+            float-4{ 1.0 0.0 0.0 0.0 }
+            float-4{ 0.0 1.0 0.0 0.0 }
+            float-4{ 0.0 0.0 1.0 0.0 }
+        ] dip
     ] make-matrix4 ;
 
 TYPED:: rotation-matrix4 ( axis: float-4 theta: float -- matrix: matrix4 )
-    !   x*x + c*(1.0 - x*x)   x*y*(1.0 - c) - s*z   x*z*(1.0 - c) + s*y   0
-    !   x*y*(1.0 - c) + s*z   y*y + c*(1.0 - y*y)   y*z*(1.0 - c) - s*x   0
-    !   x*z*(1.0 - c) - s*y   y*z*(1.0 - c) + s*x   z*z + c*(1.0 - z*z)   0
+    !   x*x + c*(1.0 - x*x)   x*y*(1.0 - c) + s*z   x*z*(1.0 - c) - s*y   0
+    !   x*y*(1.0 - c) - s*z   y*y + c*(1.0 - y*y)   y*z*(1.0 - c) + s*x   0
+    !   x*z*(1.0 - c) + s*y   y*z*(1.0 - c) - s*x   z*z + c*(1.0 - z*z)   0
     !   0                     0                     0                     1
     matrix4 (struct) :> triangle-m
     theta cos :> c
     theta sin :> s
 
-    float-4{  1.0 -1.0  1.0 0.0 } :> triangle-sign
+    float-4{ -1.0  1.0 -1.0 0.0 } :> triangle-sign
 
     c float-4-with :> cc
     s float-4-with :> ss
@@ -184,7 +184,7 @@ TYPED:: rotation-matrix4 ( axis: float-4 theta: float -- matrix: matrix4 )
     triangle-lo { 1 0 3 3 } vshuffle
     float-4 new
 
-    triangle-m set-rows drop
+    triangle-m set-columns drop
 
     diagonal-m triangle-m m4+ ;
 
@@ -194,8 +194,10 @@ TYPED:: frustum-matrix4 ( xy: float-4 near: float far: float -- matrix: matrix4
         float-4{ t t f f } xy near far - float-4-with v? ! denom
         v/ :> fov
         
-        fov 0.0 float-4-with (vmerge-head) vmerge-diagonal
-        fov float-4{ f f t t } vand
-        float-4{ 0.0 0.0 -1.0 0.0 }
+        float-4{ 0.0 -1.0 0.0 0.0 } :> negone
+
+        fov vmerge-diagonal
+        [ vmerge-diagonal ]
+        [ negone (vmerge) ] bi*
     ] make-matrix4 ;
 

From 9548b7bdd8cacf4ae3a32b209d786d19e9234dcf Mon Sep 17 00:00:00 2001
From: Doug Coleman <doug.coleman@gmail.com>
Date: Mon, 12 Oct 2009 16:55:52 -0500
Subject: [PATCH 12/13] fix a bug when printing calendars, add a unit test

---
 basis/calendar/format/format-tests.factor | 4 +++-
 basis/calendar/format/format.factor       | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/basis/calendar/format/format-tests.factor b/basis/calendar/format/format-tests.factor
index f8864351a4..cb1ff0b60f 100644
--- a/basis/calendar/format/format-tests.factor
+++ b/basis/calendar/format/format-tests.factor
@@ -1,5 +1,5 @@
 USING: calendar.format calendar kernel math tools.test
-io.streams.string accessors io math.order ;
+io.streams.string accessors io math.order sequences ;
 IN: calendar.format.tests
 
 [ 0 ] [
@@ -81,3 +81,5 @@ IN: calendar.format.tests
 ] [ "Thursday, 02-Oct-2008 23:59:59 GMT" cookie-string>timestamp ] unit-test
 
 
+[ ] ! expected result: nothing left on the stack
+[ { 2008 2009 } [ year. ] each ] unit-test ! runs year. for 2008 and 2009
diff --git a/basis/calendar/format/format.factor b/basis/calendar/format/format.factor
index 6aa4126ff9..d07d74722a 100644
--- a/basis/calendar/format/format.factor
+++ b/basis/calendar/format/format.factor
@@ -66,7 +66,7 @@ M: array month. ( pair -- )
     [ month-name write bl number>string print ]
     [ 1 zeller-congruence ]
     [ (days-in-month) day-abbreviations2 " " join print ] 2tri
-    over "   " <repetition> concat write
+    over "   " <repetition> "" concat-as write
     [
         [ 1 + day. ] keep
         1 + + 7 mod zero? [ nl ] [ bl ] if

From 48f479950540b2b73ff3a221a1c4f46a801e2699 Mon Sep 17 00:00:00 2001
From: Slava Pestov <slava@shill.local>
Date: Tue, 13 Oct 2009 05:13:22 -0500
Subject: [PATCH 13/13] math.vectors.simd: fix typos in docs reported by Ed
 Swartz

---
 basis/math/vectors/simd/simd-docs.factor | 27 ++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/basis/math/vectors/simd/simd-docs.factor b/basis/math/vectors/simd/simd-docs.factor
index 97ff99cd28..2fbe823965 100644
--- a/basis/math/vectors/simd/simd-docs.factor
+++ b/basis/math/vectors/simd/simd-docs.factor
@@ -41,7 +41,21 @@ $nl
     POSTPONE: SIMD:
     POSTPONE: SIMDS:
 }
-"The following vector types are supported:"
+"The following scalar types are supported:"
+{ $code
+    "char"
+    "uchar"
+    "short"
+    "ushort"
+    "int"
+    "uint"
+    "longlong"
+    "ulonglong"
+    "float"
+    "double"
+}
+
+"The following vector types are generated from the above scalar types:"
 { $code
     "char-16"
     "uchar-16"
@@ -89,6 +103,7 @@ $nl
 { $code
 """USING: compiler.tree.debugger math.vectors
 math.vectors.simd ;
+SIMD: double
 SYMBOLS: x y ;
 
 [
@@ -107,7 +122,7 @@ IN: simd-demo
     { float-4 float-4 float-4 } declare
     [ v* ] [ [ 1.0 ] dip n-v v* ] bi-curry* bi v+ ;
 
-\ interpolate optimizer-report.""" }
+\\ interpolate optimizer-report.""" }
 "Note that using " { $link declare } " is not recommended. Safer ways of getting type information for the input parameters to a word include defining methods on a generic word (the value being dispatched upon has a statically known type in the method body), as well as using " { $link "hints" } " and " { $link POSTPONE: inline } " declarations."
 $nl
 "Here is a better version of the " { $snippet "interpolate" } " words above that uses hints:"
@@ -122,7 +137,7 @@ IN: simd-demo
 
 HINTS: interpolate float-4 float-4 float-4 ;
 
-\ interpolate optimizer-report. """ }
+\\ interpolate optimizer-report. """ }
 "This time, the optimizer report lists calls to both SIMD primitives and high-level vector words, because hints cause two code paths to be generated. The " { $snippet "optimized." } " word can be used to make sure that the fast code path consists entirely of calls to primitives."
 $nl
 "If the " { $snippet "interpolate" } " word was to be used in several places with different types of vectors, it would be best to declare it " { $link POSTPONE: inline } "."
@@ -153,13 +168,13 @@ M: actor advance ( dt actor -- )
     [ >float ] dip
     [ update-velocity ] [ update-position ] 2bi ;
 
-M\ actor advance optimized."""
+M\\ actor advance optimized."""
 }
 "The " { $vocab-link "compiler.cfg.debugger" } " vocabulary can give a lower-level picture of the generated code, that includes register assignments and other low-level details. To look at low-level optimizer output, call " { $snippet "test-mr mr." } " on a word or quotation:"
 { $code
 """USE: compiler.tree.debugger
 
-M\ actor advance test-mr mr.""" }
+M\\ actor advance test-mr mr.""" }
 "An example of a high-performance algorithm that uses SIMD primitives can be found in the " { $vocab-link "benchmark.nbody-simd" } " vocabulary." ;
 
 ARTICLE: "math.vectors.simd.intrinsics" "Low-level SIMD primitives"
@@ -206,7 +221,7 @@ ARTICLE: "math.vectors.simd" "Hardware vector arithmetic (SIMD)"
 HELP: SIMD:
 { $syntax "SIMD: type" }
 { $values { "type" "a scalar C type" } }
-{ $description "Defines 128-bit and 256-bit SIMD arrays for holding elements of " { $snippet "type" } " into the vocabulary search path. The possible type/length combinations are listed in " { $link "math.vectors.simd.types" } " and the generated words are documented in " { $link "math.vectors.simd.words" } "." } ;
+{ $description "Defines 128-bit and 256-bit SIMD arrays for holding elements of " { $snippet "type" } " into the vocabulary search path. The allowed scalar types, and the auto-generated type/length vector combinations that result, are listed in " { $link "math.vectors.simd.types" } ". Generated words are documented in " { $link "math.vectors.simd.words" } "." } ;
 
 HELP: SIMDS:
 { $syntax "SIMDS: type type type ... ;" }