Merge branch 'master' of git://factorcode.org/git/factor into new_gc

2009-10-13 06:57:37 -05:00 · 2009-10-13 06:57:37 -05:00 · e81e076935
parent a280ccfe1c 48f4799505
commit e81e076935
22 changed files with 338 additions and 187 deletions
--- a/basis/calendar/format/format-tests.factor
+++ b/basis/calendar/format/format-tests.factor
@ -1,5 +1,5 @@
 USING: calendar.format calendar kernel math tools.test
-io.streams.string accessors io math.order ;
+io.streams.string accessors io math.order sequences ;
 IN: calendar.format.tests

 [ 0 ] [
@ -81,3 +81,5 @@ IN: calendar.format.tests
 ] [ "Thursday, 02-Oct-2008 23:59:59 GMT" cookie-string>timestamp ] unit-test


+[ ]
+[ { 2008 2009 } [ year. ] each ] unit-test
--- a/basis/calendar/format/format.factor
+++ b/basis/calendar/format/format.factor
@ -66,7 +66,7 @@ M: array month. ( pair -- )
    [ month-name write bl number>string print ]
    [ 1 zeller-congruence ]
    [ (days-in-month) day-abbreviations2 " " join print ] 2tri
-    over "   " <repetition> concat write
+    over "   " <repetition> "" concat-as write
    [
        [ 1 + day. ] keep
        1 + + 7 mod zero? [ nl ] [ bl ] if
--- a/basis/compiler/cfg/instructions/instructions.factor
+++ b/basis/compiler/cfg/instructions/instructions.factor
@ -277,6 +277,11 @@ literal: rep ;

 PURE-INSN: ##shuffle-vector
 def: dst
+use: src shuffle
+literal: rep ;
+
+PURE-INSN: ##shuffle-vector-imm
+def: dst
 use: src
 literal: shuffle rep ;

--- a/basis/compiler/cfg/intrinsics/intrinsics.factor
+++ b/basis/compiler/cfg/intrinsics/intrinsics.factor
@ -194,7 +194,8 @@ IN: compiler.cfg.intrinsics
        { math.vectors.simd.intrinsics:(simd-with) [ [ ^^with-vector ] emit-unary-vector-op ] }
        { math.vectors.simd.intrinsics:(simd-gather-2) [ emit-gather-vector-2 ] }
        { math.vectors.simd.intrinsics:(simd-gather-4) [ emit-gather-vector-4 ] }
-        { math.vectors.simd.intrinsics:(simd-vshuffle) [ emit-shuffle-vector ] }
+        { math.vectors.simd.intrinsics:(simd-vshuffle-elements) [ emit-shuffle-vector ] }
+        { math.vectors.simd.intrinsics:(simd-vshuffle-bytes) [ emit-shuffle-vector-var ] }
        { math.vectors.simd.intrinsics:(simd-(vmerge-head)) [ [ ^^merge-vector-head ] emit-binary-vector-op ] }
        { math.vectors.simd.intrinsics:(simd-(vmerge-tail)) [ [ ^^merge-vector-tail ] emit-binary-vector-op ] }
        { math.vectors.simd.intrinsics:(simd-(v>float)) [ [ ^^integer>float-vector ] emit-unary-vector-op ] }
--- a/basis/compiler/cfg/intrinsics/simd/simd.factor
+++ b/basis/compiler/cfg/intrinsics/simd/simd.factor
@ -1,15 +1,16 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors byte-arrays fry cpu.architecture kernel math
-sequences math.vectors.simd.intrinsics macros generalizations
-combinators combinators.short-circuit arrays locals
+USING: accessors alien byte-arrays fry classes.algebra
+cpu.architecture kernel math sequences math.vectors
+math.vectors.simd.intrinsics macros generalizations combinators
+combinators.short-circuit arrays locals
 compiler.tree.propagation.info compiler.cfg.builder.blocks
 compiler.cfg.comparisons
 compiler.cfg.stacks compiler.cfg.stacks.local compiler.cfg.hats
 compiler.cfg.instructions compiler.cfg.registers
 compiler.cfg.intrinsics.alien
 specialized-arrays ;
-FROM: alien.c-types => float double ;
+FROM: alien.c-types => heap-size char uchar float double ;
 SPECIALIZED-ARRAYS: float double ;
 IN: compiler.cfg.intrinsics.simd

@ -21,7 +22,7 @@ MACRO: check-elements ( quots -- )

 MACRO: if-literals-match ( quots -- )
    [ length ] [ ] [ length ] tri
-    ! n quots n n
+    ! n quots n
    '[
        ! node quot
        [
@ -77,15 +78,41 @@ MACRO: if-literals-match ( quots -- )

 : shuffle? ( obj -- ? ) { [ array? ] [ [ integer? ] all? ] } 1&& ;

-: emit-shuffle-vector ( node -- )
-    ! Pad the permutation with zeroes if its too short, since we
+: >variable-shuffle ( shuffle rep -- shuffle' )
+    rep-component-type heap-size
+    [ dup <repetition> >byte-array ]
+    [ iota >byte-array ] bi
+    '[ _ n*v _ v+ ] map concat ;
+
+: generate-shuffle-vector-imm ( src shuffle rep -- dst )
+    dup %shuffle-vector-imm-reps member?
+    [ ^^shuffle-vector-imm ]
+    [
+        [ >variable-shuffle ^^load-constant ] keep
+        ^^shuffle-vector
+    ] if ;
+
+: emit-shuffle-vector-imm ( node -- )
+    ! Pad the permutation with zeroes if it's too short, since we
    ! can't throw an error at this point.
-    [ [ rep-components 0 pad-tail ] keep ^^shuffle-vector ] [unary/param]
+    [ [ rep-components 0 pad-tail ] keep generate-shuffle-vector-imm ] [unary/param]
    { [ shuffle? ] [ representation? ] } if-literals-match ;

+: emit-shuffle-vector-var ( node -- )
+    [ ^^shuffle-vector ] [binary]
+    { [ %shuffle-vector-reps member? ] } if-literals-match ;
+
+: emit-shuffle-vector ( node -- )
+    dup node-input-infos {
+        [ length 3 = ]
+        [ first  class>> byte-array class<= ]
+        [ second class>> byte-array class<= ]
+        [ third  literal>> representation?  ]
+    } 1&& [ emit-shuffle-vector-var ] [ emit-shuffle-vector-imm ] if ;
+
 : ^^broadcast-vector ( src n rep -- dst )
    [ rep-components swap <array> ] keep
-    ^^shuffle-vector ;
+    generate-shuffle-vector-imm ;

 : emit-broadcast-vector ( node -- )
    [ ^^broadcast-vector ] [unary/param]
@ -101,6 +128,9 @@ MACRO: if-literals-match ( quots -- )
    [ ^^select-vector ] [unary/param]
    { [ integer? ] [ representation? ] } if-literals-match ; inline

+: emit-alien-vector-op ( node quot: ( rep -- ) -- )
+    { [ %alien-vector-reps member? ] } if-literals-match ; inline
+
 : emit-alien-vector ( node -- )
    dup [
        '[
@ -108,7 +138,7 @@ MACRO: if-literals-match ( quots -- )
            _ ^^alien-vector ds-push
        ]
        [ inline-alien-getter? ] inline-alien
-    ] with emit-vector-op ;
+    ] with emit-alien-vector-op ;

 : emit-set-alien-vector ( node -- )
    dup [
@ -118,7 +148,7 @@ MACRO: if-literals-match ( quots -- )
        ]
        [ byte-array inline-alien-setter? ]
        inline-alien
-    ] with emit-vector-op ;
+    ] with emit-alien-vector-op ;

 : generate-not-vector ( src rep -- dst )
    dup %not-vector-reps member?
--- a/basis/compiler/cfg/value-numbering/rewrite/rewrite.factor
+++ b/basis/compiler/cfg/value-numbering/rewrite/rewrite.factor
@ -450,26 +450,26 @@ M: ##set-alien-vector rewrite rewrite-alien-addressing ;
 ! Some lame constant folding for SIMD intrinsics. Eventually this
 ! should be redone completely.

-: rewrite-shuffle-vector ( insn expr -- insn' )
+: rewrite-shuffle-vector-imm ( insn expr -- insn' )
    2dup [ rep>> ] bi@ eq? [
        [ [ dst>> ] [ src>> vn>vreg ] bi* ]
        [ [ shuffle>> ] bi@ nths ]
        [ drop rep>> ]
-        2tri \ ##shuffle-vector new-insn
+        2tri \ ##shuffle-vector-imm new-insn
    ] [ 2drop f ] if ;

-: (fold-shuffle-vector) ( shuffle bytes -- bytes' )
+: (fold-shuffle-vector-imm) ( shuffle bytes -- bytes' )
    2dup length swap length /i group nths concat ;

-: fold-shuffle-vector ( insn expr -- insn' )
+: fold-shuffle-vector-imm ( insn expr -- insn' )
    [ [ dst>> ] [ shuffle>> ] bi ] dip value>>
-    (fold-shuffle-vector) \ ##load-constant new-insn ;
+    (fold-shuffle-vector-imm) \ ##load-constant new-insn ;

-M: ##shuffle-vector rewrite
+M: ##shuffle-vector-imm rewrite
    dup src>> vreg>expr {
-        { [ dup shuffle-vector-expr? ] [ rewrite-shuffle-vector ] }
-        { [ dup reference-expr? ] [ fold-shuffle-vector ] }
-        { [ dup constant-expr? ] [ fold-shuffle-vector ] }
+        { [ dup shuffle-vector-imm-expr? ] [ rewrite-shuffle-vector-imm ] }
+        { [ dup reference-expr? ] [ fold-shuffle-vector-imm ] }
+        { [ dup constant-expr? ] [ fold-shuffle-vector-imm ] }
        [ 2drop f ]
    } cond ;

--- a/basis/compiler/cfg/value-numbering/simplify/simplify.factor
+++ b/basis/compiler/cfg/value-numbering/simplify/simplify.factor
@ -136,7 +136,7 @@ M: scalar>vector-expr simplify*
        [ drop f ]
    } cond ;

-M: shuffle-vector-expr simplify*
+M: shuffle-vector-imm-expr simplify*
    [ src>> ] [ shuffle>> ] [ rep>> rep-components iota ] tri
    sequence= [ drop f ] unless ;

--- a/basis/compiler/cfg/value-numbering/value-numbering-tests.factor
+++ b/basis/compiler/cfg/value-numbering/value-numbering-tests.factor
@ -1215,31 +1215,31 @@ cell 8 = [
    }
 ] [
    {
-        T{ ##shuffle-vector f 1 0 { 0 1 2 3 } float-4-rep }
+        T{ ##shuffle-vector-imm f 1 0 { 0 1 2 3 } float-4-rep }
    } value-numbering-step
 ] unit-test

 [
    {
-        T{ ##shuffle-vector f 1 0 { 1 2 3 0 } float-4-rep }
-        T{ ##shuffle-vector f 2 0 { 0 2 3 1 } float-4-rep }
+        T{ ##shuffle-vector-imm f 1 0 { 1 2 3 0 } float-4-rep }
+        T{ ##shuffle-vector-imm f 2 0 { 0 2 3 1 } float-4-rep }
    }
 ] [
    {
-        T{ ##shuffle-vector f 1 0 { 1 2 3 0 } float-4-rep }
-        T{ ##shuffle-vector f 2 1 { 3 1 2 0 } float-4-rep }
+        T{ ##shuffle-vector-imm f 1 0 { 1 2 3 0 } float-4-rep }
+        T{ ##shuffle-vector-imm f 2 1 { 3 1 2 0 } float-4-rep }
    } value-numbering-step
 ] unit-test

 [
    {
-        T{ ##shuffle-vector f 1 0 { 1 2 3 0 } float-4-rep }
-        T{ ##shuffle-vector f 2 1 { 1 0 } double-2-rep }
+        T{ ##shuffle-vector-imm f 1 0 { 1 2 3 0 } float-4-rep }
+        T{ ##shuffle-vector-imm f 2 1 { 1 0 } double-2-rep }
    }
 ] [
    {
-        T{ ##shuffle-vector f 1 0 { 1 2 3 0 } float-4-rep }
-        T{ ##shuffle-vector f 2 1 { 1 0 } double-2-rep }
+        T{ ##shuffle-vector-imm f 1 0 { 1 2 3 0 } float-4-rep }
+        T{ ##shuffle-vector-imm f 2 1 { 1 0 } double-2-rep }
    } value-numbering-step
 ] unit-test

@ -1253,7 +1253,7 @@ cell 8 = [
    {
        T{ ##load-constant f 0 $[ 55 tag-fixnum ] }
        T{ ##scalar>vector f 1 0 int-4-rep }
-        T{ ##shuffle-vector f 2 1 { 0 0 0 0 } float-4-rep }
+        T{ ##shuffle-vector-imm f 2 1 { 0 0 0 0 } float-4-rep }
    } value-numbering-step
 ] unit-test

@ -1267,7 +1267,7 @@ cell 8 = [
    {
        T{ ##load-constant f 0 1.25 }
        T{ ##scalar>vector f 1 0 float-4-rep }
-        T{ ##shuffle-vector f 2 1 { 0 0 0 0 } float-4-rep }
+        T{ ##shuffle-vector-imm f 2 1 { 0 0 0 0 } float-4-rep }
    } value-numbering-step
 ] unit-test

--- a/basis/compiler/codegen/codegen.factor
+++ b/basis/compiler/codegen/codegen.factor
@ -154,6 +154,7 @@ CODEGEN: ##zero-vector %zero-vector
 CODEGEN: ##fill-vector %fill-vector
 CODEGEN: ##gather-vector-2 %gather-vector-2
 CODEGEN: ##gather-vector-4 %gather-vector-4
+CODEGEN: ##shuffle-vector-imm %shuffle-vector-imm
 CODEGEN: ##shuffle-vector %shuffle-vector
 CODEGEN: ##tail>head-vector %tail>head-vector
 CODEGEN: ##merge-vector-head %merge-vector-head
--- a/basis/compiler/tree/propagation/simd/simd.factor
+++ b/basis/compiler/tree/propagation/simd/simd.factor
@ -31,7 +31,8 @@ IN: compiler.tree.propagation.simd
    (simd-vrshift)
    (simd-hlshift)
    (simd-hrshift)
-    (simd-vshuffle)
+    (simd-vshuffle-bytes)
+    (simd-vshuffle-elements)
    (simd-(vmerge-head))
    (simd-(vmerge-tail))
    (simd-(v>float))
--- a/basis/cpu/architecture/architecture.factor
+++ b/basis/cpu/architecture/architecture.factor
@ -242,6 +242,7 @@ HOOK: %fill-vector cpu ( dst rep -- )
 HOOK: %gather-vector-2 cpu ( dst src1 src2 rep -- )
 HOOK: %gather-vector-4 cpu ( dst src1 src2 src3 src4 rep -- )
 HOOK: %shuffle-vector cpu ( dst src shuffle rep -- )
+HOOK: %shuffle-vector-imm cpu ( dst src shuffle rep -- )
 HOOK: %tail>head-vector cpu ( dst src rep -- )
 HOOK: %merge-vector-head cpu ( dst src1 src2 rep -- )
 HOOK: %merge-vector-tail cpu ( dst src1 src2 rep -- )
@ -288,7 +289,9 @@ HOOK: %zero-vector-reps cpu ( -- reps )
 HOOK: %fill-vector-reps cpu ( -- reps )
 HOOK: %gather-vector-2-reps cpu ( -- reps )
 HOOK: %gather-vector-4-reps cpu ( -- reps )
+HOOK: %alien-vector-reps cpu ( -- reps )
 HOOK: %shuffle-vector-reps cpu ( -- reps )
+HOOK: %shuffle-vector-imm-reps cpu ( -- reps )
 HOOK: %merge-vector-reps cpu ( -- reps )
 HOOK: %signed-pack-vector-reps cpu ( -- reps )
 HOOK: %unsigned-pack-vector-reps cpu ( -- reps )
@ -328,7 +331,9 @@ M: object %zero-vector-reps { } ;
 M: object %fill-vector-reps { } ;
 M: object %gather-vector-2-reps { } ;
 M: object %gather-vector-4-reps { } ;
+M: object %alien-vector-reps { } ;
 M: object %shuffle-vector-reps { } ;
+M: object %shuffle-vector-imm-reps { } ;
 M: object %merge-vector-reps { } ;
 M: object %signed-pack-vector-reps { } ;
 M: object %unsigned-pack-vector-reps { } ;
--- a/basis/cpu/x86/x86.factor
+++ b/basis/cpu/x86/x86.factor
@ -432,8 +432,13 @@ M: x86 %epilogue ( n -- ) cell - incr-stack-reg ;
    temp 0 MOV \ t rc-absolute-cell rel-immediate
    dst temp word execute ; inline

+: (%compare) ( src1 src2 cc -- )
+    2over [ { cc= cc/= } member? ] [ register? ] [ 0 = ] tri* and and
+    [ drop dup TEST ]
+    [ CMP ] if ;
+
 M:: x86 %compare ( dst src1 src2 cc temp -- )
-    src1 src2 CMP
+    src1 src2 cc (%compare)
    cc order-cc {
        { cc<  [ dst temp \ CMOVL %boolean ] }
        { cc<= [ dst temp \ CMOVLE %boolean ] }
@ -447,7 +452,7 @@ M: x86 %compare-imm ( dst src1 src2 cc temp -- )
    %compare ;

 M:: x86 %compare-branch ( label src1 src2 cc -- )
-    src1 src2 CMP
+    src1 src2 cc (%compare)
    cc order-cc {
        { cc<  [ label JL ] }
        { cc<= [ label JLE ] }
@ -562,6 +567,12 @@ MACRO: available-reps ( alist -- )
    reverse [ { } ] suffix
    '[ _ cond ] ;

+M: x86 %alien-vector-reps
+    {
+        { sse? { float-4-rep } }
+        { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
+    } available-reps ;
+
 M: x86 %zero-vector
    {
        { double-2-rep [ dup XORPD ] }
@ -673,11 +684,9 @@ M: x86 %gather-vector-2-reps
        [ dupd SHUFPD ]
    } case ;

-: float-4-shuffle ( dst shuffle -- )
+: sse1-float-4-shuffle ( dst shuffle -- )
    {
        { { 0 1 2 3 } [ drop ] }
-        { { 0 0 2 2 } [ dup MOVSLDUP ] }
-        { { 1 1 3 3 } [ dup MOVSHDUP ] }
        { { 0 1 0 1 } [ dup MOVLHPS ] }
        { { 2 3 2 3 } [ dup MOVHLPS ] }
        { { 0 0 1 1 } [ dup UNPCKLPS ] }
@ -685,6 +694,15 @@ M: x86 %gather-vector-2-reps
        [ dupd SHUFPS ]
    } case ;

+: float-4-shuffle ( dst shuffle -- )
+    sse3? [
+        {
+            { { 0 0 2 2 } [ dup MOVSLDUP ] }
+            { { 1 1 3 3 } [ dup MOVSHDUP ] }
+            [ sse1-float-4-shuffle ]
+        } case
+    ] [ sse1-float-4-shuffle ] if ;
+
 : int-4-shuffle ( dst shuffle -- )
    {
        { { 0 1 2 3 } [ drop ] }
@ -698,7 +716,7 @@ M: x86 %gather-vector-2-reps
 : longlong-2-shuffle ( dst shuffle -- )
    first2 [ 2 * dup 1 + ] bi@ 4array int-4-shuffle ;

-M:: x86 %shuffle-vector ( dst src shuffle rep -- )
+M:: x86 %shuffle-vector-imm ( dst src shuffle rep -- )
    dst src rep %copy
    dst shuffle rep unsign-rep {
        { double-2-rep [ double-2-shuffle ] }
@ -707,12 +725,20 @@ M:: x86 %shuffle-vector ( dst src shuffle rep -- )
        { longlong-2-rep [ longlong-2-shuffle ] }
    } case ;

-M: x86 %shuffle-vector-reps
+M: x86 %shuffle-vector-imm-reps
    {
        { sse? { float-4-rep } }
        { sse2? { double-2-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
    } available-reps ;

+M: x86 %shuffle-vector ( dst src shuffle rep -- )
+    two-operand PSHUFB ;
+
+M: x86 %shuffle-vector-reps
+    {
+        { ssse3? { float-4-rep double-2-rep longlong-2-rep ulonglong-2-rep int-4-rep uint-4-rep short-8-rep ushort-8-rep char-16-rep uchar-16-rep } }
+    } available-reps ;
+
 M: x86 %merge-vector-head
    [ two-operand ] keep
    unsign-rep {
@ -790,8 +816,6 @@ M: x86 %unpack-vector-head-reps ( -- reps )
        { sse4.1? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
    } available-reps ;

-M: x86 %unpack-vector-tail-reps ( -- reps ) { } ;
-
 M: x86 %integer>float-vector ( dst src rep -- )
    {
        { int-4-rep [ CVTDQ2PS ] }
@ -1037,10 +1061,6 @@ M: x86 %mul-vector-reps
        { sse4.1? { int-4-rep uint-4-rep } }
    } available-reps ;

-M: x86 %saturated-mul-vector-reps
-    ! No multiplication with saturation on x86
-    { } ;
-
 M: x86 %div-vector ( dst src1 src2 rep -- )
    [ two-operand ] keep
    {
@ -1223,8 +1243,6 @@ M: x86 %xor-vector-reps
        { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
    } available-reps ;

-M: x86 %not-vector-reps { } ;
-
 M: x86 %shl-vector ( dst src1 src2 rep -- )
    [ two-operand ] keep
    {
@ -1271,6 +1289,30 @@ M:: x86 %scalar>integer ( dst src rep -- )
        { uint-scalar-rep [
            dst 32-bit-version-of src MOVD
        ] }
+        { short-scalar-rep [
+            dst 32-bit-version-of src MOVD
+            dst dst 16-bit-version-of MOVSX
+        ] }
+        { ushort-scalar-rep [
+            dst 32-bit-version-of src MOVD
+            dst dst 16-bit-version-of MOVZX
+        ] }
+        { char-scalar-rep [
+            dst 32-bit-version-of src MOVD
+            dst { } 8 [| tmp-dst |
+                tmp-dst dst int-rep %copy
+                tmp-dst tmp-dst 8-bit-version-of MOVSX
+                dst tmp-dst int-rep %copy
+            ] with-small-register
+        ] }
+        { uchar-scalar-rep [
+            dst 32-bit-version-of src MOVD
+            dst { } 8 [| tmp-dst |
+                tmp-dst dst int-rep %copy
+                tmp-dst tmp-dst 8-bit-version-of MOVZX
+                dst tmp-dst int-rep %copy
+            ] with-small-register
+        ] }
    } case ;

 M: x86 %vector>scalar %copy ;
--- a/basis/math/vectors/simd/functor/functor.factor
+++ b/basis/math/vectors/simd/functor/functor.factor
@ -60,7 +60,7 @@ MACRO: simd-boa ( rep class -- simd-array )
    [ rep-components ] [ new ] [ '[ _ ] ] tri* swap replicate-as ; inline

 : simd-with/nth-fast? ( rep -- ? )
-    [ \ (simd-vshuffle) supported-simd-op? ]
+    [ \ (simd-vshuffle-elements) supported-simd-op? ]
    [ rep-component-type can-be-unboxed? ]
    bi and ;

@ -184,6 +184,8 @@ WHERE
 TUPLE: A
 { underlying byte-array read-only initial: $[ 16 <byte-array> ] } ;

+INSTANCE: A simd-128
+
 M: A clone underlying>> clone \ A boa ; inline

 M: A length drop N ; inline
@ -315,7 +317,7 @@ SLOT: underlying2
    class c:typedef ;

 : (define-simd-256) ( simd -- )
-    simd-ops get { vshuffle hlshift hrshift } unique assoc-diff >>ops
+    simd-ops get { vshuffle-elements vshuffle-bytes hlshift hrshift } unique assoc-diff >>ops
    [ define-simd ]
    [ [ class>> ] [ rep>> ] bi define-simd-256-type ] bi ;

@ -362,6 +364,8 @@ TUPLE: A
 { underlying1 byte-array initial: $[ 16 <byte-array> ] read-only }
 { underlying2 byte-array initial: $[ 16 <byte-array> ] read-only } ;

+INSTANCE: A simd-256
+
 M: A clone
    [ underlying1>> clone ] [ underlying2>> clone ] bi
    \ A boa ; inline
--- a/basis/math/vectors/simd/intrinsics/intrinsics.factor
+++ b/basis/math/vectors/simd/intrinsics/intrinsics.factor
@ -67,7 +67,8 @@ SIMD-OP: vlshift
 SIMD-OP: vrshift
 SIMD-OP: hlshift
 SIMD-OP: hrshift
-SIMD-OP: vshuffle
+SIMD-OP: vshuffle-elements
+SIMD-OP: vshuffle-bytes
 SIMD-OP: (vmerge-head)
 SIMD-OP: (vmerge-tail)
 SIMD-OP: v<=
@ -148,6 +149,9 @@ GENERIC# supported-simd-op? 1 ( rep intrinsic -- ? )
    union
    { uchar-16-rep ushort-8-rep uint-4-rep ulonglong-2-rep } union ;

+: (%shuffle-imm-reps) ( -- reps )
+    %shuffle-vector-reps %shuffle-vector-imm-reps union ;
+
 M: vector-rep supported-simd-op?
    {
        { \ (simd-v+)            [ %add-vector-reps            ] }
@ -179,7 +183,8 @@ M: vector-rep supported-simd-op?
        { \ (simd-vrshift)       [ %shr-vector-reps            ] }
        { \ (simd-hlshift)       [ %horizontal-shl-vector-reps ] }
        { \ (simd-hrshift)       [ %horizontal-shr-vector-reps ] }
-        { \ (simd-vshuffle)      [ %shuffle-vector-reps        ] }
+        { \ (simd-vshuffle-elements) [ (%shuffle-imm-reps)         ] }
+        { \ (simd-vshuffle-bytes)    [ %shuffle-vector-reps        ] }
        { \ (simd-(vmerge-head)) [ %merge-vector-reps          ] }
        { \ (simd-(vmerge-tail)) [ %merge-vector-reps          ] }
        { \ (simd-(v>float))        [ %integer>float-vector-reps ] }
--- a/basis/math/vectors/simd/simd-docs.factor
+++ b/basis/math/vectors/simd/simd-docs.factor
@ -41,7 +41,21 @@ $nl
    POSTPONE: SIMD:
    POSTPONE: SIMDS:
 }
-"The following vector types are supported:"
+"The following scalar types are supported:"
+{ $code
+    "char"
+    "uchar"
+    "short"
+    "ushort"
+    "int"
+    "uint"
+    "longlong"
+    "ulonglong"
+    "float"
+    "double"
+}
+
+"The following vector types are generated from the above scalar types:"
 { $code
    "char-16"
    "uchar-16"
@ -89,6 +103,7 @@ $nl
 { $code
 """USING: compiler.tree.debugger math.vectors
 math.vectors.simd ;
+SIMD: double
 SYMBOLS: x y ;

 [
@ -107,7 +122,7 @@ IN: simd-demo
    { float-4 float-4 float-4 } declare
    [ v* ] [ [ 1.0 ] dip n-v v* ] bi-curry* bi v+ ;

-\ interpolate optimizer-report.""" }
+\\ interpolate optimizer-report.""" }
 "Note that using " { $link declare } " is not recommended. Safer ways of getting type information for the input parameters to a word include defining methods on a generic word (the value being dispatched upon has a statically known type in the method body), as well as using " { $link "hints" } " and " { $link POSTPONE: inline } " declarations."
 $nl
 "Here is a better version of the " { $snippet "interpolate" } " words above that uses hints:"
@ -122,7 +137,7 @@ IN: simd-demo

 HINTS: interpolate float-4 float-4 float-4 ;

-\ interpolate optimizer-report. """ }
+\\ interpolate optimizer-report. """ }
 "This time, the optimizer report lists calls to both SIMD primitives and high-level vector words, because hints cause two code paths to be generated. The " { $snippet "optimized." } " word can be used to make sure that the fast code path consists entirely of calls to primitives."
 $nl
 "If the " { $snippet "interpolate" } " word was to be used in several places with different types of vectors, it would be best to declare it " { $link POSTPONE: inline } "."
@ -153,13 +168,13 @@ M: actor advance ( dt actor -- )
    [ >float ] dip
    [ update-velocity ] [ update-position ] 2bi ;

-M\ actor advance optimized."""
+M\\ actor advance optimized."""
 }
 "The " { $vocab-link "compiler.cfg.debugger" } " vocabulary can give a lower-level picture of the generated code, that includes register assignments and other low-level details. To look at low-level optimizer output, call " { $snippet "test-mr mr." } " on a word or quotation:"
 { $code
 """USE: compiler.tree.debugger

-M\ actor advance test-mr mr.""" }
+M\\ actor advance test-mr mr.""" }
 "An example of a high-performance algorithm that uses SIMD primitives can be found in the " { $vocab-link "benchmark.nbody-simd" } " vocabulary." ;

 ARTICLE: "math.vectors.simd.intrinsics" "Low-level SIMD primitives"
@ -206,7 +221,7 @@ ARTICLE: "math.vectors.simd" "Hardware vector arithmetic (SIMD)"
 HELP: SIMD:
 { $syntax "SIMD: type" }
 { $values { "type" "a scalar C type" } }
-{ $description "Defines 128-bit and 256-bit SIMD arrays for holding elements of " { $snippet "type" } " into the vocabulary search path. The possible type/length combinations are listed in " { $link "math.vectors.simd.types" } " and the generated words are documented in " { $link "math.vectors.simd.words" } "." } ;
+{ $description "Defines 128-bit and 256-bit SIMD arrays for holding elements of " { $snippet "type" } " into the vocabulary search path. The allowed scalar types, and the auto-generated type/length vector combinations that result, are listed in " { $link "math.vectors.simd.types" } ". Generated words are documented in " { $link "math.vectors.simd.words" } "." } ;

 HELP: SIMDS:
 { $syntax "SIMDS: type type type ... ;" }
--- a/basis/math/vectors/simd/simd-tests.factor
+++ b/basis/math/vectors/simd/simd-tests.factor
@ -174,7 +174,7 @@ CONSTANT: simd-classes
 : remove-special-words ( alist -- alist' )
    ! These have their own tests later
    {
-        hlshift hrshift vshuffle vbroadcast
+        hlshift hrshift vshuffle-bytes vshuffle-elements vbroadcast
        vany? vall? vnone?
        (v>float) (v>integer)
        (vpack-signed) (vpack-unsigned)
@ -360,6 +360,23 @@ simd-classes [
    ] unit-test
 ] each

+"== Checking variable shuffles" print
+
+: random-shift-vector ( class -- vec )
+    new [ drop 16 random ] map ;
+
+:: test-shift-vector ( class -- ? )
+    class random-int-vector :> src
+    char-16 random-shift-vector :> perm
+    { class char-16 } :> decl
+
+    src perm vshuffle
+    src perm [ decl declare vshuffle ] compile-call
+    = ; inline
+
+{ char-16 uchar-16 short-8 ushort-8 int-4 uint-4 longlong-2 ulonglong-2 }
+[ 10 swap '[ [ t ] [ _ test-shift-vector ] unit-test ] times ] each
+
 "== Checking vector tests" print

 :: test-vector-tests-bool ( vector declaration -- none? any? all? )
@ -512,38 +529,38 @@ SYMBOL: !!inconsistent!!

 STRUCT: simd-struct
 { x float-4 }
-{ y double-2 }
+{ y longlong-2 }
 { z double-4 }
-{ w float-8 } ;
+{ w int-8 } ;

 [ t ] [ [ simd-struct <struct> ] compile-call >c-ptr [ 0 = ] all? ] unit-test

 [
    float-4{ 1 2 3 4 }
-    double-2{ 2 1 }
+    longlong-2{ 2 1 }
    double-4{ 4 3 2 1 }
-    float-8{ 1 2 3 4 5 6 7 8 }
+    int-8{ 1 2 3 4 5 6 7 8 }
 ] [
    simd-struct <struct>
    float-4{ 1 2 3 4 } >>x
-    double-2{ 2 1 } >>y
+    longlong-2{ 2 1 } >>y
    double-4{ 4 3 2 1 } >>z
-    float-8{ 1 2 3 4 5 6 7 8 } >>w
+    int-8{ 1 2 3 4 5 6 7 8 } >>w
    { [ x>> ] [ y>> ] [ z>> ] [ w>> ] } cleave
 ] unit-test

 [
    float-4{ 1 2 3 4 }
-    double-2{ 2 1 }
+    longlong-2{ 2 1 }
    double-4{ 4 3 2 1 }
-    float-8{ 1 2 3 4 5 6 7 8 }
+    int-8{ 1 2 3 4 5 6 7 8 }
 ] [
    [
        simd-struct <struct>
        float-4{ 1 2 3 4 } >>x
-        double-2{ 2 1 } >>y
+        longlong-2{ 2 1 } >>y
        double-4{ 4 3 2 1 } >>z
-        float-8{ 1 2 3 4 5 6 7 8 } >>w
+        int-8{ 1 2 3 4 5 6 7 8 } >>w
        { [ x>> ] [ y>> ] [ z>> ] [ w>> ] } cleave
    ] compile-call
 ] unit-test
--- a/basis/math/vectors/specialization/specialization.factor
+++ b/basis/math/vectors/specialization/specialization.factor
@ -98,7 +98,8 @@ H{
    { vrshift { +vector+ +scalar+ -> +vector+ } }
    { hlshift { +vector+ +literal+ -> +vector+ } }
    { hrshift { +vector+ +literal+ -> +vector+ } }
-    { vshuffle { +vector+ +literal+ -> +vector+ } }
+    { vshuffle-elements { +vector+ +literal+ -> +vector+ } }
+    { vshuffle-bytes    { +vector+ +vector+  -> +vector+ } }
    { vbroadcast { +vector+ +literal+ -> +vector+ } }
    { (vmerge-head) { +vector+ +vector+ -> +vector+ } }
    { (vmerge-tail) { +vector+ +vector+ -> +vector+ } }
@ -162,7 +163,7 @@ ERROR: bad-vector-word word ;
    } cond
    ! Don't specialize horizontal shifts, shuffles, and conversions at all, they're only for SIMD
    {
-        hlshift hrshift vshuffle vbroadcast
+        hlshift hrshift vshuffle-elements vshuffle-bytes vbroadcast
        (v>integer) (v>float)
        (vpack-signed) (vpack-unsigned)
        (vunpack-head) (vunpack-tail)
--- a/basis/math/vectors/vectors.factor
+++ b/basis/math/vectors/vectors.factor
@ -6,6 +6,9 @@ locals ;
 QUALIFIED-WITH: alien.c-types c
 IN: math.vectors

+MIXIN: simd-128
+MIXIN: simd-256
+
 GENERIC: element-type ( obj -- c-type )
 M: object element-type drop f ; inline

@ -83,7 +86,20 @@ PRIVATE>
 : vbitnot ( u -- w ) dup '[ _ [ bitnot ] fp-bitwise-unary ] map ;

 :: vbroadcast ( u n -- v ) u length n u nth <repetition> u like ;
-: vshuffle ( u perm -- v ) swap [ '[ _ nth ] ] keep map-as ;
+
+: vshuffle-elements ( u perm -- v )
+    swap [ '[ _ nth ] ] keep map-as ;
+
+: vshuffle-bytes ( u perm -- v )
+    underlying>> [
+        swap [ '[ _ nth ] ] keep map-as
+    ] curry change-underlying ;
+
+GENERIC: vshuffle ( u perm -- v )
+M: array vshuffle ( u perm -- v )
+    vshuffle-elements ; inline
+M: simd-128 vshuffle ( u perm -- v )
+    vshuffle-bytes ; inline

 : vlshift ( u n -- w ) '[ _ shift ] map ;
 : vrshift ( u n -- w ) neg '[ _ shift ] map ;
@ -107,9 +123,9 @@ PRIVATE>
 : vxor ( u v -- w )  over '[ [ _ element>bool ] bi@ xor ] 2map ;
 : vnot ( u -- w )    dup '[ _ element>bool not ] map ;

-: vall? ( v -- ? ) [ ] all? ;
-: vany? ( v -- ? ) [ ] any? ;
-: vnone? ( v -- ? ) [ not ] all? ;
+: vall? ( v -- ? ) dup '[ _ element>bool ] all? ;
+: vany? ( v -- ? ) dup '[ _ element>bool ] any? ;
+: vnone? ( v -- ? ) dup '[ _ element>bool not ] all? ;

 : v<  ( u v -- w ) [ <   ] 2map ;
 : v<= ( u v -- w ) [ <=  ] 2map ;
--- a/basis/specialized-arrays/specialized-arrays.factor
+++ b/basis/specialized-arrays/specialized-arrays.factor
@ -2,7 +2,8 @@
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors alien alien.c-types alien.data alien.parser
 assocs byte-arrays classes compiler.units functors kernel lexer
-libc math math.vectors math.vectors.specialization namespaces
+libc math math.vectors math.vectors.private
+math.vectors.specialization namespaces
 parser prettyprint.custom sequences sequences.private strings
 summary vocabs vocabs.loader vocabs.parser vocabs.generated
 words fry combinators present ;
@ -68,6 +69,8 @@ TUPLE: A
    [ drop \ T bad-byte-array-length ] unless
    <direct-A> ; inline

+M: A new-underlying drop byte-array>A ;
+
 M: A clone [ underlying>> clone ] [ length>> ] bi <direct-A> ; inline

 M: A length length>> ; inline
--- a/core/classes/tuple/tuple-docs.factor
+++ b/core/classes/tuple/tuple-docs.factor
@ -129,17 +129,18 @@ ARTICLE: "tuple-inheritance-example" "Tuple subclassing example"
 }
 "Rectangles and parallelograms use the same algorithm for computing the area, whereas they use different algorithms for computing perimiter. Also, rectangles and parallelograms both have " { $snippet "width" } " and " { $snippet "height" } " slots. We can exploit this with subclassing:"
 { $code
+    "USING: accessors kernel math math.constants math.functions ;"
    "GENERIC: area ( shape -- n )"
    "GENERIC: perimiter ( shape -- n )"
    ""
    "TUPLE: shape ;"
    ""
    "TUPLE: circle < shape radius ;"
-    "M: area circle radius>> sq pi * ;"
-    "M: perimiter circle radius>> 2 * pi * ;"
+    "M: circle area radius>> sq pi * ;"
+    "M: circle perimiter radius>> 2 * pi * ;"
    ""
-    "TUPLE: quad < shape width height"
-    "M: area quad [ width>> ] [ height>> ] bi * ;"
+    "TUPLE: quad < shape width height ;"
+    "M: quad area [ width>> ] [ height>> ] bi * ;"
    ""
    "TUPLE: rectangle < quad ;"
    "M: rectangle perimiter [ width>> 2 * ] [ height>> 2 * ] bi + ;"
--- a/extra/math/matrices/simd/simd-tests.factor
+++ b/extra/math/matrices/simd/simd-tests.factor
@ -52,10 +52,10 @@ IN: math.matrices.simd.tests
 [ 
    S{ matrix4 f
        float-4-array{
-            float-4{ 1.0 0.0 0.0 3.0 }
-            float-4{ 0.0 1.0 0.0 4.0 }
-            float-4{ 0.0 0.0 1.0 2.0 }
-            float-4{ 0.0 0.0 0.0 1.0 }
+            float-4{ 1.0 0.0 0.0 0.0 }
+            float-4{ 0.0 1.0 0.0 0.0 }
+            float-4{ 0.0 0.0 1.0 0.0 }
+            float-4{ 3.0 4.0 2.0 1.0 }
        }
    }
 ] [ float-4{ 3.0 4.0 2.0 0.0 } translation-matrix4 ] unit-test
@ -77,9 +77,9 @@ IN: math.matrices.simd.tests
    float-4{ 0.0 1.0 0.0 1.0 } pi 1/2. * rotation-matrix4
    S{ matrix4 f
        float-4-array{
-            float-4{  0.0  0.0  1.0 0.0 }
+            float-4{  0.0  0.0 -1.0 0.0 }
            float-4{  0.0  1.0  0.0 0.0 }
-            float-4{ -1.0  0.0  0.0 0.0 }
+            float-4{  1.0  0.0  0.0 0.0 }
            float-4{  0.0  0.0  0.0 1.0 }
        }
    }
@ -89,10 +89,10 @@ IN: math.matrices.simd.tests
 [
    S{ matrix4 f
        float-4-array{
-            float-4{ 2.0 0.0 0.0 10.0 }
-            float-4{ 0.0 3.0 0.0 18.0 }
-            float-4{ 0.0 0.0 4.0 28.0 }
-            float-4{ 0.0 0.0 0.0  1.0 }
+            float-4{  2.0  0.0  0.0  0.0 }
+            float-4{  0.0  3.0  0.0  0.0 }
+            float-4{  0.0  0.0  4.0  0.0 }
+            float-4{ 10.0 18.0 28.0  1.0 }
        }
    }
 ] [
@ -106,10 +106,10 @@ IN: math.matrices.simd.tests
    }
    S{ matrix4 f
        float-4-array{
-            float-4{ 1.0 0.0 0.0 5.0 }
-            float-4{ 0.0 1.0 0.0 6.0 }
-            float-4{ 0.0 0.0 1.0 7.0 }
-            float-4{ 0.0 0.0 0.0 1.0 }
+            float-4{ 1.0 0.0 0.0 0.0 }
+            float-4{ 0.0 1.0 0.0 0.0 }
+            float-4{ 0.0 0.0 1.0 0.0 }
+            float-4{ 5.0 6.0 7.0 1.0 }
        }
    }
    m4.
@ -118,10 +118,10 @@ IN: math.matrices.simd.tests
 [
    S{ matrix4 f
        float-4-array{
-            float-4{ 3.0 0.0 0.0 5.0 }
-            float-4{ 0.0 4.0 0.0 6.0 }
-            float-4{ 0.0 0.0 5.0 7.0 }
-            float-4{ 0.0 0.0 0.0 2.0 }
+            float-4{ 3.0 0.0 0.0 0.0 }
+            float-4{ 0.0 4.0 0.0 0.0 }
+            float-4{ 0.0 0.0 5.0 0.0 }
+            float-4{ 5.0 6.0 7.0 2.0 }
        }
    }
 ] [
@ -135,10 +135,10 @@ IN: math.matrices.simd.tests
    }
    S{ matrix4 f
        float-4-array{
-            float-4{ 1.0 0.0 0.0 5.0 }
-            float-4{ 0.0 1.0 0.0 6.0 }
-            float-4{ 0.0 0.0 1.0 7.0 }
-            float-4{ 0.0 0.0 0.0 1.0 }
+            float-4{ 1.0 0.0 0.0 0.0 }
+            float-4{ 0.0 1.0 0.0 0.0 }
+            float-4{ 0.0 0.0 1.0 0.0 }
+            float-4{ 5.0 6.0 7.0 1.0 }
        }
    }
    m4+
@ -147,10 +147,10 @@ IN: math.matrices.simd.tests
 [
    S{ matrix4 f
        float-4-array{
-            float-4{ 1.0 0.0 0.0 -5.0 }
-            float-4{ 0.0 2.0 0.0 -6.0 }
-            float-4{ 0.0 0.0 3.0 -7.0 }
-            float-4{ 0.0 0.0 0.0  0.0 }
+            float-4{  1.0  0.0  0.0 0.0 }
+            float-4{  0.0  2.0  0.0 0.0 }
+            float-4{  0.0  0.0  3.0 0.0 }
+            float-4{ -5.0 -6.0 -7.0 0.0 }
        }
    }
 ] [
@ -164,10 +164,10 @@ IN: math.matrices.simd.tests
    }
    S{ matrix4 f
        float-4-array{
-            float-4{ 1.0 0.0 0.0 5.0 }
-            float-4{ 0.0 1.0 0.0 6.0 }
-            float-4{ 0.0 0.0 1.0 7.0 }
-            float-4{ 0.0 0.0 0.0 1.0 }
+            float-4{ 1.0 0.0 0.0 0.0 }
+            float-4{ 0.0 1.0 0.0 0.0 }
+            float-4{ 0.0 0.0 1.0 0.0 }
+            float-4{ 5.0 6.0 7.0 1.0 }
        }
    }
    m4-
@ -219,10 +219,10 @@ IN: math.matrices.simd.tests
 [
    S{ matrix4 f
        float-4-array{
-            float-4{ 1/2. 0.0   0.0    0.0  }
-            float-4{ 0.0  1/2.  0.0    0.0  }
-            float-4{ 0.0  0.0  -6/4. -10/4. }
-            float-4{ 0.0  0.0  -1.0    0.0  }
+            float-4{ 1/2. 0.0   0.0   0.0 }
+            float-4{ 0.0  1/2.  0.0   0.0 }
+            float-4{ 0.0  0.0  -6/4. -1.0 }
+            float-4{ 0.0  0.0 -10/4.  0.0 }
        }
    }
 ] [
--- a/extra/math/matrices/simd/simd.factor
+++ b/extra/math/matrices/simd/simd.factor
@ -9,34 +9,34 @@ SPECIALIZED-ARRAY: float-4
 IN: math.matrices.simd

 STRUCT: matrix4
-    { rows float-4[4] } ;
+    { columns float-4[4] } ;

 INSTANCE: matrix4 immutable-sequence

 M: matrix4 length drop 4 ; inline
-M: matrix4 nth-unsafe rows>> nth-unsafe ; inline
+M: matrix4 nth-unsafe columns>> nth-unsafe ; inline
 M: matrix4 new-sequence 2drop matrix4 (struct) ; inline

 <PRIVATE

-: rows ( a -- a1 a2 a3 a4 )
-    rows>> 4 firstn ; inline
+: columns ( a -- a1 a2 a3 a4 )
+    columns>> 4 firstn ; inline

-:: set-rows ( c1 c2 c3 c4 c -- c )
-    c rows>> :> rows
-    c1 rows set-first
-    c2 rows set-second
-    c3 rows set-third
-    c4 rows set-fourth
+:: set-columns ( c1 c2 c3 c4 c -- c )
+    c columns>> :> columns
+    c1 columns set-first
+    c2 columns set-second
+    c3 columns set-third
+    c4 columns set-fourth
    c ; inline

 : make-matrix4 ( quot: ( -- c1 c2 c3 c4 ) -- c )
-    matrix4 (struct) swap dip set-rows ; inline
+    matrix4 (struct) swap dip set-columns ; inline

-:: 2map-rows ( a b quot -- c )
+:: 2map-columns ( a b quot -- c )
    [
-        a rows :> a4 :> a3 :> a2 :> a1
-        b rows :> b4 :> b3 :> b2 :> b1
+        a columns :> a4 :> a3 :> a2 :> a1
+        b columns :> b4 :> b3 :> b2 :> b1

        a1 b1 quot call
        a2 b2 quot call
@ -44,57 +44,57 @@ M: matrix4 new-sequence 2drop matrix4 (struct) ; inline
        a4 b4 quot call
    ] make-matrix4 ; inline

-: map-rows ( a quot -- c )
-    '[ rows _ 4 napply ] make-matrix4 ; inline
+: map-columns ( a quot -- c )
+    '[ columns _ 4 napply ] make-matrix4 ; inline
    
 PRIVATE>

-TYPED: m4+ ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v+ ] 2map-rows ;
-TYPED: m4- ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v- ] 2map-rows ;
-TYPED: m4* ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v* ] 2map-rows ;
-TYPED: m4/ ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v/ ] 2map-rows ;
+TYPED: m4+ ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v+ ] 2map-columns ;
+TYPED: m4- ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v- ] 2map-columns ;
+TYPED: m4* ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v* ] 2map-columns ;
+TYPED: m4/ ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v/ ] 2map-columns ;

-TYPED: m4*n ( a: matrix4 b: float -- c: matrix4 ) [ v*n ] curry map-rows ;
-TYPED: m4/n ( a: matrix4 b: float -- c: matrix4 ) [ v/n ] curry map-rows ;
-TYPED: n*m4 ( a: float b: matrix4 -- c: matrix4 ) [ n*v ] with map-rows ;
-TYPED: n/m4 ( a: float b: matrix4 -- c: matrix4 ) [ n/v ] with map-rows ;
+TYPED: m4*n ( a: matrix4 b: float -- c: matrix4 ) [ v*n ] curry map-columns ;
+TYPED: m4/n ( a: matrix4 b: float -- c: matrix4 ) [ v/n ] curry map-columns ;
+TYPED: n*m4 ( a: float b: matrix4 -- c: matrix4 ) [ n*v ] with map-columns ;
+TYPED: n/m4 ( a: float b: matrix4 -- c: matrix4 ) [ n/v ] with map-columns ;

 TYPED:: m4. ( a: matrix4 b: matrix4 -- c: matrix4 )
    [
-        a rows :> a4 :> a3 :> a2 :> a1
-        b rows :> b4 :> b3 :> b2 :> b1
+        a columns :> a4 :> a3 :> a2 :> a1
+        b columns :> b4 :> b3 :> b2 :> b1

-        a1 first  b1 n*v :> c1a
-        a2 first  b1 n*v :> c2a
-        a3 first  b1 n*v :> c3a
-        a4 first  b1 n*v :> c4a
+        b1 first  a1 n*v :> c1a
+        b2 first  a1 n*v :> c2a
+        b3 first  a1 n*v :> c3a
+        b4 first  a1 n*v :> c4a

-        a1 second b2 n*v c1a v+ :> c1b 
-        a2 second b2 n*v c2a v+ :> c2b
-        a3 second b2 n*v c3a v+ :> c3b
-        a4 second b2 n*v c4a v+ :> c4b
+        b1 second a2 n*v c1a v+ :> c1b 
+        b2 second a2 n*v c2a v+ :> c2b
+        b3 second a2 n*v c3a v+ :> c3b
+        b4 second a2 n*v c4a v+ :> c4b

-        a1 third  b3 n*v c1b v+ :> c1c 
-        a2 third  b3 n*v c2b v+ :> c2c
-        a3 third  b3 n*v c3b v+ :> c3c
-        a4 third  b3 n*v c4b v+ :> c4c
+        b1 third  a3 n*v c1b v+ :> c1c 
+        b2 third  a3 n*v c2b v+ :> c2c
+        b3 third  a3 n*v c3b v+ :> c3c
+        b4 third  a3 n*v c4b v+ :> c4c

-        a1 fourth b4 n*v c1c v+
-        a2 fourth b4 n*v c2c v+
-        a3 fourth b4 n*v c3c v+
-        a4 fourth b4 n*v c4c v+
+        b1 fourth a4 n*v c1c v+
+        b2 fourth a4 n*v c2c v+
+        b3 fourth a4 n*v c3c v+
+        b4 fourth a4 n*v c4c v+
    ] make-matrix4 ;

-TYPED:: v.m4 ( a: float-4 b: matrix4 -- c: float-4 )
-    b rows :> b4 :> b3 :> b2 :> b1
+TYPED:: m4.v ( m: matrix4 v: float-4 -- v': float-4 )
+    m columns :> m4 :> m3 :> m2 :> m1
    
-    a first  b1 n*v
-    a second b2 n*v v+
-    a third  b3 n*v v+
-    a fourth b4 n*v v+ ;
+    v first  m1 n*v
+    v second m2 n*v v+
+    v third  m3 n*v v+
+    v fourth m4 n*v v+ ;

-TYPED:: m4.v ( a: matrix4 b: float-4 -- c: float-4 )
-    a rows [ b v. ] 4 napply float-4-boa ;
+TYPED:: v.m4 ( v: float-4 m: matrix4 -- c: float-4 )
+    m columns [ v v. ] 4 napply float-4-boa ;

 CONSTANT: identity-matrix4
    S{ matrix4 f
@ -131,37 +131,37 @@ TYPED: diagonal-matrix4 ( diagonal: float-4 -- matrix: matrix4 )
    [ (vmerge) ] bi-curry@ bi* ; inline

 TYPED: transpose-matrix4 ( matrix: matrix4 -- matrix: matrix4 )
-    [ rows vmerge-transpose vmerge-transpose ] make-matrix4 ;
+    [ columns vmerge-transpose vmerge-transpose ] make-matrix4 ;
+
+: linear>homogeneous ( v -- v' )
+    [ float-4{ t t t f } ] dip float-4{ 0.0 0.0 0.0 1.0 } v? ; inline

 : scale-matrix4 ( factors -- matrix )
-    [ float-4{ t t t f } ] dip float-4{ 0.0 0.0 0.0 1.0 } v?
-    diagonal-matrix4 ; inline
+    linear>homogeneous diagonal-matrix4 ; inline

 : ortho-matrix4 ( factors -- matrix )
    float-4{ 1.0 1.0 1.0 1.0 } swap v/ scale-matrix4 ; inline

-TYPED:: translation-matrix4 ( offset: float-4 -- matrix: matrix4 )
+TYPED: translation-matrix4 ( offset: float-4 -- matrix: matrix4 )
    [
-        float-4{ 1.0 1.0 1.0 1.0 } :> diagonal
-
-        offset 0 float-4-with (vmerge)
-        [ 0 float-4-with swap (vmerge) ] bi@ drop :> z :> y :> x
-
-        diagonal y vmerge-diagonal*
-        [ x vmerge-diagonal* ]
-        [ z vmerge-diagonal* ] bi*
+        linear>homogeneous
+        [ 
+            float-4{ 1.0 0.0 0.0 0.0 }
+            float-4{ 0.0 1.0 0.0 0.0 }
+            float-4{ 0.0 0.0 1.0 0.0 }
+        ] dip
    ] make-matrix4 ;

 TYPED:: rotation-matrix4 ( axis: float-4 theta: float -- matrix: matrix4 )
-    !   x*x + c*(1.0 - x*x)   x*y*(1.0 - c) - s*z   x*z*(1.0 - c) + s*y   0
-    !   x*y*(1.0 - c) + s*z   y*y + c*(1.0 - y*y)   y*z*(1.0 - c) - s*x   0
-    !   x*z*(1.0 - c) - s*y   y*z*(1.0 - c) + s*x   z*z + c*(1.0 - z*z)   0
+    !   x*x + c*(1.0 - x*x)   x*y*(1.0 - c) + s*z   x*z*(1.0 - c) - s*y   0
+    !   x*y*(1.0 - c) - s*z   y*y + c*(1.0 - y*y)   y*z*(1.0 - c) + s*x   0
+    !   x*z*(1.0 - c) + s*y   y*z*(1.0 - c) - s*x   z*z + c*(1.0 - z*z)   0
    !   0                     0                     0                     1
    matrix4 (struct) :> triangle-m
    theta cos :> c
    theta sin :> s

-    float-4{  1.0 -1.0  1.0 0.0 } :> triangle-sign
+    float-4{ -1.0  1.0 -1.0 0.0 } :> triangle-sign

    c float-4-with :> cc
    s float-4-with :> ss
@ -184,7 +184,7 @@ TYPED:: rotation-matrix4 ( axis: float-4 theta: float -- matrix: matrix4 )
    triangle-lo { 1 0 3 3 } vshuffle
    float-4 new

-    triangle-m set-rows drop
+    triangle-m set-columns drop

    diagonal-m triangle-m m4+ ;

@ -194,8 +194,10 @@ TYPED:: frustum-matrix4 ( xy: float-4 near: float far: float -- matrix: matrix4
        float-4{ t t f f } xy near far - float-4-with v? ! denom
        v/ :> fov
        
-        fov 0.0 float-4-with (vmerge-head) vmerge-diagonal
-        fov float-4{ f f t t } vand
-        float-4{ 0.0 0.0 -1.0 0.0 }
+        float-4{ 0.0 -1.0 0.0 0.0 } :> negone
+
+        fov vmerge-diagonal
+        [ vmerge-diagonal ]
+        [ negone (vmerge) ] bi*
    ] make-matrix4 ;