Merge branch 'master' of git://factorcode.org/git/factor into new_gc

2009-10-13 06:57:37 -05:00 · 2009-10-13 06:57:37 -05:00 · e81e076935
parent a280ccfe1c 48f4799505
commit e81e076935
22 changed files with 338 additions and 187 deletions
--- a/basis/calendar/format/format-tests.factor
+++ b/basis/calendar/format/format-tests.factor
@ -1,5 +1,5 @@
 USING: calendar.format calendar kernel math tools.test
-io.streams.string accessors io math.order ;
+io.streams.string accessors io math.order sequences ;
 IN: calendar.format.tests
 [ 0 ] [
@ -81,3 +81,5 @@ IN: calendar.format.tests
 ] [ "Thursday, 02-Oct-2008 23:59:59 GMT" cookie-string>timestamp ] unit-test
 [ ]
 [ { 2008 2009 } [ year. ] each ] unit-test
--- a/basis/calendar/format/format.factor
+++ b/basis/calendar/format/format.factor
@ -66,7 +66,7 @@ M: array month. ( pair -- )
    [ month-name write bl number>string print ]
    [ 1 zeller-congruence ]
    [ (days-in-month) day-abbreviations2 " " join print ] 2tri
-    over "   " <repetition> concat write
+    over "   " <repetition> "" concat-as write
    [
        [ 1 + day. ] keep
        1 + + 7 mod zero? [ nl ] [ bl ] if
--- a/basis/compiler/cfg/instructions/instructions.factor
+++ b/basis/compiler/cfg/instructions/instructions.factor
@ -277,6 +277,11 @@ literal: rep ;
 PURE-INSN: ##shuffle-vector
 def: dst
 use: src shuffle
 literal: rep ;
 PURE-INSN: ##shuffle-vector-imm
 def: dst
 use: src
 literal: shuffle rep ;
--- a/basis/compiler/cfg/intrinsics/intrinsics.factor
+++ b/basis/compiler/cfg/intrinsics/intrinsics.factor
@ -194,7 +194,8 @@ IN: compiler.cfg.intrinsics
        { math.vectors.simd.intrinsics:(simd-with) [ [ ^^with-vector ] emit-unary-vector-op ] }
        { math.vectors.simd.intrinsics:(simd-gather-2) [ emit-gather-vector-2 ] }
        { math.vectors.simd.intrinsics:(simd-gather-4) [ emit-gather-vector-4 ] }
-        { math.vectors.simd.intrinsics:(simd-vshuffle) [ emit-shuffle-vector ] }
+        { math.vectors.simd.intrinsics:(simd-vshuffle-elements) [ emit-shuffle-vector ] }
        { math.vectors.simd.intrinsics:(simd-vshuffle-bytes) [ emit-shuffle-vector-var ] }
        { math.vectors.simd.intrinsics:(simd-(vmerge-head)) [ [ ^^merge-vector-head ] emit-binary-vector-op ] }
        { math.vectors.simd.intrinsics:(simd-(vmerge-tail)) [ [ ^^merge-vector-tail ] emit-binary-vector-op ] }
        { math.vectors.simd.intrinsics:(simd-(v>float)) [ [ ^^integer>float-vector ] emit-unary-vector-op ] }
--- a/basis/compiler/cfg/intrinsics/simd/simd.factor
+++ b/basis/compiler/cfg/intrinsics/simd/simd.factor
@ -1,15 +1,16 @@
 ! Copyright (C) 2009 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors byte-arrays fry cpu.architecture kernel math
+USING: accessors alien byte-arrays fry classes.algebra
-sequences math.vectors.simd.intrinsics macros generalizations
+cpu.architecture kernel math sequences math.vectors
-combinators combinators.short-circuit arrays locals
+math.vectors.simd.intrinsics macros generalizations combinators
 combinators.short-circuit arrays locals
 compiler.tree.propagation.info compiler.cfg.builder.blocks
 compiler.cfg.comparisons
 compiler.cfg.stacks compiler.cfg.stacks.local compiler.cfg.hats
 compiler.cfg.instructions compiler.cfg.registers
 compiler.cfg.intrinsics.alien
 specialized-arrays ;
-FROM: alien.c-types => float double ;
+FROM: alien.c-types => heap-size char uchar float double ;
 SPECIALIZED-ARRAYS: float double ;
 IN: compiler.cfg.intrinsics.simd
@ -21,7 +22,7 @@ MACRO: check-elements ( quots -- )
 MACRO: if-literals-match ( quots -- )
    [ length ] [ ] [ length ] tri
-    ! n quots n n
+    ! n quots n
    '[
        ! node quot
        [
@ -77,15 +78,41 @@ MACRO: if-literals-match ( quots -- )
 : shuffle? ( obj -- ? ) { [ array? ] [ [ integer? ] all? ] } 1&& ;
-: emit-shuffle-vector ( node -- )
+: >variable-shuffle ( shuffle rep -- shuffle' )
-    ! Pad the permutation with zeroes if its too short, since we
+    rep-component-type heap-size
    [ dup <repetition> >byte-array ]
    [ iota >byte-array ] bi
    '[ _ n*v _ v+ ] map concat ;
 : generate-shuffle-vector-imm ( src shuffle rep -- dst )
    dup %shuffle-vector-imm-reps member?
    [ ^^shuffle-vector-imm ]
    [
        [ >variable-shuffle ^^load-constant ] keep
        ^^shuffle-vector
    ] if ;
 : emit-shuffle-vector-imm ( node -- )
    ! Pad the permutation with zeroes if it's too short, since we
    ! can't throw an error at this point.
-    [ [ rep-components 0 pad-tail ] keep ^^shuffle-vector ] [unary/param]
+    [ [ rep-components 0 pad-tail ] keep generate-shuffle-vector-imm ] [unary/param]
    { [ shuffle? ] [ representation? ] } if-literals-match ;
 : emit-shuffle-vector-var ( node -- )
    [ ^^shuffle-vector ] [binary]
    { [ %shuffle-vector-reps member? ] } if-literals-match ;
 : emit-shuffle-vector ( node -- )
    dup node-input-infos {
        [ length 3 = ]
        [ first  class>> byte-array class<= ]
        [ second class>> byte-array class<= ]
        [ third  literal>> representation?  ]
    } 1&& [ emit-shuffle-vector-var ] [ emit-shuffle-vector-imm ] if ;
 : ^^broadcast-vector ( src n rep -- dst )
    [ rep-components swap <array> ] keep
-    ^^shuffle-vector ;
+    generate-shuffle-vector-imm ;
 : emit-broadcast-vector ( node -- )
    [ ^^broadcast-vector ] [unary/param]
@ -101,6 +128,9 @@ MACRO: if-literals-match ( quots -- )
    [ ^^select-vector ] [unary/param]
    { [ integer? ] [ representation? ] } if-literals-match ; inline
 : emit-alien-vector-op ( node quot: ( rep -- ) -- )
    { [ %alien-vector-reps member? ] } if-literals-match ; inline
 : emit-alien-vector ( node -- )
    dup [
        '[
@ -108,7 +138,7 @@ MACRO: if-literals-match ( quots -- )
            _ ^^alien-vector ds-push
        ]
        [ inline-alien-getter? ] inline-alien
-    ] with emit-vector-op ;
+    ] with emit-alien-vector-op ;
 : emit-set-alien-vector ( node -- )
    dup [
@ -118,7 +148,7 @@ MACRO: if-literals-match ( quots -- )
        ]
        [ byte-array inline-alien-setter? ]
        inline-alien
-    ] with emit-vector-op ;
+    ] with emit-alien-vector-op ;
 : generate-not-vector ( src rep -- dst )
    dup %not-vector-reps member?
--- a/basis/compiler/cfg/value-numbering/rewrite/rewrite.factor
+++ b/basis/compiler/cfg/value-numbering/rewrite/rewrite.factor
@ -450,26 +450,26 @@ M: ##set-alien-vector rewrite rewrite-alien-addressing ;
 ! Some lame constant folding for SIMD intrinsics. Eventually this
 ! should be redone completely.
-: rewrite-shuffle-vector ( insn expr -- insn' )
+: rewrite-shuffle-vector-imm ( insn expr -- insn' )
    2dup [ rep>> ] bi@ eq? [
        [ [ dst>> ] [ src>> vn>vreg ] bi* ]
        [ [ shuffle>> ] bi@ nths ]
        [ drop rep>> ]
-        2tri \ ##shuffle-vector new-insn
+        2tri \ ##shuffle-vector-imm new-insn
    ] [ 2drop f ] if ;
-: (fold-shuffle-vector) ( shuffle bytes -- bytes' )
+: (fold-shuffle-vector-imm) ( shuffle bytes -- bytes' )
    2dup length swap length /i group nths concat ;
-: fold-shuffle-vector ( insn expr -- insn' )
+: fold-shuffle-vector-imm ( insn expr -- insn' )
    [ [ dst>> ] [ shuffle>> ] bi ] dip value>>
-    (fold-shuffle-vector) \ ##load-constant new-insn ;
+    (fold-shuffle-vector-imm) \ ##load-constant new-insn ;
-M: ##shuffle-vector rewrite
+M: ##shuffle-vector-imm rewrite
    dup src>> vreg>expr {
-        { [ dup shuffle-vector-expr? ] [ rewrite-shuffle-vector ] }
+        { [ dup shuffle-vector-imm-expr? ] [ rewrite-shuffle-vector-imm ] }
-        { [ dup reference-expr? ] [ fold-shuffle-vector ] }
+        { [ dup reference-expr? ] [ fold-shuffle-vector-imm ] }
-        { [ dup constant-expr? ] [ fold-shuffle-vector ] }
+        { [ dup constant-expr? ] [ fold-shuffle-vector-imm ] }
        [ 2drop f ]
    } cond ;
--- a/basis/compiler/cfg/value-numbering/simplify/simplify.factor
+++ b/basis/compiler/cfg/value-numbering/simplify/simplify.factor
@ -136,7 +136,7 @@ M: scalar>vector-expr simplify*
        [ drop f ]
    } cond ;
-M: shuffle-vector-expr simplify*
+M: shuffle-vector-imm-expr simplify*
    [ src>> ] [ shuffle>> ] [ rep>> rep-components iota ] tri
    sequence= [ drop f ] unless ;
--- a/basis/compiler/cfg/value-numbering/value-numbering-tests.factor
+++ b/basis/compiler/cfg/value-numbering/value-numbering-tests.factor
@ -1215,31 +1215,31 @@ cell 8 = [
    }
 ] [
    {
-        T{ ##shuffle-vector f 1 0 { 0 1 2 3 } float-4-rep }
+        T{ ##shuffle-vector-imm f 1 0 { 0 1 2 3 } float-4-rep }
    } value-numbering-step
 ] unit-test
 [
    {
-        T{ ##shuffle-vector f 1 0 { 1 2 3 0 } float-4-rep }
+        T{ ##shuffle-vector-imm f 1 0 { 1 2 3 0 } float-4-rep }
-        T{ ##shuffle-vector f 2 0 { 0 2 3 1 } float-4-rep }
+        T{ ##shuffle-vector-imm f 2 0 { 0 2 3 1 } float-4-rep }
    }
 ] [
    {
-        T{ ##shuffle-vector f 1 0 { 1 2 3 0 } float-4-rep }
+        T{ ##shuffle-vector-imm f 1 0 { 1 2 3 0 } float-4-rep }
-        T{ ##shuffle-vector f 2 1 { 3 1 2 0 } float-4-rep }
+        T{ ##shuffle-vector-imm f 2 1 { 3 1 2 0 } float-4-rep }
    } value-numbering-step
 ] unit-test
 [
    {
-        T{ ##shuffle-vector f 1 0 { 1 2 3 0 } float-4-rep }
+        T{ ##shuffle-vector-imm f 1 0 { 1 2 3 0 } float-4-rep }
-        T{ ##shuffle-vector f 2 1 { 1 0 } double-2-rep }
+        T{ ##shuffle-vector-imm f 2 1 { 1 0 } double-2-rep }
    }
 ] [
    {
-        T{ ##shuffle-vector f 1 0 { 1 2 3 0 } float-4-rep }
+        T{ ##shuffle-vector-imm f 1 0 { 1 2 3 0 } float-4-rep }
-        T{ ##shuffle-vector f 2 1 { 1 0 } double-2-rep }
+        T{ ##shuffle-vector-imm f 2 1 { 1 0 } double-2-rep }
    } value-numbering-step
 ] unit-test
@ -1253,7 +1253,7 @@ cell 8 = [
    {
        T{ ##load-constant f 0 $[ 55 tag-fixnum ] }
        T{ ##scalar>vector f 1 0 int-4-rep }
-        T{ ##shuffle-vector f 2 1 { 0 0 0 0 } float-4-rep }
+        T{ ##shuffle-vector-imm f 2 1 { 0 0 0 0 } float-4-rep }
    } value-numbering-step
 ] unit-test
@ -1267,7 +1267,7 @@ cell 8 = [
    {
        T{ ##load-constant f 0 1.25 }
        T{ ##scalar>vector f 1 0 float-4-rep }
-        T{ ##shuffle-vector f 2 1 { 0 0 0 0 } float-4-rep }
+        T{ ##shuffle-vector-imm f 2 1 { 0 0 0 0 } float-4-rep }
    } value-numbering-step
 ] unit-test
--- a/basis/compiler/codegen/codegen.factor
+++ b/basis/compiler/codegen/codegen.factor
@ -154,6 +154,7 @@ CODEGEN: ##zero-vector %zero-vector
 CODEGEN: ##fill-vector %fill-vector
 CODEGEN: ##gather-vector-2 %gather-vector-2
 CODEGEN: ##gather-vector-4 %gather-vector-4
 CODEGEN: ##shuffle-vector-imm %shuffle-vector-imm
 CODEGEN: ##shuffle-vector %shuffle-vector
 CODEGEN: ##tail>head-vector %tail>head-vector
 CODEGEN: ##merge-vector-head %merge-vector-head
--- a/basis/compiler/tree/propagation/simd/simd.factor
+++ b/basis/compiler/tree/propagation/simd/simd.factor
@ -31,7 +31,8 @@ IN: compiler.tree.propagation.simd
    (simd-vrshift)
    (simd-hlshift)
    (simd-hrshift)
-    (simd-vshuffle)
+    (simd-vshuffle-bytes)
    (simd-vshuffle-elements)
    (simd-(vmerge-head))
    (simd-(vmerge-tail))
    (simd-(v>float))
--- a/basis/cpu/architecture/architecture.factor
+++ b/basis/cpu/architecture/architecture.factor
@ -242,6 +242,7 @@ HOOK: %fill-vector cpu ( dst rep -- )
 HOOK: %gather-vector-2 cpu ( dst src1 src2 rep -- )
 HOOK: %gather-vector-4 cpu ( dst src1 src2 src3 src4 rep -- )
 HOOK: %shuffle-vector cpu ( dst src shuffle rep -- )
 HOOK: %shuffle-vector-imm cpu ( dst src shuffle rep -- )
 HOOK: %tail>head-vector cpu ( dst src rep -- )
 HOOK: %merge-vector-head cpu ( dst src1 src2 rep -- )
 HOOK: %merge-vector-tail cpu ( dst src1 src2 rep -- )
@ -288,7 +289,9 @@ HOOK: %zero-vector-reps cpu ( -- reps )
 HOOK: %fill-vector-reps cpu ( -- reps )
 HOOK: %gather-vector-2-reps cpu ( -- reps )
 HOOK: %gather-vector-4-reps cpu ( -- reps )
 HOOK: %alien-vector-reps cpu ( -- reps )
 HOOK: %shuffle-vector-reps cpu ( -- reps )
 HOOK: %shuffle-vector-imm-reps cpu ( -- reps )
 HOOK: %merge-vector-reps cpu ( -- reps )
 HOOK: %signed-pack-vector-reps cpu ( -- reps )
 HOOK: %unsigned-pack-vector-reps cpu ( -- reps )
@ -328,7 +331,9 @@ M: object %zero-vector-reps { } ;
 M: object %fill-vector-reps { } ;
 M: object %gather-vector-2-reps { } ;
 M: object %gather-vector-4-reps { } ;
 M: object %alien-vector-reps { } ;
 M: object %shuffle-vector-reps { } ;
 M: object %shuffle-vector-imm-reps { } ;
 M: object %merge-vector-reps { } ;
 M: object %signed-pack-vector-reps { } ;
 M: object %unsigned-pack-vector-reps { } ;
--- a/basis/cpu/x86/x86.factor
+++ b/basis/cpu/x86/x86.factor
@ -432,8 +432,13 @@ M: x86 %epilogue ( n -- ) cell - incr-stack-reg ;
    temp 0 MOV \ t rc-absolute-cell rel-immediate
    dst temp word execute ; inline
 : (%compare) ( src1 src2 cc -- )
    2over [ { cc= cc/= } member? ] [ register? ] [ 0 = ] tri* and and
    [ drop dup TEST ]
    [ CMP ] if ;
 M:: x86 %compare ( dst src1 src2 cc temp -- )
-    src1 src2 CMP
+    src1 src2 cc (%compare)
    cc order-cc {
        { cc<  [ dst temp \ CMOVL %boolean ] }
        { cc<= [ dst temp \ CMOVLE %boolean ] }
@ -447,7 +452,7 @@ M: x86 %compare-imm ( dst src1 src2 cc temp -- )
    %compare ;
 M:: x86 %compare-branch ( label src1 src2 cc -- )
-    src1 src2 CMP
+    src1 src2 cc (%compare)
    cc order-cc {
        { cc<  [ label JL ] }
        { cc<= [ label JLE ] }
@ -562,6 +567,12 @@ MACRO: available-reps ( alist -- )
    reverse [ { } ] suffix
    '[ _ cond ] ;
 M: x86 %alien-vector-reps
    {
        { sse? { float-4-rep } }
        { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
    } available-reps ;
 M: x86 %zero-vector
    {
        { double-2-rep [ dup XORPD ] }
@ -673,11 +684,9 @@ M: x86 %gather-vector-2-reps
        [ dupd SHUFPD ]
    } case ;
-: float-4-shuffle ( dst shuffle -- )
+: sse1-float-4-shuffle ( dst shuffle -- )
    {
        { { 0 1 2 3 } [ drop ] }
        { { 0 0 2 2 } [ dup MOVSLDUP ] }
        { { 1 1 3 3 } [ dup MOVSHDUP ] }
        { { 0 1 0 1 } [ dup MOVLHPS ] }
        { { 2 3 2 3 } [ dup MOVHLPS ] }
        { { 0 0 1 1 } [ dup UNPCKLPS ] }
@ -685,6 +694,15 @@ M: x86 %gather-vector-2-reps
        [ dupd SHUFPS ]
    } case ;
 : float-4-shuffle ( dst shuffle -- )
    sse3? [
        {
            { { 0 0 2 2 } [ dup MOVSLDUP ] }
            { { 1 1 3 3 } [ dup MOVSHDUP ] }
            [ sse1-float-4-shuffle ]
        } case
    ] [ sse1-float-4-shuffle ] if ;
 : int-4-shuffle ( dst shuffle -- )
    {
        { { 0 1 2 3 } [ drop ] }
@ -698,7 +716,7 @@ M: x86 %gather-vector-2-reps
 : longlong-2-shuffle ( dst shuffle -- )
    first2 [ 2 * dup 1 + ] bi@ 4array int-4-shuffle ;
-M:: x86 %shuffle-vector ( dst src shuffle rep -- )
+M:: x86 %shuffle-vector-imm ( dst src shuffle rep -- )
    dst src rep %copy
    dst shuffle rep unsign-rep {
        { double-2-rep [ double-2-shuffle ] }
@ -707,12 +725,20 @@ M:: x86 %shuffle-vector ( dst src shuffle rep -- )
        { longlong-2-rep [ longlong-2-shuffle ] }
    } case ;
-M: x86 %shuffle-vector-reps
+M: x86 %shuffle-vector-imm-reps
    {
        { sse? { float-4-rep } }
        { sse2? { double-2-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
    } available-reps ;
 M: x86 %shuffle-vector ( dst src shuffle rep -- )
    two-operand PSHUFB ;
 M: x86 %shuffle-vector-reps
    {
        { ssse3? { float-4-rep double-2-rep longlong-2-rep ulonglong-2-rep int-4-rep uint-4-rep short-8-rep ushort-8-rep char-16-rep uchar-16-rep } }
    } available-reps ;
 M: x86 %merge-vector-head
    [ two-operand ] keep
    unsign-rep {
@ -790,8 +816,6 @@ M: x86 %unpack-vector-head-reps ( -- reps )
        { sse4.1? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } }
    } available-reps ;
 M: x86 %unpack-vector-tail-reps ( -- reps ) { } ;
 M: x86 %integer>float-vector ( dst src rep -- )
    {
        { int-4-rep [ CVTDQ2PS ] }
@ -1037,10 +1061,6 @@ M: x86 %mul-vector-reps
        { sse4.1? { int-4-rep uint-4-rep } }
    } available-reps ;
 M: x86 %saturated-mul-vector-reps
    ! No multiplication with saturation on x86
    { } ;
 M: x86 %div-vector ( dst src1 src2 rep -- )
    [ two-operand ] keep
    {
@ -1223,8 +1243,6 @@ M: x86 %xor-vector-reps
        { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } }
    } available-reps ;
 M: x86 %not-vector-reps { } ;
 M: x86 %shl-vector ( dst src1 src2 rep -- )
    [ two-operand ] keep
    {
@ -1271,6 +1289,30 @@ M:: x86 %scalar>integer ( dst src rep -- )
        { uint-scalar-rep [
            dst 32-bit-version-of src MOVD
        ] }
        { short-scalar-rep [
            dst 32-bit-version-of src MOVD
            dst dst 16-bit-version-of MOVSX
        ] }
        { ushort-scalar-rep [
            dst 32-bit-version-of src MOVD
            dst dst 16-bit-version-of MOVZX
        ] }
        { char-scalar-rep [
            dst 32-bit-version-of src MOVD
            dst { } 8 [| tmp-dst |
                tmp-dst dst int-rep %copy
                tmp-dst tmp-dst 8-bit-version-of MOVSX
                dst tmp-dst int-rep %copy
            ] with-small-register
        ] }
        { uchar-scalar-rep [
            dst 32-bit-version-of src MOVD
            dst { } 8 [| tmp-dst |
                tmp-dst dst int-rep %copy
                tmp-dst tmp-dst 8-bit-version-of MOVZX
                dst tmp-dst int-rep %copy
            ] with-small-register
        ] }
    } case ;
 M: x86 %vector>scalar %copy ;
--- a/basis/math/vectors/simd/functor/functor.factor
+++ b/basis/math/vectors/simd/functor/functor.factor
@ -60,7 +60,7 @@ MACRO: simd-boa ( rep class -- simd-array )
    [ rep-components ] [ new ] [ '[ _ ] ] tri* swap replicate-as ; inline
 : simd-with/nth-fast? ( rep -- ? )
-    [ \ (simd-vshuffle) supported-simd-op? ]
+    [ \ (simd-vshuffle-elements) supported-simd-op? ]
    [ rep-component-type can-be-unboxed? ]
    bi and ;
@ -184,6 +184,8 @@ WHERE
 TUPLE: A
 { underlying byte-array read-only initial: $[ 16 <byte-array> ] } ;
 INSTANCE: A simd-128
 M: A clone underlying>> clone \ A boa ; inline
 M: A length drop N ; inline
@ -315,7 +317,7 @@ SLOT: underlying2
    class c:typedef ;
 : (define-simd-256) ( simd -- )
-    simd-ops get { vshuffle hlshift hrshift } unique assoc-diff >>ops
+    simd-ops get { vshuffle-elements vshuffle-bytes hlshift hrshift } unique assoc-diff >>ops
    [ define-simd ]
    [ [ class>> ] [ rep>> ] bi define-simd-256-type ] bi ;
@ -362,6 +364,8 @@ TUPLE: A
 { underlying1 byte-array initial: $[ 16 <byte-array> ] read-only }
 { underlying2 byte-array initial: $[ 16 <byte-array> ] read-only } ;
 INSTANCE: A simd-256
 M: A clone
    [ underlying1>> clone ] [ underlying2>> clone ] bi
    \ A boa ; inline
--- a/basis/math/vectors/simd/intrinsics/intrinsics.factor
+++ b/basis/math/vectors/simd/intrinsics/intrinsics.factor
@ -67,7 +67,8 @@ SIMD-OP: vlshift
 SIMD-OP: vrshift
 SIMD-OP: hlshift
 SIMD-OP: hrshift
-SIMD-OP: vshuffle
+SIMD-OP: vshuffle-elements
 SIMD-OP: vshuffle-bytes
 SIMD-OP: (vmerge-head)
 SIMD-OP: (vmerge-tail)
 SIMD-OP: v<=
@ -148,6 +149,9 @@ GENERIC# supported-simd-op? 1 ( rep intrinsic -- ? )
    union
    { uchar-16-rep ushort-8-rep uint-4-rep ulonglong-2-rep } union ;
 : (%shuffle-imm-reps) ( -- reps )
    %shuffle-vector-reps %shuffle-vector-imm-reps union ;
 M: vector-rep supported-simd-op?
    {
        { \ (simd-v+)            [ %add-vector-reps            ] }
@ -179,7 +183,8 @@ M: vector-rep supported-simd-op?
        { \ (simd-vrshift)       [ %shr-vector-reps            ] }
        { \ (simd-hlshift)       [ %horizontal-shl-vector-reps ] }
        { \ (simd-hrshift)       [ %horizontal-shr-vector-reps ] }
-        { \ (simd-vshuffle)      [ %shuffle-vector-reps        ] }
+        { \ (simd-vshuffle-elements) [ (%shuffle-imm-reps)         ] }
        { \ (simd-vshuffle-bytes)    [ %shuffle-vector-reps        ] }
        { \ (simd-(vmerge-head)) [ %merge-vector-reps          ] }
        { \ (simd-(vmerge-tail)) [ %merge-vector-reps          ] }
        { \ (simd-(v>float))        [ %integer>float-vector-reps ] }
--- a/basis/math/vectors/simd/simd-docs.factor
+++ b/basis/math/vectors/simd/simd-docs.factor
@ -41,7 +41,21 @@ $nl
    POSTPONE: SIMD:
    POSTPONE: SIMDS:
 }
-"The following vector types are supported:"
+"The following scalar types are supported:"
 { $code
    "char"
    "uchar"
    "short"
    "ushort"
    "int"
    "uint"
    "longlong"
    "ulonglong"
    "float"
    "double"
 }
 "The following vector types are generated from the above scalar types:"
 { $code
    "char-16"
    "uchar-16"
@ -89,6 +103,7 @@ $nl
 { $code
 """USING: compiler.tree.debugger math.vectors
 math.vectors.simd ;
 SIMD: double
 SYMBOLS: x y ;
 [
@ -107,7 +122,7 @@ IN: simd-demo
    { float-4 float-4 float-4 } declare
    [ v* ] [ [ 1.0 ] dip n-v v* ] bi-curry* bi v+ ;
-\ interpolate optimizer-report.""" }
+\\ interpolate optimizer-report.""" }
 "Note that using " { $link declare } " is not recommended. Safer ways of getting type information for the input parameters to a word include defining methods on a generic word (the value being dispatched upon has a statically known type in the method body), as well as using " { $link "hints" } " and " { $link POSTPONE: inline } " declarations."
 $nl
 "Here is a better version of the " { $snippet "interpolate" } " words above that uses hints:"
@ -122,7 +137,7 @@ IN: simd-demo
 HINTS: interpolate float-4 float-4 float-4 ;
-\ interpolate optimizer-report. """ }
+\\ interpolate optimizer-report. """ }
 "This time, the optimizer report lists calls to both SIMD primitives and high-level vector words, because hints cause two code paths to be generated. The " { $snippet "optimized." } " word can be used to make sure that the fast code path consists entirely of calls to primitives."
 $nl
 "If the " { $snippet "interpolate" } " word was to be used in several places with different types of vectors, it would be best to declare it " { $link POSTPONE: inline } "."
@ -153,13 +168,13 @@ M: actor advance ( dt actor -- )
    [ >float ] dip
    [ update-velocity ] [ update-position ] 2bi ;
-M\ actor advance optimized."""
+M\\ actor advance optimized."""
 }
 "The " { $vocab-link "compiler.cfg.debugger" } " vocabulary can give a lower-level picture of the generated code, that includes register assignments and other low-level details. To look at low-level optimizer output, call " { $snippet "test-mr mr." } " on a word or quotation:"
 { $code
 """USE: compiler.tree.debugger
-M\ actor advance test-mr mr.""" }
+M\\ actor advance test-mr mr.""" }
 "An example of a high-performance algorithm that uses SIMD primitives can be found in the " { $vocab-link "benchmark.nbody-simd" } " vocabulary." ;
 ARTICLE: "math.vectors.simd.intrinsics" "Low-level SIMD primitives"
@ -206,7 +221,7 @@ ARTICLE: "math.vectors.simd" "Hardware vector arithmetic (SIMD)"
 HELP: SIMD:
 { $syntax "SIMD: type" }
 { $values { "type" "a scalar C type" } }
-{ $description "Defines 128-bit and 256-bit SIMD arrays for holding elements of " { $snippet "type" } " into the vocabulary search path. The possible type/length combinations are listed in " { $link "math.vectors.simd.types" } " and the generated words are documented in " { $link "math.vectors.simd.words" } "." } ;
+{ $description "Defines 128-bit and 256-bit SIMD arrays for holding elements of " { $snippet "type" } " into the vocabulary search path. The allowed scalar types, and the auto-generated type/length vector combinations that result, are listed in " { $link "math.vectors.simd.types" } ". Generated words are documented in " { $link "math.vectors.simd.words" } "." } ;
 HELP: SIMDS:
 { $syntax "SIMDS: type type type ... ;" }
--- a/basis/math/vectors/simd/simd-tests.factor
+++ b/basis/math/vectors/simd/simd-tests.factor
@ -174,7 +174,7 @@ CONSTANT: simd-classes
 : remove-special-words ( alist -- alist' )
    ! These have their own tests later
    {
-        hlshift hrshift vshuffle vbroadcast
+        hlshift hrshift vshuffle-bytes vshuffle-elements vbroadcast
        vany? vall? vnone?
        (v>float) (v>integer)
        (vpack-signed) (vpack-unsigned)
@ -360,6 +360,23 @@ simd-classes [
    ] unit-test
 ] each
 "== Checking variable shuffles" print
 : random-shift-vector ( class -- vec )
    new [ drop 16 random ] map ;
 :: test-shift-vector ( class -- ? )
    class random-int-vector :> src
    char-16 random-shift-vector :> perm
    { class char-16 } :> decl
    src perm vshuffle
    src perm [ decl declare vshuffle ] compile-call
    = ; inline
 { char-16 uchar-16 short-8 ushort-8 int-4 uint-4 longlong-2 ulonglong-2 }
 [ 10 swap '[ [ t ] [ _ test-shift-vector ] unit-test ] times ] each
 "== Checking vector tests" print
 :: test-vector-tests-bool ( vector declaration -- none? any? all? )
@ -512,38 +529,38 @@ SYMBOL: !!inconsistent!!
 STRUCT: simd-struct
 { x float-4 }
-{ y double-2 }
+{ y longlong-2 }
 { z double-4 }
-{ w float-8 } ;
+{ w int-8 } ;
 [ t ] [ [ simd-struct <struct> ] compile-call >c-ptr [ 0 = ] all? ] unit-test
 [
    float-4{ 1 2 3 4 }
-    double-2{ 2 1 }
+    longlong-2{ 2 1 }
    double-4{ 4 3 2 1 }
-    float-8{ 1 2 3 4 5 6 7 8 }
+    int-8{ 1 2 3 4 5 6 7 8 }
 ] [
    simd-struct <struct>
    float-4{ 1 2 3 4 } >>x
-    double-2{ 2 1 } >>y
+    longlong-2{ 2 1 } >>y
    double-4{ 4 3 2 1 } >>z
-    float-8{ 1 2 3 4 5 6 7 8 } >>w
+    int-8{ 1 2 3 4 5 6 7 8 } >>w
    { [ x>> ] [ y>> ] [ z>> ] [ w>> ] } cleave
 ] unit-test
 [
    float-4{ 1 2 3 4 }
-    double-2{ 2 1 }
+    longlong-2{ 2 1 }
    double-4{ 4 3 2 1 }
-    float-8{ 1 2 3 4 5 6 7 8 }
+    int-8{ 1 2 3 4 5 6 7 8 }
 ] [
    [
        simd-struct <struct>
        float-4{ 1 2 3 4 } >>x
-        double-2{ 2 1 } >>y
+        longlong-2{ 2 1 } >>y
        double-4{ 4 3 2 1 } >>z
-        float-8{ 1 2 3 4 5 6 7 8 } >>w
+        int-8{ 1 2 3 4 5 6 7 8 } >>w
        { [ x>> ] [ y>> ] [ z>> ] [ w>> ] } cleave
    ] compile-call
 ] unit-test
--- a/basis/math/vectors/specialization/specialization.factor
+++ b/basis/math/vectors/specialization/specialization.factor
@ -98,7 +98,8 @@ H{
    { vrshift { +vector+ +scalar+ -> +vector+ } }
    { hlshift { +vector+ +literal+ -> +vector+ } }
    { hrshift { +vector+ +literal+ -> +vector+ } }
-    { vshuffle { +vector+ +literal+ -> +vector+ } }
+    { vshuffle-elements { +vector+ +literal+ -> +vector+ } }
    { vshuffle-bytes    { +vector+ +vector+  -> +vector+ } }
    { vbroadcast { +vector+ +literal+ -> +vector+ } }
    { (vmerge-head) { +vector+ +vector+ -> +vector+ } }
    { (vmerge-tail) { +vector+ +vector+ -> +vector+ } }
@ -162,7 +163,7 @@ ERROR: bad-vector-word word ;
    } cond
    ! Don't specialize horizontal shifts, shuffles, and conversions at all, they're only for SIMD
    {
-        hlshift hrshift vshuffle vbroadcast
+        hlshift hrshift vshuffle-elements vshuffle-bytes vbroadcast
        (v>integer) (v>float)
        (vpack-signed) (vpack-unsigned)
        (vunpack-head) (vunpack-tail)
--- a/basis/math/vectors/vectors.factor
+++ b/basis/math/vectors/vectors.factor
@ -6,6 +6,9 @@ locals ;
 QUALIFIED-WITH: alien.c-types c
 IN: math.vectors
 MIXIN: simd-128
 MIXIN: simd-256
 GENERIC: element-type ( obj -- c-type )
 M: object element-type drop f ; inline
@ -83,7 +86,20 @@ PRIVATE>
 : vbitnot ( u -- w ) dup '[ _ [ bitnot ] fp-bitwise-unary ] map ;
 :: vbroadcast ( u n -- v ) u length n u nth <repetition> u like ;
-: vshuffle ( u perm -- v ) swap [ '[ _ nth ] ] keep map-as ;
+
 : vshuffle-elements ( u perm -- v )
    swap [ '[ _ nth ] ] keep map-as ;
 : vshuffle-bytes ( u perm -- v )
    underlying>> [
        swap [ '[ _ nth ] ] keep map-as
    ] curry change-underlying ;
 GENERIC: vshuffle ( u perm -- v )
 M: array vshuffle ( u perm -- v )
    vshuffle-elements ; inline
 M: simd-128 vshuffle ( u perm -- v )
    vshuffle-bytes ; inline
 : vlshift ( u n -- w ) '[ _ shift ] map ;
 : vrshift ( u n -- w ) neg '[ _ shift ] map ;
@ -107,9 +123,9 @@ PRIVATE>
 : vxor ( u v -- w )  over '[ [ _ element>bool ] bi@ xor ] 2map ;
 : vnot ( u -- w )    dup '[ _ element>bool not ] map ;
-: vall? ( v -- ? ) [ ] all? ;
+: vall? ( v -- ? ) dup '[ _ element>bool ] all? ;
-: vany? ( v -- ? ) [ ] any? ;
+: vany? ( v -- ? ) dup '[ _ element>bool ] any? ;
-: vnone? ( v -- ? ) [ not ] all? ;
+: vnone? ( v -- ? ) dup '[ _ element>bool not ] all? ;
 : v<  ( u v -- w ) [ <   ] 2map ;
 : v<= ( u v -- w ) [ <=  ] 2map ;
--- a/basis/specialized-arrays/specialized-arrays.factor
+++ b/basis/specialized-arrays/specialized-arrays.factor
@ -2,7 +2,8 @@
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors alien alien.c-types alien.data alien.parser
 assocs byte-arrays classes compiler.units functors kernel lexer
-libc math math.vectors math.vectors.specialization namespaces
+libc math math.vectors math.vectors.private
 math.vectors.specialization namespaces
 parser prettyprint.custom sequences sequences.private strings
 summary vocabs vocabs.loader vocabs.parser vocabs.generated
 words fry combinators present ;
@ -68,6 +69,8 @@ TUPLE: A
    [ drop \ T bad-byte-array-length ] unless
    <direct-A> ; inline
 M: A new-underlying drop byte-array>A ;
 M: A clone [ underlying>> clone ] [ length>> ] bi <direct-A> ; inline
 M: A length length>> ; inline
--- a/core/classes/tuple/tuple-docs.factor
+++ b/core/classes/tuple/tuple-docs.factor
@ -129,17 +129,18 @@ ARTICLE: "tuple-inheritance-example" "Tuple subclassing example"
 }
 "Rectangles and parallelograms use the same algorithm for computing the area, whereas they use different algorithms for computing perimiter. Also, rectangles and parallelograms both have " { $snippet "width" } " and " { $snippet "height" } " slots. We can exploit this with subclassing:"
 { $code
    "USING: accessors kernel math math.constants math.functions ;"
    "GENERIC: area ( shape -- n )"
    "GENERIC: perimiter ( shape -- n )"
    ""
    "TUPLE: shape ;"
    ""
    "TUPLE: circle < shape radius ;"
-    "M: area circle radius>> sq pi * ;"
+    "M: circle area radius>> sq pi * ;"
-    "M: perimiter circle radius>> 2 * pi * ;"
+    "M: circle perimiter radius>> 2 * pi * ;"
    ""
-    "TUPLE: quad < shape width height"
+    "TUPLE: quad < shape width height ;"
-    "M: area quad [ width>> ] [ height>> ] bi * ;"
+    "M: quad area [ width>> ] [ height>> ] bi * ;"
    ""
    "TUPLE: rectangle < quad ;"
    "M: rectangle perimiter [ width>> 2 * ] [ height>> 2 * ] bi + ;"
--- a/extra/math/matrices/simd/simd-tests.factor
+++ b/extra/math/matrices/simd/simd-tests.factor
@ -52,10 +52,10 @@ IN: math.matrices.simd.tests
 [ 
    S{ matrix4 f
        float-4-array{
-            float-4{ 1.0 0.0 0.0 3.0 }
+            float-4{ 1.0 0.0 0.0 0.0 }
-            float-4{ 0.0 1.0 0.0 4.0 }
+            float-4{ 0.0 1.0 0.0 0.0 }
-            float-4{ 0.0 0.0 1.0 2.0 }
+            float-4{ 0.0 0.0 1.0 0.0 }
-            float-4{ 0.0 0.0 0.0 1.0 }
+            float-4{ 3.0 4.0 2.0 1.0 }
        }
    }
 ] [ float-4{ 3.0 4.0 2.0 0.0 } translation-matrix4 ] unit-test
@ -77,9 +77,9 @@ IN: math.matrices.simd.tests
    float-4{ 0.0 1.0 0.0 1.0 } pi 1/2. * rotation-matrix4
    S{ matrix4 f
        float-4-array{
-            float-4{  0.0  0.0  1.0 0.0 }
+            float-4{  0.0  0.0 -1.0 0.0 }
            float-4{  0.0  1.0  0.0 0.0 }
-            float-4{ -1.0  0.0  0.0 0.0 }
+            float-4{  1.0  0.0  0.0 0.0 }
            float-4{  0.0  0.0  0.0 1.0 }
        }
    }
@ -89,10 +89,10 @@ IN: math.matrices.simd.tests
 [
    S{ matrix4 f
        float-4-array{
-            float-4{ 2.0 0.0 0.0 10.0 }
+            float-4{  2.0  0.0  0.0  0.0 }
-            float-4{ 0.0 3.0 0.0 18.0 }
+            float-4{  0.0  3.0  0.0  0.0 }
-            float-4{ 0.0 0.0 4.0 28.0 }
+            float-4{  0.0  0.0  4.0  0.0 }
-            float-4{ 0.0 0.0 0.0  1.0 }
+            float-4{ 10.0 18.0 28.0  1.0 }
        }
    }
 ] [
@ -106,10 +106,10 @@ IN: math.matrices.simd.tests
    }
    S{ matrix4 f
        float-4-array{
-            float-4{ 1.0 0.0 0.0 5.0 }
+            float-4{ 1.0 0.0 0.0 0.0 }
-            float-4{ 0.0 1.0 0.0 6.0 }
+            float-4{ 0.0 1.0 0.0 0.0 }
-            float-4{ 0.0 0.0 1.0 7.0 }
+            float-4{ 0.0 0.0 1.0 0.0 }
-            float-4{ 0.0 0.0 0.0 1.0 }
+            float-4{ 5.0 6.0 7.0 1.0 }
        }
    }
    m4.
@ -118,10 +118,10 @@ IN: math.matrices.simd.tests
 [
    S{ matrix4 f
        float-4-array{
-            float-4{ 3.0 0.0 0.0 5.0 }
+            float-4{ 3.0 0.0 0.0 0.0 }
-            float-4{ 0.0 4.0 0.0 6.0 }
+            float-4{ 0.0 4.0 0.0 0.0 }
-            float-4{ 0.0 0.0 5.0 7.0 }
+            float-4{ 0.0 0.0 5.0 0.0 }
-            float-4{ 0.0 0.0 0.0 2.0 }
+            float-4{ 5.0 6.0 7.0 2.0 }
        }
    }
 ] [
@ -135,10 +135,10 @@ IN: math.matrices.simd.tests
    }
    S{ matrix4 f
        float-4-array{
-            float-4{ 1.0 0.0 0.0 5.0 }
+            float-4{ 1.0 0.0 0.0 0.0 }
-            float-4{ 0.0 1.0 0.0 6.0 }
+            float-4{ 0.0 1.0 0.0 0.0 }
-            float-4{ 0.0 0.0 1.0 7.0 }
+            float-4{ 0.0 0.0 1.0 0.0 }
-            float-4{ 0.0 0.0 0.0 1.0 }
+            float-4{ 5.0 6.0 7.0 1.0 }
        }
    }
    m4+
@ -147,10 +147,10 @@ IN: math.matrices.simd.tests
 [
    S{ matrix4 f
        float-4-array{
-            float-4{ 1.0 0.0 0.0 -5.0 }
+            float-4{  1.0  0.0  0.0 0.0 }
-            float-4{ 0.0 2.0 0.0 -6.0 }
+            float-4{  0.0  2.0  0.0 0.0 }
-            float-4{ 0.0 0.0 3.0 -7.0 }
+            float-4{  0.0  0.0  3.0 0.0 }
-            float-4{ 0.0 0.0 0.0  0.0 }
+            float-4{ -5.0 -6.0 -7.0 0.0 }
        }
    }
 ] [
@ -164,10 +164,10 @@ IN: math.matrices.simd.tests
    }
    S{ matrix4 f
        float-4-array{
-            float-4{ 1.0 0.0 0.0 5.0 }
+            float-4{ 1.0 0.0 0.0 0.0 }
-            float-4{ 0.0 1.0 0.0 6.0 }
+            float-4{ 0.0 1.0 0.0 0.0 }
-            float-4{ 0.0 0.0 1.0 7.0 }
+            float-4{ 0.0 0.0 1.0 0.0 }
-            float-4{ 0.0 0.0 0.0 1.0 }
+            float-4{ 5.0 6.0 7.0 1.0 }
        }
    }
    m4-
@ -219,10 +219,10 @@ IN: math.matrices.simd.tests
 [
    S{ matrix4 f
        float-4-array{
-            float-4{ 1/2. 0.0   0.0    0.0  }
+            float-4{ 1/2. 0.0   0.0   0.0 }
-            float-4{ 0.0  1/2.  0.0    0.0  }
+            float-4{ 0.0  1/2.  0.0   0.0 }
-            float-4{ 0.0  0.0  -6/4. -10/4. }
+            float-4{ 0.0  0.0  -6/4. -1.0 }
-            float-4{ 0.0  0.0  -1.0    0.0  }
+            float-4{ 0.0  0.0 -10/4.  0.0 }
        }
    }
 ] [
--- a/extra/math/matrices/simd/simd.factor
+++ b/extra/math/matrices/simd/simd.factor
@ -9,34 +9,34 @@ SPECIALIZED-ARRAY: float-4
 IN: math.matrices.simd
 STRUCT: matrix4
-    { rows float-4[4] } ;
+    { columns float-4[4] } ;
 INSTANCE: matrix4 immutable-sequence
 M: matrix4 length drop 4 ; inline
-M: matrix4 nth-unsafe rows>> nth-unsafe ; inline
+M: matrix4 nth-unsafe columns>> nth-unsafe ; inline
 M: matrix4 new-sequence 2drop matrix4 (struct) ; inline
 <PRIVATE
-: rows ( a -- a1 a2 a3 a4 )
+: columns ( a -- a1 a2 a3 a4 )
-    rows>> 4 firstn ; inline
+    columns>> 4 firstn ; inline
-:: set-rows ( c1 c2 c3 c4 c -- c )
+:: set-columns ( c1 c2 c3 c4 c -- c )
-    c rows>> :> rows
+    c columns>> :> columns
-    c1 rows set-first
+    c1 columns set-first
-    c2 rows set-second
+    c2 columns set-second
-    c3 rows set-third
+    c3 columns set-third
-    c4 rows set-fourth
+    c4 columns set-fourth
    c ; inline
 : make-matrix4 ( quot: ( -- c1 c2 c3 c4 ) -- c )
-    matrix4 (struct) swap dip set-rows ; inline
+    matrix4 (struct) swap dip set-columns ; inline
-:: 2map-rows ( a b quot -- c )
+:: 2map-columns ( a b quot -- c )
    [
-        a rows :> a4 :> a3 :> a2 :> a1
+        a columns :> a4 :> a3 :> a2 :> a1
-        b rows :> b4 :> b3 :> b2 :> b1
+        b columns :> b4 :> b3 :> b2 :> b1
        a1 b1 quot call
        a2 b2 quot call
@ -44,57 +44,57 @@ M: matrix4 new-sequence 2drop matrix4 (struct) ; inline
        a4 b4 quot call
    ] make-matrix4 ; inline
-: map-rows ( a quot -- c )
+: map-columns ( a quot -- c )
-    '[ rows _ 4 napply ] make-matrix4 ; inline
+    '[ columns _ 4 napply ] make-matrix4 ; inline
 PRIVATE>
-TYPED: m4+ ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v+ ] 2map-rows ;
+TYPED: m4+ ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v+ ] 2map-columns ;
-TYPED: m4- ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v- ] 2map-rows ;
+TYPED: m4- ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v- ] 2map-columns ;
-TYPED: m4* ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v* ] 2map-rows ;
+TYPED: m4* ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v* ] 2map-columns ;
-TYPED: m4/ ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v/ ] 2map-rows ;
+TYPED: m4/ ( a: matrix4 b: matrix4 -- c: matrix4 ) [ v/ ] 2map-columns ;
-TYPED: m4*n ( a: matrix4 b: float -- c: matrix4 ) [ v*n ] curry map-rows ;
+TYPED: m4*n ( a: matrix4 b: float -- c: matrix4 ) [ v*n ] curry map-columns ;
-TYPED: m4/n ( a: matrix4 b: float -- c: matrix4 ) [ v/n ] curry map-rows ;
+TYPED: m4/n ( a: matrix4 b: float -- c: matrix4 ) [ v/n ] curry map-columns ;
-TYPED: n*m4 ( a: float b: matrix4 -- c: matrix4 ) [ n*v ] with map-rows ;
+TYPED: n*m4 ( a: float b: matrix4 -- c: matrix4 ) [ n*v ] with map-columns ;
-TYPED: n/m4 ( a: float b: matrix4 -- c: matrix4 ) [ n/v ] with map-rows ;
+TYPED: n/m4 ( a: float b: matrix4 -- c: matrix4 ) [ n/v ] with map-columns ;
 TYPED:: m4. ( a: matrix4 b: matrix4 -- c: matrix4 )
    [
-        a rows :> a4 :> a3 :> a2 :> a1
+        a columns :> a4 :> a3 :> a2 :> a1
-        b rows :> b4 :> b3 :> b2 :> b1
+        b columns :> b4 :> b3 :> b2 :> b1
-        a1 first  b1 n*v :> c1a
+        b1 first  a1 n*v :> c1a
-        a2 first  b1 n*v :> c2a
+        b2 first  a1 n*v :> c2a
-        a3 first  b1 n*v :> c3a
+        b3 first  a1 n*v :> c3a
-        a4 first  b1 n*v :> c4a
+        b4 first  a1 n*v :> c4a
-        a1 second b2 n*v c1a v+ :> c1b 
+        b1 second a2 n*v c1a v+ :> c1b 
-        a2 second b2 n*v c2a v+ :> c2b
+        b2 second a2 n*v c2a v+ :> c2b
-        a3 second b2 n*v c3a v+ :> c3b
+        b3 second a2 n*v c3a v+ :> c3b
-        a4 second b2 n*v c4a v+ :> c4b
+        b4 second a2 n*v c4a v+ :> c4b
-        a1 third  b3 n*v c1b v+ :> c1c 
+        b1 third  a3 n*v c1b v+ :> c1c 
-        a2 third  b3 n*v c2b v+ :> c2c
+        b2 third  a3 n*v c2b v+ :> c2c
-        a3 third  b3 n*v c3b v+ :> c3c
+        b3 third  a3 n*v c3b v+ :> c3c
-        a4 third  b3 n*v c4b v+ :> c4c
+        b4 third  a3 n*v c4b v+ :> c4c
-        a1 fourth b4 n*v c1c v+
+        b1 fourth a4 n*v c1c v+
-        a2 fourth b4 n*v c2c v+
+        b2 fourth a4 n*v c2c v+
-        a3 fourth b4 n*v c3c v+
+        b3 fourth a4 n*v c3c v+
-        a4 fourth b4 n*v c4c v+
+        b4 fourth a4 n*v c4c v+
    ] make-matrix4 ;
-TYPED:: v.m4 ( a: float-4 b: matrix4 -- c: float-4 )
+TYPED:: m4.v ( m: matrix4 v: float-4 -- v': float-4 )
-    b rows :> b4 :> b3 :> b2 :> b1
+    m columns :> m4 :> m3 :> m2 :> m1
-    a first  b1 n*v
+    v first  m1 n*v
-    a second b2 n*v v+
+    v second m2 n*v v+
-    a third  b3 n*v v+
+    v third  m3 n*v v+
-    a fourth b4 n*v v+ ;
+    v fourth m4 n*v v+ ;
-TYPED:: m4.v ( a: matrix4 b: float-4 -- c: float-4 )
+TYPED:: v.m4 ( v: float-4 m: matrix4 -- c: float-4 )
-    a rows [ b v. ] 4 napply float-4-boa ;
+    m columns [ v v. ] 4 napply float-4-boa ;
 CONSTANT: identity-matrix4
    S{ matrix4 f
@ -131,37 +131,37 @@ TYPED: diagonal-matrix4 ( diagonal: float-4 -- matrix: matrix4 )
    [ (vmerge) ] bi-curry@ bi* ; inline
 TYPED: transpose-matrix4 ( matrix: matrix4 -- matrix: matrix4 )
-    [ rows vmerge-transpose vmerge-transpose ] make-matrix4 ;
+    [ columns vmerge-transpose vmerge-transpose ] make-matrix4 ;
 : linear>homogeneous ( v -- v' )
    [ float-4{ t t t f } ] dip float-4{ 0.0 0.0 0.0 1.0 } v? ; inline
 : scale-matrix4 ( factors -- matrix )
-    [ float-4{ t t t f } ] dip float-4{ 0.0 0.0 0.0 1.0 } v?
+    linear>homogeneous diagonal-matrix4 ; inline
    diagonal-matrix4 ; inline
 : ortho-matrix4 ( factors -- matrix )
    float-4{ 1.0 1.0 1.0 1.0 } swap v/ scale-matrix4 ; inline
-TYPED:: translation-matrix4 ( offset: float-4 -- matrix: matrix4 )
+TYPED: translation-matrix4 ( offset: float-4 -- matrix: matrix4 )
    [
-        float-4{ 1.0 1.0 1.0 1.0 } :> diagonal
+        linear>homogeneous
-
+        [ 
-        offset 0 float-4-with (vmerge)
+            float-4{ 1.0 0.0 0.0 0.0 }
-        [ 0 float-4-with swap (vmerge) ] bi@ drop :> z :> y :> x
+            float-4{ 0.0 1.0 0.0 0.0 }
-
+            float-4{ 0.0 0.0 1.0 0.0 }
-        diagonal y vmerge-diagonal*
+        ] dip
        [ x vmerge-diagonal* ]
        [ z vmerge-diagonal* ] bi*
    ] make-matrix4 ;
 TYPED:: rotation-matrix4 ( axis: float-4 theta: float -- matrix: matrix4 )
-    !   x*x + c*(1.0 - x*x)   x*y*(1.0 - c) - s*z   x*z*(1.0 - c) + s*y   0
+    !   x*x + c*(1.0 - x*x)   x*y*(1.0 - c) + s*z   x*z*(1.0 - c) - s*y   0
-    !   x*y*(1.0 - c) + s*z   y*y + c*(1.0 - y*y)   y*z*(1.0 - c) - s*x   0
+    !   x*y*(1.0 - c) - s*z   y*y + c*(1.0 - y*y)   y*z*(1.0 - c) + s*x   0
-    !   x*z*(1.0 - c) - s*y   y*z*(1.0 - c) + s*x   z*z + c*(1.0 - z*z)   0
+    !   x*z*(1.0 - c) + s*y   y*z*(1.0 - c) - s*x   z*z + c*(1.0 - z*z)   0
    !   0                     0                     0                     1
    matrix4 (struct) :> triangle-m
    theta cos :> c
    theta sin :> s
-    float-4{  1.0 -1.0  1.0 0.0 } :> triangle-sign
+    float-4{ -1.0  1.0 -1.0 0.0 } :> triangle-sign
    c float-4-with :> cc
    s float-4-with :> ss
@ -184,7 +184,7 @@ TYPED:: rotation-matrix4 ( axis: float-4 theta: float -- matrix: matrix4 )
    triangle-lo { 1 0 3 3 } vshuffle
    float-4 new
-    triangle-m set-rows drop
+    triangle-m set-columns drop
    diagonal-m triangle-m m4+ ;
@ -194,8 +194,10 @@ TYPED:: frustum-matrix4 ( xy: float-4 near: float far: float -- matrix: matrix4
        float-4{ t t f f } xy near far - float-4-with v? ! denom
        v/ :> fov
-        fov 0.0 float-4-with (vmerge-head) vmerge-diagonal
+        float-4{ 0.0 -1.0 0.0 0.0 } :> negone
-        fov float-4{ f f t t } vand
+
-        float-4{ 0.0 0.0 -1.0 0.0 }
+        fov vmerge-diagonal
        [ vmerge-diagonal ]
        [ negone (vmerge) ] bi*
    ] make-matrix4 ;