simd intrinsic implementation for v*high, v*hs+, vavg, and vsad

2009-12-05 14:52:18 -08:00 · 2009-12-05 14:52:18 -08:00 · 104c29aabc
parent 1845915dc6
commit 104c29aabc
7 changed files with 134 additions and 1 deletion
--- a/basis/compiler/cfg/instructions/instructions.factor
+++ b/basis/compiler/cfg/instructions/instructions.factor
@ -382,6 +382,16 @@ def: dst
 use: src1 src2
 literal: rep ;

+! Vector instruction taking two vector inputs (src1, src2) and producing
+! dst; the vector representation is carried as the literal rep.
+! The x86 backend lowers it to PMULHW / PMULHUW.
+PURE-INSN: ##mul-high-vector
+def: dst
+use: src1 src2
+literal: rep ;
+
+! Vector instruction taking two vector inputs (src1, src2) and producing
+! dst; the vector representation is carried as the literal rep.
+! The x86 backend lowers it to PMADDUBSW / PMADDWD.
+PURE-INSN: ##mul-horizontal-add-vector
+def: dst
+use: src1 src2
+literal: rep ;
+
 PURE-INSN: ##saturated-mul-vector
 def: dst
 use: src1 src2
@ -402,11 +412,21 @@ def: dst
 use: src1 src2
 literal: rep ;

+! Vector instruction taking two vector inputs (src1, src2) and producing
+! dst; the vector representation is carried as the literal rep.
+! The x86 backend lowers it to PAVGB / PAVGW.
+PURE-INSN: ##avg-vector
+def: dst
+use: src1 src2
+literal: rep ;
+
 PURE-INSN: ##dot-vector
 def: dst/scalar-rep
 use: src1 src2
 literal: rep ;

+! Vector instruction taking two vector inputs (src1, src2) and producing
+! dst; the vector representation is carried as the literal rep.
+! Unlike ##dot-vector, dst is declared as a plain vector register here,
+! not dst/scalar-rep. The x86 backend lowers it to PSADBW.
+PURE-INSN: ##sad-vector
+def: dst
+use: src1 src2
+literal: rep ;
+
 PURE-INSN: ##horizontal-add-vector
 def: dst
 use: src1 src2
--- a/basis/compiler/cfg/intrinsics/simd/simd.factor
+++ b/basis/compiler/cfg/intrinsics/simd/simd.factor
@ -57,6 +57,12 @@ IN: compiler.cfg.intrinsics.simd
        { longlong-2-rep [ longlong-array{ -1 0 } underlying>> ^^load-constant ] }
    } case ;

+! Loads a constant vector whose every element is 0.5, for float-4-rep
+! or double-2-rep. There is no branch for any other rep, so this word
+! must only be called with one of those two.
+: ^load-half-vector ( rep -- dst )
+    {
+        { float-4-rep  [ float-array{  0.5 0.5 0.5 0.5 } underlying>> ^^load-constant ] }
+        { double-2-rep [ double-array{ 0.5 0.5 }         underlying>> ^^load-constant ] }
+    } case ;
+
 : >variable-shuffle ( shuffle rep -- shuffle' )
    rep-component-type heap-size
    [ dup <repetition> >byte-array ]
@ -336,6 +342,16 @@ PREDICATE: fixnum-vector-rep < int-vector-rep
        [ ^^mul-vector ]
    } emit-vv-vector-op ;

+! Intrinsic emitter for (simd-v*high): the only candidate handed to
+! emit-vv-vector-op is a direct ^^mul-high-vector.
+: emit-simd-v*high ( node -- )
+    {
+        [ ^^mul-high-vector ]
+    } emit-vv-vector-op ;
+
+! Intrinsic emitter for (simd-v*hs+): the only candidate handed to
+! emit-vv-vector-op is a direct ^^mul-horizontal-add-vector.
+: emit-simd-v*hs+ ( node -- )
+    {
+        [ ^^mul-horizontal-add-vector ]
+    } emit-vv-vector-op ;
+
 : emit-simd-v/ ( node -- )
    {
        [ ^^div-vector ]
@ -359,12 +375,26 @@ PREDICATE: fixnum-vector-rep < int-vector-rep
        ]
    } emit-vv-vector-op ;

+! Intrinsic emitter for (simd-vavg). Two candidates are handed to
+! emit-vv-vector-op: a direct ^^avg-vector, and a float-vector-rep
+! entry that computes (src1 + src2) * 0.5 from ^^add-vector,
+! ^load-half-vector and ^^mul-vector.
+! NOTE(review): presumably the float entry is chosen when ^^avg-vector
+! is not available for the rep -- emit-vv-vector-op is not shown here.
+: emit-simd-vavg ( node -- )
+    {
+        [ ^^avg-vector ]
+        { float-vector-rep [| src1 src2 rep |
+            src1 src2 rep ^^add-vector
+            rep ^load-half-vector rep ^^mul-vector
+        ] }
+    } emit-vv-vector-op ;
+
 : emit-simd-v. ( node -- )
    {
        [ ^^dot-vector ]
        { float-vector-rep [ [ ^^mul-vector ] [ ^sum-vector ] bi ] }
    } emit-vv-vector-op ;

+! Intrinsic emitter for (simd-vsad), the sum of absolute differences.
+! On x86 ##sad-vector is PSADBW, which leaves one partial sum in each
+! 64-bit half of the 128-bit result. Both halves must be added together
+! before the low lane is extracted; reading the low lane alone would
+! drop the differences of the upper eight bytes.
+: emit-simd-vsad ( node -- )
+    {
+        [
+            [
+                ^^sad-vector
+                ! Swap the two 64-bit halves (viewed as four 32-bit lanes)
+                ! and add, so that lane 0 holds the complete sum.
+                dup { 2 3 0 1 } int-4-rep ^^shuffle-vector-imm
+                int-4-rep ^^add-vector
+            ]
+            [ widen-vector-rep ^^vector>scalar ] bi
+        ]
+    } emit-vv-vector-op ;
+
 : emit-simd-vsqrt ( node -- )
    {
        [ ^^sqrt-vector ]
@ -580,10 +610,14 @@ PREDICATE: fixnum-vector-rep < int-vector-rep
        { (simd-vs-)               [ emit-simd-vs-                 ] }
        { (simd-vs*)               [ emit-simd-vs*                 ] }
        { (simd-v*)                [ emit-simd-v*                  ] }
+        { (simd-v*high)            [ emit-simd-v*high              ] }
+        { (simd-v*hs+)             [ emit-simd-v*hs+               ] }
        { (simd-v/)                [ emit-simd-v/                  ] }
        { (simd-vmin)              [ emit-simd-vmin                ] }
        { (simd-vmax)              [ emit-simd-vmax                ] }
+        { (simd-vavg)              [ emit-simd-vavg                ] }
        { (simd-v.)                [ emit-simd-v.                  ] }
+        { (simd-vsad)              [ emit-simd-vsad                ] }
        { (simd-vsqrt)             [ emit-simd-vsqrt               ] }
        { (simd-sum)               [ emit-simd-sum                 ] }
        { (simd-vabs)              [ emit-simd-vabs                ] }
--- a/basis/compiler/codegen/codegen.factor
+++ b/basis/compiler/codegen/codegen.factor
@ -173,11 +173,15 @@ CODEGEN: ##add-sub-vector %add-sub-vector
 CODEGEN: ##sub-vector %sub-vector
 CODEGEN: ##saturated-sub-vector %saturated-sub-vector
 CODEGEN: ##mul-vector %mul-vector
+CODEGEN: ##mul-high-vector %mul-high-vector
+CODEGEN: ##mul-horizontal-add-vector %mul-horizontal-add-vector
 CODEGEN: ##saturated-mul-vector %saturated-mul-vector
 CODEGEN: ##div-vector %div-vector
 CODEGEN: ##min-vector %min-vector
 CODEGEN: ##max-vector %max-vector
+CODEGEN: ##avg-vector %avg-vector
 CODEGEN: ##dot-vector %dot-vector
+CODEGEN: ##sad-vector %sad-vector
 CODEGEN: ##sqrt-vector %sqrt-vector
 CODEGEN: ##horizontal-add-vector %horizontal-add-vector
 CODEGEN: ##horizontal-sub-vector %horizontal-sub-vector
--- a/basis/compiler/tree/propagation/simd/simd.factor
+++ b/basis/compiler/tree/propagation/simd/simd.factor
@ -16,9 +16,12 @@ CONSTANT: vector>vector-intrinsics
        (simd-vs-)
        (simd-vs*)
        (simd-v*)
+        (simd-v*high)
+        (simd-v*hs+)
        (simd-v/)
        (simd-vmin)
        (simd-vmax)
+        (simd-vavg)
        (simd-vsqrt)
        (simd-vabs)
        (simd-vbitand)
@ -60,6 +63,7 @@ CONSTANT: vector>vector-intrinsics
 CONSTANT: vector-other-intrinsics
    {
        (simd-v.)
+        (simd-vsad)
        (simd-sum)
        (simd-vany?)
        (simd-vall?)
--- a/basis/cpu/architecture/architecture.factor
+++ b/basis/cpu/architecture/architecture.factor
@ -283,11 +283,15 @@ HOOK: %add-sub-vector cpu ( dst src1 src2 rep -- )
 HOOK: %sub-vector cpu ( dst src1 src2 rep -- )
 HOOK: %saturated-sub-vector cpu ( dst src1 src2 rep -- )
 HOOK: %mul-vector cpu ( dst src1 src2 rep -- )
+HOOK: %mul-high-vector cpu ( dst src1 src2 rep -- )
+HOOK: %mul-horizontal-add-vector cpu ( dst src1 src2 rep -- )
 HOOK: %saturated-mul-vector cpu ( dst src1 src2 rep -- )
 HOOK: %div-vector cpu ( dst src1 src2 rep -- )
 HOOK: %min-vector cpu ( dst src1 src2 rep -- )
 HOOK: %max-vector cpu ( dst src1 src2 rep -- )
+HOOK: %avg-vector cpu ( dst src1 src2 rep -- )
 HOOK: %dot-vector cpu ( dst src1 src2 rep -- )
+HOOK: %sad-vector cpu ( dst src1 src2 rep -- )
 HOOK: %sqrt-vector cpu ( dst src rep -- )
 HOOK: %horizontal-add-vector cpu ( dst src1 src2 rep -- )
 HOOK: %horizontal-sub-vector cpu ( dst src1 src2 rep -- )
@ -332,11 +336,15 @@ HOOK: %add-sub-vector-reps cpu ( -- reps )
 HOOK: %sub-vector-reps cpu ( -- reps )
 HOOK: %saturated-sub-vector-reps cpu ( -- reps )
 HOOK: %mul-vector-reps cpu ( -- reps )
+HOOK: %mul-high-vector-reps cpu ( -- reps )
+HOOK: %mul-horizontal-add-vector-reps cpu ( -- reps )
 HOOK: %saturated-mul-vector-reps cpu ( -- reps )
 HOOK: %div-vector-reps cpu ( -- reps )
 HOOK: %min-vector-reps cpu ( -- reps )
 HOOK: %max-vector-reps cpu ( -- reps )
+HOOK: %avg-vector-reps cpu ( -- reps )
 HOOK: %dot-vector-reps cpu ( -- reps )
+HOOK: %sad-vector-reps cpu ( -- reps )
 HOOK: %sqrt-vector-reps cpu ( -- reps )
 HOOK: %horizontal-add-vector-reps cpu ( -- reps )
 HOOK: %horizontal-sub-vector-reps cpu ( -- reps )
--- a/basis/cpu/x86/x86.factor
+++ b/basis/cpu/x86/x86.factor
@ -1106,6 +1106,33 @@ M: x86 %mul-vector-reps
        { sse4.1? { int-4-rep uint-4-rep } }
    } available-reps ;

+! x86 lowering of ##mul-high-vector: after two-operand has processed the
+! operands, the opcode is picked by rep -- PMULHW for short-8-rep,
+! PMULHUW for ushort-8-rep. No other rep has a branch.
+M: x86 %mul-high-vector ( dst src1 src2 rep -- )
+    [ two-operand ] keep
+    {
+        { short-8-rep  [ PMULHW ] }
+        { ushort-8-rep [ PMULHUW ] }
+    } case ;
+
+! Reps for which %mul-high-vector is offered; both are gated on sse2?.
+M: x86 %mul-high-vector-reps
+    {
+        { sse2? { short-8-rep ushort-8-rep } }
+    } available-reps ;
+
+! x86 lowering of ##mul-horizontal-add-vector: byte reps use PMADDUBSW,
+! 16-bit reps use PMADDWD.
+! NOTE(review): per the Intel manuals PMADDWD multiplies signed words and
+! PMADDUBSW multiplies unsigned bytes of one operand by signed bytes of
+! the other (with saturation), yet signed and unsigned reps share the
+! same opcode here -- confirm the results are intended for both.
+M: x86 %mul-horizontal-add-vector ( dst src1 src2 rep -- )
+    [ two-operand ] keep
+    {
+        { char-16-rep  [ PMADDUBSW ] }
+        { uchar-16-rep [ PMADDUBSW ] }
+        { short-8-rep  [ PMADDWD ] }
+        { ushort-8-rep [ PMADDWD ] }
+    } case ;
+
+! Reps for which %mul-horizontal-add-vector is offered: the 16-bit reps
+! are gated on sse2?, the byte reps (PMADDUBSW) on ssse3?.
+M: x86 %mul-horizontal-add-vector-reps
+    {
+        { sse2?  { short-8-rep ushort-8-rep } }
+        { ssse3? { char-16-rep uchar-16-rep } }
+    } available-reps ;
+
 M: x86 %div-vector ( dst src1 src2 rep -- )
    [ two-operand ] keep
    {
@ -1159,6 +1186,18 @@ M: x86 %max-vector-reps
        { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } }
    } available-reps ;

+! x86 lowering of ##avg-vector: PAVGB for uchar-16-rep, PAVGW for
+! ushort-8-rep. Only these two unsigned reps have a branch.
+M: x86 %avg-vector ( dst src1 src2 rep -- )
+    [ two-operand ] keep
+    {
+        { uchar-16-rep [ PAVGB ] }
+        { ushort-8-rep [ PAVGW ] }
+    } case ;
+
+! Reps for which %avg-vector is offered; both are gated on sse2?.
+M: x86 %avg-vector-reps
+    {
+        { sse2? { uchar-16-rep ushort-8-rep } }
+    } available-reps ;
+
 M: x86 %dot-vector
    [ two-operand ] keep
    {
@ -1171,6 +1210,18 @@ M: x86 %dot-vector-reps
        { sse4.1? { float-4-rep double-2-rep } }
    } available-reps ;

+! x86 lowering of ##sad-vector. PSADBW sums the absolute differences of
+! *unsigned* bytes, so it is only correct for uchar-16-rep; applying it
+! to signed char-16 data would treat negative bytes as large positive
+! values. char-16-rep is therefore deliberately not handled here.
+M: x86 %sad-vector
+    [ two-operand ] keep
+    {
+        { uchar-16-rep [ PSADBW ] }
+    } case ;
+
+! Only uchar-16-rep is advertised (gated on sse2?), matching the single
+! branch in %sad-vector above.
+M: x86 %sad-vector-reps
+    {
+        { sse2? { uchar-16-rep } }
+    } available-reps ;
+
 M: x86 %horizontal-add-vector ( dst src1 src2 rep -- )
    [ two-operand ] keep
    signed-rep {
--- a/basis/math/vectors/simd/simd.factor
+++ b/basis/math/vectors/simd/simd.factor
@ -168,7 +168,6 @@ M: A vs-               \ A-rep [ (simd-vs-)               ] [ call-next-method ]
 M: A vs*               \ A-rep [ (simd-vs*)               ] [ call-next-method ] vv->v-op ; inline
 M: A v*                \ A-rep [ (simd-v*)                ] [ call-next-method ] vv->v-op ; inline
 M: A v*high            \ A-rep [ (simd-v*high)            ] [ call-next-method ] vv->v-op ; inline
-M: A v*hs+             \ A-rep [ (simd-v*hs+)             ] [ call-next-method ] vv->v-op ; inline
 M: A v/                \ A-rep [ (simd-v/)                ] [ call-next-method ] vv->v-op ; inline
 M: A vavg              \ A-rep [ (simd-vavg)              ] [ call-next-method ] vv->v-op ; inline
 M: A vmin              \ A-rep [ (simd-vmin)              ] [ call-next-method ] vv->v-op ; inline
@ -273,6 +272,19 @@ SIMD-128: double-2
 M: simd-128 vshuffle ( u perm -- v )
    vshuffle-bytes ; inline

+! v*hs+ is specialized per integer vector class instead of through the
+! generic A template: each method passes its own rep, the (simd-v*hs+)
+! quotation and a call-next-method quotation to vv->v-op, then casts the
+! result to a different vector class (uchar-16 -> ushort-8,
+! ushort-8 -> uint-4, uint-4 -> ulonglong-2, and likewise for the
+! signed classes).
+M: uchar-16 v*hs+
+    uchar-16-rep [ (simd-v*hs+) ] [ call-next-method ] vv->v-op ushort-8-cast ; inline
+M: ushort-8 v*hs+
+    ushort-8-rep [ (simd-v*hs+) ] [ call-next-method ] vv->v-op uint-4-cast ; inline
+M: uint-4 v*hs+
+    uint-4-rep [ (simd-v*hs+) ] [ call-next-method ] vv->v-op ulonglong-2-cast ; inline
+M: char-16 v*hs+
+    char-16-rep [ (simd-v*hs+) ] [ call-next-method ] vv->v-op short-8-cast ; inline
+M: short-8 v*hs+
+    short-8-rep [ (simd-v*hs+) ] [ call-next-method ] vv->v-op int-4-cast ; inline
+M: int-4 v*hs+
+    int-4-rep [ (simd-v*hs+) ] [ call-next-method ] vv->v-op longlong-2-cast ; inline
+
 "mirrors" vocab [
    "math.vectors.simd.mirrors" require
 ] when