diff --git a/basis/compiler/cfg/build-stack-frame/build-stack-frame.factor b/basis/compiler/cfg/build-stack-frame/build-stack-frame.factor index 8b17db756d..a8a0fceed7 100644 --- a/basis/compiler/cfg/build-stack-frame/build-stack-frame.factor +++ b/basis/compiler/cfg/build-stack-frame/build-stack-frame.factor @@ -38,10 +38,10 @@ M: ##spill compute-stack-frame* drop frame-required ; M: ##reload compute-stack-frame* drop frame-required ; M: ##float>integer compute-stack-frame* - drop cpu ppc? [ frame-required ] when ; + drop integer-float-needs-stack-frame? [ frame-required ] when ; M: ##integer>float compute-stack-frame* - drop cpu ppc? [ frame-required ] when ; + drop integer-float-needs-stack-frame? [ frame-required ] when ; M: insn compute-stack-frame* drop ; diff --git a/basis/compiler/tests/float.factor b/basis/compiler/tests/float.factor index 9685870936..ea62795035 100644 --- a/basis/compiler/tests/float.factor +++ b/basis/compiler/tests/float.factor @@ -85,6 +85,9 @@ IN: compiler.tests.float [ t ] [ -0.0 [ dup 0.0 float= swap -0.0 float= or ] compile-call ] unit-test [ f ] [ 3.0 [ dup 0.0 float= swap -0.0 float= or ] compile-call ] unit-test +[ 313.0 ] [ 313 [ fixnum>float ] compile-call ] unit-test +[ -313 ] [ -313.5 [ float>fixnum ] compile-call ] unit-test +[ 313 ] [ 313.5 [ float>fixnum ] compile-call ] unit-test [ 315 315.0 ] [ 313 [ 2 fixnum+fast dup fixnum>float ] compile-call ] unit-test [ t ] [ 0/0. 0/0. [ float-unordered? ] compile-call ] unit-test diff --git a/basis/cpu/architecture/architecture.factor b/basis/cpu/architecture/architecture.factor index cc6079d060..bb9adbf5ce 100644 --- a/basis/cpu/architecture/architecture.factor +++ b/basis/cpu/architecture/architecture.factor @@ -296,6 +296,8 @@ HOOK: %binary-float-function cpu ( dst src1 src2 func -- ) HOOK: %single>double-float cpu ( dst src -- ) HOOK: %double>single-float cpu ( dst src -- ) +HOOK: integer-float-needs-stack-frame? cpu ( -- ? ) + HOOK: %integer>float cpu ( dst src -- ) HOOK: %float>integer cpu ( dst src -- ) diff --git a/basis/cpu/ppc/ppc.factor b/basis/cpu/ppc/ppc.factor index 89ec8f4efa..56ec02d851 100644 --- a/basis/cpu/ppc/ppc.factor +++ b/basis/cpu/ppc/ppc.factor @@ -190,6 +190,8 @@ M: ppc %sub-float FSUB ; M: ppc %mul-float FMUL ; M: ppc %div-float FDIV ; +M: ppc integer-float-needs-stack-frame? t ; + M:: ppc %integer>float ( dst src -- ) HEX: 4330 scratch-reg LIS scratch-reg 1 0 scratch@ STW diff --git a/basis/cpu/x86/32/32.factor b/basis/cpu/x86/32/32.factor index d914c85f9a..09a81a5bdc 100755 --- a/basis/cpu/x86/32/32.factor +++ b/basis/cpu/x86/32/32.factor @@ -7,15 +7,20 @@ words compiler.constants compiler.codegen.fixup compiler.cfg.instructions compiler.cfg.builder compiler.cfg.builder.alien.boxing compiler.cfg.intrinsics compiler.cfg.stack-frame cpu.x86.assembler -cpu.x86.assembler.operands cpu.x86 cpu.architecture vm ; +cpu.x86.assembler.operands cpu.x86 cpu.architecture vm vocabs ; FROM: layouts => cell ; IN: cpu.x86.32 +: x86-float-regs ( -- seq ) + "cpu.x86.sse" vocab + { XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 } + { ST0 ST1 ST2 ST3 ST4 ST5 ST6 } + ? ; + M: x86.32 machine-registers - { - { int-regs { EAX ECX EDX EBP EBX } } - { float-regs { XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 } } - } ; + { int-regs { EAX ECX EDX EBP EBX } } + float-regs x86-float-regs 2array + 2array ; M: x86.32 ds-reg ESI ; M: x86.32 rs-reg EDI ; @@ -94,7 +99,7 @@ M: x86.32 param-regs M: x86.32 return-regs { { int-regs { EAX EDX } } - { float-regs { f } } + { float-regs { ST0 } } } ; M: x86.32 %prologue ( n -- ) @@ -105,11 +110,11 @@ M: x86.32 %prologue ( n -- ) M: x86.32 %prepare-jump pic-tail-reg 0 MOV xt-tail-pic-offset rc-absolute-cell rel-here ; -:: load-float-return ( dst x87-insn sse-insn -- ) +:: load-float-return ( dst x87-insn rep -- ) dst register? [ ESP 4 SUB ESP [] x87-insn execute - dst ESP [] sse-insn execute + dst ESP [] rep %copy ESP 4 ADD ] [ dst ?spill-slot x87-insn execute @@ -118,14 +123,14 @@ M: x86.32 %prepare-jump M: x86.32 %load-reg-param ( dst reg rep -- ) { { int-rep [ int-rep %copy ] } - { float-rep [ drop \ FSTPS \ MOVSS load-float-return ] } - { double-rep [ drop \ FSTPL \ MOVSD load-float-return ] } + { float-rep [ drop \ FSTPS float-rep load-float-return ] } + { double-rep [ drop \ FSTPL double-rep load-float-return ] } } case ; -:: store-float-return ( src x87-insn sse-insn -- ) +:: store-float-return ( src x87-insn rep -- ) src register? [ ESP 4 SUB - ESP [] src sse-insn execute + ESP [] src rep %copy ESP [] x87-insn execute ESP 4 ADD ] [ @@ -135,8 +140,8 @@ M: x86.32 %load-reg-param ( dst reg rep -- ) M: x86.32 %store-reg-param ( src reg rep -- ) { { int-rep [ swap int-rep %copy ] } - { float-rep [ drop \ FLDS \ MOVSS store-float-return ] } - { double-rep [ drop \ FLDL \ MOVSD store-float-return ] } + { float-rep [ drop \ FLDS float-rep store-float-return ] } + { double-rep [ drop \ FLDL double-rep store-float-return ] } } case ; :: call-unbox-func ( src func -- ) diff --git a/basis/cpu/x86/assembler/assembler.factor b/basis/cpu/x86/assembler/assembler.factor index 4460643152..1cb8f67aa6 100644 --- a/basis/cpu/x86/assembler/assembler.factor +++ b/basis/cpu/x86/assembler/assembler.factor @@ -496,6 +496,8 @@ PRIVATE> : FILDQ ( src -- ) { BIN: 101 f HEX: DF } 1-operand ; : FISTPD ( dst -- ) { BIN: 011 f HEX: DB } 1-operand ; : FISTPQ ( dst -- ) { BIN: 111 f HEX: DF } 1-operand ; +: FISTTPD ( dst -- ) { BIN: 001 f HEX: DB } 1-operand ; +: FISTTPQ ( dst -- ) { BIN: 001 f HEX: DF } 1-operand ; : FLD ( dst src -- ) HEX: D9 0 x87-st0-op ; : FLD1 ( -- ) { HEX: D9 HEX: E8 } % ; diff --git a/basis/cpu/x86/assembler/operands/operands.factor b/basis/cpu/x86/assembler/operands/operands.factor index ffee62450d..dc9ee1ce4c 100644 --- a/basis/cpu/x86/assembler/operands/operands.factor +++ b/basis/cpu/x86/assembler/operands/operands.factor @@ -15,15 +15,16 @@ REGISTERS: 16 AX CX DX BX SP BP SI DI R8W R9W R10W R11W R12W R13W R14W R15W ; REGISTERS: 32 EAX ECX EDX EBX ESP EBP ESI EDI R8D R9D R10D R11D R12D R13D R14D R15D ; -REGISTERS: 64 -RAX RCX RDX RBX RSP RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 ; +REGISTERS: 64 RAX RCX RDX RBX RSP RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 ; REGISTERS: 128 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15 ; -REGISTERS: 80 -ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7 ; +REGISTERS: 80 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7 ; + +: shuffle-down ( STn -- STn+1 ) + "register" word-prop 1 + 80 registers get at nth ; PREDICATE: register < word "register" word-prop ; diff --git a/basis/cpu/x86/sse/authors.txt b/basis/cpu/x86/sse/authors.txt new file mode 100644 index 0000000000..580f882c8d --- /dev/null +++ b/basis/cpu/x86/sse/authors.txt @@ -0,0 +1,2 @@ +Slava Pestov +Joe Groff diff --git a/basis/cpu/x86/sse/sse.factor b/basis/cpu/x86/sse/sse.factor new file mode 100644 index 0000000000..a6e92ff0a0 --- /dev/null +++ b/basis/cpu/x86/sse/sse.factor @@ -0,0 +1,913 @@ +! Copyright (C) 2009, 2010 Joe Groff, Slava Pestov. +! See http://factorcode.org/license.txt for BSD license. +USING: alien.c-types arrays assocs combinators fry kernel locals +macros math math.vectors namespaces quotations sequences system +compiler.cfg.comparisons compiler.cfg.intrinsics +compiler.codegen.fixup cpu.architecture cpu.x86 +cpu.x86.assembler cpu.x86.assembler.operands cpu.x86.features ; +IN: cpu.x86.sse + +! Scalar floating point with SSE2 +M: x86 %load-float float-rep %load-vector ; +M: x86 %load-double double-rep %load-vector ; + +M: float-rep copy-register* drop MOVAPS ; +M: double-rep copy-register* drop MOVAPS ; + +M: float-rep copy-memory* drop MOVSS ; +M: double-rep copy-memory* drop MOVSD ; + +M: x86 %add-float double-rep two-operand ADDSD ; +M: x86 %sub-float double-rep two-operand SUBSD ; +M: x86 %mul-float double-rep two-operand MULSD ; +M: x86 %div-float double-rep two-operand DIVSD ; +M: x86 %min-float double-rep two-operand MINSD ; +M: x86 %max-float double-rep two-operand MAXSD ; +M: x86 %sqrt SQRTSD ; + +: %clear-unless-in-place ( dst src -- ) + over = [ drop ] [ dup XORPS ] if ; + +M: x86 %single>double-float [ %clear-unless-in-place ] [ CVTSS2SD ] 2bi ; +M: x86 %double>single-float [ %clear-unless-in-place ] [ CVTSD2SS ] 2bi ; + +M: x86 integer-float-needs-stack-frame? f ; +M: x86 %integer>float [ drop dup XORPS ] [ CVTSI2SD ] 2bi ; +M: x86 %float>integer CVTTSD2SI ; + +M: x86 %compare-float-ordered ( dst src1 src2 cc temp -- ) + [ COMISD ] (%compare-float) ; + +M: x86 %compare-float-unordered ( dst src1 src2 cc temp -- ) + [ UCOMISD ] (%compare-float) ; + +M: x86 %compare-float-ordered-branch ( label src1 src2 cc -- ) + [ COMISD ] (%compare-float-branch) ; + +M: x86 %compare-float-unordered-branch ( label src1 src2 cc -- ) + [ UCOMISD ] (%compare-float-branch) ; + +! SIMD +M: float-4-rep copy-register* drop MOVAPS ; +M: double-2-rep copy-register* drop MOVAPS ; +M: vector-rep copy-register* drop MOVDQA ; + +MACRO: available-reps ( alist -- ) + ! Each SSE version adds new representations and supports + ! all old ones + unzip { } [ append ] accumulate rest swap suffix + [ [ 1quotation ] map ] bi@ zip + reverse [ { } ] suffix + '[ _ cond ] ; + +M: x86 %alien-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %zero-vector + { + { double-2-rep [ dup XORPS ] } + { float-4-rep [ dup XORPS ] } + [ drop dup PXOR ] + } case ; + +M: x86 %zero-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %fill-vector + { + { double-2-rep [ dup [ XORPS ] [ CMPEQPS ] 2bi ] } + { float-4-rep [ dup [ XORPS ] [ CMPEQPS ] 2bi ] } + [ drop dup PCMPEQB ] + } case ; + +M: x86 %fill-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- ) + rep signed-rep { + { float-4-rep [ + dst src1 float-4-rep %copy + dst src2 UNPCKLPS + src3 src4 UNPCKLPS + dst src3 MOVLHPS + ] } + { int-4-rep [ + dst src1 int-4-rep %copy + dst src2 PUNPCKLDQ + src3 src4 PUNPCKLDQ + dst src3 PUNPCKLQDQ + ] } + } case ; + +M: x86 %gather-vector-4-reps + { + ! Can't do this with sse1 since it will want to unbox + ! double-precision floats and convert to single precision + { sse2? { float-4-rep int-4-rep uint-4-rep } } + } available-reps ; + +M:: x86 %gather-int-vector-4 ( dst src1 src2 src3 src4 rep -- ) + dst rep %zero-vector + dst src1 32-bit-version-of 0 PINSRD + dst src2 32-bit-version-of 1 PINSRD + dst src3 32-bit-version-of 2 PINSRD + dst src4 32-bit-version-of 3 PINSRD ; + +M: x86 %gather-int-vector-4-reps + { + { sse4.1? { int-4-rep uint-4-rep } } + } available-reps ; + +M:: x86 %gather-vector-2 ( dst src1 src2 rep -- ) + rep signed-rep { + { double-2-rep [ + dst src1 double-2-rep %copy + dst src2 MOVLHPS + ] } + { longlong-2-rep [ + dst src1 longlong-2-rep %copy + dst src2 PUNPCKLQDQ + ] } + } case ; + +M: x86 %gather-vector-2-reps + { + { sse2? { double-2-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M:: x86.64 %gather-int-vector-2 ( dst src1 src2 rep -- ) + dst rep %zero-vector + dst src1 0 PINSRQ + dst src2 1 PINSRQ ; + +M: x86.64 %gather-int-vector-2-reps + { + { sse4.1? { longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +:: %select-vector-32 ( dst src n rep -- ) + rep { + { char-16-rep [ + dst 32-bit-version-of src n PEXTRB + dst dst 8-bit-version-of MOVSX + ] } + { uchar-16-rep [ + dst 32-bit-version-of src n PEXTRB + ] } + { short-8-rep [ + dst 32-bit-version-of src n PEXTRW + dst dst 16-bit-version-of MOVSX + ] } + { ushort-8-rep [ + dst 32-bit-version-of src n PEXTRW + ] } + { int-4-rep [ + dst 32-bit-version-of src n PEXTRD + dst dst 32-bit-version-of 2dup = [ 2drop ] [ MOVSX ] if + ] } + { uint-4-rep [ + dst 32-bit-version-of src n PEXTRD + ] } + } case ; + +M: x86.32 %select-vector + %select-vector-32 ; + +M: x86.32 %select-vector-reps + { + { sse4.1? { uchar-16-rep char-16-rep ushort-8-rep short-8-rep uint-4-rep int-4-rep } } + } available-reps ; + +M: x86.64 %select-vector + { + { longlong-2-rep [ PEXTRQ ] } + { ulonglong-2-rep [ PEXTRQ ] } + [ %select-vector-32 ] + } case ; + +M: x86.64 %select-vector-reps + { + { sse4.1? { uchar-16-rep char-16-rep ushort-8-rep short-8-rep uint-4-rep int-4-rep ulonglong-2-rep longlong-2-rep } } + } available-reps ; + +: sse1-float-4-shuffle ( dst shuffle -- ) + { + { { 0 1 2 3 } [ drop ] } + { { 0 1 0 1 } [ dup MOVLHPS ] } + { { 2 3 2 3 } [ dup MOVHLPS ] } + { { 0 0 1 1 } [ dup UNPCKLPS ] } + { { 2 2 3 3 } [ dup UNPCKHPS ] } + [ dupd SHUFPS ] + } case ; + +: float-4-shuffle ( dst shuffle -- ) + sse3? [ + { + { { 0 0 2 2 } [ dup MOVSLDUP ] } + { { 1 1 3 3 } [ dup MOVSHDUP ] } + [ sse1-float-4-shuffle ] + } case + ] [ sse1-float-4-shuffle ] if ; + +: int-4-shuffle ( dst shuffle -- ) + { + { { 0 1 2 3 } [ drop ] } + { { 0 0 1 1 } [ dup PUNPCKLDQ ] } + { { 2 2 3 3 } [ dup PUNPCKHDQ ] } + { { 0 1 0 1 } [ dup PUNPCKLQDQ ] } + { { 2 3 2 3 } [ dup PUNPCKHQDQ ] } + [ dupd PSHUFD ] + } case ; + +: longlong-2-shuffle ( dst shuffle -- ) + first2 [ 2 * dup 1 + ] bi@ 4array int-4-shuffle ; + +: >float-4-shuffle ( double-2-shuffle -- float-4-shuffle ) + [ 2 * { 0 1 } n+v ] map concat ; + +M:: x86 %shuffle-vector-imm ( dst src shuffle rep -- ) + dst src rep %copy + dst shuffle rep signed-rep { + { double-2-rep [ >float-4-shuffle float-4-shuffle ] } + { float-4-rep [ float-4-shuffle ] } + { int-4-rep [ int-4-shuffle ] } + { longlong-2-rep [ longlong-2-shuffle ] } + } case ; + +M: x86 %shuffle-vector-imm-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M:: x86 %shuffle-vector-halves-imm ( dst src1 src2 shuffle rep -- ) + dst src1 src2 rep two-operand + shuffle rep { + { double-2-rep [ >float-4-shuffle SHUFPS ] } + { float-4-rep [ SHUFPS ] } + } case ; + +M: x86 %shuffle-vector-halves-imm-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep } } + } available-reps ; + +M: x86 %shuffle-vector ( dst src shuffle rep -- ) + two-operand PSHUFB ; + +M: x86 %shuffle-vector-reps + { + { ssse3? { float-4-rep double-2-rep longlong-2-rep ulonglong-2-rep int-4-rep uint-4-rep short-8-rep ushort-8-rep char-16-rep uchar-16-rep } } + } available-reps ; + +M: x86 %merge-vector-head + [ two-operand ] keep + signed-rep { + { double-2-rep [ MOVLHPS ] } + { float-4-rep [ UNPCKLPS ] } + { longlong-2-rep [ PUNPCKLQDQ ] } + { int-4-rep [ PUNPCKLDQ ] } + { short-8-rep [ PUNPCKLWD ] } + { char-16-rep [ PUNPCKLBW ] } + } case ; + +M: x86 %merge-vector-tail + [ two-operand ] keep + signed-rep { + { double-2-rep [ UNPCKHPD ] } + { float-4-rep [ UNPCKHPS ] } + { longlong-2-rep [ PUNPCKHQDQ ] } + { int-4-rep [ PUNPCKHDQ ] } + { short-8-rep [ PUNPCKHWD ] } + { char-16-rep [ PUNPCKHBW ] } + } case ; + +M: x86 %merge-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %signed-pack-vector + [ two-operand ] keep + { + { int-4-rep [ PACKSSDW ] } + { short-8-rep [ PACKSSWB ] } + } case ; + +M: x86 %signed-pack-vector-reps + { + { sse2? { short-8-rep int-4-rep } } + } available-reps ; + +M: x86 %unsigned-pack-vector + [ two-operand ] keep + signed-rep { + { int-4-rep [ PACKUSDW ] } + { short-8-rep [ PACKUSWB ] } + } case ; + +M: x86 %unsigned-pack-vector-reps + { + { sse2? { short-8-rep } } + { sse4.1? { int-4-rep } } + } available-reps ; + +M: x86 %tail>head-vector ( dst src rep -- ) + dup { + { float-4-rep [ drop UNPCKHPD ] } + { double-2-rep [ drop UNPCKHPD ] } + [ drop [ %copy ] [ drop PUNPCKHQDQ ] 3bi ] + } case ; + +M: x86 %unpack-vector-head ( dst src rep -- ) + { + { char-16-rep [ PMOVSXBW ] } + { uchar-16-rep [ PMOVZXBW ] } + { short-8-rep [ PMOVSXWD ] } + { ushort-8-rep [ PMOVZXWD ] } + { int-4-rep [ PMOVSXDQ ] } + { uint-4-rep [ PMOVZXDQ ] } + { float-4-rep [ CVTPS2PD ] } + } case ; + +M: x86 %unpack-vector-head-reps ( -- reps ) + { + { sse2? { float-4-rep } } + { sse4.1? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } } + } available-reps ; + +M: x86 %integer>float-vector ( dst src rep -- ) + { + { int-4-rep [ CVTDQ2PS ] } + } case ; + +M: x86 %integer>float-vector-reps + { + { sse2? { int-4-rep } } + } available-reps ; + +M: x86 %float>integer-vector ( dst src rep -- ) + { + { float-4-rep [ CVTTPS2DQ ] } + } case ; + +M: x86 %float>integer-vector-reps + { + { sse2? { float-4-rep } } + } available-reps ; + +: (%compare-float-vector) ( dst src rep double single -- ) + [ double-2-rep eq? ] 2dip if ; inline + +: %compare-float-vector ( dst src rep cc -- ) + { + { cc< [ [ CMPLTPD ] [ CMPLTPS ] (%compare-float-vector) ] } + { cc<= [ [ CMPLEPD ] [ CMPLEPS ] (%compare-float-vector) ] } + { cc= [ [ CMPEQPD ] [ CMPEQPS ] (%compare-float-vector) ] } + { cc<>= [ [ CMPORDPD ] [ CMPORDPS ] (%compare-float-vector) ] } + { cc/< [ [ CMPNLTPD ] [ CMPNLTPS ] (%compare-float-vector) ] } + { cc/<= [ [ CMPNLEPD ] [ CMPNLEPS ] (%compare-float-vector) ] } + { cc/= [ [ CMPNEQPD ] [ CMPNEQPS ] (%compare-float-vector) ] } + { cc/<>= [ [ CMPUNORDPD ] [ CMPUNORDPS ] (%compare-float-vector) ] } + } case ; + +:: (%compare-int-vector) ( dst src rep int64 int32 int16 int8 -- ) + rep signed-rep :> rep' + dst src rep' { + { longlong-2-rep [ int64 call ] } + { int-4-rep [ int32 call ] } + { short-8-rep [ int16 call ] } + { char-16-rep [ int8 call ] } + } case ; inline + +: %compare-int-vector ( dst src rep cc -- ) + { + { cc= [ [ PCMPEQQ ] [ PCMPEQD ] [ PCMPEQW ] [ PCMPEQB ] (%compare-int-vector) ] } + { cc> [ [ PCMPGTQ ] [ PCMPGTD ] [ PCMPGTW ] [ PCMPGTB ] (%compare-int-vector) ] } + } case ; + +M: x86 %compare-vector ( dst src1 src2 rep cc -- ) + [ [ two-operand ] keep ] dip + over float-vector-rep? + [ %compare-float-vector ] + [ %compare-int-vector ] if ; + +: %compare-vector-eq-reps ( -- reps ) + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } } + { sse4.1? { longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +: %compare-vector-ord-reps ( -- reps ) + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep short-8-rep int-4-rep } } + { sse4.2? { longlong-2-rep } } + } available-reps ; + +M: x86 %compare-vector-reps + { + { [ dup { cc= cc/= cc/<>= cc<>= } member-eq? ] [ drop %compare-vector-eq-reps ] } + [ drop %compare-vector-ord-reps ] + } cond ; + +: %compare-float-vector-ccs ( cc -- ccs not? ) + { + { cc< [ { { cc< f } } f ] } + { cc<= [ { { cc<= f } } f ] } + { cc> [ { { cc< t } } f ] } + { cc>= [ { { cc<= t } } f ] } + { cc= [ { { cc= f } } f ] } + { cc<> [ { { cc< f } { cc< t } } f ] } + { cc<>= [ { { cc<>= f } } f ] } + { cc/< [ { { cc/< f } } f ] } + { cc/<= [ { { cc/<= f } } f ] } + { cc/> [ { { cc/< t } } f ] } + { cc/>= [ { { cc/<= t } } f ] } + { cc/= [ { { cc/= f } } f ] } + { cc/<> [ { { cc/= f } { cc/<>= f } } f ] } + { cc/<>= [ { { cc/<>= f } } f ] } + } case ; + +: %compare-int-vector-ccs ( cc -- ccs not? ) + order-cc { + { cc< [ { { cc> t } } f ] } + { cc<= [ { { cc> f } } t ] } + { cc> [ { { cc> f } } f ] } + { cc>= [ { { cc> t } } t ] } + { cc= [ { { cc= f } } f ] } + { cc/= [ { { cc= f } } t ] } + { t [ { } t ] } + { f [ { } f ] } + } case ; + +M: x86 %compare-vector-ccs + swap float-vector-rep? + [ %compare-float-vector-ccs ] + [ %compare-int-vector-ccs ] if ; + +:: %test-vector-mask ( dst temp mask vcc -- ) + vcc { + { vcc-any [ dst dst TEST dst temp \ CMOVNE (%boolean) ] } + { vcc-none [ dst dst TEST dst temp \ CMOVE (%boolean) ] } + { vcc-all [ dst mask CMP dst temp \ CMOVE (%boolean) ] } + { vcc-notall [ dst mask CMP dst temp \ CMOVNE (%boolean) ] } + } case ; + +: %move-vector-mask ( dst src rep -- mask ) + { + { double-2-rep [ MOVMSKPS HEX: f ] } + { float-4-rep [ MOVMSKPS HEX: f ] } + [ drop PMOVMSKB HEX: ffff ] + } case ; + +M:: x86 %test-vector ( dst src temp rep vcc -- ) + dst src rep %move-vector-mask :> mask + dst temp mask vcc %test-vector-mask ; + +:: %test-vector-mask-branch ( label temp mask vcc -- ) + vcc { + { vcc-any [ temp temp TEST label JNE ] } + { vcc-none [ temp temp TEST label JE ] } + { vcc-all [ temp mask CMP label JE ] } + { vcc-notall [ temp mask CMP label JNE ] } + } case ; + +M:: x86 %test-vector-branch ( label src temp rep vcc -- ) + temp src rep %move-vector-mask :> mask + label temp mask vcc %test-vector-mask-branch ; + +M: x86 %test-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %add-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { float-4-rep [ ADDPS ] } + { double-2-rep [ ADDPD ] } + { char-16-rep [ PADDB ] } + { uchar-16-rep [ PADDB ] } + { short-8-rep [ PADDW ] } + { ushort-8-rep [ PADDW ] } + { int-4-rep [ PADDD ] } + { uint-4-rep [ PADDD ] } + { longlong-2-rep [ PADDQ ] } + { ulonglong-2-rep [ PADDQ ] } + } case ; + +M: x86 %add-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %saturated-add-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { char-16-rep [ PADDSB ] } + { uchar-16-rep [ PADDUSB ] } + { short-8-rep [ PADDSW ] } + { ushort-8-rep [ PADDUSW ] } + } case ; + +M: x86 %saturated-add-vector-reps + { + { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } } + } available-reps ; + +M: x86 %add-sub-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { float-4-rep [ ADDSUBPS ] } + { double-2-rep [ ADDSUBPD ] } + } case ; + +M: x86 %add-sub-vector-reps + { + { sse3? { float-4-rep double-2-rep } } + } available-reps ; + +M: x86 %sub-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { float-4-rep [ SUBPS ] } + { double-2-rep [ SUBPD ] } + { char-16-rep [ PSUBB ] } + { uchar-16-rep [ PSUBB ] } + { short-8-rep [ PSUBW ] } + { ushort-8-rep [ PSUBW ] } + { int-4-rep [ PSUBD ] } + { uint-4-rep [ PSUBD ] } + { longlong-2-rep [ PSUBQ ] } + { ulonglong-2-rep [ PSUBQ ] } + } case ; + +M: x86 %sub-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %saturated-sub-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { char-16-rep [ PSUBSB ] } + { uchar-16-rep [ PSUBUSB ] } + { short-8-rep [ PSUBSW ] } + { ushort-8-rep [ PSUBUSW ] } + } case ; + +M: x86 %saturated-sub-vector-reps + { + { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } } + } available-reps ; + +M: x86 %mul-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { float-4-rep [ MULPS ] } + { double-2-rep [ MULPD ] } + { short-8-rep [ PMULLW ] } + { ushort-8-rep [ PMULLW ] } + { int-4-rep [ PMULLD ] } + { uint-4-rep [ PMULLD ] } + } case ; + +M: x86 %mul-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep short-8-rep ushort-8-rep } } + { sse4.1? { int-4-rep uint-4-rep } } + } available-reps ; + +M: x86 %mul-high-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { short-8-rep [ PMULHW ] } + { ushort-8-rep [ PMULHUW ] } + } case ; + +M: x86 %mul-high-vector-reps + { + { sse2? { short-8-rep ushort-8-rep } } + } available-reps ; + +M: x86 %mul-horizontal-add-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { char-16-rep [ PMADDUBSW ] } + { uchar-16-rep [ PMADDUBSW ] } + { short-8-rep [ PMADDWD ] } + } case ; + +M: x86 %mul-horizontal-add-vector-reps + { + { sse2? { short-8-rep } } + { ssse3? { char-16-rep uchar-16-rep } } + } available-reps ; + +M: x86 %div-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { float-4-rep [ DIVPS ] } + { double-2-rep [ DIVPD ] } + } case ; + +M: x86 %div-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep } } + } available-reps ; + +M: x86 %min-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { char-16-rep [ PMINSB ] } + { uchar-16-rep [ PMINUB ] } + { short-8-rep [ PMINSW ] } + { ushort-8-rep [ PMINUW ] } + { int-4-rep [ PMINSD ] } + { uint-4-rep [ PMINUD ] } + { float-4-rep [ MINPS ] } + { double-2-rep [ MINPD ] } + } case ; + +M: x86 %min-vector-reps + { + { sse? { float-4-rep } } + { sse2? { uchar-16-rep short-8-rep double-2-rep } } + { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } } + } available-reps ; + +M: x86 %max-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { char-16-rep [ PMAXSB ] } + { uchar-16-rep [ PMAXUB ] } + { short-8-rep [ PMAXSW ] } + { ushort-8-rep [ PMAXUW ] } + { int-4-rep [ PMAXSD ] } + { uint-4-rep [ PMAXUD ] } + { float-4-rep [ MAXPS ] } + { double-2-rep [ MAXPD ] } + } case ; + +M: x86 %max-vector-reps + { + { sse? { float-4-rep } } + { sse2? { uchar-16-rep short-8-rep double-2-rep } } + { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } } + } available-reps ; + +M: x86 %avg-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { uchar-16-rep [ PAVGB ] } + { ushort-8-rep [ PAVGW ] } + } case ; + +M: x86 %avg-vector-reps + { + { sse2? { uchar-16-rep ushort-8-rep } } + } available-reps ; + +M: x86 %dot-vector + [ two-operand ] keep + { + { float-4-rep [ HEX: ff DPPS ] } + { double-2-rep [ HEX: ff DPPD ] } + } case ; + +M: x86 %dot-vector-reps + { + { sse4.1? { float-4-rep double-2-rep } } + } available-reps ; + +M: x86 %sad-vector + [ two-operand ] keep + { + { uchar-16-rep [ PSADBW ] } + } case ; + +M: x86 %sad-vector-reps + { + { sse2? { uchar-16-rep } } + } available-reps ; + +M: x86 %horizontal-add-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + signed-rep { + { float-4-rep [ HADDPS ] } + { double-2-rep [ HADDPD ] } + { int-4-rep [ PHADDD ] } + { short-8-rep [ PHADDW ] } + } case ; + +M: x86 %horizontal-add-vector-reps + { + { sse3? { float-4-rep double-2-rep } } + { ssse3? { int-4-rep uint-4-rep short-8-rep ushort-8-rep } } + } available-reps ; + +M: x86 %horizontal-shl-vector-imm ( dst src1 src2 rep -- ) + two-operand PSLLDQ ; + +M: x86 %horizontal-shl-vector-imm-reps + { + { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep float-4-rep double-2-rep } } + } available-reps ; + +M: x86 %horizontal-shr-vector-imm ( dst src1 src2 rep -- ) + two-operand PSRLDQ ; + +M: x86 %horizontal-shr-vector-imm-reps + { + { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep float-4-rep double-2-rep } } + } available-reps ; + +M: x86 %abs-vector ( dst src rep -- ) + { + { char-16-rep [ PABSB ] } + { short-8-rep [ PABSW ] } + { int-4-rep [ PABSD ] } + } case ; + +M: x86 %abs-vector-reps + { + { ssse3? { char-16-rep short-8-rep int-4-rep } } + } available-reps ; + +M: x86 %sqrt-vector ( dst src rep -- ) + { + { float-4-rep [ SQRTPS ] } + { double-2-rep [ SQRTPD ] } + } case ; + +M: x86 %sqrt-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep } } + } available-reps ; + +M: x86 %and-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { float-4-rep [ ANDPS ] } + { double-2-rep [ ANDPS ] } + [ drop PAND ] + } case ; + +M: x86 %and-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %andn-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { float-4-rep [ ANDNPS ] } + { double-2-rep [ ANDNPS ] } + [ drop PANDN ] + } case ; + +M: x86 %andn-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %or-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { float-4-rep [ ORPS ] } + { double-2-rep [ ORPS ] } + [ drop POR ] + } case ; + +M: x86 %or-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %xor-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { float-4-rep [ XORPS ] } + { double-2-rep [ XORPS ] } + [ drop PXOR ] + } case ; + +M: x86 %xor-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %shl-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { short-8-rep [ PSLLW ] } + { ushort-8-rep [ PSLLW ] } + { int-4-rep [ PSLLD ] } + { uint-4-rep [ PSLLD ] } + { longlong-2-rep [ PSLLQ ] } + { ulonglong-2-rep [ PSLLQ ] } + } case ; + +M: x86 %shl-vector-reps + { + { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %shr-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { short-8-rep [ PSRAW ] } + { ushort-8-rep [ PSRLW ] } + { int-4-rep [ PSRAD ] } + { uint-4-rep [ PSRLD ] } + { ulonglong-2-rep [ PSRLQ ] } + } case ; + +M: x86 %shr-vector-reps + { + { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %shl-vector-imm %shl-vector ; +M: x86 %shl-vector-imm-reps %shl-vector-reps ; +M: x86 %shr-vector-imm %shr-vector ; +M: x86 %shr-vector-imm-reps %shr-vector-reps ; + +: scalar-sized-reg ( reg rep -- reg' ) + rep-size 8 * n-bit-version-of ; + +M: x86 %integer>scalar drop MOVD ; + +:: %scalar>integer-32 ( dst src rep -- ) + rep { + { int-scalar-rep [ + dst 32-bit-version-of src MOVD + dst dst 32-bit-version-of + 2dup eq? [ 2drop ] [ MOVSX ] if + ] } + { uint-scalar-rep [ + dst 32-bit-version-of src MOVD + ] } + { short-scalar-rep [ + dst 32-bit-version-of src MOVD + dst dst 16-bit-version-of MOVSX + ] } + { ushort-scalar-rep [ + dst 32-bit-version-of src MOVD + dst dst 16-bit-version-of MOVZX + ] } + { char-scalar-rep [ + dst 32-bit-version-of src MOVD + dst { } 8 [| tmp-dst | + tmp-dst dst int-rep %copy + tmp-dst tmp-dst 8-bit-version-of MOVSX + dst tmp-dst int-rep %copy + ] with-small-register + ] } + { uchar-scalar-rep [ + dst 32-bit-version-of src MOVD + dst { } 8 [| tmp-dst | + tmp-dst dst int-rep %copy + tmp-dst tmp-dst 8-bit-version-of MOVZX + dst tmp-dst int-rep %copy + ] with-small-register + ] } + } case ; + +M: x86.32 %scalar>integer ( dst src rep -- ) %scalar>integer-32 ; + +M: x86.64 %scalar>integer ( dst src rep -- ) + { + { longlong-scalar-rep [ MOVD ] } + { ulonglong-scalar-rep [ MOVD ] } + [ %scalar>integer-32 ] + } case ; + +M: x86 %vector>scalar %copy ; + +M: x86 %scalar>vector %copy ; + +enable-float-min/max diff --git a/basis/cpu/x86/sse/tags.txt b/basis/cpu/x86/sse/tags.txt new file mode 100644 index 0000000000..ebb74b4d5f --- /dev/null +++ b/basis/cpu/x86/sse/tags.txt @@ -0,0 +1 @@ +not loaded diff --git a/basis/cpu/x86/x86.factor b/basis/cpu/x86/x86.factor index 949a0104af..205b3fd65f 100644 --- a/basis/cpu/x86/x86.factor +++ b/basis/cpu/x86/x86.factor @@ -6,7 +6,7 @@ cpu.x86.features cpu.x86.features.private cpu.architecture kernel kernel.private math memory namespaces make sequences words system layouts combinators math.order math.vectors fry locals compiler.constants byte-arrays io macros quotations classes.algebra compiler -compiler.units init vm +compiler.units init vm vocabs.loader compiler.cfg.registers compiler.cfg.instructions compiler.cfg.intrinsics @@ -69,12 +69,6 @@ M: x86 %load-reference [ \ f type-number MOV ] if* ; -M: x86 %load-float ( dst val -- ) - float-rep %load-vector ; - -M: x86 %load-double ( dst val -- ) - double-rep %load-vector ; - HOOK: ds-reg cpu ( -- reg ) HOOK: rs-reg cpu ( -- reg ) @@ -165,15 +159,8 @@ GENERIC: copy-memory* ( dst src rep -- ) M: int-rep copy-register* drop MOV ; M: tagged-rep copy-register* drop MOV ; -M: float-rep copy-register* drop MOVAPS ; -M: double-rep copy-register* drop MOVAPS ; -M: float-4-rep copy-register* drop MOVAPS ; -M: double-2-rep copy-register* drop MOVAPS ; -M: vector-rep copy-register* drop MOVDQA ; M: object copy-memory* copy-register* ; -M: float-rep copy-memory* drop MOVSS ; -M: double-rep copy-memory* drop MOVSD ; : ?spill-slot ( obj -- obj ) dup spill-slot? [ n>> spill@ ] when ; @@ -576,961 +563,6 @@ M:: x86 %compare-imm-branch ( label src1 src2 cc -- ) src1 src2 (%compare-imm) label cc %branch ; -M: x86 %add-float double-rep two-operand ADDSD ; -M: x86 %sub-float double-rep two-operand SUBSD ; -M: x86 %mul-float double-rep two-operand MULSD ; -M: x86 %div-float double-rep two-operand DIVSD ; -M: x86 %min-float double-rep two-operand MINSD ; -M: x86 %max-float double-rep two-operand MAXSD ; -M: x86 %sqrt SQRTSD ; - -: %clear-unless-in-place ( dst src -- ) - over = [ drop ] [ dup XORPS ] if ; - -M: x86 %single>double-float [ %clear-unless-in-place ] [ CVTSS2SD ] 2bi ; -M: x86 %double>single-float [ %clear-unless-in-place ] [ CVTSD2SS ] 2bi ; - -M: x86 %integer>float [ drop dup XORPS ] [ CVTSI2SD ] 2bi ; -M: x86 %float>integer CVTTSD2SI ; - -: %cmov-float= ( dst src -- ) - [ - "no-move" define-label - - "no-move" get [ JNE ] [ JP ] bi - MOV - "no-move" resolve-label - ] with-scope ; - -: %cmov-float/= ( dst src -- ) - [ - "no-move" define-label - "move" define-label - - "move" get JP - "no-move" get JE - "move" resolve-label - MOV - "no-move" resolve-label - ] with-scope ; - -:: (%compare-float) ( dst src1 src2 cc temp compare -- ) - cc { - { cc< [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVA (%boolean) ] } - { cc<= [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVAE (%boolean) ] } - { cc> [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVA (%boolean) ] } - { cc>= [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVAE (%boolean) ] } - { cc= [ src1 src2 \ compare execute( a b -- ) dst temp \ %cmov-float= (%boolean) ] } - { cc<> [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVNE (%boolean) ] } - { cc<>= [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVNP (%boolean) ] } - { cc/< [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVBE (%boolean) ] } - { cc/<= [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVB (%boolean) ] } - { cc/> [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVBE (%boolean) ] } - { cc/>= [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVB (%boolean) ] } - { cc/= [ src1 src2 \ compare execute( a b -- ) dst temp \ %cmov-float/= (%boolean) ] } - { cc/<> [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVE (%boolean) ] } - { cc/<>= [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVP (%boolean) ] } - } case ; inline - -M: x86 %compare-float-ordered ( dst src1 src2 cc temp -- ) - \ COMISD (%compare-float) ; - -M: x86 %compare-float-unordered ( dst src1 src2 cc temp -- ) - \ UCOMISD (%compare-float) ; - -: %jump-float= ( label -- ) - [ - "no-jump" define-label - "no-jump" get JP - JE - "no-jump" resolve-label - ] with-scope ; - -: %jump-float/= ( label -- ) - [ JNE ] [ JP ] bi ; - -:: (%compare-float-branch) ( label src1 src2 cc compare -- ) - cc { - { cc< [ src2 src1 \ compare execute( a b -- ) label JA ] } - { cc<= [ src2 src1 \ compare execute( a b -- ) label JAE ] } - { cc> [ src1 src2 \ compare execute( a b -- ) label JA ] } - { cc>= [ src1 src2 \ compare execute( a b -- ) label JAE ] } - { cc= [ src1 src2 \ compare execute( a b -- ) label %jump-float= ] } - { cc<> [ src1 src2 \ compare execute( a b -- ) label JNE ] } - { cc<>= [ src1 src2 \ compare execute( a b -- ) label JNP ] } - { cc/< [ src2 src1 \ compare execute( a b -- ) label JBE ] } - { cc/<= [ src2 src1 \ compare execute( a b -- ) label JB ] } - { cc/> [ src1 src2 \ compare execute( a b -- ) label JBE ] } - { cc/>= [ src1 src2 \ compare execute( a b -- ) label JB ] } - { cc/= [ src1 src2 \ compare execute( a b -- ) label %jump-float/= ] } - { cc/<> [ src1 src2 \ compare execute( a b -- ) label JE ] } - { cc/<>= [ src1 src2 \ compare execute( a b -- ) label JP ] } - } case ; - -M: x86 %compare-float-ordered-branch ( label src1 src2 cc -- ) - \ COMISD (%compare-float-branch) ; - -M: x86 %compare-float-unordered-branch ( label src1 src2 cc -- ) - \ UCOMISD (%compare-float-branch) ; - -MACRO: available-reps ( alist -- ) - ! Each SSE version adds new representations and supports - ! all old ones - unzip { } [ append ] accumulate rest swap suffix - [ [ 1quotation ] map ] bi@ zip - reverse [ { } ] suffix - '[ _ cond ] ; - -M: x86 %alien-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %zero-vector - { - { double-2-rep [ dup XORPS ] } - { float-4-rep [ dup XORPS ] } - [ drop dup PXOR ] - } case ; - -M: x86 %zero-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %fill-vector - { - { double-2-rep [ dup [ XORPS ] [ CMPEQPS ] 2bi ] } - { float-4-rep [ dup [ XORPS ] [ CMPEQPS ] 2bi ] } - [ drop dup PCMPEQB ] - } case ; - -M: x86 %fill-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- ) - rep signed-rep { - { float-4-rep [ - dst src1 float-4-rep %copy - dst src2 UNPCKLPS - src3 src4 UNPCKLPS - dst src3 MOVLHPS - ] } - { int-4-rep [ - dst src1 int-4-rep %copy - dst src2 PUNPCKLDQ - src3 src4 PUNPCKLDQ - dst src3 PUNPCKLQDQ - ] } - } case ; - -M: x86 %gather-vector-4-reps - { - ! Can't do this with sse1 since it will want to unbox - ! double-precision floats and convert to single precision - { sse2? { float-4-rep int-4-rep uint-4-rep } } - } available-reps ; - -M:: x86 %gather-int-vector-4 ( dst src1 src2 src3 src4 rep -- ) - dst rep %zero-vector - dst src1 32-bit-version-of 0 PINSRD - dst src2 32-bit-version-of 1 PINSRD - dst src3 32-bit-version-of 2 PINSRD - dst src4 32-bit-version-of 3 PINSRD ; - -M: x86 %gather-int-vector-4-reps - { - { sse4.1? { int-4-rep uint-4-rep } } - } available-reps ; - -M:: x86 %gather-vector-2 ( dst src1 src2 rep -- ) - rep signed-rep { - { double-2-rep [ - dst src1 double-2-rep %copy - dst src2 MOVLHPS - ] } - { longlong-2-rep [ - dst src1 longlong-2-rep %copy - dst src2 PUNPCKLQDQ - ] } - } case ; - -M: x86 %gather-vector-2-reps - { - { sse2? { double-2-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M:: x86.64 %gather-int-vector-2 ( dst src1 src2 rep -- ) - dst rep %zero-vector - dst src1 0 PINSRQ - dst src2 1 PINSRQ ; - -M: x86.64 %gather-int-vector-2-reps - { - { sse4.1? { longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -:: %select-vector-32 ( dst src n rep -- ) - rep { - { char-16-rep [ - dst 32-bit-version-of src n PEXTRB - dst dst 8-bit-version-of MOVSX - ] } - { uchar-16-rep [ - dst 32-bit-version-of src n PEXTRB - ] } - { short-8-rep [ - dst 32-bit-version-of src n PEXTRW - dst dst 16-bit-version-of MOVSX - ] } - { ushort-8-rep [ - dst 32-bit-version-of src n PEXTRW - ] } - { int-4-rep [ - dst 32-bit-version-of src n PEXTRD - dst dst 32-bit-version-of 2dup = [ 2drop ] [ MOVSX ] if - ] } - { uint-4-rep [ - dst 32-bit-version-of src n PEXTRD - ] } - } case ; - -M: x86.32 %select-vector - %select-vector-32 ; - -M: x86.32 %select-vector-reps - { - { sse4.1? { uchar-16-rep char-16-rep ushort-8-rep short-8-rep uint-4-rep int-4-rep } } - } available-reps ; - -M: x86.64 %select-vector - { - { longlong-2-rep [ PEXTRQ ] } - { ulonglong-2-rep [ PEXTRQ ] } - [ %select-vector-32 ] - } case ; - -M: x86.64 %select-vector-reps - { - { sse4.1? { uchar-16-rep char-16-rep ushort-8-rep short-8-rep uint-4-rep int-4-rep ulonglong-2-rep longlong-2-rep } } - } available-reps ; - -: sse1-float-4-shuffle ( dst shuffle -- ) - { - { { 0 1 2 3 } [ drop ] } - { { 0 1 0 1 } [ dup MOVLHPS ] } - { { 2 3 2 3 } [ dup MOVHLPS ] } - { { 0 0 1 1 } [ dup UNPCKLPS ] } - { { 2 2 3 3 } [ dup UNPCKHPS ] } - [ dupd SHUFPS ] - } case ; - -: float-4-shuffle ( dst shuffle -- ) - sse3? [ - { - { { 0 0 2 2 } [ dup MOVSLDUP ] } - { { 1 1 3 3 } [ dup MOVSHDUP ] } - [ sse1-float-4-shuffle ] - } case - ] [ sse1-float-4-shuffle ] if ; - -: int-4-shuffle ( dst shuffle -- ) - { - { { 0 1 2 3 } [ drop ] } - { { 0 0 1 1 } [ dup PUNPCKLDQ ] } - { { 2 2 3 3 } [ dup PUNPCKHDQ ] } - { { 0 1 0 1 } [ dup PUNPCKLQDQ ] } - { { 2 3 2 3 } [ dup PUNPCKHQDQ ] } - [ dupd PSHUFD ] - } case ; - -: longlong-2-shuffle ( dst shuffle -- ) - first2 [ 2 * dup 1 + ] bi@ 4array int-4-shuffle ; - -: >float-4-shuffle ( double-2-shuffle -- float-4-shuffle ) - [ 2 * { 0 1 } n+v ] map concat ; - -M:: x86 %shuffle-vector-imm ( dst src shuffle rep -- ) - dst src rep %copy - dst shuffle rep signed-rep { - { double-2-rep [ >float-4-shuffle float-4-shuffle ] } - { float-4-rep [ float-4-shuffle ] } - { int-4-rep [ int-4-shuffle ] } - { longlong-2-rep [ longlong-2-shuffle ] } - } case ; - -M: x86 %shuffle-vector-imm-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M:: x86 %shuffle-vector-halves-imm ( dst src1 src2 shuffle rep -- ) - dst src1 src2 rep two-operand - shuffle rep { - { double-2-rep [ >float-4-shuffle SHUFPS ] } - { float-4-rep [ SHUFPS ] } - } case ; - -M: x86 %shuffle-vector-halves-imm-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep } } - } available-reps ; - -M: x86 %shuffle-vector ( dst src shuffle rep -- ) - two-operand PSHUFB ; - -M: x86 %shuffle-vector-reps - { - { ssse3? { float-4-rep double-2-rep longlong-2-rep ulonglong-2-rep int-4-rep uint-4-rep short-8-rep ushort-8-rep char-16-rep uchar-16-rep } } - } available-reps ; - -M: x86 %merge-vector-head - [ two-operand ] keep - signed-rep { - { double-2-rep [ MOVLHPS ] } - { float-4-rep [ UNPCKLPS ] } - { longlong-2-rep [ PUNPCKLQDQ ] } - { int-4-rep [ PUNPCKLDQ ] } - { short-8-rep [ PUNPCKLWD ] } - { char-16-rep [ PUNPCKLBW ] } - } case ; - -M: x86 %merge-vector-tail - [ two-operand ] keep - signed-rep { - { double-2-rep [ UNPCKHPD ] } - { float-4-rep [ UNPCKHPS ] } - { longlong-2-rep [ PUNPCKHQDQ ] } - { int-4-rep [ PUNPCKHDQ ] } - { short-8-rep [ PUNPCKHWD ] } - { char-16-rep [ PUNPCKHBW ] } - } case ; - -M: x86 %merge-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %signed-pack-vector - [ two-operand ] keep - { - { int-4-rep [ PACKSSDW ] } - { short-8-rep [ PACKSSWB ] } - } case ; - -M: x86 %signed-pack-vector-reps - { - { sse2? { short-8-rep int-4-rep } } - } available-reps ; - -M: x86 %unsigned-pack-vector - [ two-operand ] keep - signed-rep { - { int-4-rep [ PACKUSDW ] } - { short-8-rep [ PACKUSWB ] } - } case ; - -M: x86 %unsigned-pack-vector-reps - { - { sse2? { short-8-rep } } - { sse4.1? { int-4-rep } } - } available-reps ; - -M: x86 %tail>head-vector ( dst src rep -- ) - dup { - { float-4-rep [ drop UNPCKHPD ] } - { double-2-rep [ drop UNPCKHPD ] } - [ drop [ %copy ] [ drop PUNPCKHQDQ ] 3bi ] - } case ; - -M: x86 %unpack-vector-head ( dst src rep -- ) - { - { char-16-rep [ PMOVSXBW ] } - { uchar-16-rep [ PMOVZXBW ] } - { short-8-rep [ PMOVSXWD ] } - { ushort-8-rep [ PMOVZXWD ] } - { int-4-rep [ PMOVSXDQ ] } - { uint-4-rep [ PMOVZXDQ ] } - { float-4-rep [ CVTPS2PD ] } - } case ; - -M: x86 %unpack-vector-head-reps ( -- reps ) - { - { sse2? { float-4-rep } } - { sse4.1? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } } - } available-reps ; - -M: x86 %integer>float-vector ( dst src rep -- ) - { - { int-4-rep [ CVTDQ2PS ] } - } case ; - -M: x86 %integer>float-vector-reps - { - { sse2? { int-4-rep } } - } available-reps ; - -M: x86 %float>integer-vector ( dst src rep -- ) - { - { float-4-rep [ CVTTPS2DQ ] } - } case ; - -M: x86 %float>integer-vector-reps - { - { sse2? { float-4-rep } } - } available-reps ; - -: (%compare-float-vector) ( dst src rep double single -- ) - [ double-2-rep eq? ] 2dip if ; inline - -: %compare-float-vector ( dst src rep cc -- ) - { - { cc< [ [ CMPLTPD ] [ CMPLTPS ] (%compare-float-vector) ] } - { cc<= [ [ CMPLEPD ] [ CMPLEPS ] (%compare-float-vector) ] } - { cc= [ [ CMPEQPD ] [ CMPEQPS ] (%compare-float-vector) ] } - { cc<>= [ [ CMPORDPD ] [ CMPORDPS ] (%compare-float-vector) ] } - { cc/< [ [ CMPNLTPD ] [ CMPNLTPS ] (%compare-float-vector) ] } - { cc/<= [ [ CMPNLEPD ] [ CMPNLEPS ] (%compare-float-vector) ] } - { cc/= [ [ CMPNEQPD ] [ CMPNEQPS ] (%compare-float-vector) ] } - { cc/<>= [ [ CMPUNORDPD ] [ CMPUNORDPS ] (%compare-float-vector) ] } - } case ; - -:: (%compare-int-vector) ( dst src rep int64 int32 int16 int8 -- ) - rep signed-rep :> rep' - dst src rep' { - { longlong-2-rep [ int64 call ] } - { int-4-rep [ int32 call ] } - { short-8-rep [ int16 call ] } - { char-16-rep [ int8 call ] } - } case ; inline - -: %compare-int-vector ( dst src rep cc -- ) - { - { cc= [ [ PCMPEQQ ] [ PCMPEQD ] [ PCMPEQW ] [ PCMPEQB ] (%compare-int-vector) ] } - { cc> [ [ PCMPGTQ ] [ PCMPGTD ] [ PCMPGTW ] [ PCMPGTB ] (%compare-int-vector) ] } - } case ; - -M: x86 %compare-vector ( dst src1 src2 rep cc -- ) - [ [ two-operand ] keep ] dip - over float-vector-rep? - [ %compare-float-vector ] - [ %compare-int-vector ] if ; - -: %compare-vector-eq-reps ( -- reps ) - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } } - { sse4.1? { longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -: %compare-vector-ord-reps ( -- reps ) - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep short-8-rep int-4-rep } } - { sse4.2? { longlong-2-rep } } - } available-reps ; - -M: x86 %compare-vector-reps - { - { [ dup { cc= cc/= cc/<>= cc<>= } member-eq? ] [ drop %compare-vector-eq-reps ] } - [ drop %compare-vector-ord-reps ] - } cond ; - -: %compare-float-vector-ccs ( cc -- ccs not? ) - { - { cc< [ { { cc< f } } f ] } - { cc<= [ { { cc<= f } } f ] } - { cc> [ { { cc< t } } f ] } - { cc>= [ { { cc<= t } } f ] } - { cc= [ { { cc= f } } f ] } - { cc<> [ { { cc< f } { cc< t } } f ] } - { cc<>= [ { { cc<>= f } } f ] } - { cc/< [ { { cc/< f } } f ] } - { cc/<= [ { { cc/<= f } } f ] } - { cc/> [ { { cc/< t } } f ] } - { cc/>= [ { { cc/<= t } } f ] } - { cc/= [ { { cc/= f } } f ] } - { cc/<> [ { { cc/= f } { cc/<>= f } } f ] } - { cc/<>= [ { { cc/<>= f } } f ] } - } case ; - -: %compare-int-vector-ccs ( cc -- ccs not? ) - order-cc { - { cc< [ { { cc> t } } f ] } - { cc<= [ { { cc> f } } t ] } - { cc> [ { { cc> f } } f ] } - { cc>= [ { { cc> t } } t ] } - { cc= [ { { cc= f } } f ] } - { cc/= [ { { cc= f } } t ] } - { t [ { } t ] } - { f [ { } f ] } - } case ; - -M: x86 %compare-vector-ccs - swap float-vector-rep? - [ %compare-float-vector-ccs ] - [ %compare-int-vector-ccs ] if ; - -:: %test-vector-mask ( dst temp mask vcc -- ) - vcc { - { vcc-any [ dst dst TEST dst temp \ CMOVNE (%boolean) ] } - { vcc-none [ dst dst TEST dst temp \ CMOVE (%boolean) ] } - { vcc-all [ dst mask CMP dst temp \ CMOVE (%boolean) ] } - { vcc-notall [ dst mask CMP dst temp \ CMOVNE (%boolean) ] } - } case ; - -: %move-vector-mask ( dst src rep -- mask ) - { - { double-2-rep [ MOVMSKPS HEX: f ] } - { float-4-rep [ MOVMSKPS HEX: f ] } - [ drop PMOVMSKB HEX: ffff ] - } case ; - -M:: x86 %test-vector ( dst src temp rep vcc -- ) - dst src rep %move-vector-mask :> mask - dst temp mask vcc %test-vector-mask ; - -:: %test-vector-mask-branch ( label temp mask vcc -- ) - vcc { - { vcc-any [ temp temp TEST label JNE ] } - { vcc-none [ temp temp TEST label JE ] } - { vcc-all [ temp mask CMP label JE ] } - { vcc-notall [ temp mask CMP label JNE ] } - } case ; - -M:: x86 %test-vector-branch ( label src temp rep vcc -- ) - temp src rep %move-vector-mask :> mask - label temp mask vcc %test-vector-mask-branch ; - -M: x86 %test-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %add-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { float-4-rep [ ADDPS ] } - { double-2-rep [ ADDPD ] } - { char-16-rep [ PADDB ] } - { uchar-16-rep [ PADDB ] } - { short-8-rep [ PADDW ] } - { ushort-8-rep [ PADDW ] } - { int-4-rep [ PADDD ] } - { uint-4-rep [ PADDD ] } - { longlong-2-rep [ PADDQ ] } - { ulonglong-2-rep [ PADDQ ] } - } case ; - -M: x86 %add-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %saturated-add-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { char-16-rep [ PADDSB ] } - { uchar-16-rep [ PADDUSB ] } - { short-8-rep [ PADDSW ] } - { ushort-8-rep [ PADDUSW ] } - } case ; - -M: x86 %saturated-add-vector-reps - { - { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } } - } available-reps ; - -M: x86 %add-sub-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { float-4-rep [ ADDSUBPS ] } - { double-2-rep [ ADDSUBPD ] } - } case ; - -M: x86 %add-sub-vector-reps - { - { sse3? { float-4-rep double-2-rep } } - } available-reps ; - -M: x86 %sub-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { float-4-rep [ SUBPS ] } - { double-2-rep [ SUBPD ] } - { char-16-rep [ PSUBB ] } - { uchar-16-rep [ PSUBB ] } - { short-8-rep [ PSUBW ] } - { ushort-8-rep [ PSUBW ] } - { int-4-rep [ PSUBD ] } - { uint-4-rep [ PSUBD ] } - { longlong-2-rep [ PSUBQ ] } - { ulonglong-2-rep [ PSUBQ ] } - } case ; - -M: x86 %sub-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %saturated-sub-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { char-16-rep [ PSUBSB ] } - { uchar-16-rep [ PSUBUSB ] } - { short-8-rep [ PSUBSW ] } - { ushort-8-rep [ PSUBUSW ] } - } case ; - -M: x86 %saturated-sub-vector-reps - { - { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } } - } available-reps ; - -M: x86 %mul-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { float-4-rep [ MULPS ] } - { double-2-rep [ MULPD ] } - { short-8-rep [ PMULLW ] } - { ushort-8-rep [ PMULLW ] } - { int-4-rep [ PMULLD ] } - { uint-4-rep [ PMULLD ] } - } case ; - -M: x86 %mul-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep short-8-rep ushort-8-rep } } - { sse4.1? { int-4-rep uint-4-rep } } - } available-reps ; - -M: x86 %mul-high-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { short-8-rep [ PMULHW ] } - { ushort-8-rep [ PMULHUW ] } - } case ; - -M: x86 %mul-high-vector-reps - { - { sse2? { short-8-rep ushort-8-rep } } - } available-reps ; - -M: x86 %mul-horizontal-add-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { char-16-rep [ PMADDUBSW ] } - { uchar-16-rep [ PMADDUBSW ] } - { short-8-rep [ PMADDWD ] } - } case ; - -M: x86 %mul-horizontal-add-vector-reps - { - { sse2? { short-8-rep } } - { ssse3? { char-16-rep uchar-16-rep } } - } available-reps ; - -M: x86 %div-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { float-4-rep [ DIVPS ] } - { double-2-rep [ DIVPD ] } - } case ; - -M: x86 %div-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep } } - } available-reps ; - -M: x86 %min-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { char-16-rep [ PMINSB ] } - { uchar-16-rep [ PMINUB ] } - { short-8-rep [ PMINSW ] } - { ushort-8-rep [ PMINUW ] } - { int-4-rep [ PMINSD ] } - { uint-4-rep [ PMINUD ] } - { float-4-rep [ MINPS ] } - { double-2-rep [ MINPD ] } - } case ; - -M: x86 %min-vector-reps - { - { sse? { float-4-rep } } - { sse2? { uchar-16-rep short-8-rep double-2-rep } } - { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } } - } available-reps ; - -M: x86 %max-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { char-16-rep [ PMAXSB ] } - { uchar-16-rep [ PMAXUB ] } - { short-8-rep [ PMAXSW ] } - { ushort-8-rep [ PMAXUW ] } - { int-4-rep [ PMAXSD ] } - { uint-4-rep [ PMAXUD ] } - { float-4-rep [ MAXPS ] } - { double-2-rep [ MAXPD ] } - } case ; - -M: x86 %max-vector-reps - { - { sse? { float-4-rep } } - { sse2? { uchar-16-rep short-8-rep double-2-rep } } - { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } } - } available-reps ; - -M: x86 %avg-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { uchar-16-rep [ PAVGB ] } - { ushort-8-rep [ PAVGW ] } - } case ; - -M: x86 %avg-vector-reps - { - { sse2? { uchar-16-rep ushort-8-rep } } - } available-reps ; - -M: x86 %dot-vector - [ two-operand ] keep - { - { float-4-rep [ HEX: ff DPPS ] } - { double-2-rep [ HEX: ff DPPD ] } - } case ; - -M: x86 %dot-vector-reps - { - { sse4.1? { float-4-rep double-2-rep } } - } available-reps ; - -M: x86 %sad-vector - [ two-operand ] keep - { - { uchar-16-rep [ PSADBW ] } - } case ; - -M: x86 %sad-vector-reps - { - { sse2? { uchar-16-rep } } - } available-reps ; - -M: x86 %horizontal-add-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - signed-rep { - { float-4-rep [ HADDPS ] } - { double-2-rep [ HADDPD ] } - { int-4-rep [ PHADDD ] } - { short-8-rep [ PHADDW ] } - } case ; - -M: x86 %horizontal-add-vector-reps - { - { sse3? { float-4-rep double-2-rep } } - { ssse3? { int-4-rep uint-4-rep short-8-rep ushort-8-rep } } - } available-reps ; - -M: x86 %horizontal-shl-vector-imm ( dst src1 src2 rep -- ) - two-operand PSLLDQ ; - -M: x86 %horizontal-shl-vector-imm-reps - { - { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep float-4-rep double-2-rep } } - } available-reps ; - -M: x86 %horizontal-shr-vector-imm ( dst src1 src2 rep -- ) - two-operand PSRLDQ ; - -M: x86 %horizontal-shr-vector-imm-reps - { - { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep float-4-rep double-2-rep } } - } available-reps ; - -M: x86 %abs-vector ( dst src rep -- ) - { - { char-16-rep [ PABSB ] } - { short-8-rep [ PABSW ] } - { int-4-rep [ PABSD ] } - } case ; - -M: x86 %abs-vector-reps - { - { ssse3? { char-16-rep short-8-rep int-4-rep } } - } available-reps ; - -M: x86 %sqrt-vector ( dst src rep -- ) - { - { float-4-rep [ SQRTPS ] } - { double-2-rep [ SQRTPD ] } - } case ; - -M: x86 %sqrt-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep } } - } available-reps ; - -M: x86 %and-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { float-4-rep [ ANDPS ] } - { double-2-rep [ ANDPS ] } - [ drop PAND ] - } case ; - -M: x86 %and-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %andn-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { float-4-rep [ ANDNPS ] } - { double-2-rep [ ANDNPS ] } - [ drop PANDN ] - } case ; - -M: x86 %andn-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %or-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { float-4-rep [ ORPS ] } - { double-2-rep [ ORPS ] } - [ drop POR ] - } case ; - -M: x86 %or-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %xor-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { float-4-rep [ XORPS ] } - { double-2-rep [ XORPS ] } - [ drop PXOR ] - } case ; - -M: x86 %xor-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %shl-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { short-8-rep [ PSLLW ] } - { ushort-8-rep [ PSLLW ] } - { int-4-rep [ PSLLD ] } - { uint-4-rep [ PSLLD ] } - { longlong-2-rep [ PSLLQ ] } - { ulonglong-2-rep [ PSLLQ ] } - } case ; - -M: x86 %shl-vector-reps - { - { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %shr-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { short-8-rep [ PSRAW ] } - { ushort-8-rep [ PSRLW ] } - { int-4-rep [ PSRAD ] } - { uint-4-rep [ PSRLD ] } - { ulonglong-2-rep [ PSRLQ ] } - } case ; - -M: x86 %shr-vector-reps - { - { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %shl-vector-imm %shl-vector ; -M: x86 %shl-vector-imm-reps %shl-vector-reps ; -M: x86 %shr-vector-imm %shr-vector ; -M: x86 %shr-vector-imm-reps %shr-vector-reps ; - -: scalar-sized-reg ( reg rep -- reg' ) - rep-size 8 * n-bit-version-of ; - -M: x86 %integer>scalar drop MOVD ; - -:: %scalar>integer-32 ( dst src rep -- ) - rep { - { int-scalar-rep [ - dst 32-bit-version-of src MOVD - dst dst 32-bit-version-of - 2dup eq? [ 2drop ] [ MOVSX ] if - ] } - { uint-scalar-rep [ - dst 32-bit-version-of src MOVD - ] } - { short-scalar-rep [ - dst 32-bit-version-of src MOVD - dst dst 16-bit-version-of MOVSX - ] } - { ushort-scalar-rep [ - dst 32-bit-version-of src MOVD - dst dst 16-bit-version-of MOVZX - ] } - { char-scalar-rep [ - dst 32-bit-version-of src MOVD - dst { } 8 [| tmp-dst | - tmp-dst dst int-rep %copy - tmp-dst tmp-dst 8-bit-version-of MOVSX - dst tmp-dst int-rep %copy - ] with-small-register - ] } - { uchar-scalar-rep [ - dst 32-bit-version-of src MOVD - dst { } 8 [| tmp-dst | - tmp-dst dst int-rep %copy - tmp-dst tmp-dst 8-bit-version-of MOVZX - dst tmp-dst int-rep %copy - ] with-small-register - ] } - } case ; - -M: x86.32 %scalar>integer ( dst src rep -- ) %scalar>integer-32 ; - -M: x86.64 %scalar>integer ( dst src rep -- ) - { - { longlong-scalar-rep [ MOVD ] } - { ulonglong-scalar-rep [ MOVD ] } - [ %scalar>integer-32 ] - } case ; - -M: x86 %vector>scalar %copy ; - -M: x86 %scalar>vector %copy ; - M:: x86 %spill ( src rep dst -- ) dst src rep %copy ; @@ -1590,18 +622,84 @@ M: x86 immediate-arithmetic? ( n -- ? ) M: x86 immediate-bitwise? ( n -- ? ) HEX: -80000000 HEX: 7fffffff between? ; +: %cmov-float= ( dst src -- ) + [ + "no-move" define-label + + "no-move" get [ JNE ] [ JP ] bi + MOV + "no-move" resolve-label + ] with-scope ; + +: %cmov-float/= ( dst src -- ) + [ + "no-move" define-label + "move" define-label + + "move" get JP + "no-move" get JE + "move" resolve-label + MOV + "no-move" resolve-label + ] with-scope ; + +:: (%compare-float) ( dst src1 src2 cc temp compare -- ) + cc { + { cc< [ src2 src1 \ compare call( a b -- ) dst temp \ CMOVA (%boolean) ] } + { cc<= [ src2 src1 \ compare call( a b -- ) dst temp \ CMOVAE (%boolean) ] } + { cc> [ src1 src2 \ compare call( a b -- ) dst temp \ CMOVA (%boolean) ] } + { cc>= [ src1 src2 \ compare call( a b -- ) dst temp \ CMOVAE (%boolean) ] } + { cc= [ src1 src2 \ compare call( a b -- ) dst temp \ %cmov-float= (%boolean) ] } + { cc<> [ src1 src2 \ compare call( a b -- ) dst temp \ CMOVNE (%boolean) ] } + { cc<>= [ src1 src2 \ compare call( a b -- ) dst temp \ CMOVNP (%boolean) ] } + { cc/< [ src2 src1 \ compare call( a b -- ) dst temp \ CMOVBE (%boolean) ] } + { cc/<= [ src2 src1 \ compare call( a b -- ) dst temp \ CMOVB (%boolean) ] } + { cc/> [ src1 src2 \ compare call( a b -- ) dst temp \ CMOVBE (%boolean) ] } + { cc/>= [ src1 src2 \ compare call( a b -- ) dst temp \ CMOVB (%boolean) ] } + { cc/= [ src1 src2 \ compare call( a b -- ) dst temp \ %cmov-float/= (%boolean) ] } + { cc/<> [ src1 src2 \ compare call( a b -- ) dst temp \ CMOVE (%boolean) ] } + { cc/<>= [ src1 src2 \ compare call( a b -- ) dst temp \ CMOVP (%boolean) ] } + } case ; inline + +: %jump-float= ( label -- ) + [ + "no-jump" define-label + "no-jump" get JP + JE + "no-jump" resolve-label + ] with-scope ; + +: %jump-float/= ( label -- ) + [ JNE ] [ JP ] bi ; + +:: (%compare-float-branch) ( label src1 src2 cc compare -- ) + cc { + { cc< [ src2 src1 \ compare call( a b -- ) label JA ] } + { cc<= [ src2 src1 \ compare call( a b -- ) label JAE ] } + { cc> [ src1 src2 \ compare call( a b -- ) label JA ] } + { cc>= [ src1 src2 \ compare call( a b -- ) label JAE ] } + { cc= [ src1 src2 \ compare call( a b -- ) label %jump-float= ] } + { cc<> [ src1 src2 \ compare call( a b -- ) label JNE ] } + { cc<>= [ src1 src2 \ compare call( a b -- ) label JNP ] } + { cc/< [ src2 src1 \ compare call( a b -- ) label JBE ] } + { cc/<= [ src2 src1 \ compare call( a b -- ) label JB ] } + { cc/> [ src1 src2 \ compare call( a b -- ) label JBE ] } + { cc/>= [ src1 src2 \ compare call( a b -- ) label JB ] } + { cc/= [ src1 src2 \ compare call( a b -- ) label %jump-float/= ] } + { cc/<> [ src1 src2 \ compare call( a b -- ) label JE ] } + { cc/<>= [ src1 src2 \ compare call( a b -- ) label JP ] } + } case ; + enable-min/max enable-log2 enable-float-intrinsics enable-float-functions -enable-float-min/max enable-fsqrt : check-sse ( -- ) + "Checking for multimedia extensions... " write flush [ { (sse-version) popcnt? } compile ] with-optimizer - sse-version 20 < [ - "Factor requires SSE2, which your CPU does not support." print - flush - 1 exit - ] when ; + sse-version + [ sse-string " detected" append print ] + [ 20 < "cpu.x86.x87" "cpu.x86.sse" ? require ] bi ; diff --git a/basis/cpu/x86/x87/authors.txt b/basis/cpu/x86/x87/authors.txt new file mode 100644 index 0000000000..1901f27a24 --- /dev/null +++ b/basis/cpu/x86/x87/authors.txt @@ -0,0 +1 @@ +Slava Pestov diff --git a/basis/cpu/x86/x87/tags.txt b/basis/cpu/x86/x87/tags.txt new file mode 100644 index 0000000000..ebb74b4d5f --- /dev/null +++ b/basis/cpu/x86/x87/tags.txt @@ -0,0 +1 @@ +not loaded diff --git a/basis/cpu/x86/x87/x87.factor b/basis/cpu/x86/x87/x87.factor new file mode 100644 index 0000000000..8c920ea87b --- /dev/null +++ b/basis/cpu/x86/x87/x87.factor @@ -0,0 +1,91 @@ +! Copyright (C) 2010 Slava Pestov. +! See http://factorcode.org/license.txt for BSD license. +USING: alien.c-types combinators kernel locals system namespaces +compiler.codegen.fixup compiler.constants +compiler.cfg.comparisons cpu.architecture cpu.x86 +cpu.x86.assembler cpu.x86.assembler.operands ; +IN: cpu.x86.x87 + +! x87 unit is only used if SSE2 is not available. + +: FLD* ( src -- ) [ ST0 ] dip FLD ; +: FSTP* ( dst -- ) ST0 FSTP ; + +: copy-register-x87 ( dst src -- ) + 2dup eq? [ 2drop ] [ FLD* shuffle-down FSTP* ] if ; + +M: float-rep copy-register* drop copy-register-x87 ; +M: double-rep copy-register* drop copy-register-x87 ; + +: load-x87 ( dst src rep -- ) + { + { float-rep [ FLDS shuffle-down FSTP* ] } + { double-rep [ FLDL shuffle-down FSTP* ] } + } case ; + +: store-x87 ( dst src rep -- ) + { + { float-rep [ FLD* FSTPS ] } + { double-rep [ FLD* FSTPL ] } + } case ; + +: copy-memory-x87 ( dst src rep -- ) + { + { [ pick register? ] [ load-x87 ] } + { [ over register? ] [ store-x87 ] } + } cond ; + +M: float-rep copy-memory* copy-memory-x87 ; +M: double-rep copy-memory* copy-memory-x87 ; + +M: x86 %load-float + 0 [] FLDS + rc-absolute rel-binary-literal + shuffle-down FSTP* ; + +M: x86 %load-double + 0 [] FLDL + rc-absolute rel-binary-literal + shuffle-down FSTP* ; + +:: binary-op ( dst src1 src2 quot -- ) + src1 FLD* + ST0 src2 shuffle-down quot call + dst shuffle-down FSTP* ; inline + +M: x86 %add-float [ FADD ] binary-op ; +M: x86 %sub-float [ FSUB ] binary-op ; +M: x86 %mul-float [ FMUL ] binary-op ; +M: x86 %div-float [ FDIV ] binary-op ; + +M: x86 %sqrt FLD* FSQRT shuffle-down FSTP* ; + +M: x86 %single>double-float copy-register-x87 ; +M: x86 %double>single-float copy-register-x87 ; + +M: x86 integer-float-needs-stack-frame? t ; + +M:: x86 %integer>float ( dst src -- ) + 4 stack@ src MOV + 4 stack@ FILDD + dst shuffle-down FSTP* ; + +M:: x86 %float>integer ( dst src -- ) + src FLD* + 4 stack@ FISTTPD + dst 4 stack@ MOV ; + +: compare-op ( src1 src2 quot -- ) + [ ST0 ] 3dip binary-op ; inline + +M: x86 %compare-float-ordered ( dst src1 src2 cc temp -- ) + [ [ FCOMI ] compare-op ] (%compare-float) ; + +M: x86 %compare-float-unordered ( dst src1 src2 cc temp -- ) + [ [ FUCOMI ] compare-op ] (%compare-float) ; + +M: x86 %compare-float-ordered-branch ( label src1 src2 cc -- ) + [ [ FCOMI ] compare-op ] (%compare-float-branch) ; + +M: x86 %compare-float-unordered-branch ( label src1 src2 cc -- ) + [ [ FUCOMI ] compare-op ] (%compare-float-branch) ;