| 
									
										
										
										
											2010-05-17 10:44:49 -04:00
										 |  |  | ! Copyright (C) 2009, 2010 Joe Groff, Slava Pestov. | 
					
						
							|  |  |  | ! See http://factorcode.org/license.txt for BSD license. | 
					
						
							| 
									
										
										
										
											2016-05-25 10:10:33 -04:00
										 |  |  | USING: alien.data arrays assocs combinators compiler.cfg.comparisons | 
					
						
							|  |  |  | compiler.cfg.intrinsics cpu.architecture cpu.x86 cpu.x86.assembler | 
					
						
							|  |  |  | cpu.x86.assembler.operands cpu.x86.features fry kernel locals macros | 
					
						
							|  |  |  | math math.vectors quotations sequences system ;
 | 
					
						
							| 
									
										
										
										
											2010-10-25 16:54:42 -04:00
										 |  |  | QUALIFIED-WITH: alien.c-types c | 
					
						
							| 
									
										
										
										
											2010-05-17 10:44:49 -04:00
										 |  |  | IN: cpu.x86.sse | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ! Scalar floating point with SSE2 | 
					
						
							| 
									
										
										
										
											2010-10-25 16:54:42 -04:00
										 |  |  | M: x86 %load-float c:float <ref> float-rep %load-vector ;
 | 
					
						
							|  |  |  | M: x86 %load-double c:double <ref> double-rep %load-vector ;
 | 
					
						
							| 
									
										
										
										
											2010-05-17 10:44:49 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | M: float-rep copy-register* drop MOVAPS ;
 | 
					
						
							|  |  |  | M: double-rep copy-register* drop MOVAPS ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: float-rep copy-memory* drop MOVSS ;
 | 
					
						
							|  |  |  | M: double-rep copy-memory* drop MOVSD ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %add-float double-rep two-operand ADDSD ;
 | 
					
						
							|  |  |  | M: x86 %sub-float double-rep two-operand SUBSD ;
 | 
					
						
							|  |  |  | M: x86 %mul-float double-rep two-operand MULSD ;
 | 
					
						
							|  |  |  | M: x86 %div-float double-rep two-operand DIVSD ;
 | 
					
						
							|  |  |  | M: x86 %min-float double-rep two-operand MINSD ;
 | 
					
						
							|  |  |  | M: x86 %max-float double-rep two-operand MAXSD ;
 | 
					
						
							|  |  |  | M: x86 %sqrt SQRTSD ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : %clear-unless-in-place ( dst src -- )
 | 
					
						
							|  |  |  |     over = [ drop ] [ dup XORPS ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %single>double-float [ %clear-unless-in-place ] [ CVTSS2SD ] 2bi ;
 | 
					
						
							|  |  |  | M: x86 %double>single-float [ %clear-unless-in-place ] [ CVTSD2SS ] 2bi ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 integer-float-needs-stack-frame? f ;
 | 
					
						
							|  |  |  | M: x86 %integer>float [ drop dup XORPS ] [ CVTSI2SD ] 2bi ;
 | 
					
						
							|  |  |  | M: x86 %float>integer CVTTSD2SI ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %compare-float-ordered ( dst src1 src2 cc temp -- )
 | 
					
						
							|  |  |  |     [ COMISD ] (%compare-float) ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %compare-float-unordered ( dst src1 src2 cc temp -- )
 | 
					
						
							|  |  |  |     [ UCOMISD ] (%compare-float) ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %compare-float-ordered-branch ( label src1 src2 cc -- )
 | 
					
						
							|  |  |  |     [ COMISD ] (%compare-float-branch) ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %compare-float-unordered-branch ( label src1 src2 cc -- )
 | 
					
						
							|  |  |  |     [ UCOMISD ] (%compare-float-branch) ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ! SIMD | 
					
						
							|  |  |  | M: float-4-rep copy-register* drop MOVAPS ;
 | 
					
						
							|  |  |  | M: double-2-rep copy-register* drop MOVAPS ;
 | 
					
						
							|  |  |  | M: vector-rep copy-register* drop MOVDQA ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-19 14:42:46 -04:00
										 |  |  | MACRO: available-reps ( alist -- quot )
 | 
					
						
							| 
									
										
										
										
											2010-05-17 10:44:49 -04:00
										 |  |  |     ! Each SSE version adds new representations and supports | 
					
						
							|  |  |  |     ! all old ones | 
					
						
							| 
									
										
										
										
											2016-03-08 09:04:35 -05:00
										 |  |  |     unzip { } [ append ] accumulate* | 
					
						
							| 
									
										
										
										
											2010-05-17 10:44:49 -04:00
										 |  |  |     [ [ 1quotation ] map ] bi@ zip
 | 
					
						
							|  |  |  |     reverse [ { } ] suffix
 | 
					
						
							|  |  |  |     '[ _ cond ] ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %alien-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse? { float-4-rep } } | 
					
						
							|  |  |  |         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %zero-vector | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { double-2-rep [ dup XORPS ] } | 
					
						
							|  |  |  |         { float-4-rep [ dup XORPS ] } | 
					
						
							|  |  |  |         [ drop dup PXOR ] | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %zero-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse? { float-4-rep } } | 
					
						
							|  |  |  |         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %fill-vector | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { double-2-rep [ dup [ XORPS ] [ CMPEQPS ] 2bi ] } | 
					
						
							|  |  |  |         { float-4-rep  [ dup [ XORPS ] [ CMPEQPS ] 2bi ] } | 
					
						
							|  |  |  |         [ drop dup PCMPEQB ] | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %fill-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse? { float-4-rep } } | 
					
						
							|  |  |  |         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- )
 | 
					
						
							|  |  |  |     rep signed-rep { | 
					
						
							|  |  |  |         { float-4-rep [ | 
					
						
							|  |  |  |             dst src1 float-4-rep %copy | 
					
						
							|  |  |  |             dst src2 UNPCKLPS | 
					
						
							|  |  |  |             src3 src4 UNPCKLPS | 
					
						
							|  |  |  |             dst src3 MOVLHPS | 
					
						
							|  |  |  |         ] } | 
					
						
							|  |  |  |         { int-4-rep [ | 
					
						
							|  |  |  |             dst src1 int-4-rep %copy | 
					
						
							|  |  |  |             dst src2 PUNPCKLDQ | 
					
						
							|  |  |  |             src3 src4 PUNPCKLDQ | 
					
						
							|  |  |  |             dst src3 PUNPCKLQDQ | 
					
						
							|  |  |  |         ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %gather-vector-4-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         ! Can't do this with sse1 since it will want to unbox | 
					
						
							|  |  |  |         ! double-precision floats and convert to single precision | 
					
						
							|  |  |  |         { sse2? { float-4-rep int-4-rep uint-4-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M:: x86 %gather-int-vector-4 ( dst src1 src2 src3 src4 rep -- )
 | 
					
						
							|  |  |  |     dst rep %zero-vector | 
					
						
							|  |  |  |     dst src1 32-bit-version-of 0 PINSRD | 
					
						
							|  |  |  |     dst src2 32-bit-version-of 1 PINSRD | 
					
						
							|  |  |  |     dst src3 32-bit-version-of 2 PINSRD | 
					
						
							|  |  |  |     dst src4 32-bit-version-of 3 PINSRD ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %gather-int-vector-4-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse4.1? { int-4-rep uint-4-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M:: x86 %gather-vector-2 ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     rep signed-rep { | 
					
						
							|  |  |  |         { double-2-rep [ | 
					
						
							|  |  |  |             dst src1 double-2-rep %copy | 
					
						
							|  |  |  |             dst src2 MOVLHPS | 
					
						
							|  |  |  |         ] } | 
					
						
							|  |  |  |         { longlong-2-rep [ | 
					
						
							|  |  |  |             dst src1 longlong-2-rep %copy | 
					
						
							|  |  |  |             dst src2 PUNPCKLQDQ | 
					
						
							|  |  |  |         ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %gather-vector-2-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse2? { double-2-rep longlong-2-rep ulonglong-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M:: x86.64 %gather-int-vector-2 ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     dst rep %zero-vector | 
					
						
							|  |  |  |     dst src1 0 PINSRQ | 
					
						
							|  |  |  |     dst src2 1 PINSRQ ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86.64 %gather-int-vector-2-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse4.1? { longlong-2-rep ulonglong-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | :: %select-vector-32 ( dst src n rep -- )
 | 
					
						
							|  |  |  |     rep { | 
					
						
							|  |  |  |         { char-16-rep [ | 
					
						
							|  |  |  |             dst 32-bit-version-of src n PEXTRB | 
					
						
							|  |  |  |             dst dst 8-bit-version-of MOVSX | 
					
						
							|  |  |  |         ] } | 
					
						
							|  |  |  |         { uchar-16-rep [ | 
					
						
							|  |  |  |             dst 32-bit-version-of src n PEXTRB | 
					
						
							|  |  |  |         ] } | 
					
						
							|  |  |  |         { short-8-rep [ | 
					
						
							|  |  |  |             dst 32-bit-version-of src n PEXTRW | 
					
						
							|  |  |  |             dst dst 16-bit-version-of MOVSX | 
					
						
							|  |  |  |         ] } | 
					
						
							|  |  |  |         { ushort-8-rep [ | 
					
						
							|  |  |  |             dst 32-bit-version-of src n PEXTRW | 
					
						
							|  |  |  |         ] } | 
					
						
							|  |  |  |         { int-4-rep [ | 
					
						
							|  |  |  |             dst 32-bit-version-of src n PEXTRD | 
					
						
							|  |  |  |             dst dst 32-bit-version-of 2dup = [ 2drop ] [ MOVSX ] if
 | 
					
						
							|  |  |  |         ] } | 
					
						
							|  |  |  |         { uint-4-rep [ | 
					
						
							|  |  |  |             dst 32-bit-version-of src n PEXTRD | 
					
						
							|  |  |  |         ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86.32 %select-vector | 
					
						
							|  |  |  |     %select-vector-32 ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86.32 %select-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse4.1? { uchar-16-rep char-16-rep ushort-8-rep short-8-rep uint-4-rep int-4-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86.64 %select-vector | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { longlong-2-rep  [ PEXTRQ ] } | 
					
						
							|  |  |  |         { ulonglong-2-rep [ PEXTRQ ] } | 
					
						
							|  |  |  |         [ %select-vector-32 ] | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86.64 %select-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse4.1? { uchar-16-rep char-16-rep ushort-8-rep short-8-rep uint-4-rep int-4-rep ulonglong-2-rep longlong-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : sse1-float-4-shuffle ( dst shuffle -- )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { { 0 1 2 3 } [ drop ] } | 
					
						
							|  |  |  |         { { 0 1 0 1 } [ dup MOVLHPS ] } | 
					
						
							|  |  |  |         { { 2 3 2 3 } [ dup MOVHLPS ] } | 
					
						
							|  |  |  |         { { 0 0 1 1 } [ dup UNPCKLPS ] } | 
					
						
							|  |  |  |         { { 2 2 3 3 } [ dup UNPCKHPS ] } | 
					
						
							|  |  |  |         [ dupd SHUFPS ] | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : float-4-shuffle ( dst shuffle -- )
 | 
					
						
							|  |  |  |     sse3? [ | 
					
						
							|  |  |  |         { | 
					
						
							|  |  |  |             { { 0 0 2 2 } [ dup MOVSLDUP ] } | 
					
						
							|  |  |  |             { { 1 1 3 3 } [ dup MOVSHDUP ] } | 
					
						
							|  |  |  |             [ sse1-float-4-shuffle ] | 
					
						
							|  |  |  |         } case
 | 
					
						
							|  |  |  |     ] [ sse1-float-4-shuffle ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : int-4-shuffle ( dst shuffle -- )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { { 0 1 2 3 } [ drop ] } | 
					
						
							|  |  |  |         { { 0 0 1 1 } [ dup PUNPCKLDQ ] } | 
					
						
							|  |  |  |         { { 2 2 3 3 } [ dup PUNPCKHDQ ] } | 
					
						
							|  |  |  |         { { 0 1 0 1 } [ dup PUNPCKLQDQ ] } | 
					
						
							|  |  |  |         { { 2 3 2 3 } [ dup PUNPCKHQDQ ] } | 
					
						
							|  |  |  |         [ dupd PSHUFD ] | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : longlong-2-shuffle ( dst shuffle -- )
 | 
					
						
							|  |  |  |     first2 [ 2 * dup 1 + ] bi@ 4array int-4-shuffle ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : >float-4-shuffle ( double-2-shuffle -- float-4-shuffle )
 | 
					
						
							|  |  |  |     [ 2 * { 0 1 } n+v ] map concat ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M:: x86 %shuffle-vector-imm ( dst src shuffle rep -- )
 | 
					
						
							|  |  |  |     dst src rep %copy | 
					
						
							|  |  |  |     dst shuffle rep signed-rep { | 
					
						
							|  |  |  |         { double-2-rep [ >float-4-shuffle float-4-shuffle ] } | 
					
						
							|  |  |  |         { float-4-rep [ float-4-shuffle ] } | 
					
						
							|  |  |  |         { int-4-rep [ int-4-shuffle ] } | 
					
						
							|  |  |  |         { longlong-2-rep [ longlong-2-shuffle ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %shuffle-vector-imm-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse? { float-4-rep } } | 
					
						
							|  |  |  |         { sse2? { double-2-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M:: x86 %shuffle-vector-halves-imm ( dst src1 src2 shuffle rep -- )
 | 
					
						
							|  |  |  |     dst src1 src2 rep two-operand | 
					
						
							|  |  |  |     shuffle rep { | 
					
						
							|  |  |  |         { double-2-rep [ >float-4-shuffle SHUFPS ] } | 
					
						
							|  |  |  |         { float-4-rep [ SHUFPS ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %shuffle-vector-halves-imm-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse? { float-4-rep } } | 
					
						
							|  |  |  |         { sse2? { double-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %shuffle-vector ( dst src shuffle rep -- )
 | 
					
						
							|  |  |  |     two-operand PSHUFB ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %shuffle-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { ssse3? { float-4-rep double-2-rep longlong-2-rep ulonglong-2-rep int-4-rep uint-4-rep short-8-rep ushort-8-rep char-16-rep uchar-16-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %merge-vector-head | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     signed-rep { | 
					
						
							|  |  |  |         { double-2-rep   [ MOVLHPS ] } | 
					
						
							|  |  |  |         { float-4-rep    [ UNPCKLPS ] } | 
					
						
							|  |  |  |         { longlong-2-rep [ PUNPCKLQDQ ] } | 
					
						
							|  |  |  |         { int-4-rep      [ PUNPCKLDQ ] } | 
					
						
							|  |  |  |         { short-8-rep    [ PUNPCKLWD ] } | 
					
						
							|  |  |  |         { char-16-rep    [ PUNPCKLBW ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %merge-vector-tail | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     signed-rep { | 
					
						
							|  |  |  |         { double-2-rep   [ UNPCKHPD ] } | 
					
						
							|  |  |  |         { float-4-rep    [ UNPCKHPS ] } | 
					
						
							|  |  |  |         { longlong-2-rep [ PUNPCKHQDQ ] } | 
					
						
							|  |  |  |         { int-4-rep      [ PUNPCKHDQ ] } | 
					
						
							|  |  |  |         { short-8-rep    [ PUNPCKHWD ] } | 
					
						
							|  |  |  |         { char-16-rep    [ PUNPCKHBW ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %merge-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse? { float-4-rep } } | 
					
						
							|  |  |  |         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-06-01 03:34:50 -04:00
										 |  |  | M: x86 %float-pack-vector | 
					
						
							|  |  |  |     drop CVTPD2PS ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %float-pack-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse2? { double-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-05-17 10:44:49 -04:00
										 |  |  | M: x86 %signed-pack-vector | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { int-4-rep    [ PACKSSDW ] } | 
					
						
							|  |  |  |         { short-8-rep  [ PACKSSWB ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %signed-pack-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse2? { short-8-rep int-4-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %unsigned-pack-vector | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     signed-rep { | 
					
						
							|  |  |  |         { int-4-rep   [ PACKUSDW ] } | 
					
						
							|  |  |  |         { short-8-rep [ PACKUSWB ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %unsigned-pack-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse2? { short-8-rep } } | 
					
						
							|  |  |  |         { sse4.1? { int-4-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %tail>head-vector ( dst src rep -- )
 | 
					
						
							|  |  |  |     dup { | 
					
						
							|  |  |  |         { float-4-rep [ drop UNPCKHPD ] } | 
					
						
							|  |  |  |         { double-2-rep [ drop UNPCKHPD ] } | 
					
						
							|  |  |  |         [ drop [ %copy ] [ drop PUNPCKHQDQ ] 3bi ] | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %unpack-vector-head ( dst src rep -- )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { char-16-rep  [ PMOVSXBW ] } | 
					
						
							|  |  |  |         { uchar-16-rep [ PMOVZXBW ] } | 
					
						
							|  |  |  |         { short-8-rep  [ PMOVSXWD ] } | 
					
						
							|  |  |  |         { ushort-8-rep [ PMOVZXWD ] } | 
					
						
							|  |  |  |         { int-4-rep    [ PMOVSXDQ ] } | 
					
						
							|  |  |  |         { uint-4-rep   [ PMOVZXDQ ] } | 
					
						
							|  |  |  |         { float-4-rep  [ CVTPS2PD ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %unpack-vector-head-reps ( -- reps )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse2? { float-4-rep } } | 
					
						
							|  |  |  |         { sse4.1? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %integer>float-vector ( dst src rep -- )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { int-4-rep [ CVTDQ2PS ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %integer>float-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse2? { int-4-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %float>integer-vector ( dst src rep -- )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { float-4-rep [ CVTTPS2DQ ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %float>integer-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse2? { float-4-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : (%compare-float-vector) ( dst src rep double single -- )
 | 
					
						
							|  |  |  |     [ double-2-rep eq? ] 2dip if ; inline
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : %compare-float-vector ( dst src rep cc -- )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { cc<    [ [ CMPLTPD    ] [ CMPLTPS    ] (%compare-float-vector) ] } | 
					
						
							|  |  |  |         { cc<=   [ [ CMPLEPD    ] [ CMPLEPS    ] (%compare-float-vector) ] } | 
					
						
							|  |  |  |         { cc=    [ [ CMPEQPD    ] [ CMPEQPS    ] (%compare-float-vector) ] } | 
					
						
							|  |  |  |         { cc<>=  [ [ CMPORDPD   ] [ CMPORDPS   ] (%compare-float-vector) ] } | 
					
						
							|  |  |  |         { cc/<   [ [ CMPNLTPD   ] [ CMPNLTPS   ] (%compare-float-vector) ] } | 
					
						
							|  |  |  |         { cc/<=  [ [ CMPNLEPD   ] [ CMPNLEPS   ] (%compare-float-vector) ] } | 
					
						
							|  |  |  |         { cc/=   [ [ CMPNEQPD   ] [ CMPNEQPS   ] (%compare-float-vector) ] } | 
					
						
							|  |  |  |         { cc/<>= [ [ CMPUNORDPD ] [ CMPUNORDPS ] (%compare-float-vector) ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | :: (%compare-int-vector) ( dst src rep int64 int32 int16 int8 -- )
 | 
					
						
							|  |  |  |     rep signed-rep :> rep' | 
					
						
							|  |  |  |     dst src rep' { | 
					
						
							|  |  |  |         { longlong-2-rep [ int64 call ] } | 
					
						
							|  |  |  |         { int-4-rep      [ int32 call ] } | 
					
						
							|  |  |  |         { short-8-rep    [ int16 call ] } | 
					
						
							|  |  |  |         { char-16-rep    [ int8  call ] } | 
					
						
							|  |  |  |     } case ; inline
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : %compare-int-vector ( dst src rep cc -- )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { cc= [ [ PCMPEQQ ] [ PCMPEQD ] [ PCMPEQW ] [ PCMPEQB ] (%compare-int-vector) ] } | 
					
						
							|  |  |  |         { cc> [ [ PCMPGTQ ] [ PCMPGTD ] [ PCMPGTW ] [ PCMPGTB ] (%compare-int-vector) ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %compare-vector ( dst src1 src2 rep cc -- )
 | 
					
						
							|  |  |  |     [ [ two-operand ] keep ] dip
 | 
					
						
							|  |  |  |     over float-vector-rep? | 
					
						
							|  |  |  |     [ %compare-float-vector ] | 
					
						
							|  |  |  |     [ %compare-int-vector ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : %compare-vector-eq-reps ( -- reps )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse? { float-4-rep } } | 
					
						
							|  |  |  |         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } } | 
					
						
							|  |  |  |         { sse4.1? { longlong-2-rep ulonglong-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : %compare-vector-ord-reps ( -- reps )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse? { float-4-rep } } | 
					
						
							|  |  |  |         { sse2? { double-2-rep char-16-rep short-8-rep int-4-rep } } | 
					
						
							|  |  |  |         { sse4.2? { longlong-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %compare-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { [ dup { cc= cc/= cc/<>= cc<>= } member-eq? ] [ drop %compare-vector-eq-reps ] } | 
					
						
							|  |  |  |         [ drop %compare-vector-ord-reps ] | 
					
						
							|  |  |  |     } cond ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : %compare-float-vector-ccs ( cc -- ccs not? )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { cc<    [ { { cc<  f   }              } f ] } | 
					
						
							|  |  |  |         { cc<=   [ { { cc<= f   }              } f ] } | 
					
						
							|  |  |  |         { cc>    [ { { cc<  t   }              } f ] } | 
					
						
							|  |  |  |         { cc>=   [ { { cc<= t   }              } f ] } | 
					
						
							|  |  |  |         { cc=    [ { { cc=  f   }              } f ] } | 
					
						
							|  |  |  |         { cc<>   [ { { cc<  f   } { cc<    t } } f ] } | 
					
						
							|  |  |  |         { cc<>=  [ { { cc<>= f  }              } f ] } | 
					
						
							|  |  |  |         { cc/<   [ { { cc/<  f  }              } f ] } | 
					
						
							|  |  |  |         { cc/<=  [ { { cc/<= f  }              } f ] } | 
					
						
							|  |  |  |         { cc/>   [ { { cc/<  t  }              } f ] } | 
					
						
							|  |  |  |         { cc/>=  [ { { cc/<= t  }              } f ] } | 
					
						
							|  |  |  |         { cc/=   [ { { cc/=  f  }              } f ] } | 
					
						
							|  |  |  |         { cc/<>  [ { { cc/=  f  } { cc/<>= f } } f ] } | 
					
						
							|  |  |  |         { cc/<>= [ { { cc/<>= f }              } f ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : %compare-int-vector-ccs ( cc -- ccs not? )
 | 
					
						
							|  |  |  |     order-cc { | 
					
						
							|  |  |  |         { cc<    [ { { cc> t } } f ] } | 
					
						
							|  |  |  |         { cc<=   [ { { cc> f } } t ] } | 
					
						
							|  |  |  |         { cc>    [ { { cc> f } } f ] } | 
					
						
							|  |  |  |         { cc>=   [ { { cc> t } } t ] } | 
					
						
							|  |  |  |         { cc=    [ { { cc= f } } f ] } | 
					
						
							|  |  |  |         { cc/=   [ { { cc= f } } t ] } | 
					
						
							|  |  |  |         { t      [ {           } t ] } | 
					
						
							|  |  |  |         { f      [ {           } f ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %compare-vector-ccs | 
					
						
							|  |  |  |     swap float-vector-rep? | 
					
						
							|  |  |  |     [ %compare-float-vector-ccs ] | 
					
						
							|  |  |  |     [ %compare-int-vector-ccs ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | :: %test-vector-mask ( dst temp mask vcc -- )
 | 
					
						
							|  |  |  |     vcc { | 
					
						
							|  |  |  |         { vcc-any    [ dst dst TEST dst temp \ CMOVNE (%boolean) ] } | 
					
						
							|  |  |  |         { vcc-none   [ dst dst TEST dst temp \ CMOVE  (%boolean) ] } | 
					
						
							|  |  |  |         { vcc-all    [ dst mask CMP dst temp \ CMOVE  (%boolean) ] } | 
					
						
							|  |  |  |         { vcc-notall [ dst mask CMP dst temp \ CMOVNE (%boolean) ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-11-13 18:45:03 -05:00
										 |  |  | : (%move-vector-mask) ( dst src rep -- mask )
 | 
					
						
							| 
									
										
										
										
											2010-05-17 10:44:49 -04:00
										 |  |  |     { | 
					
						
							| 
									
										
										
										
											2011-11-23 21:49:33 -05:00
										 |  |  |         { double-2-rep [ MOVMSKPS 0xf ] } | 
					
						
							|  |  |  |         { float-4-rep  [ MOVMSKPS 0xf ] } | 
					
						
							|  |  |  |         [ drop PMOVMSKB 0xffff ] | 
					
						
							| 
									
										
										
										
											2010-05-17 10:44:49 -04:00
										 |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-11-12 01:47:54 -05:00
										 |  |  | M: x86 %move-vector-mask ( dst src rep -- )
 | 
					
						
							| 
									
										
										
										
											2011-11-13 18:45:03 -05:00
										 |  |  |     (%move-vector-mask) drop ;
 | 
					
						
							| 
									
										
										
										
											2011-11-12 01:47:54 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | M: x86 %move-vector-mask-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse? { float-4-rep } } | 
					
						
							|  |  |  |         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-05-17 10:44:49 -04:00
										 |  |  | M:: x86 %test-vector ( dst src temp rep vcc -- )
 | 
					
						
							| 
									
										
										
										
											2011-11-13 18:45:03 -05:00
										 |  |  |     dst src rep (%move-vector-mask) :> mask | 
					
						
							| 
									
										
										
										
											2010-05-17 10:44:49 -04:00
										 |  |  |     dst temp mask vcc %test-vector-mask ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | :: %test-vector-mask-branch ( label temp mask vcc -- )
 | 
					
						
							|  |  |  |     vcc { | 
					
						
							|  |  |  |         { vcc-any    [ temp temp TEST label JNE ] } | 
					
						
							|  |  |  |         { vcc-none   [ temp temp TEST label JE ] } | 
					
						
							|  |  |  |         { vcc-all    [ temp mask CMP label JE ] } | 
					
						
							|  |  |  |         { vcc-notall [ temp mask CMP label JNE ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M:: x86 %test-vector-branch ( label src temp rep vcc -- )
 | 
					
						
							| 
									
										
										
										
											2011-11-13 18:45:03 -05:00
										 |  |  |     temp src rep (%move-vector-mask) :> mask | 
					
						
							| 
									
										
										
										
											2010-05-17 10:44:49 -04:00
										 |  |  |     label temp mask vcc %test-vector-mask-branch ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %test-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse? { float-4-rep } } | 
					
						
							|  |  |  |         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %add-vector ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { float-4-rep [ ADDPS ] } | 
					
						
							|  |  |  |         { double-2-rep [ ADDPD ] } | 
					
						
							|  |  |  |         { char-16-rep [ PADDB ] } | 
					
						
							|  |  |  |         { uchar-16-rep [ PADDB ] } | 
					
						
							|  |  |  |         { short-8-rep [ PADDW ] } | 
					
						
							|  |  |  |         { ushort-8-rep [ PADDW ] } | 
					
						
							|  |  |  |         { int-4-rep [ PADDD ] } | 
					
						
							|  |  |  |         { uint-4-rep [ PADDD ] } | 
					
						
							|  |  |  |         { longlong-2-rep [ PADDQ ] } | 
					
						
							|  |  |  |         { ulonglong-2-rep [ PADDQ ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %add-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse? { float-4-rep } } | 
					
						
							|  |  |  |         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %saturated-add-vector ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { char-16-rep [ PADDSB ] } | 
					
						
							|  |  |  |         { uchar-16-rep [ PADDUSB ] } | 
					
						
							|  |  |  |         { short-8-rep [ PADDSW ] } | 
					
						
							|  |  |  |         { ushort-8-rep [ PADDUSW ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %saturated-add-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %add-sub-vector ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { float-4-rep [ ADDSUBPS ] } | 
					
						
							|  |  |  |         { double-2-rep [ ADDSUBPD ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %add-sub-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse3? { float-4-rep double-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %sub-vector ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { float-4-rep [ SUBPS ] } | 
					
						
							|  |  |  |         { double-2-rep [ SUBPD ] } | 
					
						
							|  |  |  |         { char-16-rep [ PSUBB ] } | 
					
						
							|  |  |  |         { uchar-16-rep [ PSUBB ] } | 
					
						
							|  |  |  |         { short-8-rep [ PSUBW ] } | 
					
						
							|  |  |  |         { ushort-8-rep [ PSUBW ] } | 
					
						
							|  |  |  |         { int-4-rep [ PSUBD ] } | 
					
						
							|  |  |  |         { uint-4-rep [ PSUBD ] } | 
					
						
							|  |  |  |         { longlong-2-rep [ PSUBQ ] } | 
					
						
							|  |  |  |         { ulonglong-2-rep [ PSUBQ ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %sub-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse? { float-4-rep } } | 
					
						
							|  |  |  |         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %saturated-sub-vector ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { char-16-rep [ PSUBSB ] } | 
					
						
							|  |  |  |         { uchar-16-rep [ PSUBUSB ] } | 
					
						
							|  |  |  |         { short-8-rep [ PSUBSW ] } | 
					
						
							|  |  |  |         { ushort-8-rep [ PSUBUSW ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %saturated-sub-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %mul-vector ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { float-4-rep [ MULPS ] } | 
					
						
							|  |  |  |         { double-2-rep [ MULPD ] } | 
					
						
							|  |  |  |         { short-8-rep [ PMULLW ] } | 
					
						
							|  |  |  |         { ushort-8-rep [ PMULLW ] } | 
					
						
							|  |  |  |         { int-4-rep [ PMULLD ] } | 
					
						
							|  |  |  |         { uint-4-rep [ PMULLD ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %mul-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse? { float-4-rep } } | 
					
						
							|  |  |  |         { sse2? { double-2-rep short-8-rep ushort-8-rep } } | 
					
						
							|  |  |  |         { sse4.1? { int-4-rep uint-4-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %mul-high-vector ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { short-8-rep  [ PMULHW ] } | 
					
						
							|  |  |  |         { ushort-8-rep [ PMULHUW ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %mul-high-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse2? { short-8-rep ushort-8-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %mul-horizontal-add-vector ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { char-16-rep  [ PMADDUBSW ] } | 
					
						
							|  |  |  |         { uchar-16-rep [ PMADDUBSW ] } | 
					
						
							|  |  |  |         { short-8-rep  [ PMADDWD ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %mul-horizontal-add-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse2?  { short-8-rep } } | 
					
						
							|  |  |  |         { ssse3? { char-16-rep uchar-16-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %div-vector ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { float-4-rep [ DIVPS ] } | 
					
						
							|  |  |  |         { double-2-rep [ DIVPD ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %div-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse? { float-4-rep } } | 
					
						
							|  |  |  |         { sse2? { double-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %min-vector ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { char-16-rep [ PMINSB ] } | 
					
						
							|  |  |  |         { uchar-16-rep [ PMINUB ] } | 
					
						
							|  |  |  |         { short-8-rep [ PMINSW ] } | 
					
						
							|  |  |  |         { ushort-8-rep [ PMINUW ] } | 
					
						
							|  |  |  |         { int-4-rep [ PMINSD ] } | 
					
						
							|  |  |  |         { uint-4-rep [ PMINUD ] } | 
					
						
							|  |  |  |         { float-4-rep [ MINPS ] } | 
					
						
							|  |  |  |         { double-2-rep [ MINPD ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %min-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse? { float-4-rep } } | 
					
						
							|  |  |  |         { sse2? { uchar-16-rep short-8-rep double-2-rep } } | 
					
						
							|  |  |  |         { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %max-vector ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { char-16-rep [ PMAXSB ] } | 
					
						
							|  |  |  |         { uchar-16-rep [ PMAXUB ] } | 
					
						
							|  |  |  |         { short-8-rep [ PMAXSW ] } | 
					
						
							|  |  |  |         { ushort-8-rep [ PMAXUW ] } | 
					
						
							|  |  |  |         { int-4-rep [ PMAXSD ] } | 
					
						
							|  |  |  |         { uint-4-rep [ PMAXUD ] } | 
					
						
							|  |  |  |         { float-4-rep [ MAXPS ] } | 
					
						
							|  |  |  |         { double-2-rep [ MAXPD ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %max-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse? { float-4-rep } } | 
					
						
							|  |  |  |         { sse2? { uchar-16-rep short-8-rep double-2-rep } } | 
					
						
							|  |  |  |         { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %avg-vector ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { uchar-16-rep [ PAVGB ] } | 
					
						
							|  |  |  |         { ushort-8-rep [ PAVGW ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %avg-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse2? { uchar-16-rep ushort-8-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %dot-vector | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     { | 
					
						
							| 
									
										
										
										
											2011-11-23 21:49:33 -05:00
										 |  |  |         { float-4-rep [ 0xff DPPS ] } | 
					
						
							|  |  |  |         { double-2-rep [ 0xff DPPD ] } | 
					
						
							| 
									
										
										
										
											2010-05-17 10:44:49 -04:00
										 |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %dot-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse4.1? { float-4-rep double-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %sad-vector | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { uchar-16-rep [ PSADBW ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %sad-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse2? { uchar-16-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %horizontal-add-vector ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     signed-rep { | 
					
						
							|  |  |  |         { float-4-rep  [ HADDPS ] } | 
					
						
							|  |  |  |         { double-2-rep [ HADDPD ] } | 
					
						
							|  |  |  |         { int-4-rep    [ PHADDD ] } | 
					
						
							|  |  |  |         { short-8-rep  [ PHADDW ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %horizontal-add-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse3? { float-4-rep double-2-rep } } | 
					
						
							|  |  |  |         { ssse3? { int-4-rep uint-4-rep short-8-rep ushort-8-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %horizontal-shl-vector-imm ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     two-operand PSLLDQ ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %horizontal-shl-vector-imm-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep float-4-rep double-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %horizontal-shr-vector-imm ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     two-operand PSRLDQ ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %horizontal-shr-vector-imm-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep float-4-rep double-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %abs-vector ( dst src rep -- )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { char-16-rep [ PABSB ] } | 
					
						
							|  |  |  |         { short-8-rep [ PABSW ] } | 
					
						
							|  |  |  |         { int-4-rep [ PABSD ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %abs-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { ssse3? { char-16-rep short-8-rep int-4-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %sqrt-vector ( dst src rep -- )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { float-4-rep [ SQRTPS ] } | 
					
						
							|  |  |  |         { double-2-rep [ SQRTPD ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %sqrt-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse? { float-4-rep } } | 
					
						
							|  |  |  |         { sse2? { double-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %and-vector ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { float-4-rep [ ANDPS ] } | 
					
						
							|  |  |  |         { double-2-rep [ ANDPS ] } | 
					
						
							|  |  |  |         [ drop PAND ] | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %and-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse? { float-4-rep } } | 
					
						
							|  |  |  |         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %andn-vector ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { float-4-rep [ ANDNPS ] } | 
					
						
							|  |  |  |         { double-2-rep [ ANDNPS ] } | 
					
						
							|  |  |  |         [ drop PANDN ] | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %andn-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse? { float-4-rep } } | 
					
						
							|  |  |  |         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %or-vector ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { float-4-rep [ ORPS ] } | 
					
						
							|  |  |  |         { double-2-rep [ ORPS ] } | 
					
						
							|  |  |  |         [ drop POR ] | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %or-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse? { float-4-rep } } | 
					
						
							|  |  |  |         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %xor-vector ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { float-4-rep [ XORPS ] } | 
					
						
							|  |  |  |         { double-2-rep [ XORPS ] } | 
					
						
							|  |  |  |         [ drop PXOR ] | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %xor-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse? { float-4-rep } } | 
					
						
							|  |  |  |         { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %shl-vector ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { short-8-rep [ PSLLW ] } | 
					
						
							|  |  |  |         { ushort-8-rep [ PSLLW ] } | 
					
						
							|  |  |  |         { int-4-rep [ PSLLD ] } | 
					
						
							|  |  |  |         { uint-4-rep [ PSLLD ] } | 
					
						
							|  |  |  |         { longlong-2-rep [ PSLLQ ] } | 
					
						
							|  |  |  |         { ulonglong-2-rep [ PSLLQ ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %shl-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %shr-vector ( dst src1 src2 rep -- )
 | 
					
						
							|  |  |  |     [ two-operand ] keep
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { short-8-rep [ PSRAW ] } | 
					
						
							|  |  |  |         { ushort-8-rep [ PSRLW ] } | 
					
						
							|  |  |  |         { int-4-rep [ PSRAD ] } | 
					
						
							|  |  |  |         { uint-4-rep [ PSRLD ] } | 
					
						
							|  |  |  |         { ulonglong-2-rep [ PSRLQ ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %shr-vector-reps | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep ulonglong-2-rep } } | 
					
						
							|  |  |  |     } available-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %shl-vector-imm %shl-vector ;
 | 
					
						
							|  |  |  | M: x86 %shl-vector-imm-reps %shl-vector-reps ;
 | 
					
						
							|  |  |  | M: x86 %shr-vector-imm %shr-vector ;
 | 
					
						
							|  |  |  | M: x86 %shr-vector-imm-reps %shr-vector-reps ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %integer>scalar drop MOVD ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | :: %scalar>integer-32 ( dst src rep -- )
 | 
					
						
							|  |  |  |     rep { | 
					
						
							|  |  |  |         { int-scalar-rep [ | 
					
						
							|  |  |  |             dst 32-bit-version-of src MOVD | 
					
						
							|  |  |  |             dst dst 32-bit-version-of | 
					
						
							|  |  |  |             2dup eq? [ 2drop ] [ MOVSX ] if
 | 
					
						
							|  |  |  |         ] } | 
					
						
							|  |  |  |         { uint-scalar-rep [ | 
					
						
							|  |  |  |             dst 32-bit-version-of src MOVD | 
					
						
							|  |  |  |         ] } | 
					
						
							|  |  |  |         { short-scalar-rep [ | 
					
						
							|  |  |  |             dst 32-bit-version-of src MOVD | 
					
						
							|  |  |  |             dst dst 16-bit-version-of MOVSX | 
					
						
							|  |  |  |         ] } | 
					
						
							|  |  |  |         { ushort-scalar-rep [ | 
					
						
							|  |  |  |             dst 32-bit-version-of src MOVD | 
					
						
							|  |  |  |             dst dst 16-bit-version-of MOVZX | 
					
						
							|  |  |  |         ] } | 
					
						
							|  |  |  |         { char-scalar-rep [ | 
					
						
							|  |  |  |             dst 32-bit-version-of src MOVD | 
					
						
							|  |  |  |             dst { } 8 [| tmp-dst | | 
					
						
							|  |  |  |                 tmp-dst dst int-rep %copy | 
					
						
							|  |  |  |                 tmp-dst tmp-dst 8-bit-version-of MOVSX | 
					
						
							|  |  |  |                 dst tmp-dst int-rep %copy | 
					
						
							|  |  |  |             ] with-small-register | 
					
						
							|  |  |  |         ] } | 
					
						
							|  |  |  |         { uchar-scalar-rep [ | 
					
						
							|  |  |  |             dst 32-bit-version-of src MOVD | 
					
						
							|  |  |  |             dst { } 8 [| tmp-dst | | 
					
						
							|  |  |  |                 tmp-dst dst int-rep %copy | 
					
						
							|  |  |  |                 tmp-dst tmp-dst 8-bit-version-of MOVZX | 
					
						
							|  |  |  |                 dst tmp-dst int-rep %copy | 
					
						
							|  |  |  |             ] with-small-register | 
					
						
							|  |  |  |         ] } | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86.32 %scalar>integer ( dst src rep -- ) %scalar>integer-32 ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86.64 %scalar>integer ( dst src rep -- )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { longlong-scalar-rep  [ MOVD ] } | 
					
						
							|  |  |  |         { ulonglong-scalar-rep [ MOVD ] } | 
					
						
							|  |  |  |         [ %scalar>integer-32 ] | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %vector>scalar %copy ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: x86 %scalar>vector %copy ;
 |