diff --git a/basis/compiler/cfg/build-stack-frame/build-stack-frame.factor b/basis/compiler/cfg/build-stack-frame/build-stack-frame.factor index 8b17db756d..a8a0fceed7 100644 --- a/basis/compiler/cfg/build-stack-frame/build-stack-frame.factor +++ b/basis/compiler/cfg/build-stack-frame/build-stack-frame.factor @@ -38,10 +38,10 @@ M: ##spill compute-stack-frame* drop frame-required ; M: ##reload compute-stack-frame* drop frame-required ; M: ##float>integer compute-stack-frame* - drop cpu ppc? [ frame-required ] when ; + drop integer-float-needs-stack-frame? [ frame-required ] when ; M: ##integer>float compute-stack-frame* - drop cpu ppc? [ frame-required ] when ; + drop integer-float-needs-stack-frame? [ frame-required ] when ; M: insn compute-stack-frame* drop ; diff --git a/basis/compiler/cfg/linear-scan/linear-scan.factor b/basis/compiler/cfg/linear-scan/linear-scan.factor index 53a7dd8e76..5f1abd3165 100644 --- a/basis/compiler/cfg/linear-scan/linear-scan.factor +++ b/basis/compiler/cfg/linear-scan/linear-scan.factor @@ -40,9 +40,9 @@ IN: compiler.cfg.linear-scan : admissible-registers ( cfg -- regs ) [ machine-registers ] dip frame-pointer?>> [ - [ int-regs ] dip clone + [ int-regs ] dip [ clone ] map [ [ [ frame-reg ] dip remove ] change-at ] keep - ] unless ; + ] when ; : linear-scan ( cfg -- cfg' ) dup dup admissible-registers (linear-scan) ; diff --git a/basis/compiler/cfg/ssa/destruction/destruction.factor b/basis/compiler/cfg/ssa/destruction/destruction.factor index b4cca42ad6..1bb19bd8b0 100644 --- a/basis/compiler/cfg/ssa/destruction/destruction.factor +++ b/basis/compiler/cfg/ssa/destruction/destruction.factor @@ -1,6 +1,6 @@ ! Copyright (C) 2009, 2010 Slava Pestov. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors arrays assocs fry kernel namespaces +USING: accessors arrays assocs fry locals kernel namespaces sequences sequences.deep sets vectors cpu.architecture @@ -46,35 +46,39 @@ SYMBOL: class-element-map ! Sequence of vreg pairs SYMBOL: copies +: value-of ( vreg -- value ) + insn-of dup ##tagged>integer? [ src>> ] [ dst>> ] if ; + : init-coalescing ( -- ) - defs get keys - [ [ dup ] H{ } map>assoc leader-map set ] - [ [ dup 1vector ] H{ } map>assoc class-element-map set ] bi + defs get + [ [ drop dup ] assoc-map leader-map set ] + [ [ [ dup dup value-of ] dip 1array ] assoc-map class-element-map set ] bi V{ } clone copies set ; -: classes-interfere? ( vreg1 vreg2 -- ? ) - [ leader ] bi@ 2dup eq? [ 2drop f ] [ - [ class-elements flatten ] bi@ sets-interfere? - ] if ; - -: update-leaders ( vreg1 vreg2 -- ) +: coalesce-leaders ( vreg1 vreg2 -- ) + ! leader2 becomes the leader. swap leader-map get set-at ; -: merge-classes ( vreg1 vreg2 -- ) - [ [ class-elements ] bi@ push ] - [ drop class-element-map get delete-at ] 2bi ; +: coalesce-elements ( merged vreg1 vreg2 -- ) + ! delete leader1's class, and set leader2's class to merged. + class-element-map get [ delete-at ] [ set-at ] bi-curry bi* ; -: eliminate-copy ( vreg1 vreg2 -- ) - [ leader ] bi@ - 2dup eq? [ 2drop ] [ - [ update-leaders ] - [ merge-classes ] - 2bi - ] if ; +: coalesce-vregs ( merged leader1 leader2 -- ) + [ coalesce-leaders ] [ coalesce-elements ] 2bi ; + +:: maybe-eliminate-copy ( vreg1 vreg2 -- ) + ! Eliminate a copy of possible. + vreg1 leader :> vreg1 + vreg2 leader :> vreg2 + vreg1 vreg2 eq? [ + vreg1 class-elements vreg2 class-elements sets-interfere? + [ drop ] [ vreg1 vreg2 coalesce-vregs ] if + ] unless ; GENERIC: prepare-insn ( insn -- ) -: try-to-coalesce ( dst src -- ) 2array copies get push ; +: maybe-eliminate-copy-later ( dst src -- ) + 2array copies get push ; M: insn prepare-insn drop ; @@ -85,19 +89,19 @@ M: vreg-insn prepare-insn 2dup empty? not and [ first 2dup [ rep-of reg-class-of ] bi@ eq? - [ try-to-coalesce ] [ 2drop ] if + [ maybe-eliminate-copy-later ] [ 2drop ] if ] [ 2drop ] if ] bi ; M: ##copy prepare-insn - [ dst>> ] [ src>> ] bi try-to-coalesce ; + [ dst>> ] [ src>> ] bi maybe-eliminate-copy-later ; M: ##tagged>integer prepare-insn - [ dst>> ] [ src>> ] bi eliminate-copy ; + [ dst>> ] [ src>> ] bi maybe-eliminate-copy ; M: ##phi prepare-insn [ dst>> ] [ inputs>> values ] bi - [ eliminate-copy ] with each ; + [ maybe-eliminate-copy ] with each ; : prepare-block ( bb -- ) instructions>> [ prepare-insn ] each ; @@ -107,10 +111,7 @@ M: ##phi prepare-insn [ prepare-block ] each-basic-block ; : process-copies ( -- ) - copies get [ - 2dup classes-interfere? - [ 2drop ] [ eliminate-copy ] if - ] assoc-each ; + copies get [ maybe-eliminate-copy ] assoc-each ; GENERIC: useful-insn? ( insn -- ? ) @@ -135,6 +136,7 @@ PRIVATE> dup construct-cssa dup compute-defs + dup compute-insns dup compute-ssa-live-sets dup compute-live-ranges dup prepare-coalescing diff --git a/basis/compiler/cfg/ssa/interference/interference-tests.factor b/basis/compiler/cfg/ssa/interference/interference-tests.factor index c48ae4ad58..4e3da1c6dc 100644 --- a/basis/compiler/cfg/ssa/interference/interference-tests.factor +++ b/basis/compiler/cfg/ssa/interference/interference-tests.factor @@ -2,17 +2,35 @@ USING: accessors compiler.cfg compiler.cfg.debugger compiler.cfg.def-use compiler.cfg.dominance compiler.cfg.instructions compiler.cfg.liveness.ssa compiler.cfg.registers compiler.cfg.predecessors -compiler.cfg.ssa.interference -compiler.cfg.ssa.interference.live-ranges cpu.architecture -kernel namespaces tools.test ; +compiler.cfg.comparisons compiler.cfg.ssa.interference +compiler.cfg.ssa.interference.private +compiler.cfg.ssa.interference.live-ranges +cpu.architecture kernel namespaces tools.test alien.c-types +arrays sequences slots ; IN: compiler.cfg.ssa.interference.tests : test-interference ( -- ) cfg new 0 get >>entry dup compute-ssa-live-sets dup compute-defs + dup compute-insns compute-live-ranges ; +: ( vreg -- info ) + [ ] [ insn-of dup ##tagged>integer? [ src>> ] [ dst>> ] if ] [ def-of ] tri + ; + +: test-vregs-intersect? ( vreg1 vreg2 -- ? ) + [ ] bi@ vregs-intersect? ; + +: test-vregs-interfere? ( vreg1 vreg2 -- ? ) + [ ] bi@ + [ blue >>color ] [ red >>color ] bi* + vregs-interfere? ; + +: test-sets-interfere? ( seq1 seq2 -- merged ? ) + [ [ ] map ] bi@ sets-interfere? ; + V{ T{ ##peek f 0 D 0 } T{ ##peek f 2 D 0 } @@ -34,17 +52,310 @@ V{ [ ] [ test-interference ] unit-test -[ f ] [ 0 1 vregs-interfere? ] unit-test -[ f ] [ 1 0 vregs-interfere? ] unit-test -[ f ] [ 2 3 vregs-interfere? ] unit-test -[ f ] [ 3 2 vregs-interfere? ] unit-test -[ t ] [ 0 2 vregs-interfere? ] unit-test -[ t ] [ 2 0 vregs-interfere? ] unit-test -[ f ] [ 1 3 vregs-interfere? ] unit-test -[ f ] [ 3 1 vregs-interfere? ] unit-test -[ t ] [ 3 4 vregs-interfere? ] unit-test -[ t ] [ 4 3 vregs-interfere? ] unit-test -[ t ] [ 3 5 vregs-interfere? ] unit-test -[ t ] [ 5 3 vregs-interfere? ] unit-test -[ f ] [ 3 6 vregs-interfere? ] unit-test -[ f ] [ 6 3 vregs-interfere? ] unit-test \ No newline at end of file +[ f ] [ 0 1 test-vregs-intersect? ] unit-test +[ f ] [ 1 0 test-vregs-intersect? ] unit-test +[ f ] [ 2 3 test-vregs-intersect? ] unit-test +[ f ] [ 3 2 test-vregs-intersect? ] unit-test +[ t ] [ 0 2 test-vregs-intersect? ] unit-test +[ t ] [ 2 0 test-vregs-intersect? ] unit-test +[ f ] [ 1 3 test-vregs-intersect? ] unit-test +[ f ] [ 3 1 test-vregs-intersect? ] unit-test +[ t ] [ 3 4 test-vregs-intersect? ] unit-test +[ t ] [ 4 3 test-vregs-intersect? ] unit-test +[ t ] [ 3 5 test-vregs-intersect? ] unit-test +[ t ] [ 5 3 test-vregs-intersect? ] unit-test +[ f ] [ 3 6 test-vregs-intersect? ] unit-test +[ f ] [ 6 3 test-vregs-intersect? ] unit-test + +V{ + T{ ##prologue } + T{ ##branch } +} 0 test-bb + + +V{ + T{ ##inc-d f -3 } + T{ ##peek f 12 D -2 } + T{ ##peek f 23 D -1 } + T{ ##sar-imm f 13 23 4 } + T{ ##peek f 24 D -3 } + T{ ##sar-imm f 14 24 4 } + T{ ##mul f 15 13 13 } + T{ ##mul f 16 15 15 } + T{ ##tagged>integer f 17 12 } + T{ ##store-memory f 16 17 14 0 7 int-rep uchar } + T{ ##branch } +} 1 test-bb + +V{ + T{ ##epilogue } + T{ ##return } +} 2 test-bb + +0 1 edge +1 2 edge + +[ ] [ test-interference ] unit-test + +[ t ] [ { 15 } { 23 13 } test-sets-interfere? nip ] unit-test + +V{ + T{ ##prologue f } + T{ ##branch f } +} 0 test-bb + +V{ + T{ ##inc-d f 2 } + T{ ##peek f 32 D 2 } + T{ ##load-reference f 33 ##check-nursery-branch } + T{ ##load-integer f 34 11 } + T{ ##tagged>integer f 35 32 } + T{ ##and-imm f 36 35 15 } + T{ ##compare-integer-imm-branch f 36 7 cc= } +} 1 test-bb + +V{ + T{ ##slot-imm f 48 32 1 7 } + T{ ##slot-imm f 50 48 1 2 } + T{ ##sar-imm f 65 50 4 } + T{ ##compare-integer-branch f 34 65 cc<= } +} 2 test-bb + +V{ + T{ ##inc-d f -2 } + T{ ##slot-imm f 57 48 11 2 } + T{ ##compare f 58 33 57 cc= 20 } + T{ ##replace f 58 D 0 } + T{ ##branch f } +} 3 test-bb + +V{ + T{ ##epilogue f } + T{ ##return f } +} 4 test-bb + +V{ + T{ ##inc-d f -2 } + T{ ##replace-imm f f D 0 } + T{ ##branch f } +} 5 test-bb + +V{ + T{ ##epilogue f } + T{ ##return f } +} 6 test-bb + +V{ + T{ ##inc-d f -2 } + T{ ##replace-imm f f D 0 } + T{ ##branch f } +} 7 test-bb + +V{ + T{ ##epilogue f } + T{ ##return f } +} 8 test-bb + +0 1 edge +1 { 2 7 } edges +2 { 3 5 } edges +3 4 edge +5 6 edge +7 8 edge + +[ ] [ test-interference ] unit-test + +[ f ] [ { 48 } { 32 35 } test-sets-interfere? nip ] unit-test + +TUPLE: bab ; +TUPLE: gfg { x bab } ; +: bah ( -- x ) f ; + +V{ + T{ ##prologue } + T{ ##branch } +} 0 test-bb + +V{ + T{ ##check-nursery-branch f 16 cc<= 75 76 } +} 1 test-bb + +V{ + T{ ##save-context f 77 78 } + T{ ##call-gc f { } } + T{ ##branch } +} 2 test-bb + +V{ + T{ ##inc-d f 1 } + T{ ##load-reference f 37 T{ bab } } + T{ ##load-reference f 38 { gfg 1 1 tuple 57438726 gfg 7785907 } } + T{ ##allot f 40 12 tuple 4 } + T{ ##set-slot-imm f 38 40 1 7 } + T{ ##set-slot-imm f 37 40 2 7 } + T{ ##replace f 40 D 0 } + T{ ##branch } +} 3 test-bb + +V{ + T{ ##call f bah } + T{ ##branch } +} 4 test-bb + +V{ + T{ ##inc-r f 1 } + T{ ##inc-d f 1 } + T{ ##peek f 43 D 1 } + T{ ##peek f 44 D 2 } + T{ ##tagged>integer f 45 43 } + T{ ##and-imm f 46 45 15 } + T{ ##compare-integer-imm-branch f 46 7 cc= } +} 5 test-bb + +V{ + T{ ##inc-d f -1 } + T{ ##slot-imm f 58 43 1 7 } + T{ ##slot-imm f 60 58 7 2 } + T{ ##compare-imm-branch f 60 bab cc= } +} 6 test-bb + +V{ + T{ ##branch } +} 7 test-bb + +V{ + T{ ##inc-r f -1 } + T{ ##inc-d f -1 } + T{ ##set-slot-imm f 43 44 2 7 } + T{ ##write-barrier-imm f 44 2 7 34 35 } + T{ ##branch } +} 8 test-bb + +V{ + T{ ##epilogue } + T{ ##return } +} 9 test-bb + +V{ + T{ ##inc-d f 1 } + T{ ##replace f 44 R 0 } + T{ ##replace-imm f bab D 0 } + T{ ##branch } +} 10 test-bb + +V{ + T{ ##call f bad-slot-value } + T{ ##branch } +} 11 test-bb + +V{ + T{ ##no-tco } +} 12 test-bb + +V{ + T{ ##inc-d f -1 } + T{ ##branch } +} 13 test-bb + +V{ + T{ ##inc-d f 1 } + T{ ##replace f 44 R 0 } + T{ ##replace-imm f bab D 0 } + T{ ##branch } +} 14 test-bb + +V{ + T{ ##call f bad-slot-value } + T{ ##branch } +} 15 test-bb + +V{ + T{ ##no-tco } +} 16 test-bb + +0 1 edge +1 { 3 2 } edges +2 3 edge +3 4 edge +4 5 edge +5 { 6 13 } edges +6 { 7 10 } edges +7 8 edge +8 9 edge +10 11 edge +11 12 edge +13 14 edge +14 15 edge +15 16 edge + +[ ] [ test-interference ] unit-test + +[ t ] [ 43 45 test-vregs-intersect? ] unit-test +[ f ] [ 43 45 test-vregs-interfere? ] unit-test + +[ t ] [ 43 46 test-vregs-intersect? ] unit-test +[ t ] [ 43 46 test-vregs-interfere? ] unit-test + +[ f ] [ 45 46 test-vregs-intersect? ] unit-test +[ f ] [ 45 46 test-vregs-interfere? ] unit-test + +[ f ] [ { 43 } { 45 } test-sets-interfere? nip ] unit-test + +[ t f ] [ + { 46 } { 43 } { 45 } + [ [ ] map ] tri@ + sets-interfere? [ sets-interfere? nip ] dip +] unit-test + +V{ + T{ ##prologue f } + T{ ##branch f } +} 0 test-bb + +V{ + T{ ##inc-d f 1 } + T{ ##peek f 31 D 1 } + T{ ##sar-imm f 16 31 4 } + T{ ##load-integer f 17 0 } + T{ ##copy f 33 17 int-rep } + T{ ##branch f } +} 1 test-bb + +V{ + T{ ##phi f 21 H{ { 1 33 } { 3 32 } } } + T{ ##compare-integer-branch f 21 16 cc< } +} 2 test-bb + +V{ + T{ ##add-imm f 27 21 1 } + T{ ##copy f 32 27 int-rep } + T{ ##branch f } +} 3 test-bb + +V{ + T{ ##inc-d f -2 } + T{ ##branch f } +} 4 test-bb + +V{ + T{ ##epilogue f } + T{ ##return f } +} 5 test-bb + +0 1 edge +1 2 edge +2 { 3 4 } edges +3 2 edge +4 5 edge + +[ ] [ test-interference ] unit-test + +[ f f ] [ + { 33 } { 21 } { 32 } + [ [ ] map ] tri@ + sets-interfere? [ sets-interfere? nip ] dip +] unit-test + +[ f ] [ 33 21 test-vregs-intersect? ] unit-test +[ f ] [ 32 21 test-vregs-intersect? ] unit-test +[ f ] [ 32 33 test-vregs-intersect? ] unit-test \ No newline at end of file diff --git a/basis/compiler/cfg/ssa/interference/interference.factor b/basis/compiler/cfg/ssa/interference/interference.factor index a76b55cd83..0beb9ef010 100644 --- a/basis/compiler/cfg/ssa/interference/interference.factor +++ b/basis/compiler/cfg/ssa/interference/interference.factor @@ -1,92 +1,175 @@ -! Copyright (C) 2009 Slava Pestov. +! Copyright (C) 2009, 2010 Slava Pestov. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors assocs combinators combinators.short-circuit fry -kernel math math.order sorting namespaces sequences locals -compiler.cfg.def-use compiler.cfg.dominance -compiler.cfg.ssa.interference.live-ranges ; +USING: accessors arrays assocs combinators +combinators.short-circuit fry kernel math math.order sorting +sorting.slots namespaces sequences locals compiler.cfg.def-use +compiler.cfg.dominance compiler.cfg.ssa.interference.live-ranges ; IN: compiler.cfg.ssa.interference -! Interference testing using SSA properties. Actually the only SSA property -! used here is that definitions dominate uses; because of this, the input -! is allowed to have multiple definitions of each vreg as long as they're -! all in the same basic block. This is needed because two-operand conversion -! runs before coalescing, which uses SSA interference testing. +! Interference testing using SSA properties. +! +! Based on: +! +! Revisiting Out-of-SSA Translation for Correctness, Code Quality, and Efficiency +! http://hal.archives-ouvertes.fr/docs/00/34/99/25/PDF/OutSSA-RR.pdf + +TUPLE: vreg-info vreg value def-index bb pre-of color equal-anc-in equal-anc-out ; + +:: ( vreg value bb -- info ) + vreg-info new + vreg >>vreg + bb >>bb + value >>value + bb pre-of >>pre-of + vreg bb def-index >>def-index ; + > ] bi@ < ; + +:: vreg-dominates? ( vreg1 vreg2 -- ? ) + vreg1 bb>> :> bb1 + vreg2 bb>> :> bb2 + bb1 bb2 eq? + [ vreg1 vreg2 locally-dominates? ] [ bb1 bb2 dominates? ] if ; + +! Testing individual vregs for live range intersection. +: kill-after-def? ( vreg1 vreg2 bb -- ? ) ! If first register is used after second one is defined, they interfere. ! If they are used in the same instruction, no interference. If the ! instruction is a def-is-use-insn, then there will be a use at +1 ! (instructions are 2 apart) and so outputs will interfere with ! inputs. - vreg1 bb kill-index - vreg2 bb def-index > ; + [ kill-index ] [ def-index ] bi-curry bi* > ; -:: interferes-same-block? ( vreg1 vreg2 bb1 bb2 -- ? ) - ! If both are defined in the same basic block, they interfere if their - ! local live ranges intersect. - vreg1 bb1 def-index - vreg2 bb1 def-index < - [ vreg1 vreg2 ] [ vreg2 vreg1 ] if - bb1 kill-after-def? ; - -: interferes-first-dominates? ( vreg1 vreg2 bb1 bb2 -- ? ) +: interferes-first-dominates? ( vreg1 vreg2 -- ? ) ! If vreg1 dominates vreg2, then they interfere if vreg2's definition ! occurs before vreg1 is killed. - nip - kill-after-def? ; + [ [ vreg>> ] bi@ ] [ nip bb>> ] 2bi kill-after-def? ; -: interferes-second-dominates? ( vreg1 vreg2 bb1 bb2 -- ? ) +: interferes-second-dominates? ( vreg1 vreg2 -- ? ) ! If vreg2 dominates vreg1, then they interfere if vreg1's definition ! occurs before vreg2 is killed. - drop - swapd kill-after-def? ; + swap interferes-first-dominates? ; -PRIVATE> +: interferes-same-block? ( vreg1 vreg2 -- ? ) + ! If both are defined in the same basic block, they interfere if their + ! local live ranges intersect. + 2dup locally-dominates? [ swap ] unless + interferes-first-dominates? ; -: vregs-interfere? ( vreg1 vreg2 -- ? ) - 2dup [ def-of ] bi@ { - { [ 2dup eq? ] [ interferes-same-block? ] } - { [ 2dup dominates? ] [ interferes-first-dominates? ] } - { [ 2dup swap dominates? ] [ interferes-second-dominates? ] } - [ 2drop 2drop f ] +:: vregs-intersect? ( vreg1 vreg2 -- ? ) + vreg1 bb>> :> bb1 + vreg2 bb>> :> bb2 + { + { [ bb1 bb2 eq? ] [ vreg1 vreg2 interferes-same-block? ] } + { [ bb1 bb2 dominates? ] [ vreg1 vreg2 interferes-first-dominates? ] } + { [ bb2 bb1 dominates? ] [ vreg1 vreg2 interferes-second-dominates? ] } + [ f ] } cond ; -> ] + while nip ; -! Debug this stuff later +: update-equal-anc-out ( vreg1 vreg2 -- ) + dupd chain-intersect >>equal-anc-out drop ; -: quadratic-test? ( seq1 seq2 -- ? ) [ length ] bi@ + 10 < ; +: same-sets? ( vreg1 vreg2 -- ? ) + [ color>> ] bi@ eq? ; -: quadratic-test ( seq1 seq2 -- ? ) - '[ _ [ vregs-interfere? ] with any? ] any? ; +: same-values? ( vreg1 vreg2 -- ? ) + [ value>> ] bi@ eq? ; -: sort-vregs-by-bb ( vregs -- alist ) - defs get - '[ dup _ at ] { } map>assoc - [ second pre-of ] sort-with ; +: vregs-interfere? ( vreg1 vreg2 -- ? ) + [ f >>equal-anc-out ] dip -: ?last ( seq -- elt/f ) [ f ] [ last ] if-empty ; inline + 2dup same-sets? [ equal-anc-out>> ] when -: find-parent ( dom current -- parent ) + 2dup same-values? + [ update-equal-anc-out f ] [ chain-intersect >boolean ] if ; + +! Merging lists of vregs sorted by dominance. +M: vreg-info <=> ( vreg1 vreg2 -- <=> ) + { { pre-of>> <=> } { def-index>> <=> } } compare-slots ; + +SYMBOLS: blue red ; + +TUPLE: iterator seq n ; +: ( seq -- iterator ) 0 iterator boa ; inline +: done? ( iterator -- ? ) [ seq>> length ] [ n>> ] bi = ; inline +: this ( iterator -- obj ) [ n>> ] [ seq>> ] bi nth ; inline +: ++ ( iterator -- ) [ 1 + ] change-n drop ; inline +: take ( iterator -- obj ) [ this ] [ ++ ] bi ; inline + +: blue-smaller? ( blue red -- ? ) + [ this ] bi@ before? ; inline + +: take-blue? ( blue red -- ? ) + { + [ nip done? ] + [ + { + [ drop done? not ] + [ blue-smaller? ] + } 2&& + ] + } 2|| ; inline + +: merge-sets ( blue red -- seq ) + [ ] bi@ + [ 2dup [ done? ] both? not ] + [ + 2dup take-blue? + [ over take blue >>color ] + [ dup take red >>color ] + if + ] produce 2nip ; + +: update-for-merge ( seq -- ) + [ + dup [ equal-anc-in>> ] [ equal-anc-out>> ] bi + 2dup and [ [ vreg-dominates? ] most ] [ or ] if + >>equal-anc-in + drop + ] each ; + +! Linear-time live range intersection test in a merged set. +: find-parent ( dom current -- vreg ) over empty? [ 2drop f ] [ - over last over dominates? [ drop last ] [ - over pop* find-parent - ] if + over last over vreg-dominates? + [ drop last ] [ over pop* find-parent ] if ] if ; -:: linear-test ( seq1 seq2 -- ? ) - ! Instead of sorting, SSA destruction should keep equivalence - ! classes sorted by merging them on append +:: linear-interference-test ( seq -- ? ) V{ } clone :> dom - seq1 seq2 append sort-vregs-by-bb [| pair | - pair first :> current - dom current find-parent - dup [ current vregs-interfere? ] when - [ t ] [ current dom push f ] if + seq [| vreg | + dom vreg find-parent + { [ ] [ vreg same-sets? not ] [ vreg swap vregs-interfere? ] } 1&& + [ t ] [ vreg dom push f ] if ] any? ; +: sets-interfere-1? ( seq1 seq2 -- merged/f ? ) + [ first ] bi@ + 2dup before? [ swap ] unless + 2dup same-values? [ + 2dup equal-anc-in<< + 2array f + ] [ + 2dup vregs-intersect? + [ 2drop f t ] [ 2array f ] if + ] if ; + PRIVATE> -: sets-interfere? ( seq1 seq2 -- ? ) - quadratic-test ; \ No newline at end of file +: sets-interfere? ( seq1 seq2 -- merged/f ? ) + 2dup [ length 1 = ] both? [ sets-interfere-1? ] [ + merge-sets dup linear-interference-test + [ drop f t ] [ dup update-for-merge f ] if + ] if ; \ No newline at end of file diff --git a/basis/compiler/cfg/ssa/interference/live-ranges/live-ranges.factor b/basis/compiler/cfg/ssa/interference/live-ranges/live-ranges.factor index be45485134..d0c729556d 100644 --- a/basis/compiler/cfg/ssa/interference/live-ranges/live-ranges.factor +++ b/basis/compiler/cfg/ssa/interference/live-ranges/live-ranges.factor @@ -25,15 +25,23 @@ SYMBOLS: local-def-indices local-kill-indices ; [ 1 + ] dip [ local-kill-indices get set-at ] with each ] if ; -: visit-insn ( insn n -- ) - 2 * swap [ record-def ] [ record-uses ] 2bi ; +GENERIC: record-insn ( n insn -- ) + +M: ##phi record-insn + record-def ; + +M: vreg-insn record-insn + [ 2 * ] dip [ record-def ] [ record-uses ] 2bi ; + +M: insn record-insn + 2drop ; SYMBOLS: def-indices kill-indices ; : compute-local-live-ranges ( bb -- ) H{ } clone local-def-indices set H{ } clone local-kill-indices set - [ instructions>> [ visit-insn ] each-index ] + [ instructions>> [ swap record-insn ] each-index ] [ [ local-def-indices get ] dip def-indices get set-at ] [ [ local-kill-indices get ] dip kill-indices get set-at ] tri ; diff --git a/basis/compiler/tests/float.factor b/basis/compiler/tests/float.factor index 9685870936..ea62795035 100644 --- a/basis/compiler/tests/float.factor +++ b/basis/compiler/tests/float.factor @@ -85,6 +85,9 @@ IN: compiler.tests.float [ t ] [ -0.0 [ dup 0.0 float= swap -0.0 float= or ] compile-call ] unit-test [ f ] [ 3.0 [ dup 0.0 float= swap -0.0 float= or ] compile-call ] unit-test +[ 313.0 ] [ 313 [ fixnum>float ] compile-call ] unit-test +[ -313 ] [ -313.5 [ float>fixnum ] compile-call ] unit-test +[ 313 ] [ 313.5 [ float>fixnum ] compile-call ] unit-test [ 315 315.0 ] [ 313 [ 2 fixnum+fast dup fixnum>float ] compile-call ] unit-test [ t ] [ 0/0. 0/0. [ float-unordered? ] compile-call ] unit-test diff --git a/basis/cpu/architecture/architecture.factor b/basis/cpu/architecture/architecture.factor index cc6079d060..bb9adbf5ce 100644 --- a/basis/cpu/architecture/architecture.factor +++ b/basis/cpu/architecture/architecture.factor @@ -296,6 +296,8 @@ HOOK: %binary-float-function cpu ( dst src1 src2 func -- ) HOOK: %single>double-float cpu ( dst src -- ) HOOK: %double>single-float cpu ( dst src -- ) +HOOK: integer-float-needs-stack-frame? cpu ( -- ? ) + HOOK: %integer>float cpu ( dst src -- ) HOOK: %float>integer cpu ( dst src -- ) diff --git a/basis/cpu/ppc/ppc.factor b/basis/cpu/ppc/ppc.factor index 89ec8f4efa..56ec02d851 100644 --- a/basis/cpu/ppc/ppc.factor +++ b/basis/cpu/ppc/ppc.factor @@ -190,6 +190,8 @@ M: ppc %sub-float FSUB ; M: ppc %mul-float FMUL ; M: ppc %div-float FDIV ; +M: ppc integer-float-needs-stack-frame? t ; + M:: ppc %integer>float ( dst src -- ) HEX: 4330 scratch-reg LIS scratch-reg 1 0 scratch@ STW diff --git a/basis/cpu/x86/32/32.factor b/basis/cpu/x86/32/32.factor index d914c85f9a..09a81a5bdc 100755 --- a/basis/cpu/x86/32/32.factor +++ b/basis/cpu/x86/32/32.factor @@ -7,15 +7,20 @@ words compiler.constants compiler.codegen.fixup compiler.cfg.instructions compiler.cfg.builder compiler.cfg.builder.alien.boxing compiler.cfg.intrinsics compiler.cfg.stack-frame cpu.x86.assembler -cpu.x86.assembler.operands cpu.x86 cpu.architecture vm ; +cpu.x86.assembler.operands cpu.x86 cpu.architecture vm vocabs ; FROM: layouts => cell ; IN: cpu.x86.32 +: x86-float-regs ( -- seq ) + "cpu.x86.sse" vocab + { XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 } + { ST0 ST1 ST2 ST3 ST4 ST5 ST6 } + ? ; + M: x86.32 machine-registers - { - { int-regs { EAX ECX EDX EBP EBX } } - { float-regs { XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 } } - } ; + { int-regs { EAX ECX EDX EBP EBX } } + float-regs x86-float-regs 2array + 2array ; M: x86.32 ds-reg ESI ; M: x86.32 rs-reg EDI ; @@ -94,7 +99,7 @@ M: x86.32 param-regs M: x86.32 return-regs { { int-regs { EAX EDX } } - { float-regs { f } } + { float-regs { ST0 } } } ; M: x86.32 %prologue ( n -- ) @@ -105,11 +110,11 @@ M: x86.32 %prologue ( n -- ) M: x86.32 %prepare-jump pic-tail-reg 0 MOV xt-tail-pic-offset rc-absolute-cell rel-here ; -:: load-float-return ( dst x87-insn sse-insn -- ) +:: load-float-return ( dst x87-insn rep -- ) dst register? [ ESP 4 SUB ESP [] x87-insn execute - dst ESP [] sse-insn execute + dst ESP [] rep %copy ESP 4 ADD ] [ dst ?spill-slot x87-insn execute @@ -118,14 +123,14 @@ M: x86.32 %prepare-jump M: x86.32 %load-reg-param ( dst reg rep -- ) { { int-rep [ int-rep %copy ] } - { float-rep [ drop \ FSTPS \ MOVSS load-float-return ] } - { double-rep [ drop \ FSTPL \ MOVSD load-float-return ] } + { float-rep [ drop \ FSTPS float-rep load-float-return ] } + { double-rep [ drop \ FSTPL double-rep load-float-return ] } } case ; -:: store-float-return ( src x87-insn sse-insn -- ) +:: store-float-return ( src x87-insn rep -- ) src register? [ ESP 4 SUB - ESP [] src sse-insn execute + ESP [] src rep %copy ESP [] x87-insn execute ESP 4 ADD ] [ @@ -135,8 +140,8 @@ M: x86.32 %load-reg-param ( dst reg rep -- ) M: x86.32 %store-reg-param ( src reg rep -- ) { { int-rep [ swap int-rep %copy ] } - { float-rep [ drop \ FLDS \ MOVSS store-float-return ] } - { double-rep [ drop \ FLDL \ MOVSD store-float-return ] } + { float-rep [ drop \ FLDS float-rep store-float-return ] } + { double-rep [ drop \ FLDL double-rep store-float-return ] } } case ; :: call-unbox-func ( src func -- ) diff --git a/basis/cpu/x86/64/64.factor b/basis/cpu/x86/64/64.factor index 2883d70028..ad4fc626f1 100644 --- a/basis/cpu/x86/64/64.factor +++ b/basis/cpu/x86/64/64.factor @@ -179,3 +179,5 @@ USE: vocabs.loader { [ os unix? ] [ "cpu.x86.64.unix" require ] } { [ os winnt? ] [ "cpu.x86.64.winnt" require ] } } cond + +check-sse diff --git a/basis/cpu/x86/assembler/assembler.factor b/basis/cpu/x86/assembler/assembler.factor index 4460643152..1cb8f67aa6 100644 --- a/basis/cpu/x86/assembler/assembler.factor +++ b/basis/cpu/x86/assembler/assembler.factor @@ -496,6 +496,8 @@ PRIVATE> : FILDQ ( src -- ) { BIN: 101 f HEX: DF } 1-operand ; : FISTPD ( dst -- ) { BIN: 011 f HEX: DB } 1-operand ; : FISTPQ ( dst -- ) { BIN: 111 f HEX: DF } 1-operand ; +: FISTTPD ( dst -- ) { BIN: 001 f HEX: DB } 1-operand ; +: FISTTPQ ( dst -- ) { BIN: 001 f HEX: DF } 1-operand ; : FLD ( dst src -- ) HEX: D9 0 x87-st0-op ; : FLD1 ( -- ) { HEX: D9 HEX: E8 } % ; diff --git a/basis/cpu/x86/assembler/operands/operands.factor b/basis/cpu/x86/assembler/operands/operands.factor index ffee62450d..dc9ee1ce4c 100644 --- a/basis/cpu/x86/assembler/operands/operands.factor +++ b/basis/cpu/x86/assembler/operands/operands.factor @@ -15,15 +15,16 @@ REGISTERS: 16 AX CX DX BX SP BP SI DI R8W R9W R10W R11W R12W R13W R14W R15W ; REGISTERS: 32 EAX ECX EDX EBX ESP EBP ESI EDI R8D R9D R10D R11D R12D R13D R14D R15D ; -REGISTERS: 64 -RAX RCX RDX RBX RSP RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 ; +REGISTERS: 64 RAX RCX RDX RBX RSP RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 ; REGISTERS: 128 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 XMM9 XMM10 XMM11 XMM12 XMM13 XMM14 XMM15 ; -REGISTERS: 80 -ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7 ; +REGISTERS: 80 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7 ; + +: shuffle-down ( STn -- STn+1 ) + "register" word-prop 1 + 80 registers get at nth ; PREDICATE: register < word "register" word-prop ; diff --git a/basis/cpu/x86/sse/authors.txt b/basis/cpu/x86/sse/authors.txt new file mode 100644 index 0000000000..580f882c8d --- /dev/null +++ b/basis/cpu/x86/sse/authors.txt @@ -0,0 +1,2 @@ +Slava Pestov +Joe Groff diff --git a/basis/cpu/x86/sse/sse.factor b/basis/cpu/x86/sse/sse.factor new file mode 100644 index 0000000000..a6e92ff0a0 --- /dev/null +++ b/basis/cpu/x86/sse/sse.factor @@ -0,0 +1,913 @@ +! Copyright (C) 2009, 2010 Joe Groff, Slava Pestov. +! See http://factorcode.org/license.txt for BSD license. +USING: alien.c-types arrays assocs combinators fry kernel locals +macros math math.vectors namespaces quotations sequences system +compiler.cfg.comparisons compiler.cfg.intrinsics +compiler.codegen.fixup cpu.architecture cpu.x86 +cpu.x86.assembler cpu.x86.assembler.operands cpu.x86.features ; +IN: cpu.x86.sse + +! Scalar floating point with SSE2 +M: x86 %load-float float-rep %load-vector ; +M: x86 %load-double double-rep %load-vector ; + +M: float-rep copy-register* drop MOVAPS ; +M: double-rep copy-register* drop MOVAPS ; + +M: float-rep copy-memory* drop MOVSS ; +M: double-rep copy-memory* drop MOVSD ; + +M: x86 %add-float double-rep two-operand ADDSD ; +M: x86 %sub-float double-rep two-operand SUBSD ; +M: x86 %mul-float double-rep two-operand MULSD ; +M: x86 %div-float double-rep two-operand DIVSD ; +M: x86 %min-float double-rep two-operand MINSD ; +M: x86 %max-float double-rep two-operand MAXSD ; +M: x86 %sqrt SQRTSD ; + +: %clear-unless-in-place ( dst src -- ) + over = [ drop ] [ dup XORPS ] if ; + +M: x86 %single>double-float [ %clear-unless-in-place ] [ CVTSS2SD ] 2bi ; +M: x86 %double>single-float [ %clear-unless-in-place ] [ CVTSD2SS ] 2bi ; + +M: x86 integer-float-needs-stack-frame? f ; +M: x86 %integer>float [ drop dup XORPS ] [ CVTSI2SD ] 2bi ; +M: x86 %float>integer CVTTSD2SI ; + +M: x86 %compare-float-ordered ( dst src1 src2 cc temp -- ) + [ COMISD ] (%compare-float) ; + +M: x86 %compare-float-unordered ( dst src1 src2 cc temp -- ) + [ UCOMISD ] (%compare-float) ; + +M: x86 %compare-float-ordered-branch ( label src1 src2 cc -- ) + [ COMISD ] (%compare-float-branch) ; + +M: x86 %compare-float-unordered-branch ( label src1 src2 cc -- ) + [ UCOMISD ] (%compare-float-branch) ; + +! SIMD +M: float-4-rep copy-register* drop MOVAPS ; +M: double-2-rep copy-register* drop MOVAPS ; +M: vector-rep copy-register* drop MOVDQA ; + +MACRO: available-reps ( alist -- ) + ! Each SSE version adds new representations and supports + ! all old ones + unzip { } [ append ] accumulate rest swap suffix + [ [ 1quotation ] map ] bi@ zip + reverse [ { } ] suffix + '[ _ cond ] ; + +M: x86 %alien-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %zero-vector + { + { double-2-rep [ dup XORPS ] } + { float-4-rep [ dup XORPS ] } + [ drop dup PXOR ] + } case ; + +M: x86 %zero-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %fill-vector + { + { double-2-rep [ dup [ XORPS ] [ CMPEQPS ] 2bi ] } + { float-4-rep [ dup [ XORPS ] [ CMPEQPS ] 2bi ] } + [ drop dup PCMPEQB ] + } case ; + +M: x86 %fill-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- ) + rep signed-rep { + { float-4-rep [ + dst src1 float-4-rep %copy + dst src2 UNPCKLPS + src3 src4 UNPCKLPS + dst src3 MOVLHPS + ] } + { int-4-rep [ + dst src1 int-4-rep %copy + dst src2 PUNPCKLDQ + src3 src4 PUNPCKLDQ + dst src3 PUNPCKLQDQ + ] } + } case ; + +M: x86 %gather-vector-4-reps + { + ! Can't do this with sse1 since it will want to unbox + ! double-precision floats and convert to single precision + { sse2? { float-4-rep int-4-rep uint-4-rep } } + } available-reps ; + +M:: x86 %gather-int-vector-4 ( dst src1 src2 src3 src4 rep -- ) + dst rep %zero-vector + dst src1 32-bit-version-of 0 PINSRD + dst src2 32-bit-version-of 1 PINSRD + dst src3 32-bit-version-of 2 PINSRD + dst src4 32-bit-version-of 3 PINSRD ; + +M: x86 %gather-int-vector-4-reps + { + { sse4.1? { int-4-rep uint-4-rep } } + } available-reps ; + +M:: x86 %gather-vector-2 ( dst src1 src2 rep -- ) + rep signed-rep { + { double-2-rep [ + dst src1 double-2-rep %copy + dst src2 MOVLHPS + ] } + { longlong-2-rep [ + dst src1 longlong-2-rep %copy + dst src2 PUNPCKLQDQ + ] } + } case ; + +M: x86 %gather-vector-2-reps + { + { sse2? { double-2-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M:: x86.64 %gather-int-vector-2 ( dst src1 src2 rep -- ) + dst rep %zero-vector + dst src1 0 PINSRQ + dst src2 1 PINSRQ ; + +M: x86.64 %gather-int-vector-2-reps + { + { sse4.1? { longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +:: %select-vector-32 ( dst src n rep -- ) + rep { + { char-16-rep [ + dst 32-bit-version-of src n PEXTRB + dst dst 8-bit-version-of MOVSX + ] } + { uchar-16-rep [ + dst 32-bit-version-of src n PEXTRB + ] } + { short-8-rep [ + dst 32-bit-version-of src n PEXTRW + dst dst 16-bit-version-of MOVSX + ] } + { ushort-8-rep [ + dst 32-bit-version-of src n PEXTRW + ] } + { int-4-rep [ + dst 32-bit-version-of src n PEXTRD + dst dst 32-bit-version-of 2dup = [ 2drop ] [ MOVSX ] if + ] } + { uint-4-rep [ + dst 32-bit-version-of src n PEXTRD + ] } + } case ; + +M: x86.32 %select-vector + %select-vector-32 ; + +M: x86.32 %select-vector-reps + { + { sse4.1? { uchar-16-rep char-16-rep ushort-8-rep short-8-rep uint-4-rep int-4-rep } } + } available-reps ; + +M: x86.64 %select-vector + { + { longlong-2-rep [ PEXTRQ ] } + { ulonglong-2-rep [ PEXTRQ ] } + [ %select-vector-32 ] + } case ; + +M: x86.64 %select-vector-reps + { + { sse4.1? { uchar-16-rep char-16-rep ushort-8-rep short-8-rep uint-4-rep int-4-rep ulonglong-2-rep longlong-2-rep } } + } available-reps ; + +: sse1-float-4-shuffle ( dst shuffle -- ) + { + { { 0 1 2 3 } [ drop ] } + { { 0 1 0 1 } [ dup MOVLHPS ] } + { { 2 3 2 3 } [ dup MOVHLPS ] } + { { 0 0 1 1 } [ dup UNPCKLPS ] } + { { 2 2 3 3 } [ dup UNPCKHPS ] } + [ dupd SHUFPS ] + } case ; + +: float-4-shuffle ( dst shuffle -- ) + sse3? [ + { + { { 0 0 2 2 } [ dup MOVSLDUP ] } + { { 1 1 3 3 } [ dup MOVSHDUP ] } + [ sse1-float-4-shuffle ] + } case + ] [ sse1-float-4-shuffle ] if ; + +: int-4-shuffle ( dst shuffle -- ) + { + { { 0 1 2 3 } [ drop ] } + { { 0 0 1 1 } [ dup PUNPCKLDQ ] } + { { 2 2 3 3 } [ dup PUNPCKHDQ ] } + { { 0 1 0 1 } [ dup PUNPCKLQDQ ] } + { { 2 3 2 3 } [ dup PUNPCKHQDQ ] } + [ dupd PSHUFD ] + } case ; + +: longlong-2-shuffle ( dst shuffle -- ) + first2 [ 2 * dup 1 + ] bi@ 4array int-4-shuffle ; + +: >float-4-shuffle ( double-2-shuffle -- float-4-shuffle ) + [ 2 * { 0 1 } n+v ] map concat ; + +M:: x86 %shuffle-vector-imm ( dst src shuffle rep -- ) + dst src rep %copy + dst shuffle rep signed-rep { + { double-2-rep [ >float-4-shuffle float-4-shuffle ] } + { float-4-rep [ float-4-shuffle ] } + { int-4-rep [ int-4-shuffle ] } + { longlong-2-rep [ longlong-2-shuffle ] } + } case ; + +M: x86 %shuffle-vector-imm-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M:: x86 %shuffle-vector-halves-imm ( dst src1 src2 shuffle rep -- ) + dst src1 src2 rep two-operand + shuffle rep { + { double-2-rep [ >float-4-shuffle SHUFPS ] } + { float-4-rep [ SHUFPS ] } + } case ; + +M: x86 %shuffle-vector-halves-imm-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep } } + } available-reps ; + +M: x86 %shuffle-vector ( dst src shuffle rep -- ) + two-operand PSHUFB ; + +M: x86 %shuffle-vector-reps + { + { ssse3? { float-4-rep double-2-rep longlong-2-rep ulonglong-2-rep int-4-rep uint-4-rep short-8-rep ushort-8-rep char-16-rep uchar-16-rep } } + } available-reps ; + +M: x86 %merge-vector-head + [ two-operand ] keep + signed-rep { + { double-2-rep [ MOVLHPS ] } + { float-4-rep [ UNPCKLPS ] } + { longlong-2-rep [ PUNPCKLQDQ ] } + { int-4-rep [ PUNPCKLDQ ] } + { short-8-rep [ PUNPCKLWD ] } + { char-16-rep [ PUNPCKLBW ] } + } case ; + +M: x86 %merge-vector-tail + [ two-operand ] keep + signed-rep { + { double-2-rep [ UNPCKHPD ] } + { float-4-rep [ UNPCKHPS ] } + { longlong-2-rep [ PUNPCKHQDQ ] } + { int-4-rep [ PUNPCKHDQ ] } + { short-8-rep [ PUNPCKHWD ] } + { char-16-rep [ PUNPCKHBW ] } + } case ; + +M: x86 %merge-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %signed-pack-vector + [ two-operand ] keep + { + { int-4-rep [ PACKSSDW ] } + { short-8-rep [ PACKSSWB ] } + } case ; + +M: x86 %signed-pack-vector-reps + { + { sse2? { short-8-rep int-4-rep } } + } available-reps ; + +M: x86 %unsigned-pack-vector + [ two-operand ] keep + signed-rep { + { int-4-rep [ PACKUSDW ] } + { short-8-rep [ PACKUSWB ] } + } case ; + +M: x86 %unsigned-pack-vector-reps + { + { sse2? { short-8-rep } } + { sse4.1? { int-4-rep } } + } available-reps ; + +M: x86 %tail>head-vector ( dst src rep -- ) + dup { + { float-4-rep [ drop UNPCKHPD ] } + { double-2-rep [ drop UNPCKHPD ] } + [ drop [ %copy ] [ drop PUNPCKHQDQ ] 3bi ] + } case ; + +M: x86 %unpack-vector-head ( dst src rep -- ) + { + { char-16-rep [ PMOVSXBW ] } + { uchar-16-rep [ PMOVZXBW ] } + { short-8-rep [ PMOVSXWD ] } + { ushort-8-rep [ PMOVZXWD ] } + { int-4-rep [ PMOVSXDQ ] } + { uint-4-rep [ PMOVZXDQ ] } + { float-4-rep [ CVTPS2PD ] } + } case ; + +M: x86 %unpack-vector-head-reps ( -- reps ) + { + { sse2? { float-4-rep } } + { sse4.1? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } } + } available-reps ; + +M: x86 %integer>float-vector ( dst src rep -- ) + { + { int-4-rep [ CVTDQ2PS ] } + } case ; + +M: x86 %integer>float-vector-reps + { + { sse2? { int-4-rep } } + } available-reps ; + +M: x86 %float>integer-vector ( dst src rep -- ) + { + { float-4-rep [ CVTTPS2DQ ] } + } case ; + +M: x86 %float>integer-vector-reps + { + { sse2? { float-4-rep } } + } available-reps ; + +: (%compare-float-vector) ( dst src rep double single -- ) + [ double-2-rep eq? ] 2dip if ; inline + +: %compare-float-vector ( dst src rep cc -- ) + { + { cc< [ [ CMPLTPD ] [ CMPLTPS ] (%compare-float-vector) ] } + { cc<= [ [ CMPLEPD ] [ CMPLEPS ] (%compare-float-vector) ] } + { cc= [ [ CMPEQPD ] [ CMPEQPS ] (%compare-float-vector) ] } + { cc<>= [ [ CMPORDPD ] [ CMPORDPS ] (%compare-float-vector) ] } + { cc/< [ [ CMPNLTPD ] [ CMPNLTPS ] (%compare-float-vector) ] } + { cc/<= [ [ CMPNLEPD ] [ CMPNLEPS ] (%compare-float-vector) ] } + { cc/= [ [ CMPNEQPD ] [ CMPNEQPS ] (%compare-float-vector) ] } + { cc/<>= [ [ CMPUNORDPD ] [ CMPUNORDPS ] (%compare-float-vector) ] } + } case ; + +:: (%compare-int-vector) ( dst src rep int64 int32 int16 int8 -- ) + rep signed-rep :> rep' + dst src rep' { + { longlong-2-rep [ int64 call ] } + { int-4-rep [ int32 call ] } + { short-8-rep [ int16 call ] } + { char-16-rep [ int8 call ] } + } case ; inline + +: %compare-int-vector ( dst src rep cc -- ) + { + { cc= [ [ PCMPEQQ ] [ PCMPEQD ] [ PCMPEQW ] [ PCMPEQB ] (%compare-int-vector) ] } + { cc> [ [ PCMPGTQ ] [ PCMPGTD ] [ PCMPGTW ] [ PCMPGTB ] (%compare-int-vector) ] } + } case ; + +M: x86 %compare-vector ( dst src1 src2 rep cc -- ) + [ [ two-operand ] keep ] dip + over float-vector-rep? + [ %compare-float-vector ] + [ %compare-int-vector ] if ; + +: %compare-vector-eq-reps ( -- reps ) + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } } + { sse4.1? { longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +: %compare-vector-ord-reps ( -- reps ) + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep short-8-rep int-4-rep } } + { sse4.2? { longlong-2-rep } } + } available-reps ; + +M: x86 %compare-vector-reps + { + { [ dup { cc= cc/= cc/<>= cc<>= } member-eq? ] [ drop %compare-vector-eq-reps ] } + [ drop %compare-vector-ord-reps ] + } cond ; + +: %compare-float-vector-ccs ( cc -- ccs not? ) + { + { cc< [ { { cc< f } } f ] } + { cc<= [ { { cc<= f } } f ] } + { cc> [ { { cc< t } } f ] } + { cc>= [ { { cc<= t } } f ] } + { cc= [ { { cc= f } } f ] } + { cc<> [ { { cc< f } { cc< t } } f ] } + { cc<>= [ { { cc<>= f } } f ] } + { cc/< [ { { cc/< f } } f ] } + { cc/<= [ { { cc/<= f } } f ] } + { cc/> [ { { cc/< t } } f ] } + { cc/>= [ { { cc/<= t } } f ] } + { cc/= [ { { cc/= f } } f ] } + { cc/<> [ { { cc/= f } { cc/<>= f } } f ] } + { cc/<>= [ { { cc/<>= f } } f ] } + } case ; + +: %compare-int-vector-ccs ( cc -- ccs not? ) + order-cc { + { cc< [ { { cc> t } } f ] } + { cc<= [ { { cc> f } } t ] } + { cc> [ { { cc> f } } f ] } + { cc>= [ { { cc> t } } t ] } + { cc= [ { { cc= f } } f ] } + { cc/= [ { { cc= f } } t ] } + { t [ { } t ] } + { f [ { } f ] } + } case ; + +M: x86 %compare-vector-ccs + swap float-vector-rep? + [ %compare-float-vector-ccs ] + [ %compare-int-vector-ccs ] if ; + +:: %test-vector-mask ( dst temp mask vcc -- ) + vcc { + { vcc-any [ dst dst TEST dst temp \ CMOVNE (%boolean) ] } + { vcc-none [ dst dst TEST dst temp \ CMOVE (%boolean) ] } + { vcc-all [ dst mask CMP dst temp \ CMOVE (%boolean) ] } + { vcc-notall [ dst mask CMP dst temp \ CMOVNE (%boolean) ] } + } case ; + +: %move-vector-mask ( dst src rep -- mask ) + { + { double-2-rep [ MOVMSKPS HEX: f ] } + { float-4-rep [ MOVMSKPS HEX: f ] } + [ drop PMOVMSKB HEX: ffff ] + } case ; + +M:: x86 %test-vector ( dst src temp rep vcc -- ) + dst src rep %move-vector-mask :> mask + dst temp mask vcc %test-vector-mask ; + +:: %test-vector-mask-branch ( label temp mask vcc -- ) + vcc { + { vcc-any [ temp temp TEST label JNE ] } + { vcc-none [ temp temp TEST label JE ] } + { vcc-all [ temp mask CMP label JE ] } + { vcc-notall [ temp mask CMP label JNE ] } + } case ; + +M:: x86 %test-vector-branch ( label src temp rep vcc -- ) + temp src rep %move-vector-mask :> mask + label temp mask vcc %test-vector-mask-branch ; + +M: x86 %test-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %add-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { float-4-rep [ ADDPS ] } + { double-2-rep [ ADDPD ] } + { char-16-rep [ PADDB ] } + { uchar-16-rep [ PADDB ] } + { short-8-rep [ PADDW ] } + { ushort-8-rep [ PADDW ] } + { int-4-rep [ PADDD ] } + { uint-4-rep [ PADDD ] } + { longlong-2-rep [ PADDQ ] } + { ulonglong-2-rep [ PADDQ ] } + } case ; + +M: x86 %add-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %saturated-add-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { char-16-rep [ PADDSB ] } + { uchar-16-rep [ PADDUSB ] } + { short-8-rep [ PADDSW ] } + { ushort-8-rep [ PADDUSW ] } + } case ; + +M: x86 %saturated-add-vector-reps + { + { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } } + } available-reps ; + +M: x86 %add-sub-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { float-4-rep [ ADDSUBPS ] } + { double-2-rep [ ADDSUBPD ] } + } case ; + +M: x86 %add-sub-vector-reps + { + { sse3? { float-4-rep double-2-rep } } + } available-reps ; + +M: x86 %sub-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { float-4-rep [ SUBPS ] } + { double-2-rep [ SUBPD ] } + { char-16-rep [ PSUBB ] } + { uchar-16-rep [ PSUBB ] } + { short-8-rep [ PSUBW ] } + { ushort-8-rep [ PSUBW ] } + { int-4-rep [ PSUBD ] } + { uint-4-rep [ PSUBD ] } + { longlong-2-rep [ PSUBQ ] } + { ulonglong-2-rep [ PSUBQ ] } + } case ; + +M: x86 %sub-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %saturated-sub-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { char-16-rep [ PSUBSB ] } + { uchar-16-rep [ PSUBUSB ] } + { short-8-rep [ PSUBSW ] } + { ushort-8-rep [ PSUBUSW ] } + } case ; + +M: x86 %saturated-sub-vector-reps + { + { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } } + } available-reps ; + +M: x86 %mul-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { float-4-rep [ MULPS ] } + { double-2-rep [ MULPD ] } + { short-8-rep [ PMULLW ] } + { ushort-8-rep [ PMULLW ] } + { int-4-rep [ PMULLD ] } + { uint-4-rep [ PMULLD ] } + } case ; + +M: x86 %mul-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep short-8-rep ushort-8-rep } } + { sse4.1? { int-4-rep uint-4-rep } } + } available-reps ; + +M: x86 %mul-high-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { short-8-rep [ PMULHW ] } + { ushort-8-rep [ PMULHUW ] } + } case ; + +M: x86 %mul-high-vector-reps + { + { sse2? { short-8-rep ushort-8-rep } } + } available-reps ; + +M: x86 %mul-horizontal-add-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { char-16-rep [ PMADDUBSW ] } + { uchar-16-rep [ PMADDUBSW ] } + { short-8-rep [ PMADDWD ] } + } case ; + +M: x86 %mul-horizontal-add-vector-reps + { + { sse2? { short-8-rep } } + { ssse3? { char-16-rep uchar-16-rep } } + } available-reps ; + +M: x86 %div-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { float-4-rep [ DIVPS ] } + { double-2-rep [ DIVPD ] } + } case ; + +M: x86 %div-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep } } + } available-reps ; + +M: x86 %min-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { char-16-rep [ PMINSB ] } + { uchar-16-rep [ PMINUB ] } + { short-8-rep [ PMINSW ] } + { ushort-8-rep [ PMINUW ] } + { int-4-rep [ PMINSD ] } + { uint-4-rep [ PMINUD ] } + { float-4-rep [ MINPS ] } + { double-2-rep [ MINPD ] } + } case ; + +M: x86 %min-vector-reps + { + { sse? { float-4-rep } } + { sse2? { uchar-16-rep short-8-rep double-2-rep } } + { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } } + } available-reps ; + +M: x86 %max-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { char-16-rep [ PMAXSB ] } + { uchar-16-rep [ PMAXUB ] } + { short-8-rep [ PMAXSW ] } + { ushort-8-rep [ PMAXUW ] } + { int-4-rep [ PMAXSD ] } + { uint-4-rep [ PMAXUD ] } + { float-4-rep [ MAXPS ] } + { double-2-rep [ MAXPD ] } + } case ; + +M: x86 %max-vector-reps + { + { sse? { float-4-rep } } + { sse2? { uchar-16-rep short-8-rep double-2-rep } } + { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } } + } available-reps ; + +M: x86 %avg-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { uchar-16-rep [ PAVGB ] } + { ushort-8-rep [ PAVGW ] } + } case ; + +M: x86 %avg-vector-reps + { + { sse2? { uchar-16-rep ushort-8-rep } } + } available-reps ; + +M: x86 %dot-vector + [ two-operand ] keep + { + { float-4-rep [ HEX: ff DPPS ] } + { double-2-rep [ HEX: ff DPPD ] } + } case ; + +M: x86 %dot-vector-reps + { + { sse4.1? { float-4-rep double-2-rep } } + } available-reps ; + +M: x86 %sad-vector + [ two-operand ] keep + { + { uchar-16-rep [ PSADBW ] } + } case ; + +M: x86 %sad-vector-reps + { + { sse2? { uchar-16-rep } } + } available-reps ; + +M: x86 %horizontal-add-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + signed-rep { + { float-4-rep [ HADDPS ] } + { double-2-rep [ HADDPD ] } + { int-4-rep [ PHADDD ] } + { short-8-rep [ PHADDW ] } + } case ; + +M: x86 %horizontal-add-vector-reps + { + { sse3? { float-4-rep double-2-rep } } + { ssse3? { int-4-rep uint-4-rep short-8-rep ushort-8-rep } } + } available-reps ; + +M: x86 %horizontal-shl-vector-imm ( dst src1 src2 rep -- ) + two-operand PSLLDQ ; + +M: x86 %horizontal-shl-vector-imm-reps + { + { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep float-4-rep double-2-rep } } + } available-reps ; + +M: x86 %horizontal-shr-vector-imm ( dst src1 src2 rep -- ) + two-operand PSRLDQ ; + +M: x86 %horizontal-shr-vector-imm-reps + { + { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep float-4-rep double-2-rep } } + } available-reps ; + +M: x86 %abs-vector ( dst src rep -- ) + { + { char-16-rep [ PABSB ] } + { short-8-rep [ PABSW ] } + { int-4-rep [ PABSD ] } + } case ; + +M: x86 %abs-vector-reps + { + { ssse3? { char-16-rep short-8-rep int-4-rep } } + } available-reps ; + +M: x86 %sqrt-vector ( dst src rep -- ) + { + { float-4-rep [ SQRTPS ] } + { double-2-rep [ SQRTPD ] } + } case ; + +M: x86 %sqrt-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep } } + } available-reps ; + +M: x86 %and-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { float-4-rep [ ANDPS ] } + { double-2-rep [ ANDPS ] } + [ drop PAND ] + } case ; + +M: x86 %and-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %andn-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { float-4-rep [ ANDNPS ] } + { double-2-rep [ ANDNPS ] } + [ drop PANDN ] + } case ; + +M: x86 %andn-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %or-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { float-4-rep [ ORPS ] } + { double-2-rep [ ORPS ] } + [ drop POR ] + } case ; + +M: x86 %or-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %xor-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { float-4-rep [ XORPS ] } + { double-2-rep [ XORPS ] } + [ drop PXOR ] + } case ; + +M: x86 %xor-vector-reps + { + { sse? { float-4-rep } } + { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %shl-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { short-8-rep [ PSLLW ] } + { ushort-8-rep [ PSLLW ] } + { int-4-rep [ PSLLD ] } + { uint-4-rep [ PSLLD ] } + { longlong-2-rep [ PSLLQ ] } + { ulonglong-2-rep [ PSLLQ ] } + } case ; + +M: x86 %shl-vector-reps + { + { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %shr-vector ( dst src1 src2 rep -- ) + [ two-operand ] keep + { + { short-8-rep [ PSRAW ] } + { ushort-8-rep [ PSRLW ] } + { int-4-rep [ PSRAD ] } + { uint-4-rep [ PSRLD ] } + { ulonglong-2-rep [ PSRLQ ] } + } case ; + +M: x86 %shr-vector-reps + { + { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep ulonglong-2-rep } } + } available-reps ; + +M: x86 %shl-vector-imm %shl-vector ; +M: x86 %shl-vector-imm-reps %shl-vector-reps ; +M: x86 %shr-vector-imm %shr-vector ; +M: x86 %shr-vector-imm-reps %shr-vector-reps ; + +: scalar-sized-reg ( reg rep -- reg' ) + rep-size 8 * n-bit-version-of ; + +M: x86 %integer>scalar drop MOVD ; + +:: %scalar>integer-32 ( dst src rep -- ) + rep { + { int-scalar-rep [ + dst 32-bit-version-of src MOVD + dst dst 32-bit-version-of + 2dup eq? [ 2drop ] [ MOVSX ] if + ] } + { uint-scalar-rep [ + dst 32-bit-version-of src MOVD + ] } + { short-scalar-rep [ + dst 32-bit-version-of src MOVD + dst dst 16-bit-version-of MOVSX + ] } + { ushort-scalar-rep [ + dst 32-bit-version-of src MOVD + dst dst 16-bit-version-of MOVZX + ] } + { char-scalar-rep [ + dst 32-bit-version-of src MOVD + dst { } 8 [| tmp-dst | + tmp-dst dst int-rep %copy + tmp-dst tmp-dst 8-bit-version-of MOVSX + dst tmp-dst int-rep %copy + ] with-small-register + ] } + { uchar-scalar-rep [ + dst 32-bit-version-of src MOVD + dst { } 8 [| tmp-dst | + tmp-dst dst int-rep %copy + tmp-dst tmp-dst 8-bit-version-of MOVZX + dst tmp-dst int-rep %copy + ] with-small-register + ] } + } case ; + +M: x86.32 %scalar>integer ( dst src rep -- ) %scalar>integer-32 ; + +M: x86.64 %scalar>integer ( dst src rep -- ) + { + { longlong-scalar-rep [ MOVD ] } + { ulonglong-scalar-rep [ MOVD ] } + [ %scalar>integer-32 ] + } case ; + +M: x86 %vector>scalar %copy ; + +M: x86 %scalar>vector %copy ; + +enable-float-min/max diff --git a/basis/cpu/x86/sse/tags.txt b/basis/cpu/x86/sse/tags.txt new file mode 100644 index 0000000000..ebb74b4d5f --- /dev/null +++ b/basis/cpu/x86/sse/tags.txt @@ -0,0 +1 @@ +not loaded diff --git a/basis/cpu/x86/x86.factor b/basis/cpu/x86/x86.factor index 949a0104af..2a115532fa 100644 --- a/basis/cpu/x86/x86.factor +++ b/basis/cpu/x86/x86.factor @@ -6,7 +6,7 @@ cpu.x86.features cpu.x86.features.private cpu.architecture kernel kernel.private math memory namespaces make sequences words system layouts combinators math.order math.vectors fry locals compiler.constants byte-arrays io macros quotations classes.algebra compiler -compiler.units init vm +compiler.units init vm vocabs.loader compiler.cfg.registers compiler.cfg.instructions compiler.cfg.intrinsics @@ -69,12 +69,6 @@ M: x86 %load-reference [ \ f type-number MOV ] if* ; -M: x86 %load-float ( dst val -- ) - float-rep %load-vector ; - -M: x86 %load-double ( dst val -- ) - double-rep %load-vector ; - HOOK: ds-reg cpu ( -- reg ) HOOK: rs-reg cpu ( -- reg ) @@ -165,15 +159,8 @@ GENERIC: copy-memory* ( dst src rep -- ) M: int-rep copy-register* drop MOV ; M: tagged-rep copy-register* drop MOV ; -M: float-rep copy-register* drop MOVAPS ; -M: double-rep copy-register* drop MOVAPS ; -M: float-4-rep copy-register* drop MOVAPS ; -M: double-2-rep copy-register* drop MOVAPS ; -M: vector-rep copy-register* drop MOVDQA ; M: object copy-memory* copy-register* ; -M: float-rep copy-memory* drop MOVSS ; -M: double-rep copy-memory* drop MOVSD ; : ?spill-slot ( obj -- obj ) dup spill-slot? [ n>> spill@ ] when ; @@ -576,961 +563,6 @@ M:: x86 %compare-imm-branch ( label src1 src2 cc -- ) src1 src2 (%compare-imm) label cc %branch ; -M: x86 %add-float double-rep two-operand ADDSD ; -M: x86 %sub-float double-rep two-operand SUBSD ; -M: x86 %mul-float double-rep two-operand MULSD ; -M: x86 %div-float double-rep two-operand DIVSD ; -M: x86 %min-float double-rep two-operand MINSD ; -M: x86 %max-float double-rep two-operand MAXSD ; -M: x86 %sqrt SQRTSD ; - -: %clear-unless-in-place ( dst src -- ) - over = [ drop ] [ dup XORPS ] if ; - -M: x86 %single>double-float [ %clear-unless-in-place ] [ CVTSS2SD ] 2bi ; -M: x86 %double>single-float [ %clear-unless-in-place ] [ CVTSD2SS ] 2bi ; - -M: x86 %integer>float [ drop dup XORPS ] [ CVTSI2SD ] 2bi ; -M: x86 %float>integer CVTTSD2SI ; - -: %cmov-float= ( dst src -- ) - [ - "no-move" define-label - - "no-move" get [ JNE ] [ JP ] bi - MOV - "no-move" resolve-label - ] with-scope ; - -: %cmov-float/= ( dst src -- ) - [ - "no-move" define-label - "move" define-label - - "move" get JP - "no-move" get JE - "move" resolve-label - MOV - "no-move" resolve-label - ] with-scope ; - -:: (%compare-float) ( dst src1 src2 cc temp compare -- ) - cc { - { cc< [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVA (%boolean) ] } - { cc<= [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVAE (%boolean) ] } - { cc> [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVA (%boolean) ] } - { cc>= [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVAE (%boolean) ] } - { cc= [ src1 src2 \ compare execute( a b -- ) dst temp \ %cmov-float= (%boolean) ] } - { cc<> [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVNE (%boolean) ] } - { cc<>= [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVNP (%boolean) ] } - { cc/< [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVBE (%boolean) ] } - { cc/<= [ src2 src1 \ compare execute( a b -- ) dst temp \ CMOVB (%boolean) ] } - { cc/> [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVBE (%boolean) ] } - { cc/>= [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVB (%boolean) ] } - { cc/= [ src1 src2 \ compare execute( a b -- ) dst temp \ %cmov-float/= (%boolean) ] } - { cc/<> [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVE (%boolean) ] } - { cc/<>= [ src1 src2 \ compare execute( a b -- ) dst temp \ CMOVP (%boolean) ] } - } case ; inline - -M: x86 %compare-float-ordered ( dst src1 src2 cc temp -- ) - \ COMISD (%compare-float) ; - -M: x86 %compare-float-unordered ( dst src1 src2 cc temp -- ) - \ UCOMISD (%compare-float) ; - -: %jump-float= ( label -- ) - [ - "no-jump" define-label - "no-jump" get JP - JE - "no-jump" resolve-label - ] with-scope ; - -: %jump-float/= ( label -- ) - [ JNE ] [ JP ] bi ; - -:: (%compare-float-branch) ( label src1 src2 cc compare -- ) - cc { - { cc< [ src2 src1 \ compare execute( a b -- ) label JA ] } - { cc<= [ src2 src1 \ compare execute( a b -- ) label JAE ] } - { cc> [ src1 src2 \ compare execute( a b -- ) label JA ] } - { cc>= [ src1 src2 \ compare execute( a b -- ) label JAE ] } - { cc= [ src1 src2 \ compare execute( a b -- ) label %jump-float= ] } - { cc<> [ src1 src2 \ compare execute( a b -- ) label JNE ] } - { cc<>= [ src1 src2 \ compare execute( a b -- ) label JNP ] } - { cc/< [ src2 src1 \ compare execute( a b -- ) label JBE ] } - { cc/<= [ src2 src1 \ compare execute( a b -- ) label JB ] } - { cc/> [ src1 src2 \ compare execute( a b -- ) label JBE ] } - { cc/>= [ src1 src2 \ compare execute( a b -- ) label JB ] } - { cc/= [ src1 src2 \ compare execute( a b -- ) label %jump-float/= ] } - { cc/<> [ src1 src2 \ compare execute( a b -- ) label JE ] } - { cc/<>= [ src1 src2 \ compare execute( a b -- ) label JP ] } - } case ; - -M: x86 %compare-float-ordered-branch ( label src1 src2 cc -- ) - \ COMISD (%compare-float-branch) ; - -M: x86 %compare-float-unordered-branch ( label src1 src2 cc -- ) - \ UCOMISD (%compare-float-branch) ; - -MACRO: available-reps ( alist -- ) - ! Each SSE version adds new representations and supports - ! all old ones - unzip { } [ append ] accumulate rest swap suffix - [ [ 1quotation ] map ] bi@ zip - reverse [ { } ] suffix - '[ _ cond ] ; - -M: x86 %alien-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %zero-vector - { - { double-2-rep [ dup XORPS ] } - { float-4-rep [ dup XORPS ] } - [ drop dup PXOR ] - } case ; - -M: x86 %zero-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %fill-vector - { - { double-2-rep [ dup [ XORPS ] [ CMPEQPS ] 2bi ] } - { float-4-rep [ dup [ XORPS ] [ CMPEQPS ] 2bi ] } - [ drop dup PCMPEQB ] - } case ; - -M: x86 %fill-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M:: x86 %gather-vector-4 ( dst src1 src2 src3 src4 rep -- ) - rep signed-rep { - { float-4-rep [ - dst src1 float-4-rep %copy - dst src2 UNPCKLPS - src3 src4 UNPCKLPS - dst src3 MOVLHPS - ] } - { int-4-rep [ - dst src1 int-4-rep %copy - dst src2 PUNPCKLDQ - src3 src4 PUNPCKLDQ - dst src3 PUNPCKLQDQ - ] } - } case ; - -M: x86 %gather-vector-4-reps - { - ! Can't do this with sse1 since it will want to unbox - ! double-precision floats and convert to single precision - { sse2? { float-4-rep int-4-rep uint-4-rep } } - } available-reps ; - -M:: x86 %gather-int-vector-4 ( dst src1 src2 src3 src4 rep -- ) - dst rep %zero-vector - dst src1 32-bit-version-of 0 PINSRD - dst src2 32-bit-version-of 1 PINSRD - dst src3 32-bit-version-of 2 PINSRD - dst src4 32-bit-version-of 3 PINSRD ; - -M: x86 %gather-int-vector-4-reps - { - { sse4.1? { int-4-rep uint-4-rep } } - } available-reps ; - -M:: x86 %gather-vector-2 ( dst src1 src2 rep -- ) - rep signed-rep { - { double-2-rep [ - dst src1 double-2-rep %copy - dst src2 MOVLHPS - ] } - { longlong-2-rep [ - dst src1 longlong-2-rep %copy - dst src2 PUNPCKLQDQ - ] } - } case ; - -M: x86 %gather-vector-2-reps - { - { sse2? { double-2-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M:: x86.64 %gather-int-vector-2 ( dst src1 src2 rep -- ) - dst rep %zero-vector - dst src1 0 PINSRQ - dst src2 1 PINSRQ ; - -M: x86.64 %gather-int-vector-2-reps - { - { sse4.1? { longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -:: %select-vector-32 ( dst src n rep -- ) - rep { - { char-16-rep [ - dst 32-bit-version-of src n PEXTRB - dst dst 8-bit-version-of MOVSX - ] } - { uchar-16-rep [ - dst 32-bit-version-of src n PEXTRB - ] } - { short-8-rep [ - dst 32-bit-version-of src n PEXTRW - dst dst 16-bit-version-of MOVSX - ] } - { ushort-8-rep [ - dst 32-bit-version-of src n PEXTRW - ] } - { int-4-rep [ - dst 32-bit-version-of src n PEXTRD - dst dst 32-bit-version-of 2dup = [ 2drop ] [ MOVSX ] if - ] } - { uint-4-rep [ - dst 32-bit-version-of src n PEXTRD - ] } - } case ; - -M: x86.32 %select-vector - %select-vector-32 ; - -M: x86.32 %select-vector-reps - { - { sse4.1? { uchar-16-rep char-16-rep ushort-8-rep short-8-rep uint-4-rep int-4-rep } } - } available-reps ; - -M: x86.64 %select-vector - { - { longlong-2-rep [ PEXTRQ ] } - { ulonglong-2-rep [ PEXTRQ ] } - [ %select-vector-32 ] - } case ; - -M: x86.64 %select-vector-reps - { - { sse4.1? { uchar-16-rep char-16-rep ushort-8-rep short-8-rep uint-4-rep int-4-rep ulonglong-2-rep longlong-2-rep } } - } available-reps ; - -: sse1-float-4-shuffle ( dst shuffle -- ) - { - { { 0 1 2 3 } [ drop ] } - { { 0 1 0 1 } [ dup MOVLHPS ] } - { { 2 3 2 3 } [ dup MOVHLPS ] } - { { 0 0 1 1 } [ dup UNPCKLPS ] } - { { 2 2 3 3 } [ dup UNPCKHPS ] } - [ dupd SHUFPS ] - } case ; - -: float-4-shuffle ( dst shuffle -- ) - sse3? [ - { - { { 0 0 2 2 } [ dup MOVSLDUP ] } - { { 1 1 3 3 } [ dup MOVSHDUP ] } - [ sse1-float-4-shuffle ] - } case - ] [ sse1-float-4-shuffle ] if ; - -: int-4-shuffle ( dst shuffle -- ) - { - { { 0 1 2 3 } [ drop ] } - { { 0 0 1 1 } [ dup PUNPCKLDQ ] } - { { 2 2 3 3 } [ dup PUNPCKHDQ ] } - { { 0 1 0 1 } [ dup PUNPCKLQDQ ] } - { { 2 3 2 3 } [ dup PUNPCKHQDQ ] } - [ dupd PSHUFD ] - } case ; - -: longlong-2-shuffle ( dst shuffle -- ) - first2 [ 2 * dup 1 + ] bi@ 4array int-4-shuffle ; - -: >float-4-shuffle ( double-2-shuffle -- float-4-shuffle ) - [ 2 * { 0 1 } n+v ] map concat ; - -M:: x86 %shuffle-vector-imm ( dst src shuffle rep -- ) - dst src rep %copy - dst shuffle rep signed-rep { - { double-2-rep [ >float-4-shuffle float-4-shuffle ] } - { float-4-rep [ float-4-shuffle ] } - { int-4-rep [ int-4-shuffle ] } - { longlong-2-rep [ longlong-2-shuffle ] } - } case ; - -M: x86 %shuffle-vector-imm-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M:: x86 %shuffle-vector-halves-imm ( dst src1 src2 shuffle rep -- ) - dst src1 src2 rep two-operand - shuffle rep { - { double-2-rep [ >float-4-shuffle SHUFPS ] } - { float-4-rep [ SHUFPS ] } - } case ; - -M: x86 %shuffle-vector-halves-imm-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep } } - } available-reps ; - -M: x86 %shuffle-vector ( dst src shuffle rep -- ) - two-operand PSHUFB ; - -M: x86 %shuffle-vector-reps - { - { ssse3? { float-4-rep double-2-rep longlong-2-rep ulonglong-2-rep int-4-rep uint-4-rep short-8-rep ushort-8-rep char-16-rep uchar-16-rep } } - } available-reps ; - -M: x86 %merge-vector-head - [ two-operand ] keep - signed-rep { - { double-2-rep [ MOVLHPS ] } - { float-4-rep [ UNPCKLPS ] } - { longlong-2-rep [ PUNPCKLQDQ ] } - { int-4-rep [ PUNPCKLDQ ] } - { short-8-rep [ PUNPCKLWD ] } - { char-16-rep [ PUNPCKLBW ] } - } case ; - -M: x86 %merge-vector-tail - [ two-operand ] keep - signed-rep { - { double-2-rep [ UNPCKHPD ] } - { float-4-rep [ UNPCKHPS ] } - { longlong-2-rep [ PUNPCKHQDQ ] } - { int-4-rep [ PUNPCKHDQ ] } - { short-8-rep [ PUNPCKHWD ] } - { char-16-rep [ PUNPCKHBW ] } - } case ; - -M: x86 %merge-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %signed-pack-vector - [ two-operand ] keep - { - { int-4-rep [ PACKSSDW ] } - { short-8-rep [ PACKSSWB ] } - } case ; - -M: x86 %signed-pack-vector-reps - { - { sse2? { short-8-rep int-4-rep } } - } available-reps ; - -M: x86 %unsigned-pack-vector - [ two-operand ] keep - signed-rep { - { int-4-rep [ PACKUSDW ] } - { short-8-rep [ PACKUSWB ] } - } case ; - -M: x86 %unsigned-pack-vector-reps - { - { sse2? { short-8-rep } } - { sse4.1? { int-4-rep } } - } available-reps ; - -M: x86 %tail>head-vector ( dst src rep -- ) - dup { - { float-4-rep [ drop UNPCKHPD ] } - { double-2-rep [ drop UNPCKHPD ] } - [ drop [ %copy ] [ drop PUNPCKHQDQ ] 3bi ] - } case ; - -M: x86 %unpack-vector-head ( dst src rep -- ) - { - { char-16-rep [ PMOVSXBW ] } - { uchar-16-rep [ PMOVZXBW ] } - { short-8-rep [ PMOVSXWD ] } - { ushort-8-rep [ PMOVZXWD ] } - { int-4-rep [ PMOVSXDQ ] } - { uint-4-rep [ PMOVZXDQ ] } - { float-4-rep [ CVTPS2PD ] } - } case ; - -M: x86 %unpack-vector-head-reps ( -- reps ) - { - { sse2? { float-4-rep } } - { sse4.1? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } } - } available-reps ; - -M: x86 %integer>float-vector ( dst src rep -- ) - { - { int-4-rep [ CVTDQ2PS ] } - } case ; - -M: x86 %integer>float-vector-reps - { - { sse2? { int-4-rep } } - } available-reps ; - -M: x86 %float>integer-vector ( dst src rep -- ) - { - { float-4-rep [ CVTTPS2DQ ] } - } case ; - -M: x86 %float>integer-vector-reps - { - { sse2? { float-4-rep } } - } available-reps ; - -: (%compare-float-vector) ( dst src rep double single -- ) - [ double-2-rep eq? ] 2dip if ; inline - -: %compare-float-vector ( dst src rep cc -- ) - { - { cc< [ [ CMPLTPD ] [ CMPLTPS ] (%compare-float-vector) ] } - { cc<= [ [ CMPLEPD ] [ CMPLEPS ] (%compare-float-vector) ] } - { cc= [ [ CMPEQPD ] [ CMPEQPS ] (%compare-float-vector) ] } - { cc<>= [ [ CMPORDPD ] [ CMPORDPS ] (%compare-float-vector) ] } - { cc/< [ [ CMPNLTPD ] [ CMPNLTPS ] (%compare-float-vector) ] } - { cc/<= [ [ CMPNLEPD ] [ CMPNLEPS ] (%compare-float-vector) ] } - { cc/= [ [ CMPNEQPD ] [ CMPNEQPS ] (%compare-float-vector) ] } - { cc/<>= [ [ CMPUNORDPD ] [ CMPUNORDPS ] (%compare-float-vector) ] } - } case ; - -:: (%compare-int-vector) ( dst src rep int64 int32 int16 int8 -- ) - rep signed-rep :> rep' - dst src rep' { - { longlong-2-rep [ int64 call ] } - { int-4-rep [ int32 call ] } - { short-8-rep [ int16 call ] } - { char-16-rep [ int8 call ] } - } case ; inline - -: %compare-int-vector ( dst src rep cc -- ) - { - { cc= [ [ PCMPEQQ ] [ PCMPEQD ] [ PCMPEQW ] [ PCMPEQB ] (%compare-int-vector) ] } - { cc> [ [ PCMPGTQ ] [ PCMPGTD ] [ PCMPGTW ] [ PCMPGTB ] (%compare-int-vector) ] } - } case ; - -M: x86 %compare-vector ( dst src1 src2 rep cc -- ) - [ [ two-operand ] keep ] dip - over float-vector-rep? - [ %compare-float-vector ] - [ %compare-int-vector ] if ; - -: %compare-vector-eq-reps ( -- reps ) - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep } } - { sse4.1? { longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -: %compare-vector-ord-reps ( -- reps ) - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep short-8-rep int-4-rep } } - { sse4.2? { longlong-2-rep } } - } available-reps ; - -M: x86 %compare-vector-reps - { - { [ dup { cc= cc/= cc/<>= cc<>= } member-eq? ] [ drop %compare-vector-eq-reps ] } - [ drop %compare-vector-ord-reps ] - } cond ; - -: %compare-float-vector-ccs ( cc -- ccs not? ) - { - { cc< [ { { cc< f } } f ] } - { cc<= [ { { cc<= f } } f ] } - { cc> [ { { cc< t } } f ] } - { cc>= [ { { cc<= t } } f ] } - { cc= [ { { cc= f } } f ] } - { cc<> [ { { cc< f } { cc< t } } f ] } - { cc<>= [ { { cc<>= f } } f ] } - { cc/< [ { { cc/< f } } f ] } - { cc/<= [ { { cc/<= f } } f ] } - { cc/> [ { { cc/< t } } f ] } - { cc/>= [ { { cc/<= t } } f ] } - { cc/= [ { { cc/= f } } f ] } - { cc/<> [ { { cc/= f } { cc/<>= f } } f ] } - { cc/<>= [ { { cc/<>= f } } f ] } - } case ; - -: %compare-int-vector-ccs ( cc -- ccs not? ) - order-cc { - { cc< [ { { cc> t } } f ] } - { cc<= [ { { cc> f } } t ] } - { cc> [ { { cc> f } } f ] } - { cc>= [ { { cc> t } } t ] } - { cc= [ { { cc= f } } f ] } - { cc/= [ { { cc= f } } t ] } - { t [ { } t ] } - { f [ { } f ] } - } case ; - -M: x86 %compare-vector-ccs - swap float-vector-rep? - [ %compare-float-vector-ccs ] - [ %compare-int-vector-ccs ] if ; - -:: %test-vector-mask ( dst temp mask vcc -- ) - vcc { - { vcc-any [ dst dst TEST dst temp \ CMOVNE (%boolean) ] } - { vcc-none [ dst dst TEST dst temp \ CMOVE (%boolean) ] } - { vcc-all [ dst mask CMP dst temp \ CMOVE (%boolean) ] } - { vcc-notall [ dst mask CMP dst temp \ CMOVNE (%boolean) ] } - } case ; - -: %move-vector-mask ( dst src rep -- mask ) - { - { double-2-rep [ MOVMSKPS HEX: f ] } - { float-4-rep [ MOVMSKPS HEX: f ] } - [ drop PMOVMSKB HEX: ffff ] - } case ; - -M:: x86 %test-vector ( dst src temp rep vcc -- ) - dst src rep %move-vector-mask :> mask - dst temp mask vcc %test-vector-mask ; - -:: %test-vector-mask-branch ( label temp mask vcc -- ) - vcc { - { vcc-any [ temp temp TEST label JNE ] } - { vcc-none [ temp temp TEST label JE ] } - { vcc-all [ temp mask CMP label JE ] } - { vcc-notall [ temp mask CMP label JNE ] } - } case ; - -M:: x86 %test-vector-branch ( label src temp rep vcc -- ) - temp src rep %move-vector-mask :> mask - label temp mask vcc %test-vector-mask-branch ; - -M: x86 %test-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %add-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { float-4-rep [ ADDPS ] } - { double-2-rep [ ADDPD ] } - { char-16-rep [ PADDB ] } - { uchar-16-rep [ PADDB ] } - { short-8-rep [ PADDW ] } - { ushort-8-rep [ PADDW ] } - { int-4-rep [ PADDD ] } - { uint-4-rep [ PADDD ] } - { longlong-2-rep [ PADDQ ] } - { ulonglong-2-rep [ PADDQ ] } - } case ; - -M: x86 %add-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %saturated-add-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { char-16-rep [ PADDSB ] } - { uchar-16-rep [ PADDUSB ] } - { short-8-rep [ PADDSW ] } - { ushort-8-rep [ PADDUSW ] } - } case ; - -M: x86 %saturated-add-vector-reps - { - { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } } - } available-reps ; - -M: x86 %add-sub-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { float-4-rep [ ADDSUBPS ] } - { double-2-rep [ ADDSUBPD ] } - } case ; - -M: x86 %add-sub-vector-reps - { - { sse3? { float-4-rep double-2-rep } } - } available-reps ; - -M: x86 %sub-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { float-4-rep [ SUBPS ] } - { double-2-rep [ SUBPD ] } - { char-16-rep [ PSUBB ] } - { uchar-16-rep [ PSUBB ] } - { short-8-rep [ PSUBW ] } - { ushort-8-rep [ PSUBW ] } - { int-4-rep [ PSUBD ] } - { uint-4-rep [ PSUBD ] } - { longlong-2-rep [ PSUBQ ] } - { ulonglong-2-rep [ PSUBQ ] } - } case ; - -M: x86 %sub-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %saturated-sub-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { char-16-rep [ PSUBSB ] } - { uchar-16-rep [ PSUBUSB ] } - { short-8-rep [ PSUBSW ] } - { ushort-8-rep [ PSUBUSW ] } - } case ; - -M: x86 %saturated-sub-vector-reps - { - { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep } } - } available-reps ; - -M: x86 %mul-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { float-4-rep [ MULPS ] } - { double-2-rep [ MULPD ] } - { short-8-rep [ PMULLW ] } - { ushort-8-rep [ PMULLW ] } - { int-4-rep [ PMULLD ] } - { uint-4-rep [ PMULLD ] } - } case ; - -M: x86 %mul-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep short-8-rep ushort-8-rep } } - { sse4.1? { int-4-rep uint-4-rep } } - } available-reps ; - -M: x86 %mul-high-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { short-8-rep [ PMULHW ] } - { ushort-8-rep [ PMULHUW ] } - } case ; - -M: x86 %mul-high-vector-reps - { - { sse2? { short-8-rep ushort-8-rep } } - } available-reps ; - -M: x86 %mul-horizontal-add-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { char-16-rep [ PMADDUBSW ] } - { uchar-16-rep [ PMADDUBSW ] } - { short-8-rep [ PMADDWD ] } - } case ; - -M: x86 %mul-horizontal-add-vector-reps - { - { sse2? { short-8-rep } } - { ssse3? { char-16-rep uchar-16-rep } } - } available-reps ; - -M: x86 %div-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { float-4-rep [ DIVPS ] } - { double-2-rep [ DIVPD ] } - } case ; - -M: x86 %div-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep } } - } available-reps ; - -M: x86 %min-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { char-16-rep [ PMINSB ] } - { uchar-16-rep [ PMINUB ] } - { short-8-rep [ PMINSW ] } - { ushort-8-rep [ PMINUW ] } - { int-4-rep [ PMINSD ] } - { uint-4-rep [ PMINUD ] } - { float-4-rep [ MINPS ] } - { double-2-rep [ MINPD ] } - } case ; - -M: x86 %min-vector-reps - { - { sse? { float-4-rep } } - { sse2? { uchar-16-rep short-8-rep double-2-rep } } - { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } } - } available-reps ; - -M: x86 %max-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { char-16-rep [ PMAXSB ] } - { uchar-16-rep [ PMAXUB ] } - { short-8-rep [ PMAXSW ] } - { ushort-8-rep [ PMAXUW ] } - { int-4-rep [ PMAXSD ] } - { uint-4-rep [ PMAXUD ] } - { float-4-rep [ MAXPS ] } - { double-2-rep [ MAXPD ] } - } case ; - -M: x86 %max-vector-reps - { - { sse? { float-4-rep } } - { sse2? { uchar-16-rep short-8-rep double-2-rep } } - { sse4.1? { char-16-rep ushort-8-rep int-4-rep uint-4-rep } } - } available-reps ; - -M: x86 %avg-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { uchar-16-rep [ PAVGB ] } - { ushort-8-rep [ PAVGW ] } - } case ; - -M: x86 %avg-vector-reps - { - { sse2? { uchar-16-rep ushort-8-rep } } - } available-reps ; - -M: x86 %dot-vector - [ two-operand ] keep - { - { float-4-rep [ HEX: ff DPPS ] } - { double-2-rep [ HEX: ff DPPD ] } - } case ; - -M: x86 %dot-vector-reps - { - { sse4.1? { float-4-rep double-2-rep } } - } available-reps ; - -M: x86 %sad-vector - [ two-operand ] keep - { - { uchar-16-rep [ PSADBW ] } - } case ; - -M: x86 %sad-vector-reps - { - { sse2? { uchar-16-rep } } - } available-reps ; - -M: x86 %horizontal-add-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - signed-rep { - { float-4-rep [ HADDPS ] } - { double-2-rep [ HADDPD ] } - { int-4-rep [ PHADDD ] } - { short-8-rep [ PHADDW ] } - } case ; - -M: x86 %horizontal-add-vector-reps - { - { sse3? { float-4-rep double-2-rep } } - { ssse3? { int-4-rep uint-4-rep short-8-rep ushort-8-rep } } - } available-reps ; - -M: x86 %horizontal-shl-vector-imm ( dst src1 src2 rep -- ) - two-operand PSLLDQ ; - -M: x86 %horizontal-shl-vector-imm-reps - { - { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep float-4-rep double-2-rep } } - } available-reps ; - -M: x86 %horizontal-shr-vector-imm ( dst src1 src2 rep -- ) - two-operand PSRLDQ ; - -M: x86 %horizontal-shr-vector-imm-reps - { - { sse2? { char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep float-4-rep double-2-rep } } - } available-reps ; - -M: x86 %abs-vector ( dst src rep -- ) - { - { char-16-rep [ PABSB ] } - { short-8-rep [ PABSW ] } - { int-4-rep [ PABSD ] } - } case ; - -M: x86 %abs-vector-reps - { - { ssse3? { char-16-rep short-8-rep int-4-rep } } - } available-reps ; - -M: x86 %sqrt-vector ( dst src rep -- ) - { - { float-4-rep [ SQRTPS ] } - { double-2-rep [ SQRTPD ] } - } case ; - -M: x86 %sqrt-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep } } - } available-reps ; - -M: x86 %and-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { float-4-rep [ ANDPS ] } - { double-2-rep [ ANDPS ] } - [ drop PAND ] - } case ; - -M: x86 %and-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %andn-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { float-4-rep [ ANDNPS ] } - { double-2-rep [ ANDNPS ] } - [ drop PANDN ] - } case ; - -M: x86 %andn-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %or-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { float-4-rep [ ORPS ] } - { double-2-rep [ ORPS ] } - [ drop POR ] - } case ; - -M: x86 %or-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %xor-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { float-4-rep [ XORPS ] } - { double-2-rep [ XORPS ] } - [ drop PXOR ] - } case ; - -M: x86 %xor-vector-reps - { - { sse? { float-4-rep } } - { sse2? { double-2-rep char-16-rep uchar-16-rep short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %shl-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { short-8-rep [ PSLLW ] } - { ushort-8-rep [ PSLLW ] } - { int-4-rep [ PSLLD ] } - { uint-4-rep [ PSLLD ] } - { longlong-2-rep [ PSLLQ ] } - { ulonglong-2-rep [ PSLLQ ] } - } case ; - -M: x86 %shl-vector-reps - { - { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep longlong-2-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %shr-vector ( dst src1 src2 rep -- ) - [ two-operand ] keep - { - { short-8-rep [ PSRAW ] } - { ushort-8-rep [ PSRLW ] } - { int-4-rep [ PSRAD ] } - { uint-4-rep [ PSRLD ] } - { ulonglong-2-rep [ PSRLQ ] } - } case ; - -M: x86 %shr-vector-reps - { - { sse2? { short-8-rep ushort-8-rep int-4-rep uint-4-rep ulonglong-2-rep } } - } available-reps ; - -M: x86 %shl-vector-imm %shl-vector ; -M: x86 %shl-vector-imm-reps %shl-vector-reps ; -M: x86 %shr-vector-imm %shr-vector ; -M: x86 %shr-vector-imm-reps %shr-vector-reps ; - -: scalar-sized-reg ( reg rep -- reg' ) - rep-size 8 * n-bit-version-of ; - -M: x86 %integer>scalar drop MOVD ; - -:: %scalar>integer-32 ( dst src rep -- ) - rep { - { int-scalar-rep [ - dst 32-bit-version-of src MOVD - dst dst 32-bit-version-of - 2dup eq? [ 2drop ] [ MOVSX ] if - ] } - { uint-scalar-rep [ - dst 32-bit-version-of src MOVD - ] } - { short-scalar-rep [ - dst 32-bit-version-of src MOVD - dst dst 16-bit-version-of MOVSX - ] } - { ushort-scalar-rep [ - dst 32-bit-version-of src MOVD - dst dst 16-bit-version-of MOVZX - ] } - { char-scalar-rep [ - dst 32-bit-version-of src MOVD - dst { } 8 [| tmp-dst | - tmp-dst dst int-rep %copy - tmp-dst tmp-dst 8-bit-version-of MOVSX - dst tmp-dst int-rep %copy - ] with-small-register - ] } - { uchar-scalar-rep [ - dst 32-bit-version-of src MOVD - dst { } 8 [| tmp-dst | - tmp-dst dst int-rep %copy - tmp-dst tmp-dst 8-bit-version-of MOVZX - dst tmp-dst int-rep %copy - ] with-small-register - ] } - } case ; - -M: x86.32 %scalar>integer ( dst src rep -- ) %scalar>integer-32 ; - -M: x86.64 %scalar>integer ( dst src rep -- ) - { - { longlong-scalar-rep [ MOVD ] } - { ulonglong-scalar-rep [ MOVD ] } - [ %scalar>integer-32 ] - } case ; - -M: x86 %vector>scalar %copy ; - -M: x86 %scalar>vector %copy ; - M:: x86 %spill ( src rep dst -- ) dst src rep %copy ; @@ -1590,18 +622,84 @@ M: x86 immediate-arithmetic? ( n -- ? ) M: x86 immediate-bitwise? ( n -- ? ) HEX: -80000000 HEX: 7fffffff between? ; +: %cmov-float= ( dst src -- ) + [ + "no-move" define-label + + "no-move" get [ JNE ] [ JP ] bi + MOV + "no-move" resolve-label + ] with-scope ; + +: %cmov-float/= ( dst src -- ) + [ + "no-move" define-label + "move" define-label + + "move" get JP + "no-move" get JE + "move" resolve-label + MOV + "no-move" resolve-label + ] with-scope ; + +:: (%compare-float) ( dst src1 src2 cc temp compare -- ) + cc { + { cc< [ src2 src1 \ compare call( a b -- ) dst temp \ CMOVA (%boolean) ] } + { cc<= [ src2 src1 \ compare call( a b -- ) dst temp \ CMOVAE (%boolean) ] } + { cc> [ src1 src2 \ compare call( a b -- ) dst temp \ CMOVA (%boolean) ] } + { cc>= [ src1 src2 \ compare call( a b -- ) dst temp \ CMOVAE (%boolean) ] } + { cc= [ src1 src2 \ compare call( a b -- ) dst temp \ %cmov-float= (%boolean) ] } + { cc<> [ src1 src2 \ compare call( a b -- ) dst temp \ CMOVNE (%boolean) ] } + { cc<>= [ src1 src2 \ compare call( a b -- ) dst temp \ CMOVNP (%boolean) ] } + { cc/< [ src2 src1 \ compare call( a b -- ) dst temp \ CMOVBE (%boolean) ] } + { cc/<= [ src2 src1 \ compare call( a b -- ) dst temp \ CMOVB (%boolean) ] } + { cc/> [ src1 src2 \ compare call( a b -- ) dst temp \ CMOVBE (%boolean) ] } + { cc/>= [ src1 src2 \ compare call( a b -- ) dst temp \ CMOVB (%boolean) ] } + { cc/= [ src1 src2 \ compare call( a b -- ) dst temp \ %cmov-float/= (%boolean) ] } + { cc/<> [ src1 src2 \ compare call( a b -- ) dst temp \ CMOVE (%boolean) ] } + { cc/<>= [ src1 src2 \ compare call( a b -- ) dst temp \ CMOVP (%boolean) ] } + } case ; inline + +: %jump-float= ( label -- ) + [ + "no-jump" define-label + "no-jump" get JP + JE + "no-jump" resolve-label + ] with-scope ; + +: %jump-float/= ( label -- ) + [ JNE ] [ JP ] bi ; + +:: (%compare-float-branch) ( label src1 src2 cc compare -- ) + cc { + { cc< [ src2 src1 \ compare call( a b -- ) label JA ] } + { cc<= [ src2 src1 \ compare call( a b -- ) label JAE ] } + { cc> [ src1 src2 \ compare call( a b -- ) label JA ] } + { cc>= [ src1 src2 \ compare call( a b -- ) label JAE ] } + { cc= [ src1 src2 \ compare call( a b -- ) label %jump-float= ] } + { cc<> [ src1 src2 \ compare call( a b -- ) label JNE ] } + { cc<>= [ src1 src2 \ compare call( a b -- ) label JNP ] } + { cc/< [ src2 src1 \ compare call( a b -- ) label JBE ] } + { cc/<= [ src2 src1 \ compare call( a b -- ) label JB ] } + { cc/> [ src1 src2 \ compare call( a b -- ) label JBE ] } + { cc/>= [ src1 src2 \ compare call( a b -- ) label JB ] } + { cc/= [ src1 src2 \ compare call( a b -- ) label %jump-float/= ] } + { cc/<> [ src1 src2 \ compare call( a b -- ) label JE ] } + { cc/<>= [ src1 src2 \ compare call( a b -- ) label JP ] } + } case ; + enable-min/max enable-log2 enable-float-intrinsics enable-float-functions -enable-float-min/max enable-fsqrt : check-sse ( -- ) - [ { (sse-version) popcnt? } compile ] with-optimizer - sse-version 20 < [ - "Factor requires SSE2, which your CPU does not support." print - flush - 1 exit - ] when ; + "Checking for multimedia extensions... " write flush + [ { (sse-version) } compile ] with-optimizer + sse-version + [ sse-string " detected" append print ] + [ 20 < "cpu.x86.x87" "cpu.x86.sse" ? require ] bi ; diff --git a/basis/cpu/x86/x87/authors.txt b/basis/cpu/x86/x87/authors.txt new file mode 100644 index 0000000000..1901f27a24 --- /dev/null +++ b/basis/cpu/x86/x87/authors.txt @@ -0,0 +1 @@ +Slava Pestov diff --git a/basis/cpu/x86/x87/tags.txt b/basis/cpu/x86/x87/tags.txt new file mode 100644 index 0000000000..ebb74b4d5f --- /dev/null +++ b/basis/cpu/x86/x87/tags.txt @@ -0,0 +1 @@ +not loaded diff --git a/basis/cpu/x86/x87/x87.factor b/basis/cpu/x86/x87/x87.factor new file mode 100644 index 0000000000..8c920ea87b --- /dev/null +++ b/basis/cpu/x86/x87/x87.factor @@ -0,0 +1,91 @@ +! Copyright (C) 2010 Slava Pestov. +! See http://factorcode.org/license.txt for BSD license. +USING: alien.c-types combinators kernel locals system namespaces +compiler.codegen.fixup compiler.constants +compiler.cfg.comparisons cpu.architecture cpu.x86 +cpu.x86.assembler cpu.x86.assembler.operands ; +IN: cpu.x86.x87 + +! x87 unit is only used if SSE2 is not available. + +: FLD* ( src -- ) [ ST0 ] dip FLD ; +: FSTP* ( dst -- ) ST0 FSTP ; + +: copy-register-x87 ( dst src -- ) + 2dup eq? [ 2drop ] [ FLD* shuffle-down FSTP* ] if ; + +M: float-rep copy-register* drop copy-register-x87 ; +M: double-rep copy-register* drop copy-register-x87 ; + +: load-x87 ( dst src rep -- ) + { + { float-rep [ FLDS shuffle-down FSTP* ] } + { double-rep [ FLDL shuffle-down FSTP* ] } + } case ; + +: store-x87 ( dst src rep -- ) + { + { float-rep [ FLD* FSTPS ] } + { double-rep [ FLD* FSTPL ] } + } case ; + +: copy-memory-x87 ( dst src rep -- ) + { + { [ pick register? ] [ load-x87 ] } + { [ over register? ] [ store-x87 ] } + } cond ; + +M: float-rep copy-memory* copy-memory-x87 ; +M: double-rep copy-memory* copy-memory-x87 ; + +M: x86 %load-float + 0 [] FLDS + rc-absolute rel-binary-literal + shuffle-down FSTP* ; + +M: x86 %load-double + 0 [] FLDL + rc-absolute rel-binary-literal + shuffle-down FSTP* ; + +:: binary-op ( dst src1 src2 quot -- ) + src1 FLD* + ST0 src2 shuffle-down quot call + dst shuffle-down FSTP* ; inline + +M: x86 %add-float [ FADD ] binary-op ; +M: x86 %sub-float [ FSUB ] binary-op ; +M: x86 %mul-float [ FMUL ] binary-op ; +M: x86 %div-float [ FDIV ] binary-op ; + +M: x86 %sqrt FLD* FSQRT shuffle-down FSTP* ; + +M: x86 %single>double-float copy-register-x87 ; +M: x86 %double>single-float copy-register-x87 ; + +M: x86 integer-float-needs-stack-frame? t ; + +M:: x86 %integer>float ( dst src -- ) + 4 stack@ src MOV + 4 stack@ FILDD + dst shuffle-down FSTP* ; + +M:: x86 %float>integer ( dst src -- ) + src FLD* + 4 stack@ FISTTPD + dst 4 stack@ MOV ; + +: compare-op ( src1 src2 quot -- ) + [ ST0 ] 3dip binary-op ; inline + +M: x86 %compare-float-ordered ( dst src1 src2 cc temp -- ) + [ [ FCOMI ] compare-op ] (%compare-float) ; + +M: x86 %compare-float-unordered ( dst src1 src2 cc temp -- ) + [ [ FUCOMI ] compare-op ] (%compare-float) ; + +M: x86 %compare-float-ordered-branch ( label src1 src2 cc -- ) + [ [ FCOMI ] compare-op ] (%compare-float-branch) ; + +M: x86 %compare-float-unordered-branch ( label src1 src2 cc -- ) + [ [ FUCOMI ] compare-op ] (%compare-float-branch) ;