Align stack pointer on non-Mac OS X x86-32 platforms, and use aligned loads/stores for SIMD values

2009-11-03 23:51:44 -06:00 · 2009-11-03 23:51:44 -06:00 · 2b1a26228b
parent 3c4c05e915
commit 2b1a26228b
7 changed files with 39 additions and 24 deletions
--- a/basis/cpu/x86/32/32.factor
+++ b/basis/cpu/x86/32/32.factor
@@ -11,9 +11,6 @@ cpu.x86.assembler cpu.x86.assembler.operands cpu.x86
 cpu.architecture ;
 IN: cpu.x86.32

-! We implement the FFI for Linux, OS X and Windows all at once.
-! OS X requires that the stack be 16-byte aligned.
-
 M: x86.32 machine-registers
    {
        { int-regs { EAX ECX EDX EBP EBX } }
--- a/basis/cpu/x86/x86.factor
+++ b/basis/cpu/x86/x86.factor
@@ -45,8 +45,7 @@ HOOK: extra-stack-space cpu ( stack-frame -- n )
 : incr-stack-reg ( n -- )
    dup 0 = [ drop ] [ stack-reg swap ADD ] if ;

-: align-stack ( n -- n' )
-    os macosx? cpu x86.64? or [ 16 align ] when ;
+: align-stack ( n -- n' ) 16 align ;

 M: x86 stack-frame-size ( stack-frame -- i )
    [ (stack-frame-size) ]
@@ -141,8 +140,10 @@ M: x86 %not     int-rep one-operand NOT ;
 M: x86 %neg     int-rep one-operand NEG ;
 M: x86 %log2    BSR ;

+! A bit of logic to avoid using MOVSS/MOVSD for reg-reg moves
+! since this induces partial register stalls
 GENERIC: copy-register* ( dst src rep -- )
-GENERIC: copy-unaligned* ( dst src rep -- )
+GENERIC: copy-memory* ( dst src rep -- )

 M: int-rep copy-register* drop MOV ;
 M: tagged-rep copy-register* drop MOV ;
@@ -152,17 +153,14 @@ M: float-4-rep copy-register* drop MOVAPS ;
 M: double-2-rep copy-register* drop MOVAPS ;
 M: vector-rep copy-register* drop MOVDQA ;

-M: object copy-unaligned* copy-register* ;
-M: float-rep copy-unaligned* drop MOVSS ;
-M: double-rep copy-unaligned* drop MOVSD ;
-M: float-4-rep copy-unaligned* drop MOVUPS ;
-M: double-2-rep copy-unaligned* drop MOVUPS ;
-M: vector-rep copy-unaligned* drop MOVDQU ;
+M: object copy-memory* copy-register* ;
+M: float-rep copy-memory* drop MOVSS ;
+M: double-rep copy-memory* drop MOVSD ;

 M: x86 %copy ( dst src rep -- )
    2over eq? [ 3drop ] [
        [ [ dup spill-slot? [ n>> spill@ ] when ] bi@ ] dip
-        2over [ register? ] both? [ copy-register* ] [ copy-unaligned* ] if
+        2over [ register? ] both? [ copy-register* ] [ copy-memory* ] if
    ] if ;

 M: x86 %fixnum-add ( label dst src1 src2 -- )
--- a/basis/math/vectors/simd/functor/functor.factor
+++ b/basis/math/vectors/simd/functor/functor.factor
@@ -146,7 +146,7 @@ TUPLE: simd class elt-class ops special-wrappers schema-wrappers ctor rep ;
        [ rep alien-vector class boa ] >>getter
        [ [ underlying>> ] 2dip rep set-alien-vector ] >>setter
        16 >>size
-        8 >>align
+        16 >>align
        rep >>rep
    class c:typedef ;

@@ -315,7 +315,7 @@ SLOT: underlying2
            3bi
        ] >>setter
        32 >>size
-        8 >>align
+        16 >>align
        rep >>rep
    class c:typedef ;

--- a/basis/math/vectors/simd/simd-tests.factor
+++ b/basis/math/vectors/simd/simd-tests.factor
@@ -582,3 +582,20 @@ STRUCT: simd-struct
    float-4{ 1.0 0.0 1.0 0.0 } pi [ broken 3array ]
    [ compile-call ] [ call ] 3bi =
 ] unit-test
+
+! Spilling SIMD values -- this basically just tests that the
+! stack was aligned properly by the runtime
+
+: simd-spill-test-1 ( a b c -- v )
+    { float-4 float-4 float } declare 
+    [ v+ ] dip sin v*n ;
+
+[ float-4{ 0 0 0 0 } ]
+[ float-4{ 1 2 3 4 } float-4{ 4 5 6 7 } 0.0 simd-spill-test-1 ] unit-test
+
+: simd-spill-test-2 ( a b d c -- v )
+    { float float-4 float-4 float } declare 
+    [ [ 3.0 + ] 2dip v+ ] dip sin v*n n*v ;
+
+[ float-4{ 0 0 0 0 } ]
+[ 5.0 float-4{ 1 2 3 4 } float-4{ 4 5 6 7 } 0.0 simd-spill-test-2 ] unit-test
--- a/vm/cpu-x86.32.S
+++ b/vm/cpu-x86.32.S
@@ -19,11 +19,9 @@

 #define PUSH_NONVOLATILE \
 	push %ebx ; \
-	push %ebp ; \
 	push %ebp

 #define POP_NONVOLATILE \
-	pop %ebp ; \
 	pop %ebp ; \
 	pop %ebx

--- a/vm/cpu-x86.64.S
+++ b/vm/cpu-x86.64.S
@@ -27,11 +27,9 @@
 		push %rdi ; \
 		push %rsi ; \
 		push %rbx ; \
-		push %rbp ; \
 		push %rbp

 	#define POP_NONVOLATILE \
-		pop %rbp ; \
 		pop %rbp ; \
 		pop %rbx ; \
 		pop %rsi ; \
@@ -50,11 +48,9 @@
 		push %rbx ; \
 		push %rbp ; \
 		push %r12 ; \
-		push %r13 ; \
 		push %r13

 	#define POP_NONVOLATILE \
-		pop %r13 ; \
 		pop %r13 ; \
 		pop %r12 ; \
 		pop %rbp ; \
--- a/vm/cpu-x86.S
+++ b/vm/cpu-x86.S
@@ -43,14 +43,20 @@ DEF(F_FASTCALL void,c_to_factor,(CELL quot, void *vm)):
 	PUSH_NONVOLATILE
 	mov ARG0,NV0
 	mov ARG1,NV1
-	
+
+    /* Save old stack pointer and align */
+    mov STACK_REG,ARG0
+    and $-16,STACK_REG
+    add $CELL_SIZE,STACK_REG
+    push ARG0
+
 	/* Create register shadow area for Win64 */
 	sub $32,STACK_REG
-	
+
 	/* Save stack pointer */
 	lea -CELL_SIZE(STACK_REG),ARG0
 	call MANGLE(save_callstack_bottom)
-	
+
 	/* Call quot-xt */
 	mov NV0,ARG0
 	mov NV1,ARG1
@@ -59,6 +65,9 @@ DEF(F_FASTCALL void,c_to_factor,(CELL quot, void *vm)):
 	/* Tear down register shadow area */
 	add $32,STACK_REG

+    /* Undo stack alignment */
+    mov (STACK_REG),STACK_REG
+
 	POP_NONVOLATILE
 	ret