From 2b1a26228be743e7a9e4e33fd2d14acac82ecc87 Mon Sep 17 00:00:00 2001 From: Slava Pestov Date: Tue, 3 Nov 2009 23:51:44 -0600 Subject: [PATCH] Align stack pointer on non-Mac OS X x86-32 platforms, and use aligned loads/stores for SIMD values --- basis/cpu/x86/32/32.factor | 3 --- basis/cpu/x86/x86.factor | 18 ++++++++---------- basis/math/vectors/simd/functor/functor.factor | 4 ++-- basis/math/vectors/simd/simd-tests.factor | 17 +++++++++++++++++ vm/cpu-x86.32.S | 2 -- vm/cpu-x86.64.S | 4 ---- vm/cpu-x86.S | 15 ++++++++++++--- 7 files changed, 39 insertions(+), 24 deletions(-) diff --git a/basis/cpu/x86/32/32.factor b/basis/cpu/x86/32/32.factor index cff5c561c8..8a29c82dad 100755 --- a/basis/cpu/x86/32/32.factor +++ b/basis/cpu/x86/32/32.factor @@ -11,9 +11,6 @@ cpu.x86.assembler cpu.x86.assembler.operands cpu.x86 cpu.architecture ; IN: cpu.x86.32 -! We implement the FFI for Linux, OS X and Windows all at once. -! OS X requires that the stack be 16-byte aligned. - M: x86.32 machine-registers { { int-regs { EAX ECX EDX EBP EBX } } diff --git a/basis/cpu/x86/x86.factor b/basis/cpu/x86/x86.factor index 5cd9ab2199..a63b92e050 100644 --- a/basis/cpu/x86/x86.factor +++ b/basis/cpu/x86/x86.factor @@ -45,8 +45,7 @@ HOOK: extra-stack-space cpu ( stack-frame -- n ) : incr-stack-reg ( n -- ) dup 0 = [ drop ] [ stack-reg swap ADD ] if ; -: align-stack ( n -- n' ) - os macosx? cpu x86.64? or [ 16 align ] when ; +: align-stack ( n -- n' ) 16 align ; M: x86 stack-frame-size ( stack-frame -- i ) [ (stack-frame-size) ] @@ -141,8 +140,10 @@ M: x86 %not int-rep one-operand NOT ; M: x86 %neg int-rep one-operand NEG ; M: x86 %log2 BSR ; +! A bit of logic to avoid using MOVSS/MOVSD for reg-reg moves +! 
since this induces partial register stalls GENERIC: copy-register* ( dst src rep -- ) -GENERIC: copy-unaligned* ( dst src rep -- ) +GENERIC: copy-memory* ( dst src rep -- ) M: int-rep copy-register* drop MOV ; M: tagged-rep copy-register* drop MOV ; @@ -152,17 +153,14 @@ M: float-4-rep copy-register* drop MOVAPS ; M: double-2-rep copy-register* drop MOVAPS ; M: vector-rep copy-register* drop MOVDQA ; -M: object copy-unaligned* copy-register* ; -M: float-rep copy-unaligned* drop MOVSS ; -M: double-rep copy-unaligned* drop MOVSD ; -M: float-4-rep copy-unaligned* drop MOVUPS ; -M: double-2-rep copy-unaligned* drop MOVUPS ; -M: vector-rep copy-unaligned* drop MOVDQU ; +M: object copy-memory* copy-register* ; +M: float-rep copy-memory* drop MOVSS ; +M: double-rep copy-memory* drop MOVSD ; M: x86 %copy ( dst src rep -- ) 2over eq? [ 3drop ] [ [ [ dup spill-slot? [ n>> spill@ ] when ] bi@ ] dip - 2over [ register? ] both? [ copy-register* ] [ copy-unaligned* ] if + 2over [ register? ] both? [ copy-register* ] [ copy-memory* ] if ] if ; M: x86 %fixnum-add ( label dst src1 src2 -- ) diff --git a/basis/math/vectors/simd/functor/functor.factor b/basis/math/vectors/simd/functor/functor.factor index 480981d165..44907df68e 100644 --- a/basis/math/vectors/simd/functor/functor.factor +++ b/basis/math/vectors/simd/functor/functor.factor @@ -146,7 +146,7 @@ TUPLE: simd class elt-class ops special-wrappers schema-wrappers ctor rep ; [ rep alien-vector class boa ] >>getter [ [ underlying>> ] 2dip rep set-alien-vector ] >>setter 16 >>size - 8 >>align + 16 >>align rep >>rep class c:typedef ; @@ -315,7 +315,7 @@ SLOT: underlying2 3bi ] >>setter 32 >>size - 8 >>align + 16 >>align rep >>rep class c:typedef ; diff --git a/basis/math/vectors/simd/simd-tests.factor b/basis/math/vectors/simd/simd-tests.factor index 7ba9f243ce..396b8da22a 100644 --- a/basis/math/vectors/simd/simd-tests.factor +++ b/basis/math/vectors/simd/simd-tests.factor @@ -582,3 +582,20 @@ STRUCT: simd-struct float-4{ 1.0 
0.0 1.0 0.0 } pi [ broken 3array ] [ compile-call ] [ call ] 3bi = ] unit-test + +! Spilling SIMD values -- this basically just tests that the +! stack was aligned properly by the runtime + +: simd-spill-test-1 ( a b c -- v ) + { float-4 float-4 float } declare + [ v+ ] dip sin v*n ; + +[ float-4{ 0 0 0 0 } ] +[ float-4{ 1 2 3 4 } float-4{ 4 5 6 7 } 0.0 simd-spill-test-1 ] unit-test + +: simd-spill-test-2 ( a b d c -- v ) + { float float-4 float-4 float } declare + [ [ 3.0 + ] 2dip v+ ] dip sin v*n n*v ; + +[ float-4{ 0 0 0 0 } ] +[ 5.0 float-4{ 1 2 3 4 } float-4{ 4 5 6 7 } 0.0 simd-spill-test-2 ] unit-test diff --git a/vm/cpu-x86.32.S b/vm/cpu-x86.32.S index 2e85be0f81..c0532f0ece 100644 --- a/vm/cpu-x86.32.S +++ b/vm/cpu-x86.32.S @@ -19,11 +19,9 @@ #define PUSH_NONVOLATILE \ push %ebx ; \ - push %ebp ; \ push %ebp #define POP_NONVOLATILE \ - pop %ebp ; \ pop %ebp ; \ pop %ebx diff --git a/vm/cpu-x86.64.S b/vm/cpu-x86.64.S index 5e307f0500..8ccd703bfe 100644 --- a/vm/cpu-x86.64.S +++ b/vm/cpu-x86.64.S @@ -27,11 +27,9 @@ push %rdi ; \ push %rsi ; \ push %rbx ; \ - push %rbp ; \ push %rbp #define POP_NONVOLATILE \ - pop %rbp ; \ pop %rbp ; \ pop %rbx ; \ pop %rsi ; \ @@ -50,11 +48,9 @@ push %rbx ; \ push %rbp ; \ push %r12 ; \ - push %r13 ; \ push %r13 #define POP_NONVOLATILE \ - pop %r13 ; \ pop %r13 ; \ pop %r12 ; \ pop %rbp ; \ diff --git a/vm/cpu-x86.S b/vm/cpu-x86.S index 846300120d..411a0cdaa6 100644 --- a/vm/cpu-x86.S +++ b/vm/cpu-x86.S @@ -43,14 +43,20 @@ DEF(F_FASTCALL void,c_to_factor,(CELL quot, void *vm)): PUSH_NONVOLATILE mov ARG0,NV0 mov ARG1,NV1 - + + /* Save old stack pointer and align. Step down 16 bytes minus one cell (never up), so the push below cannot overwrite a slot written by PUSH_NONVOLATILE even when STACK_REG is already 16-byte aligned, and STACK_REG is 16-byte aligned after the push */ + mov STACK_REG,ARG0 + and $-16,STACK_REG + sub $(16-CELL_SIZE),STACK_REG + push ARG0 + /* Create register shadow area for Win64 */ sub $32,STACK_REG - + /* Save stack pointer */ lea -CELL_SIZE(STACK_REG),ARG0 call MANGLE(save_callstack_bottom) - + /* Call quot-xt */ mov NV0,ARG0 mov NV1,ARG1 @@ -59,6 +65,9 @@ DEF(F_FASTCALL void,c_to_factor,(CELL quot, void *vm)): /* 
Tear down register shadow area */ add $32,STACK_REG + /* Undo stack alignment */ + mov (STACK_REG),STACK_REG + POP_NONVOLATILE ret