Optimizing and cleaning up unicode.breaks and unicode.normalize

db4
Daniel Ehrenberg 2009-01-09 15:53:35 -06:00
parent 1ed964e539
commit fb25d04061
2 changed files with 72 additions and 75 deletions

View File

@ -4,7 +4,8 @@ USING: combinators.short-circuit unicode.categories kernel math
combinators splitting sequences math.parser io.files io assocs combinators splitting sequences math.parser io.files io assocs
arrays namespaces make math.ranges unicode.normalize.private values arrays namespaces make math.ranges unicode.normalize.private values
io.encodings.ascii unicode.syntax unicode.data compiler.units fry io.encodings.ascii unicode.syntax unicode.data compiler.units fry
alien.syntax sets accessors interval-maps memoize locals words ; alien.syntax sets accessors interval-maps memoize locals words
strings hints ;
IN: unicode.breaks IN: unicode.breaks
<PRIVATE <PRIVATE
@ -58,38 +59,31 @@ SYMBOL: table
: finish-table ( -- table ) : finish-table ( -- table )
table get [ [ 1 = ] map ] map ; table get [ [ 1 = ] map ] map ;
: set-table ( class1 class2 val -- ) : eval-seq ( seq -- seq ) [ dup word? [ execute ] when ] map ;
: (set-table) ( class1 class2 val -- )
-rot table get nth [ swap or ] change-nth ; -rot table get nth [ swap or ] change-nth ;
: set-table ( classes1 classes2 val -- )
[ [ eval-seq ] bi@ ] dip
[ [ (set-table) ] curry with each ] 2curry each ;
: connect ( class1 class2 -- ) 1 set-table ; : connect ( class1 class2 -- ) 1 set-table ;
: disconnect ( class1 class2 -- ) 0 set-table ; : disconnect ( class1 class2 -- ) 0 set-table ;
: check-before ( class classes value -- )
[ set-table ] curry with each ;
: check-after ( classes class value -- )
[ set-table ] 2curry each ;
: connect-before ( class classes -- )
1 check-before ;
: connect-after ( classes class -- )
1 check-after ;
: break-around ( classes1 classes2 -- ) : break-around ( classes1 classes2 -- )
[ [ 2dup disconnect swap disconnect ] with each ] curry each ; [ disconnect ] [ swap disconnect ] 2bi ;
: make-grapheme-table ( -- ) : make-grapheme-table ( -- )
CR LF connect { CR } { LF } connect
Control CR LF 3array graphemes break-around { Control CR LF } graphemes disconnect
L L V LV LVT 4array connect-before graphemes { Control CR LF } disconnect
V V T 2array connect-before { L } { L V LV LVT } connect
LV V T 2array connect-before { LV V } { V T } connect
T T connect { LVT T } { T } connect
LVT T connect graphemes { Extend } connect
graphemes Extend connect-after graphemes { SpacingMark } connect
graphemes SpacingMark connect-after { Prepend } graphemes connect ;
Prepend graphemes connect-before ;
VALUE: grapheme-table VALUE: grapheme-table
@ -99,14 +93,11 @@ VALUE: grapheme-table
: chars ( i str n -- str[i] str[i+n] ) : chars ( i str n -- str[i] str[i+n] )
swap [ dupd + ] dip [ ?nth ] curry bi@ ; swap [ dupd + ] dip [ ?nth ] curry bi@ ;
: find-index ( seq quot -- i ) find drop ; inline
: find-last-index ( seq quot -- i ) find-last drop ; inline
PRIVATE> PRIVATE>
: first-grapheme ( str -- i ) : first-grapheme ( str -- i )
unclip-slice grapheme-class over unclip-slice grapheme-class over
[ grapheme-class tuck grapheme-break? ] find-index [ grapheme-class tuck grapheme-break? ] find drop
nip swap length or 1+ ; nip swap length or 1+ ;
<PRIVATE <PRIVATE
@ -125,7 +116,7 @@ PRIVATE>
: last-grapheme ( str -- i ) : last-grapheme ( str -- i )
unclip-last-slice grapheme-class swap unclip-last-slice grapheme-class swap
[ grapheme-class dup rot grapheme-break? ] find-last-index ?1+ nip ; [ grapheme-class dup rot grapheme-break? ] find-last drop ?1+ nip ;
<PRIVATE <PRIVATE
@ -156,27 +147,23 @@ wMidNum wMidNumLet wNumeric wExtendNumLet words ;
word-break-table interval-at word-break-table interval-at
word-break-classes at [ wOther ] unless* ; word-break-classes at [ wOther ] unless* ;
: e ( seq -- seq ) [ execute ] map ;
SYMBOL: check-letter-before SYMBOL: check-letter-before
SYMBOL: check-letter-after SYMBOL: check-letter-after
SYMBOL: check-number-before SYMBOL: check-number-before
SYMBOL: check-number-after SYMBOL: check-number-after
: make-word-table ( -- ) : make-word-table ( -- )
wCR wLF connect { wCR } { wLF } connect
{ wNewline wCR wLF } e words break-around { wNewline wCR wLF } words disconnect
wALetter dup connect words { wNewline wCR wLF } disconnect
wALetter { wMidLetter wMidNumLet } e check-letter-after check-before { wALetter } { wMidLetter wMidNumLet } check-letter-after set-table
{ wMidLetter wMidNumLet } e wALetter check-letter-before check-after { wMidLetter wMidNumLet } { wALetter } check-letter-before set-table
wNumeric dup connect { wNumeric wALetter } { wNumeric wALetter } connect
wALetter wNumeric connect { wNumeric } { wMidNum wMidNumLet } check-number-after set-table
wNumeric wALetter connect { wMidNum wMidNumLet } { wNumeric } check-number-before set-table
wNumeric { wMidNum wMidNumLet } e check-number-after check-before { wKatakana } { wKatakana } connect
{ wMidNum wMidNumLet } e wNumeric check-number-before check-after { wALetter wNumeric wKatakana wExtendNumLet } { wExtendNumLet }
wKatakana dup connect [ connect ] [ swap connect ] 2bi ;
{ wALetter wNumeric wKatakana wExtendNumLet } e wExtendNumLet
[ connect-after ] [ swap connect-before ] 2bi ;
VALUE: word-table VALUE: word-table
@ -192,7 +179,7 @@ to: word-table
: word-table-nth ( class1 class2 -- ? ) : word-table-nth ( class1 class2 -- ? )
word-table nth nth ; word-table nth nth ;
:: property-not= ( i str property -- ? ) :: property-not= ( str i property -- ? )
i [ i [
i str ?nth [ word-break-prop property = not ] i str ?nth [ word-break-prop property = not ]
[ f ] if* [ f ] if*
@ -201,41 +188,49 @@ to: word-table
: format/extended? ( ch -- ? ) : format/extended? ( ch -- ? )
word-break-prop { 4 5 } member? ; word-break-prop { 4 5 } member? ;
:: walk-up ( str i -- j ) : (walk-up) ( str i -- j )
i 1 + str [ format/extended? not ] find-from drop swap [ format/extended? not ] find-from drop ;
[ 1+ str [ format/extended? not ] find-from drop ] [ f ] if* ;
:: walk-down ( str i -- j ) : walk-up ( str i -- j )
i str [ format/extended? not ] find-last-from drop dupd 1+ (walk-up) [ 1+ (walk-up) ] [ drop f ] if* ;
[ 1- str [ format/extended? not ] find-last-from drop ] [ f ] if* ;
:: word-break? ( table-entry i str -- ? ) : (walk-down) ( str i -- j )
table-entry { swap [ format/extended? not ] find-last-from drop ;
{ t [ f ] }
{ f [ t ] } : walk-down ( str i -- j )
dupd (walk-down) [ 1- (walk-down) ] [ drop f ] if* ;
: word-break? ( table-entry i str -- ? )
spin {
{ t [ 2drop f ] }
{ f [ 2drop t ] }
{ check-letter-after { check-letter-after
[ str i walk-up str wALetter property-not= ] } [ dupd walk-up wALetter property-not= ] }
{ check-letter-before { check-letter-before
[ str i walk-down str wALetter property-not= ] } [ dupd walk-down wALetter property-not= ] }
{ check-number-after { check-number-after
[ str i walk-up str wNumeric property-not= ] } [ dupd walk-up wNumeric property-not= ] }
{ check-number-before { check-number-before
[ str i walk-down str wNumeric property-not= ] } [ dupd walk-down wNumeric property-not= ] }
} case ; } case ; inline
:: word-break-next ( old-class new-char i str -- next-class ? ) :: word-break-next ( old-class new-char i str -- next-class ? )
new-char dup format/extended? new-char dup format/extended?
[ drop old-class dup { 1 2 3 } member? ] [ [ drop old-class dup { 1 2 3 } member? ] [
word-break-prop old-class over word-table-nth word-break-prop old-class over word-table-nth
i str word-break? i str word-break?
] if ; ] if ; inline
PRIVATE> PRIVATE>
:: first-word ( str -- i ) : first-word ( str -- i )
str unclip-slice word-break-prop over <enum> [ unclip-slice word-break-prop over <enum> ] keep
[ swap str word-break-next ] assoc-find 2drop '[ swap _ word-break-next ] assoc-find 2drop
nip swap length or 1+ ; nip swap length or 1+ ; inline
HINTS: first-word string ;
: >words ( str -- words ) : >words ( str -- words )
[ first-word ] >pieces ; [ first-word ] >pieces ;
HINTS: >words string ;

View File

@ -134,10 +134,10 @@ HINTS: string-append string string ;
TUPLE: compose-state i str char after last-class ; TUPLE: compose-state i str char after last-class ;
: get-str ( state i -- ch ) : get-str ( state i -- ch )
swap [ i>> + ] [ str>> ] bi ?nth ; swap [ i>> + ] [ str>> ] bi ?nth ; inline
: current ( state -- ch ) 0 get-str ; : current ( state -- ch ) 0 get-str ; inline
: to ( state -- state ) [ 1+ ] change-i ; : to ( state -- state ) [ 1+ ] change-i ; inline
: push-after ( ch state -- state ) [ ?push ] change-after ; : push-after ( ch state -- state ) [ ?push ] change-after ; inline
:: try-compose ( state new-char current-class -- state ) :: try-compose ( state new-char current-class -- state )
state last-class>> current-class = state last-class>> current-class =
@ -147,13 +147,13 @@ TUPLE: compose-state i str char after last-class ;
new-char state push-after new-char state push-after
current-class >>last-class current-class >>last-class
] if* ] if*
] if ; ] if ; inline
DEFER: compose-iter DEFER: compose-iter
: try-noncombining ( char state -- state ) : try-noncombining ( char state -- state )
tuck char>> swap combine-chars tuck char>> swap combine-chars
[ >>char to f >>last-class compose-iter ] when* ; [ >>char to f >>last-class compose-iter ] when* ; inline
: compose-iter ( state -- state ) : compose-iter ( state -- state )
dup current [ dup current [
@ -164,7 +164,7 @@ DEFER: compose-iter
[ drop ] [ swap try-noncombining ] if ] } [ drop ] [ swap try-noncombining ] if ] }
[ try-compose to compose-iter ] [ try-compose to compose-iter ]
} case } case
] when* ; ] when* ; inline recursive
: compose-combining ( ch str i -- str i ) : compose-combining ( ch str i -- str i )
compose-state new compose-state new
@ -172,7 +172,7 @@ DEFER: compose-iter
swap >>str swap >>str
swap >>char swap >>char
compose-iter compose-iter
{ [ char>> , ] [ after>> % ] [ str>> ] [ i>> ] } cleave ; { [ char>> , ] [ after>> % ] [ str>> ] [ i>> ] } cleave ; inline
:: (compose) ( str i -- ) :: (compose) ( str i -- )
i str ?nth [ i str ?nth [
@ -180,11 +180,13 @@ DEFER: compose-iter
i 1+ str ?nth combining-class i 1+ str ?nth combining-class
[ str i 1+ compose-combining ] [ , str i 1+ ] if [ str i 1+ compose-combining ] [ , str i 1+ ] if
] if (compose) ] if (compose)
] when* ; ] when* ; inline recursive
: combine ( str -- comp ) : combine ( str -- comp )
[ pass-combining (compose) ] "" make ; [ pass-combining (compose) ] "" make ;
HINTS: combine string ;
PRIVATE> PRIVATE>
: nfc ( string -- nfc ) : nfc ( string -- nfc )