Optimizing and cleaning up unicode.breaks and unicode.normalize

db4
Daniel Ehrenberg 2009-01-09 15:53:35 -06:00
parent 1ed964e539
commit fb25d04061
2 changed files with 72 additions and 75 deletions

View File

@ -4,7 +4,8 @@ USING: combinators.short-circuit unicode.categories kernel math
combinators splitting sequences math.parser io.files io assocs
arrays namespaces make math.ranges unicode.normalize.private values
io.encodings.ascii unicode.syntax unicode.data compiler.units fry
alien.syntax sets accessors interval-maps memoize locals words ;
alien.syntax sets accessors interval-maps memoize locals words
strings hints ;
IN: unicode.breaks
<PRIVATE
@ -58,38 +59,31 @@ SYMBOL: table
: finish-table ( -- table )
table get [ [ 1 = ] map ] map ;
: set-table ( class1 class2 val -- )
: eval-seq ( seq -- seq ) [ dup word? [ execute ] when ] map ;
: (set-table) ( class1 class2 val -- )
-rot table get nth [ swap or ] change-nth ;
: set-table ( classes1 classes2 val -- )
[ [ eval-seq ] bi@ ] dip
[ [ (set-table) ] curry with each ] 2curry each ;
: connect ( class1 class2 -- ) 1 set-table ;
: disconnect ( class1 class2 -- ) 0 set-table ;
: check-before ( class classes value -- )
[ set-table ] curry with each ;
: check-after ( classes class value -- )
[ set-table ] 2curry each ;
: connect-before ( class classes -- )
1 check-before ;
: connect-after ( classes class -- )
1 check-after ;
: break-around ( classes1 classes2 -- )
[ [ 2dup disconnect swap disconnect ] with each ] curry each ;
[ disconnect ] [ swap disconnect ] 2bi ;
: make-grapheme-table ( -- )
CR LF connect
Control CR LF 3array graphemes break-around
L L V LV LVT 4array connect-before
V V T 2array connect-before
LV V T 2array connect-before
T T connect
LVT T connect
graphemes Extend connect-after
graphemes SpacingMark connect-after
Prepend graphemes connect-before ;
{ CR } { LF } connect
{ Control CR LF } graphemes disconnect
graphemes { Control CR LF } disconnect
{ L } { L V LV LVT } connect
{ LV V } { V T } connect
{ LVT T } { T } connect
graphemes { Extend } connect
graphemes { SpacingMark } connect
{ Prepend } graphemes connect ;
VALUE: grapheme-table
@ -99,14 +93,11 @@ VALUE: grapheme-table
: chars ( i str n -- str[i] str[i+n] )
swap [ dupd + ] dip [ ?nth ] curry bi@ ;
: find-index ( seq quot -- i ) find drop ; inline
: find-last-index ( seq quot -- i ) find-last drop ; inline
PRIVATE>
: first-grapheme ( str -- i )
unclip-slice grapheme-class over
[ grapheme-class tuck grapheme-break? ] find-index
[ grapheme-class tuck grapheme-break? ] find drop
nip swap length or 1+ ;
<PRIVATE
@ -125,7 +116,7 @@ PRIVATE>
: last-grapheme ( str -- i )
unclip-last-slice grapheme-class swap
[ grapheme-class dup rot grapheme-break? ] find-last-index ?1+ nip ;
[ grapheme-class dup rot grapheme-break? ] find-last drop ?1+ nip ;
<PRIVATE
@ -156,27 +147,23 @@ wMidNum wMidNumLet wNumeric wExtendNumLet words ;
word-break-table interval-at
word-break-classes at [ wOther ] unless* ;
: e ( seq -- seq ) [ execute ] map ;
SYMBOL: check-letter-before
SYMBOL: check-letter-after
SYMBOL: check-number-before
SYMBOL: check-number-after
: make-word-table ( -- )
wCR wLF connect
{ wNewline wCR wLF } e words break-around
wALetter dup connect
wALetter { wMidLetter wMidNumLet } e check-letter-after check-before
{ wMidLetter wMidNumLet } e wALetter check-letter-before check-after
wNumeric dup connect
wALetter wNumeric connect
wNumeric wALetter connect
wNumeric { wMidNum wMidNumLet } e check-number-after check-before
{ wMidNum wMidNumLet } e wNumeric check-number-before check-after
wKatakana dup connect
{ wALetter wNumeric wKatakana wExtendNumLet } e wExtendNumLet
[ connect-after ] [ swap connect-before ] 2bi ;
{ wCR } { wLF } connect
{ wNewline wCR wLF } words disconnect
words { wNewline wCR wLF } disconnect
{ wALetter } { wMidLetter wMidNumLet } check-letter-after set-table
{ wMidLetter wMidNumLet } { wALetter } check-letter-before set-table
{ wNumeric wALetter } { wNumeric wALetter } connect
{ wNumeric } { wMidNum wMidNumLet } check-number-after set-table
{ wMidNum wMidNumLet } { wNumeric } check-number-before set-table
{ wKatakana } { wKatakana } connect
{ wALetter wNumeric wKatakana wExtendNumLet } { wExtendNumLet }
[ connect ] [ swap connect ] 2bi ;
VALUE: word-table
@ -192,7 +179,7 @@ to: word-table
: word-table-nth ( class1 class2 -- ? )
word-table nth nth ;
:: property-not= ( i str property -- ? )
:: property-not= ( str i property -- ? )
i [
i str ?nth [ word-break-prop property = not ]
[ f ] if*
@ -201,41 +188,49 @@ to: word-table
: format/extended? ( ch -- ? )
word-break-prop { 4 5 } member? ;
:: walk-up ( str i -- j )
i 1 + str [ format/extended? not ] find-from drop
[ 1+ str [ format/extended? not ] find-from drop ] [ f ] if* ;
: (walk-up) ( str i -- j )
swap [ format/extended? not ] find-from drop ;
:: walk-down ( str i -- j )
i str [ format/extended? not ] find-last-from drop
[ 1- str [ format/extended? not ] find-last-from drop ] [ f ] if* ;
: walk-up ( str i -- j )
dupd 1+ (walk-up) [ 1+ (walk-up) ] [ drop f ] if* ;
:: word-break? ( table-entry i str -- ? )
table-entry {
{ t [ f ] }
{ f [ t ] }
: (walk-down) ( str i -- j )
swap [ format/extended? not ] find-last-from drop ;
: walk-down ( str i -- j )
dupd (walk-down) [ 1- (walk-down) ] [ drop f ] if* ;
: word-break? ( table-entry i str -- ? )
spin {
{ t [ 2drop f ] }
{ f [ 2drop t ] }
{ check-letter-after
[ str i walk-up str wALetter property-not= ] }
[ dupd walk-up wALetter property-not= ] }
{ check-letter-before
[ str i walk-down str wALetter property-not= ] }
[ dupd walk-down wALetter property-not= ] }
{ check-number-after
[ str i walk-up str wNumeric property-not= ] }
[ dupd walk-up wNumeric property-not= ] }
{ check-number-before
[ str i walk-down str wNumeric property-not= ] }
} case ;
[ dupd walk-down wNumeric property-not= ] }
} case ; inline
:: word-break-next ( old-class new-char i str -- next-class ? )
new-char dup format/extended?
[ drop old-class dup { 1 2 3 } member? ] [
word-break-prop old-class over word-table-nth
i str word-break?
] if ;
] if ; inline
PRIVATE>
:: first-word ( str -- i )
str unclip-slice word-break-prop over <enum>
[ swap str word-break-next ] assoc-find 2drop
nip swap length or 1+ ;
: first-word ( str -- i )
[ unclip-slice word-break-prop over <enum> ] keep
'[ swap _ word-break-next ] assoc-find 2drop
nip swap length or 1+ ; inline
HINTS: first-word string ;
: >words ( str -- words )
[ first-word ] >pieces ;
HINTS: >words string ;

View File

@ -134,10 +134,10 @@ HINTS: string-append string string ;
TUPLE: compose-state i str char after last-class ;
: get-str ( state i -- ch )
swap [ i>> + ] [ str>> ] bi ?nth ;
: current ( state -- ch ) 0 get-str ;
: to ( state -- state ) [ 1+ ] change-i ;
: push-after ( ch state -- state ) [ ?push ] change-after ;
swap [ i>> + ] [ str>> ] bi ?nth ; inline
: current ( state -- ch ) 0 get-str ; inline
: to ( state -- state ) [ 1+ ] change-i ; inline
: push-after ( ch state -- state ) [ ?push ] change-after ; inline
:: try-compose ( state new-char current-class -- state )
state last-class>> current-class =
@ -147,13 +147,13 @@ TUPLE: compose-state i str char after last-class ;
new-char state push-after
current-class >>last-class
] if*
] if ;
] if ; inline
DEFER: compose-iter
: try-noncombining ( char state -- state )
tuck char>> swap combine-chars
[ >>char to f >>last-class compose-iter ] when* ;
[ >>char to f >>last-class compose-iter ] when* ; inline
: compose-iter ( state -- state )
dup current [
@ -164,7 +164,7 @@ DEFER: compose-iter
[ drop ] [ swap try-noncombining ] if ] }
[ try-compose to compose-iter ]
} case
] when* ;
] when* ; inline recursive
: compose-combining ( ch str i -- str i )
compose-state new
@ -172,7 +172,7 @@ DEFER: compose-iter
swap >>str
swap >>char
compose-iter
{ [ char>> , ] [ after>> % ] [ str>> ] [ i>> ] } cleave ;
{ [ char>> , ] [ after>> % ] [ str>> ] [ i>> ] } cleave ; inline
:: (compose) ( str i -- )
i str ?nth [
@ -180,11 +180,13 @@ DEFER: compose-iter
i 1+ str ?nth combining-class
[ str i 1+ compose-combining ] [ , str i 1+ ] if
] if (compose)
] when* ;
] when* ; inline recursive
: combine ( str -- comp )
[ pass-combining (compose) ] "" make ;
HINTS: combine string ;
PRIVATE>
: nfc ( string -- nfc )