Optimizing and cleaning up unicode.breaks and unicode.normalize
parent
1ed964e539
commit
fb25d04061
|
@ -4,7 +4,8 @@ USING: combinators.short-circuit unicode.categories kernel math
|
|||
combinators splitting sequences math.parser io.files io assocs
|
||||
arrays namespaces make math.ranges unicode.normalize.private values
|
||||
io.encodings.ascii unicode.syntax unicode.data compiler.units fry
|
||||
alien.syntax sets accessors interval-maps memoize locals words ;
|
||||
alien.syntax sets accessors interval-maps memoize locals words
|
||||
strings hints ;
|
||||
IN: unicode.breaks
|
||||
|
||||
<PRIVATE
|
||||
|
@ -58,38 +59,31 @@ SYMBOL: table
|
|||
: finish-table ( -- table )
|
||||
table get [ [ 1 = ] map ] map ;
|
||||
|
||||
: set-table ( class1 class2 val -- )
|
||||
: eval-seq ( seq -- seq ) [ dup word? [ execute ] when ] map ;
|
||||
|
||||
: (set-table) ( class1 class2 val -- )
|
||||
-rot table get nth [ swap or ] change-nth ;
|
||||
|
||||
: set-table ( classes1 classes2 val -- )
|
||||
[ [ eval-seq ] bi@ ] dip
|
||||
[ [ (set-table) ] curry with each ] 2curry each ;
|
||||
|
||||
: connect ( class1 class2 -- ) 1 set-table ;
|
||||
: disconnect ( class1 class2 -- ) 0 set-table ;
|
||||
|
||||
: check-before ( class classes value -- )
|
||||
[ set-table ] curry with each ;
|
||||
|
||||
: check-after ( classes class value -- )
|
||||
[ set-table ] 2curry each ;
|
||||
|
||||
: connect-before ( class classes -- )
|
||||
1 check-before ;
|
||||
|
||||
: connect-after ( classes class -- )
|
||||
1 check-after ;
|
||||
|
||||
: break-around ( classes1 classes2 -- )
|
||||
[ [ 2dup disconnect swap disconnect ] with each ] curry each ;
|
||||
[ disconnect ] [ swap disconnect ] 2bi ;
|
||||
|
||||
: make-grapheme-table ( -- )
|
||||
CR LF connect
|
||||
Control CR LF 3array graphemes break-around
|
||||
L L V LV LVT 4array connect-before
|
||||
V V T 2array connect-before
|
||||
LV V T 2array connect-before
|
||||
T T connect
|
||||
LVT T connect
|
||||
graphemes Extend connect-after
|
||||
graphemes SpacingMark connect-after
|
||||
Prepend graphemes connect-before ;
|
||||
{ CR } { LF } connect
|
||||
{ Control CR LF } graphemes disconnect
|
||||
graphemes { Control CR LF } disconnect
|
||||
{ L } { L V LV LVT } connect
|
||||
{ LV V } { V T } connect
|
||||
{ LVT T } { T } connect
|
||||
graphemes { Extend } connect
|
||||
graphemes { SpacingMark } connect
|
||||
{ Prepend } graphemes connect ;
|
||||
|
||||
VALUE: grapheme-table
|
||||
|
||||
|
@ -99,14 +93,11 @@ VALUE: grapheme-table
|
|||
: chars ( i str n -- str[i] str[i+n] )
|
||||
swap [ dupd + ] dip [ ?nth ] curry bi@ ;
|
||||
|
||||
: find-index ( seq quot -- i ) find drop ; inline
|
||||
: find-last-index ( seq quot -- i ) find-last drop ; inline
|
||||
|
||||
PRIVATE>
|
||||
|
||||
: first-grapheme ( str -- i )
|
||||
unclip-slice grapheme-class over
|
||||
[ grapheme-class tuck grapheme-break? ] find-index
|
||||
[ grapheme-class tuck grapheme-break? ] find drop
|
||||
nip swap length or 1+ ;
|
||||
|
||||
<PRIVATE
|
||||
|
@ -125,7 +116,7 @@ PRIVATE>
|
|||
|
||||
: last-grapheme ( str -- i )
|
||||
unclip-last-slice grapheme-class swap
|
||||
[ grapheme-class dup rot grapheme-break? ] find-last-index ?1+ nip ;
|
||||
[ grapheme-class dup rot grapheme-break? ] find-last drop ?1+ nip ;
|
||||
|
||||
<PRIVATE
|
||||
|
||||
|
@ -156,27 +147,23 @@ wMidNum wMidNumLet wNumeric wExtendNumLet words ;
|
|||
word-break-table interval-at
|
||||
word-break-classes at [ wOther ] unless* ;
|
||||
|
||||
: e ( seq -- seq ) [ execute ] map ;
|
||||
|
||||
SYMBOL: check-letter-before
|
||||
SYMBOL: check-letter-after
|
||||
SYMBOL: check-number-before
|
||||
SYMBOL: check-number-after
|
||||
|
||||
: make-word-table ( -- )
|
||||
wCR wLF connect
|
||||
{ wNewline wCR wLF } e words break-around
|
||||
wALetter dup connect
|
||||
wALetter { wMidLetter wMidNumLet } e check-letter-after check-before
|
||||
{ wMidLetter wMidNumLet } e wALetter check-letter-before check-after
|
||||
wNumeric dup connect
|
||||
wALetter wNumeric connect
|
||||
wNumeric wALetter connect
|
||||
wNumeric { wMidNum wMidNumLet } e check-number-after check-before
|
||||
{ wMidNum wMidNumLet } e wNumeric check-number-before check-after
|
||||
wKatakana dup connect
|
||||
{ wALetter wNumeric wKatakana wExtendNumLet } e wExtendNumLet
|
||||
[ connect-after ] [ swap connect-before ] 2bi ;
|
||||
{ wCR } { wLF } connect
|
||||
{ wNewline wCR wLF } words disconnect
|
||||
words { wNewline wCR wLF } disconnect
|
||||
{ wALetter } { wMidLetter wMidNumLet } check-letter-after set-table
|
||||
{ wMidLetter wMidNumLet } { wALetter } check-letter-before set-table
|
||||
{ wNumeric wALetter } { wNumeric wALetter } connect
|
||||
{ wNumeric } { wMidNum wMidNumLet } check-number-after set-table
|
||||
{ wMidNum wMidNumLet } { wNumeric } check-number-before set-table
|
||||
{ wKatakana } { wKatakana } connect
|
||||
{ wALetter wNumeric wKatakana wExtendNumLet } { wExtendNumLet }
|
||||
[ connect ] [ swap connect ] 2bi ;
|
||||
|
||||
VALUE: word-table
|
||||
|
||||
|
@ -192,7 +179,7 @@ to: word-table
|
|||
: word-table-nth ( class1 class2 -- ? )
|
||||
word-table nth nth ;
|
||||
|
||||
:: property-not= ( i str property -- ? )
|
||||
:: property-not= ( str i property -- ? )
|
||||
i [
|
||||
i str ?nth [ word-break-prop property = not ]
|
||||
[ f ] if*
|
||||
|
@ -201,41 +188,49 @@ to: word-table
|
|||
: format/extended? ( ch -- ? )
|
||||
word-break-prop { 4 5 } member? ;
|
||||
|
||||
:: walk-up ( str i -- j )
|
||||
i 1 + str [ format/extended? not ] find-from drop
|
||||
[ 1+ str [ format/extended? not ] find-from drop ] [ f ] if* ;
|
||||
: (walk-up) ( str i -- j )
|
||||
swap [ format/extended? not ] find-from drop ;
|
||||
|
||||
:: walk-down ( str i -- j )
|
||||
i str [ format/extended? not ] find-last-from drop
|
||||
[ 1- str [ format/extended? not ] find-last-from drop ] [ f ] if* ;
|
||||
: walk-up ( str i -- j )
|
||||
dupd 1+ (walk-up) [ 1+ (walk-up) ] [ drop f ] if* ;
|
||||
|
||||
:: word-break? ( table-entry i str -- ? )
|
||||
table-entry {
|
||||
{ t [ f ] }
|
||||
{ f [ t ] }
|
||||
: (walk-down) ( str i -- j )
|
||||
swap [ format/extended? not ] find-last-from drop ;
|
||||
|
||||
: walk-down ( str i -- j )
|
||||
dupd (walk-down) [ 1- (walk-down) ] [ drop f ] if* ;
|
||||
|
||||
: word-break? ( table-entry i str -- ? )
|
||||
spin {
|
||||
{ t [ 2drop f ] }
|
||||
{ f [ 2drop t ] }
|
||||
{ check-letter-after
|
||||
[ str i walk-up str wALetter property-not= ] }
|
||||
[ dupd walk-up wALetter property-not= ] }
|
||||
{ check-letter-before
|
||||
[ str i walk-down str wALetter property-not= ] }
|
||||
[ dupd walk-down wALetter property-not= ] }
|
||||
{ check-number-after
|
||||
[ str i walk-up str wNumeric property-not= ] }
|
||||
[ dupd walk-up wNumeric property-not= ] }
|
||||
{ check-number-before
|
||||
[ str i walk-down str wNumeric property-not= ] }
|
||||
} case ;
|
||||
[ dupd walk-down wNumeric property-not= ] }
|
||||
} case ; inline
|
||||
|
||||
:: word-break-next ( old-class new-char i str -- next-class ? )
|
||||
new-char dup format/extended?
|
||||
[ drop old-class dup { 1 2 3 } member? ] [
|
||||
word-break-prop old-class over word-table-nth
|
||||
i str word-break?
|
||||
] if ;
|
||||
] if ; inline
|
||||
|
||||
PRIVATE>
|
||||
|
||||
:: first-word ( str -- i )
|
||||
str unclip-slice word-break-prop over <enum>
|
||||
[ swap str word-break-next ] assoc-find 2drop
|
||||
nip swap length or 1+ ;
|
||||
: first-word ( str -- i )
|
||||
[ unclip-slice word-break-prop over <enum> ] keep
|
||||
'[ swap _ word-break-next ] assoc-find 2drop
|
||||
nip swap length or 1+ ; inline
|
||||
|
||||
HINTS: first-word string ;
|
||||
|
||||
: >words ( str -- words )
|
||||
[ first-word ] >pieces ;
|
||||
|
||||
HINTS: >words string ;
|
||||
|
|
|
@ -134,10 +134,10 @@ HINTS: string-append string string ;
|
|||
TUPLE: compose-state i str char after last-class ;
|
||||
|
||||
: get-str ( state i -- ch )
|
||||
swap [ i>> + ] [ str>> ] bi ?nth ;
|
||||
: current ( state -- ch ) 0 get-str ;
|
||||
: to ( state -- state ) [ 1+ ] change-i ;
|
||||
: push-after ( ch state -- state ) [ ?push ] change-after ;
|
||||
swap [ i>> + ] [ str>> ] bi ?nth ; inline
|
||||
: current ( state -- ch ) 0 get-str ; inline
|
||||
: to ( state -- state ) [ 1+ ] change-i ; inline
|
||||
: push-after ( ch state -- state ) [ ?push ] change-after ; inline
|
||||
|
||||
:: try-compose ( state new-char current-class -- state )
|
||||
state last-class>> current-class =
|
||||
|
@ -147,13 +147,13 @@ TUPLE: compose-state i str char after last-class ;
|
|||
new-char state push-after
|
||||
current-class >>last-class
|
||||
] if*
|
||||
] if ;
|
||||
] if ; inline
|
||||
|
||||
DEFER: compose-iter
|
||||
|
||||
: try-noncombining ( char state -- state )
|
||||
tuck char>> swap combine-chars
|
||||
[ >>char to f >>last-class compose-iter ] when* ;
|
||||
[ >>char to f >>last-class compose-iter ] when* ; inline
|
||||
|
||||
: compose-iter ( state -- state )
|
||||
dup current [
|
||||
|
@ -164,7 +164,7 @@ DEFER: compose-iter
|
|||
[ drop ] [ swap try-noncombining ] if ] }
|
||||
[ try-compose to compose-iter ]
|
||||
} case
|
||||
] when* ;
|
||||
] when* ; inline recursive
|
||||
|
||||
: compose-combining ( ch str i -- str i )
|
||||
compose-state new
|
||||
|
@ -172,7 +172,7 @@ DEFER: compose-iter
|
|||
swap >>str
|
||||
swap >>char
|
||||
compose-iter
|
||||
{ [ char>> , ] [ after>> % ] [ str>> ] [ i>> ] } cleave ;
|
||||
{ [ char>> , ] [ after>> % ] [ str>> ] [ i>> ] } cleave ; inline
|
||||
|
||||
:: (compose) ( str i -- )
|
||||
i str ?nth [
|
||||
|
@ -180,11 +180,13 @@ DEFER: compose-iter
|
|||
i 1+ str ?nth combining-class
|
||||
[ str i 1+ compose-combining ] [ , str i 1+ ] if
|
||||
] if (compose)
|
||||
] when* ;
|
||||
] when* ; inline recursive
|
||||
|
||||
: combine ( str -- comp )
|
||||
[ pass-combining (compose) ] "" make ;
|
||||
|
||||
HINTS: combine string ;
|
||||
|
||||
PRIVATE>
|
||||
|
||||
: nfc ( string -- nfc )
|
||||
|
|
Loading…
Reference in New Issue