287 lines
7.4 KiB
Factor
287 lines
7.4 KiB
Factor
! Copyright (C) 2008 Daniel Ehrenberg.
|
|
! See http://factorcode.org/license.txt for BSD license.
|
|
USING: accessors alien.syntax arrays assocs combinators
|
|
combinators.short-circuit compiler.units fry interval-maps io
|
|
io.encodings.ascii io.files kernel literals locals make math
|
|
math.parser math.ranges memoize namespaces sequences
|
|
sets simple-flat-file splitting unicode.categories
|
|
unicode.categories.syntax unicode.data unicode.normalize
|
|
unicode.normalize.private words ;
|
|
FROM: sequences => change-nth ;
|
|
IN: unicode.breaks
|
|
|
|
<PRIVATE
|
|
! Grapheme breaks
|
|
|
|
CONSTANT: Any 0
|
|
CONSTANT: L 1
|
|
CONSTANT: V 2
|
|
CONSTANT: T 3
|
|
CONSTANT: LV 4
|
|
CONSTANT: LVT 5
|
|
CONSTANT: Extend 6
|
|
CONSTANT: Control 7
|
|
CONSTANT: CR 8
|
|
CONSTANT: LF 9
|
|
CONSTANT: SpacingMark 10
|
|
CONSTANT: Prepend 11
|
|
CONSTANT: graphemes 12
|
|
|
|
: jamo-class ( ch -- class )
|
|
dup initial? [ drop L ]
|
|
[ dup medial? [ drop V ] [ final? T Any ? ] if ] if ;
|
|
|
|
: hangul-class ( ch -- class )
|
|
hangul-base - 0x1C mod zero? LV LVT ? ;
|
|
|
|
CATEGORY: grapheme-control Zl Zp Cc Cf ;
|
|
: control-class ( ch -- class )
|
|
{
|
|
{ CHAR: \r [ CR ] }
|
|
{ CHAR: \n [ LF ] }
|
|
{ 0x200C [ Extend ] }
|
|
{ 0x200D [ Extend ] }
|
|
[ drop Control ]
|
|
} case ;
|
|
|
|
CATEGORY: extend
|
|
Me Mn |
|
|
"Other_Grapheme_Extend" property? ;
|
|
|
|
: loe? ( ch -- ? )
|
|
"Logical_Order_Exception" property? ;
|
|
|
|
CATEGORY: spacing Mc ;
|
|
|
|
: grapheme-class ( ch -- class )
|
|
{
|
|
{ [ dup jamo? ] [ jamo-class ] }
|
|
{ [ dup hangul? ] [ hangul-class ] }
|
|
{ [ dup grapheme-control? ] [ control-class ] }
|
|
{ [ dup extend? ] [ drop Extend ] }
|
|
{ [ dup spacing? ] [ drop SpacingMark ] }
|
|
{ [ loe? ] [ Prepend ] }
|
|
[ Any ]
|
|
} cond ;
|
|
|
|
: init-table ( size -- table )
|
|
dup [ f <array> ] curry replicate ;
|
|
|
|
SYMBOL: table
|
|
|
|
: finish-table ( -- table )
|
|
table get [ [ 1 = ] map ] map ;
|
|
|
|
: eval-seq ( seq -- seq ) [ ?execute ] map ;
|
|
|
|
: (set-table) ( class1 class2 val -- )
|
|
[ table get nth ] dip '[ _ or ] change-nth ;
|
|
|
|
: set-table ( classes1 classes2 val -- )
|
|
[ [ eval-seq ] bi@ ] dip
|
|
[ [ (set-table) ] curry with each ] 2curry each ;
|
|
|
|
: connect ( class1 class2 -- ) 1 set-table ;
|
|
: disconnect ( class1 class2 -- ) 0 set-table ;
|
|
|
|
: make-grapheme-table ( -- )
|
|
{ CR } { LF } connect
|
|
{ Control CR LF } graphemes iota disconnect
|
|
graphemes iota { Control CR LF } disconnect
|
|
{ L } { L V LV LVT } connect
|
|
{ LV V } { V T } connect
|
|
{ LVT T } { T } connect
|
|
graphemes iota { Extend } connect
|
|
graphemes iota { SpacingMark } connect
|
|
{ Prepend } graphemes iota connect ;
|
|
|
|
SYMBOL: grapheme-table
|
|
|
|
: grapheme-break? ( class1 class2 -- ? )
|
|
grapheme-table get-global nth nth not ;
|
|
|
|
PRIVATE>
|
|
|
|
: first-grapheme ( str -- i )
|
|
unclip-slice grapheme-class over
|
|
[ grapheme-class [ nip ] [ grapheme-break? ] 2bi ] find drop
|
|
nip swap length or 1 + ;
|
|
|
|
: first-grapheme-from ( start str -- i )
|
|
over tail-slice first-grapheme + ;
|
|
|
|
: last-grapheme ( str -- i )
|
|
unclip-last-slice grapheme-class swap
|
|
[ grapheme-class dup rot grapheme-break? ] find-last drop ?1+ nip ;
|
|
|
|
: last-grapheme-from ( end str -- i )
|
|
swap head-slice last-grapheme ;
|
|
|
|
<PRIVATE
|
|
|
|
: >pieces ( str quot: ( str -- i ) -- graphemes )
|
|
[ dup empty? not ] swap '[ dup @ cut-slice swap ] produce nip ; inline
|
|
|
|
PRIVATE>
|
|
|
|
: >graphemes ( str -- graphemes )
|
|
[ first-grapheme ] >pieces ;
|
|
|
|
: string-reverse ( str -- rts )
|
|
>graphemes reverse! concat ;
|
|
|
|
<PRIVATE
|
|
|
|
graphemes init-table table
|
|
[ make-grapheme-table finish-table ] with-variable
|
|
grapheme-table set-global
|
|
|
|
! Word breaks
|
|
|
|
SYMBOL: word-break-table
|
|
|
|
"vocab:unicode/data/WordBreakProperty.txt" load-interval-file
|
|
word-break-table set-global
|
|
|
|
CONSTANT: wOther 0
|
|
CONSTANT: wCR 1
|
|
CONSTANT: wLF 2
|
|
CONSTANT: wNewline 3
|
|
CONSTANT: wExtend 4
|
|
CONSTANT: wFormat 5
|
|
CONSTANT: wKatakana 6
|
|
CONSTANT: wALetter 7
|
|
CONSTANT: wMidLetter 8
|
|
CONSTANT: wMidNum 9
|
|
CONSTANT: wMidNumLet 10
|
|
CONSTANT: wNumeric 11
|
|
CONSTANT: wExtendNumLet 12
|
|
CONSTANT: words 13
|
|
|
|
: word-break-classes ( -- table ) ! Is there a way to avoid this?
|
|
H{
|
|
{ "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 }
|
|
{ "Extend" 4 } { "Format" 5 } { "Katakana" 6 }
|
|
{ "ALetter" 7 } { "MidLetter" 8 }
|
|
{ "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 }
|
|
{ "ExtendNumLet" 12 }
|
|
} ;
|
|
|
|
: word-break-prop ( char -- word-break-prop )
|
|
word-break-table get-global interval-at
|
|
word-break-classes at [ wOther ] unless* ;
|
|
|
|
SYMBOL: check-letter-before
|
|
SYMBOL: check-letter-after
|
|
SYMBOL: check-number-before
|
|
SYMBOL: check-number-after
|
|
|
|
: make-word-table ( -- )
|
|
{ wCR } { wLF } connect
|
|
{ wNewline wCR wLF } words iota disconnect
|
|
words iota { wNewline wCR wLF } disconnect
|
|
{ wALetter } { wMidLetter wMidNumLet } check-letter-after set-table
|
|
{ wMidLetter wMidNumLet } { wALetter } check-letter-before set-table
|
|
{ wNumeric wALetter } { wNumeric wALetter } connect
|
|
{ wNumeric } { wMidNum wMidNumLet } check-number-after set-table
|
|
{ wMidNum wMidNumLet } { wNumeric } check-number-before set-table
|
|
{ wKatakana } { wKatakana } connect
|
|
{ wALetter wNumeric wKatakana wExtendNumLet } { wExtendNumLet }
|
|
[ connect ] [ swap connect ] 2bi ;
|
|
|
|
SYMBOL: word-table
|
|
|
|
: finish-word-table ( -- table )
|
|
table get [
|
|
[ { { 0 [ f ] } { 1 [ t ] } [ ] } case ] map
|
|
] map ;
|
|
|
|
words init-table table
|
|
[ make-word-table finish-word-table ] with-variable
|
|
word-table set-global
|
|
|
|
: word-table-nth ( class1 class2 -- ? )
|
|
word-table get-global nth nth ;
|
|
|
|
:: property-not= ( str i property -- ? )
|
|
i [
|
|
i str ?nth [ word-break-prop property = not ]
|
|
[ f ] if*
|
|
] [ t ] if ;
|
|
|
|
: (format/extended?) ( class -- ? )
|
|
${ wExtend wFormat } member? ; inline
|
|
|
|
: format/extended? ( ch -- ? )
|
|
word-break-prop (format/extended?) ;
|
|
|
|
: (walk-up) ( str i -- j )
|
|
swap [ format/extended? not ] find-from drop ;
|
|
|
|
: walk-up ( str i -- j )
|
|
dupd 1 + (walk-up) [ 1 + (walk-up) ] [ drop f ] if* ;
|
|
|
|
: (walk-down) ( str i -- j )
|
|
swap [ format/extended? not ] find-last-from drop ;
|
|
|
|
: walk-down ( str i -- j )
|
|
dupd (walk-down) [ 1 - (walk-down) ] [ drop f ] if* ;
|
|
|
|
: word-break? ( str i table-entry -- ? )
|
|
{
|
|
{ t [ 2drop f ] }
|
|
{ f [ 2drop t ] }
|
|
{ check-letter-after
|
|
[ dupd walk-up wALetter property-not= ] }
|
|
{ check-letter-before
|
|
[ dupd walk-down wALetter property-not= ] }
|
|
{ check-number-after
|
|
[ dupd walk-up wNumeric property-not= ] }
|
|
{ check-number-before
|
|
[ dupd walk-down wNumeric property-not= ] }
|
|
} case ;
|
|
|
|
:: word-break-next ( old-class new-char i str -- next-class ? )
|
|
new-char word-break-prop :> new-class
|
|
new-class (format/extended?)
|
|
[ old-class dup ${ wCR wLF wNewline } member? ] [
|
|
new-class old-class over word-table-nth
|
|
[ str i 1 - ] dip word-break?
|
|
] if ;
|
|
|
|
PRIVATE>
|
|
|
|
: first-word ( str -- i )
|
|
[ [ length ] [ first word-break-prop ] bi ] keep
|
|
1 swap dup '[ _ word-break-next ] find-index-from
|
|
drop nip swap or ;
|
|
|
|
: >words ( str -- words )
|
|
[ first-word ] >pieces ;
|
|
|
|
<PRIVATE
|
|
|
|
: nth-next ( i str -- str[i-1] str[i] )
|
|
[ [ 1 - ] keep ] dip '[ _ nth ] bi@ ;
|
|
|
|
PRIVATE>
|
|
|
|
: word-break-at? ( i str -- ? )
|
|
{
|
|
[ drop zero? ]
|
|
[ length = ]
|
|
[
|
|
[ nth-next [ word-break-prop ] dip ] 2keep
|
|
word-break-next nip
|
|
]
|
|
} 2|| ;
|
|
|
|
: first-word-from ( start str -- i )
|
|
over tail-slice first-word + ;
|
|
|
|
: last-word ( str -- i )
|
|
[ length iota ] keep '[ _ word-break-at? ] find-last drop 0 or ;
|
|
|
|
: last-word-from ( end str -- i )
|
|
swap head-slice last-word ;
|