2008-10-05 19:36:56 -04:00
|
|
|
! Copyright (C) 2008 Daniel Ehrenberg.
|
|
|
|
! See http://factorcode.org/license.txt for BSD license.
|
2008-09-10 23:11:40 -04:00
|
|
|
USING: combinators.short-circuit unicode.categories kernel math
|
|
|
|
combinators splitting sequences math.parser io.files io assocs
|
2009-01-07 18:59:01 -05:00
|
|
|
arrays namespaces make math.ranges unicode.normalize.private values
|
2008-09-10 23:11:40 -04:00
|
|
|
io.encodings.ascii unicode.syntax unicode.data compiler.units
|
2009-01-07 13:23:07 -05:00
|
|
|
alien.syntax sets accessors interval-maps memoize locals words ;
|
2007-09-20 18:09:08 -04:00
|
|
|
IN: unicode.breaks
|
|
|
|
|
2009-01-07 18:59:01 -05:00
|
|
|
<PRIVATE
|
2009-01-07 13:23:07 -05:00
|
|
|
! Grapheme breaks
|
|
|
|
|
2009-01-06 11:19:19 -05:00
|
|
|
C-ENUM: Any L V T LV LVT Extend Control CR LF
|
|
|
|
SpacingMark Prepend graphemes ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
|
|
|
: jamo-class ( ch -- class )
|
|
|
|
dup initial? [ drop L ]
|
|
|
|
[ dup medial? [ drop V ] [ final? T Any ? ] if ] if ;
|
|
|
|
|
2009-01-06 11:19:19 -05:00
|
|
|
: hangul-class ( ch -- class )
|
|
|
|
hangul-base - HEX: 1C mod zero? LV LVT ? ;
|
|
|
|
|
2007-09-20 18:09:08 -04:00
|
|
|
CATEGORY: grapheme-control Zl Zp Cc Cf ;
|
|
|
|
: control-class ( ch -- class )
|
|
|
|
{
|
|
|
|
{ CHAR: \r [ CR ] }
|
|
|
|
{ CHAR: \n [ LF ] }
|
|
|
|
{ HEX: 200C [ Extend ] }
|
|
|
|
{ HEX: 200D [ Extend ] }
|
|
|
|
[ drop Control ]
|
|
|
|
} case ;
|
|
|
|
|
|
|
|
CATEGORY: (extend) Me Mn ;
|
|
|
|
: extend? ( ch -- ? )
|
2008-06-13 02:51:46 -04:00
|
|
|
{ [ (extend)? ] [ "Other_Grapheme_Extend" property? ] } 1|| ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
2009-01-06 11:19:19 -05:00
|
|
|
: loe? ( ch -- ? )
|
|
|
|
"Logical_Order_Exception" property? ;
|
|
|
|
|
|
|
|
CATEGORY: spacing Mc ;
|
|
|
|
|
2007-09-20 18:09:08 -04:00
|
|
|
: grapheme-class ( ch -- class )
|
|
|
|
{
|
|
|
|
{ [ dup jamo? ] [ jamo-class ] }
|
2009-01-06 11:19:19 -05:00
|
|
|
{ [ dup hangul? ] [ hangul-class ] }
|
2007-09-20 18:09:08 -04:00
|
|
|
{ [ dup grapheme-control? ] [ control-class ] }
|
2009-01-06 11:19:19 -05:00
|
|
|
{ [ dup extend? ] [ drop Extend ] }
|
|
|
|
{ [ dup spacing? ] [ drop SpacingMark ] }
|
|
|
|
{ [ loe? ] [ Prepend ] }
|
2008-04-11 13:54:51 -04:00
|
|
|
[ Any ]
|
2007-09-20 18:09:08 -04:00
|
|
|
} cond ;
|
|
|
|
|
2009-01-07 13:23:07 -05:00
|
|
|
: init-table ( size -- table )
|
|
|
|
dup [ f <array> ] curry replicate ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
|
|
|
SYMBOL: table
|
|
|
|
|
|
|
|
: finish-table ( -- table )
|
|
|
|
table get [ [ 1 = ] map ] map ;
|
|
|
|
|
|
|
|
: set-table ( class1 class2 val -- )
|
|
|
|
-rot table get nth [ swap or ] change-nth ;
|
|
|
|
|
|
|
|
: connect ( class1 class2 -- ) 1 set-table ;
|
|
|
|
: disconnect ( class1 class2 -- ) 0 set-table ;
|
|
|
|
|
2009-01-07 13:23:07 -05:00
|
|
|
: check-before ( class classes value -- )
|
|
|
|
[ set-table ] curry with each ;
|
|
|
|
|
|
|
|
: check-after ( classes class value -- )
|
|
|
|
[ set-table ] 2curry each ;
|
|
|
|
|
2007-09-20 18:09:08 -04:00
|
|
|
: connect-before ( class classes -- )
|
2009-01-07 13:23:07 -05:00
|
|
|
1 check-before ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
|
|
|
: connect-after ( classes class -- )
|
2009-01-07 13:23:07 -05:00
|
|
|
1 check-after ;
|
|
|
|
|
2007-09-20 18:09:08 -04:00
|
|
|
: break-around ( classes1 classes2 -- )
|
2008-01-09 17:36:30 -05:00
|
|
|
[ [ 2dup disconnect swap disconnect ] with each ] curry each ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
|
|
|
: make-grapheme-table ( -- )
|
|
|
|
CR LF connect
|
2008-01-27 19:44:26 -05:00
|
|
|
Control CR LF 3array graphemes break-around
|
2009-01-06 11:19:19 -05:00
|
|
|
L L V LV LVT 4array connect-before
|
2008-01-27 19:44:26 -05:00
|
|
|
V V T 2array connect-before
|
2009-01-06 11:19:19 -05:00
|
|
|
LV V T 2array connect-before
|
2007-09-20 18:09:08 -04:00
|
|
|
T T connect
|
2009-01-06 11:19:19 -05:00
|
|
|
LVT T connect
|
|
|
|
graphemes Extend connect-after
|
|
|
|
graphemes SpacingMark connect-after
|
|
|
|
Prepend graphemes connect-before ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
2008-01-29 14:33:14 -05:00
|
|
|
VALUE: grapheme-table
|
2007-09-20 18:09:08 -04:00
|
|
|
|
|
|
|
: grapheme-break? ( class1 class2 -- ? )
|
|
|
|
grapheme-table nth nth not ;
|
|
|
|
|
|
|
|
: chars ( i str n -- str[i] str[i+n] )
|
2008-12-04 11:19:18 -05:00
|
|
|
swap [ dupd + ] dip [ ?nth ] curry bi@ ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
2008-01-28 00:54:38 -05:00
|
|
|
: find-index ( seq quot -- i ) find drop ; inline
|
|
|
|
: find-last-index ( seq quot -- i ) find-last drop ; inline
|
|
|
|
|
2009-01-07 18:59:01 -05:00
|
|
|
PRIVATE>
|
|
|
|
|
2008-01-28 00:54:38 -05:00
|
|
|
: first-grapheme ( str -- i )
|
|
|
|
unclip-slice grapheme-class over
|
|
|
|
[ grapheme-class tuck grapheme-break? ] find-index
|
|
|
|
nip swap length or 1+ ;
|
|
|
|
|
2009-01-07 18:59:01 -05:00
|
|
|
<PRIVATE
|
|
|
|
|
2009-01-07 13:23:07 -05:00
|
|
|
:: (>pieces) ( str quot -- )
|
|
|
|
str [
|
|
|
|
dup quot call cut-slice
|
|
|
|
swap , quot (>pieces)
|
2009-01-07 16:08:08 -05:00
|
|
|
] unless-empty ; inline recursive
|
2008-01-28 00:54:38 -05:00
|
|
|
|
2009-01-07 13:23:07 -05:00
|
|
|
: >pieces ( str quot -- graphemes )
|
2009-01-07 16:08:08 -05:00
|
|
|
[ (>pieces) ] { } make ; inline
|
2009-01-07 13:23:07 -05:00
|
|
|
|
2009-01-07 18:59:01 -05:00
|
|
|
PRIVATE>
|
|
|
|
|
2007-09-20 18:09:08 -04:00
|
|
|
: >graphemes ( str -- graphemes )
|
2009-01-07 13:23:07 -05:00
|
|
|
[ first-grapheme ] >pieces ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
|
|
|
: string-reverse ( str -- rts )
|
|
|
|
>graphemes reverse concat ;
|
|
|
|
|
2008-01-28 00:54:38 -05:00
|
|
|
: last-grapheme ( str -- i )
|
|
|
|
unclip-last-slice grapheme-class swap
|
2008-04-13 04:52:40 -04:00
|
|
|
[ grapheme-class dup rot grapheme-break? ] find-last-index ?1+ nip ;
|
2008-01-10 22:03:34 -05:00
|
|
|
|
2009-01-07 18:59:01 -05:00
|
|
|
<PRIVATE
|
|
|
|
|
2009-01-07 13:23:07 -05:00
|
|
|
graphemes init-table table
|
2008-05-24 18:34:01 -04:00
|
|
|
[ make-grapheme-table finish-table ] with-variable
|
2008-09-28 01:40:41 -04:00
|
|
|
to: grapheme-table
|
2009-01-07 13:23:07 -05:00
|
|
|
|
|
|
|
! Word breaks
|
|
|
|
|
|
|
|
VALUE: word-break-table
|
|
|
|
|
|
|
|
"resource:basis/unicode/data/WordBreakProperty.txt" load-script
|
|
|
|
to: word-break-table
|
|
|
|
|
|
|
|
C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter
|
|
|
|
wMidNum wMidNumLet wNumeric wExtendNumLet words ;
|
|
|
|
|
2009-01-07 16:08:08 -05:00
|
|
|
: word-break-classes ( -- table ) ! Is there a way to avoid this?
|
2009-01-07 13:23:07 -05:00
|
|
|
H{
|
2009-01-07 16:08:08 -05:00
|
|
|
{ "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 }
|
|
|
|
{ "Extend" 4 } { "Format" 5 } { "Katakana" 6 }
|
|
|
|
{ "ALetter" 7 } { "MidLetter" 8 }
|
|
|
|
{ "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 }
|
|
|
|
{ "ExtendNumLet" 12 }
|
|
|
|
} ;
|
2009-01-07 13:23:07 -05:00
|
|
|
|
|
|
|
: word-break-prop ( char -- word-break-prop )
|
|
|
|
word-break-table interval-at
|
|
|
|
word-break-classes at [ wOther ] unless* ;
|
|
|
|
|
|
|
|
: e ( seq -- seq ) [ execute ] map ;
|
|
|
|
|
|
|
|
SYMBOL: check-letter-before
|
|
|
|
SYMBOL: check-letter-after
|
|
|
|
SYMBOL: check-number-before
|
|
|
|
SYMBOL: check-number-after
|
|
|
|
|
|
|
|
: make-word-table ( -- )
|
|
|
|
wCR wLF connect
|
|
|
|
{ wNewline wCR wLF } e words break-around
|
|
|
|
wALetter dup connect
|
|
|
|
wALetter { wMidLetter wMidNumLet } e check-letter-after check-before
|
|
|
|
{ wMidLetter wMidNumLet } e wALetter check-letter-before check-after
|
|
|
|
wNumeric dup connect
|
|
|
|
wALetter wNumeric connect
|
|
|
|
wNumeric wALetter connect
|
|
|
|
wNumeric { wMidNum wMidNumLet } e check-number-after check-before
|
|
|
|
{ wMidNum wMidNumLet } e wNumeric check-number-before check-after
|
|
|
|
wKatakana dup connect
|
|
|
|
{ wALetter wNumeric wKatakana wExtendNumLet } e wExtendNumLet
|
|
|
|
[ connect-after ] [ swap connect-before ] 2bi ;
|
|
|
|
|
|
|
|
VALUE: word-table
|
|
|
|
|
|
|
|
: finish-word-table ( -- table )
|
|
|
|
table get [
|
|
|
|
[ { { 0 [ f ] } { 1 [ t ] } [ ] } case ] map
|
|
|
|
] map ;
|
|
|
|
|
|
|
|
words init-table table
|
|
|
|
[ make-word-table finish-word-table ] with-variable
|
|
|
|
to: word-table
|
|
|
|
|
2009-01-07 16:08:08 -05:00
|
|
|
: word-table-nth ( class1 class2 -- ? )
|
|
|
|
word-table nth nth ;
|
|
|
|
|
|
|
|
: property-not= ( i str property -- ? )
|
|
|
|
pick [
|
|
|
|
[ ?nth ] dip swap
|
|
|
|
[ word-break-prop = not ] [ drop f ] if*
|
|
|
|
] [ 3drop t ] if ;
|
|
|
|
|
|
|
|
: format/extended? ( ch -- ? )
|
|
|
|
word-break-prop { 4 5 } member? ;
|
|
|
|
|
|
|
|
:: walk-up ( str i -- j )
|
|
|
|
i 1 + str [ format/extended? not ] find-from drop
|
|
|
|
1+ str [ format/extended? not ] find-from drop ; ! possible bounds error?
|
|
|
|
|
|
|
|
:: walk-down ( str i -- j )
|
|
|
|
i str [ format/extended? not ] find-last-from drop
|
|
|
|
1- str [ format/extended? not ] find-last-from drop ; ! possible bounds error?
|
|
|
|
|
|
|
|
:: word-break? ( table-entry i str -- ? )
|
|
|
|
table-entry {
|
|
|
|
{ t [ f ] }
|
|
|
|
{ f [ t ] }
|
|
|
|
{ check-letter-after
|
|
|
|
[ str i walk-up str wALetter property-not= ] }
|
|
|
|
{ check-letter-before
|
|
|
|
[ str i walk-down str wALetter property-not= ] }
|
|
|
|
{ check-number-after
|
|
|
|
[ str i walk-up str wNumeric property-not= ] }
|
|
|
|
{ check-number-before
|
|
|
|
[ str i walk-down str wNumeric property-not= ] }
|
|
|
|
} case ;
|
2009-01-07 13:23:07 -05:00
|
|
|
|
2009-01-07 16:08:08 -05:00
|
|
|
:: word-break-next ( old-class new-char i str -- next-class ? )
|
|
|
|
new-char word-break-prop dup { 4 5 } member?
|
|
|
|
[ drop old-class dup { 1 2 3 } member? ]
|
|
|
|
[ old-class over word-table-nth i str word-break? ] if ;
|
2009-01-07 13:23:07 -05:00
|
|
|
|
2009-01-07 18:59:01 -05:00
|
|
|
PRIVATE>
|
|
|
|
|
2009-01-07 16:08:08 -05:00
|
|
|
:: first-word ( str -- i )
|
|
|
|
str unclip-slice word-break-prop over <enum>
|
|
|
|
[ swap str word-break-next ] assoc-find 2drop
|
2009-01-07 13:23:07 -05:00
|
|
|
nip swap length or 1+ ;
|
|
|
|
|
|
|
|
: >words ( str -- words )
|
|
|
|
[ first-word ] >pieces ;
|