factor/basis/unicode/breaks/breaks.factor

245 lines
6.7 KiB
Factor
Raw Normal View History

2008-10-05 19:36:56 -04:00
! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
2008-09-10 23:11:40 -04:00
USING: combinators.short-circuit unicode.categories kernel math
combinators splitting sequences math.parser io.files io assocs
2009-01-07 18:59:01 -05:00
arrays namespaces make math.ranges unicode.normalize.private values
2008-09-10 23:11:40 -04:00
io.encodings.ascii unicode.syntax unicode.data compiler.units
2009-01-07 13:23:07 -05:00
alien.syntax sets accessors interval-maps memoize locals words ;
2007-09-20 18:09:08 -04:00
IN: unicode.breaks
2009-01-07 18:59:01 -05:00
<PRIVATE
2009-01-07 13:23:07 -05:00
! Grapheme breaks
2009-01-06 11:19:19 -05:00
C-ENUM: Any L V T LV LVT Extend Control CR LF
SpacingMark Prepend graphemes ;
2007-09-20 18:09:08 -04:00
: jamo-class ( ch -- class )
dup initial? [ drop L ]
[ dup medial? [ drop V ] [ final? T Any ? ] if ] if ;
2009-01-06 11:19:19 -05:00
: hangul-class ( ch -- class )
hangul-base - HEX: 1C mod zero? LV LVT ? ;
2007-09-20 18:09:08 -04:00
CATEGORY: grapheme-control Zl Zp Cc Cf ;
: control-class ( ch -- class )
{
{ CHAR: \r [ CR ] }
{ CHAR: \n [ LF ] }
{ HEX: 200C [ Extend ] }
{ HEX: 200D [ Extend ] }
[ drop Control ]
} case ;
CATEGORY: (extend) Me Mn ;
: extend? ( ch -- ? )
{ [ (extend)? ] [ "Other_Grapheme_Extend" property? ] } 1|| ;
2007-09-20 18:09:08 -04:00
2009-01-06 11:19:19 -05:00
: loe? ( ch -- ? )
"Logical_Order_Exception" property? ;
CATEGORY: spacing Mc ;
2007-09-20 18:09:08 -04:00
: grapheme-class ( ch -- class )
{
{ [ dup jamo? ] [ jamo-class ] }
2009-01-06 11:19:19 -05:00
{ [ dup hangul? ] [ hangul-class ] }
2007-09-20 18:09:08 -04:00
{ [ dup grapheme-control? ] [ control-class ] }
2009-01-06 11:19:19 -05:00
{ [ dup extend? ] [ drop Extend ] }
{ [ dup spacing? ] [ drop SpacingMark ] }
{ [ loe? ] [ Prepend ] }
2008-04-11 13:54:51 -04:00
[ Any ]
2007-09-20 18:09:08 -04:00
} cond ;
2009-01-07 13:23:07 -05:00
: init-table ( size -- table )
dup [ f <array> ] curry replicate ;
2007-09-20 18:09:08 -04:00
SYMBOL: table
: finish-table ( -- table )
table get [ [ 1 = ] map ] map ;
: set-table ( class1 class2 val -- )
-rot table get nth [ swap or ] change-nth ;
: connect ( class1 class2 -- ) 1 set-table ;
: disconnect ( class1 class2 -- ) 0 set-table ;
2009-01-07 13:23:07 -05:00
: check-before ( class classes value -- )
[ set-table ] curry with each ;
: check-after ( classes class value -- )
[ set-table ] 2curry each ;
2007-09-20 18:09:08 -04:00
: connect-before ( class classes -- )
2009-01-07 13:23:07 -05:00
1 check-before ;
2007-09-20 18:09:08 -04:00
: connect-after ( classes class -- )
2009-01-07 13:23:07 -05:00
1 check-after ;
2007-09-20 18:09:08 -04:00
: break-around ( classes1 classes2 -- )
2008-01-09 17:36:30 -05:00
[ [ 2dup disconnect swap disconnect ] with each ] curry each ;
2007-09-20 18:09:08 -04:00
: make-grapheme-table ( -- )
CR LF connect
Control CR LF 3array graphemes break-around
2009-01-06 11:19:19 -05:00
L L V LV LVT 4array connect-before
V V T 2array connect-before
2009-01-06 11:19:19 -05:00
LV V T 2array connect-before
2007-09-20 18:09:08 -04:00
T T connect
2009-01-06 11:19:19 -05:00
LVT T connect
graphemes Extend connect-after
graphemes SpacingMark connect-after
Prepend graphemes connect-before ;
2007-09-20 18:09:08 -04:00
2008-01-29 14:33:14 -05:00
VALUE: grapheme-table
2007-09-20 18:09:08 -04:00
: grapheme-break? ( class1 class2 -- ? )
grapheme-table nth nth not ;
: chars ( i str n -- str[i] str[i+n] )
2008-12-04 11:19:18 -05:00
swap [ dupd + ] dip [ ?nth ] curry bi@ ;
2007-09-20 18:09:08 -04:00
: find-index ( seq quot -- i ) find drop ; inline
: find-last-index ( seq quot -- i ) find-last drop ; inline
2009-01-07 18:59:01 -05:00
PRIVATE>
: first-grapheme ( str -- i )
unclip-slice grapheme-class over
[ grapheme-class tuck grapheme-break? ] find-index
nip swap length or 1+ ;
2009-01-07 18:59:01 -05:00
<PRIVATE
2009-01-07 13:23:07 -05:00
:: (>pieces) ( str quot -- )
str [
dup quot call cut-slice
swap , quot (>pieces)
2009-01-07 16:08:08 -05:00
] unless-empty ; inline recursive
2009-01-07 13:23:07 -05:00
: >pieces ( str quot -- graphemes )
2009-01-07 16:08:08 -05:00
[ (>pieces) ] { } make ; inline
2009-01-07 13:23:07 -05:00
2009-01-07 18:59:01 -05:00
PRIVATE>
2007-09-20 18:09:08 -04:00
: >graphemes ( str -- graphemes )
2009-01-07 13:23:07 -05:00
[ first-grapheme ] >pieces ;
2007-09-20 18:09:08 -04:00
: string-reverse ( str -- rts )
>graphemes reverse concat ;
: last-grapheme ( str -- i )
unclip-last-slice grapheme-class swap
2008-04-13 04:52:40 -04:00
[ grapheme-class dup rot grapheme-break? ] find-last-index ?1+ nip ;
2009-01-07 18:59:01 -05:00
<PRIVATE
2009-01-07 13:23:07 -05:00
graphemes init-table table
[ make-grapheme-table finish-table ] with-variable
to: grapheme-table
2009-01-07 13:23:07 -05:00
! Word breaks
VALUE: word-break-table
"resource:basis/unicode/data/WordBreakProperty.txt" load-script
to: word-break-table
C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter
wMidNum wMidNumLet wNumeric wExtendNumLet words ;
2009-01-07 16:08:08 -05:00
: word-break-classes ( -- table ) ! Is there a way to avoid this?
2009-01-07 13:23:07 -05:00
H{
2009-01-07 16:08:08 -05:00
{ "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 }
{ "Extend" 4 } { "Format" 5 } { "Katakana" 6 }
{ "ALetter" 7 } { "MidLetter" 8 }
{ "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 }
{ "ExtendNumLet" 12 }
} ;
2009-01-07 13:23:07 -05:00
: word-break-prop ( char -- word-break-prop )
word-break-table interval-at
word-break-classes at [ wOther ] unless* ;
: e ( seq -- seq ) [ execute ] map ;
SYMBOL: check-letter-before
SYMBOL: check-letter-after
SYMBOL: check-number-before
SYMBOL: check-number-after
: make-word-table ( -- )
wCR wLF connect
{ wNewline wCR wLF } e words break-around
wALetter dup connect
wALetter { wMidLetter wMidNumLet } e check-letter-after check-before
{ wMidLetter wMidNumLet } e wALetter check-letter-before check-after
wNumeric dup connect
wALetter wNumeric connect
wNumeric wALetter connect
wNumeric { wMidNum wMidNumLet } e check-number-after check-before
{ wMidNum wMidNumLet } e wNumeric check-number-before check-after
wKatakana dup connect
{ wALetter wNumeric wKatakana wExtendNumLet } e wExtendNumLet
[ connect-after ] [ swap connect-before ] 2bi ;
VALUE: word-table
: finish-word-table ( -- table )
table get [
[ { { 0 [ f ] } { 1 [ t ] } [ ] } case ] map
] map ;
words init-table table
[ make-word-table finish-word-table ] with-variable
to: word-table
2009-01-07 16:08:08 -05:00
: word-table-nth ( class1 class2 -- ? )
word-table nth nth ;
: property-not= ( i str property -- ? )
pick [
[ ?nth ] dip swap
[ word-break-prop = not ] [ drop f ] if*
] [ 3drop t ] if ;
: format/extended? ( ch -- ? )
word-break-prop { 4 5 } member? ;
:: walk-up ( str i -- j )
i 1 + str [ format/extended? not ] find-from drop
1+ str [ format/extended? not ] find-from drop ; ! possible bounds error?
:: walk-down ( str i -- j )
i str [ format/extended? not ] find-last-from drop
1- str [ format/extended? not ] find-last-from drop ; ! possible bounds error?
:: word-break? ( table-entry i str -- ? )
table-entry {
{ t [ f ] }
{ f [ t ] }
{ check-letter-after
[ str i walk-up str wALetter property-not= ] }
{ check-letter-before
[ str i walk-down str wALetter property-not= ] }
{ check-number-after
[ str i walk-up str wNumeric property-not= ] }
{ check-number-before
[ str i walk-down str wNumeric property-not= ] }
} case ;
2009-01-07 13:23:07 -05:00
2009-01-07 16:08:08 -05:00
:: word-break-next ( old-class new-char i str -- next-class ? )
new-char word-break-prop dup { 4 5 } member?
[ drop old-class dup { 1 2 3 } member? ]
[ old-class over word-table-nth i str word-break? ] if ;
2009-01-07 13:23:07 -05:00
2009-01-07 18:59:01 -05:00
PRIVATE>
2009-01-07 16:08:08 -05:00
:: first-word ( str -- i )
str unclip-slice word-break-prop over <enum>
[ swap str word-break-next ] assoc-find 2drop
2009-01-07 13:23:07 -05:00
nip swap length or 1+ ;
: >words ( str -- words )
[ first-word ] >pieces ;