Unicode word breaks
parent
a860ae82f1
commit
b3d175821a
|
@ -36,4 +36,4 @@ IN: unicode.breaks.tests
|
|||
] each ;
|
||||
|
||||
grapheme-break-test parse-test-file [ >graphemes ] test
|
||||
! word-break-test parse-test-file [ >words ] test
|
||||
word-break-test parse-test-file [ >words ] test
|
||||
|
|
|
@ -110,10 +110,10 @@ VALUE: grapheme-table
|
|||
str [
|
||||
dup quot call cut-slice
|
||||
swap , quot (>pieces)
|
||||
] unless-empty ;
|
||||
] unless-empty ; inline recursive
|
||||
|
||||
: >pieces ( str quot -- graphemes )
|
||||
[ (>pieces) ] { } make ;
|
||||
[ (>pieces) ] { } make ; inline
|
||||
|
||||
: >graphemes ( str -- graphemes )
|
||||
[ first-grapheme ] >pieces ;
|
||||
|
@ -139,14 +139,14 @@ to: word-break-table
|
|||
C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter
|
||||
wMidNum wMidNumLet wNumeric wExtendNumLet words ;
|
||||
|
||||
MEMO: word-break-classes ( -- table )
|
||||
: word-break-classes ( -- table ) ! Is there a way to avoid this?
|
||||
H{
|
||||
{ "Other" wOther } { "CR" wCR } { "LF" wLF } { "Newline" wNewline }
|
||||
{ "Extend" wExtend } { "Format" wFormat } { "Katakana" wKatakana }
|
||||
{ "ALetter" wALetter } { "MidLetter" wMidLetter }
|
||||
{ "MidNum" wMidNum } { "MidNumLet" wMidNumLet } { "Numeric" wNumeric }
|
||||
{ "ExtendNumLet" wExtendNumLet }
|
||||
} [ execute ] assoc-map ;
|
||||
{ "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 }
|
||||
{ "Extend" 4 } { "Format" 5 } { "Katakana" 6 }
|
||||
{ "ALetter" 7 } { "MidLetter" 8 }
|
||||
{ "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 }
|
||||
{ "ExtendNumLet" 12 }
|
||||
} ;
|
||||
|
||||
: word-break-prop ( char -- word-break-prop )
|
||||
word-break-table interval-at
|
||||
|
@ -185,22 +185,49 @@ words init-table table
|
|||
[ make-word-table finish-word-table ] with-variable
|
||||
to: word-table
|
||||
|
||||
: word-break? ( class1 class2 -- ? )
|
||||
word-table nth nth not ;
|
||||
: word-table-nth ( class1 class2 -- ? )
|
||||
word-table nth nth ;
|
||||
|
||||
: skip? ( char -- ? )
|
||||
word-break-prop { 4 5 } member? ; ! wExtend or wFormat
|
||||
: property-not= ( i str property -- ? )
|
||||
pick [
|
||||
[ ?nth ] dip swap
|
||||
[ word-break-prop = not ] [ drop f ] if*
|
||||
] [ 3drop t ] if ;
|
||||
|
||||
: word-break-next ( old-class new-char -- next-class ? )
|
||||
word-break-prop dup { 4 5 } member?
|
||||
[ drop f ] [ tuck word-break? ] if ;
|
||||
: format/extended? ( ch -- ? )
|
||||
word-break-prop { 4 5 } member? ;
|
||||
|
||||
: first-word ( str -- i )
|
||||
unclip-slice word-break-prop over
|
||||
[ word-break-next ] find-index
|
||||
:: walk-up ( str i -- j )
|
||||
i 1 + str [ format/extended? not ] find-from drop
|
||||
1+ str [ format/extended? not ] find-from drop ; ! possible bounds error?
|
||||
|
||||
:: walk-down ( str i -- j )
|
||||
i str [ format/extended? not ] find-last-from drop
|
||||
1- str [ format/extended? not ] find-last-from drop ; ! possible bounds error?
|
||||
|
||||
:: word-break? ( table-entry i str -- ? )
|
||||
table-entry {
|
||||
{ t [ f ] }
|
||||
{ f [ t ] }
|
||||
{ check-letter-after
|
||||
[ str i walk-up str wALetter property-not= ] }
|
||||
{ check-letter-before
|
||||
[ str i walk-down str wALetter property-not= ] }
|
||||
{ check-number-after
|
||||
[ str i walk-up str wNumeric property-not= ] }
|
||||
{ check-number-before
|
||||
[ str i walk-down str wNumeric property-not= ] }
|
||||
} case ;
|
||||
|
||||
:: word-break-next ( old-class new-char i str -- next-class ? )
|
||||
new-char word-break-prop dup { 4 5 } member?
|
||||
[ drop old-class dup { 1 2 3 } member? ]
|
||||
[ old-class over word-table-nth i str word-break? ] if ;
|
||||
|
||||
:: first-word ( str -- i )
|
||||
str unclip-slice word-break-prop over <enum>
|
||||
[ swap str word-break-next ] assoc-find 2drop
|
||||
nip swap length or 1+ ;
|
||||
! This must be changed to ignore format/extended chars and
|
||||
! handle symbols in the table specially
|
||||
|
||||
: >words ( str -- words )
|
||||
[ first-word ] >pieces ;
|
||||
|
|
Loading…
Reference in New Issue