Unicode word breaks

db4
Daniel Ehrenberg 2009-01-07 15:08:08 -06:00
parent a860ae82f1
commit b3d175821a
2 changed files with 49 additions and 22 deletions

View File

@ -36,4 +36,4 @@ IN: unicode.breaks.tests
] each ;
grapheme-break-test parse-test-file [ >graphemes ] test
! word-break-test parse-test-file [ >words ] test
word-break-test parse-test-file [ >words ] test

View File

@ -110,10 +110,10 @@ VALUE: grapheme-table
str [
dup quot call cut-slice
swap , quot (>pieces)
] unless-empty ;
] unless-empty ; inline recursive
: >pieces ( str quot -- graphemes )
[ (>pieces) ] { } make ;
[ (>pieces) ] { } make ; inline
: >graphemes ( str -- graphemes )
[ first-grapheme ] >pieces ;
@ -139,14 +139,14 @@ to: word-break-table
C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter
wMidNum wMidNumLet wNumeric wExtendNumLet words ;
MEMO: word-break-classes ( -- table )
: word-break-classes ( -- table ) ! Is there a way to avoid this?
H{
{ "Other" wOther } { "CR" wCR } { "LF" wLF } { "Newline" wNewline }
{ "Extend" wExtend } { "Format" wFormat } { "Katakana" wKatakana }
{ "ALetter" wALetter } { "MidLetter" wMidLetter }
{ "MidNum" wMidNum } { "MidNumLet" wMidNumLet } { "Numeric" wNumeric }
{ "ExtendNumLet" wExtendNumLet }
} [ execute ] assoc-map ;
{ "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 }
{ "Extend" 4 } { "Format" 5 } { "Katakana" 6 }
{ "ALetter" 7 } { "MidLetter" 8 }
{ "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 }
{ "ExtendNumLet" 12 }
} ;
: word-break-prop ( char -- word-break-prop )
word-break-table interval-at
@ -185,22 +185,49 @@ words init-table table
[ make-word-table finish-word-table ] with-variable
to: word-table
: word-break? ( class1 class2 -- ? )
word-table nth nth not ;
: word-table-nth ( class1 class2 -- ? )
word-table nth nth ;
: skip? ( char -- ? )
word-break-prop { 4 5 } member? ; ! wExtend or wFormat
: property-not= ( i str property -- ? )
pick [
[ ?nth ] dip swap
[ word-break-prop = not ] [ drop f ] if*
] [ 3drop t ] if ;
: word-break-next ( old-class new-char -- next-class ? )
word-break-prop dup { 4 5 } member?
[ drop f ] [ tuck word-break? ] if ;
: format/extended? ( ch -- ? )
word-break-prop { 4 5 } member? ;
: first-word ( str -- i )
unclip-slice word-break-prop over
[ word-break-next ] find-index
:: walk-up ( str i -- j )
i 1 + str [ format/extended? not ] find-from drop
1+ str [ format/extended? not ] find-from drop ; ! possible bounds error?
:: walk-down ( str i -- j )
i str [ format/extended? not ] find-last-from drop
1- str [ format/extended? not ] find-last-from drop ; ! possible bounds error?
:: word-break? ( table-entry i str -- ? )
table-entry {
{ t [ f ] }
{ f [ t ] }
{ check-letter-after
[ str i walk-up str wALetter property-not= ] }
{ check-letter-before
[ str i walk-down str wALetter property-not= ] }
{ check-number-after
[ str i walk-up str wNumeric property-not= ] }
{ check-number-before
[ str i walk-down str wNumeric property-not= ] }
} case ;
:: word-break-next ( old-class new-char i str -- next-class ? )
new-char word-break-prop dup { 4 5 } member?
[ drop old-class dup { 1 2 3 } member? ]
[ old-class over word-table-nth i str word-break? ] if ;
:: first-word ( str -- i )
str unclip-slice word-break-prop over <enum>
[ swap str word-break-next ] assoc-find 2drop
nip swap length or 1+ ;
! This must be changed to ignore format/extended chars and
! handle symbols in the table specially
: >words ( str -- words )
[ first-word ] >pieces ;