diff --git a/basis/unicode/breaks/breaks-tests.factor b/basis/unicode/breaks/breaks-tests.factor index 39baa8f808..b91cb2b26c 100644 --- a/basis/unicode/breaks/breaks-tests.factor +++ b/basis/unicode/breaks/breaks-tests.factor @@ -36,4 +36,4 @@ IN: unicode.breaks.tests ] each ; grapheme-break-test parse-test-file [ >graphemes ] test -! word-break-test parse-test-file [ >words ] test +word-break-test parse-test-file [ >words ] test diff --git a/basis/unicode/breaks/breaks.factor b/basis/unicode/breaks/breaks.factor index 9d2bad4724..5652cc2906 100644 --- a/basis/unicode/breaks/breaks.factor +++ b/basis/unicode/breaks/breaks.factor @@ -110,10 +110,10 @@ VALUE: grapheme-table str [ dup quot call cut-slice swap , quot (>pieces) - ] unless-empty ; + ] unless-empty ; inline recursive : >pieces ( str quot -- graphemes ) - [ (>pieces) ] { } make ; + [ (>pieces) ] { } make ; inline : >graphemes ( str -- graphemes ) [ first-grapheme ] >pieces ; @@ -139,14 +139,14 @@ to: word-break-table C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter wMidNum wMidNumLet wNumeric wExtendNumLet words ; -MEMO: word-break-classes ( -- table ) +: word-break-classes ( -- table ) ! Is there a way to avoid this? H{ - { "Other" wOther } { "CR" wCR } { "LF" wLF } { "Newline" wNewline } - { "Extend" wExtend } { "Format" wFormat } { "Katakana" wKatakana } - { "ALetter" wALetter } { "MidLetter" wMidLetter } - { "MidNum" wMidNum } { "MidNumLet" wMidNumLet } { "Numeric" wNumeric } - { "ExtendNumLet" wExtendNumLet } - } [ execute ] assoc-map ; + { "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 } + { "Extend" 4 } { "Format" 5 } { "Katakana" 6 } + { "ALetter" 7 } { "MidLetter" 8 } + { "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 } + { "ExtendNumLet" 12 } + } ; : word-break-prop ( char -- word-break-prop ) word-break-table interval-at @@ -185,22 +185,49 @@ words init-table table [ make-word-table finish-word-table ] with-variable to: word-table -: word-break? ( class1 class2 -- ? ) - word-table nth nth not ; +: word-table-nth ( class1 class2 -- ? ) + word-table nth nth ; -: skip? ( char -- ? ) - word-break-prop { 4 5 } member? ; ! wExtend or wFormat +: property-not= ( i str property -- ? ) + pick [ + [ ?nth ] dip swap + [ word-break-prop = not ] [ drop f ] if* + ] [ 3drop t ] if ; -: word-break-next ( old-class new-char -- next-class ? ) - word-break-prop dup { 4 5 } member? - [ drop f ] [ tuck word-break? ] if ; +: format/extended? ( ch -- ? ) + word-break-prop { 4 5 } member? ; -: first-word ( str -- i ) - unclip-slice word-break-prop over - [ word-break-next ] find-index +:: walk-up ( str i -- j ) + i 1 + str [ format/extended? not ] find-from drop + 1+ str [ format/extended? not ] find-from drop ; ! possible bounds error? + +:: walk-down ( str i -- j ) + i str [ format/extended? not ] find-last-from drop + 1- str [ format/extended? not ] find-last-from drop ; ! possible bounds error? + +:: word-break? ( table-entry i str -- ? ) + table-entry { + { t [ f ] } + { f [ t ] } + { check-letter-after + [ str i walk-up str wALetter property-not= ] } + { check-letter-before + [ str i walk-down str wALetter property-not= ] } + { check-number-after + [ str i walk-up str wNumeric property-not= ] } + { check-number-before + [ str i walk-down str wNumeric property-not= ] } + } case ; + +:: word-break-next ( old-class new-char i str -- next-class ? ) + new-char word-break-prop dup { 4 5 } member? + [ drop old-class dup { 1 2 3 } member? ] + [ old-class over word-table-nth i str word-break? ] if ; + +:: first-word ( str -- i ) + str unclip-slice word-break-prop over + [ swap str word-break-next ] assoc-find 2drop nip swap length or 1+ ; -! This must be changed to ignore format/extended chars and -! handle symbols in the table specially : >words ( str -- words ) [ first-word ] >pieces ;