Unicode word breaks

2009-01-07 15:08:08 -06:00 · 2009-01-07 15:08:08 -06:00 · b3d175821a
parent a860ae82f1
commit b3d175821a
2 changed files with 49 additions and 22 deletions
--- a/basis/unicode/breaks/breaks-tests.factor
+++ b/basis/unicode/breaks/breaks-tests.factor
@ -36,4 +36,4 @@ IN: unicode.breaks.tests
    ] each ;

 grapheme-break-test parse-test-file [ >graphemes ] test
-! word-break-test parse-test-file [ >words ] test
+word-break-test parse-test-file [ >words ] test
--- a/basis/unicode/breaks/breaks.factor
+++ b/basis/unicode/breaks/breaks.factor
@ -110,10 +110,10 @@ VALUE: grapheme-table
    str [
        dup quot call cut-slice
        swap , quot (>pieces)
-    ] unless-empty ;
+    ] unless-empty ; inline recursive

 : >pieces ( str quot -- graphemes )
-    [ (>pieces) ] { } make ;
+    [ (>pieces) ] { } make ; inline

 : >graphemes ( str -- graphemes )
    [ first-grapheme ] >pieces ;
@ -139,14 +139,14 @@ to: word-break-table
 C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter
 wMidNum wMidNumLet wNumeric wExtendNumLet words ;

-MEMO: word-break-classes ( -- table )
+: word-break-classes ( -- table ) ! Is there a way to avoid this?
    H{
-        { "Other" wOther } { "CR" wCR } { "LF" wLF } { "Newline" wNewline }
-        { "Extend" wExtend } { "Format" wFormat } { "Katakana" wKatakana }
-        { "ALetter" wALetter } { "MidLetter" wMidLetter }
-        { "MidNum" wMidNum } { "MidNumLet" wMidNumLet } { "Numeric" wNumeric }
-        { "ExtendNumLet" wExtendNumLet }
-    } [ execute ] assoc-map ;
+        { "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 }
+        { "Extend" 4 } { "Format" 5 } { "Katakana" 6 }
+        { "ALetter" 7 } { "MidLetter" 8 }
+        { "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 }
+        { "ExtendNumLet" 12 }
+    } ;

 : word-break-prop ( char -- word-break-prop )
    word-break-table interval-at
@ -185,22 +185,49 @@ words init-table table
 [ make-word-table finish-word-table ] with-variable
 to: word-table

-: word-break? ( class1 class2 -- ? )
-    word-table nth nth not ;
+: word-table-nth ( class1 class2 -- ? )
+    word-table nth nth ;

-: skip? ( char -- ? )
-    word-break-prop { 4 5 } member? ; ! wExtend or wFormat
+: property-not= ( i str property -- ? )
+    pick [
+        [ ?nth ] dip swap
+        [ word-break-prop = not ] [ drop f ] if*
+    ] [ 3drop t ] if ;

-: word-break-next ( old-class new-char -- next-class ? )
-    word-break-prop dup { 4 5 } member?
-    [ drop f ] [ tuck word-break? ] if ;
+: format/extended? ( ch -- ? )
+    word-break-prop { 4 5 } member? ;

-: first-word ( str -- i )
-    unclip-slice word-break-prop over
-    [ word-break-next ] find-index
+:: walk-up ( str i -- j )
+    i 1 + str [ format/extended? not ] find-from drop
+    1+ str [ format/extended? not ] find-from drop ; ! possible bounds error?
+
+:: walk-down ( str i -- j )
+    i str [ format/extended? not ] find-last-from drop
+    1- str [ format/extended? not ] find-last-from drop ; ! possible bounds error?
+
+:: word-break? ( table-entry i str -- ? )
+    table-entry {
+        { t [ f ] }
+        { f [ t ] }
+        { check-letter-after
+            [ str i walk-up str wALetter property-not= ] }
+        { check-letter-before
+            [ str i walk-down str wALetter property-not= ] }
+        { check-number-after
+            [ str i walk-up str wNumeric property-not= ] }
+        { check-number-before
+            [ str i walk-down str wNumeric property-not= ] }
+    } case ;
+
+:: word-break-next ( old-class new-char i str -- next-class ? )
+    new-char word-break-prop dup { 4 5 } member?
+    [ drop old-class dup { 1 2 3 } member? ]
+    [ old-class over word-table-nth i str word-break? ] if ;
+
+:: first-word ( str -- i )
+    str unclip-slice word-break-prop over <enum>
+    [ swap str word-break-next ] assoc-find 2drop
    nip swap length or 1+ ;
-! This must be changed to ignore format/extended chars and
-! handle symbols in the table specially

 : >words ( str -- words )
    [ first-word ] >pieces ;