Privatizing unicode.case:ch>{lower,upper,title}

2009-01-08 19:07:46 -06:00 · 2009-01-08 19:07:46 -06:00 · ad53cb8635
parent 0aec786359
commit ad53cb8635
15 changed files with 67 additions and 40 deletions
--- a/basis/ascii/ascii-docs.factor
+++ b/basis/ascii/ascii-docs.factor
@ -37,6 +37,26 @@ HELP: quotable?
 { $values { "ch" "a character" } { "?" "a boolean" } }
 { $description "Tests for characters which may appear in a Factor string literal without escaping." } ;

+HELP: ascii?
+{ $values { "ch" "a character" } { "?" "a boolean" } }
+{ $description "Tests for whether a number is an ASCII character." } ;
+
+HELP: ch>lower
+{ $values { "ch" "a character" } { "lower" "a character" } }
+{ $description "Converts an ASCII character to lower case." } ;
+
+HELP: ch>upper
+{ $values { "ch" "a character" } { "upper" "a character" } }
+{ $description "Converts an ASCII character to upper case." } ;
+
+HELP: >lower
+{ $values { "str" "a string" } { "lower" "a string" } }
+{ $description "Converts an ASCII string to lower case." } ;
+
+HELP: >upper
+{ $values { "str" "a string" } { "upper" "a string" } }
+{ $description "Converts an ASCII string to upper case." } ;
+
 ARTICLE: "ascii" "ASCII character classes"
 "The " { $vocab-link "ascii" } " vocabulary implements traditional ASCII character classes:"
 { $subsection blank? }
@ -46,6 +66,12 @@ ARTICLE: "ascii" "ASCII character classes"
 { $subsection printable? }
 { $subsection control? }
 { $subsection quotable? }
-"Modern applications should use Unicode 5.0 instead (" { $vocab-link "unicode.categories" } ")." ;
+{ $subsection ascii? }
+"ASCII case conversion is also implemented:"
+{ $subsection ch>lower }
+{ $subsection ch>upper }
+{ $subsection >lower }
+{ $subsection >upper }
+"Modern applications should use Unicode 5.1 instead (" { $vocab-link "unicode.categories" } ")." ;

 ABOUT: "ascii"
--- a/basis/ascii/ascii-tests.factor
+++ b/basis/ascii/ascii-tests.factor
@ -12,3 +12,8 @@ IN: ascii.tests
    0 "There are Four Upper Case characters"
    [ LETTER? [ 1+ ] when ] each
 ] unit-test
+
+[ t f ] [ CHAR: \s ascii? 400 ascii? ] unit-test
+
+[ "HELLO HOW ARE YOU?" ] [ "hellO hOw arE YOU?" >upper ] unit-test
+[ "i'm good thx bai" ] [ "I'm Good THX bai" >lower ] unit-test
--- a/basis/ascii/ascii.factor
+++ b/basis/ascii/ascii.factor
@ -4,6 +4,8 @@ USING: kernel math math.order sequences
 combinators.short-circuit ;
 IN: ascii

+: ascii? ( ch -- ? ) 0 127 between? ; inline
+
 : blank? ( ch -- ? ) " \t\n\r" member? ; inline

 : letter? ( ch -- ? ) CHAR: a CHAR: z between? ; inline
@ -25,3 +27,15 @@ IN: ascii

 : alpha? ( ch -- ? )
    [ [ Letter? ] [ digit? ] ] 1|| ;
+
+: ch>lower ( ch -- lower )
+   dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ;
+
+: >lower ( str -- lower )
+   [ ch>lower ] map ;
+
+: ch>upper ( ch -- upper )
+    dup CHAR: a CHAR: z between? [ HEX: 20 - ] when ;
+
+: >upper ( str -- upper )
+    [ ch>upper ] map ;
--- a/basis/regexp/nfa/nfa.factor
+++ b/basis/regexp/nfa/nfa.factor
@ -3,7 +3,10 @@
 USING: accessors arrays assocs grouping kernel regexp.backend
 locals math namespaces regexp.parser sequences fry quotations
 math.order math.ranges vectors unicode.categories regexp.utils
-regexp.transition-tables words sets regexp.classes unicode.case ;
+regexp.transition-tables words sets regexp.classes unicode.case.private ;
+! This uses unicode.case.private for ch>upper and ch>lower
+! but case-insensitive matching should be done by case-folding everything
+! before processing starts
 IN: regexp.nfa

 SYMBOL: negation-mode
@ -160,6 +163,8 @@ M: LETTER-class nfa-node ( node -- )

 M: character-class-range nfa-node ( node -- )
    case-insensitive option? [
+        ! This should be implemented for Unicode by case-folding
+        ! the input and all strings in the regexp.
        dup [ from>> ] [ to>> ] bi
        2dup [ Letter? ] bi@ and [
            rot drop
--- a/basis/regexp/parser/parser.factor
+++ b/basis/regexp/parser/parser.factor
@ -3,8 +3,8 @@
 USING: accessors arrays assocs combinators io io.streams.string
 kernel math math.parser namespaces sets
 quotations sequences splitting vectors math.order
-unicode.categories strings regexp.backend regexp.utils
-unicode.case words locals regexp.classes ;
+strings regexp.backend regexp.utils
+unicode.case unicode.categories words locals regexp.classes ;
 IN: regexp.parser

 FROM: math.ranges => [a,b] ;
@ -261,7 +261,7 @@ ERROR: bad-escaped-literals seq ;
    parse-til-E
    drop1
    [ epsilon ] [
-        [ quot call <constant> ] V{ } map-as
+        quot call [ <constant> ] V{ } map-as
        first|concatenation
    ] if-empty ; inline

@ -269,10 +269,10 @@ ERROR: bad-escaped-literals seq ;
    [ ] (parse-escaped-literals) ;

 : lower-case-literals ( -- obj )
-    [ ch>lower ] (parse-escaped-literals) ;
+    [ >lower ] (parse-escaped-literals) ;

 : upper-case-literals ( -- obj )
-    [ ch>upper ] (parse-escaped-literals) ;
+    [ >upper ] (parse-escaped-literals) ;

 : parse-escaped ( -- obj )
    read1
--- a/basis/soundex/soundex.factor
+++ b/basis/soundex/soundex.factor
@ -1,6 +1,6 @@
 ! Copyright (C) 2008 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: sequences grouping assocs kernel ascii unicode.case tr ;
+USING: sequences grouping assocs kernel ascii ascii tr ;
 IN: soundex

 TR: soundex-tr
--- a/basis/tr/tr-tests.factor
+++ b/basis/tr/tr-tests.factor
@ -1,5 +1,5 @@
 IN: tr.tests
-USING: tr tools.test unicode.case ;
+USING: tr tools.test ascii ;

 TR: tr-test ch>upper "ABC" "XYZ" ;

--- a/basis/tr/tr.factor
+++ b/basis/tr/tr.factor
@ -1,6 +1,6 @@
 ! Copyright (C) 2008 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: byte-arrays strings sequences sequences.private
+USING: byte-arrays strings sequences sequences.private ascii
 fry kernel words parser lexer assocs math math.order summary ;
 IN: tr

@ -11,8 +11,6 @@ M: bad-tr summary

 <PRIVATE

-: ascii? ( ch -- ? ) 0 127 between? ; inline
-
 : tr-nth ( n mapping -- ch ) nth-unsafe 127 bitand ; inline

 : check-tr ( from to -- )
--- a/basis/unicode/case/case-docs.factor
+++ b/basis/unicode/case/case-docs.factor
@ -9,10 +9,6 @@ ARTICLE: "unicode.case" "Case mapping"
 { $subsection >lower }
 { $subsection >title }
 { $subsection >case-fold }
-"There are analogous routines which operate on individual code points, but these should " { $emphasis "not be used" } " in general as they have slightly different behavior. In some cases, for example, they do not perform the case operation, as a single code point must expand to more than one."
-{ $subsection ch>upper }
-{ $subsection ch>lower }
-{ $subsection ch>title }
 "To test if a string is in a given case:"
 { $subsection upper? }
 { $subsection lower? }
@ -53,18 +49,3 @@ HELP: title?
 HELP: case-fold?
 { $values { "string" string } { "?" "a boolean" } }
 { $description "Tests if a string is in case-folded form." } ;
-
-HELP: ch>lower
-{ $values { "ch" "a code point" } { "lower" "a code point" } }
-{ $description "Converts a code point to lower case." }
-{ $warning "Don't use this unless you know what you're doing! " { $code ">lower" } " is not the same as " { $code "[ ch>lower ] map" } "." } ;
-
-HELP: ch>upper
-{ $values { "ch" "a code point" } { "upper" "a code point" } }
-{ $description "Converts a code point to upper case." }
-{ $warning "Don't use this unless you know what you're doing! " { $code ">upper" } " is not the same as " { $code "[ ch>upper ] map" } "." } ;
-
-HELP: ch>title
-{ $values { "ch" "a code point" } { "title" "a code point" } }
-{ $description "Converts a code point to title case." }
-{ $warning "Don't use this unless you know what you're doing! " { $code ">title" } " is not the same as " { $code "[ ch>title ] map" } "." } ;
--- a/basis/unicode/case/case.factor
+++ b/basis/unicode/case/case.factor
@ -7,11 +7,11 @@ IN: unicode.case

 <PRIVATE
 : at-default ( key assoc -- value/key ) [ at ] [ drop ] 2bi or ;
-PRIVATE>

 : ch>lower ( ch -- lower ) simple-lower at-default ;
 : ch>upper ( ch -- upper ) simple-upper at-default ;
 : ch>title ( ch -- title ) simple-title at-default ;
+PRIVATE>

 SYMBOL: locale ! Just casing locale, or overall?

--- a/basis/unicode/data/data.factor
+++ b/basis/unicode/data/data.factor
@ -128,12 +128,9 @@ VALUE: properties
            cat categories index char table ?set-nth
        ] assoc-each table fill-ranges ] ;

-: ascii-lower ( string -- lower )
-    [ dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ] map ;
-
 : process-names ( data -- names-hash )
    1 swap (process-data) [
-        ascii-lower { { CHAR: \s CHAR: - } } substitute swap
+        >lower { { CHAR: \s CHAR: - } } substitute swap
    ] H{ } assoc-map-as ;

 : multihex ( hexstring -- string )
--- a/basis/unicode/normalize/normalize.factor
+++ b/basis/unicode/normalize/normalize.factor
@ -1,7 +1,8 @@
 ! Copyright (C) 2008 Daniel Ehrenberg.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: sequences namespaces make unicode.data kernel math arrays
-locals sorting.insertion accessors assocs math.order combinators ;
+locals sorting.insertion accessors assocs math.order combinators
+unicode.syntax ;
 IN: unicode.normalize

 <PRIVATE
--- a/basis/xmode/marker/marker.factor
+++ b/basis/xmode/marker/marker.factor
@ -5,7 +5,7 @@ USING: kernel namespaces make xmode.rules xmode.tokens
 xmode.marker.state xmode.marker.context xmode.utilities
 xmode.catalog sequences math assocs combinators strings
 parser-combinators.regexp splitting parser-combinators ascii
-unicode.case combinators.short-circuit accessors ;
+ascii combinators.short-circuit accessors ;

 ! Based on org.gjt.sp.jedit.syntax.TokenMarker

--- a/extra/benchmark/reverse-complement/reverse-complement.factor
+++ b/extra/benchmark/reverse-complement/reverse-complement.factor
@ -3,7 +3,7 @@
 USING: io io.files io.files.temp io.streams.duplex kernel
 sequences sequences.private strings vectors words memoize
 splitting grouping hints tr continuations io.encodings.ascii
-unicode.case ;
+ascii ;
 IN: benchmark.reverse-complement

 TR: trans-map ch>upper "ACGTUMRYKVHDB" "TGCAAKYRMBDHV" ;
--- a/extra/parser-combinators/regexp/regexp.factor
+++ b/extra/parser-combinators/regexp/regexp.factor
@ -2,7 +2,7 @@ USING: arrays combinators kernel lists math math.parser
 namespaces parser lexer parser-combinators
 parser-combinators.simple promises quotations sequences strings
 math.order assocs prettyprint.backend prettyprint.custom memoize
-unicode.case unicode.categories combinators.short-circuit
+ascii unicode.categories combinators.short-circuit
 accessors make io ;
 IN: parser-combinators.regexp