diff --git a/basis/ascii/ascii-docs.factor b/basis/ascii/ascii-docs.factor index 6af697cf89..4c783e609c 100644 --- a/basis/ascii/ascii-docs.factor +++ b/basis/ascii/ascii-docs.factor @@ -37,6 +37,26 @@ HELP: quotable? { $values { "ch" "a character" } { "?" "a boolean" } } { $description "Tests for characters which may appear in a Factor string literal without escaping." } ; +HELP: ascii? +{ $values { "ch" "a character" } { "?" "a boolean" } } +{ $description "Tests for whether a number is an ASCII character." } ; + +HELP: ch>lower +{ $values { "ch" "a character" } { "lower" "a character" } } +{ $description "Converts an ASCII character to lower case." } ; + +HELP: ch>upper +{ $values { "ch" "a character" } { "upper" "a character" } } +{ $description "Converts an ASCII character to upper case." } ; + +HELP: >lower +{ $values { "str" "a string" } { "lower" "a string" } } +{ $description "Converts an ASCII string to lower case." } ; + +HELP: >upper +{ $values { "str" "a string" } { "upper" "a string" } } +{ $description "Converts an ASCII string to upper case." } ; + ARTICLE: "ascii" "ASCII character classes" "The " { $vocab-link "ascii" } " vocabulary implements traditional ASCII character classes:" { $subsection blank? } @@ -46,6 +66,12 @@ ARTICLE: "ascii" "ASCII character classes" { $subsection printable? } { $subsection control? } { $subsection quotable? } -"Modern applications should use Unicode 5.0 instead (" { $vocab-link "unicode.categories" } ")." ; +{ $subsection ascii? } +"ASCII case conversion is also implemented:" +{ $subsection ch>lower } +{ $subsection ch>upper } +{ $subsection >lower } +{ $subsection >upper } +"Modern applications should use Unicode 5.1 instead (" { $vocab-link "unicode.categories" } ")." ; ABOUT: "ascii" diff --git a/basis/ascii/ascii-tests.factor b/basis/ascii/ascii-tests.factor index 7dacce734b..6f39b32a01 100644 --- a/basis/ascii/ascii-tests.factor +++ b/basis/ascii/ascii-tests.factor @@ -12,3 +12,8 @@ IN: ascii.tests 0 "There are Four Upper Case characters" [ LETTER? [ 1+ ] when ] each ] unit-test + +[ t f ] [ CHAR: \s ascii? 400 ascii? ] unit-test + +[ "HELLO HOW ARE YOU?" ] [ "hellO hOw arE YOU?" >upper ] unit-test +[ "i'm good thx bai" ] [ "I'm Good THX bai" >lower ] unit-test diff --git a/basis/ascii/ascii.factor b/basis/ascii/ascii.factor index c009c66cde..a64a7b8eb5 100644 --- a/basis/ascii/ascii.factor +++ b/basis/ascii/ascii.factor @@ -4,6 +4,8 @@ USING: kernel math math.order sequences combinators.short-circuit ; IN: ascii +: ascii? ( ch -- ? ) 0 127 between? ; inline + : blank? ( ch -- ? ) " \t\n\r" member? ; inline : letter? ( ch -- ? ) CHAR: a CHAR: z between? ; inline @@ -25,3 +27,15 @@ IN: ascii : alpha? ( ch -- ? ) [ [ Letter? ] [ digit? ] ] 1|| ; + +: ch>lower ( ch -- lower ) + dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ; + +: >lower ( str -- lower ) + [ ch>lower ] map ; + +: ch>upper ( ch -- upper ) + dup CHAR: a CHAR: z between? [ HEX: 20 - ] when ; + +: >upper ( str -- upper ) + [ ch>upper ] map ; diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index 7620652948..dd116f3d7a 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -3,7 +3,10 @@ USING: accessors arrays assocs grouping kernel regexp.backend locals math namespaces regexp.parser sequences fry quotations math.order math.ranges vectors unicode.categories regexp.utils -regexp.transition-tables words sets regexp.classes unicode.case ; +regexp.transition-tables words sets regexp.classes unicode.case.private ; +! This uses unicode.case.private for ch>upper and ch>lower +! but case-insensitive matching should be done by case-folding everything +! before processing starts IN: regexp.nfa SYMBOL: negation-mode @@ -160,6 +163,8 @@ M: LETTER-class nfa-node ( node -- ) M: character-class-range nfa-node ( node -- ) case-insensitive option? [ + ! This should be implemented for Unicode by case-folding + ! the input and all strings in the regexp. dup [ from>> ] [ to>> ] bi 2dup [ Letter? ] bi@ and [ rot drop diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index 25509ec798..2f397538a0 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -3,8 +3,8 @@ USING: accessors arrays assocs combinators io io.streams.string kernel math math.parser namespaces sets quotations sequences splitting vectors math.order -unicode.categories strings regexp.backend regexp.utils -unicode.case words locals regexp.classes ; +strings regexp.backend regexp.utils +unicode.case unicode.categories words locals regexp.classes ; IN: regexp.parser FROM: math.ranges => [a,b] ; @@ -261,7 +261,7 @@ ERROR: bad-escaped-literals seq ; parse-til-E drop1 [ epsilon ] [ - [ quot call ] V{ } map-as + quot call [ ] V{ } map-as first|concatenation ] if-empty ; inline @@ -269,10 +269,10 @@ ERROR: bad-escaped-literals seq ; [ ] (parse-escaped-literals) ; : lower-case-literals ( -- obj ) - [ ch>lower ] (parse-escaped-literals) ; + [ >lower ] (parse-escaped-literals) ; : upper-case-literals ( -- obj ) - [ ch>upper ] (parse-escaped-literals) ; + [ >upper ] (parse-escaped-literals) ; : parse-escaped ( -- obj ) read1 diff --git a/basis/soundex/soundex.factor b/basis/soundex/soundex.factor index 416ec4a6bc..164f634185 100644 --- a/basis/soundex/soundex.factor +++ b/basis/soundex/soundex.factor @@ -1,6 +1,6 @@ ! Copyright (C) 2008 Slava Pestov. ! See http://factorcode.org/license.txt for BSD license. -USING: sequences grouping assocs kernel ascii unicode.case tr ; +USING: sequences grouping assocs kernel ascii ascii tr ; IN: soundex TR: soundex-tr diff --git a/basis/tr/tr-tests.factor b/basis/tr/tr-tests.factor index c168f5384d..3434c28216 100644 --- a/basis/tr/tr-tests.factor +++ b/basis/tr/tr-tests.factor @@ -1,5 +1,5 @@ IN: tr.tests -USING: tr tools.test unicode.case ; +USING: tr tools.test ascii ; TR: tr-test ch>upper "ABC" "XYZ" ; diff --git a/basis/tr/tr.factor b/basis/tr/tr.factor index 66d8df7d44..ce535f335a 100644 --- a/basis/tr/tr.factor +++ b/basis/tr/tr.factor @@ -1,6 +1,6 @@ ! Copyright (C) 2008 Slava Pestov. ! See http://factorcode.org/license.txt for BSD license. -USING: byte-arrays strings sequences sequences.private +USING: byte-arrays strings sequences sequences.private ascii fry kernel words parser lexer assocs math math.order summary ; IN: tr @@ -11,8 +11,6 @@ M: bad-tr summary lower } { $subsection >title } { $subsection >case-fold } -"There are analogous routines which operate on individual code points, but these should " { $emphasis "not be used" } " in general as they have slightly different behavior. In some cases, for example, they do not perform the case operation, as a single code point must expand to more than one." -{ $subsection ch>upper } -{ $subsection ch>lower } -{ $subsection ch>title } "To test if a string is in a given case:" { $subsection upper? } { $subsection lower? } @@ -53,18 +49,3 @@ HELP: title? HELP: case-fold? { $values { "string" string } { "?" "a boolean" } } { $description "Tests if a string is in case-folded form." } ; - -HELP: ch>lower -{ $values { "ch" "a code point" } { "lower" "a code point" } } -{ $description "Converts a code point to lower case." } -{ $warning "Don't use this unless you know what you're doing! " { $code ">lower" } " is not the same as " { $code "[ ch>lower ] map" } "." } ; - -HELP: ch>upper -{ $values { "ch" "a code point" } { "upper" "a code point" } } -{ $description "Converts a code point to upper case." } -{ $warning "Don't use this unless you know what you're doing! " { $code ">upper" } " is not the same as " { $code "[ ch>upper ] map" } "." } ; - -HELP: ch>title -{ $values { "ch" "a code point" } { "title" "a code point" } } -{ $description "Converts a code point to title case." } -{ $warning "Don't use this unless you know what you're doing! " { $code ">title" } " is not the same as " { $code "[ ch>title ] map" } "." } ; diff --git a/basis/unicode/case/case.factor b/basis/unicode/case/case.factor index 99278cd72e..c800205704 100644 --- a/basis/unicode/case/case.factor +++ b/basis/unicode/case/case.factor @@ -7,11 +7,11 @@ IN: unicode.case : ch>lower ( ch -- lower ) simple-lower at-default ; : ch>upper ( ch -- upper ) simple-upper at-default ; : ch>title ( ch -- title ) simple-title at-default ; +PRIVATE> SYMBOL: locale ! Just casing locale, or overall? diff --git a/basis/unicode/data/data.factor b/basis/unicode/data/data.factor index 61a93d9375..6cf913bffa 100644 --- a/basis/unicode/data/data.factor +++ b/basis/unicode/data/data.factor @@ -128,12 +128,9 @@ VALUE: properties cat categories index char table ?set-nth ] assoc-each table fill-ranges ] ; -: ascii-lower ( string -- lower ) - [ dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ] map ; - : process-names ( data -- names-hash ) 1 swap (process-data) [ - ascii-lower { { CHAR: \s CHAR: - } } substitute swap + >lower { { CHAR: \s CHAR: - } } substitute swap ] H{ } assoc-map-as ; : multihex ( hexstring -- string ) diff --git a/basis/unicode/normalize/normalize.factor b/basis/unicode/normalize/normalize.factor index c8d0eb3f7d..2fbe2e1843 100644 --- a/basis/unicode/normalize/normalize.factor +++ b/basis/unicode/normalize/normalize.factor @@ -1,7 +1,8 @@ ! Copyright (C) 2008 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: sequences namespaces make unicode.data kernel math arrays -locals sorting.insertion accessors assocs math.order combinators ; +locals sorting.insertion accessors assocs math.order combinators +unicode.syntax ; IN: unicode.normalize upper "ACGTUMRYKVHDB" "TGCAAKYRMBDHV" ; diff --git a/extra/parser-combinators/regexp/regexp.factor b/extra/parser-combinators/regexp/regexp.factor index 2becd937f2..1c94308e93 100755 --- a/extra/parser-combinators/regexp/regexp.factor +++ b/extra/parser-combinators/regexp/regexp.factor @@ -2,7 +2,7 @@ USING: arrays combinators kernel lists math math.parser namespaces parser lexer parser-combinators parser-combinators.simple promises quotations sequences strings math.order assocs prettyprint.backend prettyprint.custom memoize -unicode.case unicode.categories combinators.short-circuit +ascii unicode.categories combinators.short-circuit accessors make io ; IN: parser-combinators.regexp