Privatizing unicode.case:ch>{lower,upper,title}

db4
Daniel Ehrenberg 2009-01-08 19:07:46 -06:00
parent 0aec786359
commit ad53cb8635
15 changed files with 67 additions and 40 deletions

View File

@ -37,6 +37,26 @@ HELP: quotable?
{ $values { "ch" "a character" } { "?" "a boolean" } }
{ $description "Tests for characters which may appear in a Factor string literal without escaping." } ;
HELP: ascii?
{ $values { "ch" "a character" } { "?" "a boolean" } }
{ $description "Tests for whether a number is an ASCII character." } ;
HELP: ch>lower
{ $values { "ch" "a character" } { "lower" "a character" } }
{ $description "Converts an ASCII character to lower case." } ;
HELP: ch>upper
{ $values { "ch" "a character" } { "upper" "a character" } }
{ $description "Converts an ASCII character to upper case." } ;
HELP: >lower
{ $values { "str" "a string" } { "lower" "a string" } }
{ $description "Converts an ASCII string to lower case." } ;
HELP: >upper
{ $values { "str" "a string" } { "upper" "a string" } }
{ $description "Converts an ASCII string to upper case." } ;
ARTICLE: "ascii" "ASCII character classes"
"The " { $vocab-link "ascii" } " vocabulary implements traditional ASCII character classes:"
{ $subsection blank? }
@ -46,6 +66,12 @@ ARTICLE: "ascii" "ASCII character classes"
{ $subsection printable? }
{ $subsection control? }
{ $subsection quotable? }
"Modern applications should use Unicode 5.0 instead (" { $vocab-link "unicode.categories" } ")." ;
{ $subsection ascii? }
"ASCII case conversion is also implemented:"
{ $subsection ch>lower }
{ $subsection ch>upper }
{ $subsection >lower }
{ $subsection >upper }
"Modern applications should use Unicode 5.1 instead (" { $vocab-link "unicode.categories" } ")." ;
ABOUT: "ascii"

View File

@ -12,3 +12,8 @@ IN: ascii.tests
0 "There are Four Upper Case characters"
[ LETTER? [ 1+ ] when ] each
] unit-test
[ t f ] [ CHAR: \s ascii? 400 ascii? ] unit-test
[ "HELLO HOW ARE YOU?" ] [ "hellO hOw arE YOU?" >upper ] unit-test
[ "i'm good thx bai" ] [ "I'm Good THX bai" >lower ] unit-test

View File

@ -4,6 +4,8 @@ USING: kernel math math.order sequences
combinators.short-circuit ;
IN: ascii
: ascii? ( ch -- ? ) 0 127 between? ; inline
: blank? ( ch -- ? ) " \t\n\r" member? ; inline
: letter? ( ch -- ? ) CHAR: a CHAR: z between? ; inline
@ -25,3 +27,15 @@ IN: ascii
: alpha? ( ch -- ? )
[ [ Letter? ] [ digit? ] ] 1|| ;
: ch>lower ( ch -- lower )
dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ;
: >lower ( str -- lower )
[ ch>lower ] map ;
: ch>upper ( ch -- upper )
dup CHAR: a CHAR: z between? [ HEX: 20 - ] when ;
: >upper ( str -- upper )
[ ch>upper ] map ;

View File

@ -3,7 +3,10 @@
USING: accessors arrays assocs grouping kernel regexp.backend
locals math namespaces regexp.parser sequences fry quotations
math.order math.ranges vectors unicode.categories regexp.utils
regexp.transition-tables words sets regexp.classes unicode.case ;
regexp.transition-tables words sets regexp.classes unicode.case.private ;
! This uses unicode.case.private for ch>upper and ch>lower
! but case-insensitive matching should be done by case-folding everything
! before processing starts
IN: regexp.nfa
SYMBOL: negation-mode
@ -160,6 +163,8 @@ M: LETTER-class nfa-node ( node -- )
M: character-class-range nfa-node ( node -- )
case-insensitive option? [
! This should be implemented for Unicode by case-folding
! the input and all strings in the regexp.
dup [ from>> ] [ to>> ] bi
2dup [ Letter? ] bi@ and [
rot drop

View File

@ -3,8 +3,8 @@
USING: accessors arrays assocs combinators io io.streams.string
kernel math math.parser namespaces sets
quotations sequences splitting vectors math.order
unicode.categories strings regexp.backend regexp.utils
unicode.case words locals regexp.classes ;
strings regexp.backend regexp.utils
unicode.case unicode.categories words locals regexp.classes ;
IN: regexp.parser
FROM: math.ranges => [a,b] ;
@ -261,7 +261,7 @@ ERROR: bad-escaped-literals seq ;
parse-til-E
drop1
[ epsilon ] [
[ quot call <constant> ] V{ } map-as
quot call [ <constant> ] V{ } map-as
first|concatenation
] if-empty ; inline
@ -269,10 +269,10 @@ ERROR: bad-escaped-literals seq ;
[ ] (parse-escaped-literals) ;
: lower-case-literals ( -- obj )
[ ch>lower ] (parse-escaped-literals) ;
[ >lower ] (parse-escaped-literals) ;
: upper-case-literals ( -- obj )
[ ch>upper ] (parse-escaped-literals) ;
[ >upper ] (parse-escaped-literals) ;
: parse-escaped ( -- obj )
read1

View File

@ -1,6 +1,6 @@
! Copyright (C) 2008 Slava Pestov.
! See http://factorcode.org/license.txt for BSD license.
USING: sequences grouping assocs kernel ascii unicode.case tr ;
USING: sequences grouping assocs kernel ascii ascii tr ;
IN: soundex
TR: soundex-tr

View File

@ -1,5 +1,5 @@
IN: tr.tests
USING: tr tools.test unicode.case ;
USING: tr tools.test ascii ;
TR: tr-test ch>upper "ABC" "XYZ" ;

View File

@ -1,6 +1,6 @@
! Copyright (C) 2008 Slava Pestov.
! See http://factorcode.org/license.txt for BSD license.
USING: byte-arrays strings sequences sequences.private
USING: byte-arrays strings sequences sequences.private ascii
fry kernel words parser lexer assocs math math.order summary ;
IN: tr
@ -11,8 +11,6 @@ M: bad-tr summary
<PRIVATE
: ascii? ( ch -- ? ) 0 127 between? ; inline
: tr-nth ( n mapping -- ch ) nth-unsafe 127 bitand ; inline
: check-tr ( from to -- )

View File

@ -9,10 +9,6 @@ ARTICLE: "unicode.case" "Case mapping"
{ $subsection >lower }
{ $subsection >title }
{ $subsection >case-fold }
"There are analogous routines which operate on individual code points, but these should " { $emphasis "not be used" } " in general as they have slightly different behavior. In some cases, for example, they do not perform the case operation, as a single code point must expand to more than one."
{ $subsection ch>upper }
{ $subsection ch>lower }
{ $subsection ch>title }
"To test if a string is in a given case:"
{ $subsection upper? }
{ $subsection lower? }
@ -53,18 +49,3 @@ HELP: title?
HELP: case-fold?
{ $values { "string" string } { "?" "a boolean" } }
{ $description "Tests if a string is in case-folded form." } ;
HELP: ch>lower
{ $values { "ch" "a code point" } { "lower" "a code point" } }
{ $description "Converts a code point to lower case." }
{ $warning "Don't use this unless you know what you're doing! " { $code ">lower" } " is not the same as " { $code "[ ch>lower ] map" } "." } ;
HELP: ch>upper
{ $values { "ch" "a code point" } { "upper" "a code point" } }
{ $description "Converts a code point to upper case." }
{ $warning "Don't use this unless you know what you're doing! " { $code ">upper" } " is not the same as " { $code "[ ch>upper ] map" } "." } ;
HELP: ch>title
{ $values { "ch" "a code point" } { "title" "a code point" } }
{ $description "Converts a code point to title case." }
{ $warning "Don't use this unless you know what you're doing! " { $code ">title" } " is not the same as " { $code "[ ch>title ] map" } "." } ;

View File

@ -7,11 +7,11 @@ IN: unicode.case
<PRIVATE
: at-default ( key assoc -- value/key ) [ at ] [ drop ] 2bi or ;
PRIVATE>
: ch>lower ( ch -- lower ) simple-lower at-default ;
: ch>upper ( ch -- upper ) simple-upper at-default ;
: ch>title ( ch -- title ) simple-title at-default ;
PRIVATE>
SYMBOL: locale ! Just casing locale, or overall?

View File

@ -128,12 +128,9 @@ VALUE: properties
cat categories index char table ?set-nth
] assoc-each table fill-ranges ] ;
: ascii-lower ( string -- lower )
[ dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ] map ;
: process-names ( data -- names-hash )
1 swap (process-data) [
ascii-lower { { CHAR: \s CHAR: - } } substitute swap
>lower { { CHAR: \s CHAR: - } } substitute swap
] H{ } assoc-map-as ;
: multihex ( hexstring -- string )

View File

@ -1,7 +1,8 @@
! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: sequences namespaces make unicode.data kernel math arrays
locals sorting.insertion accessors assocs math.order combinators ;
locals sorting.insertion accessors assocs math.order combinators
unicode.syntax ;
IN: unicode.normalize
<PRIVATE

View File

@ -5,7 +5,7 @@ USING: kernel namespaces make xmode.rules xmode.tokens
xmode.marker.state xmode.marker.context xmode.utilities
xmode.catalog sequences math assocs combinators strings
parser-combinators.regexp splitting parser-combinators ascii
unicode.case combinators.short-circuit accessors ;
ascii combinators.short-circuit accessors ;
! Based on org.gjt.sp.jedit.syntax.TokenMarker

View File

@ -3,7 +3,7 @@
USING: io io.files io.files.temp io.streams.duplex kernel
sequences sequences.private strings vectors words memoize
splitting grouping hints tr continuations io.encodings.ascii
unicode.case ;
ascii ;
IN: benchmark.reverse-complement
TR: trans-map ch>upper "ACGTUMRYKVHDB" "TGCAAKYRMBDHV" ;

View File

@ -2,7 +2,7 @@ USING: arrays combinators kernel lists math math.parser
namespaces parser lexer parser-combinators
parser-combinators.simple promises quotations sequences strings
math.order assocs prettyprint.backend prettyprint.custom memoize
unicode.case unicode.categories combinators.short-circuit
ascii unicode.categories combinators.short-circuit
accessors make io ;
IN: parser-combinators.regexp