Privatizing unicode.case:ch>{lower,upper,title}

db4
Daniel Ehrenberg 2009-01-08 19:07:46 -06:00
parent 0aec786359
commit ad53cb8635
15 changed files with 67 additions and 40 deletions

View File

@ -37,6 +37,26 @@ HELP: quotable?
{ $values { "ch" "a character" } { "?" "a boolean" } } { $values { "ch" "a character" } { "?" "a boolean" } }
{ $description "Tests for characters which may appear in a Factor string literal without escaping." } ; { $description "Tests for characters which may appear in a Factor string literal without escaping." } ;
HELP: ascii?
{ $values { "ch" "a character" } { "?" "a boolean" } }
{ $description "Tests for whether a number is an ASCII character." } ;
HELP: ch>lower
{ $values { "ch" "a character" } { "lower" "a character" } }
{ $description "Converts an ASCII character to lower case." } ;
HELP: ch>upper
{ $values { "ch" "a character" } { "upper" "a character" } }
{ $description "Converts an ASCII character to upper case." } ;
HELP: >lower
{ $values { "str" "a string" } { "lower" "a string" } }
{ $description "Converts an ASCII string to lower case." } ;
HELP: >upper
{ $values { "str" "a string" } { "upper" "a string" } }
{ $description "Converts an ASCII string to upper case." } ;
ARTICLE: "ascii" "ASCII character classes" ARTICLE: "ascii" "ASCII character classes"
"The " { $vocab-link "ascii" } " vocabulary implements traditional ASCII character classes:" "The " { $vocab-link "ascii" } " vocabulary implements traditional ASCII character classes:"
{ $subsection blank? } { $subsection blank? }
@ -46,6 +66,12 @@ ARTICLE: "ascii" "ASCII character classes"
{ $subsection printable? } { $subsection printable? }
{ $subsection control? } { $subsection control? }
{ $subsection quotable? } { $subsection quotable? }
"Modern applications should use Unicode 5.0 instead (" { $vocab-link "unicode.categories" } ")." ; { $subsection ascii? }
"ASCII case conversion is also implemented:"
{ $subsection ch>lower }
{ $subsection ch>upper }
{ $subsection >lower }
{ $subsection >upper }
"Modern applications should use Unicode 5.1 instead (" { $vocab-link "unicode.categories" } ")." ;
ABOUT: "ascii" ABOUT: "ascii"

View File

@ -12,3 +12,8 @@ IN: ascii.tests
0 "There are Four Upper Case characters" 0 "There are Four Upper Case characters"
[ LETTER? [ 1+ ] when ] each [ LETTER? [ 1+ ] when ] each
] unit-test ] unit-test
[ t f ] [ CHAR: \s ascii? 400 ascii? ] unit-test
[ "HELLO HOW ARE YOU?" ] [ "hellO hOw arE YOU?" >upper ] unit-test
[ "i'm good thx bai" ] [ "I'm Good THX bai" >lower ] unit-test

View File

@ -4,6 +4,8 @@ USING: kernel math math.order sequences
combinators.short-circuit ; combinators.short-circuit ;
IN: ascii IN: ascii
: ascii? ( ch -- ? ) 0 127 between? ; inline
: blank? ( ch -- ? ) " \t\n\r" member? ; inline : blank? ( ch -- ? ) " \t\n\r" member? ; inline
: letter? ( ch -- ? ) CHAR: a CHAR: z between? ; inline : letter? ( ch -- ? ) CHAR: a CHAR: z between? ; inline
@ -25,3 +27,15 @@ IN: ascii
: alpha? ( ch -- ? ) : alpha? ( ch -- ? )
[ [ Letter? ] [ digit? ] ] 1|| ; [ [ Letter? ] [ digit? ] ] 1|| ;
: ch>lower ( ch -- lower )
dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ;
: >lower ( str -- lower )
[ ch>lower ] map ;
: ch>upper ( ch -- upper )
dup CHAR: a CHAR: z between? [ HEX: 20 - ] when ;
: >upper ( str -- upper )
[ ch>upper ] map ;

View File

@ -3,7 +3,10 @@
USING: accessors arrays assocs grouping kernel regexp.backend USING: accessors arrays assocs grouping kernel regexp.backend
locals math namespaces regexp.parser sequences fry quotations locals math namespaces regexp.parser sequences fry quotations
math.order math.ranges vectors unicode.categories regexp.utils math.order math.ranges vectors unicode.categories regexp.utils
regexp.transition-tables words sets regexp.classes unicode.case ; regexp.transition-tables words sets regexp.classes unicode.case.private ;
! This uses unicode.case.private for ch>upper and ch>lower
! but case-insensitive matching should be done by case-folding everything
! before processing starts
IN: regexp.nfa IN: regexp.nfa
SYMBOL: negation-mode SYMBOL: negation-mode
@ -160,6 +163,8 @@ M: LETTER-class nfa-node ( node -- )
M: character-class-range nfa-node ( node -- ) M: character-class-range nfa-node ( node -- )
case-insensitive option? [ case-insensitive option? [
! This should be implemented for Unicode by case-folding
! the input and all strings in the regexp.
dup [ from>> ] [ to>> ] bi dup [ from>> ] [ to>> ] bi
2dup [ Letter? ] bi@ and [ 2dup [ Letter? ] bi@ and [
rot drop rot drop

View File

@ -3,8 +3,8 @@
USING: accessors arrays assocs combinators io io.streams.string USING: accessors arrays assocs combinators io io.streams.string
kernel math math.parser namespaces sets kernel math math.parser namespaces sets
quotations sequences splitting vectors math.order quotations sequences splitting vectors math.order
unicode.categories strings regexp.backend regexp.utils strings regexp.backend regexp.utils
unicode.case words locals regexp.classes ; unicode.case unicode.categories words locals regexp.classes ;
IN: regexp.parser IN: regexp.parser
FROM: math.ranges => [a,b] ; FROM: math.ranges => [a,b] ;
@ -261,7 +261,7 @@ ERROR: bad-escaped-literals seq ;
parse-til-E parse-til-E
drop1 drop1
[ epsilon ] [ [ epsilon ] [
[ quot call <constant> ] V{ } map-as quot call [ <constant> ] V{ } map-as
first|concatenation first|concatenation
] if-empty ; inline ] if-empty ; inline
@ -269,10 +269,10 @@ ERROR: bad-escaped-literals seq ;
[ ] (parse-escaped-literals) ; [ ] (parse-escaped-literals) ;
: lower-case-literals ( -- obj ) : lower-case-literals ( -- obj )
[ ch>lower ] (parse-escaped-literals) ; [ >lower ] (parse-escaped-literals) ;
: upper-case-literals ( -- obj ) : upper-case-literals ( -- obj )
[ ch>upper ] (parse-escaped-literals) ; [ >upper ] (parse-escaped-literals) ;
: parse-escaped ( -- obj ) : parse-escaped ( -- obj )
read1 read1

View File

@ -1,6 +1,6 @@
! Copyright (C) 2008 Slava Pestov. ! Copyright (C) 2008 Slava Pestov.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: sequences grouping assocs kernel ascii unicode.case tr ; USING: sequences grouping assocs kernel ascii ascii tr ;
IN: soundex IN: soundex
TR: soundex-tr TR: soundex-tr

View File

@ -1,5 +1,5 @@
IN: tr.tests IN: tr.tests
USING: tr tools.test unicode.case ; USING: tr tools.test ascii ;
TR: tr-test ch>upper "ABC" "XYZ" ; TR: tr-test ch>upper "ABC" "XYZ" ;

View File

@ -1,6 +1,6 @@
! Copyright (C) 2008 Slava Pestov. ! Copyright (C) 2008 Slava Pestov.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: byte-arrays strings sequences sequences.private USING: byte-arrays strings sequences sequences.private ascii
fry kernel words parser lexer assocs math math.order summary ; fry kernel words parser lexer assocs math math.order summary ;
IN: tr IN: tr
@ -11,8 +11,6 @@ M: bad-tr summary
<PRIVATE <PRIVATE
: ascii? ( ch -- ? ) 0 127 between? ; inline
: tr-nth ( n mapping -- ch ) nth-unsafe 127 bitand ; inline : tr-nth ( n mapping -- ch ) nth-unsafe 127 bitand ; inline
: check-tr ( from to -- ) : check-tr ( from to -- )

View File

@ -9,10 +9,6 @@ ARTICLE: "unicode.case" "Case mapping"
{ $subsection >lower } { $subsection >lower }
{ $subsection >title } { $subsection >title }
{ $subsection >case-fold } { $subsection >case-fold }
"There are analogous routines which operate on individual code points, but these should " { $emphasis "not be used" } " in general as they have slightly different behavior. In some cases, for example, they do not perform the case operation, as a single code point must expand to more than one."
{ $subsection ch>upper }
{ $subsection ch>lower }
{ $subsection ch>title }
"To test if a string is in a given case:" "To test if a string is in a given case:"
{ $subsection upper? } { $subsection upper? }
{ $subsection lower? } { $subsection lower? }
@ -53,18 +49,3 @@ HELP: title?
HELP: case-fold? HELP: case-fold?
{ $values { "string" string } { "?" "a boolean" } } { $values { "string" string } { "?" "a boolean" } }
{ $description "Tests if a string is in case-folded form." } ; { $description "Tests if a string is in case-folded form." } ;
HELP: ch>lower
{ $values { "ch" "a code point" } { "lower" "a code point" } }
{ $description "Converts a code point to lower case." }
{ $warning "Don't use this unless you know what you're doing! " { $code ">lower" } " is not the same as " { $code "[ ch>lower ] map" } "." } ;
HELP: ch>upper
{ $values { "ch" "a code point" } { "upper" "a code point" } }
{ $description "Converts a code point to upper case." }
{ $warning "Don't use this unless you know what you're doing! " { $code ">upper" } " is not the same as " { $code "[ ch>upper ] map" } "." } ;
HELP: ch>title
{ $values { "ch" "a code point" } { "title" "a code point" } }
{ $description "Converts a code point to title case." }
{ $warning "Don't use this unless you know what you're doing! " { $code ">title" } " is not the same as " { $code "[ ch>title ] map" } "." } ;

View File

@ -7,11 +7,11 @@ IN: unicode.case
<PRIVATE <PRIVATE
: at-default ( key assoc -- value/key ) [ at ] [ drop ] 2bi or ; : at-default ( key assoc -- value/key ) [ at ] [ drop ] 2bi or ;
PRIVATE>
: ch>lower ( ch -- lower ) simple-lower at-default ; : ch>lower ( ch -- lower ) simple-lower at-default ;
: ch>upper ( ch -- upper ) simple-upper at-default ; : ch>upper ( ch -- upper ) simple-upper at-default ;
: ch>title ( ch -- title ) simple-title at-default ; : ch>title ( ch -- title ) simple-title at-default ;
PRIVATE>
SYMBOL: locale ! Just casing locale, or overall? SYMBOL: locale ! Just casing locale, or overall?

View File

@ -128,12 +128,9 @@ VALUE: properties
cat categories index char table ?set-nth cat categories index char table ?set-nth
] assoc-each table fill-ranges ] ; ] assoc-each table fill-ranges ] ;
: ascii-lower ( string -- lower )
[ dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ] map ;
: process-names ( data -- names-hash ) : process-names ( data -- names-hash )
1 swap (process-data) [ 1 swap (process-data) [
ascii-lower { { CHAR: \s CHAR: - } } substitute swap >lower { { CHAR: \s CHAR: - } } substitute swap
] H{ } assoc-map-as ; ] H{ } assoc-map-as ;
: multihex ( hexstring -- string ) : multihex ( hexstring -- string )

View File

@ -1,7 +1,8 @@
! Copyright (C) 2008 Daniel Ehrenberg. ! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: sequences namespaces make unicode.data kernel math arrays USING: sequences namespaces make unicode.data kernel math arrays
locals sorting.insertion accessors assocs math.order combinators ; locals sorting.insertion accessors assocs math.order combinators
unicode.syntax ;
IN: unicode.normalize IN: unicode.normalize
<PRIVATE <PRIVATE

View File

@ -5,7 +5,7 @@ USING: kernel namespaces make xmode.rules xmode.tokens
xmode.marker.state xmode.marker.context xmode.utilities xmode.marker.state xmode.marker.context xmode.utilities
xmode.catalog sequences math assocs combinators strings xmode.catalog sequences math assocs combinators strings
parser-combinators.regexp splitting parser-combinators ascii parser-combinators.regexp splitting parser-combinators ascii
unicode.case combinators.short-circuit accessors ; ascii combinators.short-circuit accessors ;
! Based on org.gjt.sp.jedit.syntax.TokenMarker ! Based on org.gjt.sp.jedit.syntax.TokenMarker

View File

@ -3,7 +3,7 @@
USING: io io.files io.files.temp io.streams.duplex kernel USING: io io.files io.files.temp io.streams.duplex kernel
sequences sequences.private strings vectors words memoize sequences sequences.private strings vectors words memoize
splitting grouping hints tr continuations io.encodings.ascii splitting grouping hints tr continuations io.encodings.ascii
unicode.case ; ascii ;
IN: benchmark.reverse-complement IN: benchmark.reverse-complement
TR: trans-map ch>upper "ACGTUMRYKVHDB" "TGCAAKYRMBDHV" ; TR: trans-map ch>upper "ACGTUMRYKVHDB" "TGCAAKYRMBDHV" ;

View File

@ -2,7 +2,7 @@ USING: arrays combinators kernel lists math math.parser
namespaces parser lexer parser-combinators namespaces parser lexer parser-combinators
parser-combinators.simple promises quotations sequences strings parser-combinators.simple promises quotations sequences strings
math.order assocs prettyprint.backend prettyprint.custom memoize math.order assocs prettyprint.backend prettyprint.custom memoize
unicode.case unicode.categories combinators.short-circuit ascii unicode.categories combinators.short-circuit
accessors make io ; accessors make io ;
IN: parser-combinators.regexp IN: parser-combinators.regexp