Privatizing unicode.case:ch>{lower,upper,title}
parent
0aec786359
commit
ad53cb8635
|
@ -37,6 +37,26 @@ HELP: quotable?
|
||||||
{ $values { "ch" "a character" } { "?" "a boolean" } }
|
{ $values { "ch" "a character" } { "?" "a boolean" } }
|
||||||
{ $description "Tests for characters which may appear in a Factor string literal without escaping." } ;
|
{ $description "Tests for characters which may appear in a Factor string literal without escaping." } ;
|
||||||
|
|
||||||
|
HELP: ascii?
|
||||||
|
{ $values { "ch" "a character" } { "?" "a boolean" } }
|
||||||
|
{ $description "Tests for whether a number is an ASCII character." } ;
|
||||||
|
|
||||||
|
HELP: ch>lower
|
||||||
|
{ $values { "ch" "a character" } { "lower" "a character" } }
|
||||||
|
{ $description "Converts an ASCII character to lower case." } ;
|
||||||
|
|
||||||
|
HELP: ch>upper
|
||||||
|
{ $values { "ch" "a character" } { "upper" "a character" } }
|
||||||
|
{ $description "Converts an ASCII character to upper case." } ;
|
||||||
|
|
||||||
|
HELP: >lower
|
||||||
|
{ $values { "str" "a string" } { "lower" "a string" } }
|
||||||
|
{ $description "Converts an ASCII string to lower case." } ;
|
||||||
|
|
||||||
|
HELP: >upper
|
||||||
|
{ $values { "str" "a string" } { "upper" "a string" } }
|
||||||
|
{ $description "Converts an ASCII string to upper case." } ;
|
||||||
|
|
||||||
ARTICLE: "ascii" "ASCII character classes"
|
ARTICLE: "ascii" "ASCII character classes"
|
||||||
"The " { $vocab-link "ascii" } " vocabulary implements traditional ASCII character classes:"
|
"The " { $vocab-link "ascii" } " vocabulary implements traditional ASCII character classes:"
|
||||||
{ $subsection blank? }
|
{ $subsection blank? }
|
||||||
|
@ -46,6 +66,12 @@ ARTICLE: "ascii" "ASCII character classes"
|
||||||
{ $subsection printable? }
|
{ $subsection printable? }
|
||||||
{ $subsection control? }
|
{ $subsection control? }
|
||||||
{ $subsection quotable? }
|
{ $subsection quotable? }
|
||||||
"Modern applications should use Unicode 5.0 instead (" { $vocab-link "unicode.categories" } ")." ;
|
{ $subsection ascii? }
|
||||||
|
"ASCII case conversion is also implemented:"
|
||||||
|
{ $subsection ch>lower }
|
||||||
|
{ $subsection ch>upper }
|
||||||
|
{ $subsection >lower }
|
||||||
|
{ $subsection >upper }
|
||||||
|
"Modern applications should use Unicode 5.1 instead (" { $vocab-link "unicode.categories" } ")." ;
|
||||||
|
|
||||||
ABOUT: "ascii"
|
ABOUT: "ascii"
|
||||||
|
|
|
@ -12,3 +12,8 @@ IN: ascii.tests
|
||||||
0 "There are Four Upper Case characters"
|
0 "There are Four Upper Case characters"
|
||||||
[ LETTER? [ 1+ ] when ] each
|
[ LETTER? [ 1+ ] when ] each
|
||||||
] unit-test
|
] unit-test
|
||||||
|
|
||||||
|
[ t f ] [ CHAR: \s ascii? 400 ascii? ] unit-test
|
||||||
|
|
||||||
|
[ "HELLO HOW ARE YOU?" ] [ "hellO hOw arE YOU?" >upper ] unit-test
|
||||||
|
[ "i'm good thx bai" ] [ "I'm Good THX bai" >lower ] unit-test
|
||||||
|
|
|
@ -4,6 +4,8 @@ USING: kernel math math.order sequences
|
||||||
combinators.short-circuit ;
|
combinators.short-circuit ;
|
||||||
IN: ascii
|
IN: ascii
|
||||||
|
|
||||||
|
: ascii? ( ch -- ? ) 0 127 between? ; inline
|
||||||
|
|
||||||
: blank? ( ch -- ? ) " \t\n\r" member? ; inline
|
: blank? ( ch -- ? ) " \t\n\r" member? ; inline
|
||||||
|
|
||||||
: letter? ( ch -- ? ) CHAR: a CHAR: z between? ; inline
|
: letter? ( ch -- ? ) CHAR: a CHAR: z between? ; inline
|
||||||
|
@ -25,3 +27,15 @@ IN: ascii
|
||||||
|
|
||||||
: alpha? ( ch -- ? )
|
: alpha? ( ch -- ? )
|
||||||
[ [ Letter? ] [ digit? ] ] 1|| ;
|
[ [ Letter? ] [ digit? ] ] 1|| ;
|
||||||
|
|
||||||
|
: ch>lower ( ch -- lower )
|
||||||
|
dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ;
|
||||||
|
|
||||||
|
: >lower ( str -- lower )
|
||||||
|
[ ch>lower ] map ;
|
||||||
|
|
||||||
|
: ch>upper ( ch -- upper )
|
||||||
|
dup CHAR: a CHAR: z between? [ HEX: 20 - ] when ;
|
||||||
|
|
||||||
|
: >upper ( str -- upper )
|
||||||
|
[ ch>upper ] map ;
|
||||||
|
|
|
@ -3,7 +3,10 @@
|
||||||
USING: accessors arrays assocs grouping kernel regexp.backend
|
USING: accessors arrays assocs grouping kernel regexp.backend
|
||||||
locals math namespaces regexp.parser sequences fry quotations
|
locals math namespaces regexp.parser sequences fry quotations
|
||||||
math.order math.ranges vectors unicode.categories regexp.utils
|
math.order math.ranges vectors unicode.categories regexp.utils
|
||||||
regexp.transition-tables words sets regexp.classes unicode.case ;
|
regexp.transition-tables words sets regexp.classes unicode.case.private ;
|
||||||
|
! This uses unicode.case.private for ch>upper and ch>lower
|
||||||
|
! but case-insensitive matching should be done by case-folding everything
|
||||||
|
! before processing starts
|
||||||
IN: regexp.nfa
|
IN: regexp.nfa
|
||||||
|
|
||||||
SYMBOL: negation-mode
|
SYMBOL: negation-mode
|
||||||
|
@ -160,6 +163,8 @@ M: LETTER-class nfa-node ( node -- )
|
||||||
|
|
||||||
M: character-class-range nfa-node ( node -- )
|
M: character-class-range nfa-node ( node -- )
|
||||||
case-insensitive option? [
|
case-insensitive option? [
|
||||||
|
! This should be implemented for Unicode by case-folding
|
||||||
|
! the input and all strings in the regexp.
|
||||||
dup [ from>> ] [ to>> ] bi
|
dup [ from>> ] [ to>> ] bi
|
||||||
2dup [ Letter? ] bi@ and [
|
2dup [ Letter? ] bi@ and [
|
||||||
rot drop
|
rot drop
|
||||||
|
|
|
@ -3,8 +3,8 @@
|
||||||
USING: accessors arrays assocs combinators io io.streams.string
|
USING: accessors arrays assocs combinators io io.streams.string
|
||||||
kernel math math.parser namespaces sets
|
kernel math math.parser namespaces sets
|
||||||
quotations sequences splitting vectors math.order
|
quotations sequences splitting vectors math.order
|
||||||
unicode.categories strings regexp.backend regexp.utils
|
strings regexp.backend regexp.utils
|
||||||
unicode.case words locals regexp.classes ;
|
unicode.case unicode.categories words locals regexp.classes ;
|
||||||
IN: regexp.parser
|
IN: regexp.parser
|
||||||
|
|
||||||
FROM: math.ranges => [a,b] ;
|
FROM: math.ranges => [a,b] ;
|
||||||
|
@ -261,7 +261,7 @@ ERROR: bad-escaped-literals seq ;
|
||||||
parse-til-E
|
parse-til-E
|
||||||
drop1
|
drop1
|
||||||
[ epsilon ] [
|
[ epsilon ] [
|
||||||
[ quot call <constant> ] V{ } map-as
|
quot call [ <constant> ] V{ } map-as
|
||||||
first|concatenation
|
first|concatenation
|
||||||
] if-empty ; inline
|
] if-empty ; inline
|
||||||
|
|
||||||
|
@ -269,10 +269,10 @@ ERROR: bad-escaped-literals seq ;
|
||||||
[ ] (parse-escaped-literals) ;
|
[ ] (parse-escaped-literals) ;
|
||||||
|
|
||||||
: lower-case-literals ( -- obj )
|
: lower-case-literals ( -- obj )
|
||||||
[ ch>lower ] (parse-escaped-literals) ;
|
[ >lower ] (parse-escaped-literals) ;
|
||||||
|
|
||||||
: upper-case-literals ( -- obj )
|
: upper-case-literals ( -- obj )
|
||||||
[ ch>upper ] (parse-escaped-literals) ;
|
[ >upper ] (parse-escaped-literals) ;
|
||||||
|
|
||||||
: parse-escaped ( -- obj )
|
: parse-escaped ( -- obj )
|
||||||
read1
|
read1
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
! Copyright (C) 2008 Slava Pestov.
|
! Copyright (C) 2008 Slava Pestov.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: sequences grouping assocs kernel ascii unicode.case tr ;
|
USING: sequences grouping assocs kernel ascii ascii tr ;
|
||||||
IN: soundex
|
IN: soundex
|
||||||
|
|
||||||
TR: soundex-tr
|
TR: soundex-tr
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
IN: tr.tests
|
IN: tr.tests
|
||||||
USING: tr tools.test unicode.case ;
|
USING: tr tools.test ascii ;
|
||||||
|
|
||||||
TR: tr-test ch>upper "ABC" "XYZ" ;
|
TR: tr-test ch>upper "ABC" "XYZ" ;
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
! Copyright (C) 2008 Slava Pestov.
|
! Copyright (C) 2008 Slava Pestov.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: byte-arrays strings sequences sequences.private
|
USING: byte-arrays strings sequences sequences.private ascii
|
||||||
fry kernel words parser lexer assocs math math.order summary ;
|
fry kernel words parser lexer assocs math math.order summary ;
|
||||||
IN: tr
|
IN: tr
|
||||||
|
|
||||||
|
@ -11,8 +11,6 @@ M: bad-tr summary
|
||||||
|
|
||||||
<PRIVATE
|
<PRIVATE
|
||||||
|
|
||||||
: ascii? ( ch -- ? ) 0 127 between? ; inline
|
|
||||||
|
|
||||||
: tr-nth ( n mapping -- ch ) nth-unsafe 127 bitand ; inline
|
: tr-nth ( n mapping -- ch ) nth-unsafe 127 bitand ; inline
|
||||||
|
|
||||||
: check-tr ( from to -- )
|
: check-tr ( from to -- )
|
||||||
|
|
|
@ -9,10 +9,6 @@ ARTICLE: "unicode.case" "Case mapping"
|
||||||
{ $subsection >lower }
|
{ $subsection >lower }
|
||||||
{ $subsection >title }
|
{ $subsection >title }
|
||||||
{ $subsection >case-fold }
|
{ $subsection >case-fold }
|
||||||
"There are analogous routines which operate on individual code points, but these should " { $emphasis "not be used" } " in general as they have slightly different behavior. In some cases, for example, they do not perform the case operation, as a single code point must expand to more than one."
|
|
||||||
{ $subsection ch>upper }
|
|
||||||
{ $subsection ch>lower }
|
|
||||||
{ $subsection ch>title }
|
|
||||||
"To test if a string is in a given case:"
|
"To test if a string is in a given case:"
|
||||||
{ $subsection upper? }
|
{ $subsection upper? }
|
||||||
{ $subsection lower? }
|
{ $subsection lower? }
|
||||||
|
@ -53,18 +49,3 @@ HELP: title?
|
||||||
HELP: case-fold?
|
HELP: case-fold?
|
||||||
{ $values { "string" string } { "?" "a boolean" } }
|
{ $values { "string" string } { "?" "a boolean" } }
|
||||||
{ $description "Tests if a string is in case-folded form." } ;
|
{ $description "Tests if a string is in case-folded form." } ;
|
||||||
|
|
||||||
HELP: ch>lower
|
|
||||||
{ $values { "ch" "a code point" } { "lower" "a code point" } }
|
|
||||||
{ $description "Converts a code point to lower case." }
|
|
||||||
{ $warning "Don't use this unless you know what you're doing! " { $code ">lower" } " is not the same as " { $code "[ ch>lower ] map" } "." } ;
|
|
||||||
|
|
||||||
HELP: ch>upper
|
|
||||||
{ $values { "ch" "a code point" } { "upper" "a code point" } }
|
|
||||||
{ $description "Converts a code point to upper case." }
|
|
||||||
{ $warning "Don't use this unless you know what you're doing! " { $code ">upper" } " is not the same as " { $code "[ ch>upper ] map" } "." } ;
|
|
||||||
|
|
||||||
HELP: ch>title
|
|
||||||
{ $values { "ch" "a code point" } { "title" "a code point" } }
|
|
||||||
{ $description "Converts a code point to title case." }
|
|
||||||
{ $warning "Don't use this unless you know what you're doing! " { $code ">title" } " is not the same as " { $code "[ ch>title ] map" } "." } ;
|
|
||||||
|
|
|
@ -7,11 +7,11 @@ IN: unicode.case
|
||||||
|
|
||||||
<PRIVATE
|
<PRIVATE
|
||||||
: at-default ( key assoc -- value/key ) [ at ] [ drop ] 2bi or ;
|
: at-default ( key assoc -- value/key ) [ at ] [ drop ] 2bi or ;
|
||||||
PRIVATE>
|
|
||||||
|
|
||||||
: ch>lower ( ch -- lower ) simple-lower at-default ;
|
: ch>lower ( ch -- lower ) simple-lower at-default ;
|
||||||
: ch>upper ( ch -- upper ) simple-upper at-default ;
|
: ch>upper ( ch -- upper ) simple-upper at-default ;
|
||||||
: ch>title ( ch -- title ) simple-title at-default ;
|
: ch>title ( ch -- title ) simple-title at-default ;
|
||||||
|
PRIVATE>
|
||||||
|
|
||||||
SYMBOL: locale ! Just casing locale, or overall?
|
SYMBOL: locale ! Just casing locale, or overall?
|
||||||
|
|
||||||
|
|
|
@ -128,12 +128,9 @@ VALUE: properties
|
||||||
cat categories index char table ?set-nth
|
cat categories index char table ?set-nth
|
||||||
] assoc-each table fill-ranges ] ;
|
] assoc-each table fill-ranges ] ;
|
||||||
|
|
||||||
: ascii-lower ( string -- lower )
|
|
||||||
[ dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ] map ;
|
|
||||||
|
|
||||||
: process-names ( data -- names-hash )
|
: process-names ( data -- names-hash )
|
||||||
1 swap (process-data) [
|
1 swap (process-data) [
|
||||||
ascii-lower { { CHAR: \s CHAR: - } } substitute swap
|
>lower { { CHAR: \s CHAR: - } } substitute swap
|
||||||
] H{ } assoc-map-as ;
|
] H{ } assoc-map-as ;
|
||||||
|
|
||||||
: multihex ( hexstring -- string )
|
: multihex ( hexstring -- string )
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
! Copyright (C) 2008 Daniel Ehrenberg.
|
! Copyright (C) 2008 Daniel Ehrenberg.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: sequences namespaces make unicode.data kernel math arrays
|
USING: sequences namespaces make unicode.data kernel math arrays
|
||||||
locals sorting.insertion accessors assocs math.order combinators ;
|
locals sorting.insertion accessors assocs math.order combinators
|
||||||
|
unicode.syntax ;
|
||||||
IN: unicode.normalize
|
IN: unicode.normalize
|
||||||
|
|
||||||
<PRIVATE
|
<PRIVATE
|
||||||
|
|
|
@ -5,7 +5,7 @@ USING: kernel namespaces make xmode.rules xmode.tokens
|
||||||
xmode.marker.state xmode.marker.context xmode.utilities
|
xmode.marker.state xmode.marker.context xmode.utilities
|
||||||
xmode.catalog sequences math assocs combinators strings
|
xmode.catalog sequences math assocs combinators strings
|
||||||
parser-combinators.regexp splitting parser-combinators ascii
|
parser-combinators.regexp splitting parser-combinators ascii
|
||||||
unicode.case combinators.short-circuit accessors ;
|
ascii combinators.short-circuit accessors ;
|
||||||
|
|
||||||
! Based on org.gjt.sp.jedit.syntax.TokenMarker
|
! Based on org.gjt.sp.jedit.syntax.TokenMarker
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
USING: io io.files io.files.temp io.streams.duplex kernel
|
USING: io io.files io.files.temp io.streams.duplex kernel
|
||||||
sequences sequences.private strings vectors words memoize
|
sequences sequences.private strings vectors words memoize
|
||||||
splitting grouping hints tr continuations io.encodings.ascii
|
splitting grouping hints tr continuations io.encodings.ascii
|
||||||
unicode.case ;
|
ascii ;
|
||||||
IN: benchmark.reverse-complement
|
IN: benchmark.reverse-complement
|
||||||
|
|
||||||
TR: trans-map ch>upper "ACGTUMRYKVHDB" "TGCAAKYRMBDHV" ;
|
TR: trans-map ch>upper "ACGTUMRYKVHDB" "TGCAAKYRMBDHV" ;
|
||||||
|
|
|
@ -2,7 +2,7 @@ USING: arrays combinators kernel lists math math.parser
|
||||||
namespaces parser lexer parser-combinators
|
namespaces parser lexer parser-combinators
|
||||||
parser-combinators.simple promises quotations sequences strings
|
parser-combinators.simple promises quotations sequences strings
|
||||||
math.order assocs prettyprint.backend prettyprint.custom memoize
|
math.order assocs prettyprint.backend prettyprint.custom memoize
|
||||||
unicode.case unicode.categories combinators.short-circuit
|
ascii unicode.categories combinators.short-circuit
|
||||||
accessors make io ;
|
accessors make io ;
|
||||||
IN: parser-combinators.regexp
|
IN: parser-combinators.regexp
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue