Regexp supports Unicode properties (categories and script)
parent
c2e3b6ac5a
commit
9760f54857
|
@ -108,21 +108,24 @@ M: terminator-class class-member? ( obj class -- ? )
|
|||
|
||||
M: f class-member? 2drop f ;
|
||||
|
||||
: same? ( obj1 obj2 quot1: ( obj1 -- val1 ) quot2: ( obj2 -- val2 ) -- ? )
|
||||
bi* = ; inline
|
||||
|
||||
M: script-class class-member?
|
||||
[ script-of ] [ script>> ] bi* = ;
|
||||
[ script-of ] [ script>> ] same? ;
|
||||
|
||||
M: category-class class-member?
|
||||
[ category# ] [ category>> ] bi* = ;
|
||||
[ category ] [ category>> ] same? ;
|
||||
|
||||
M: category-range-class class-member?
|
||||
[ category first ] [ category>> ] bi* = ;
|
||||
[ category first ] [ category>> ] same? ;
|
||||
|
||||
TUPLE: not-class class ;
|
||||
|
||||
PREDICATE: not-integer < not-class class>> integer? ;
|
||||
|
||||
UNION: simple-class
|
||||
primitive-class range-class category-class category-range-class dot ;
|
||||
primitive-class range-class dot ;
|
||||
PREDICATE: not-simple < not-class class>> simple-class? ;
|
||||
|
||||
M: not-class class-member?
|
||||
|
|
|
@ -2,7 +2,8 @@
|
|||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: peg.ebnf kernel math.parser sequences assocs arrays fry math
|
||||
combinators regexp.classes strings splitting peg locals accessors
|
||||
regexp.ast unicode.case ;
|
||||
regexp.ast unicode.case unicode.script.private unicode.categories
|
||||
memoize interval-maps sets unicode.data combinators.short-circuit ;
|
||||
IN: regexp.parser
|
||||
|
||||
: allowed-char? ( ch -- ? )
|
||||
|
@ -18,15 +19,41 @@ ERROR: bad-number ;
|
|||
|
||||
ERROR: bad-class name ;
|
||||
|
||||
: simple ( str -- simple )
|
||||
! Alternatively, first collation key level?
|
||||
>case-fold [ " \t_" member? not ] filter ;
|
||||
|
||||
: simple-table ( seq -- table )
|
||||
[ [ simple ] keep ] H{ } map>assoc ;
|
||||
|
||||
MEMO: simple-script-table ( -- table )
|
||||
script-table interval-values prune simple-table ;
|
||||
|
||||
MEMO: simple-category-table ( -- table )
|
||||
categories simple-table ;
|
||||
|
||||
: parse-unicode-class ( name -- class )
|
||||
! Implement this!
|
||||
drop f ;
|
||||
{
|
||||
{ [ dup { [ length 1 = ] [ first "clmnpsz" member? ] } 1&& ] [
|
||||
>upper first
|
||||
<category-range-class>
|
||||
] }
|
||||
{ [ dup >title categories member? ] [
|
||||
simple-category-table at <category-class>
|
||||
] }
|
||||
{ [ "script=" ?head ] [
|
||||
dup simple-script-table at
|
||||
[ <script-class> ]
|
||||
[ "script=" prepend bad-class ] ?if
|
||||
] }
|
||||
[ bad-class ]
|
||||
} cond ;
|
||||
|
||||
: unicode-class ( name -- class )
|
||||
dup parse-unicode-class [ ] [ bad-class ] ?if ;
|
||||
|
||||
: name>class ( name -- class )
|
||||
>string >case-fold {
|
||||
>string simple {
|
||||
{ "lower" letter-class }
|
||||
{ "upper" LETTER-class }
|
||||
{ "alpha" Letter-class }
|
||||
|
|
|
@ -72,8 +72,10 @@ ARTICLE: { "regexp" "syntax" } "Regular expression syntax"
|
|||
{ { $snippet "\\p{blank}" } "Non-newline whitespace" }
|
||||
{ { $snippet "\\p{cntrl}" } "Control character" }
|
||||
{ { $snippet "\\p{space}" } "Whitespace" }
|
||||
{ { $snippet "\\p{xdigit}" } "Hexidecimal digit" } } ! In the future: Unicode
|
||||
"Full unicode properties are not yet supported."
|
||||
{ { $snippet "\\p{xdigit}" } "Hexidecimal digit" }
|
||||
{ { $snippet "\\p{Nd}" } "Character in Unicode category Nd" }
|
||||
{ { $snippet "\\p{Z}" } "Character in Unicode category beginning with Z" }
|
||||
{ { $snippet "\\p{script=Cham}" } "Character in the Cham writing system" } }
|
||||
{ $heading "Boundaries" }
|
||||
"Special operators exist to match certain points in the string. These are called 'zero-width' because they do not consume any characters."
|
||||
{ $table
|
||||
|
|
|
@ -480,3 +480,31 @@ IN: regexp-tests
|
|||
[ f ] [ "a\r" R/ a$./mds matches? ] unit-test
|
||||
[ t ] [ "a\n" R/ a$./ms matches? ] unit-test
|
||||
[ t ] [ "a\n" R/ a$./mds matches? ] unit-test
|
||||
|
||||
! Unicode categories
|
||||
[ t ] [ "a" R/ \p{L}/ matches? ] unit-test
|
||||
[ t ] [ "A" R/ \p{L}/ matches? ] unit-test
|
||||
[ f ] [ " " R/ \p{L}/ matches? ] unit-test
|
||||
[ f ] [ "a" R/ \P{L}/ matches? ] unit-test
|
||||
[ f ] [ "A" R/ \P{L}/ matches? ] unit-test
|
||||
[ t ] [ " " R/ \P{L}/ matches? ] unit-test
|
||||
|
||||
[ t ] [ "a" R/ \p{Ll}/ matches? ] unit-test
|
||||
[ f ] [ "A" R/ \p{Ll}/ matches? ] unit-test
|
||||
[ f ] [ " " R/ \p{Ll}/ matches? ] unit-test
|
||||
[ f ] [ "a" R/ \P{Ll}/ matches? ] unit-test
|
||||
[ t ] [ "A" R/ \P{Ll}/ matches? ] unit-test
|
||||
[ t ] [ " " R/ \P{Ll}/ matches? ] unit-test
|
||||
|
||||
[ t ] [ "a" R/ \p{script=Latin}/ matches? ] unit-test
|
||||
[ f ] [ " " R/ \p{script=Latin}/ matches? ] unit-test
|
||||
[ f ] [ "a" R/ \P{script=Latin}/ matches? ] unit-test
|
||||
[ t ] [ " " R/ \P{script=Latin}/ matches? ] unit-test
|
||||
|
||||
! These should be case-insensitive
|
||||
[ f ] [ " " R/ \p{l}/ matches? ] unit-test
|
||||
[ f ] [ "a" R/ \P{l}/ matches? ] unit-test
|
||||
[ f ] [ "a" R/ \P{ll}/ matches? ] unit-test
|
||||
[ t ] [ " " R/ \P{LL}/ matches? ] unit-test
|
||||
[ f ] [ "a" R/ \P{sCriPt = latin}/ matches? ] unit-test
|
||||
[ t ] [ " " R/ \P{SCRIPT = laTIn}/ matches? ] unit-test
|
||||
|
|
Loading…
Reference in New Issue