pcre: add matches? and split verbs, now also handles tricky corner cases like zero-length matches
parent
2dd398b701
commit
e40a6cb927
|
@ -1,9 +1,9 @@
|
||||||
USING: pcre pcre.info sequences tools.test ;
|
USING: accessors pcre pcre.info sequences tools.test ;
|
||||||
|
|
||||||
[ { { 3 "day" } { 2 "month" } { 1 "year" } } ]
|
[ { { 3 "day" } { 2 "month" } { 1 "year" } } ]
|
||||||
[
|
[
|
||||||
"(?P<year>\\d{4})-(?P<month>\\d{2})-(?P<day>\\d{2})" <compiled-pcre>
|
"(?P<year>\\d{4})-(?P<month>\\d{2})-(?P<day>\\d{2})" <compiled-pcre>
|
||||||
name-table-entries
|
nametable>>
|
||||||
] unit-test
|
] unit-test
|
||||||
|
|
||||||
[ { 100 110 120 130 } ] [ 100 10 4 gen-array-addrs ] unit-test
|
[ { 100 110 120 130 } ] [ 100 10 4 gen-array-addrs ] unit-test
|
||||||
|
|
|
@ -1,12 +1,13 @@
|
||||||
USING:
|
USING:
|
||||||
accessors
|
accessors
|
||||||
arrays
|
arrays
|
||||||
|
assocs
|
||||||
|
http.client
|
||||||
kernel
|
kernel
|
||||||
math math.ranges
|
math math.ranges
|
||||||
pcre pcre.ffi pcre.info
|
pcre pcre.ffi pcre.info
|
||||||
random
|
random
|
||||||
sequences
|
sequences
|
||||||
splitting
|
|
||||||
system
|
system
|
||||||
tools.test ;
|
tools.test ;
|
||||||
IN: pcre.tests
|
IN: pcre.tests
|
||||||
|
@ -20,7 +21,7 @@ CONSTANT: iso-date "(?P<year>\\d{4})-(?P<month>\\d{2})-(?P<day>\\d{2})"
|
||||||
{ "year" "month" "day" } [ pcre_get_stringnumber ] with map
|
{ "year" "month" "day" } [ pcre_get_stringnumber ] with map
|
||||||
] unit-test
|
] unit-test
|
||||||
|
|
||||||
[ t ] [ "foo" <compiled-pcre> pcre>> options PCRE_UTF8 bitand 0 > ] unit-test
|
[ t ] [ "foo" <compiled-pcre> PCRE_UTF8 has-option? ] unit-test
|
||||||
|
|
||||||
os unix? [ [ 10 ] [ PCRE_CONFIG_NEWLINE config ] unit-test ] when
|
os unix? [ [ 10 ] [ PCRE_CONFIG_NEWLINE config ] unit-test ] when
|
||||||
|
|
||||||
|
@ -50,13 +51,57 @@ os unix? [ [ 10 ] [ PCRE_CONFIG_NEWLINE config ] unit-test ] when
|
||||||
|
|
||||||
[ 3 ] [ "foobar" "foo(?=bar)" findall first first second length ] unit-test
|
[ 3 ] [ "foobar" "foo(?=bar)" findall first first second length ] unit-test
|
||||||
|
|
||||||
|
[ { ", " ", " "." } ] [
|
||||||
|
"Words, words, word." "\\W+" findall [ first second ] map
|
||||||
|
] unit-test
|
||||||
|
|
||||||
: long-string ( -- x )
|
: long-string ( -- x )
|
||||||
10000 [ CHAR: a CHAR: z [a,b] random ] "" replicate-as ;
|
10000 [ CHAR: a CHAR: z [a,b] random ] "" replicate-as ;
|
||||||
|
|
||||||
! Performance
|
! Performance
|
||||||
[ 0 ] [ long-string ".{0,15}foobar.{0,10}" findall length ] unit-test
|
[ 0 ] [ long-string ".{0,15}foobar.{0,10}" findall length ] unit-test
|
||||||
|
|
||||||
|
! Empty matches, corner case behaviour is copied from pcredemo.c
|
||||||
|
[ { { { f "foo" } } { { f "" } } } ]
|
||||||
|
[ "foo" ".*" findall ] unit-test
|
||||||
|
|
||||||
|
[ { { { f "" } } { { f "" } } { { f "" } } { { f "" } } } ]
|
||||||
|
[ "foo" "B*" findall ] unit-test
|
||||||
|
|
||||||
! Tests for matches?
|
! Tests for matches?
|
||||||
[ t ] [ "örjan" "örjan" matches? ] unit-test
|
[ t ] [ "örjan" "örjan" matches? ] unit-test
|
||||||
|
|
||||||
[ t ] [ "abcö" "\\p{Ll}{4}" matches? ] unit-test
|
[ t ] [ "abcö" "\\p{Ll}{4}" matches? ] unit-test
|
||||||
|
|
||||||
|
! Dotall mode, off by default
|
||||||
|
[ f ] [ "." <compiled-pcre> PCRE_DOTALL has-option? ] unit-test
|
||||||
|
[ t ] [ "(?s)." <compiled-pcre> PCRE_DOTALL has-option? ] unit-test
|
||||||
|
|
||||||
|
[ f ] [ "\n" "." matches? ] unit-test
|
||||||
|
[ t ] [ "\n" "(?s)." matches? ] unit-test
|
||||||
|
|
||||||
|
! Caseless mode, off by default
|
||||||
|
[ { f t } ] [
|
||||||
|
{ "x" "(?i)x" } [ <compiled-pcre> PCRE_CASELESS has-option? ] map
|
||||||
|
] unit-test
|
||||||
|
|
||||||
|
! Backreferences
|
||||||
|
[ { t f } ] [
|
||||||
|
{ "response and responsibility" "sense and responsibility" }
|
||||||
|
[ "(sens|respons)e and \\1ibility" matches? ] map
|
||||||
|
] unit-test
|
||||||
|
|
||||||
|
[ { t t f } ] [
|
||||||
|
{ "rah rah" "RAH RAH" "RAH rah" } [ "((?i)rah)\\s+\\1" matches? ] map
|
||||||
|
] unit-test
|
||||||
|
|
||||||
|
! Splitting
|
||||||
|
[ { { "Words" "words" "word" } { "Words" "words" "word" } } ] [
|
||||||
|
"Words, words, word." { "\\W+" "[,. ]" } [ split ] with map
|
||||||
|
] unit-test
|
||||||
|
|
||||||
|
! Bigger tests
|
||||||
|
[ t ] [
|
||||||
|
"http://factorcode.org/" http-get nip
|
||||||
|
"href=\"(?P<link>[^\"]+)\"" findall [ "link" of ] map sequence?
|
||||||
|
] unit-test
|
||||||
|
|
|
@ -3,21 +3,27 @@ USING:
|
||||||
alien.c-types alien.data alien.strings
|
alien.c-types alien.data alien.strings
|
||||||
arrays
|
arrays
|
||||||
assocs
|
assocs
|
||||||
|
fry
|
||||||
grouping
|
grouping
|
||||||
io.encodings.utf8 io.encodings.string
|
io.encodings.utf8 io.encodings.string
|
||||||
kernel
|
kernel
|
||||||
locals
|
|
||||||
math
|
math
|
||||||
|
mirrors
|
||||||
pcre.ffi pcre.info
|
pcre.ffi pcre.info
|
||||||
sequences
|
sequences sequences.generalizations
|
||||||
|
sets.private
|
||||||
strings ;
|
strings ;
|
||||||
|
QUALIFIED: splitting
|
||||||
IN: pcre
|
IN: pcre
|
||||||
|
|
||||||
ERROR: malformed-regexp expr error ;
|
ERROR: malformed-regexp expr error ;
|
||||||
ERROR: pcre-error value ;
|
ERROR: pcre-error value ;
|
||||||
|
|
||||||
TUPLE: compiled-pcre pcre extra nametable ;
|
TUPLE: compiled-pcre pcre extra nametable ;
|
||||||
TUPLE: matcher subject compiled-pcre ofs match ;
|
|
||||||
|
! Gen. utility
|
||||||
|
: replace-all ( seq subseqs new -- seq )
|
||||||
|
swapd '[ _ splitting:replace ] reduce ;
|
||||||
|
|
||||||
: default-opts ( -- opts )
|
: default-opts ( -- opts )
|
||||||
PCRE_UTF8 PCRE_UCP bitor ;
|
PCRE_UTF8 PCRE_UCP bitor ;
|
||||||
|
@ -28,9 +34,8 @@ TUPLE: matcher subject compiled-pcre ofs match ;
|
||||||
: <pcre> ( expr -- pcre )
|
: <pcre> ( expr -- pcre )
|
||||||
dup (pcre) 2array swap [ 2nip ] [ malformed-regexp ] if* ;
|
dup (pcre) 2array swap [ 2nip ] [ malformed-regexp ] if* ;
|
||||||
|
|
||||||
:: exec ( subject ofs pcre extra -- count match-data )
|
: exec ( pcre extra subject ofs opts -- count match-data )
|
||||||
pcre extra subject dup length ofs 0 30 int <c-array>
|
[ dup length ] 2dip 30 int <c-array> 30 [ pcre_exec ] 2keep drop ;
|
||||||
[ 30 pcre_exec ] keep ;
|
|
||||||
|
|
||||||
: <pcre-extra> ( pcre -- pcre-extra )
|
: <pcre-extra> ( pcre -- pcre-extra )
|
||||||
0 { c-string } [ pcre_study ] with-out-parameters drop ;
|
0 { c-string } [ pcre_study ] with-out-parameters drop ;
|
||||||
|
@ -39,43 +44,68 @@ TUPLE: matcher subject compiled-pcre ofs match ;
|
||||||
{ int } [ pcre_config ] with-out-parameters ;
|
{ int } [ pcre_config ] with-out-parameters ;
|
||||||
|
|
||||||
! Finding stuff
|
! Finding stuff
|
||||||
: (findnext) ( subject ofs compiled-pcre -- match/f )
|
TUPLE: matcher pcre extra subject ofs exec-opts match ;
|
||||||
[ pcre>> ] [ extra>> ] bi exec over
|
|
||||||
dup -1 < [ pcre-error ] [ dup -1 = [ 3drop f ] [ drop 2array ] if ] if ;
|
: <matcher> ( subject compiled-pcre -- matcher )
|
||||||
|
[ utf8 encode ] dip [ pcre>> ] [ extra>> ] bi rot 0 0 f matcher boa ;
|
||||||
|
|
||||||
|
: exec-result>match ( count match-data -- match/f )
|
||||||
|
over dup -1 <
|
||||||
|
[ pcre-error ] [ dup -1 = [ 3drop f ] [ drop 2array ] if ] if ;
|
||||||
|
|
||||||
|
! This handling of zero-length matches is taken from pcredemo.c
|
||||||
|
: empty-match-opts ( -- opts )
|
||||||
|
PCRE_NOTEMPTY_ATSTART PCRE_ANCHORED bitor ;
|
||||||
|
|
||||||
: findnext ( matcher -- matcher'/f )
|
: findnext ( matcher -- matcher'/f )
|
||||||
clone dup [ subject>> ] [ ofs>> ] [ compiled-pcre>> ] tri (findnext)
|
clone dup <mirror> values 6 firstn drop exec exec-result>match
|
||||||
[ [ >>match ] [ second second >>ofs ] bi ] [ drop f ] if* ;
|
[
|
||||||
|
[ >>match ]
|
||||||
|
[
|
||||||
|
second
|
||||||
|
[ first2 = [ empty-match-opts ] [ 0 ] if >>exec-opts ]
|
||||||
|
[ second >>ofs ] bi
|
||||||
|
] bi
|
||||||
|
]
|
||||||
|
[
|
||||||
|
dup exec-opts>> 0 =
|
||||||
|
[ drop f ]
|
||||||
|
[
|
||||||
|
dup [ ofs>> 1 + ] [ subject>> length ] bi over <
|
||||||
|
[ 2drop f ]
|
||||||
|
[
|
||||||
|
[ >>ofs ] [ drop 0 >>exec-opts ] bi
|
||||||
|
] if
|
||||||
|
] if
|
||||||
|
] if* ;
|
||||||
|
|
||||||
! Result parsing
|
! Result parsing
|
||||||
: substring-list ( subject match-data count -- alien )
|
: substring-list ( subject match-array count -- alien )
|
||||||
{ void* } [ pcre_get_substring_list drop ] with-out-parameters ;
|
{ void* } [ pcre_get_substring_list drop ] with-out-parameters ;
|
||||||
|
|
||||||
: parse-groups ( ngroups seq -- match )
|
: parse-match ( subject nametable match-data -- match )
|
||||||
swap 2 * head 2 <groups> [ >array ] map ;
|
|
||||||
|
|
||||||
: parse-match ( subject compiled-pcre match-data -- match )
|
|
||||||
swapd first2 swap [ substring-list ] keep void* <c-direct-array>
|
swapd first2 swap [ substring-list ] keep void* <c-direct-array>
|
||||||
[ alien>native-string ] { } map-as [ nametable>> ] dip
|
[ alien>native-string ] { } map-as [ of swap 2array ] with map-index ;
|
||||||
[ of swap 2array ] with map-index ;
|
|
||||||
|
|
||||||
! High-level
|
! High-level
|
||||||
: <compiled-pcre> ( expr -- compiled-pcre )
|
: <compiled-pcre> ( expr -- compiled-pcre )
|
||||||
<pcre> dup <pcre-extra> 2dup name-table-entries compiled-pcre boa ;
|
<pcre> dup <pcre-extra> 2dup name-table-entries compiled-pcre boa ;
|
||||||
|
|
||||||
|
: has-option? ( compiled-pcre option -- ? )
|
||||||
|
[ pcre>> options ] dip bitand 0 > ;
|
||||||
|
|
||||||
GENERIC: findall ( subject obj -- matches )
|
GENERIC: findall ( subject obj -- matches )
|
||||||
|
|
||||||
M: compiled-pcre findall
|
M: compiled-pcre findall
|
||||||
[ utf8 encode ] dip 2dup 0 f matcher boa [ findnext ] follow
|
[ <matcher> [ findnext ] follow [ match>> ] map pruned harvest ]
|
||||||
[ match>> ] map harvest [ parse-match ] 2with map ;
|
[ nametable>> rot [ parse-match ] 2with map ] 2bi >array ;
|
||||||
|
|
||||||
M: string findall
|
M: string findall
|
||||||
<compiled-pcre> findall ;
|
<compiled-pcre> findall ;
|
||||||
|
|
||||||
GENERIC: matches? ( subject obj -- ? )
|
: matches? ( subject obj -- ? )
|
||||||
|
|
||||||
M: compiled-pcre matches?
|
|
||||||
dupd findall [ nip length 1 = ] [ ?first ?first ?last = ] 2bi and ;
|
dupd findall [ nip length 1 = ] [ ?first ?first ?last = ] 2bi and ;
|
||||||
|
|
||||||
M: string matches?
|
: split ( subject obj -- strings )
|
||||||
<compiled-pcre> matches? ;
|
dupd findall [ first second ] map
|
||||||
|
dup first [ replace-all ] keep splitting:split harvest ;
|
||||||
|
|
Loading…
Reference in New Issue