diff --git a/basis/pcre/info-tests.factor b/basis/pcre/info-tests.factor index 3086d58dae..361c907c95 100644 --- a/basis/pcre/info-tests.factor +++ b/basis/pcre/info-tests.factor @@ -1,9 +1,9 @@ -USING: pcre pcre.info sequences tools.test ; +USING: accessors pcre pcre.info sequences tools.test ; [ { { 3 "day" } { 2 "month" } { 1 "year" } } ] [ "(?P\\d{4})-(?P\\d{2})-(?P\\d{2})" - name-table-entries + nametable>> ] unit-test [ { 100 110 120 130 } ] [ 100 10 4 gen-array-addrs ] unit-test diff --git a/basis/pcre/pcre-tests.factor b/basis/pcre/pcre-tests.factor index 1be1f2d504..05866f4588 100644 --- a/basis/pcre/pcre-tests.factor +++ b/basis/pcre/pcre-tests.factor @@ -1,12 +1,13 @@ USING: accessors arrays + assocs + http.client kernel math math.ranges pcre pcre.ffi pcre.info random sequences - splitting system tools.test ; IN: pcre.tests @@ -20,7 +21,7 @@ CONSTANT: iso-date "(?P\\d{4})-(?P\\d{2})-(?P\\d{2})" { "year" "month" "day" } [ pcre_get_stringnumber ] with map ] unit-test -[ t ] [ "foo" pcre>> options PCRE_UTF8 bitand 0 > ] unit-test +[ t ] [ "foo" PCRE_UTF8 has-option? ] unit-test os unix? [ [ 10 ] [ PCRE_CONFIG_NEWLINE config ] unit-test ] when @@ -50,13 +51,57 @@ os unix? [ [ 10 ] [ PCRE_CONFIG_NEWLINE config ] unit-test ] when [ 3 ] [ "foobar" "foo(?=bar)" findall first first second length ] unit-test +[ { ", " ", " "." } ] [ + "Words, words, word." "\\W+" findall [ first second ] map +] unit-test + : long-string ( -- x ) 10000 [ CHAR: a CHAR: z [a,b] random ] "" replicate-as ; ! Performance [ 0 ] [ long-string ".{0,15}foobar.{0,10}" findall length ] unit-test +! Empty matches, corner case behaviour is copied from pcredemo.c +[ { { { f "foo" } } { { f "" } } } ] +[ "foo" ".*" findall ] unit-test + +[ { { { f "" } } { { f "" } } { { f "" } } { { f "" } } } ] +[ "foo" "B*" findall ] unit-test + ! Tests for matches? [ t ] [ "örjan" "örjan" matches? ] unit-test [ t ] [ "abcö" "\\p{Ll}{4}" matches? ] unit-test + +! Dotall mode, off by default +[ f ] [ "." PCRE_DOTALL has-option? ] unit-test +[ t ] [ "(?s)." PCRE_DOTALL has-option? ] unit-test + +[ f ] [ "\n" "." matches? ] unit-test +[ t ] [ "\n" "(?s)." matches? ] unit-test + +! Caseless mode, off by default +[ { f t } ] [ + { "x" "(?i)x" } [ PCRE_CASELESS has-option? ] map +] unit-test + +! Backreferences +[ { t f } ] [ + { "response and responsibility" "sense and responsibility" } + [ "(sens|respons)e and \\1ibility" matches? ] map +] unit-test + +[ { t t f } ] [ + { "rah rah" "RAH RAH" "RAH rah" } [ "((?i)rah)\\s+\\1" matches? ] map +] unit-test + +! Splitting +[ { { "Words" "words" "word" } { "Words" "words" "word" } } ] [ + "Words, words, word." { "\\W+" "[,. ]" } [ split ] with map +] unit-test + +! Bigger tests +[ t ] [ + "http://factorcode.org/" http-get nip + "href=\"(?P[^\"]+)\"" findall [ "link" of ] map sequence? +] unit-test diff --git a/basis/pcre/pcre.factor b/basis/pcre/pcre.factor index 133f8dc00c..74e6b673d2 100644 --- a/basis/pcre/pcre.factor +++ b/basis/pcre/pcre.factor @@ -3,21 +3,27 @@ USING: alien.c-types alien.data alien.strings arrays assocs + fry grouping io.encodings.utf8 io.encodings.string kernel - locals math + mirrors pcre.ffi pcre.info - sequences + sequences sequences.generalizations + sets.private strings ; +QUALIFIED: splitting IN: pcre ERROR: malformed-regexp expr error ; ERROR: pcre-error value ; TUPLE: compiled-pcre pcre extra nametable ; -TUPLE: matcher subject compiled-pcre ofs match ; + +! Gen. utility +: replace-all ( seq subseqs new -- seq ) + swapd '[ _ splitting:replace ] reduce ; : default-opts ( -- opts ) PCRE_UTF8 PCRE_UCP bitor ; @@ -28,9 +34,8 @@ TUPLE: matcher subject compiled-pcre ofs match ; : ( expr -- pcre ) dup (pcre) 2array swap [ 2nip ] [ malformed-regexp ] if* ; -:: exec ( subject ofs pcre extra -- count match-data ) - pcre extra subject dup length ofs 0 30 int - [ 30 pcre_exec ] keep ; +: exec ( pcre extra subject ofs opts -- count match-data ) + [ dup length ] 2dip 30 int 30 [ pcre_exec ] 2keep drop ; : ( pcre -- pcre-extra ) 0 { c-string } [ pcre_study ] with-out-parameters drop ; @@ -39,43 +44,68 @@ TUPLE: matcher subject compiled-pcre ofs match ; { int } [ pcre_config ] with-out-parameters ; ! Finding stuff -: (findnext) ( subject ofs compiled-pcre -- match/f ) - [ pcre>> ] [ extra>> ] bi exec over - dup -1 < [ pcre-error ] [ dup -1 = [ 3drop f ] [ drop 2array ] if ] if ; +TUPLE: matcher pcre extra subject ofs exec-opts match ; + +: ( subject compiled-pcre -- matcher ) + [ utf8 encode ] dip [ pcre>> ] [ extra>> ] bi rot 0 0 f matcher boa ; + +: exec-result>match ( count match-data -- match/f ) + over dup -1 < + [ pcre-error ] [ dup -1 = [ 3drop f ] [ drop 2array ] if ] if ; + +! This handling of zero-length matches is taken from pcredemo.c +: empty-match-opts ( -- opts ) + PCRE_NOTEMPTY_ATSTART PCRE_ANCHORED bitor ; : findnext ( matcher -- matcher'/f ) - clone dup [ subject>> ] [ ofs>> ] [ compiled-pcre>> ] tri (findnext) - [ [ >>match ] [ second second >>ofs ] bi ] [ drop f ] if* ; + clone dup values 6 firstn drop exec exec-result>match + [ + [ >>match ] + [ + second + [ first2 = [ empty-match-opts ] [ 0 ] if >>exec-opts ] + [ second >>ofs ] bi + ] bi + ] + [ + dup exec-opts>> 0 = + [ drop f ] + [ + dup [ ofs>> 1 + ] [ subject>> length ] bi over < + [ 2drop f ] + [ + [ >>ofs ] [ drop 0 >>exec-opts ] bi + ] if + ] if + ] if* ; ! Result parsing -: substring-list ( subject match-data count -- alien ) +: substring-list ( subject match-array count -- alien ) { void* } [ pcre_get_substring_list drop ] with-out-parameters ; -: parse-groups ( ngroups seq -- match ) - swap 2 * head 2 [ >array ] map ; - -: parse-match ( subject compiled-pcre match-data -- match ) +: parse-match ( subject nametable match-data -- match ) swapd first2 swap [ substring-list ] keep void* - [ alien>native-string ] { } map-as [ nametable>> ] dip - [ of swap 2array ] with map-index ; + [ alien>native-string ] { } map-as [ of swap 2array ] with map-index ; ! High-level : ( expr -- compiled-pcre ) dup 2dup name-table-entries compiled-pcre boa ; +: has-option? ( compiled-pcre option -- ? ) + [ pcre>> options ] dip bitand 0 > ; + GENERIC: findall ( subject obj -- matches ) M: compiled-pcre findall - [ utf8 encode ] dip 2dup 0 f matcher boa [ findnext ] follow - [ match>> ] map harvest [ parse-match ] 2with map ; + [ [ findnext ] follow [ match>> ] map pruned harvest ] + [ nametable>> rot [ parse-match ] 2with map ] 2bi >array ; M: string findall findall ; -GENERIC: matches? ( subject obj -- ? ) - -M: compiled-pcre matches? +: matches? ( subject obj -- ? ) dupd findall [ nip length 1 = ] [ ?first ?first ?last = ] 2bi and ; -M: string matches? - matches? ; +: split ( subject obj -- strings ) + dupd findall [ first second ] map + dup first [ replace-all ] keep splitting:split harvest ;