pcre: add matches? and split verbs, now also handles tricky corner cases like zero-length matches
							parent
							
								
									2dd398b701
								
							
						
					
					
						commit
						e40a6cb927
					
				|  | @ -1,9 +1,9 @@ | ||||||
| USING: pcre pcre.info sequences tools.test ; | USING: accessors pcre pcre.info sequences tools.test ; | ||||||
| 
 | 
 | ||||||
| [ { { 3 "day" } { 2 "month" } { 1 "year" } } ] | [ { { 3 "day" } { 2 "month" } { 1 "year" } } ] | ||||||
| [ | [ | ||||||
|     "(?P<year>\\d{4})-(?P<month>\\d{2})-(?P<day>\\d{2})" <compiled-pcre> |     "(?P<year>\\d{4})-(?P<month>\\d{2})-(?P<day>\\d{2})" <compiled-pcre> | ||||||
|     name-table-entries |     nametable>> | ||||||
| ] unit-test | ] unit-test | ||||||
| 
 | 
 | ||||||
| [ { 100 110 120 130 } ] [ 100 10 4 gen-array-addrs ] unit-test | [ { 100 110 120 130 } ] [ 100 10 4 gen-array-addrs ] unit-test | ||||||
|  |  | ||||||
|  | @ -1,12 +1,13 @@ | ||||||
| USING: | USING: | ||||||
|     accessors |     accessors | ||||||
|     arrays |     arrays | ||||||
|  |     assocs | ||||||
|  |     http.client | ||||||
|     kernel |     kernel | ||||||
|     math math.ranges |     math math.ranges | ||||||
|     pcre pcre.ffi pcre.info |     pcre pcre.ffi pcre.info | ||||||
|     random |     random | ||||||
|     sequences |     sequences | ||||||
|     splitting |  | ||||||
|     system |     system | ||||||
|     tools.test ; |     tools.test ; | ||||||
| IN: pcre.tests | IN: pcre.tests | ||||||
|  | @ -20,7 +21,7 @@ CONSTANT: iso-date "(?P<year>\\d{4})-(?P<month>\\d{2})-(?P<day>\\d{2})" | ||||||
|     { "year" "month" "day" } [ pcre_get_stringnumber ] with map |     { "year" "month" "day" } [ pcre_get_stringnumber ] with map | ||||||
| ] unit-test | ] unit-test | ||||||
| 
 | 
 | ||||||
| [ t ] [ "foo" <compiled-pcre> pcre>> options PCRE_UTF8 bitand 0 > ] unit-test | [ t ] [ "foo" <compiled-pcre> PCRE_UTF8 has-option? ] unit-test | ||||||
| 
 | 
 | ||||||
| os unix? [ [ 10 ] [ PCRE_CONFIG_NEWLINE config ] unit-test ] when | os unix? [ [ 10 ] [ PCRE_CONFIG_NEWLINE config ] unit-test ] when | ||||||
| 
 | 
 | ||||||
|  | @ -50,13 +51,57 @@ os unix? [ [ 10 ] [ PCRE_CONFIG_NEWLINE config ] unit-test ] when | ||||||
| 
 | 
 | ||||||
| [ 3 ] [ "foobar" "foo(?=bar)" findall first first second length ] unit-test | [ 3 ] [ "foobar" "foo(?=bar)" findall first first second length ] unit-test | ||||||
| 
 | 
 | ||||||
|  | [ { ", " ", " "." } ] [ | ||||||
|  |     "Words, words, word." "\\W+" findall [ first second ] map | ||||||
|  | ] unit-test | ||||||
|  | 
 | ||||||
| : long-string ( -- x ) | : long-string ( -- x ) | ||||||
|     10000 [ CHAR: a CHAR: z [a,b] random ] "" replicate-as ; |     10000 [ CHAR: a CHAR: z [a,b] random ] "" replicate-as ; | ||||||
| 
 | 
 | ||||||
| ! Performance | ! Performance | ||||||
| [ 0 ] [ long-string ".{0,15}foobar.{0,10}" findall length ] unit-test | [ 0 ] [ long-string ".{0,15}foobar.{0,10}" findall length ] unit-test | ||||||
| 
 | 
 | ||||||
|  | ! Empty matches, corner case behaviour is copied from pcredemo.c | ||||||
|  | [ { { { f "foo" } } { { f "" } } } ] | ||||||
|  | [ "foo" ".*" findall ] unit-test | ||||||
|  | 
 | ||||||
|  | [ { { { f "" } } { { f "" } } { { f "" } } { { f "" } } } ] | ||||||
|  | [ "foo" "B*" findall ] unit-test | ||||||
|  | 
 | ||||||
| ! Tests for matches? | ! Tests for matches? | ||||||
| [ t ] [ "örjan" "örjan" matches? ] unit-test | [ t ] [ "örjan" "örjan" matches? ] unit-test | ||||||
| 
 | 
 | ||||||
| [ t ] [ "abcö" "\\p{Ll}{4}" matches? ] unit-test | [ t ] [ "abcö" "\\p{Ll}{4}" matches? ] unit-test | ||||||
|  | 
 | ||||||
|  | ! Dotall mode, off by default | ||||||
|  | [ f ] [ "." <compiled-pcre> PCRE_DOTALL has-option? ] unit-test | ||||||
|  | [ t ] [ "(?s)." <compiled-pcre> PCRE_DOTALL has-option? ] unit-test | ||||||
|  | 
 | ||||||
|  | [ f ] [ "\n" "." matches? ] unit-test | ||||||
|  | [ t ] [ "\n" "(?s)." matches? ] unit-test | ||||||
|  | 
 | ||||||
|  | ! Caseless mode, off by default | ||||||
|  | [ { f t } ] [ | ||||||
|  |     { "x" "(?i)x" } [ <compiled-pcre> PCRE_CASELESS has-option? ] map | ||||||
|  | ] unit-test | ||||||
|  | 
 | ||||||
|  | ! Backreferences | ||||||
|  | [ { t f } ] [ | ||||||
|  |     { "response and responsibility" "sense and responsibility" } | ||||||
|  |     [ "(sens|respons)e and \\1ibility" matches? ] map | ||||||
|  | ] unit-test | ||||||
|  | 
 | ||||||
|  | [ { t t f } ] [ | ||||||
|  |     { "rah rah" "RAH RAH" "RAH rah" } [ "((?i)rah)\\s+\\1" matches? ] map | ||||||
|  | ] unit-test | ||||||
|  | 
 | ||||||
|  | ! Splitting | ||||||
|  | [ { { "Words" "words" "word" } { "Words" "words" "word" } } ] [ | ||||||
|  |     "Words, words, word." { "\\W+" "[,. ]" } [ split ] with map | ||||||
|  | ] unit-test | ||||||
|  | 
 | ||||||
|  | ! Bigger tests | ||||||
|  | [ t ] [ | ||||||
|  |     "http://factorcode.org/" http-get nip | ||||||
|  |     "href=\"(?P<link>[^\"]+)\"" findall [ "link" of ] map sequence? | ||||||
|  | ] unit-test | ||||||
|  |  | ||||||
|  | @ -3,21 +3,27 @@ USING: | ||||||
|     alien.c-types alien.data alien.strings |     alien.c-types alien.data alien.strings | ||||||
|     arrays |     arrays | ||||||
|     assocs |     assocs | ||||||
|  |     fry | ||||||
|     grouping |     grouping | ||||||
|     io.encodings.utf8 io.encodings.string |     io.encodings.utf8 io.encodings.string | ||||||
|     kernel |     kernel | ||||||
|     locals |  | ||||||
|     math |     math | ||||||
|  |     mirrors | ||||||
|     pcre.ffi pcre.info |     pcre.ffi pcre.info | ||||||
|     sequences |     sequences sequences.generalizations | ||||||
|  |     sets.private | ||||||
|     strings ; |     strings ; | ||||||
|  | QUALIFIED: splitting | ||||||
| IN: pcre | IN: pcre | ||||||
| 
 | 
 | ||||||
| ERROR: malformed-regexp expr error ; | ERROR: malformed-regexp expr error ; | ||||||
| ERROR: pcre-error value ; | ERROR: pcre-error value ; | ||||||
| 
 | 
 | ||||||
| TUPLE: compiled-pcre pcre extra nametable ; | TUPLE: compiled-pcre pcre extra nametable ; | ||||||
| TUPLE: matcher subject compiled-pcre ofs match ; | 
 | ||||||
|  | ! Gen. utility | ||||||
|  | : replace-all ( seq subseqs new -- seq ) | ||||||
|  |     swapd '[ _ splitting:replace ] reduce ; | ||||||
| 
 | 
 | ||||||
| : default-opts ( -- opts ) | : default-opts ( -- opts ) | ||||||
|     PCRE_UTF8 PCRE_UCP bitor ; |     PCRE_UTF8 PCRE_UCP bitor ; | ||||||
|  | @ -28,9 +34,8 @@ TUPLE: matcher subject compiled-pcre ofs match ; | ||||||
| : <pcre> ( expr -- pcre ) | : <pcre> ( expr -- pcre ) | ||||||
|     dup (pcre) 2array swap [ 2nip ] [ malformed-regexp ] if* ; |     dup (pcre) 2array swap [ 2nip ] [ malformed-regexp ] if* ; | ||||||
| 
 | 
 | ||||||
| :: exec ( subject ofs pcre extra -- count match-data ) | : exec ( pcre extra subject ofs opts -- count match-data ) | ||||||
|     pcre extra subject dup length ofs 0 30 int <c-array> |     [ dup length ] 2dip 30 int <c-array> 30 [ pcre_exec ] 2keep drop ; | ||||||
|     [ 30 pcre_exec ] keep ; |  | ||||||
| 
 | 
 | ||||||
| : <pcre-extra> ( pcre -- pcre-extra ) | : <pcre-extra> ( pcre -- pcre-extra ) | ||||||
|     0 { c-string } [ pcre_study ] with-out-parameters drop ; |     0 { c-string } [ pcre_study ] with-out-parameters drop ; | ||||||
|  | @ -39,43 +44,68 @@ TUPLE: matcher subject compiled-pcre ofs match ; | ||||||
|     { int } [ pcre_config ] with-out-parameters ; |     { int } [ pcre_config ] with-out-parameters ; | ||||||
| 
 | 
 | ||||||
| ! Finding stuff | ! Finding stuff | ||||||
| : (findnext) ( subject ofs compiled-pcre -- match/f ) | TUPLE: matcher pcre extra subject ofs exec-opts match ; | ||||||
|     [ pcre>> ] [ extra>> ] bi exec over | 
 | ||||||
|     dup -1 < [ pcre-error ] [ dup -1 = [ 3drop f ] [ drop 2array ] if ] if ; | : <matcher> ( subject compiled-pcre -- matcher ) | ||||||
|  |     [ utf8 encode ] dip [ pcre>> ] [ extra>> ] bi rot 0 0 f matcher boa ; | ||||||
|  | 
 | ||||||
|  | : exec-result>match ( count match-data -- match/f ) | ||||||
|  |     over dup -1 < | ||||||
|  |     [ pcre-error ] [ dup -1 = [ 3drop f ] [ drop 2array ] if ] if ; | ||||||
|  | 
 | ||||||
|  | ! This handling of zero-length matches is taken from pcredemo.c | ||||||
|  | : empty-match-opts ( -- opts ) | ||||||
|  |     PCRE_NOTEMPTY_ATSTART PCRE_ANCHORED bitor ; | ||||||
| 
 | 
 | ||||||
| : findnext ( matcher -- matcher'/f ) | : findnext ( matcher -- matcher'/f ) | ||||||
|     clone dup [ subject>> ] [ ofs>> ] [ compiled-pcre>> ] tri (findnext) |     clone dup <mirror> values 6 firstn drop exec exec-result>match | ||||||
|     [ [ >>match ] [ second second >>ofs ] bi ] [ drop f ] if* ; |     [ | ||||||
|  |         [ >>match ] | ||||||
|  |         [ | ||||||
|  |             second | ||||||
|  |             [ first2 = [ empty-match-opts ] [ 0 ] if >>exec-opts ] | ||||||
|  |             [ second >>ofs ] bi | ||||||
|  |         ] bi | ||||||
|  |     ] | ||||||
|  |     [ | ||||||
|  |         dup exec-opts>> 0 = | ||||||
|  |         [ drop f ] | ||||||
|  |         [ | ||||||
|  |             dup [ ofs>> 1 + ] [ subject>> length ] bi over < | ||||||
|  |             [ 2drop f ] | ||||||
|  |             [ | ||||||
|  |                 [ >>ofs ] [ drop 0 >>exec-opts ] bi | ||||||
|  |             ] if | ||||||
|  |         ] if | ||||||
|  |     ] if* ; | ||||||
| 
 | 
 | ||||||
| ! Result parsing | ! Result parsing | ||||||
| : substring-list ( subject match-data count -- alien ) | : substring-list ( subject match-array count -- alien ) | ||||||
|     { void* } [ pcre_get_substring_list drop ] with-out-parameters ; |     { void* } [ pcre_get_substring_list drop ] with-out-parameters ; | ||||||
| 
 | 
 | ||||||
| : parse-groups ( ngroups seq -- match ) | : parse-match ( subject nametable match-data -- match ) | ||||||
|     swap 2 * head 2 <groups> [ >array ] map ; |  | ||||||
| 
 |  | ||||||
| : parse-match ( subject compiled-pcre match-data -- match ) |  | ||||||
|     swapd first2 swap [ substring-list ] keep void* <c-direct-array> |     swapd first2 swap [ substring-list ] keep void* <c-direct-array> | ||||||
|     [ alien>native-string ] { } map-as [ nametable>> ] dip |     [ alien>native-string ] { } map-as [ of swap 2array ] with map-index ; | ||||||
|     [ of swap 2array ] with map-index ; |  | ||||||
| 
 | 
 | ||||||
| ! High-level | ! High-level | ||||||
| : <compiled-pcre> ( expr -- compiled-pcre ) | : <compiled-pcre> ( expr -- compiled-pcre ) | ||||||
|     <pcre> dup <pcre-extra> 2dup name-table-entries compiled-pcre boa ; |     <pcre> dup <pcre-extra> 2dup name-table-entries compiled-pcre boa ; | ||||||
| 
 | 
 | ||||||
|  | : has-option? ( compiled-pcre option -- ? ) | ||||||
|  |     [ pcre>> options ] dip bitand 0 > ; | ||||||
|  | 
 | ||||||
| GENERIC: findall ( subject obj -- matches ) | GENERIC: findall ( subject obj -- matches ) | ||||||
| 
 | 
 | ||||||
| M: compiled-pcre findall | M: compiled-pcre findall | ||||||
|     [ utf8 encode ] dip 2dup 0 f matcher boa [ findnext ] follow |     [ <matcher> [ findnext ] follow [ match>> ] map pruned harvest ] | ||||||
|     [ match>> ] map harvest [ parse-match ] 2with map ; |     [ nametable>> rot [ parse-match ] 2with map ] 2bi >array ; | ||||||
| 
 | 
 | ||||||
| M: string findall | M: string findall | ||||||
|     <compiled-pcre> findall ; |     <compiled-pcre> findall ; | ||||||
| 
 | 
 | ||||||
| GENERIC: matches? ( subject obj -- ? ) | : matches? ( subject obj -- ? ) | ||||||
| 
 |  | ||||||
| M: compiled-pcre matches? |  | ||||||
|     dupd findall [ nip length 1 = ] [ ?first ?first ?last = ] 2bi and ; |     dupd findall [ nip length 1 = ] [ ?first ?first ?last = ] 2bi and ; | ||||||
| 
 | 
 | ||||||
| M: string matches? | : split ( subject obj -- strings ) | ||||||
|     <compiled-pcre> matches? ; |     dupd findall [ first second ] map | ||||||
|  |     dup first [ replace-all ] keep splitting:split harvest ; | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue