pcre: lots of unit tests and a nice interface to the vocab in the findall and matches? words
							parent
							
								
									152ccf4b27
								
							
						
					
					
						commit
						2dd398b701
					
				|  | @ -0,0 +1,4 @@ | ||||||
|  | USING: pcre.ffi sequences splitting tools.test ; | ||||||
|  | IN: pcre.ffi.tests | ||||||
|  | 
 | ||||||
|  | [ 2 ] [ pcre_version " " split length ] unit-test | ||||||
|  | @ -9,6 +9,68 @@ IN: pcre.ffi | ||||||
|     { [ os unix? ] [ "libpcre" "libpcre.so" cdecl add-library ] } |     { [ os unix? ] [ "libpcre" "libpcre.so" cdecl add-library ] } | ||||||
| } cond >> | } cond >> | ||||||
| 
 | 
 | ||||||
|  | CONSTANT: PCRE_CASELESS           0x00000001 | ||||||
|  | CONSTANT: PCRE_MULTILINE          0x00000002 | ||||||
|  | CONSTANT: PCRE_DOTALL             0x00000004 | ||||||
|  | CONSTANT: PCRE_EXTENDED           0x00000008 | ||||||
|  | CONSTANT: PCRE_ANCHORED           0x00000010 | ||||||
|  | CONSTANT: PCRE_DOLLAR_ENDONLY     0x00000020 | ||||||
|  | CONSTANT: PCRE_EXTRA              0x00000040 | ||||||
|  | CONSTANT: PCRE_NOTBOL             0x00000080 | ||||||
|  | CONSTANT: PCRE_NOTEOL             0x00000100 | ||||||
|  | CONSTANT: PCRE_UNGREEDY           0x00000200 | ||||||
|  | CONSTANT: PCRE_NOTEMPTY           0x00000400 | ||||||
|  | CONSTANT: PCRE_UTF8               0x00000800 | ||||||
|  | CONSTANT: PCRE_NO_AUTO_CAPTURE    0x00001000 | ||||||
|  | CONSTANT: PCRE_NO_UTF8_CHECK      0x00002000 | ||||||
|  | CONSTANT: PCRE_AUTO_CALLOUT       0x00004000 | ||||||
|  | CONSTANT: PCRE_PARTIAL_SOFT       0x00008000 | ||||||
|  | CONSTANT: PCRE_PARTIAL            0x00008000 | ||||||
|  | CONSTANT: PCRE_DFA_SHORTEST       0x00010000 | ||||||
|  | CONSTANT: PCRE_DFA_RESTART        0x00020000 | ||||||
|  | CONSTANT: PCRE_FIRSTLINE          0x00040000 | ||||||
|  | CONSTANT: PCRE_DUPNAMES           0x00080000 | ||||||
|  | CONSTANT: PCRE_NEWLINE_CR         0x00100000 | ||||||
|  | CONSTANT: PCRE_NEWLINE_LF         0x00200000 | ||||||
|  | CONSTANT: PCRE_NEWLINE_CRLF       0x00300000 | ||||||
|  | CONSTANT: PCRE_NEWLINE_ANY        0x00400000 | ||||||
|  | CONSTANT: PCRE_NEWLINE_ANYCRLF    0x00500000 | ||||||
|  | CONSTANT: PCRE_BSR_ANYCRLF        0x00800000 | ||||||
|  | CONSTANT: PCRE_BSR_UNICODE        0x01000000 | ||||||
|  | CONSTANT: PCRE_JAVASCRIPT_COMPAT  0x02000000 | ||||||
|  | CONSTANT: PCRE_NO_START_OPTIMIZE  0x04000000 | ||||||
|  | CONSTANT: PCRE_NO_START_OPTIMISE  0x04000000 | ||||||
|  | CONSTANT: PCRE_PARTIAL_HARD       0x08000000 | ||||||
|  | CONSTANT: PCRE_NOTEMPTY_ATSTART   0x10000000 | ||||||
|  | CONSTANT: PCRE_UCP                0x20000000 | ||||||
|  | 
 | ||||||
|  | CONSTANT: PCRE_ERROR_NOMATCH         -1 | ||||||
|  | CONSTANT: PCRE_ERROR_NULL            -2 | ||||||
|  | CONSTANT: PCRE_ERROR_BADOPTION       -3 | ||||||
|  | CONSTANT: PCRE_ERROR_BADMAGIC        -4 | ||||||
|  | CONSTANT: PCRE_ERROR_UNKNOWN_OPCODE  -5 | ||||||
|  | CONSTANT: PCRE_ERROR_UNKNOWN_NODE    -5 | ||||||
|  | CONSTANT: PCRE_ERROR_NOMEMORY        -6 | ||||||
|  | CONSTANT: PCRE_ERROR_NOSUBSTRING     -7 | ||||||
|  | CONSTANT: PCRE_ERROR_MATCHLIMIT      -8 | ||||||
|  | CONSTANT: PCRE_ERROR_CALLOUT         -9 | ||||||
|  | CONSTANT: PCRE_ERROR_BADUTF8        -10 | ||||||
|  | CONSTANT: PCRE_ERROR_BADUTF8_OFFSET -11 | ||||||
|  | CONSTANT: PCRE_ERROR_PARTIAL        -12 | ||||||
|  | CONSTANT: PCRE_ERROR_BADPARTIAL     -13 | ||||||
|  | CONSTANT: PCRE_ERROR_INTERNAL       -14 | ||||||
|  | CONSTANT: PCRE_ERROR_BADCOUNT       -15 | ||||||
|  | CONSTANT: PCRE_ERROR_DFA_UITEM      -16 | ||||||
|  | CONSTANT: PCRE_ERROR_DFA_UCOND      -17 | ||||||
|  | CONSTANT: PCRE_ERROR_DFA_UMLIMIT    -18 | ||||||
|  | CONSTANT: PCRE_ERROR_DFA_WSSIZE     -19 | ||||||
|  | CONSTANT: PCRE_ERROR_DFA_RECURSE    -20 | ||||||
|  | CONSTANT: PCRE_ERROR_RECURSIONLIMIT -21 | ||||||
|  | CONSTANT: PCRE_ERROR_NULLWSLIMIT    -22 | ||||||
|  | CONSTANT: PCRE_ERROR_BADNEWLINE     -23 | ||||||
|  | CONSTANT: PCRE_ERROR_BADOFFSET      -24 | ||||||
|  | CONSTANT: PCRE_ERROR_SHORTUTF8      -25 | ||||||
|  | 
 | ||||||
| CONSTANT: PCRE_INFO_OPTIONS            0 | CONSTANT: PCRE_INFO_OPTIONS            0 | ||||||
| CONSTANT: PCRE_INFO_SIZE               1 | CONSTANT: PCRE_INFO_SIZE               1 | ||||||
| CONSTANT: PCRE_INFO_CAPTURECOUNT       2 | CONSTANT: PCRE_INFO_CAPTURECOUNT       2 | ||||||
|  | @ -27,6 +89,17 @@ CONSTANT: PCRE_INFO_JCHANGED          13 | ||||||
| CONSTANT: PCRE_INFO_HASCRORLF         14 | CONSTANT: PCRE_INFO_HASCRORLF         14 | ||||||
| CONSTANT: PCRE_INFO_MINLENGTH         15 | CONSTANT: PCRE_INFO_MINLENGTH         15 | ||||||
| 
 | 
 | ||||||
|  | CONSTANT: PCRE_CONFIG_UTF8                    0 | ||||||
|  | CONSTANT: PCRE_CONFIG_NEWLINE                 1 | ||||||
|  | CONSTANT: PCRE_CONFIG_LINK_SIZE               2 | ||||||
|  | CONSTANT: PCRE_CONFIG_POSIX_MALLOC_THRESHOLD  3 | ||||||
|  | CONSTANT: PCRE_CONFIG_MATCH_LIMIT             4 | ||||||
|  | CONSTANT: PCRE_CONFIG_STACKRECURSE            5 | ||||||
|  | CONSTANT: PCRE_CONFIG_UNICODE_PROPERTIES      6 | ||||||
|  | CONSTANT: PCRE_CONFIG_MATCH_LIMIT_RECURSION   7 | ||||||
|  | CONSTANT: PCRE_CONFIG_BSR                     8 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| STRUCT: pcre_extra | STRUCT: pcre_extra | ||||||
|     { flags int } |     { flags int } | ||||||
|     { study_data void* } |     { study_data void* } | ||||||
|  | @ -36,12 +109,21 @@ STRUCT: pcre_extra | ||||||
|     { match_limit_recursion int } |     { match_limit_recursion int } | ||||||
|     { mark uchar** } ; |     { mark uchar** } ; | ||||||
| 
 | 
 | ||||||
|  | FUNCTION: void pcre_config ( int what, void* where ) ; | ||||||
|  | 
 | ||||||
| FUNCTION: void* pcre_compile ( c-string pattern, | FUNCTION: void* pcre_compile ( c-string pattern, | ||||||
|                                int options, |                                int options, | ||||||
|                                char** errptr, |                                char** errptr, | ||||||
|                                int* erroffset, |                                int* erroffset, | ||||||
|                                char* tableptr ) ; |                                char* tableptr ) ; | ||||||
| 
 | 
 | ||||||
|  | FUNCTION: void* pcre_compile2 ( c-string pattern, | ||||||
|  |                                 int options, | ||||||
|  |                                 int* errcodeptr, | ||||||
|  |                                 char** errptr, | ||||||
|  |                                 int* erroffset, | ||||||
|  |                                 char* tableptr ) ; | ||||||
|  | 
 | ||||||
| FUNCTION: int pcre_info ( void* pcre, int* optptr, int* first_byte ) ; | FUNCTION: int pcre_info ( void* pcre, int* optptr, int* first_byte ) ; | ||||||
| FUNCTION: int pcre_fullinfo ( void* pcre, pcre_extra* extra, int what, void *where ) ; | FUNCTION: int pcre_fullinfo ( void* pcre, pcre_extra* extra, int what, void *where ) ; | ||||||
| 
 | 
 | ||||||
|  | @ -55,12 +137,19 @@ FUNCTION: int pcre_exec ( void* pcre, | ||||||
|                           int* ovector, |                           int* ovector, | ||||||
|                           int ovecsize ) ; |                           int ovecsize ) ; | ||||||
| 
 | 
 | ||||||
|  | FUNCTION: int pcre_get_stringnumber ( void* pcre, c-string name ) ; | ||||||
|  | 
 | ||||||
| FUNCTION: int pcre_get_substring ( c-string subject, | FUNCTION: int pcre_get_substring ( c-string subject, | ||||||
|                                    int* ovector, |                                    int* ovector, | ||||||
|                                    int stringcount, |                                    int stringcount, | ||||||
|                                    int stringnumber, |                                    int stringnumber, | ||||||
|                                    void *stringptr ) ; |                                    void *stringptr ) ; | ||||||
| 
 | 
 | ||||||
|  | FUNCTION: int pcre_get_substring_list ( c-string subject, | ||||||
|  |                                         int* ovector, | ||||||
|  |                                         int stringcount, | ||||||
|  |                                         void *stringptr ) ; | ||||||
|  | 
 | ||||||
| FUNCTION: c-string pcre_version ( ) ; | FUNCTION: c-string pcre_version ( ) ; | ||||||
| 
 | 
 | ||||||
| FUNCTION: uchar* pcre_maketables ( ) ; | FUNCTION: uchar* pcre_maketables ( ) ; | ||||||
|  |  | ||||||
|  | @ -0,0 +1,9 @@ | ||||||
|  | USING: pcre pcre.info sequences tools.test ; | ||||||
|  | 
 | ||||||
|  | [ { { 3 "day" } { 2 "month" } { 1 "year" } } ] | ||||||
|  | [ | ||||||
|  |     "(?P<year>\\d{4})-(?P<month>\\d{2})-(?P<day>\\d{2})" <compiled-pcre> | ||||||
|  |     name-table-entries | ||||||
|  | ] unit-test | ||||||
|  | 
 | ||||||
|  | [ { 100 110 120 130 } ] [ 100 10 4 gen-array-addrs ] unit-test | ||||||
|  | @ -0,0 +1,40 @@ | ||||||
|  | USING: | ||||||
|  |     accessors | ||||||
|  |     alien alien.accessors alien.c-types alien.data alien.strings | ||||||
|  |     arrays | ||||||
|  |     kernel | ||||||
|  |     math | ||||||
|  |     pcre.ffi | ||||||
|  |     sequences ; | ||||||
|  | IN: pcre.info | ||||||
|  | 
 | ||||||
|  | ! Gen. utility | ||||||
|  | : 2with ( param1 param2 obj quot -- obj curry ) | ||||||
|  |     [ -rot ] dip [ [ rot ] dip call ] 3curry ; inline | ||||||
|  | 
 | ||||||
|  | : gen-array-addrs ( base size n -- addrs ) | ||||||
|  |     iota [ * + ] 2with map ; | ||||||
|  | 
 | ||||||
|  | ! Mostly internal | ||||||
|  | : fullinfo ( pcre extra what -- obj ) | ||||||
|  |     { int } [ pcre_fullinfo ] with-out-parameters nip ; | ||||||
|  | 
 | ||||||
|  | : name-count ( pcre extra -- n ) | ||||||
|  |     PCRE_INFO_NAMECOUNT fullinfo ; | ||||||
|  | 
 | ||||||
|  | : name-table ( pcre extra -- addr ) | ||||||
|  |     PCRE_INFO_NAMETABLE fullinfo ; | ||||||
|  | 
 | ||||||
|  | : name-entry-size ( pcre extra -- size ) | ||||||
|  |     PCRE_INFO_NAMEENTRYSIZE fullinfo ; | ||||||
|  | 
 | ||||||
|  | : name-table-entry ( addr -- group-index group-name ) | ||||||
|  |     [ <alien> 1 alien-unsigned-1 ] [ 2 + <alien> alien>native-string ] bi ; | ||||||
|  | 
 | ||||||
|  | : options ( pcre -- opts ) | ||||||
|  |     f PCRE_INFO_OPTIONS fullinfo ; | ||||||
|  | 
 | ||||||
|  | ! Exported | ||||||
|  | : name-table-entries ( pcre extra -- addrs ) | ||||||
|  |     [ name-table ] [ name-entry-size ] [ name-count ] 2tri gen-array-addrs | ||||||
|  |     [ name-table-entry 2array ] map ; | ||||||
|  | @ -0,0 +1,62 @@ | ||||||
|  | USING: | ||||||
|  |     accessors | ||||||
|  |     arrays | ||||||
|  |     kernel | ||||||
|  |     math math.ranges | ||||||
|  |     pcre pcre.ffi pcre.info | ||||||
|  |     random | ||||||
|  |     sequences | ||||||
|  |     splitting | ||||||
|  |     system | ||||||
|  |     tools.test ; | ||||||
|  | IN: pcre.tests | ||||||
|  | 
 | ||||||
|  | CONSTANT: iso-date "(?P<year>\\d{4})-(?P<month>\\d{2})-(?P<day>\\d{2})" | ||||||
|  | 
 | ||||||
|  | [ { f -1 } ] [ "foo" (pcre) 3array 1 tail ] unit-test | ||||||
|  | 
 | ||||||
|  | [ { 1 2 3 } ] [ | ||||||
|  |     iso-date <pcre> | ||||||
|  |     { "year" "month" "day" } [ pcre_get_stringnumber ] with map | ||||||
|  | ] unit-test | ||||||
|  | 
 | ||||||
|  | [ t ] [ "foo" <compiled-pcre> pcre>> options PCRE_UTF8 bitand 0 > ] unit-test | ||||||
|  | 
 | ||||||
|  | os unix? [ [ 10 ] [ PCRE_CONFIG_NEWLINE config ] unit-test ] when | ||||||
|  | 
 | ||||||
|  | ! In this day and age, not supporting utf-8 is broken. | ||||||
|  | [ 1 ] [ PCRE_CONFIG_UTF8 config ] unit-test | ||||||
|  | 
 | ||||||
|  | [ 1 ] [ PCRE_CONFIG_UNICODE_PROPERTIES config ] unit-test | ||||||
|  | 
 | ||||||
|  | ! Tests for findall | ||||||
|  | [ | ||||||
|  |     { { f "1999-01-12" } { "year" "1999" } { "month" "01" } { "day" "12" } } | ||||||
|  | ] [ | ||||||
|  |     "1999-01-12" iso-date <compiled-pcre> findall first | ||||||
|  | ] unit-test | ||||||
|  | 
 | ||||||
|  | [ 3 ] [ | ||||||
|  |     "2003-10-09 1999-09-01 1514-10-20" iso-date <compiled-pcre> findall length | ||||||
|  | ] unit-test | ||||||
|  | 
 | ||||||
|  | [ 5 ] [ "abcdef" "[a-e]" findall length ] unit-test | ||||||
|  | 
 | ||||||
|  | [ 3 ] [ "foo bar baz" "foo|bar|baz" findall length ] unit-test | ||||||
|  | 
 | ||||||
|  | [ 3 ] [ "örjan är åtta" "[åäö]" findall length ] unit-test | ||||||
|  | 
 | ||||||
|  | [ 3 ] [ "ÅÄÖ" "\\p{Lu}" findall length ] unit-test | ||||||
|  | 
 | ||||||
|  | [ 3 ] [ "foobar" "foo(?=bar)" findall first first second length ] unit-test | ||||||
|  | 
 | ||||||
|  | : long-string ( -- x ) | ||||||
|  |     10000 [ CHAR: a CHAR: z [a,b] random ] "" replicate-as ; | ||||||
|  | 
 | ||||||
|  | ! Performance | ||||||
|  | [ 0 ] [ long-string ".{0,15}foobar.{0,10}" findall length ] unit-test | ||||||
|  | 
 | ||||||
|  | ! Tests for matches? | ||||||
|  | [ t ] [ "örjan" "örjan" matches? ] unit-test | ||||||
|  | 
 | ||||||
|  | [ t ] [ "abcö" "\\p{Ll}{4}" matches? ] unit-test | ||||||
|  | @ -1,45 +1,81 @@ | ||||||
| USING: | USING: | ||||||
|     accessors |     accessors | ||||||
|     alien.c-types alien.data |     alien.c-types alien.data alien.strings | ||||||
|     arrays |     arrays | ||||||
|  |     assocs | ||||||
|  |     grouping | ||||||
|  |     io.encodings.utf8 io.encodings.string | ||||||
|     kernel |     kernel | ||||||
|     pcre.ffi |     locals | ||||||
|     sequences ; |     math | ||||||
|  |     pcre.ffi pcre.info | ||||||
|  |     sequences | ||||||
|  |     strings ; | ||||||
| IN: pcre | IN: pcre | ||||||
| 
 | 
 | ||||||
| ERROR: malformed-regexp expr error ; | ERROR: malformed-regexp expr error ; | ||||||
|  | ERROR: pcre-error value ; | ||||||
| 
 | 
 | ||||||
| TUPLE: compiled-pcre pcre extra ; | TUPLE: compiled-pcre pcre extra nametable ; | ||||||
|  | TUPLE: matcher subject compiled-pcre ofs match ; | ||||||
| 
 | 
 | ||||||
| ! Low-level | : default-opts ( -- opts ) | ||||||
| 
 |     PCRE_UTF8 PCRE_UCP bitor ; | ||||||
| : exec ( pcre extra subject ofs -- count match-data ) |  | ||||||
|     [ dup length ] dip 0 30 int <c-array> [ 30 pcre_exec ] keep ; |  | ||||||
| 
 | 
 | ||||||
| : (pcre) ( expr -- pcre err-message err-offset ) | : (pcre) ( expr -- pcre err-message err-offset ) | ||||||
|     0 { c-string int } [ f pcre_compile ] with-out-parameters ; |     default-opts { c-string int } [ f pcre_compile ] with-out-parameters ; | ||||||
| 
 | 
 | ||||||
| : <pcre> ( expr -- pcre ) | : <pcre> ( expr -- pcre ) | ||||||
|     dup (pcre) 2array swap [ 2nip ] [ malformed-regexp ] if* ; |     dup (pcre) 2array swap [ 2nip ] [ malformed-regexp ] if* ; | ||||||
| 
 | 
 | ||||||
|  | :: exec ( subject ofs pcre extra -- count match-data ) | ||||||
|  |     pcre extra subject dup length ofs 0 30 int <c-array> | ||||||
|  |     [ 30 pcre_exec ] keep ; | ||||||
|  | 
 | ||||||
| : <pcre-extra> ( pcre -- pcre-extra ) | : <pcre-extra> ( pcre -- pcre-extra ) | ||||||
|     0 { c-string } [ pcre_study ] with-out-parameters drop ; |     0 { c-string } [ pcre_study ] with-out-parameters drop ; | ||||||
| 
 | 
 | ||||||
|  | : config ( what -- alien ) | ||||||
|  |     { int } [ pcre_config ] with-out-parameters ; | ||||||
|  | 
 | ||||||
|  | ! Finding stuff | ||||||
|  | : (findnext) ( subject ofs compiled-pcre -- match/f ) | ||||||
|  |     [ pcre>> ] [ extra>> ] bi exec over | ||||||
|  |     dup -1 < [ pcre-error ] [ dup -1 = [ 3drop f ] [ drop 2array ] if ] if ; | ||||||
|  | 
 | ||||||
|  | : findnext ( matcher -- matcher'/f ) | ||||||
|  |     clone dup [ subject>> ] [ ofs>> ] [ compiled-pcre>> ] tri (findnext) | ||||||
|  |     [ [ >>match ] [ second second >>ofs ] bi ] [ drop f ] if* ; | ||||||
|  | 
 | ||||||
|  | ! Result parsing | ||||||
|  | : substring-list ( subject match-data count -- alien ) | ||||||
|  |     { void* } [ pcre_get_substring_list drop ] with-out-parameters ; | ||||||
|  | 
 | ||||||
|  | : parse-groups ( ngroups seq -- match ) | ||||||
|  |     swap 2 * head 2 <groups> [ >array ] map ; | ||||||
|  | 
 | ||||||
|  | : parse-match ( subject compiled-pcre match-data -- match ) | ||||||
|  |     swapd first2 swap [ substring-list ] keep void* <c-direct-array> | ||||||
|  |     [ alien>native-string ] { } map-as [ nametable>> ] dip | ||||||
|  |     [ of swap 2array ] with map-index ; | ||||||
|  | 
 | ||||||
| ! High-level | ! High-level | ||||||
| 
 |  | ||||||
| : <compiled-pcre> ( expr -- compiled-pcre ) | : <compiled-pcre> ( expr -- compiled-pcre ) | ||||||
|     <pcre> dup <pcre-extra> compiled-pcre boa ; |     <pcre> dup <pcre-extra> 2dup name-table-entries compiled-pcre boa ; | ||||||
| 
 | 
 | ||||||
| : findall ( subject compiled-pcre -- matches ) | GENERIC: findall ( subject obj -- matches ) | ||||||
|     [ pcre>> ] [ extra>> ] bi rot 0 exec nip ; |  | ||||||
| 
 | 
 | ||||||
|  | M: compiled-pcre findall | ||||||
|  |     [ utf8 encode ] dip 2dup 0 f matcher boa [ findnext ] follow | ||||||
|  |     [ match>> ] map harvest [ parse-match ] 2with map ; | ||||||
| 
 | 
 | ||||||
|  | M: string findall | ||||||
|  |     <compiled-pcre> findall ; | ||||||
| 
 | 
 | ||||||
| : info ( pcre -- x x x ) | GENERIC: matches? ( subject obj -- ? ) | ||||||
|     { int int } [ pcre_info ] with-out-parameters ; |  | ||||||
| 
 | 
 | ||||||
| : fullinfo ( pcre pcre-extra what -- num x ) | M: compiled-pcre matches? | ||||||
|     { int } [ pcre_fullinfo ] with-out-parameters ; |     dupd findall [ nip length 1 = ] [ ?first ?first ?last = ] 2bi and ; | ||||||
| 
 | 
 | ||||||
| : substring ( subject match-data count n -- str ) | M: string matches? | ||||||
|     { c-string } [ pcre_get_substring drop ] with-out-parameters ; |     <compiled-pcre> matches? ; | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue