diff --git a/basis/pcre/ffi/ffi-tests.factor b/basis/pcre/ffi/ffi-tests.factor new file mode 100644 index 0000000000..491ae1efa6 --- /dev/null +++ b/basis/pcre/ffi/ffi-tests.factor @@ -0,0 +1,4 @@ +USING: pcre.ffi sequences splitting tools.test ; +IN: pcre.ffi.tests + +[ 2 ] [ pcre_version " " split length ] unit-test diff --git a/basis/pcre/ffi/ffi.factor b/basis/pcre/ffi/ffi.factor index e7ace4b1b7..837cb4edf6 100644 --- a/basis/pcre/ffi/ffi.factor +++ b/basis/pcre/ffi/ffi.factor @@ -9,6 +9,68 @@ IN: pcre.ffi { [ os unix? ] [ "libpcre" "libpcre.so" cdecl add-library ] } } cond >> +CONSTANT: PCRE_CASELESS 0x00000001 +CONSTANT: PCRE_MULTILINE 0x00000002 +CONSTANT: PCRE_DOTALL 0x00000004 +CONSTANT: PCRE_EXTENDED 0x00000008 +CONSTANT: PCRE_ANCHORED 0x00000010 +CONSTANT: PCRE_DOLLAR_ENDONLY 0x00000020 +CONSTANT: PCRE_EXTRA 0x00000040 +CONSTANT: PCRE_NOTBOL 0x00000080 +CONSTANT: PCRE_NOTEOL 0x00000100 +CONSTANT: PCRE_UNGREEDY 0x00000200 +CONSTANT: PCRE_NOTEMPTY 0x00000400 +CONSTANT: PCRE_UTF8 0x00000800 +CONSTANT: PCRE_NO_AUTO_CAPTURE 0x00001000 +CONSTANT: PCRE_NO_UTF8_CHECK 0x00002000 +CONSTANT: PCRE_AUTO_CALLOUT 0x00004000 +CONSTANT: PCRE_PARTIAL_SOFT 0x00008000 +CONSTANT: PCRE_PARTIAL 0x00008000 +CONSTANT: PCRE_DFA_SHORTEST 0x00010000 +CONSTANT: PCRE_DFA_RESTART 0x00020000 +CONSTANT: PCRE_FIRSTLINE 0x00040000 +CONSTANT: PCRE_DUPNAMES 0x00080000 +CONSTANT: PCRE_NEWLINE_CR 0x00100000 +CONSTANT: PCRE_NEWLINE_LF 0x00200000 +CONSTANT: PCRE_NEWLINE_CRLF 0x00300000 +CONSTANT: PCRE_NEWLINE_ANY 0x00400000 +CONSTANT: PCRE_NEWLINE_ANYCRLF 0x00500000 +CONSTANT: PCRE_BSR_ANYCRLF 0x00800000 +CONSTANT: PCRE_BSR_UNICODE 0x01000000 +CONSTANT: PCRE_JAVASCRIPT_COMPAT 0x02000000 +CONSTANT: PCRE_NO_START_OPTIMIZE 0x04000000 +CONSTANT: PCRE_NO_START_OPTIMISE 0x04000000 +CONSTANT: PCRE_PARTIAL_HARD 0x08000000 +CONSTANT: PCRE_NOTEMPTY_ATSTART 0x10000000 +CONSTANT: PCRE_UCP 0x20000000 + +CONSTANT: PCRE_ERROR_NOMATCH -1 +CONSTANT: PCRE_ERROR_NULL -2 +CONSTANT: PCRE_ERROR_BADOPTION -3 +CONSTANT: PCRE_ERROR_BADMAGIC -4 +CONSTANT: PCRE_ERROR_UNKNOWN_OPCODE -5 +CONSTANT: PCRE_ERROR_UNKNOWN_NODE -5 +CONSTANT: PCRE_ERROR_NOMEMORY -6 +CONSTANT: PCRE_ERROR_NOSUBSTRING -7 +CONSTANT: PCRE_ERROR_MATCHLIMIT -8 +CONSTANT: PCRE_ERROR_CALLOUT -9 +CONSTANT: PCRE_ERROR_BADUTF8 -10 +CONSTANT: PCRE_ERROR_BADUTF8_OFFSET -11 +CONSTANT: PCRE_ERROR_PARTIAL -12 +CONSTANT: PCRE_ERROR_BADPARTIAL -13 +CONSTANT: PCRE_ERROR_INTERNAL -14 +CONSTANT: PCRE_ERROR_BADCOUNT -15 +CONSTANT: PCRE_ERROR_DFA_UITEM -16 +CONSTANT: PCRE_ERROR_DFA_UCOND -17 +CONSTANT: PCRE_ERROR_DFA_UMLIMIT -18 +CONSTANT: PCRE_ERROR_DFA_WSSIZE -19 +CONSTANT: PCRE_ERROR_DFA_RECURSE -20 +CONSTANT: PCRE_ERROR_RECURSIONLIMIT -21 +CONSTANT: PCRE_ERROR_NULLWSLIMIT -22 +CONSTANT: PCRE_ERROR_BADNEWLINE -23 +CONSTANT: PCRE_ERROR_BADOFFSET -24 +CONSTANT: PCRE_ERROR_SHORTUTF8 -25 + CONSTANT: PCRE_INFO_OPTIONS 0 CONSTANT: PCRE_INFO_SIZE 1 CONSTANT: PCRE_INFO_CAPTURECOUNT 2 @@ -27,6 +89,17 @@ CONSTANT: PCRE_INFO_JCHANGED 13 CONSTANT: PCRE_INFO_HASCRORLF 14 CONSTANT: PCRE_INFO_MINLENGTH 15 +CONSTANT: PCRE_CONFIG_UTF8 0 +CONSTANT: PCRE_CONFIG_NEWLINE 1 +CONSTANT: PCRE_CONFIG_LINK_SIZE 2 +CONSTANT: PCRE_CONFIG_POSIX_MALLOC_THRESHOLD 3 +CONSTANT: PCRE_CONFIG_MATCH_LIMIT 4 +CONSTANT: PCRE_CONFIG_STACKRECURSE 5 +CONSTANT: PCRE_CONFIG_UNICODE_PROPERTIES 6 +CONSTANT: PCRE_CONFIG_MATCH_LIMIT_RECURSION 7 +CONSTANT: PCRE_CONFIG_BSR 8 + + STRUCT: pcre_extra { flags int } { study_data void* } @@ -36,12 +109,21 @@ STRUCT: pcre_extra { match_limit_recursion int } { mark uchar** } ; +FUNCTION: void pcre_config ( int what, void* where ) ; + FUNCTION: void* pcre_compile ( c-string pattern, int options, char** errptr, int* erroffset, char* tableptr ) ; +FUNCTION: void* pcre_compile2 ( c-string pattern, + int options, + int* errcodeptr, + char** errptr, + int* erroffset, + char* tableptr ) ; + FUNCTION: int pcre_info ( void* pcre, int* optptr, int* first_byte ) ; FUNCTION: int pcre_fullinfo ( void* pcre, pcre_extra* extra, int what, void *where ) ; @@ -55,12 +137,19 @@ FUNCTION: int pcre_exec ( void* pcre, int* ovector, int ovecsize ) ; +FUNCTION: int pcre_get_stringnumber ( void* pcre, c-string name ) ; + FUNCTION: int pcre_get_substring ( c-string subject, int* ovector, int stringcount, int stringnumber, void *stringptr ) ; +FUNCTION: int pcre_get_substring_list ( c-string subject, + int* ovector, + int stringcount, + void *stringptr ) ; + FUNCTION: c-string pcre_version ( ) ; FUNCTION: uchar* pcre_maketables ( ) ; diff --git a/basis/pcre/info-tests.factor b/basis/pcre/info-tests.factor new file mode 100644 index 0000000000..3086d58dae --- /dev/null +++ b/basis/pcre/info-tests.factor @@ -0,0 +1,9 @@ +USING: pcre pcre.info sequences tools.test ; + +[ { { 3 "day" } { 2 "month" } { 1 "year" } } ] +[ + "(?P\\d{4})-(?P\\d{2})-(?P\\d{2})" + name-table-entries +] unit-test + +[ { 100 110 120 130 } ] [ 100 10 4 gen-array-addrs ] unit-test diff --git a/basis/pcre/info.factor b/basis/pcre/info.factor new file mode 100644 index 0000000000..aaa6f36c1b --- /dev/null +++ b/basis/pcre/info.factor @@ -0,0 +1,40 @@ +USING: + accessors + alien alien.accessors alien.c-types alien.data alien.strings + arrays + kernel + math + pcre.ffi + sequences ; +IN: pcre.info + +! Gen. utility +: 2with ( param1 param2 obj quot -- obj curry ) + [ -rot ] dip [ [ rot ] dip call ] 3curry ; inline + +: gen-array-addrs ( base size n -- addrs ) + iota [ * + ] 2with map ; + +! Mostly internal +: fullinfo ( pcre extra what -- obj ) + { int } [ pcre_fullinfo ] with-out-parameters nip ; + +: name-count ( pcre extra -- n ) + PCRE_INFO_NAMECOUNT fullinfo ; + +: name-table ( pcre extra -- addr ) + PCRE_INFO_NAMETABLE fullinfo ; + +: name-entry-size ( pcre extra -- size ) + PCRE_INFO_NAMEENTRYSIZE fullinfo ; + +: name-table-entry ( addr -- group-index group-name ) + [ 1 alien-unsigned-1 ] [ 2 + alien>native-string ] bi ; + +: options ( pcre -- opts ) + f PCRE_INFO_OPTIONS fullinfo ; + +! Exported +: name-table-entries ( pcre extra -- addrs ) + [ name-table ] [ name-entry-size ] [ name-count ] 2tri gen-array-addrs + [ name-table-entry 2array ] map ; diff --git a/basis/pcre/pcre-tests.factor b/basis/pcre/pcre-tests.factor new file mode 100644 index 0000000000..1be1f2d504 --- /dev/null +++ b/basis/pcre/pcre-tests.factor @@ -0,0 +1,62 @@ +USING: + accessors + arrays + kernel + math math.ranges + pcre pcre.ffi pcre.info + random + sequences + splitting + system + tools.test ; +IN: pcre.tests + +CONSTANT: iso-date "(?P\\d{4})-(?P\\d{2})-(?P\\d{2})" + +[ { f -1 } ] [ "foo" (pcre) 3array 1 tail ] unit-test + +[ { 1 2 3 } ] [ + iso-date + { "year" "month" "day" } [ pcre_get_stringnumber ] with map +] unit-test + +[ t ] [ "foo" pcre>> options PCRE_UTF8 bitand 0 > ] unit-test + +os unix? [ [ 10 ] [ PCRE_CONFIG_NEWLINE config ] unit-test ] when + +! In this day and age, not supporting utf-8 is broken. +[ 1 ] [ PCRE_CONFIG_UTF8 config ] unit-test + +[ 1 ] [ PCRE_CONFIG_UNICODE_PROPERTIES config ] unit-test + +! Tests for findall +[ + { { f "1999-01-12" } { "year" "1999" } { "month" "01" } { "day" "12" } } +] [ + "1999-01-12" iso-date findall first +] unit-test + +[ 3 ] [ + "2003-10-09 1999-09-01 1514-10-20" iso-date findall length +] unit-test + +[ 5 ] [ "abcdef" "[a-e]" findall length ] unit-test + +[ 3 ] [ "foo bar baz" "foo|bar|baz" findall length ] unit-test + +[ 3 ] [ "örjan är åtta" "[åäö]" findall length ] unit-test + +[ 3 ] [ "ÅÄÖ" "\\p{Lu}" findall length ] unit-test + +[ 3 ] [ "foobar" "foo(?=bar)" findall first first second length ] unit-test + +: long-string ( -- x ) + 10000 [ CHAR: a CHAR: z [a,b] random ] "" replicate-as ; + +! Performance +[ 0 ] [ long-string ".{0,15}foobar.{0,10}" findall length ] unit-test + +! Tests for matches? +[ t ] [ "örjan" "örjan" matches? ] unit-test + +[ t ] [ "abcö" "\\p{Ll}{4}" matches? ] unit-test diff --git a/basis/pcre/pcre.factor b/basis/pcre/pcre.factor index 811925d3d7..133f8dc00c 100644 --- a/basis/pcre/pcre.factor +++ b/basis/pcre/pcre.factor @@ -1,45 +1,81 @@ USING: accessors - alien.c-types alien.data + alien.c-types alien.data alien.strings arrays + assocs + grouping + io.encodings.utf8 io.encodings.string kernel - pcre.ffi - sequences ; + locals + math + pcre.ffi pcre.info + sequences + strings ; IN: pcre ERROR: malformed-regexp expr error ; +ERROR: pcre-error value ; -TUPLE: compiled-pcre pcre extra ; +TUPLE: compiled-pcre pcre extra nametable ; +TUPLE: matcher subject compiled-pcre ofs match ; -! Low-level - -: exec ( pcre extra subject ofs -- count match-data ) - [ dup length ] dip 0 30 int [ 30 pcre_exec ] keep ; +: default-opts ( -- opts ) + PCRE_UTF8 PCRE_UCP bitor ; : (pcre) ( expr -- pcre err-message err-offset ) - 0 { c-string int } [ f pcre_compile ] with-out-parameters ; + default-opts { c-string int } [ f pcre_compile ] with-out-parameters ; : ( expr -- pcre ) dup (pcre) 2array swap [ 2nip ] [ malformed-regexp ] if* ; +:: exec ( subject ofs pcre extra -- count match-data ) + pcre extra subject dup length ofs 0 30 int + [ 30 pcre_exec ] keep ; + : ( pcre -- pcre-extra ) 0 { c-string } [ pcre_study ] with-out-parameters drop ; +: config ( what -- alien ) + { int } [ pcre_config ] with-out-parameters ; + +! Finding stuff +: (findnext) ( subject ofs compiled-pcre -- match/f ) + [ pcre>> ] [ extra>> ] bi exec over + dup -1 < [ pcre-error ] [ dup -1 = [ 3drop f ] [ drop 2array ] if ] if ; + +: findnext ( matcher -- matcher'/f ) + clone dup [ subject>> ] [ ofs>> ] [ compiled-pcre>> ] tri (findnext) + [ [ >>match ] [ second second >>ofs ] bi ] [ drop f ] if* ; + +! Result parsing +: substring-list ( subject match-data count -- alien ) + { void* } [ pcre_get_substring_list drop ] with-out-parameters ; + +: parse-groups ( ngroups seq -- match ) + swap 2 * head 2 [ >array ] map ; + +: parse-match ( subject compiled-pcre match-data -- match ) + swapd first2 swap [ substring-list ] keep void* + [ alien>native-string ] { } map-as [ nametable>> ] dip + [ of swap 2array ] with map-index ; + ! High-level - : ( expr -- compiled-pcre ) - dup compiled-pcre boa ; + dup 2dup name-table-entries compiled-pcre boa ; -: findall ( subject compiled-pcre -- matches ) - [ pcre>> ] [ extra>> ] bi rot 0 exec nip ; +GENERIC: findall ( subject obj -- matches ) +M: compiled-pcre findall + [ utf8 encode ] dip 2dup 0 f matcher boa [ findnext ] follow + [ match>> ] map harvest [ parse-match ] 2with map ; +M: string findall + findall ; -: info ( pcre -- x x x ) - { int int } [ pcre_info ] with-out-parameters ; +GENERIC: matches? ( subject obj -- ? ) -: fullinfo ( pcre pcre-extra what -- num x ) - { int } [ pcre_fullinfo ] with-out-parameters ; +M: compiled-pcre matches? + dupd findall [ nip length 1 = ] [ ?first ?first ?last = ] 2bi and ; -: substring ( subject match-data count n -- str ) - { c-string } [ pcre_get_substring drop ] with-out-parameters ; +M: string matches? + matches? ;