pcre: lots of unit tests and a nice interface to the vocab in the findall and matches? words
parent
152ccf4b27
commit
2dd398b701
|
@ -0,0 +1,4 @@
|
|||
USING: pcre.ffi sequences splitting tools.test ;
|
||||
IN: pcre.ffi.tests
|
||||
|
||||
[ 2 ] [ pcre_version " " split length ] unit-test
|
|
@ -9,6 +9,68 @@ IN: pcre.ffi
|
|||
{ [ os unix? ] [ "libpcre" "libpcre.so" cdecl add-library ] }
|
||||
} cond >>
|
||||
|
||||
CONSTANT: PCRE_CASELESS 0x00000001
|
||||
CONSTANT: PCRE_MULTILINE 0x00000002
|
||||
CONSTANT: PCRE_DOTALL 0x00000004
|
||||
CONSTANT: PCRE_EXTENDED 0x00000008
|
||||
CONSTANT: PCRE_ANCHORED 0x00000010
|
||||
CONSTANT: PCRE_DOLLAR_ENDONLY 0x00000020
|
||||
CONSTANT: PCRE_EXTRA 0x00000040
|
||||
CONSTANT: PCRE_NOTBOL 0x00000080
|
||||
CONSTANT: PCRE_NOTEOL 0x00000100
|
||||
CONSTANT: PCRE_UNGREEDY 0x00000200
|
||||
CONSTANT: PCRE_NOTEMPTY 0x00000400
|
||||
CONSTANT: PCRE_UTF8 0x00000800
|
||||
CONSTANT: PCRE_NO_AUTO_CAPTURE 0x00001000
|
||||
CONSTANT: PCRE_NO_UTF8_CHECK 0x00002000
|
||||
CONSTANT: PCRE_AUTO_CALLOUT 0x00004000
|
||||
CONSTANT: PCRE_PARTIAL_SOFT 0x00008000
|
||||
CONSTANT: PCRE_PARTIAL 0x00008000
|
||||
CONSTANT: PCRE_DFA_SHORTEST 0x00010000
|
||||
CONSTANT: PCRE_DFA_RESTART 0x00020000
|
||||
CONSTANT: PCRE_FIRSTLINE 0x00040000
|
||||
CONSTANT: PCRE_DUPNAMES 0x00080000
|
||||
CONSTANT: PCRE_NEWLINE_CR 0x00100000
|
||||
CONSTANT: PCRE_NEWLINE_LF 0x00200000
|
||||
CONSTANT: PCRE_NEWLINE_CRLF 0x00300000
|
||||
CONSTANT: PCRE_NEWLINE_ANY 0x00400000
|
||||
CONSTANT: PCRE_NEWLINE_ANYCRLF 0x00500000
|
||||
CONSTANT: PCRE_BSR_ANYCRLF 0x00800000
|
||||
CONSTANT: PCRE_BSR_UNICODE 0x01000000
|
||||
CONSTANT: PCRE_JAVASCRIPT_COMPAT 0x02000000
|
||||
CONSTANT: PCRE_NO_START_OPTIMIZE 0x04000000
|
||||
CONSTANT: PCRE_NO_START_OPTIMISE 0x04000000
|
||||
CONSTANT: PCRE_PARTIAL_HARD 0x08000000
|
||||
CONSTANT: PCRE_NOTEMPTY_ATSTART 0x10000000
|
||||
CONSTANT: PCRE_UCP 0x20000000
|
||||
|
||||
CONSTANT: PCRE_ERROR_NOMATCH -1
|
||||
CONSTANT: PCRE_ERROR_NULL -2
|
||||
CONSTANT: PCRE_ERROR_BADOPTION -3
|
||||
CONSTANT: PCRE_ERROR_BADMAGIC -4
|
||||
CONSTANT: PCRE_ERROR_UNKNOWN_OPCODE -5
|
||||
CONSTANT: PCRE_ERROR_UNKNOWN_NODE -5
|
||||
CONSTANT: PCRE_ERROR_NOMEMORY -6
|
||||
CONSTANT: PCRE_ERROR_NOSUBSTRING -7
|
||||
CONSTANT: PCRE_ERROR_MATCHLIMIT -8
|
||||
CONSTANT: PCRE_ERROR_CALLOUT -9
|
||||
CONSTANT: PCRE_ERROR_BADUTF8 -10
|
||||
CONSTANT: PCRE_ERROR_BADUTF8_OFFSET -11
|
||||
CONSTANT: PCRE_ERROR_PARTIAL -12
|
||||
CONSTANT: PCRE_ERROR_BADPARTIAL -13
|
||||
CONSTANT: PCRE_ERROR_INTERNAL -14
|
||||
CONSTANT: PCRE_ERROR_BADCOUNT -15
|
||||
CONSTANT: PCRE_ERROR_DFA_UITEM -16
|
||||
CONSTANT: PCRE_ERROR_DFA_UCOND -17
|
||||
CONSTANT: PCRE_ERROR_DFA_UMLIMIT -18
|
||||
CONSTANT: PCRE_ERROR_DFA_WSSIZE -19
|
||||
CONSTANT: PCRE_ERROR_DFA_RECURSE -20
|
||||
CONSTANT: PCRE_ERROR_RECURSIONLIMIT -21
|
||||
CONSTANT: PCRE_ERROR_NULLWSLIMIT -22
|
||||
CONSTANT: PCRE_ERROR_BADNEWLINE -23
|
||||
CONSTANT: PCRE_ERROR_BADOFFSET -24
|
||||
CONSTANT: PCRE_ERROR_SHORTUTF8 -25
|
||||
|
||||
CONSTANT: PCRE_INFO_OPTIONS 0
|
||||
CONSTANT: PCRE_INFO_SIZE 1
|
||||
CONSTANT: PCRE_INFO_CAPTURECOUNT 2
|
||||
|
@ -27,6 +89,17 @@ CONSTANT: PCRE_INFO_JCHANGED 13
|
|||
CONSTANT: PCRE_INFO_HASCRORLF 14
|
||||
CONSTANT: PCRE_INFO_MINLENGTH 15
|
||||
|
||||
CONSTANT: PCRE_CONFIG_UTF8 0
|
||||
CONSTANT: PCRE_CONFIG_NEWLINE 1
|
||||
CONSTANT: PCRE_CONFIG_LINK_SIZE 2
|
||||
CONSTANT: PCRE_CONFIG_POSIX_MALLOC_THRESHOLD 3
|
||||
CONSTANT: PCRE_CONFIG_MATCH_LIMIT 4
|
||||
CONSTANT: PCRE_CONFIG_STACKRECURSE 5
|
||||
CONSTANT: PCRE_CONFIG_UNICODE_PROPERTIES 6
|
||||
CONSTANT: PCRE_CONFIG_MATCH_LIMIT_RECURSION 7
|
||||
CONSTANT: PCRE_CONFIG_BSR 8
|
||||
|
||||
|
||||
STRUCT: pcre_extra
|
||||
{ flags int }
|
||||
{ study_data void* }
|
||||
|
@ -36,12 +109,21 @@ STRUCT: pcre_extra
|
|||
{ match_limit_recursion int }
|
||||
{ mark uchar** } ;
|
||||
|
||||
FUNCTION: void pcre_config ( int what, void* where ) ;
|
||||
|
||||
FUNCTION: void* pcre_compile ( c-string pattern,
|
||||
int options,
|
||||
char** errptr,
|
||||
int* erroffset,
|
||||
char* tableptr ) ;
|
||||
|
||||
FUNCTION: void* pcre_compile2 ( c-string pattern,
|
||||
int options,
|
||||
int* errcodeptr,
|
||||
char** errptr,
|
||||
int* erroffset,
|
||||
char* tableptr ) ;
|
||||
|
||||
FUNCTION: int pcre_info ( void* pcre, int* optptr, int* first_byte ) ;
|
||||
FUNCTION: int pcre_fullinfo ( void* pcre, pcre_extra* extra, int what, void *where ) ;
|
||||
|
||||
|
@ -55,12 +137,19 @@ FUNCTION: int pcre_exec ( void* pcre,
|
|||
int* ovector,
|
||||
int ovecsize ) ;
|
||||
|
||||
FUNCTION: int pcre_get_stringnumber ( void* pcre, c-string name ) ;
|
||||
|
||||
FUNCTION: int pcre_get_substring ( c-string subject,
|
||||
int* ovector,
|
||||
int stringcount,
|
||||
int stringnumber,
|
||||
void *stringptr ) ;
|
||||
|
||||
FUNCTION: int pcre_get_substring_list ( c-string subject,
|
||||
int* ovector,
|
||||
int stringcount,
|
||||
void *stringptr ) ;
|
||||
|
||||
FUNCTION: c-string pcre_version ( ) ;
|
||||
|
||||
FUNCTION: uchar* pcre_maketables ( ) ;
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
USING: pcre pcre.info sequences tools.test ;
|
||||
|
||||
[ { { 3 "day" } { 2 "month" } { 1 "year" } } ]
|
||||
[
|
||||
"(?P<year>\\d{4})-(?P<month>\\d{2})-(?P<day>\\d{2})" <compiled-pcre>
|
||||
name-table-entries
|
||||
] unit-test
|
||||
|
||||
[ { 100 110 120 130 } ] [ 100 10 4 gen-array-addrs ] unit-test
|
|
@ -0,0 +1,40 @@
|
|||
USING:
|
||||
accessors
|
||||
alien alien.accessors alien.c-types alien.data alien.strings
|
||||
arrays
|
||||
kernel
|
||||
math
|
||||
pcre.ffi
|
||||
sequences ;
|
||||
IN: pcre.info
|
||||
|
||||
! Gen. utility
|
||||
: 2with ( param1 param2 obj quot -- obj curry )
|
||||
[ -rot ] dip [ [ rot ] dip call ] 3curry ; inline
|
||||
|
||||
: gen-array-addrs ( base size n -- addrs )
|
||||
iota [ * + ] 2with map ;
|
||||
|
||||
! Mostly internal
|
||||
: fullinfo ( pcre extra what -- obj )
|
||||
{ int } [ pcre_fullinfo ] with-out-parameters nip ;
|
||||
|
||||
: name-count ( pcre extra -- n )
|
||||
PCRE_INFO_NAMECOUNT fullinfo ;
|
||||
|
||||
: name-table ( pcre extra -- addr )
|
||||
PCRE_INFO_NAMETABLE fullinfo ;
|
||||
|
||||
: name-entry-size ( pcre extra -- size )
|
||||
PCRE_INFO_NAMEENTRYSIZE fullinfo ;
|
||||
|
||||
: name-table-entry ( addr -- group-index group-name )
|
||||
[ <alien> 1 alien-unsigned-1 ] [ 2 + <alien> alien>native-string ] bi ;
|
||||
|
||||
: options ( pcre -- opts )
|
||||
f PCRE_INFO_OPTIONS fullinfo ;
|
||||
|
||||
! Exported
|
||||
: name-table-entries ( pcre extra -- addrs )
|
||||
[ name-table ] [ name-entry-size ] [ name-count ] 2tri gen-array-addrs
|
||||
[ name-table-entry 2array ] map ;
|
|
@ -0,0 +1,62 @@
|
|||
USING:
|
||||
accessors
|
||||
arrays
|
||||
kernel
|
||||
math math.ranges
|
||||
pcre pcre.ffi pcre.info
|
||||
random
|
||||
sequences
|
||||
splitting
|
||||
system
|
||||
tools.test ;
|
||||
IN: pcre.tests
|
||||
|
||||
CONSTANT: iso-date "(?P<year>\\d{4})-(?P<month>\\d{2})-(?P<day>\\d{2})"
|
||||
|
||||
[ { f -1 } ] [ "foo" (pcre) 3array 1 tail ] unit-test
|
||||
|
||||
[ { 1 2 3 } ] [
|
||||
iso-date <pcre>
|
||||
{ "year" "month" "day" } [ pcre_get_stringnumber ] with map
|
||||
] unit-test
|
||||
|
||||
[ t ] [ "foo" <compiled-pcre> pcre>> options PCRE_UTF8 bitand 0 > ] unit-test
|
||||
|
||||
os unix? [ [ 10 ] [ PCRE_CONFIG_NEWLINE config ] unit-test ] when
|
||||
|
||||
! In this day and age, not supporting utf-8 is broken.
|
||||
[ 1 ] [ PCRE_CONFIG_UTF8 config ] unit-test
|
||||
|
||||
[ 1 ] [ PCRE_CONFIG_UNICODE_PROPERTIES config ] unit-test
|
||||
|
||||
! Tests for findall
|
||||
[
|
||||
{ { f "1999-01-12" } { "year" "1999" } { "month" "01" } { "day" "12" } }
|
||||
] [
|
||||
"1999-01-12" iso-date <compiled-pcre> findall first
|
||||
] unit-test
|
||||
|
||||
[ 3 ] [
|
||||
"2003-10-09 1999-09-01 1514-10-20" iso-date <compiled-pcre> findall length
|
||||
] unit-test
|
||||
|
||||
[ 5 ] [ "abcdef" "[a-e]" findall length ] unit-test
|
||||
|
||||
[ 3 ] [ "foo bar baz" "foo|bar|baz" findall length ] unit-test
|
||||
|
||||
[ 3 ] [ "örjan är åtta" "[åäö]" findall length ] unit-test
|
||||
|
||||
[ 3 ] [ "ÅÄÖ" "\\p{Lu}" findall length ] unit-test
|
||||
|
||||
[ 3 ] [ "foobar" "foo(?=bar)" findall first first second length ] unit-test
|
||||
|
||||
: long-string ( -- x )
|
||||
10000 [ CHAR: a CHAR: z [a,b] random ] "" replicate-as ;
|
||||
|
||||
! Performance
|
||||
[ 0 ] [ long-string ".{0,15}foobar.{0,10}" findall length ] unit-test
|
||||
|
||||
! Tests for matches?
|
||||
[ t ] [ "örjan" "örjan" matches? ] unit-test
|
||||
|
||||
[ t ] [ "abcö" "\\p{Ll}{4}" matches? ] unit-test
|
|
@ -1,45 +1,81 @@
|
|||
USING:
|
||||
accessors
|
||||
alien.c-types alien.data
|
||||
alien.c-types alien.data alien.strings
|
||||
arrays
|
||||
assocs
|
||||
grouping
|
||||
io.encodings.utf8 io.encodings.string
|
||||
kernel
|
||||
pcre.ffi
|
||||
sequences ;
|
||||
locals
|
||||
math
|
||||
pcre.ffi pcre.info
|
||||
sequences
|
||||
strings ;
|
||||
IN: pcre
|
||||
|
||||
ERROR: malformed-regexp expr error ;
|
||||
ERROR: pcre-error value ;
|
||||
|
||||
TUPLE: compiled-pcre pcre extra ;
|
||||
TUPLE: compiled-pcre pcre extra nametable ;
|
||||
TUPLE: matcher subject compiled-pcre ofs match ;
|
||||
|
||||
! Low-level
|
||||
|
||||
: exec ( pcre extra subject ofs -- count match-data )
|
||||
[ dup length ] dip 0 30 int <c-array> [ 30 pcre_exec ] keep ;
|
||||
: default-opts ( -- opts )
|
||||
PCRE_UTF8 PCRE_UCP bitor ;
|
||||
|
||||
: (pcre) ( expr -- pcre err-message err-offset )
|
||||
0 { c-string int } [ f pcre_compile ] with-out-parameters ;
|
||||
default-opts { c-string int } [ f pcre_compile ] with-out-parameters ;
|
||||
|
||||
: <pcre> ( expr -- pcre )
|
||||
dup (pcre) 2array swap [ 2nip ] [ malformed-regexp ] if* ;
|
||||
|
||||
:: exec ( subject ofs pcre extra -- count match-data )
|
||||
pcre extra subject dup length ofs 0 30 int <c-array>
|
||||
[ 30 pcre_exec ] keep ;
|
||||
|
||||
: <pcre-extra> ( pcre -- pcre-extra )
|
||||
0 { c-string } [ pcre_study ] with-out-parameters drop ;
|
||||
|
||||
: config ( what -- alien )
|
||||
{ int } [ pcre_config ] with-out-parameters ;
|
||||
|
||||
! Finding stuff
|
||||
: (findnext) ( subject ofs compiled-pcre -- match/f )
|
||||
[ pcre>> ] [ extra>> ] bi exec over
|
||||
dup -1 < [ pcre-error ] [ dup -1 = [ 3drop f ] [ drop 2array ] if ] if ;
|
||||
|
||||
: findnext ( matcher -- matcher'/f )
|
||||
clone dup [ subject>> ] [ ofs>> ] [ compiled-pcre>> ] tri (findnext)
|
||||
[ [ >>match ] [ second second >>ofs ] bi ] [ drop f ] if* ;
|
||||
|
||||
! Result parsing
|
||||
: substring-list ( subject match-data count -- alien )
|
||||
{ void* } [ pcre_get_substring_list drop ] with-out-parameters ;
|
||||
|
||||
: parse-groups ( ngroups seq -- match )
|
||||
swap 2 * head 2 <groups> [ >array ] map ;
|
||||
|
||||
: parse-match ( subject compiled-pcre match-data -- match )
|
||||
swapd first2 swap [ substring-list ] keep void* <c-direct-array>
|
||||
[ alien>native-string ] { } map-as [ nametable>> ] dip
|
||||
[ of swap 2array ] with map-index ;
|
||||
|
||||
! High-level
|
||||
|
||||
: <compiled-pcre> ( expr -- compiled-pcre )
|
||||
<pcre> dup <pcre-extra> compiled-pcre boa ;
|
||||
<pcre> dup <pcre-extra> 2dup name-table-entries compiled-pcre boa ;
|
||||
|
||||
: findall ( subject compiled-pcre -- matches )
|
||||
[ pcre>> ] [ extra>> ] bi rot 0 exec nip ;
|
||||
GENERIC: findall ( subject obj -- matches )
|
||||
|
||||
M: compiled-pcre findall
|
||||
[ utf8 encode ] dip 2dup 0 f matcher boa [ findnext ] follow
|
||||
[ match>> ] map harvest [ parse-match ] 2with map ;
|
||||
|
||||
M: string findall
|
||||
<compiled-pcre> findall ;
|
||||
|
||||
: info ( pcre -- x x x )
|
||||
{ int int } [ pcre_info ] with-out-parameters ;
|
||||
GENERIC: matches? ( subject obj -- ? )
|
||||
|
||||
: fullinfo ( pcre pcre-extra what -- num x )
|
||||
{ int } [ pcre_fullinfo ] with-out-parameters ;
|
||||
M: compiled-pcre matches?
|
||||
dupd findall [ nip length 1 = ] [ ?first ?first ?last = ] 2bi and ;
|
||||
|
||||
: substring ( subject match-data count n -- str )
|
||||
{ c-string } [ pcre_get_substring drop ] with-out-parameters ;
|
||||
M: string matches?
|
||||
<compiled-pcre> matches? ;
|
||||
|
|
Loading…
Reference in New Issue