diff --git a/basis/pcre/ffi/ffi.factor b/basis/pcre/ffi/ffi.factor index 837cb4edf6..020e88c894 100644 --- a/basis/pcre/ffi/ffi.factor +++ b/basis/pcre/ffi/ffi.factor @@ -44,6 +44,34 @@ CONSTANT: PCRE_PARTIAL_HARD 0x08000000 CONSTANT: PCRE_NOTEMPTY_ATSTART 0x10000000 CONSTANT: PCRE_UCP 0x20000000 +ENUM: PCRE_ERRORS + { PCRE_ERROR_NOMATCH -1 } + { PCRE_ERROR_NULL -2 } + { PCRE_ERROR_BADOPTION -3 } + { PCRE_ERROR_BADMAGIC -4 } + { PCRE_ERROR_UNKNOWN_OPCODE -5 } + { PCRE_ERROR_UNKNOWN_NODE -5 } + { PCRE_ERROR_NOMEMORY -6 } + { PCRE_ERROR_NOSUBSTRING -7 } + { PCRE_ERROR_MATCHLIMIT -8 } + { PCRE_ERROR_CALLOUT -9 } + { PCRE_ERROR_BADUTF8 -10 } + { PCRE_ERROR_BADUTF8_OFFSET -11 } + { PCRE_ERROR_PARTIAL -12 } + { PCRE_ERROR_BADPARTIAL -13 } + { PCRE_ERROR_INTERNAL -14 } + { PCRE_ERROR_BADCOUNT -15 } + { PCRE_ERROR_DFA_UITEM -16 } + { PCRE_ERROR_DFA_UCOND -17 } + { PCRE_ERROR_DFA_UMLIMIT -18 } + { PCRE_ERROR_DFA_WSSIZE -19 } + { PCRE_ERROR_DFA_RECURSE -20 } + { PCRE_ERROR_RECURSIONLIMIT -21 } + { PCRE_ERROR_NULLWSLIMIT -22 } + { PCRE_ERROR_BADNEWLINE -23 } + { PCRE_ERROR_BADOFFSET -24 } + { PCRE_ERROR_SHORTUTF8 -25 } ; + CONSTANT: PCRE_ERROR_NOMATCH -1 CONSTANT: PCRE_ERROR_NULL -2 CONSTANT: PCRE_ERROR_BADOPTION -3 diff --git a/basis/pcre/info.factor b/basis/pcre/info.factor index aaa6f36c1b..0efee2b068 100644 --- a/basis/pcre/info.factor +++ b/basis/pcre/info.factor @@ -4,17 +4,10 @@ USING: arrays kernel math - pcre.ffi + pcre.ffi pcre.utils sequences ; IN: pcre.info -! Gen. utility -: 2with ( param1 param2 obj quot -- obj curry ) - [ -rot ] dip [ [ rot ] dip call ] 3curry ; inline - -: gen-array-addrs ( base size n -- addrs ) - iota [ * + ] 2with map ; - ! Mostly internal : fullinfo ( pcre extra what -- obj ) { int } [ pcre_fullinfo ] with-out-parameters nip ; diff --git a/basis/pcre/pcre-tests.factor b/basis/pcre/pcre-tests.factor index a98dcf6700..aeb220abc4 100644 --- a/basis/pcre/pcre-tests.factor +++ b/basis/pcre/pcre-tests.factor @@ -51,6 +51,10 @@ os unix? [ [ 10 ] [ PCRE_CONFIG_NEWLINE config ] unit-test ] when [ 3 ] [ "foobar" "foo(?=bar)" findall first first second length ] unit-test +[ { { { f ", " } } { { f ", " } } { { f "." } } } ] [ + "Words, words, word." "\\W+" findall +] unit-test + [ { ", " ", " "." } ] [ "Words, words, word." "\\W+" findall [ first second ] map ] unit-test @@ -65,11 +69,12 @@ os unix? [ [ 10 ] [ PCRE_CONFIG_NEWLINE config ] unit-test ] when [ { { { f "foo" } } { { f "" } } } ] [ "foo" ".*" findall ] unit-test -[ { { { f "" } } { { f "" } } { { f "" } } { { f "" } } } ] +[ { { { f "" } } { { f "" } } { { f "" } } } ] [ "foo" "B*" findall ] unit-test ! Empty matches in strings with multi-byte characters are tricky. -[ ] [ "ööööö" "x*" findall ] unit-test +[ { { { f "" } } { { f "" } } { { f "" } } { { f "" } } } ] +[ "öööö" "x*" findall ] unit-test ! Tests for matches? [ t ] [ "örjan" "örjan" matches? ] unit-test diff --git a/basis/pcre/pcre.factor b/basis/pcre/pcre.factor index 1c6b53cc21..c8d0769ac6 100644 --- a/basis/pcre/pcre.factor +++ b/basis/pcre/pcre.factor @@ -1,17 +1,15 @@ USING: accessors - alien.c-types alien.data alien.strings + alien.c-types alien.data alien.enums alien.strings arrays assocs - fry io.encodings.utf8 io.encodings.string kernel math mirrors - pcre.ffi pcre.info + pcre.ffi pcre.info pcre.utils sequences sequences.generalizations strings ; -QUALIFIED: splitting IN: pcre ERROR: malformed-regexp expr error ; @@ -19,10 +17,6 @@ ERROR: pcre-error value ; TUPLE: compiled-pcre pcre extra nametable ; -! Gen. utility -: replace-all ( seq subseqs new -- seq ) - swapd '[ _ splitting:replace ] reduce ; - : default-opts ( -- opts ) PCRE_UTF8 PCRE_UCP bitor ; @@ -53,17 +47,14 @@ TUPLE: matcher pcre extra subject ofs exec-opts match ; : findnext ( matcher -- matcher'/f ) clone dup values 6 firstn drop exec - over dup -1 < [ pcre-error ] when + over dup -1 < [ PCRE_ERRORS number>enum pcre-error ] when -1 = [ 2drop dup exec-opts>> 0 = [ drop f ] [ - ! dup [ ofs>> 1 + dup ] [ subject>> ] bi bounds-check? - ! [ >>ofs 0 >>exec-opts findnext ] [ 2drop f ] if - - dup [ ofs>> 1 + ] [ subject>> length ] bi over < - [ 2drop f ] [ >>ofs 0 >>exec-opts findnext ] if + dup [ subject>> ] [ ofs>> ] bi next-utf8-char + [ >>ofs 0 >>exec-opts findnext ] [ drop f ] if* ] if ] [ @@ -103,5 +94,4 @@ M: string findall dupd findall [ nip length 1 = ] [ ?first ?first ?last = ] 2bi and ; : split ( subject obj -- strings ) - dupd findall [ first second ] map - dup first [ replace-all ] keep splitting:split harvest ; + dupd findall [ first second ] map split-subseqs ; diff --git a/basis/pcre/utils-tests.factor b/basis/pcre/utils-tests.factor new file mode 100644 index 0000000000..f53b823cea --- /dev/null +++ b/basis/pcre/utils-tests.factor @@ -0,0 +1,6 @@ +USING: pcre.utils tools.test ; +IN: pcre.utils.tests + +[ { "Bords" "words" "word" } ] [ + "Bords, words, word." { ", " ", " "." } split-subseqs +] unit-test diff --git a/basis/pcre/utils.factor b/basis/pcre/utils.factor new file mode 100644 index 0000000000..f5fea26eae --- /dev/null +++ b/basis/pcre/utils.factor @@ -0,0 +1,21 @@ +USING: assocs fry kernel math mirrors sequences splitting strings ; +IN: pcre.utils + +: replace-all ( seq subseqs new -- seq ) + swapd '[ _ replace ] reduce ; + +: split-subseqs ( seq subseqs -- seqs ) + dup first [ replace-all ] keep split-subseq [ >string ] map harvest ; + +: 2with ( param1 param2 obj quot -- obj curry ) + [ -rot ] dip [ [ rot ] dip call ] 3curry ; inline + +: gen-array-addrs ( base size n -- addrs ) + iota [ * + ] 2with map ; + +: utf8-start-byte? ( byte -- ? ) + 0xc0 bitand 0x80 = not ; + +: next-utf8-char ( byte-array pos -- pos' ) + 1 + 2dup swap ?nth + [ utf8-start-byte? [ nip ] [ next-utf8-char ] if ] [ 2drop f ] if* ;