pcre: new module pcre.utils for the utility functions which aren't really regexp-related

db4
Björn Lindqvist 2013-09-23 16:10:11 +02:00 committed by John Benediktsson
parent 80c32e2bc6
commit ba8810dac3
6 changed files with 69 additions and 26 deletions

View File

@ -44,6 +44,34 @@ CONSTANT: PCRE_PARTIAL_HARD 0x08000000
CONSTANT: PCRE_NOTEMPTY_ATSTART 0x10000000
CONSTANT: PCRE_UCP 0x20000000
ENUM: PCRE_ERRORS
{ PCRE_ERROR_NOMATCH -1 }
{ PCRE_ERROR_NULL -2 }
{ PCRE_ERROR_BADOPTION -3 }
{ PCRE_ERROR_BADMAGIC -4 }
{ PCRE_ERROR_UNKNOWN_OPCODE -5 }
{ PCRE_ERROR_UNKNOWN_NODE -5 }
{ PCRE_ERROR_NOMEMORY -6 }
{ PCRE_ERROR_NOSUBSTRING -7 }
{ PCRE_ERROR_MATCHLIMIT -8 }
{ PCRE_ERROR_CALLOUT -9 }
{ PCRE_ERROR_BADUTF8 -10 }
{ PCRE_ERROR_BADUTF8_OFFSET -11 }
{ PCRE_ERROR_PARTIAL -12 }
{ PCRE_ERROR_BADPARTIAL -13 }
{ PCRE_ERROR_INTERNAL -14 }
{ PCRE_ERROR_BADCOUNT -15 }
{ PCRE_ERROR_DFA_UITEM -16 }
{ PCRE_ERROR_DFA_UCOND -17 }
{ PCRE_ERROR_DFA_UMLIMIT -18 }
{ PCRE_ERROR_DFA_WSSIZE -19 }
{ PCRE_ERROR_DFA_RECURSE -20 }
{ PCRE_ERROR_RECURSIONLIMIT -21 }
{ PCRE_ERROR_NULLWSLIMIT -22 }
{ PCRE_ERROR_BADNEWLINE -23 }
{ PCRE_ERROR_BADOFFSET -24 }
{ PCRE_ERROR_SHORTUTF8 -25 } ;
CONSTANT: PCRE_ERROR_NOMATCH -1
CONSTANT: PCRE_ERROR_NULL -2
CONSTANT: PCRE_ERROR_BADOPTION -3

View File

@ -4,17 +4,10 @@ USING:
arrays
kernel
math
pcre.ffi
pcre.ffi pcre.utils
sequences ;
IN: pcre.info
! Gen. utility
: 2with ( param1 param2 obj quot -- obj curry )
[ -rot ] dip [ [ rot ] dip call ] 3curry ; inline
: gen-array-addrs ( base size n -- addrs )
iota [ * + ] 2with map ;
! Mostly internal
: fullinfo ( pcre extra what -- obj )
{ int } [ pcre_fullinfo ] with-out-parameters nip ;

View File

@ -51,6 +51,10 @@ os unix? [ [ 10 ] [ PCRE_CONFIG_NEWLINE config ] unit-test ] when
[ 3 ] [ "foobar" "foo(?=bar)" findall first first second length ] unit-test
[ { { { f ", " } } { { f ", " } } { { f "." } } } ] [
"Words, words, word." "\\W+" findall
] unit-test
[ { ", " ", " "." } ] [
"Words, words, word." "\\W+" findall [ first second ] map
] unit-test
@ -65,11 +69,12 @@ os unix? [ [ 10 ] [ PCRE_CONFIG_NEWLINE config ] unit-test ] when
[ { { { f "foo" } } { { f "" } } } ]
[ "foo" ".*" findall ] unit-test
[ { { { f "" } } { { f "" } } { { f "" } } { { f "" } } } ]
[ { { { f "" } } { { f "" } } { { f "" } } } ]
[ "foo" "B*" findall ] unit-test
! Empty matches in strings with multi-byte characters are tricky.
[ ] [ "ööööö" "x*" findall ] unit-test
[ { { { f "" } } { { f "" } } { { f "" } } { { f "" } } } ]
[ "öööö" "x*" findall ] unit-test
! Tests for matches?
[ t ] [ "örjan" "örjan" matches? ] unit-test

View File

@ -1,17 +1,15 @@
USING:
accessors
alien.c-types alien.data alien.strings
alien.c-types alien.data alien.enums alien.strings
arrays
assocs
fry
io.encodings.utf8 io.encodings.string
kernel
math
mirrors
pcre.ffi pcre.info
pcre.ffi pcre.info pcre.utils
sequences sequences.generalizations
strings ;
QUALIFIED: splitting
IN: pcre
ERROR: malformed-regexp expr error ;
@ -19,10 +17,6 @@ ERROR: pcre-error value ;
TUPLE: compiled-pcre pcre extra nametable ;
! Gen. utility
: replace-all ( seq subseqs new -- seq )
swapd '[ _ splitting:replace ] reduce ;
: default-opts ( -- opts )
PCRE_UTF8 PCRE_UCP bitor ;
@ -53,17 +47,14 @@ TUPLE: matcher pcre extra subject ofs exec-opts match ;
: findnext ( matcher -- matcher'/f )
clone dup <mirror> values 6 firstn drop exec
over dup -1 < [ pcre-error ] when
over dup -1 < [ PCRE_ERRORS number>enum pcre-error ] when
-1 =
[
2drop dup exec-opts>> 0 =
[ drop f ]
[
! dup [ ofs>> 1 + dup ] [ subject>> ] bi bounds-check?
! [ >>ofs 0 >>exec-opts findnext ] [ 2drop f ] if
dup [ ofs>> 1 + ] [ subject>> length ] bi over <
[ 2drop f ] [ >>ofs 0 >>exec-opts findnext ] if
dup [ subject>> ] [ ofs>> ] bi next-utf8-char
[ >>ofs 0 >>exec-opts findnext ] [ drop f ] if*
] if
]
[
@ -103,5 +94,4 @@ M: string findall
dupd findall [ nip length 1 = ] [ ?first ?first ?last = ] 2bi and ;
: split ( subject obj -- strings )
dupd findall [ first second ] map
dup first [ replace-all ] keep splitting:split harvest ;
dupd findall [ first second ] map split-subseqs ;

View File

@ -0,0 +1,6 @@
USING: pcre.utils tools.test ;
IN: pcre.utils.tests
[ { "Bords" "words" "word" } ] [
"Bords, words, word." { ", " ", " "." } split-subseqs
] unit-test

21
basis/pcre/utils.factor Normal file
View File

@ -0,0 +1,21 @@
USING: assocs fry kernel math mirrors sequences splitting strings ;
IN: pcre.utils
: replace-all ( seq subseqs new -- seq )
swapd '[ _ replace ] reduce ;
: split-subseqs ( seq subseqs -- seqs )
dup first [ replace-all ] keep split-subseq [ >string ] map harvest ;
: 2with ( param1 param2 obj quot -- obj curry )
[ -rot ] dip [ [ rot ] dip call ] 3curry ; inline
: gen-array-addrs ( base size n -- addrs )
iota [ * + ] 2with map ;
: utf8-start-byte? ( byte -- ? )
0xc0 bitand 0x80 = not ;
: next-utf8-char ( byte-array pos -- pos' )
1 + 2dup swap ?nth
[ utf8-start-byte? [ nip ] [ next-utf8-char ] if ] [ 2drop f ] if* ;