pcre: new module pcre.utils for the utility functions which aren't really regexp-related
parent
80c32e2bc6
commit
ba8810dac3
|
@ -44,6 +44,34 @@ CONSTANT: PCRE_PARTIAL_HARD 0x08000000
|
||||||
CONSTANT: PCRE_NOTEMPTY_ATSTART 0x10000000
|
CONSTANT: PCRE_NOTEMPTY_ATSTART 0x10000000
|
||||||
CONSTANT: PCRE_UCP 0x20000000
|
CONSTANT: PCRE_UCP 0x20000000
|
||||||
|
|
||||||
|
! Enumeration of the PCRE error codes; every value is negative and the
! list runs from -1 (PCRE_ERROR_NOMATCH) down to -25 (PCRE_ERROR_SHORTUTF8).
! PCRE_ERROR_UNKNOWN_OPCODE and PCRE_ERROR_UNKNOWN_NODE both map to -5, so
! converting -5 back to a name is ambiguous.
! NOTE(review): confirm which of the two names number>enum yields for -5.
! NOTE(review): PCRE_ERROR_NOMATCH, PCRE_ERROR_NULL and PCRE_ERROR_BADOPTION
! are also declared as CONSTANT: words right after this enum - confirm the
! duplicate definitions are intended.
ENUM: PCRE_ERRORS
|
||||||
|
{ PCRE_ERROR_NOMATCH -1 }
|
||||||
|
{ PCRE_ERROR_NULL -2 }
|
||||||
|
{ PCRE_ERROR_BADOPTION -3 }
|
||||||
|
{ PCRE_ERROR_BADMAGIC -4 }
|
||||||
|
{ PCRE_ERROR_UNKNOWN_OPCODE -5 }
|
||||||
|
{ PCRE_ERROR_UNKNOWN_NODE -5 }
|
||||||
|
{ PCRE_ERROR_NOMEMORY -6 }
|
||||||
|
{ PCRE_ERROR_NOSUBSTRING -7 }
|
||||||
|
{ PCRE_ERROR_MATCHLIMIT -8 }
|
||||||
|
{ PCRE_ERROR_CALLOUT -9 }
|
||||||
|
{ PCRE_ERROR_BADUTF8 -10 }
|
||||||
|
{ PCRE_ERROR_BADUTF8_OFFSET -11 }
|
||||||
|
{ PCRE_ERROR_PARTIAL -12 }
|
||||||
|
{ PCRE_ERROR_BADPARTIAL -13 }
|
||||||
|
{ PCRE_ERROR_INTERNAL -14 }
|
||||||
|
{ PCRE_ERROR_BADCOUNT -15 }
|
||||||
|
{ PCRE_ERROR_DFA_UITEM -16 }
|
||||||
|
{ PCRE_ERROR_DFA_UCOND -17 }
|
||||||
|
{ PCRE_ERROR_DFA_UMLIMIT -18 }
|
||||||
|
{ PCRE_ERROR_DFA_WSSIZE -19 }
|
||||||
|
{ PCRE_ERROR_DFA_RECURSE -20 }
|
||||||
|
{ PCRE_ERROR_RECURSIONLIMIT -21 }
|
||||||
|
{ PCRE_ERROR_NULLWSLIMIT -22 }
|
||||||
|
{ PCRE_ERROR_BADNEWLINE -23 }
|
||||||
|
{ PCRE_ERROR_BADOFFSET -24 }
|
||||||
|
{ PCRE_ERROR_SHORTUTF8 -25 } ;
|
||||||
|
|
||||||
CONSTANT: PCRE_ERROR_NOMATCH -1
|
CONSTANT: PCRE_ERROR_NOMATCH -1
|
||||||
CONSTANT: PCRE_ERROR_NULL -2
|
CONSTANT: PCRE_ERROR_NULL -2
|
||||||
CONSTANT: PCRE_ERROR_BADOPTION -3
|
CONSTANT: PCRE_ERROR_BADOPTION -3
|
||||||
|
|
|
@ -4,17 +4,10 @@ USING:
|
||||||
arrays
|
arrays
|
||||||
kernel
|
kernel
|
||||||
math
|
math
|
||||||
pcre.ffi
|
pcre.ffi pcre.utils
|
||||||
sequences ;
|
sequences ;
|
||||||
IN: pcre.info
|
IN: pcre.info
|
||||||
|
|
||||||
! Gen. utility
|
|
||||||
: 2with ( param1 param2 obj quot -- obj curry )
|
|
||||||
[ -rot ] dip [ [ rot ] dip call ] 3curry ; inline
|
|
||||||
|
|
||||||
: gen-array-addrs ( base size n -- addrs )
|
|
||||||
iota [ * + ] 2with map ;
|
|
||||||
|
|
||||||
! Mostly internal
|
! Mostly internal
|
||||||
: fullinfo ( pcre extra what -- obj )
|
: fullinfo ( pcre extra what -- obj )
|
||||||
{ int } [ pcre_fullinfo ] with-out-parameters nip ;
|
{ int } [ pcre_fullinfo ] with-out-parameters nip ;
|
||||||
|
|
|
@ -51,6 +51,10 @@ os unix? [ [ 10 ] [ PCRE_CONFIG_NEWLINE config ] unit-test ] when
|
||||||
|
|
||||||
[ 3 ] [ "foobar" "foo(?=bar)" findall first first second length ] unit-test
|
[ 3 ] [ "foobar" "foo(?=bar)" findall first first second length ] unit-test
|
||||||
|
|
||||||
|
[ { { { f ", " } } { { f ", " } } { { f "." } } } ] [
|
||||||
|
"Words, words, word." "\\W+" findall
|
||||||
|
] unit-test
|
||||||
|
|
||||||
[ { ", " ", " "." } ] [
|
[ { ", " ", " "." } ] [
|
||||||
"Words, words, word." "\\W+" findall [ first second ] map
|
"Words, words, word." "\\W+" findall [ first second ] map
|
||||||
] unit-test
|
] unit-test
|
||||||
|
@ -65,11 +69,12 @@ os unix? [ [ 10 ] [ PCRE_CONFIG_NEWLINE config ] unit-test ] when
|
||||||
[ { { { f "foo" } } { { f "" } } } ]
|
[ { { { f "foo" } } { { f "" } } } ]
|
||||||
[ "foo" ".*" findall ] unit-test
|
[ "foo" ".*" findall ] unit-test
|
||||||
|
|
||||||
[ { { { f "" } } { { f "" } } { { f "" } } { { f "" } } } ]
|
[ { { { f "" } } { { f "" } } { { f "" } } } ]
|
||||||
[ "foo" "B*" findall ] unit-test
|
[ "foo" "B*" findall ] unit-test
|
||||||
|
|
||||||
! Empty matches in strings with multi-byte characters are tricky.
|
! Empty matches in strings with multi-byte characters are tricky.
|
||||||
[ ] [ "ööööö" "x*" findall ] unit-test
|
[ { { { f "" } } { { f "" } } { { f "" } } { { f "" } } } ]
|
||||||
|
[ "öööö" "x*" findall ] unit-test
|
||||||
|
|
||||||
! Tests for matches?
|
! Tests for matches?
|
||||||
[ t ] [ "örjan" "örjan" matches? ] unit-test
|
[ t ] [ "örjan" "örjan" matches? ] unit-test
|
||||||
|
|
|
@ -1,17 +1,15 @@
|
||||||
USING:
|
USING:
|
||||||
accessors
|
accessors
|
||||||
alien.c-types alien.data alien.strings
|
alien.c-types alien.data alien.enums alien.strings
|
||||||
arrays
|
arrays
|
||||||
assocs
|
assocs
|
||||||
fry
|
|
||||||
io.encodings.utf8 io.encodings.string
|
io.encodings.utf8 io.encodings.string
|
||||||
kernel
|
kernel
|
||||||
math
|
math
|
||||||
mirrors
|
mirrors
|
||||||
pcre.ffi pcre.info
|
pcre.ffi pcre.info pcre.utils
|
||||||
sequences sequences.generalizations
|
sequences sequences.generalizations
|
||||||
strings ;
|
strings ;
|
||||||
QUALIFIED: splitting
|
|
||||||
IN: pcre
|
IN: pcre
|
||||||
|
|
||||||
ERROR: malformed-regexp expr error ;
|
ERROR: malformed-regexp expr error ;
|
||||||
|
@ -19,10 +17,6 @@ ERROR: pcre-error value ;
|
||||||
|
|
||||||
TUPLE: compiled-pcre pcre extra nametable ;
|
TUPLE: compiled-pcre pcre extra nametable ;
|
||||||
|
|
||||||
! Gen. utility
|
|
||||||
: replace-all ( seq subseqs new -- seq )
|
|
||||||
swapd '[ _ splitting:replace ] reduce ;
|
|
||||||
|
|
||||||
: default-opts ( -- opts )
|
: default-opts ( -- opts )
|
||||||
PCRE_UTF8 PCRE_UCP bitor ;
|
PCRE_UTF8 PCRE_UCP bitor ;
|
||||||
|
|
||||||
|
@ -53,17 +47,14 @@ TUPLE: matcher pcre extra subject ofs exec-opts match ;
|
||||||
|
|
||||||
: findnext ( matcher -- matcher'/f )
|
: findnext ( matcher -- matcher'/f )
|
||||||
clone dup <mirror> values 6 firstn drop exec
|
clone dup <mirror> values 6 firstn drop exec
|
||||||
over dup -1 < [ pcre-error ] when
|
over dup -1 < [ PCRE_ERRORS number>enum pcre-error ] when
|
||||||
-1 =
|
-1 =
|
||||||
[
|
[
|
||||||
2drop dup exec-opts>> 0 =
|
2drop dup exec-opts>> 0 =
|
||||||
[ drop f ]
|
[ drop f ]
|
||||||
[
|
[
|
||||||
! dup [ ofs>> 1 + dup ] [ subject>> ] bi bounds-check?
|
dup [ subject>> ] [ ofs>> ] bi next-utf8-char
|
||||||
! [ >>ofs 0 >>exec-opts findnext ] [ 2drop f ] if
|
[ >>ofs 0 >>exec-opts findnext ] [ drop f ] if*
|
||||||
|
|
||||||
dup [ ofs>> 1 + ] [ subject>> length ] bi over <
|
|
||||||
[ 2drop f ] [ >>ofs 0 >>exec-opts findnext ] if
|
|
||||||
] if
|
] if
|
||||||
]
|
]
|
||||||
[
|
[
|
||||||
|
@ -103,5 +94,4 @@ M: string findall
|
||||||
dupd findall [ nip length 1 = ] [ ?first ?first ?last = ] 2bi and ;
|
dupd findall [ nip length 1 = ] [ ?first ?first ?last = ] 2bi and ;
|
||||||
|
|
||||||
: split ( subject obj -- strings )
|
: split ( subject obj -- strings )
|
||||||
dupd findall [ first second ] map
|
dupd findall [ first second ] map split-subseqs ;
|
||||||
dup first [ replace-all ] keep splitting:split harvest ;
|
|
||||||
|
|
|
@ -0,0 +1,6 @@
|
||||||
|
USING: pcre.utils tools.test ;
|
||||||
|
IN: pcre.utils.tests
|
||||||
|
|
||||||
|
[ { "Bords" "words" "word" } ] [
|
||||||
|
"Bords, words, word." { ", " ", " "." } split-subseqs
|
||||||
|
] unit-test
|
|
@ -0,0 +1,21 @@
|
||||||
|
USING: assocs fry kernel math mirrors sequences splitting strings ;
|
||||||
|
IN: pcre.utils
|
||||||
|
|
||||||
|
! Replaces every occurrence of each sequence listed in subseqs with new.
! Implemented as a reduce over subseqs that starts from seq and applies
! replace once per listed subsequence; an empty subseqs returns seq as is.
: replace-all ( seq subseqs new -- seq )
|
||||||
|
swapd '[ _ replace ] reduce ;
|
||||||
|
|
||||||
|
! Splits seq at every occurrence of any sequence listed in subseqs and
! returns the non-empty pieces as strings.
! All separators are first rewritten to the first one with replace-all, so
! that a single split-subseq pass is enough.
! An empty subseqs means "no separators": seq is returned as the only
! piece instead of failing in first on an empty sequence.
: split-subseqs ( seq subseqs -- seqs )
    [ >string { } 1sequence ]
    [ dup first [ replace-all ] keep split-subseq [ >string ] map ]
    if-empty harvest ;
|
||||||
|
|
||||||
|
! Partial application with two fixed parameters.
! Leaves obj on the stack together with a quotation that, when called with
! one element on top of the stack, runs quot with
! param1 param2 element (in that order) on the stack.
: 2with ( param1 param2 obj quot -- obj curry )
|
||||||
|
[ -rot ] dip [ [ rot ] dip call ] 3curry ; inline
|
||||||
|
|
||||||
|
! Produces n values of the form base + i * size for i = 0 .. n-1,
! i.e. the start offsets of n consecutive size-wide slots beginning at base.
: gen-array-addrs ( base size n -- addrs )
|
||||||
|
iota [ * + ] 2with map ;
|
||||||
|
|
||||||
|
! True unless the two highest bits of byte are 10 (mask 0xc0 equals 0x80),
! which is the bit pattern of a UTF-8 continuation byte.
: utf8-start-byte? ( byte -- ? )
|
||||||
|
0xc0 bitand 0x80 = not ;
|
||||||
|
|
||||||
|
! Finds the index of the next byte after pos for which utf8-start-byte?
! holds. Steps forward one index at a time (recursively) while the byte at
! the new index is not a start byte.
! pos' is f when ?nth finds no element at the index, i.e. the search ran
! past the end of byte-array.
: next-utf8-char ( byte-array pos -- pos' )
|
||||||
|
1 + 2dup swap ?nth
|
||||||
|
[ utf8-start-byte? [ nip ] [ next-utf8-char ] if ] [ 2drop f ] if* ;
|
Loading…
Reference in New Issue