diff --git a/extra/pcre/pcre-tests.factor b/extra/pcre/pcre-tests.factor index 3d51014c26..e9d476243e 100644 --- a/extra/pcre/pcre-tests.factor +++ b/extra/pcre/pcre-tests.factor @@ -12,8 +12,6 @@ IN: pcre.tests nametable>> ] unit-test -[ { 100 110 120 130 } ] [ 100 10 4 gen-array-addrs ] unit-test - CONSTANT: iso-date "(?P\\d{4})-(?P\\d{2})-(?P\\d{2})" ! On windows the erroffset appears to be set to 0 despite there being @@ -29,16 +27,16 @@ CONSTANT: iso-date "(?P\\d{4})-(?P\\d{2})-(?P\\d{2})" [ t ] [ "foo" PCRE_UTF8 has-option? ] unit-test -os unix? [ [ 10 ] [ PCRE_CONFIG_NEWLINE config ] unit-test ] when +os unix? [ [ 10 ] [ PCRE_CONFIG_NEWLINE pcre-config ] unit-test ] when ! In this day and age, not supporting utf-8 is broken. -[ 1 ] [ PCRE_CONFIG_UTF8 config ] unit-test +[ 1 ] [ PCRE_CONFIG_UTF8 pcre-config ] unit-test -[ 1 ] [ PCRE_CONFIG_UNICODE_PROPERTIES config ] unit-test +[ 1 ] [ PCRE_CONFIG_UNICODE_PROPERTIES pcre-config ] unit-test ! libpcre must not support 16 or 32 bit code points. -[ 0 ] [ PCRE_CONFIG_UTF16 config ] unit-test -[ 0 ] [ PCRE_CONFIG_UTF32 config ] unit-test +[ 0 ] [ PCRE_CONFIG_UTF16 pcre-config ] unit-test +[ 0 ] [ PCRE_CONFIG_UTF32 pcre-config ] unit-test ! Tests for findall [ diff --git a/extra/pcre/pcre.factor b/extra/pcre/pcre.factor index 08e0a84540..ab5987af95 100644 --- a/extra/pcre/pcre.factor +++ b/extra/pcre/pcre.factor @@ -1,7 +1,10 @@ +! Copyright (C) 2013 Björn Lindqvist +! See http://factorcode.org/license.txt for BSD license + USING: accessors alien alien.accessors alien.c-types alien.data -alien.enums alien.strings arrays assocs fry io.encodings.string -io.encodings.utf8 kernel literals math math.bitwise mirrors -pcre.ffi sequences sequences.generalizations splitting strings ; +alien.enums alien.strings arrays assocs combinators fry +io.encodings.string io.encodings.utf8 kernel literals math +math.bitwise pcre.ffi sequences splitting strings ; QUALIFIED: regexp IN: pcre @@ -20,42 +23,43 @@ ERROR: pcre-error value ; : 2with ( param1 param2 obj quot -- obj curry ) [ -rot ] dip [ [ rot ] dip call ] 3curry ; inline -: gen-array-addrs ( base size n -- addrs ) - iota [ * + ] 2with map ; - : utf8-start-byte? ( byte -- ? ) 0xc0 bitand 0x80 = not ; : next-utf8-char ( byte-array pos -- pos' ) - 1 + 2dup swap ?nth - [ utf8-start-byte? [ nip ] [ next-utf8-char ] if ] [ 2drop f ] if* ; + 1 + 2dup swap ?nth [ + utf8-start-byte? [ nip ] [ next-utf8-char ] if + ] [ 2drop f ] if* ; -: config ( what -- alien ) +: pcre-config ( what -- alien ) { int } [ pcre_config ] with-out-parameters ; -: fullinfo ( pcre extra what -- obj ) +: pcre-fullinfo ( pcre extra what -- obj ) { int } [ pcre_fullinfo ] with-out-parameters nip ; +: pcre-substring-list ( subject match-array count -- alien ) + { void* } [ pcre_get_substring_list drop ] with-out-parameters ; + : name-count ( pcre extra -- n ) - PCRE_INFO_NAMECOUNT fullinfo ; + PCRE_INFO_NAMECOUNT pcre-fullinfo ; : name-table ( pcre extra -- addr ) [ drop alien-address 32 on-bits unmask ] - [ PCRE_INFO_NAMETABLE fullinfo ] 2bi + ; + [ PCRE_INFO_NAMETABLE pcre-fullinfo ] 2bi + ; : name-entry-size ( pcre extra -- size ) - PCRE_INFO_NAMEENTRYSIZE fullinfo ; + PCRE_INFO_NAMEENTRYSIZE pcre-fullinfo ; : name-table-entry ( addr -- group-index group-name ) - [ 1 alien-unsigned-1 ] + [ 1 alien-unsigned-1 ] [ 2 + utf8 alien>string ] bi ; -: options ( pcre -- opts ) - f PCRE_INFO_OPTIONS fullinfo ; - : name-table-entries ( pcre extra -- addrs ) [ name-table ] [ name-entry-size ] [ name-count ] 2tri - gen-array-addrs [ name-table-entry 2array ] map ; + iota [ * + name-table-entry 2array ] 2with map ; + +: options ( pcre -- opts ) + f PCRE_INFO_OPTIONS pcre-fullinfo ; CONSTANT: default-opts flags{ PCRE_UTF8 PCRE_UCP } @@ -71,39 +75,43 @@ CONSTANT: default-opts flags{ PCRE_UTF8 PCRE_UCP } : exec ( pcre extra subject ofs opts -- count match-data ) [ dup length ] 2dip 30 int 30 [ pcre_exec ] 2keep drop ; -TUPLE: matcher pcre extra subject ofs exec-opts match ; +TUPLE: matcher pcre extra subject ofs exec-opts ; : ( subject compiled-pcre -- matcher ) - [ utf8 encode ] dip [ pcre>> ] [ extra>> ] bi rot 0 0 f matcher boa ; + [ utf8 encode ] dip [ pcre>> ] [ extra>> ] bi rot 0 0 matcher boa ; CONSTANT: empty-match-opts flags{ PCRE_NOTEMPTY_ATSTART PCRE_ANCHORED } -: findnext ( matcher -- matcher'/f ) - clone dup values 6 firstn drop exec - over dup -1 < [ PCRE_ERRORS number>enum pcre-error ] when - -1 = - [ - 2drop dup exec-opts>> 0 = - [ drop f ] - [ - dup [ subject>> ] [ ofs>> ] bi next-utf8-char - [ >>ofs 0 >>exec-opts findnext ] [ drop f ] if* +: findnext ( matcher -- matcher match/f ) + dup { + [ pcre>> ] + [ extra>> ] + [ subject>> ] + [ ofs>> ] + [ exec-opts>> ] + } cleave exec over dup -1 < [ + PCRE_ERRORS number>enum pcre-error + ] [ + -1 = [ + 2drop dup exec-opts>> 0 = + [ f ] + [ + dup [ subject>> ] [ ofs>> ] bi next-utf8-char + [ >>ofs 0 >>exec-opts findnext ] [ f ] if* + ] if + ] [ + [ + nip + [ first2 = [ empty-match-opts ] [ 0 ] if >>exec-opts ] + [ second >>ofs ] bi + ] [ + 2array + ] 2bi ] if - ] - [ - [ 2array >>match ] - [ - nip - [ first2 = [ empty-match-opts ] [ 0 ] if >>exec-opts ] - [ second >>ofs ] bi - ] 2bi ] if ; -: substring-list ( subject match-array count -- alien ) - { void* } [ pcre_get_substring_list drop ] with-out-parameters ; - : parse-match ( subject nametable match-data -- match ) - swapd first2 swap [ substring-list ] keep void* + swapd first2 swap [ pcre-substring-list ] keep void* [ utf8 alien>string ] { } map-as [ of swap 2array ] with map-index ; PRIVATE> @@ -119,7 +127,7 @@ TUPLE: compiled-pcre pcre extra nametable ; GENERIC: findall ( subject obj -- matches ) M: compiled-pcre findall - [ [ findnext ] follow [ match>> ] map harvest ] + [ [ findnext dup ] [ ] produce 2nip ] [ nametable>> rot [ parse-match ] 2with { } map-as ] 2bi ; M: string findall