From 41312ae2e543e4ead232e98704c50b5534ef7ec3 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Sun, 15 Feb 2009 14:28:22 -0600 Subject: [PATCH] Unfinished changes to regexp --- basis/ascii/ascii.factor | 4 +- basis/regexp/classes/classes.factor | 4 +- basis/regexp/regexp-tests.factor | 16 +-- .../transition-tables.factor | 4 +- basis/regexp/traversal/traversal.factor | 122 ++---------------- basis/regexp/utils/utils.factor | 28 +--- 6 files changed, 25 insertions(+), 153 deletions(-) diff --git a/basis/ascii/ascii.factor b/basis/ascii/ascii.factor index 193e847d27..bd1b86b279 100644 --- a/basis/ascii/ascii.factor +++ b/basis/ascii/ascii.factor @@ -10,7 +10,7 @@ IN: ascii : LETTER? ( ch -- ? ) CHAR: A CHAR: Z between? ; inline : digit? ( ch -- ? ) CHAR: 0 CHAR: 9 between? ; inline : printable? ( ch -- ? ) CHAR: \s CHAR: ~ between? ; inline -: control? ( ch -- ? ) "\0\e\r\n\t\u000008\u00007f" member? ; inline +: control? ( ch -- ? ) { [ 0 HEX: 1F between? ] [ HEX: 7F = ] } 1|| ; inline : quotable? ( ch -- ? ) { [ printable? ] [ "\"\\" member? not ] } 1&& ; inline : Letter? ( ch -- ? ) { [ letter? ] [ LETTER? ] } 1|| ; inline : alpha? ( ch -- ? ) { [ Letter? ] [ digit? ] } 1|| ; inline @@ -20,4 +20,4 @@ IN: ascii : >upper ( str -- upper ) [ ch>upper ] map ; HINTS: >lower string ; -HINTS: >upper string ; \ No newline at end of file +HINTS: >upper string ; diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index 4a807fa51b..94d1b78d59 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors kernel math math.order words regexp.utils -unicode.categories combinators.short-circuit ; +ascii unicode.categories combinators.short-circuit ; IN: regexp.classes SINGLETONS: any-char any-char-no-nl @@ -64,7 +64,7 @@ M: non-newline-blank-class class-member? ( obj class -- ? ) drop { [ blank? ] [ CHAR: \n = not ] } 1&& ; M: control-character-class class-member? ( obj class -- ? ) - drop control-char? ; + drop control? ; M: hex-digit-class class-member? ( obj class -- ? ) drop hex-digit? ; diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index 1cd9a2392e..cc9b2cccf1 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -44,9 +44,9 @@ IN: regexp-tests ! Dotall mode -- when on, . matches newlines. ! Off by default. [ f ] [ "\n" "." matches? ] unit-test -[ t ] [ "\n" "(?s)." matches? ] unit-test +! [ t ] [ "\n" "(?s)." matches? ] unit-test [ t ] [ "\n" R/ ./s matches? ] unit-test -[ f ] [ "\n\n" "(?s).(?-s)." matches? ] unit-test +! [ f ] [ "\n\n" "(?s).(?-s)." matches? ] unit-test [ f ] [ "" ".+" matches? ] unit-test [ t ] [ "a" ".+" matches? ] unit-test @@ -76,8 +76,6 @@ IN: regexp-tests [ t ] [ "bar" "foo|bar" matches? ] unit-test [ f ] [ "foobar" "foo|bar" matches? ] unit-test -/* -! FIXME [ f ] [ "" "(a)" matches? ] unit-test [ t ] [ "a" "(a)" matches? ] unit-test [ f ] [ "aa" "(a)" matches? ] unit-test @@ -85,7 +83,6 @@ IN: regexp-tests [ f ] [ "aababaaabbac" "(a|b)+" matches? ] unit-test [ t ] [ "ababaaabba" "(a|b)+" matches? ] unit-test -*/ [ f ] [ "" "a{1}" matches? ] unit-test [ t ] [ "a" "a{1}" matches? ] unit-test @@ -168,12 +165,9 @@ IN: regexp-tests [ f ] [ "0" "[^\\d]" matches? ] unit-test [ t ] [ "a" "[^\\d]" matches? ] unit-test -/* -! FIXME [ t ] [ "a" "[a-z]{1,}|[A-Z]{2,4}|b*|c|(f|g)*" matches? ] unit-test [ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}|b*|c|(f|g)*" matches? ] unit-test [ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}" matches? ] unit-test -*/ [ t ] [ "1000" "\\d{4,6}" matches? ] unit-test [ t ] [ "1000" "[0-9]{4,6}" matches? ] unit-test @@ -226,6 +220,7 @@ IN: regexp-tests [ t ] [ "c" R/ [A-Z]/i matches? ] unit-test [ f ] [ "3" R/ [A-Z]/i matches? ] unit-test +/* [ t ] [ "a" "(?i)a" matches? ] unit-test [ t ] [ "a" "(?i)a" matches? ] unit-test [ t ] [ "A" "(?i)a" matches? ] unit-test @@ -235,6 +230,7 @@ IN: regexp-tests [ t ] [ "a" R/ (?-i)a/i matches? ] unit-test [ f ] [ "A" R/ (?-i)a/i matches? ] unit-test [ f ] [ "A" R/ (?-i)a/i matches? ] unit-test +*/ [ f ] [ "A" "[a-z]" matches? ] unit-test [ t ] [ "A" R/ [a-z]/i matches? ] unit-test @@ -253,8 +249,6 @@ IN: regexp-tests [ t ] [ "abc*" "[^\\*]*\\*" matches? ] unit-test [ t ] [ "bca" "[^a]*a" matches? ] unit-test -/* -! FIXME [ ] [ "(0[lL]?|[1-9]\\d{0,9}(\\d{0,9}[lL])?|0[xX]\\p{XDigit}{1,8}(\\p{XDigit}{0,8}[lL])?|0[0-7]{1,11}([0-7]{0,11}[lL])?|([0-9]+\\.[0-9]*|\\.[0-9]+)([eE][+-]?[0-9]+)?[fFdD]?|[0-9]+([eE][+-]?[0-9]+[fFdD]?|([eE][+-]?[0-9]+)?[fFdD]))" drop @@ -278,7 +272,6 @@ IN: regexp-tests [ "abc" ] [ "abc" "(ab|a)(bc)?" first-match >string ] unit-test [ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" first-match >string ] unit-test -*/ ! [ t ] [ "a:b" ".+:?" matches? ] unit-test @@ -309,7 +302,6 @@ IN: regexp-tests [ "-- title --" ] [ "== title ==" R/ =/ "-" re-replace ] unit-test /* -! FIXME [ f ] [ "ab" "a(?!b)" first-match ] unit-test [ "a" ] [ "ac" "a(?!b)" first-match >string ] unit-test [ t ] [ "fxxbar" "(?!foo).{3}bar" matches? ] unit-test diff --git a/basis/regexp/transition-tables/transition-tables.factor b/basis/regexp/transition-tables/transition-tables.factor index e5c31a54e0..64d5cdb244 100644 --- a/basis/regexp/transition-tables/transition-tables.factor +++ b/basis/regexp/transition-tables/transition-tables.factor @@ -41,8 +41,8 @@ TUPLE: transition-table transitions start-state final-states ; #! set the state as a key 2dup [ to>> ] dip maybe-initialize-key [ [ to>> ] [ obj>> ] [ from>> ] tri ] dip - 2dup at* [ 2nip insert-at ] - [ drop [ H{ } clone [ insert-at ] keep ] 2dip set-at ] if ; + 2dup at* [ 2nip push-at ] + [ drop [ H{ } clone [ push-at ] keep ] 2dip set-at ] if ; : add-transition ( transition transition-table -- ) transitions>> set-transition ; diff --git a/basis/regexp/traversal/traversal.factor b/basis/regexp/traversal/traversal.factor index 104a6c2ce1..d0a76a6ddc 100644 --- a/basis/regexp/traversal/traversal.factor +++ b/basis/regexp/traversal/traversal.factor @@ -7,34 +7,20 @@ IN: regexp.traversal TUPLE: dfa-traverser dfa-table - traversal-flags - traverse-forward - lookahead-counters - lookbehind-counters - capture-counters - captured-groups - capture-group-index - last-state current-state + current-state text match-failed? start-index current-index matches ; : ( text regexp -- match ) - [ dfa-table>> ] [ dfa-traversal-flags>> ] bi + dfa-table>> dfa-traverser new - swap >>traversal-flags swap [ start-state>> >>current-state ] [ >>dfa-table ] bi swap >>text - t >>traverse-forward 0 >>start-index 0 >>current-index - 0 >>capture-group-index - V{ } clone >>matches - V{ } clone >>capture-counters - V{ } clone >>lookbehind-counters - V{ } clone >>lookahead-counters - H{ } clone >>captured-groups ; + V{ } clone >>matches ; : final-state? ( dfa-traverser -- ? ) [ current-state>> ] @@ -61,111 +47,28 @@ TUPLE: dfa-traverser dup save-final-state ] when text-finished? ; +: text-character ( dfa-traverser n -- ch ) + [ text>> ] swap '[ current-index>> _ + ] bi nth ; + : previous-text-character ( dfa-traverser -- ch ) - [ text>> ] [ current-index>> 1- ] bi nth ; + -1 text-character ; : current-text-character ( dfa-traverser -- ch ) - [ text>> ] [ current-index>> ] bi nth ; + 0 text-character ; : next-text-character ( dfa-traverser -- ch ) - [ text>> ] [ current-index>> 1+ ] bi nth ; - -GENERIC: flag-action ( dfa-traverser flag -- ) - - -M: beginning-of-input flag-action ( dfa-traverser flag -- ) - drop - dup beginning-of-text? [ t >>match-failed? ] unless drop ; - -M: end-of-input flag-action ( dfa-traverser flag -- ) - drop - dup end-of-text? [ t >>match-failed? ] unless drop ; - - -M: beginning-of-line flag-action ( dfa-traverser flag -- ) - drop - dup { - [ beginning-of-text? ] - [ previous-text-character terminator-class class-member? ] - } 1|| [ t >>match-failed? ] unless drop ; - -M: end-of-line flag-action ( dfa-traverser flag -- ) - drop - dup { - [ end-of-text? ] - [ next-text-character terminator-class class-member? ] - } 1|| [ t >>match-failed? ] unless drop ; - - -M: word-boundary flag-action ( dfa-traverser flag -- ) - drop - dup { - [ end-of-text? ] - [ current-text-character terminator-class class-member? ] - } 1|| [ t >>match-failed? ] unless drop ; - - -M: lookahead-on flag-action ( dfa-traverser flag -- ) - drop - lookahead-counters>> 0 swap push ; - -M: lookahead-off flag-action ( dfa-traverser flag -- ) - drop - dup lookahead-counters>> - [ drop ] [ pop '[ _ - ] change-current-index drop ] if-empty ; - -M: lookbehind-on flag-action ( dfa-traverser flag -- ) - drop - f >>traverse-forward - [ 2 - ] change-current-index - lookbehind-counters>> 0 swap push ; - -M: lookbehind-off flag-action ( dfa-traverser flag -- ) - drop - t >>traverse-forward - dup lookbehind-counters>> - [ drop ] [ pop '[ _ + 2 + ] change-current-index drop ] if-empty ; - -M: capture-group-on flag-action ( dfa-traverser flag -- ) - drop - [ current-index>> 0 2array ] - [ capture-counters>> ] bi push ; - -M: capture-group-off flag-action ( dfa-traverser flag -- ) - drop - dup capture-counters>> empty? [ - drop - ] [ - { - [ capture-counters>> pop first2 dupd + ] - [ text>> ] - [ [ 1+ ] change-capture-group-index capture-group-index>> ] - [ captured-groups>> set-at ] - } cleave - ] if ; - -: process-flags ( dfa-traverser -- ) - [ [ 1+ ] map ] change-lookahead-counters - [ [ 1+ ] map ] change-lookbehind-counters - [ [ first2 1+ 2array ] map ] change-capture-counters - ! dup current-state>> . - dup [ current-state>> ] [ traversal-flags>> ] bi - at [ flag-action ] with each ; + 1 text-character ; : increment-state ( dfa-traverser state -- dfa-traverser ) - [ - dup traverse-forward>> - [ [ 1+ ] change-current-index ] - [ [ 1- ] change-current-index ] if - dup current-state>> >>last-state - ] [ first ] bi* >>current-state ; + [ [ 1 + ] change-current-index ] + [ first ] bi* >>current-state ; : match-literal ( transition from-state table -- to-state/f ) transitions>> at at ; : match-class ( transition from-state table -- to-state/f ) transitions>> at* [ - [ drop class-member? ] assoc-with assoc-find [ nip ] [ drop ] if + '[ drop _ swap class-member? ] assoc-find [ nip ] [ drop ] if ] [ drop ] if ; : match-default ( transition from-state table -- to-state/f ) @@ -180,7 +83,6 @@ M: capture-group-off flag-action ( dfa-traverser flag -- ) [ dfa-table>> ] tri ; : do-match ( dfa-traverser -- dfa-traverser ) - dup process-flags dup match-done? [ dup setup-match match-transition [ increment-state do-match ] when* diff --git a/basis/regexp/utils/utils.factor b/basis/regexp/utils/utils.factor index af1b2fa1fb..d1266a6d98 100644 --- a/basis/regexp/utils/utils.factor +++ b/basis/regexp/utils/utils.factor @@ -12,47 +12,25 @@ IN: regexp.utils : while-changes ( obj quot pred -- obj' ) pick over call (while-changes) ; inline -: assoc-with ( param assoc quot -- assoc curry ) - swapd [ [ -rot ] dip call ] 2curry ; inline - -: insert-at ( value key hash -- ) - 2dup at* [ - 2nip push - ] [ - drop - [ dup vector? [ 1vector ] unless ] 2dip set-at - ] if ; - -: ?insert-at ( value key hash/f -- hash ) - [ H{ } clone ] unless* [ insert-at ] keep ; - ERROR: bad-octal number ; ERROR: bad-hex number ; : check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ; : check-hex ( hex -- hex ) dup number? [ bad-hex ] unless ; -: ascii? ( n -- ? ) 0 HEX: 7f between? ; -: octal-digit? ( n -- ? ) CHAR: 0 CHAR: 7 between? ; : decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ; : hex-digit? ( n -- ? ) - [ + { [ decimal-digit? ] [ CHAR: a CHAR: f between? ] [ CHAR: A CHAR: F between? ] - ] 1|| ; - -: control-char? ( n -- ? ) - [ - [ 0 HEX: 1f between? ] - [ HEX: 7f = ] - ] 1|| ; + } 1|| ; : punct? ( n -- ? ) "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ; : c-identifier-char? ( ch -- ? ) - [ [ alpha? ] [ CHAR: _ = ] ] 1|| ; + { [ alpha? ] [ CHAR: _ = ] } 1|| ; : java-blank? ( n -- ? ) {