From d0d615fb2bd301b3dc30e4e0f74aff877c94d7f0 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Thu, 12 Feb 2009 13:18:43 -0600 Subject: [PATCH 001/125] Starting to switch xmode to regexp; getting rid of >file) } @@ -17,11 +18,9 @@ TAG: MODE ] dip rot set-at ; -TAGS> - : parse-modes-tag ( tag -- modes ) H{ } clone [ - swap child-tags [ parse-mode-tag ] with each + swap children-tags [ parse-mode-tag ] with each ] keep ; MEMO: modes ( -- modes ) diff --git a/basis/xmode/loader/loader.factor b/basis/xmode/loader/loader.factor index 70466913a0..61b60b5292 100644 --- a/basis/xmode/loader/loader.factor +++ b/basis/xmode/loader/loader.factor @@ -1,56 +1,54 @@ USING: xmode.loader.syntax xmode.tokens xmode.rules xmode.keyword-map xml.data xml.traversal xml assocs kernel combinators sequences math.parser namespaces parser -xmode.utilities parser-combinators.regexp io.files accessors ; +xmode.utilities regexp io.files accessors ; IN: xmode.loader ! Based on org.gjt.sp.jedit.XModeHandler ! RULES and its children ->props drop ; -TAG: IMPORT +TAG: IMPORT parse-rule-tag "DELEGATE" attr swap import-rule-set ; -TAG: TERMINATE +TAG: TERMINATE parse-rule-tag "AT_CHAR" attr string>number >>terminate-char drop ; -RULE: SEQ seq-rule +RULE: SEQ seq-rule parse-rule-tag shared-tag-attrs delegate-attr literal-start ; -RULE: SEQ_REGEXP seq-rule +RULE: SEQ_REGEXP seq-rule parse-rule-tag shared-tag-attrs delegate-attr regexp-attr regexp-start ; -RULE: SPAN span-rule +RULE: SPAN span-rule parse-rule-tag shared-tag-attrs delegate-attr match-type-attr span-attrs parse-begin/end-tags init-span-tag ; -RULE: SPAN_REGEXP span-rule +RULE: SPAN_REGEXP span-rule parse-rule-tag shared-tag-attrs delegate-attr match-type-attr span-attrs regexp-attr parse-begin/end-tags init-span-tag ; -RULE: EOL_SPAN eol-span-rule +RULE: EOL_SPAN eol-span-rule parse-rule-tag shared-tag-attrs delegate-attr match-type-attr literal-start init-eol-span-tag ; -RULE: EOL_SPAN_REGEXP eol-span-rule +RULE: EOL_SPAN_REGEXP eol-span-rule parse-rule-tag shared-tag-attrs delegate-attr match-type-attr regexp-attr regexp-start init-eol-span-tag ; -RULE: MARK_FOLLOWING mark-following-rule +RULE: MARK_FOLLOWING mark-following-rule parse-rule-tag shared-tag-attrs match-type-attr literal-start ; -RULE: MARK_PREVIOUS mark-previous-rule +RULE: MARK_PREVIOUS mark-previous-rule parse-rule-tag shared-tag-attrs match-type-attr literal-start ; -TAG: KEYWORDS ( rule-set tag -- key value ) +TAG: KEYWORDS parse-rule-tag rule-set get ignore-case?>> - swap child-tags [ over parse-keyword-tag ] each + swap children-tags [ over parse-keyword-tag ] each swap (>>keywords) ; -TAGS> - : ? ( string/f -- regexp/f ) - dup [ rule-set get ignore-case?>> ] when ; + dup [ rule-set get ignore-case?>> drop ] when ; : (parse-rules-tag) ( tag -- rule-set ) dup rule-set set @@ -66,7 +64,7 @@ TAGS> : parse-rules-tag ( tag -- rule-set ) [ - [ (parse-rules-tag) ] [ child-tags ] bi + [ (parse-rules-tag) ] [ children-tags ] bi [ parse-rule-tag ] with each rule-set get ] with-scope ; diff --git a/basis/xmode/loader/syntax/syntax.factor b/basis/xmode/loader/syntax/syntax.factor index 0e7293da97..88ff7b919b 100644 --- a/basis/xmode/loader/syntax/syntax.factor +++ b/basis/xmode/loader/syntax/syntax.factor @@ -3,7 +3,7 @@ USING: accessors xmode.tokens xmode.rules xmode.keyword-map xml.data xml.traversal xml assocs kernel combinators sequences math.parser namespaces make parser lexer xmode.utilities -parser-combinators.regexp io.files splitting arrays ; +regexp io.files splitting arrays xml.syntax.private ; IN: xmode.loader.syntax ! Rule tag parsing utilities @@ -11,9 +11,10 @@ IN: xmode.loader.syntax new swap init-from-tag swap add-rule ; inline : RULE: - scan scan-word + scan scan-word scan-word parse-definition { } make - swap [ (parse-rule-tag) ] 2curry (TAG:) ; parsing + [ swap [ (parse-rule-tag) ] 2curry ] dip + swap define-tag ; parsing ! Attribute utilities : string>boolean ( string -- ? ) "TRUE" = ; @@ -32,7 +33,7 @@ IN: xmode.loader.syntax [ "NAME" attr ] [ "VALUE" attr ] bi ; : parse-props-tag ( tag -- assoc ) - child-tags + children-tags [ parse-prop-tag ] H{ } map>assoc ; : position-attrs ( tag -- at-line-start? at-whitespace-end? at-word-start? ) @@ -46,7 +47,7 @@ IN: xmode.loader.syntax swap position-attrs ; : parse-regexp-matcher ( tag -- matcher ) - dup children>string rule-set get ignore-case?>> + dup children>string rule-set get ignore-case?>> drop swap position-attrs ; : shared-tag-attrs ( -- ) @@ -79,22 +80,20 @@ IN: xmode.loader.syntax [ parse-literal-matcher >>end drop ] , ; ! SPAN's children ->start drop ; -TAG: END +TAG: END parse-begin/end-tag ! XXX parse-literal-matcher >>end drop ; -TAGS> - : parse-begin/end-tags ( -- ) [ ! XXX: handle position attrs on span tag itself - child-tags [ parse-begin/end-tag ] with each + children-tags [ parse-begin/end-tag ] with each ] , ; : init-span-tag ( -- ) [ drop init-span ] , ; diff --git a/basis/xmode/marker/marker.factor b/basis/xmode/marker/marker.factor index cff0af2a98..5cbd9e1e9c 100755 --- a/basis/xmode/marker/marker.factor +++ b/basis/xmode/marker/marker.factor @@ -4,8 +4,10 @@ IN: xmode.marker USING: kernel namespaces make xmode.rules xmode.tokens xmode.marker.state xmode.marker.context xmode.utilities xmode.catalog sequences math assocs combinators strings -parser-combinators.regexp splitting parser-combinators ascii +regexp splitting ascii parser-combinators regexp.backend ascii combinators.short-circuit accessors ; +! parser-combinators is for the string-head? word +! regexp.backend is for the regexp class ! Based on org.gjt.sp.jedit.syntax.TokenMarker @@ -150,7 +152,7 @@ M: escape-rule handle-rule-start process-escape? get [ escaped? [ not ] change position [ + ] change - ] [ 2drop ] if ; + ] [ drop ] if ; M: seq-rule handle-rule-start ?end-rule diff --git a/basis/xmode/rules/rules.factor b/basis/xmode/rules/rules.factor index adc43d7bb6..99364fe7cd 100644 --- a/basis/xmode/rules/rules.factor +++ b/basis/xmode/rules/rules.factor @@ -1,6 +1,6 @@ USING: accessors xmode.tokens xmode.keyword-map kernel sequences vectors assocs strings memoize unicode.case -parser-combinators.regexp ; +regexp regexp.backend ; ! regexp.backend has the regexp class IN: xmode.rules TUPLE: string-matcher string ignore-case? ; diff --git a/basis/xmode/utilities/utilities.factor b/basis/xmode/utilities/utilities.factor index 2423fb0d86..22db69de3f 100644 --- a/basis/xmode/utilities/utilities.factor +++ b/basis/xmode/utilities/utilities.factor @@ -4,8 +4,6 @@ IN: xmode.utilities : implies ( x y -- z ) [ not ] dip or ; inline -: child-tags ( tag -- seq ) children>> [ tag? ] filter ; - : map-find ( seq quot -- result elt ) [ f ] 2dip '[ nip @ dup ] find @@ -37,21 +35,3 @@ MACRO: (init-from-tag) ( specs -- ) : init-from-tag ( tag tuple specs -- tuple ) over [ (init-from-tag) ] dip ; inline - -SYMBOL: tag-handlers -SYMBOL: tag-handler-word - -: - tag-handler-word get - tag-handlers get >alist [ [ dup main>> ] dip case ] curry - define ; parsing From ff265aa91994005b5f0dda1de414508c25c2c67e Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Thu, 12 Feb 2009 20:42:32 -0600 Subject: [PATCH 002/125] XMode doesn't use parser combinators at all; regexes allow parens for grouping --- basis/regexp/nfa/nfa.factor | 9 +--- basis/xmode/catalog/catalog.factor | 6 +-- basis/xmode/loader/loader.factor | 4 +- basis/xmode/loader/syntax/syntax.factor | 13 +++--- basis/xmode/marker/marker.factor | 18 +++++++- basis/xmode/utilities/utilities-tests.factor | 46 +------------------- basis/xmode/utilities/utilities.factor | 6 ++- 7 files changed, 36 insertions(+), 66 deletions(-) diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index 537c85c2d3..44481454fc 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -183,15 +183,8 @@ M: character-class-range nfa-node ( node -- ) ] if ; M: capture-group nfa-node ( node -- ) - "capture-groups" feature-is-broken - eps literal-transition add-simple-entry - capture-group-on add-traversal-flag - term>> nfa-node - eps literal-transition add-simple-entry - capture-group-off add-traversal-flag - 2 [ concatenate-nodes ] times ; + term>> nfa-node ; -! xyzzy M: non-capture-group nfa-node ( node -- ) term>> nfa-node ; diff --git a/basis/xmode/catalog/catalog.factor b/basis/xmode/catalog/catalog.factor index 3a87d71d58..b08e47ddc5 100644 --- a/basis/xmode/catalog/catalog.factor +++ b/basis/xmode/catalog/catalog.factor @@ -1,7 +1,7 @@ USING: xmode.loader xmode.utilities xmode.rules namespaces strings splitting assocs sequences kernel io.files xml memoize words globs combinators io.encodings.utf8 sorting accessors xml.data -xml.traversal ; +xml.traversal xml.syntax ; IN: xmode.catalog TUPLE: mode file file-name-glob first-line-glob ; @@ -97,8 +97,8 @@ ERROR: mutually-recursive-rulesets ruleset ; ] if ; : finalize-mode ( rulesets -- ) - rule-sets [ - dup [ nip finalize-rule-set ] assoc-each + dup rule-sets [ + [ nip finalize-rule-set ] assoc-each ] with-variable ; : load-mode ( name -- rule-sets ) diff --git a/basis/xmode/loader/loader.factor b/basis/xmode/loader/loader.factor index 61b60b5292..d6f3943e14 100644 --- a/basis/xmode/loader/loader.factor +++ b/basis/xmode/loader/loader.factor @@ -1,7 +1,7 @@ USING: xmode.loader.syntax xmode.tokens xmode.rules xmode.keyword-map xml.data xml.traversal xml assocs kernel combinators sequences math.parser namespaces parser -xmode.utilities regexp io.files accessors ; +xmode.utilities regexp io.files accessors xml.syntax ; IN: xmode.loader ! Based on org.gjt.sp.jedit.XModeHandler @@ -48,7 +48,7 @@ TAG: KEYWORDS parse-rule-tag swap (>>keywords) ; : ? ( string/f -- regexp/f ) - dup [ rule-set get ignore-case?>> drop ] when ; + dup [ rule-set get ignore-case?>> ] when ; : (parse-rules-tag) ( tag -- rule-set ) dup rule-set set diff --git a/basis/xmode/loader/syntax/syntax.factor b/basis/xmode/loader/syntax/syntax.factor index 88ff7b919b..60318e669e 100644 --- a/basis/xmode/loader/syntax/syntax.factor +++ b/basis/xmode/loader/syntax/syntax.factor @@ -3,7 +3,7 @@ USING: accessors xmode.tokens xmode.rules xmode.keyword-map xml.data xml.traversal xml assocs kernel combinators sequences math.parser namespaces make parser lexer xmode.utilities -regexp io.files splitting arrays xml.syntax.private ; +regexp io.files splitting arrays xml.syntax xml.syntax.private ; IN: xmode.loader.syntax ! Rule tag parsing utilities @@ -11,10 +11,10 @@ IN: xmode.loader.syntax new swap init-from-tag swap add-rule ; inline : RULE: - scan scan-word scan-word - parse-definition { } make - [ swap [ (parse-rule-tag) ] 2curry ] dip - swap define-tag ; parsing + scan scan-word scan-word [ + parse-definition { } make + swap [ (parse-rule-tag) ] 2curry + ] dip swap define-tag ; parsing ! Attribute utilities : string>boolean ( string -- ? ) "TRUE" = ; @@ -47,7 +47,8 @@ IN: xmode.loader.syntax swap position-attrs ; : parse-regexp-matcher ( tag -- matcher ) - dup children>string rule-set get ignore-case?>> drop + dup children>string + rule-set get ignore-case?>> swap position-attrs ; : shared-tag-attrs ( -- ) diff --git a/basis/xmode/marker/marker.factor b/basis/xmode/marker/marker.factor index 5cbd9e1e9c..e106af7952 100755 --- a/basis/xmode/marker/marker.factor +++ b/basis/xmode/marker/marker.factor @@ -4,11 +4,25 @@ IN: xmode.marker USING: kernel namespaces make xmode.rules xmode.tokens xmode.marker.state xmode.marker.context xmode.utilities xmode.catalog sequences math assocs combinators strings -regexp splitting ascii parser-combinators regexp.backend +regexp splitting ascii regexp.backend unicode.case ascii combinators.short-circuit accessors ; -! parser-combinators is for the string-head? word ! regexp.backend is for the regexp class +! Next two words copied from parser-combinators +! Just like head?, but they optionally ignore case + +: string= ( str1 str2 ignore-case -- ? ) + [ [ >upper ] bi@ ] when sequence= ; + +: string-head? ( str1 str2 ignore-case -- ? ) + 2over shorter? + [ 3drop f ] [ + [ + [ nip ] + [ length head-slice ] 2bi + ] dip string= + ] if ; + ! Based on org.gjt.sp.jedit.syntax.TokenMarker : current-keyword ( -- string ) diff --git a/basis/xmode/utilities/utilities-tests.factor b/basis/xmode/utilities/utilities-tests.factor index 45238ca2b1..0ef221f237 100644 --- a/basis/xmode/utilities/utilities-tests.factor +++ b/basis/xmode/utilities/utilities-tests.factor @@ -1,7 +1,6 @@ +USING: assocs xmode.utilities tools.test ; IN: xmode.utilities.tests -USING: accessors xmode.utilities tools.test xml xml.data kernel -strings vectors sequences io.files prettyprint assocs -unicode.case ; + [ "hi" 3 ] [ { 1 2 3 4 5 6 7 8 } [ H{ { 3 "hi" } } at ] map-find ] unit-test @@ -9,44 +8,3 @@ unicode.case ; [ f f ] [ { 1 2 3 4 5 6 7 8 } [ H{ { 11 "hi" } } at ] map-find ] unit-test - -TUPLE: company employees type ; - -: V{ } clone f company boa ; - -: add-employee employees>> push ; - ->name) } { f (>>description) } } - init-from-tag swap add-employee ; - -TAGS> - -\ parse-employee-tag see - -: parse-company-tag - [ - - { { "type" >upper (>>type) } } - init-from-tag dup - ] keep - children>> [ tag? ] filter - [ parse-employee-tag ] with each ; - -[ - T{ company f - V{ - T{ employee f "Joe" "VP Sales" } - T{ employee f "Jane" "CFO" } - } - "PUBLIC" - } -] [ - "resource:basis/xmode/utilities/test.xml" - file>xml parse-company-tag -] unit-test diff --git a/basis/xmode/utilities/utilities.factor b/basis/xmode/utilities/utilities.factor index 22db69de3f..1b2b4a352f 100644 --- a/basis/xmode/utilities/utilities.factor +++ b/basis/xmode/utilities/utilities.factor @@ -1,5 +1,6 @@ USING: accessors sequences assocs kernel quotations namespaces -xml.data xml.traversal combinators macros parser lexer words fry ; +xml.data xml.traversal combinators macros parser lexer words fry +regexp ; IN: xmode.utilities : implies ( x y -- z ) [ not ] dip or ; inline @@ -35,3 +36,6 @@ MACRO: (init-from-tag) ( specs -- ) : init-from-tag ( tag tuple specs -- tuple ) over [ (init-from-tag) ] dip ; inline + +: ( string ? -- regexp ) + "i" "" ? ; From 41312ae2e543e4ead232e98704c50b5534ef7ec3 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Sun, 15 Feb 2009 14:28:22 -0600 Subject: [PATCH 003/125] Unfinished changes to regexp --- basis/ascii/ascii.factor | 4 +- basis/regexp/classes/classes.factor | 4 +- basis/regexp/regexp-tests.factor | 16 +-- .../transition-tables.factor | 4 +- basis/regexp/traversal/traversal.factor | 122 ++---------------- basis/regexp/utils/utils.factor | 28 +--- 6 files changed, 25 insertions(+), 153 deletions(-) diff --git a/basis/ascii/ascii.factor b/basis/ascii/ascii.factor index 193e847d27..bd1b86b279 100644 --- a/basis/ascii/ascii.factor +++ b/basis/ascii/ascii.factor @@ -10,7 +10,7 @@ IN: ascii : LETTER? ( ch -- ? ) CHAR: A CHAR: Z between? ; inline : digit? ( ch -- ? ) CHAR: 0 CHAR: 9 between? ; inline : printable? ( ch -- ? ) CHAR: \s CHAR: ~ between? ; inline -: control? ( ch -- ? ) "\0\e\r\n\t\u000008\u00007f" member? ; inline +: control? ( ch -- ? ) { [ 0 HEX: 1F between? ] [ HEX: 7F = ] } 1|| ; inline : quotable? ( ch -- ? ) { [ printable? ] [ "\"\\" member? not ] } 1&& ; inline : Letter? ( ch -- ? ) { [ letter? ] [ LETTER? ] } 1|| ; inline : alpha? ( ch -- ? ) { [ Letter? ] [ digit? ] } 1|| ; inline @@ -20,4 +20,4 @@ IN: ascii : >upper ( str -- upper ) [ ch>upper ] map ; HINTS: >lower string ; -HINTS: >upper string ; \ No newline at end of file +HINTS: >upper string ; diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index 4a807fa51b..94d1b78d59 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors kernel math math.order words regexp.utils -unicode.categories combinators.short-circuit ; +ascii unicode.categories combinators.short-circuit ; IN: regexp.classes SINGLETONS: any-char any-char-no-nl @@ -64,7 +64,7 @@ M: non-newline-blank-class class-member? ( obj class -- ? ) drop { [ blank? ] [ CHAR: \n = not ] } 1&& ; M: control-character-class class-member? ( obj class -- ? ) - drop control-char? ; + drop control? ; M: hex-digit-class class-member? ( obj class -- ? ) drop hex-digit? ; diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index 1cd9a2392e..cc9b2cccf1 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -44,9 +44,9 @@ IN: regexp-tests ! Dotall mode -- when on, . matches newlines. ! Off by default. [ f ] [ "\n" "." matches? ] unit-test -[ t ] [ "\n" "(?s)." matches? ] unit-test +! [ t ] [ "\n" "(?s)." matches? ] unit-test [ t ] [ "\n" R/ ./s matches? ] unit-test -[ f ] [ "\n\n" "(?s).(?-s)." matches? ] unit-test +! [ f ] [ "\n\n" "(?s).(?-s)." matches? ] unit-test [ f ] [ "" ".+" matches? ] unit-test [ t ] [ "a" ".+" matches? ] unit-test @@ -76,8 +76,6 @@ IN: regexp-tests [ t ] [ "bar" "foo|bar" matches? ] unit-test [ f ] [ "foobar" "foo|bar" matches? ] unit-test -/* -! FIXME [ f ] [ "" "(a)" matches? ] unit-test [ t ] [ "a" "(a)" matches? ] unit-test [ f ] [ "aa" "(a)" matches? ] unit-test @@ -85,7 +83,6 @@ IN: regexp-tests [ f ] [ "aababaaabbac" "(a|b)+" matches? ] unit-test [ t ] [ "ababaaabba" "(a|b)+" matches? ] unit-test -*/ [ f ] [ "" "a{1}" matches? ] unit-test [ t ] [ "a" "a{1}" matches? ] unit-test @@ -168,12 +165,9 @@ IN: regexp-tests [ f ] [ "0" "[^\\d]" matches? ] unit-test [ t ] [ "a" "[^\\d]" matches? ] unit-test -/* -! FIXME [ t ] [ "a" "[a-z]{1,}|[A-Z]{2,4}|b*|c|(f|g)*" matches? ] unit-test [ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}|b*|c|(f|g)*" matches? ] unit-test [ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}" matches? ] unit-test -*/ [ t ] [ "1000" "\\d{4,6}" matches? ] unit-test [ t ] [ "1000" "[0-9]{4,6}" matches? ] unit-test @@ -226,6 +220,7 @@ IN: regexp-tests [ t ] [ "c" R/ [A-Z]/i matches? ] unit-test [ f ] [ "3" R/ [A-Z]/i matches? ] unit-test +/* [ t ] [ "a" "(?i)a" matches? ] unit-test [ t ] [ "a" "(?i)a" matches? ] unit-test [ t ] [ "A" "(?i)a" matches? ] unit-test @@ -235,6 +230,7 @@ IN: regexp-tests [ t ] [ "a" R/ (?-i)a/i matches? ] unit-test [ f ] [ "A" R/ (?-i)a/i matches? ] unit-test [ f ] [ "A" R/ (?-i)a/i matches? ] unit-test +*/ [ f ] [ "A" "[a-z]" matches? ] unit-test [ t ] [ "A" R/ [a-z]/i matches? ] unit-test @@ -253,8 +249,6 @@ IN: regexp-tests [ t ] [ "abc*" "[^\\*]*\\*" matches? ] unit-test [ t ] [ "bca" "[^a]*a" matches? ] unit-test -/* -! FIXME [ ] [ "(0[lL]?|[1-9]\\d{0,9}(\\d{0,9}[lL])?|0[xX]\\p{XDigit}{1,8}(\\p{XDigit}{0,8}[lL])?|0[0-7]{1,11}([0-7]{0,11}[lL])?|([0-9]+\\.[0-9]*|\\.[0-9]+)([eE][+-]?[0-9]+)?[fFdD]?|[0-9]+([eE][+-]?[0-9]+[fFdD]?|([eE][+-]?[0-9]+)?[fFdD]))" drop @@ -278,7 +272,6 @@ IN: regexp-tests [ "abc" ] [ "abc" "(ab|a)(bc)?" first-match >string ] unit-test [ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" first-match >string ] unit-test -*/ ! [ t ] [ "a:b" ".+:?" matches? ] unit-test @@ -309,7 +302,6 @@ IN: regexp-tests [ "-- title --" ] [ "== title ==" R/ =/ "-" re-replace ] unit-test /* -! FIXME [ f ] [ "ab" "a(?!b)" first-match ] unit-test [ "a" ] [ "ac" "a(?!b)" first-match >string ] unit-test [ t ] [ "fxxbar" "(?!foo).{3}bar" matches? ] unit-test diff --git a/basis/regexp/transition-tables/transition-tables.factor b/basis/regexp/transition-tables/transition-tables.factor index e5c31a54e0..64d5cdb244 100644 --- a/basis/regexp/transition-tables/transition-tables.factor +++ b/basis/regexp/transition-tables/transition-tables.factor @@ -41,8 +41,8 @@ TUPLE: transition-table transitions start-state final-states ; #! set the state as a key 2dup [ to>> ] dip maybe-initialize-key [ [ to>> ] [ obj>> ] [ from>> ] tri ] dip - 2dup at* [ 2nip insert-at ] - [ drop [ H{ } clone [ insert-at ] keep ] 2dip set-at ] if ; + 2dup at* [ 2nip push-at ] + [ drop [ H{ } clone [ push-at ] keep ] 2dip set-at ] if ; : add-transition ( transition transition-table -- ) transitions>> set-transition ; diff --git a/basis/regexp/traversal/traversal.factor b/basis/regexp/traversal/traversal.factor index 104a6c2ce1..d0a76a6ddc 100644 --- a/basis/regexp/traversal/traversal.factor +++ b/basis/regexp/traversal/traversal.factor @@ -7,34 +7,20 @@ IN: regexp.traversal TUPLE: dfa-traverser dfa-table - traversal-flags - traverse-forward - lookahead-counters - lookbehind-counters - capture-counters - captured-groups - capture-group-index - last-state current-state + current-state text match-failed? start-index current-index matches ; : ( text regexp -- match ) - [ dfa-table>> ] [ dfa-traversal-flags>> ] bi + dfa-table>> dfa-traverser new - swap >>traversal-flags swap [ start-state>> >>current-state ] [ >>dfa-table ] bi swap >>text - t >>traverse-forward 0 >>start-index 0 >>current-index - 0 >>capture-group-index - V{ } clone >>matches - V{ } clone >>capture-counters - V{ } clone >>lookbehind-counters - V{ } clone >>lookahead-counters - H{ } clone >>captured-groups ; + V{ } clone >>matches ; : final-state? ( dfa-traverser -- ? ) [ current-state>> ] @@ -61,111 +47,28 @@ TUPLE: dfa-traverser dup save-final-state ] when text-finished? ; +: text-character ( dfa-traverser n -- ch ) + [ text>> ] swap '[ current-index>> _ + ] bi nth ; + : previous-text-character ( dfa-traverser -- ch ) - [ text>> ] [ current-index>> 1- ] bi nth ; + -1 text-character ; : current-text-character ( dfa-traverser -- ch ) - [ text>> ] [ current-index>> ] bi nth ; + 0 text-character ; : next-text-character ( dfa-traverser -- ch ) - [ text>> ] [ current-index>> 1+ ] bi nth ; - -GENERIC: flag-action ( dfa-traverser flag -- ) - - -M: beginning-of-input flag-action ( dfa-traverser flag -- ) - drop - dup beginning-of-text? [ t >>match-failed? ] unless drop ; - -M: end-of-input flag-action ( dfa-traverser flag -- ) - drop - dup end-of-text? [ t >>match-failed? ] unless drop ; - - -M: beginning-of-line flag-action ( dfa-traverser flag -- ) - drop - dup { - [ beginning-of-text? ] - [ previous-text-character terminator-class class-member? ] - } 1|| [ t >>match-failed? ] unless drop ; - -M: end-of-line flag-action ( dfa-traverser flag -- ) - drop - dup { - [ end-of-text? ] - [ next-text-character terminator-class class-member? ] - } 1|| [ t >>match-failed? ] unless drop ; - - -M: word-boundary flag-action ( dfa-traverser flag -- ) - drop - dup { - [ end-of-text? ] - [ current-text-character terminator-class class-member? ] - } 1|| [ t >>match-failed? ] unless drop ; - - -M: lookahead-on flag-action ( dfa-traverser flag -- ) - drop - lookahead-counters>> 0 swap push ; - -M: lookahead-off flag-action ( dfa-traverser flag -- ) - drop - dup lookahead-counters>> - [ drop ] [ pop '[ _ - ] change-current-index drop ] if-empty ; - -M: lookbehind-on flag-action ( dfa-traverser flag -- ) - drop - f >>traverse-forward - [ 2 - ] change-current-index - lookbehind-counters>> 0 swap push ; - -M: lookbehind-off flag-action ( dfa-traverser flag -- ) - drop - t >>traverse-forward - dup lookbehind-counters>> - [ drop ] [ pop '[ _ + 2 + ] change-current-index drop ] if-empty ; - -M: capture-group-on flag-action ( dfa-traverser flag -- ) - drop - [ current-index>> 0 2array ] - [ capture-counters>> ] bi push ; - -M: capture-group-off flag-action ( dfa-traverser flag -- ) - drop - dup capture-counters>> empty? [ - drop - ] [ - { - [ capture-counters>> pop first2 dupd + ] - [ text>> ] - [ [ 1+ ] change-capture-group-index capture-group-index>> ] - [ captured-groups>> set-at ] - } cleave - ] if ; - -: process-flags ( dfa-traverser -- ) - [ [ 1+ ] map ] change-lookahead-counters - [ [ 1+ ] map ] change-lookbehind-counters - [ [ first2 1+ 2array ] map ] change-capture-counters - ! dup current-state>> . - dup [ current-state>> ] [ traversal-flags>> ] bi - at [ flag-action ] with each ; + 1 text-character ; : increment-state ( dfa-traverser state -- dfa-traverser ) - [ - dup traverse-forward>> - [ [ 1+ ] change-current-index ] - [ [ 1- ] change-current-index ] if - dup current-state>> >>last-state - ] [ first ] bi* >>current-state ; + [ [ 1 + ] change-current-index ] + [ first ] bi* >>current-state ; : match-literal ( transition from-state table -- to-state/f ) transitions>> at at ; : match-class ( transition from-state table -- to-state/f ) transitions>> at* [ - [ drop class-member? ] assoc-with assoc-find [ nip ] [ drop ] if + '[ drop _ swap class-member? ] assoc-find [ nip ] [ drop ] if ] [ drop ] if ; : match-default ( transition from-state table -- to-state/f ) @@ -180,7 +83,6 @@ M: capture-group-off flag-action ( dfa-traverser flag -- ) [ dfa-table>> ] tri ; : do-match ( dfa-traverser -- dfa-traverser ) - dup process-flags dup match-done? [ dup setup-match match-transition [ increment-state do-match ] when* diff --git a/basis/regexp/utils/utils.factor b/basis/regexp/utils/utils.factor index af1b2fa1fb..d1266a6d98 100644 --- a/basis/regexp/utils/utils.factor +++ b/basis/regexp/utils/utils.factor @@ -12,47 +12,25 @@ IN: regexp.utils : while-changes ( obj quot pred -- obj' ) pick over call (while-changes) ; inline -: assoc-with ( param assoc quot -- assoc curry ) - swapd [ [ -rot ] dip call ] 2curry ; inline - -: insert-at ( value key hash -- ) - 2dup at* [ - 2nip push - ] [ - drop - [ dup vector? [ 1vector ] unless ] 2dip set-at - ] if ; - -: ?insert-at ( value key hash/f -- hash ) - [ H{ } clone ] unless* [ insert-at ] keep ; - ERROR: bad-octal number ; ERROR: bad-hex number ; : check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ; : check-hex ( hex -- hex ) dup number? [ bad-hex ] unless ; -: ascii? ( n -- ? ) 0 HEX: 7f between? ; -: octal-digit? ( n -- ? ) CHAR: 0 CHAR: 7 between? ; : decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ; : hex-digit? ( n -- ? ) - [ + { [ decimal-digit? ] [ CHAR: a CHAR: f between? ] [ CHAR: A CHAR: F between? ] - ] 1|| ; - -: control-char? ( n -- ? ) - [ - [ 0 HEX: 1f between? ] - [ HEX: 7f = ] - ] 1|| ; + } 1|| ; : punct? ( n -- ? ) "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ; : c-identifier-char? ( ch -- ? ) - [ [ alpha? ] [ CHAR: _ = ] ] 1|| ; + { [ alpha? ] [ CHAR: _ = ] } 1|| ; : java-blank? ( n -- ? ) { From 105ef28433925637e257b4e05a7faa7754c61270 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Mon, 16 Feb 2009 20:23:00 -0600 Subject: [PATCH 004/125] Rewriting regexp parser --- basis/regexp/nfa/nfa.factor | 60 +-- basis/regexp/parser/parser-tests.factor | 50 +-- basis/regexp/parser/parser.factor | 538 +++++++----------------- basis/regexp/regexp.factor | 5 +- basis/regexp/traversal/traversal.factor | 2 +- 5 files changed, 167 insertions(+), 488 deletions(-) diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index 44481454fc..c8ee1187bc 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -11,22 +11,10 @@ IN: regexp.nfa ERROR: feature-is-broken feature ; -SYMBOL: negation-mode -: negated? ( -- ? ) negation-mode get 0 or odd? ; +SYMBOL: negated? SINGLETON: eps -MIXIN: traversal-flag -SINGLETON: lookahead-on INSTANCE: lookahead-on traversal-flag -SINGLETON: lookahead-off INSTANCE: lookahead-off traversal-flag -SINGLETON: lookbehind-on INSTANCE: lookbehind-on traversal-flag -SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag -SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag -SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag -SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag -SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag -SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag - : options ( -- obj ) current-regexp get options>> ; : option? ( obj -- ? ) options key? ; @@ -53,7 +41,7 @@ GENERIC: nfa-node ( node -- ) s1 [ regexp next-state ] stack [ regexp stack>> ] table [ regexp nfa-table>> ] | - negated? [ + negated? get [ s0 f obj class make-transition table add-transition s0 s1 table add-transition ] [ @@ -62,10 +50,6 @@ GENERIC: nfa-node ( node -- ) s0 s1 2array stack push t s1 table final-states>> set-at ] ; -: add-traversal-flag ( flag -- ) - stack peek second - current-regexp get nfa-traversal-flags>> push-at ; - :: concatenate-nodes ( -- ) [let* | regexp [ current-regexp get ] stack [ regexp stack>> ] @@ -97,7 +81,7 @@ GENERIC: nfa-node ( node -- ) t s5 table final-states>> set-at s4 s5 2array stack push ] ; -M: kleene-star nfa-node ( node -- ) +M: star nfa-node ( node -- ) term>> nfa-node [let* | regexp [ current-regexp get ] stack [ regexp stack>> ] @@ -139,17 +123,12 @@ M: constant nfa-node ( node -- ) char>> literal-transition add-simple-entry ] if ; -M: epsilon nfa-node ( node -- ) - drop eps literal-transition add-simple-entry ; - M: word nfa-node ( node -- ) class-transition add-simple-entry ; M: any-char nfa-node ( node -- ) [ dotall option? ] dip any-char-no-nl ? class-transition add-simple-entry ; -! M: beginning-of-text nfa-node ( node -- ) ; - M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ; M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ; @@ -182,38 +161,6 @@ M: character-class-range nfa-node ( node -- ) class-transition add-simple-entry ] if ; -M: capture-group nfa-node ( node -- ) - term>> nfa-node ; - -M: non-capture-group nfa-node ( node -- ) - term>> nfa-node ; - -M: reluctant-kleene-star nfa-node ( node -- ) - term>> nfa-node ; - -M: negation nfa-node ( node -- ) - negation-mode inc - term>> nfa-node - negation-mode dec ; - -M: lookahead nfa-node ( node -- ) - "lookahead" feature-is-broken - eps literal-transition add-simple-entry - lookahead-on add-traversal-flag - term>> nfa-node - eps literal-transition add-simple-entry - lookahead-off add-traversal-flag - 2 [ concatenate-nodes ] times ; - -M: lookbehind nfa-node ( node -- ) - "lookbehind" feature-is-broken - eps literal-transition add-simple-entry - lookbehind-on add-traversal-flag - term>> nfa-node - eps literal-transition add-simple-entry - lookbehind-off add-traversal-flag - 2 [ concatenate-nodes ] times ; - M: option nfa-node ( node -- ) [ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if eps literal-transition add-simple-entry ; @@ -221,7 +168,6 @@ M: option nfa-node ( node -- ) : construct-nfa ( regexp -- ) [ reset-regexp - negation-mode off [ current-regexp set ] [ parse-tree>> nfa-node ] [ set-start-state ] tri diff --git a/basis/regexp/parser/parser-tests.factor b/basis/regexp/parser/parser-tests.factor index fe4d2f1d1a..d606015f61 100644 --- a/basis/regexp/parser/parser-tests.factor +++ b/basis/regexp/parser/parser-tests.factor @@ -1,34 +1,24 @@ -USING: kernel tools.test regexp.backend regexp ; -IN: regexp.parser +USING: kernel tools.test regexp.parser fry sequences ; +IN: regexp.parser.tests -: test-regexp ( string -- ) - default-regexp parse-regexp ; +: regexp-parses ( string -- ) + [ [ ] ] dip '[ _ parse-regexp drop ] unit-test ; -! [ "(" ] [ unmatched-parentheses? ] must-fail-with +: regexp-fails ( string -- ) + '[ _ parse-regexp ] must-fail ; -[ ] [ "a|b" test-regexp ] unit-test -[ ] [ "a.b" test-regexp ] unit-test -[ ] [ "a|b|c" test-regexp ] unit-test -[ ] [ "abc|b" test-regexp ] unit-test -[ ] [ "a|bcd" test-regexp ] unit-test -[ ] [ "a|(b)" test-regexp ] unit-test -[ ] [ "(a)|b" test-regexp ] unit-test -[ ] [ "(a|b)" test-regexp ] unit-test -[ ] [ "((a)|(b))" test-regexp ] unit-test +{ + "a|b" "a.b" "a|b|c" "abc|b" "a|bcd" "a|(b)" "(?-i:a)" "||" + "(a)|b" "(a|b)" "((a)|(b))" "(?:a)" "(?i:a)" "|b" "b|" + "[abc]" "[a-c]" "[^a-c]" "[^]]" "[]a]" "[[]" "[]-a]" "[a-]" "[-]" + "[--a]" "foo*" "(foo)*" "(a|b)|c" "(foo){2,3}" "(foo){2,}" + "(foo){2}" "{2,3}" "{," "{,}" "}" "foo}" "[^]-a]" "[^-]a]" + "[a-]" "[^a-]" "[^a-]" "a{,2}" "(?#foobar)" + "\\p{Space}" "\\t" "\\[" "[\\]]" "\\P{Space}" + "\\ueeee" "\\0333" "\\xff" "\\\\" "\\w" +} [ regexp-parses ] each -[ ] [ "(?:a)" test-regexp ] unit-test -[ ] [ "(?i:a)" test-regexp ] unit-test -[ ] [ "(?-i:a)" test-regexp ] unit-test -[ "(?z:a)" test-regexp ] [ unknown-regexp-option? ] must-fail-with -[ "(?-z:a)" test-regexp ] [ unknown-regexp-option? ] must-fail-with - -[ ] [ "(?=a)" test-regexp ] unit-test - -[ ] [ "[abc]" test-regexp ] unit-test -[ ] [ "[a-c]" test-regexp ] unit-test -[ ] [ "[^a-c]" test-regexp ] unit-test -[ "[^]" test-regexp ] must-fail - -[ ] [ "|b" test-regexp ] unit-test -[ ] [ "b|" test-regexp ] unit-test -[ ] [ "||" test-regexp ] unit-test +{ + "[^]" "[]" "a{foo}" "a{,}" "a{}" "(?)" "\\p{foo}" "\\P{foo}" + "\\ueeeg" "\\0339" "\\xfg" +} [ regexp-fails ] each diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index 377535eccd..65965fdeb9 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -1,437 +1,183 @@ -! Copyright (C) 2008 Doug Coleman. +! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors arrays assocs combinators io io.streams.string -kernel math math.parser namespaces sets -quotations sequences splitting vectors math.order -strings regexp.backend regexp.utils -unicode.case unicode.categories words locals regexp.classes ; +USING: peg.ebnf kernel math.parser sequences assocs arrays +combinators regexp.classes strings splitting peg locals ; IN: regexp.parser -FROM: math.ranges => [a,b] ; +TUPLE: range from to ; +TUPLE: char-class ranges ; +TUPLE: primitive-class class ; +TUPLE: not-char-class ranges ; +TUPLE: not-primitive-class class ; +TUPLE: from-to n m ; +TUPLE: at-least n ; +TUPLE: up-to n ; +TUPLE: exactly n ; +TUPLE: times expression times ; +TUPLE: concatenation seq ; +TUPLE: alternation seq ; +TUPLE: maybe term ; +TUPLE: star term ; +TUPLE: plus term ; +TUPLE: with-options tree options ; +TUPLE: ast ^? $? tree ; +SINGLETON: any-char -TUPLE: concatenation seq ; INSTANCE: concatenation node -TUPLE: alternation seq ; INSTANCE: alternation node -TUPLE: kleene-star term ; INSTANCE: kleene-star node +: allowed-char? ( ch -- ? ) + ".()|[*+?" member? not ; -! !!!!!!!! -TUPLE: possessive-question term ; INSTANCE: possessive-question node -TUPLE: possessive-kleene-star term ; INSTANCE: possessive-kleene-star node +ERROR: bad-number ; -! !!!!!!!! -TUPLE: reluctant-question term ; INSTANCE: reluctant-question node -TUPLE: reluctant-kleene-star term ; INSTANCE: reluctant-kleene-star node +: ensure-number ( n -- n ) + [ bad-number ] unless* ; -TUPLE: negation term ; INSTANCE: negation node -TUPLE: constant char ; INSTANCE: constant node -TUPLE: range from to ; INSTANCE: range node +:: at-error ( key assoc quot: ( key -- replacement ) -- value ) + key assoc at* [ drop key quot call ] unless ; inline -MIXIN: parentheses-group -TUPLE: lookahead term ; INSTANCE: lookahead node -INSTANCE: lookahead parentheses-group -TUPLE: lookbehind term ; INSTANCE: lookbehind node -INSTANCE: lookbehind parentheses-group -TUPLE: capture-group term ; INSTANCE: capture-group node -INSTANCE: capture-group parentheses-group -TUPLE: non-capture-group term ; INSTANCE: non-capture-group node -INSTANCE: non-capture-group parentheses-group -TUPLE: independent-group term ; INSTANCE: independent-group node ! atomic group -INSTANCE: independent-group parentheses-group -TUPLE: comment-group term ; INSTANCE: comment-group node -INSTANCE: comment-group parentheses-group +ERROR: bad-class name ; -SINGLETON: epsilon INSTANCE: epsilon node +: name>class ( name -- class ) + { + { "Lower" letter-class } + { "Upper" LETTER-class } + { "Alpha" Letter-class } + { "ASCII" ascii-class } + { "Digit" digit-class } + { "Alnum" alpha-class } + { "Punct" punctuation-class } + { "Graph" java-printable-class } + { "Print" java-printable-class } + { "Blank" non-newline-blank-class } + { "Cntrl" control-character-class } + { "XDigit" hex-digit-class } + { "Space" java-blank-class } + ! TODO: unicode-character-class + } [ bad-class ] at-error ; -TUPLE: option option on? ; INSTANCE: option node +: lookup-escape ( char -- ast ) + { + { CHAR: t [ CHAR: \t ] } + { CHAR: n [ CHAR: \n ] } + { CHAR: r [ CHAR: \r ] } + { CHAR: f [ HEX: c ] } + { CHAR: a [ HEX: 7 ] } + { CHAR: e [ HEX: 1b ] } + { CHAR: \\ [ CHAR: \\ ] } + + { CHAR: w [ c-identifier-class primitive-class boa ] } + { CHAR: W [ c-identifier-class not-primitive-class boa ] } + { CHAR: s [ java-blank-class primitive-class boa ] } + { CHAR: S [ java-blank-class not-primitive-class boa ] } + { CHAR: d [ digit-class primitive-class boa ] } + { CHAR: D [ digit-class not-primitive-class boa ] } + + [ ] + } case ; + +TUPLE: options on off ; SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case reversed-regexp ; -SINGLETONS: beginning-of-character-class end-of-character-class -left-parenthesis pipe caret dash ; - -: push1 ( obj -- ) input-stream get stream>> push ; -: peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ; -: pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ; -: drop1 ( -- ) read1 drop ; - -: stack ( -- obj ) current-regexp get stack>> ; -: change-whole-stack ( quot -- ) - current-regexp get - [ stack>> swap call ] keep (>>stack) ; inline -: push-stack ( obj -- ) stack push ; -: pop-stack ( -- obj ) stack pop ; -: cut-out ( vector n -- vector' vector ) cut rest ; -ERROR: cut-stack-error ; -: cut-stack ( obj vector -- vector' vector ) - [ nip ] [ last-index ] 2bi [ cut-stack-error ] unless* cut-out swap ; - -: ( obj -- kleene ) possessive-kleene-star boa ; -: ( obj -- kleene ) reluctant-kleene-star boa ; -: ( obj -- kleene ) possessive-question boa ; -: ( obj -- kleene ) reluctant-question boa ; - -: ( obj -- negation ) negation boa ; -: ( seq -- concatenation ) - >vector [ epsilon ] [ concatenation boa ] if-empty ; -: ( seq -- alternation ) >vector alternation boa ; -: ( obj -- capture-group ) capture-group boa ; -: ( obj -- kleene-star ) kleene-star boa ; -: ( obj -- constant ) constant boa ; - -: first|concatenation ( seq -- first/concatenation ) - dup length 1 = [ first ] [ ] if ; - -: first|alternation ( seq -- first/alternation ) - dup length 1 = [ first ] [ ] if ; - -: ( from to -- obj ) - 2dup < - [ character-class-range boa ] [ 2drop unmatchable-class ] if ; - -ERROR: unmatched-parentheses ; - -ERROR: unknown-regexp-option option ; +: options-assoc ( -- assoc ) + H{ + { CHAR: i case-insensitive } + { CHAR: d unix-lines } + { CHAR: m multiline } + { CHAR: n multiline } + { CHAR: r reversed-regexp } + { CHAR: s dotall } + { CHAR: u unicode-case } + { CHAR: x comments } + } ; : ch>option ( ch -- singleton ) - { - { CHAR: i [ case-insensitive ] } - { CHAR: d [ unix-lines ] } - { CHAR: m [ multiline ] } - { CHAR: n [ multiline ] } - { CHAR: r [ reversed-regexp ] } - { CHAR: s [ dotall ] } - { CHAR: u [ unicode-case ] } - { CHAR: x [ comments ] } - [ unknown-regexp-option ] - } case ; + options-assoc at ; : option>ch ( option -- string ) - { - { case-insensitive [ CHAR: i ] } - { multiline [ CHAR: m ] } - { reversed-regexp [ CHAR: r ] } - { dotall [ CHAR: s ] } - [ unknown-regexp-option ] - } case ; + options-assoc value-at ; -: toggle-option ( ch ? -- ) - [ ch>option ] dip option boa push-stack ; +: parse-options ( on off -- options ) + [ [ ch>option ] map ] bi@ options boa ; -: (parse-options) ( string ? -- ) [ toggle-option ] curry each ; +! TODO: make range syntax better (negation, and, etc), +! add syntax for various parenthized things, +! add greedy and nongreedy forms of matching +! (once it's all implemented) -: parse-options ( string -- ) - "-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ; +EBNF: (parse-regexp) -ERROR: bad-special-group string ; +CharacterInBracket = !("}") Character -DEFER: (parse-regexp) -: nested-parse-regexp ( token ? -- ) - [ push-stack (parse-regexp) pop-stack ] dip - [ ] when pop-stack new swap >>term push-stack ; +Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class primitive-class boa ]] + | "P{" CharacterInBracket*:s "}" => [[ s >string name>class not-primitive-class boa ]] + | "u" Character:a Character:b Character:c Character:d + => [[ { a b c d } hex> ensure-number ]] + | "x" Character:a Character:b + => [[ { a b } hex> ensure-number ]] + | "0" Character:a Character:b Character:c + => [[ { a b c } oct> ensure-number ]] + | . => [[ lookup-escape ]] -! non-capturing groups -: (parse-special-group) ( -- ) - read1 { - { [ dup CHAR: # = ] ! comment - [ drop comment-group f nested-parse-regexp pop-stack drop ] } - { [ dup CHAR: : = ] - [ drop non-capture-group f nested-parse-regexp ] } - { [ dup CHAR: = = ] - [ drop lookahead f nested-parse-regexp ] } - { [ dup CHAR: ! = ] - [ drop lookahead t nested-parse-regexp ] } - { [ dup CHAR: > = ] - [ drop non-capture-group f nested-parse-regexp ] } - { [ dup CHAR: < = peek1 CHAR: = = and ] - [ drop drop1 lookbehind f nested-parse-regexp ] } - { [ dup CHAR: < = peek1 CHAR: ! = and ] - [ drop drop1 lookbehind t nested-parse-regexp ] } - [ - ":)" read-until - [ swap prefix ] dip - { - { CHAR: : [ parse-options non-capture-group f nested-parse-regexp ] } - { CHAR: ) [ parse-options ] } - [ drop bad-special-group ] - } case - ] - } cond ; +Character = "\\" Escape:e => [[ e ]] + | . ?[ allowed-char? ]? -: handle-left-parenthesis ( -- ) - peek1 CHAR: ? = - [ drop1 (parse-special-group) ] - [ capture-group f nested-parse-regexp ] if ; +AnyRangeCharacter = Character | "[" -: handle-dot ( -- ) any-char push-stack ; -: handle-pipe ( -- ) pipe push-stack ; -: (handle-star) ( obj -- kleene-star ) - peek1 { - { CHAR: + [ drop1 ] } - { CHAR: ? [ drop1 ] } - [ drop ] - } case ; -: handle-star ( -- ) stack pop (handle-star) push-stack ; -: handle-question ( -- ) - stack pop peek1 { - { CHAR: + [ drop1 ] } - { CHAR: ? [ drop1 ] } - [ drop epsilon 2array ] - } case push-stack ; -: handle-plus ( -- ) - stack pop dup (handle-star) - 2array push-stack ; +RangeCharacter = !("]") AnyRangeCharacter -ERROR: unmatched-brace ; -: parse-repetition ( -- start finish ? ) - "}" read-until [ unmatched-brace ] unless - [ "," split1 [ string>number ] bi@ ] - [ CHAR: , swap index >boolean ] bi ; +Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]] + | RangeCharacter -: replicate/concatenate ( n obj -- obj' ) - over zero? [ 2drop epsilon ] - [ first|concatenation ] if ; +StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]] + | AnyRangeCharacter -: exactly-n ( n -- ) - stack pop replicate/concatenate push-stack ; +Ranges = StartRange:s Range*:r => [[ r s prefix ]] -: at-least-n ( n -- ) - stack pop - [ replicate/concatenate ] keep - 2array push-stack ; +CharClass = "^" Ranges:e => [[ e not-char-class boa ]] + | Ranges:e => [[ e char-class boa ]] -: at-most-n ( n -- ) - 1+ - stack pop - [ replicate/concatenate ] curry map push-stack ; +Options = [idmsux]* -: from-m-to-n ( m n -- ) - [a,b] - stack pop - [ replicate/concatenate ] curry map - push-stack ; +Parenthized = "?:" Alternation:a => [[ a ]] + | "?" Options:on "-"? Options:off ":" Alternation:a + => [[ a on off parse-options with-options boa ]] + | "?#" [^)]* => [[ ignore ]] + | Alternation -ERROR: invalid-range a b ; +Element = "(" Parenthized:p ")" => [[ p ]] + | "[" CharClass:r "]" => [[ r ]] + | ".":d => [[ any-char ]] + | Character -: handle-left-brace ( -- ) - parse-repetition - [ 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ ] dip - [ - 2dup and [ from-m-to-n ] - [ [ nip at-most-n ] [ at-least-n ] if* ] if - ] [ drop 0 max exactly-n ] if ; +Number = (!(","|"}").)* => [[ string>number ensure-number ]] -: handle-front-anchor ( -- ) beginning-of-line push-stack ; -: handle-back-anchor ( -- ) end-of-line push-stack ; +Times = "," Number:n "}" => [[ n up-to boa ]] + | Number:n ",}" => [[ n at-least boa ]] + | Number:n "}" => [[ n exactly boa ]] + | "}" => [[ bad-number ]] + | Number:n "," Number:m "}" => [[ n m from-to boa ]] -ERROR: bad-character-class obj ; -ERROR: expected-posix-class ; +Repeated = Element:e "{" Times:t => [[ e t times boa ]] + | Element:e "?" => [[ e maybe boa ]] + | Element:e "*" => [[ e star boa ]] + | Element:e "+" => [[ e plus boa ]] + | Element -: parse-posix-class ( -- obj ) - read1 CHAR: { = [ expected-posix-class ] unless - "}" read-until [ bad-character-class ] unless - { - { "Lower" [ letter-class ] } - { "Upper" [ LETTER-class ] } - { "Alpha" [ Letter-class ] } - { "ASCII" [ ascii-class ] } - { "Digit" [ digit-class ] } - { "Alnum" [ alpha-class ] } - { "Punct" [ punctuation-class ] } - { "Graph" [ java-printable-class ] } - { "Print" [ java-printable-class ] } - { "Blank" [ non-newline-blank-class ] } - { "Cntrl" [ control-character-class ] } - { "XDigit" [ hex-digit-class ] } - { "Space" [ java-blank-class ] } - ! TODO: unicode-character-class, fallthrough in unicode is bad-char-clss - [ bad-character-class ] - } case ; +Concatenation = Repeated*:r => [[ r concatenation boa ]] -: parse-octal ( -- n ) 3 read oct> check-octal ; -: parse-short-hex ( -- n ) 2 read hex> check-hex ; -: parse-long-hex ( -- n ) 6 read hex> check-hex ; -: parse-control-character ( -- n ) read1 ; +Alternation = Concatenation:c ("|" Concatenation)*:a + => [[ a empty? [ c ] [ a values c prefix alternation boa ] if ]] -ERROR: bad-escaped-literals seq ; +End = !(.) -: parse-til-E ( -- obj ) - "\\E" read-until [ bad-escaped-literals ] unless ; - -:: (parse-escaped-literals) ( quot: ( obj -- obj' ) -- obj ) - parse-til-E - drop1 - [ epsilon ] [ - quot call [ ] V{ } map-as - first|concatenation - ] if-empty ; inline +Main = Alternation End +;EBNF -: parse-escaped-literals ( -- obj ) - [ ] (parse-escaped-literals) ; - -: lower-case-literals ( -- obj ) - [ >lower ] (parse-escaped-literals) ; - -: upper-case-literals ( -- obj ) - [ >upper ] (parse-escaped-literals) ; - -: parse-escaped ( -- obj ) - read1 - { - { CHAR: t [ CHAR: \t ] } - { CHAR: n [ CHAR: \n ] } - { CHAR: r [ CHAR: \r ] } - { CHAR: f [ HEX: c ] } - { CHAR: a [ HEX: 7 ] } - { CHAR: e [ HEX: 1b ] } - - { CHAR: w [ c-identifier-class ] } - { CHAR: W [ c-identifier-class ] } - { CHAR: s [ java-blank-class ] } - { CHAR: S [ java-blank-class ] } - { CHAR: d [ digit-class ] } - { CHAR: D [ digit-class ] } - - { CHAR: p [ parse-posix-class ] } - { CHAR: P [ parse-posix-class ] } - { CHAR: x [ parse-short-hex ] } - { CHAR: u [ parse-long-hex ] } - { CHAR: 0 [ parse-octal ] } - { CHAR: c [ parse-control-character ] } - - { CHAR: Q [ parse-escaped-literals ] } - - ! { CHAR: b [ word-boundary-class ] } - ! { CHAR: B [ word-boundary-class ] } - ! { CHAR: A [ handle-beginning-of-input ] } - ! { CHAR: z [ handle-end-of-input ] } - - ! { CHAR: Z [ handle-end-of-input ] } ! plus a final terminator - - ! m//g mode - ! { CHAR: G [ end of previous match ] } - - ! Group capture - ! { CHAR: 1 [ CHAR: 1 ] } - ! { CHAR: 2 [ CHAR: 2 ] } - ! { CHAR: 3 [ CHAR: 3 ] } - ! { CHAR: 4 [ CHAR: 4 ] } - ! { CHAR: 5 [ CHAR: 5 ] } - ! { CHAR: 6 [ CHAR: 6 ] } - ! { CHAR: 7 [ CHAR: 7 ] } - ! { CHAR: 8 [ CHAR: 8 ] } - ! { CHAR: 9 [ CHAR: 9 ] } - - ! Perl extensions - ! can't do \l and \u because \u is already a 4-hex - { CHAR: L [ lower-case-literals ] } - { CHAR: U [ upper-case-literals ] } - - [ ] - } case ; - -: handle-escape ( -- ) parse-escaped push-stack ; - -: handle-dash ( vector -- vector' ) - H{ { dash CHAR: - } } substitute ; - -: character-class>alternation ( seq -- alternation ) - [ dup number? [ ] when ] map first|alternation ; - -: handle-caret ( vector -- vector' ) - dup [ length 2 >= ] [ first caret eq? ] bi and [ - rest-slice character-class>alternation - ] [ - character-class>alternation - ] if ; - -: make-character-class ( -- character-class ) - [ beginning-of-character-class swap cut-stack ] change-whole-stack - handle-dash handle-caret ; - -: apply-dash ( -- ) - stack [ pop3 nip ] keep push ; - -: apply-dash? ( -- ? ) - stack dup length 3 >= - [ [ length 2 - ] keep nth dash eq? ] [ drop f ] if ; - -ERROR: empty-negated-character-class ; -DEFER: handle-left-bracket -: (parse-character-class) ( -- ) - read1 [ empty-negated-character-class ] unless* { - { CHAR: [ [ handle-left-bracket t ] } - { CHAR: ] [ make-character-class push-stack f ] } - { CHAR: - [ dash push-stack t ] } - { CHAR: \ [ parse-escaped push-stack t ] } - [ push-stack apply-dash? [ apply-dash ] when t ] - } case - [ (parse-character-class) ] when ; - -: push-constant ( ch -- ) push-stack ; - -: parse-character-class-second ( -- ) - read1 { - { CHAR: [ [ CHAR: [ push-constant ] } - { CHAR: ] [ CHAR: ] push-constant ] } - { CHAR: - [ CHAR: - push-constant ] } - [ push1 ] - } case ; - -: parse-character-class-first ( -- ) - read1 { - { CHAR: ^ [ caret push-stack parse-character-class-second ] } - { CHAR: [ [ CHAR: [ push-constant ] } - { CHAR: ] [ CHAR: ] push-constant ] } - { CHAR: - [ CHAR: - push-constant ] } - [ push1 ] - } case ; - -: handle-left-bracket ( -- ) - beginning-of-character-class push-stack - parse-character-class-first (parse-character-class) ; - -: finish-regexp-parse ( stack -- obj ) - { pipe } split - [ first|concatenation ] map first|alternation ; - -: handle-right-parenthesis ( -- ) - stack dup [ parentheses-group "members" word-prop member? ] find-last - -rot cut rest - [ [ push ] keep current-regexp get (>>stack) ] - [ finish-regexp-parse push-stack ] bi* ; - -: parse-regexp-token ( token -- ? ) - { - { CHAR: ( [ handle-left-parenthesis t ] } ! handle (?..) at beginning? - { CHAR: ) [ handle-right-parenthesis f ] } - { CHAR: . [ handle-dot t ] } - { CHAR: | [ handle-pipe t ] } - { CHAR: ? [ handle-question t ] } - { CHAR: * [ handle-star t ] } - { CHAR: + [ handle-plus t ] } - { CHAR: { [ handle-left-brace t ] } - { CHAR: [ [ handle-left-bracket t ] } - { CHAR: \ [ handle-escape t ] } - [ - dup CHAR: $ = peek1 f = and - [ drop handle-back-anchor f ] - [ push-constant t ] if - ] - } case ; - -: (parse-regexp) ( -- ) - read1 [ parse-regexp-token [ (parse-regexp) ] when ] when* ; - -: parse-regexp-beginning ( -- ) - peek1 CHAR: ^ = [ drop1 handle-front-anchor ] when ; - -: parse-regexp ( regexp -- ) - dup current-regexp [ - raw>> [ - [ - parse-regexp-beginning (parse-regexp) - ] with-input-stream - ] unless-empty - current-regexp get [ finish-regexp-parse ] change-stack - dup stack>> >>parse-tree drop - ] with-variable ; +: parse-regexp ( string -- regexp ) + ! Hack because I want $ allowable in regexps, + ! but with special behavior at the end + ! This fails if the regexp is stupid, though... + dup first CHAR: ^ = tuck [ rest ] when + dup peek CHAR: $ = tuck [ but-last ] when + (parse-regexp) ast boa ; diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index 86f978373b..62ebaab502 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -21,7 +21,7 @@ IN: regexp : construct-regexp ( regexp -- regexp' ) { - [ parse-regexp ] + [ dup raw>> parse-regexp >>parse-tree drop ] [ construct-nfa ] [ construct-dfa ] [ ] @@ -33,9 +33,6 @@ IN: regexp : match ( string regexp -- slice/f ) (match) return-match ; -: match* ( string regexp -- slice/f captured-groups ) - (match) [ return-match ] [ captured-groups>> ] bi ; - : matches? ( string regexp -- ? ) dupd match [ [ length ] bi@ = ] [ drop f ] if* ; diff --git a/basis/regexp/traversal/traversal.factor b/basis/regexp/traversal/traversal.factor index d0a76a6ddc..394bfe0d52 100644 --- a/basis/regexp/traversal/traversal.factor +++ b/basis/regexp/traversal/traversal.factor @@ -68,7 +68,7 @@ TUPLE: dfa-traverser : match-class ( transition from-state table -- to-state/f ) transitions>> at* [ - '[ drop _ swap class-member? ] assoc-find [ nip ] [ drop ] if + swap '[ drop _ swap class-member? ] assoc-find spin ? ] [ drop ] if ; : match-default ( transition from-state table -- to-state/f ) From b8845cb87efdf316f19902c2b94d37fbc4e5e19c Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Wed, 18 Feb 2009 12:27:07 -0600 Subject: [PATCH 005/125] Almost done with regexp cleanup --- basis/regexp/ast/ast.factor | 53 +++++++ basis/regexp/backend/backend.factor | 27 ---- basis/regexp/classes/classes.factor | 32 +++- basis/regexp/dfa/dfa.factor | 103 ++++++------- basis/regexp/nfa/nfa.factor | 141 ++++++++++-------- basis/regexp/parser/parser.factor | 111 ++++++-------- basis/regexp/regexp-docs.factor | 2 +- basis/regexp/regexp-tests.factor | 22 +-- basis/regexp/regexp.factor | 44 ++---- .../transition-tables.factor | 2 +- basis/regexp/traversal/traversal.factor | 7 +- basis/regexp/utils/utils-tests.factor | 4 - basis/regexp/utils/utils.factor | 42 ------ 13 files changed, 271 insertions(+), 319 deletions(-) create mode 100644 basis/regexp/ast/ast.factor delete mode 100644 basis/regexp/backend/backend.factor delete mode 100644 basis/regexp/utils/utils-tests.factor delete mode 100644 basis/regexp/utils/utils.factor diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor new file mode 100644 index 0000000000..d018fa3a36 --- /dev/null +++ b/basis/regexp/ast/ast.factor @@ -0,0 +1,53 @@ +! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: kernel arrays accessors fry sequences ; +FROM: math.ranges => [a,b] ; +IN: regexp.ast + +TUPLE: primitive-class class ; +C: primitive-class + +TUPLE: negation term ; +C: negation + +TUPLE: from-to n m ; +C: from-to + +TUPLE: at-least n ; +C: at-least + +TUPLE: concatenation seq ; +C: concatenation + +TUPLE: alternation seq ; +C: alternation + +TUPLE: star term ; +C: star + +TUPLE: with-options tree options ; +C: with-options + +TUPLE: options on off ; +C: options + +SINGLETONS: unix-lines dotall multiline comments case-insensitive +unicode-case reversed-regexp ; + +: ( term -- term' ) + f 2array ; + +: ( term -- term' ) + dup 2array ; + +: repetition ( n term -- term' ) + ; + +GENERIC: ( term times -- term' ) +M: at-least + n>> swap [ repetition ] [ ] bi 2array ; +M: from-to + [ n>> ] [ m>> ] bi [a,b] swap '[ _ repetition ] map ; + +: char-class ( ranges ? -- term ) + [ ] dip [ ] when ; diff --git a/basis/regexp/backend/backend.factor b/basis/regexp/backend/backend.factor deleted file mode 100644 index 5eff0579c8..0000000000 --- a/basis/regexp/backend/backend.factor +++ /dev/null @@ -1,27 +0,0 @@ -! Copyright (C) 2008 Doug Coleman. -! See http://factorcode.org/license.txt for BSD license. -USING: accessors hashtables kernel math vectors ; -IN: regexp.backend - -TUPLE: regexp - raw - { options hashtable } - stack - parse-tree - nfa-table - dfa-table - minimized-table - matchers - { nfa-traversal-flags hashtable } - { dfa-traversal-flags hashtable } - { state integer } - { new-states vector } - { visited-states hashtable } ; - -: reset-regexp ( regexp -- regexp ) - 0 >>state - V{ } clone >>stack - V{ } clone >>new-states - H{ } clone >>visited-states ; - -SYMBOL: current-regexp diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index 94d1b78d59..7109e8bcbd 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -1,9 +1,31 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors kernel math math.order words regexp.utils -ascii unicode.categories combinators.short-circuit ; +USING: accessors kernel math math.order words +ascii unicode.categories combinators.short-circuit sequences ; IN: regexp.classes +: punct? ( ch -- ? ) + "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ; + +: c-identifier-char? ( ch -- ? ) + { [ alpha? ] [ CHAR: _ = ] } 1|| ; + +: java-blank? ( ch -- ? ) + { + CHAR: \s CHAR: \t CHAR: \n + HEX: b HEX: 7 CHAR: \r + } member? ; + +: java-printable? ( ch -- ? ) + [ [ alpha? ] [ punct? ] ] 1|| ; + +: hex-digit? ( ch -- ? ) + { + [ CHAR: A CHAR: F between? ] + [ CHAR: a CHAR: f between? ] + [ CHAR: 0 CHAR: 9 between? ] + } 1|| ; + SINGLETONS: any-char any-char-no-nl letter-class LETTER-class Letter-class digit-class alpha-class non-newline-blank-class @@ -14,8 +36,8 @@ unmatchable-class terminator-class word-boundary-class ; SINGLETONS: beginning-of-input beginning-of-line end-of-input end-of-line ; -MIXIN: node -TUPLE: character-class-range from to ; INSTANCE: character-class-range node +TUPLE: range from to ; +C: range GENERIC: class-member? ( obj class -- ? ) @@ -23,7 +45,7 @@ M: t class-member? ( obj class -- ? ) 2drop f ; M: integer class-member? ( obj class -- ? ) 2drop f ; -M: character-class-range class-member? ( obj class -- ? ) +M: range class-member? ( obj class -- ? ) [ from>> ] [ to>> ] bi between? ; M: any-char class-member? ( obj class -- ? ) diff --git a/basis/regexp/dfa/dfa.factor b/basis/regexp/dfa/dfa.factor index 549669cab7..4dd3713fc2 100644 --- a/basis/regexp/dfa/dfa.factor +++ b/basis/regexp/dfa/dfa.factor @@ -2,83 +2,74 @@ ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays assocs combinators fry kernel locals math math.order regexp.nfa regexp.transition-tables sequences -sets sorting vectors regexp.utils sequences.deep ; +sets sorting vectors sequences.deep ; USING: io prettyprint threads ; IN: regexp.dfa -: find-delta ( states transition regexp -- new-states ) - nfa-table>> transitions>> - rot [ swap at at ] with with gather sift ; +: (while-changes) ( obj quot: ( obj -- obj' ) pred: ( obj -- <=> ) pred-ret -- obj ) + [ [ dup slip ] dip pick over call ] dip dupd = + [ 3drop ] [ (while-changes) ] if ; inline recursive -: (find-epsilon-closure) ( states regexp -- new-states ) +: while-changes ( obj quot pred -- obj' ) + 3dup nip call (while-changes) ; inline + +: find-delta ( states transition nfa -- new-states ) + transitions>> '[ _ swap _ at at ] gather sift ; + +: (find-epsilon-closure) ( states nfa -- new-states ) eps swap find-delta ; -: find-epsilon-closure ( states regexp -- new-states ) +: find-epsilon-closure ( states nfa -- new-states ) '[ dup _ (find-epsilon-closure) union ] [ length ] while-changes natural-sort ; -: find-closure ( states transition regexp -- new-states ) - [ find-delta ] 2keep nip find-epsilon-closure ; +: find-closure ( states transition nfa -- new-states ) + [ find-delta ] keep find-epsilon-closure ; -: find-start-state ( regexp -- state ) - [ nfa-table>> start-state>> 1vector ] keep find-epsilon-closure ; +: find-start-state ( nfa -- state ) + [ start-state>> 1vector ] keep find-epsilon-closure ; -: find-transitions ( seq1 regexp -- seq2 ) - nfa-table>> transitions>> - [ at keys ] curry gather +: find-transitions ( dfa-state nfa -- next-dfa-state ) + transitions>> + '[ _ at keys ] gather eps swap remove ; -: add-todo-state ( state regexp -- ) - 2dup visited-states>> key? [ - 2drop - ] [ - [ visited-states>> conjoin ] - [ new-states>> push ] 2bi +: add-todo-state ( state visited-states new-states -- ) + 3dup drop key? [ 3drop ] [ + [ conjoin ] [ push ] bi-curry* bi ] if ; -: new-transitions ( regexp -- ) - dup new-states>> [ - drop - ] [ - dupd pop dup pick find-transitions rot - [ - [ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep - [ swapd transition make-transition ] dip - dfa-table>> add-transition - ] curry with each - new-transitions +:: new-transitions ( nfa dfa new-states visited-states -- nfa dfa ) + new-states [ nfa dfa ] [ + new-states pop :> state + state nfa-table find-transitions + [| trans | + state trans nfa find-closure :> new-state + state visited-states new-state add-todo-state + state new-state trans transition make-transition dfa add-transition + ] each + nfa dfa new-states visited-states new-transitions ] if-empty ; : states ( hashtable -- array ) [ keys ] [ values [ values concat ] map concat append ] bi ; -: set-final-states ( regexp -- ) - dup - [ nfa-table>> final-states>> keys ] - [ dfa-table>> transitions>> states ] bi - [ intersects? ] with filter - - swap dfa-table>> final-states>> +: set-final-states ( nfa dfa -- ) + [ + [ final-states>> keys ] + [ transitions>> states ] bi* + [ intersects? ] with filter + ] [ final-states>> ] bi [ conjoin ] curry each ; -: set-initial-state ( regexp -- ) - dup - [ dfa-table>> ] [ find-start-state ] bi - [ >>start-state drop ] keep - 1vector >>new-states drop ; +: initialize-dfa ( nfa -- dfa ) + + swap find-start-state >>start-state ; -: set-traversal-flags ( regexp -- ) - dup - [ nfa-traversal-flags>> ] - [ dfa-table>> transitions>> keys ] bi - [ [ nip ] [ [ swap at ] with map concat ] 2bi ] with H{ } map>assoc - >>dfa-traversal-flags drop ; - -: construct-dfa ( regexp -- ) - { - [ set-initial-state ] - [ new-transitions ] - [ set-final-states ] - [ set-traversal-flags ] - } cleave ; +: construct-dfa ( nfa -- dfa ) + dup initialize-dfa + dup start-state>> 1vector + H{ } clone + new-transitions + [ set-final-states ] keep ; diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index c8ee1187bc..4ad5e0314d 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -1,9 +1,10 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors arrays assocs grouping kernel regexp.backend -locals math namespaces regexp.parser sequences fry quotations -math.order math.ranges vectors unicode.categories regexp.utils -regexp.transition-tables words sets regexp.classes unicode.case.private ; +USING: accessors arrays assocs grouping kernel +locals math namespaces sequences fry quotations +math.order math.ranges vectors unicode.categories +regexp.transition-tables words sets +unicode.case.private regexp.ast regexp.classes ; ! This uses unicode.case.private for ch>upper and ch>lower ! but case-insensitive matching should be done by case-folding everything ! before processing starts @@ -13,34 +14,49 @@ ERROR: feature-is-broken feature ; SYMBOL: negated? +: negate ( -- ) + negated? [ not ] change ; + SINGLETON: eps -: options ( -- obj ) current-regexp get options>> ; +SYMBOL: option-stack -: option? ( obj -- ? ) options key? ; +SYMBOL: combine-stack -: option-on ( obj -- ) options conjoin ; +SYMBOL: state -: option-off ( obj -- ) options delete-at ; +: next-state ( -- state ) + state [ get ] [ inc ] bi ; -: next-state ( regexp -- state ) - [ state>> ] [ [ 1+ ] change-state drop ] bi ; +SYMBOL: nfa-table -: set-start-state ( regexp -- ) - dup stack>> [ - drop - ] [ - [ nfa-table>> ] [ pop first ] bi* >>start-state drop - ] if-empty ; +: set-each ( keys value hashtable -- ) + '[ _ swap _ set-at ] each ; + +: options>hash ( options -- hashtable ) + H{ } clone [ + [ [ on>> t ] dip set-each ] + [ [ off>> f ] dip set-each ] 2bi + ] keep ; + +: using-options ( options quot -- ) + [ options>hash option-stack [ ?push ] change ] dip + call option-stack get pop* ; inline + +: option? ( obj -- ? ) + option-stack get assoc-stack ; + +: set-start-state ( -- nfa-table ) + nfa-table get + combine-stack get pop first >>start-state ; GENERIC: nfa-node ( node -- ) :: add-simple-entry ( obj class -- ) - [let* | regexp [ current-regexp get ] - s0 [ regexp next-state ] - s1 [ regexp next-state ] - stack [ regexp stack>> ] - table [ regexp nfa-table>> ] | + [let* | s0 [ next-state ] + s1 [ next-state ] + stack [ combine-stack get ] + table [ nfa-table get ] | negated? get [ s0 f obj class make-transition table add-transition s0 s1 table add-transition @@ -51,9 +67,8 @@ GENERIC: nfa-node ( node -- ) t s1 table final-states>> set-at ] ; :: concatenate-nodes ( -- ) - [let* | regexp [ current-regexp get ] - stack [ regexp stack>> ] - table [ regexp nfa-table>> ] + [let* | stack [ combine-stack get ] + table [ nfa-table get ] s2 [ stack peek first ] s3 [ stack pop second ] s0 [ stack peek first ] @@ -63,15 +78,14 @@ GENERIC: nfa-node ( node -- ) s0 s3 2array stack push ] ; :: alternate-nodes ( -- ) - [let* | regexp [ current-regexp get ] - stack [ regexp stack>> ] - table [ regexp nfa-table>> ] + [let* | stack [ combine-stack get ] + table [ nfa-table get ] s2 [ stack peek first ] s3 [ stack pop second ] s0 [ stack peek first ] s1 [ stack pop second ] - s4 [ regexp next-state ] - s5 [ regexp next-state ] | + s4 [ next-state ] + s5 [ next-state ] | s4 s0 eps table add-transition s4 s2 eps table add-transition s1 s5 eps table add-transition @@ -83,13 +97,12 @@ GENERIC: nfa-node ( node -- ) M: star nfa-node ( node -- ) term>> nfa-node - [let* | regexp [ current-regexp get ] - stack [ regexp stack>> ] + [let* | stack [ combine-stack get ] s0 [ stack peek first ] s1 [ stack pop second ] - s2 [ regexp next-state ] - s3 [ regexp next-state ] - table [ regexp nfa-table>> ] | + s2 [ next-state ] + s3 [ next-state ] + table [ nfa-table get ] | s1 table final-states>> delete-at t s3 table final-states>> set-at s1 s0 eps table add-transition @@ -99,58 +112,53 @@ M: star nfa-node ( node -- ) s2 s3 2array stack push ] ; M: concatenation nfa-node ( node -- ) - seq>> - reversed-regexp option? [ ] when - [ [ nfa-node ] each ] - [ length 1- [ concatenate-nodes ] times ] bi ; + seq>> [ eps literal-transition add-simple-entry ] [ + reversed-regexp option? [ ] when + [ [ nfa-node ] each ] + [ length 1- [ concatenate-nodes ] times ] bi + ] if-empty ; M: alternation nfa-node ( node -- ) seq>> [ [ nfa-node ] each ] [ length 1- [ alternate-nodes ] times ] bi ; -M: constant nfa-node ( node -- ) +M: integer nfa-node ( node -- ) case-insensitive option? [ - dup char>> [ ch>lower ] [ ch>upper ] bi + dup [ ch>lower ] [ ch>upper ] bi 2dup = [ 2drop - char>> literal-transition add-simple-entry + literal-transition add-simple-entry ] [ [ literal-transition add-simple-entry ] bi@ alternate-nodes drop ] if ] [ - char>> literal-transition add-simple-entry + literal-transition add-simple-entry ] if ; -M: word nfa-node ( node -- ) class-transition add-simple-entry ; +M: primitive-class nfa-node ( node -- ) + class>> dup + { letter-class LETTER-class } member? case-insensitive option? and + [ drop Letter-class ] when + class-transition add-simple-entry ; M: any-char nfa-node ( node -- ) [ dotall option? ] dip any-char-no-nl ? class-transition add-simple-entry ; -M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ; +M: negation nfa-node ( node -- ) + negate term>> nfa-node negate ; -M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ; - -: choose-letter-class ( node -- node' ) - case-insensitive option? Letter-class rot ? ; - -M: letter-class nfa-node ( node -- ) - choose-letter-class class-transition add-simple-entry ; - -M: LETTER-class nfa-node ( node -- ) - choose-letter-class class-transition add-simple-entry ; - -M: character-class-range nfa-node ( node -- ) +M: range nfa-node ( node -- ) case-insensitive option? [ ! This should be implemented for Unicode by case-folding ! the input and all strings in the regexp. dup [ from>> ] [ to>> ] bi 2dup [ Letter? ] bi@ and [ rot drop - [ [ ch>lower ] bi@ character-class-range boa ] - [ [ ch>upper ] bi@ character-class-range boa ] 2bi + [ [ ch>lower ] bi@ ] + [ [ ch>upper ] bi@ ] 2bi [ class-transition add-simple-entry ] bi@ alternate-nodes ] [ @@ -161,14 +169,15 @@ M: character-class-range nfa-node ( node -- ) class-transition add-simple-entry ] if ; -M: option nfa-node ( node -- ) - [ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if - eps literal-transition add-simple-entry ; +M: with-options nfa-node ( node -- ) + dup options>> [ tree>> nfa-node ] using-options ; -: construct-nfa ( regexp -- ) +: construct-nfa ( ast -- nfa-table ) [ - reset-regexp - [ current-regexp set ] - [ parse-tree>> nfa-node ] - [ set-start-state ] tri + negated? off + V{ } clone combine-stack set + 0 state set + clone nfa-table set + nfa-node + set-start-state ] with-scope ; diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index 65965fdeb9..dbd37f2d8e 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -1,28 +1,9 @@ ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: peg.ebnf kernel math.parser sequences assocs arrays -combinators regexp.classes strings splitting peg locals ; +USING: peg.ebnf kernel math.parser sequences assocs arrays fry math +combinators regexp.classes strings splitting peg locals accessors +regexp.ast ; IN: regexp.parser - -TUPLE: range from to ; -TUPLE: char-class ranges ; -TUPLE: primitive-class class ; -TUPLE: not-char-class ranges ; -TUPLE: not-primitive-class class ; -TUPLE: from-to n m ; -TUPLE: at-least n ; -TUPLE: up-to n ; -TUPLE: exactly n ; -TUPLE: times expression times ; -TUPLE: concatenation seq ; -TUPLE: alternation seq ; -TUPLE: maybe term ; -TUPLE: star term ; -TUPLE: plus term ; -TUPLE: with-options tree options ; -TUPLE: ast ^? $? tree ; -SINGLETON: any-char - : allowed-char? ( ch -- ? ) ".()|[*+?" member? not ; @@ -64,21 +45,16 @@ ERROR: bad-class name ; { CHAR: e [ HEX: 1b ] } { CHAR: \\ [ CHAR: \\ ] } - { CHAR: w [ c-identifier-class primitive-class boa ] } - { CHAR: W [ c-identifier-class not-primitive-class boa ] } - { CHAR: s [ java-blank-class primitive-class boa ] } - { CHAR: S [ java-blank-class not-primitive-class boa ] } - { CHAR: d [ digit-class primitive-class boa ] } - { CHAR: D [ digit-class not-primitive-class boa ] } + { CHAR: w [ c-identifier-class ] } + { CHAR: W [ c-identifier-class ] } + { CHAR: s [ java-blank-class ] } + { CHAR: S [ java-blank-class ] } + { CHAR: d [ digit-class ] } + { CHAR: D [ digit-class ] } [ ] } case ; -TUPLE: options on off ; - -SINGLETONS: unix-lines dotall multiline comments case-insensitive -unicode-case reversed-regexp ; - : options-assoc ( -- assoc ) H{ { CHAR: i case-insensitive } @@ -98,19 +74,30 @@ unicode-case reversed-regexp ; options-assoc value-at ; : parse-options ( on off -- options ) - [ [ ch>option ] map ] bi@ options boa ; + [ [ ch>option ] { } map-as ] bi@ ; -! TODO: make range syntax better (negation, and, etc), -! add syntax for various parenthized things, +: string>options ( string -- options ) + "-" split1 parse-options ; + +: options>string ( options -- string ) + [ on>> ] [ off>> ] bi + [ [ option>ch ] map ] bi@ + [ "-" swap 3append ] unless-empty + "" like ; + +! TODO: add syntax for various parenthized things, ! add greedy and nongreedy forms of matching ! (once it's all implemented) -EBNF: (parse-regexp) +EBNF: parse-regexp CharacterInBracket = !("}") Character -Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class primitive-class boa ]] - | "P{" CharacterInBracket*:s "}" => [[ s >string name>class not-primitive-class boa ]] +QuotedCharacter = !("\\E") . + +Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class ]] + | "P{" CharacterInBracket*:s "}" => [[ s >string name>class ]] + | "Q" QuotedCharacter*:s "\\E" => [[ s ]] | "u" Character:a Character:b Character:c Character:d => [[ { a b c d } hex> ensure-number ]] | "x" Character:a Character:b @@ -119,30 +106,30 @@ Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class primitive-cla => [[ { a b c } oct> ensure-number ]] | . => [[ lookup-escape ]] -Character = "\\" Escape:e => [[ e ]] - | . ?[ allowed-char? ]? +EscapeSequence = "\\" Escape:e => [[ e ]] -AnyRangeCharacter = Character | "[" +Character = EscapeSequence | . ?[ allowed-char? ]? + +AnyRangeCharacter = EscapeSequence | . RangeCharacter = !("]") AnyRangeCharacter -Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]] +Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b ]] | RangeCharacter -StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]] +StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b ]] | AnyRangeCharacter Ranges = StartRange:s Range*:r => [[ r s prefix ]] -CharClass = "^" Ranges:e => [[ e not-char-class boa ]] - | Ranges:e => [[ e char-class boa ]] +CharClass = "^"?:n Ranges:e => [[ e n char-class ]] Options = [idmsux]* Parenthized = "?:" Alternation:a => [[ a ]] | "?" Options:on "-"? Options:off ":" Alternation:a - => [[ a on off parse-options with-options boa ]] - | "?#" [^)]* => [[ ignore ]] + => [[ a on off parse-options ]] + | "?#" [^)]* => [[ f ]] | Alternation Element = "(" Parenthized:p ")" => [[ p ]] @@ -152,32 +139,24 @@ Element = "(" Parenthized:p ")" => [[ p ]] Number = (!(","|"}").)* => [[ string>number ensure-number ]] -Times = "," Number:n "}" => [[ n up-to boa ]] - | Number:n ",}" => [[ n at-least boa ]] - | Number:n "}" => [[ n exactly boa ]] +Times = "," Number:n "}" => [[ 0 n ]] + | Number:n ",}" => [[ n ]] + | Number:n "}" => [[ n n ]] | "}" => [[ bad-number ]] - | Number:n "," Number:m "}" => [[ n m from-to boa ]] + | Number:n "," Number:m "}" => [[ n m ]] -Repeated = Element:e "{" Times:t => [[ e t times boa ]] - | Element:e "?" => [[ e maybe boa ]] - | Element:e "*" => [[ e star boa ]] - | Element:e "+" => [[ e plus boa ]] +Repeated = Element:e "{" Times:t => [[ e t ]] + | Element:e "?" => [[ e ]] + | Element:e "*" => [[ e ]] + | Element:e "+" => [[ e ]] | Element -Concatenation = Repeated*:r => [[ r concatenation boa ]] +Concatenation = Repeated*:r => [[ r sift ]] Alternation = Concatenation:c ("|" Concatenation)*:a - => [[ a empty? [ c ] [ a values c prefix alternation boa ] if ]] + => [[ a empty? [ c ] [ a values c prefix ] if ]] End = !(.) Main = Alternation End ;EBNF - -: parse-regexp ( string -- regexp ) - ! Hack because I want $ allowable in regexps, - ! but with special behavior at the end - ! This fails if the regexp is stupid, though... - dup first CHAR: ^ = tuck [ rest ] when - dup peek CHAR: $ = tuck [ but-last ] when - (parse-regexp) ast boa ; diff --git a/basis/regexp/regexp-docs.factor b/basis/regexp/regexp-docs.factor index 378ae503ce..1dc2a22d81 100644 --- a/basis/regexp/regexp-docs.factor +++ b/basis/regexp/regexp-docs.factor @@ -1,6 +1,6 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. -USING: kernel strings help.markup help.syntax regexp.backend ; +USING: kernel strings help.markup help.syntax ; IN: regexp HELP: diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index cc9b2cccf1..4331eaa250 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -189,8 +189,8 @@ IN: regexp-tests [ t ] [ "SXY" "\\0123XY" matches? ] unit-test [ t ] [ "x" "\\x78" matches? ] unit-test [ f ] [ "y" "\\x78" matches? ] unit-test -[ t ] [ "x" "\\u000078" matches? ] unit-test -[ f ] [ "y" "\\u000078" matches? ] unit-test +[ t ] [ "x" "\\u0078" matches? ] unit-test +[ f ] [ "y" "\\u0078" matches? ] unit-test [ t ] [ "ab" "a+b" matches? ] unit-test [ f ] [ "b" "a+b" matches? ] unit-test @@ -317,16 +317,6 @@ IN: regexp-tests ! Bug in parsing word [ t ] [ "a" R' a' matches? ] unit-test -! Convert to lowercase until E -[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test -[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test - -! Convert to uppercase until E -[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test -[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test - -! [ "{Lower}" ] [ invalid-range? ] must-fail-with - ! [ t ] [ "a" R/ ^a/ matches? ] unit-test ! [ f ] [ "\na" R/ ^a/ matches? ] unit-test ! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test @@ -370,10 +360,10 @@ IN: regexp-tests ! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test ! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test -! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test -! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test -! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test -! [ t ] [ "a\r\n" "a$" R/ a$/m matches? ] unit-test +! [ t ] [ "a" R/ a$/m matches? ] unit-test +! [ t ] [ "a\n" R/ a$/m matches? ] unit-test +! [ t ] [ "a\r" R/ a$/m matches? ] unit-test +! [ t ] [ "a\r\n" R/ a$/m matches? ] unit-test ! [ f ] [ "foobxr" "foo\\z" match-head ] unit-test ! [ 3 ] [ "foo" "foo\\z" match-head ] unit-test diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index 62ebaab502..8f6edd853e 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -2,33 +2,15 @@ ! See http://factorcode.org/license.txt for BSD license. USING: accessors combinators kernel math sequences strings sets assocs prettyprint.backend prettyprint.custom make lexer -namespaces parser arrays fry regexp.backend regexp.utils +namespaces parser arrays fry locals regexp.parser regexp.nfa regexp.dfa regexp.traversal -regexp.transition-tables splitting sorting ; +regexp.transition-tables splitting sorting regexp.ast ; IN: regexp -: default-regexp ( string -- regexp ) - regexp new - swap >>raw - >>nfa-table - >>dfa-table - >>minimized-table - H{ } clone >>nfa-traversal-flags - H{ } clone >>dfa-traversal-flags - H{ } clone >>options - H{ } clone >>matchers - reset-regexp ; - -: construct-regexp ( regexp -- regexp' ) - { - [ dup raw>> parse-regexp >>parse-tree drop ] - [ construct-nfa ] - [ construct-dfa ] - [ ] - } cleave ; +TUPLE: regexp raw options parse-tree dfa ; : (match) ( string regexp -- dfa-traverser ) - do-match ; inline + dfa>> do-match ; inline : match ( string regexp -- slice/f ) (match) return-match ; @@ -94,17 +76,17 @@ IN: regexp { "R| " "|" } } swap [ subseq? not nip ] curry assoc-find drop ; -: string>options ( string -- options ) - [ ch>option dup ] H{ } map>assoc ; - -: options>string ( options -- string ) - keys [ option>ch ] map natural-sort >string ; - PRIVATE> -: ( string option-string -- regexp ) - [ default-regexp ] [ string>options ] bi* >>options - construct-regexp ; +:: ( string options -- regexp ) + string parse-regexp :> tree + options parse-options :> opt + tree opt :> ast + regexp new + string >>raw + opt >>options + tree >>parse-tree + tree opt construct-nfa construct-dfa >>dfa ; : ( string -- regexp ) "" ; diff --git a/basis/regexp/transition-tables/transition-tables.factor b/basis/regexp/transition-tables/transition-tables.factor index 64d5cdb244..c02ebce91f 100644 --- a/basis/regexp/transition-tables/transition-tables.factor +++ b/basis/regexp/transition-tables/transition-tables.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays assocs fry hashtables kernel sequences -vectors regexp.utils ; +vectors ; IN: regexp.transition-tables TUPLE: transition from to obj ; diff --git a/basis/regexp/traversal/traversal.factor b/basis/regexp/traversal/traversal.factor index 394bfe0d52..e06efa7f80 100644 --- a/basis/regexp/traversal/traversal.factor +++ b/basis/regexp/traversal/traversal.factor @@ -1,8 +1,8 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors assocs combinators kernel math -quotations sequences regexp.parser regexp.classes fry arrays -combinators.short-circuit regexp.utils prettyprint regexp.nfa ; +quotations sequences regexp.classes fry arrays +combinators.short-circuit prettyprint regexp.nfa ; IN: regexp.traversal TUPLE: dfa-traverser @@ -13,8 +13,7 @@ TUPLE: dfa-traverser start-index current-index matches ; -: ( text regexp -- match ) - dfa-table>> +: ( text dfa -- match ) dfa-traverser new swap [ start-state>> >>current-state ] [ >>dfa-table ] bi swap >>text diff --git a/basis/regexp/utils/utils-tests.factor b/basis/regexp/utils/utils-tests.factor deleted file mode 100644 index d048ad4be1..0000000000 --- a/basis/regexp/utils/utils-tests.factor +++ /dev/null @@ -1,4 +0,0 @@ -USING: regexp.utils tools.test ; -IN: regexp.utils.tests - -[ [ ] [ ] while-changes ] must-infer diff --git a/basis/regexp/utils/utils.factor b/basis/regexp/utils/utils.factor deleted file mode 100644 index d1266a6d98..0000000000 --- a/basis/regexp/utils/utils.factor +++ /dev/null @@ -1,42 +0,0 @@ -! Copyright (C) 2008 Doug Coleman. -! See http://factorcode.org/license.txt for BSD license. -USING: accessors arrays assocs io kernel math math.order -namespaces regexp.backend sequences unicode.categories -math.ranges fry combinators.short-circuit vectors ; -IN: regexp.utils - -: (while-changes) ( obj quot: ( obj -- obj' ) pred: ( obj -- <=> ) pred-ret -- obj ) - [ [ dup slip ] dip pick over call ] dip dupd = - [ 3drop ] [ (while-changes) ] if ; inline recursive - -: while-changes ( obj quot pred -- obj' ) - pick over call (while-changes) ; inline - -ERROR: bad-octal number ; -ERROR: bad-hex number ; -: check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ; -: check-hex ( hex -- hex ) dup number? [ bad-hex ] unless ; - -: decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ; - -: hex-digit? ( n -- ? ) - { - [ decimal-digit? ] - [ CHAR: a CHAR: f between? ] - [ CHAR: A CHAR: F between? ] - } 1|| ; - -: punct? ( n -- ? ) - "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ; - -: c-identifier-char? ( ch -- ? ) - { [ alpha? ] [ CHAR: _ = ] } 1|| ; - -: java-blank? ( n -- ? ) - { - CHAR: \s CHAR: \t CHAR: \n - HEX: b HEX: 7 CHAR: \r - } member? ; - -: java-printable? ( n -- ? ) - [ [ alpha? ] [ punct? ] ] 1|| ; From 77b069ee5c0d5a85b5065c7c77f5ef5d6375dfc0 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Wed, 18 Feb 2009 14:52:10 -0600 Subject: [PATCH 006/125] Finish cleanup of regexp --- basis/regexp/dfa/dfa.factor | 6 +++--- basis/regexp/regexp.factor | 33 ++++++++++++++++----------------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/basis/regexp/dfa/dfa.factor b/basis/regexp/dfa/dfa.factor index 4dd3713fc2..543c757a67 100644 --- a/basis/regexp/dfa/dfa.factor +++ b/basis/regexp/dfa/dfa.factor @@ -41,11 +41,11 @@ IN: regexp.dfa :: new-transitions ( nfa dfa new-states visited-states -- nfa dfa ) new-states [ nfa dfa ] [ - new-states pop :> state - state nfa-table find-transitions + pop :> state + state nfa find-transitions [| trans | state trans nfa find-closure :> new-state - state visited-states new-state add-todo-state + new-state visited-states new-states add-todo-state state new-state trans transition make-transition dfa add-transition ] each nfa dfa new-states visited-states new-transitions diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index 8f6edd853e..7491961399 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -7,11 +7,22 @@ regexp.parser regexp.nfa regexp.dfa regexp.traversal regexp.transition-tables splitting sorting regexp.ast ; IN: regexp -TUPLE: regexp raw options parse-tree dfa ; +TUPLE: regexp raw parse-tree options dfa ; + +: ( string options -- regexp ) + [ dup parse-regexp ] [ string>options ] bi* + 2dup construct-nfa construct-dfa + regexp boa ; + +: ( string -- regexp ) "" ; + +> do-match ; inline +PRIVATE> + : match ( string regexp -- slice/f ) (match) return-match ; @@ -40,9 +51,13 @@ TUPLE: regexp raw options parse-tree dfa ; dupd first-match [ split1-slice swap ] [ "" like f swap ] if* ; + + : re-split ( string regexp -- seq ) [ (re-split) ] { } make ; @@ -76,22 +91,6 @@ TUPLE: regexp raw options parse-tree dfa ; { "R| " "|" } } swap [ subseq? not nip ] curry assoc-find drop ; -PRIVATE> - -:: ( string options -- regexp ) - string parse-regexp :> tree - options parse-options :> opt - tree opt :> ast - regexp new - string >>raw - opt >>options - tree >>parse-tree - tree opt construct-nfa construct-dfa >>dfa ; - -: ( string -- regexp ) "" ; - - Date: Thu, 19 Feb 2009 00:11:45 -0600 Subject: [PATCH 007/125] DFAs are minimized now --- basis/regexp/dfa/dfa-tests.factor | 5 ++ basis/regexp/dfa/dfa.factor | 12 ++- basis/regexp/minimize/minimize-tests.factor | 48 ++++++++++++ basis/regexp/minimize/minimize.factor | 84 +++++++++++++++++++++ basis/regexp/regexp.factor | 4 +- basis/regexp/traversal/traversal.factor | 5 +- 6 files changed, 149 insertions(+), 9 deletions(-) create mode 100644 basis/regexp/dfa/dfa-tests.factor create mode 100644 basis/regexp/minimize/minimize-tests.factor create mode 100644 basis/regexp/minimize/minimize.factor diff --git a/basis/regexp/dfa/dfa-tests.factor b/basis/regexp/dfa/dfa-tests.factor new file mode 100644 index 0000000000..b6ce13c723 --- /dev/null +++ b/basis/regexp/dfa/dfa-tests.factor @@ -0,0 +1,5 @@ +USING: regexp.dfa tools.test ; +IN: regexp.dfa.tests + +[ [ ] [ ] while-changes ] must-infer + diff --git a/basis/regexp/dfa/dfa.factor b/basis/regexp/dfa/dfa.factor index 543c757a67..88e4e8f9ff 100644 --- a/basis/regexp/dfa/dfa.factor +++ b/basis/regexp/dfa/dfa.factor @@ -1,4 +1,4 @@ -! Copyright (C) 2008 Doug Coleman. +! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays assocs combinators fry kernel locals math math.order regexp.nfa regexp.transition-tables sequences @@ -6,9 +6,13 @@ sets sorting vectors sequences.deep ; USING: io prettyprint threads ; IN: regexp.dfa -: (while-changes) ( obj quot: ( obj -- obj' ) pred: ( obj -- <=> ) pred-ret -- obj ) - [ [ dup slip ] dip pick over call ] dip dupd = - [ 3drop ] [ (while-changes) ] if ; inline recursive +:: (while-changes) ( obj quot: ( obj -- obj' ) comp: ( obj -- key ) old-key -- obj ) + obj quot call :> new-obj + new-obj comp call :> new-key + new-key old-key = + [ new-obj ] + [ new-obj quot comp new-key (while-changes) ] + if ; inline recursive : while-changes ( obj quot pred -- obj' ) 3dup nip call (while-changes) ; inline diff --git a/basis/regexp/minimize/minimize-tests.factor b/basis/regexp/minimize/minimize-tests.factor new file mode 100644 index 0000000000..78a90ca3ba --- /dev/null +++ b/basis/regexp/minimize/minimize-tests.factor @@ -0,0 +1,48 @@ +! Copyright (C) 2009 Daniel Ehrenberg +! See http://factorcode.org/license.txt for BSD license. +USING: tools.test regexp.minimize assocs regexp accessors regexp.transition-tables ; +IN: regexp.minimize.tests + +[ t ] [ 1 2 H{ { { 1 2 } t } } same-partition? ] unit-test +[ t ] [ 2 1 H{ { { 1 2 } t } } same-partition? ] unit-test +[ f ] [ 2 3 H{ { { 1 2 } t } } same-partition? ] unit-test + +[ H{ { 1 1 } { 2 1 } { 3 3 } { 4 3 } } ] +[ { { 1 1 } { 1 2 } { 2 2 } { 3 3 } { 3 4 } { 4 4 } } [ t ] H{ } map>assoc partition>classes ] unit-test + +[ { { 1 2 } { 3 4 } } ] [ H{ { "elephant" 1 } { "tiger" 3 } } H{ { "elephant" 2 } { "tiger" 4 } } assemble-values ] unit-test + +[ 3 ] [ R/ ab|ac/ dfa>> transitions>> assoc-size ] unit-test +[ 3 ] [ R/ a(b|c)/ dfa>> transitions>> assoc-size ] unit-test +[ 1 ] [ R/ ((aa*)*)*/ dfa>> transitions>> assoc-size ] unit-test +[ 1 ] [ R/ a|((aa*)*)*/ dfa>> transitions>> assoc-size ] unit-test +[ 2 ] [ R/ ab|((aa*)*)*b/ dfa>> transitions>> assoc-size ] unit-test +[ 4 ] [ R/ ab|cd/ dfa>> transitions>> assoc-size ] unit-test +[ 1 ] [ R/ [a-z]*|[A-Z]*/i dfa>> transitions>> assoc-size ] unit-test + +[ + T{ transition-table + { transitions H{ + { 0 H{ { CHAR: a 1 } { CHAR: b 1 } } } + { 1 H{ { CHAR: a 2 } { CHAR: b 2 } } } + { 2 H{ { CHAR: c 3 } } } + { 3 H{ } } + } } + { start-state 0 } + { final-states H{ { 3 3 } } } + } +] [ + T{ transition-table + { transitions H{ + { 0 H{ { CHAR: a 1 } { CHAR: b 4 } } } + { 1 H{ { CHAR: a 2 } { CHAR: b 5 } } } + { 2 H{ { CHAR: c 3 } } } + { 3 H{ } } + { 4 H{ { CHAR: a 2 } { CHAR: b 5 } } } + { 5 H{ { CHAR: c 6 } } } + { 6 H{ } } + } } + { start-state 0 } + { final-states H{ { 3 3 } { 6 6 } } } + } combine-states +] unit-test diff --git a/basis/regexp/minimize/minimize.factor b/basis/regexp/minimize/minimize.factor new file mode 100644 index 0000000000..52a852af50 --- /dev/null +++ b/basis/regexp/minimize/minimize.factor @@ -0,0 +1,84 @@ +! Copyright (C) 2009 Daniel Ehrenberg +! See http://factorcode.org/license.txt for BSD license. +USING: kernel sequences regexp.transition-tables fry assocs +accessors locals math sorting arrays sets hashtables regexp.dfa ; +IN: regexp.minimize + +:: initialize-partitions ( transition-table -- partitions ) + ! Partition table is sorted-array => ? + H{ } clone :> out + transition-table transitions>> keys :> states + states [| s1 | + states [| s2 | + s1 s2 <= [ + s1 s2 [ transition-table transitions>> at keys ] bi@ set= + s1 s2 [ transition-table final-states>> key? ] bi@ = and + [ t s1 s2 2array out set-at ] when + ] when + ] each + ] each out ; + +: same-partition? ( s1 s2 partitions -- ? ) + [ 2array natural-sort ] dip key? ; + +: assemble-values ( assoc1 assoc2 -- values ) + dup keys '[ _ swap [ at ] curry map ] bi@ zip ; + +: stay-same? ( s1 s2 transition partitions -- ? ) + [ '[ _ transitions>> at ] bi@ assemble-values ] dip + '[ _ same-partition? ] assoc-all? ; + +: partition-more ( partitions transition-table -- partitions ) + ! This is horribly slow! + over '[ drop first2 _ _ stay-same? ] assoc-filter ; + +: partition>classes ( partitions -- synonyms ) ! old-state => new-state + >alist sort-keys + [ drop first2 swap ] assoc-map + + >hashtable ; + +: state-classes ( transition-table -- synonyms ) + [ initialize-partitions ] keep + '[ _ partition-more ] [ ] while-changes + partition>classes ; + +: canonical-state? ( state state-classes -- ? ) + dupd at = ; + +: delete-duplicates ( transitions state-classes -- new-transitions ) + '[ drop _ canonical-state? ] assoc-filter ; + +: rewrite-duplicates ( new-transitions state-classes -- new-transitions ) + '[ [ _ at ] assoc-map ] assoc-map ; + +: map-set ( assoc quot -- new-assoc ) + '[ drop @ dup ] assoc-map ; inline + +: combine-states ( table -- smaller-table ) + dup state-classes + [ + '[ + _ [ delete-duplicates ] + [ rewrite-duplicates ] bi + ] change-transitions + ] + [ '[ [ _ at ] map-set ] change-final-states ] + [ '[ _ at ] change-start-state ] + tri ; + +: number-transitions ( transitions numbering -- new-transitions ) + [ + [ at ] + [ '[ first _ at ] assoc-map ] + bi-curry bi* + ] curry assoc-map ; + +: number-states ( table -- newtable ) + dup transitions>> keys [ swap ] H{ } assoc-map-as + [ '[ _ at ] change-start-state ] + [ '[ [ _ at ] map-set ] change-final-states ] + [ '[ _ number-transitions ] change-transitions ] tri ; + +: minimize ( table -- minimal-table ) + clone number-states combine-states ; diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index 7491961399..b6fd32a245 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -2,7 +2,7 @@ ! See http://factorcode.org/license.txt for BSD license. USING: accessors combinators kernel math sequences strings sets assocs prettyprint.backend prettyprint.custom make lexer -namespaces parser arrays fry locals +namespaces parser arrays fry locals regexp.minimize regexp.parser regexp.nfa regexp.dfa regexp.traversal regexp.transition-tables splitting sorting regexp.ast ; IN: regexp @@ -11,7 +11,7 @@ TUPLE: regexp raw parse-tree options dfa ; : ( string options -- regexp ) [ dup parse-regexp ] [ string>options ] bi* - 2dup construct-nfa construct-dfa + 2dup construct-nfa construct-dfa minimize regexp boa ; : ( string -- regexp ) "" ; diff --git a/basis/regexp/traversal/traversal.factor b/basis/regexp/traversal/traversal.factor index e06efa7f80..5d48353f56 100644 --- a/basis/regexp/traversal/traversal.factor +++ b/basis/regexp/traversal/traversal.factor @@ -33,7 +33,7 @@ TUPLE: dfa-traverser : text-finished? ( dfa-traverser -- ? ) { - [ current-state>> empty? ] + [ current-state>> not ] [ end-of-text? ] [ match-failed?>> ] } 1|| ; @@ -59,8 +59,7 @@ TUPLE: dfa-traverser 1 text-character ; : increment-state ( dfa-traverser state -- dfa-traverser ) - [ [ 1 + ] change-current-index ] - [ first ] bi* >>current-state ; + [ [ 1 + ] change-current-index ] dip >>current-state ; : match-literal ( transition from-state table -- to-state/f ) transitions>> at at ; From 9565b59928eba03c50b2a2f98806e9a9ac1aa0c4 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Thu, 19 Feb 2009 16:48:46 -0600 Subject: [PATCH 008/125] Regexp negation (partial) and cleanup of regexp.nfa --- basis/regexp/ast/ast.factor | 14 +- basis/regexp/classes/classes.factor | 17 ++- basis/regexp/minimize/minimize.factor | 70 +++++----- basis/regexp/negation/negation-tests.factor | 27 ++++ basis/regexp/negation/negation.factor | 36 ++++++ basis/regexp/nfa/nfa.factor | 136 ++++++++------------ basis/regexp/parser/parser.factor | 2 + basis/regexp/regexp.factor | 5 +- 8 files changed, 184 insertions(+), 123 deletions(-) create mode 100644 basis/regexp/negation/negation-tests.factor create mode 100644 basis/regexp/negation/negation.factor diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor index d018fa3a36..ad67d76d12 100644 --- a/basis/regexp/ast/ast.factor +++ b/basis/regexp/ast/ast.factor @@ -16,11 +16,17 @@ C: from-to TUPLE: at-least n ; C: at-least -TUPLE: concatenation seq ; -C: concatenation +SINGLETON: epsilon -TUPLE: alternation seq ; -C: alternation +TUPLE: concatenation first second ; + +: ( seq -- concatenation ) + epsilon [ concatenation boa ] reduce ; + +TUPLE: alternation first second ; + +: ( seq -- alternation ) + unclip [ alternation boa ] reduce ; TUPLE: star term ; C: star diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index 7109e8bcbd..44f33f9fcf 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -1,4 +1,4 @@ -! Copyright (C) 2008 Doug Coleman. +! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: accessors kernel math math.order words ascii unicode.categories combinators.short-circuit sequences ; @@ -41,9 +41,10 @@ C: range GENERIC: class-member? ( obj class -- ? ) +! When does t get put in? M: t class-member? ( obj class -- ? ) 2drop f ; -M: integer class-member? ( obj class -- ? ) 2drop f ; +M: integer class-member? ( obj class -- ? ) = ; M: range class-member? ( obj class -- ? ) [ from>> ] [ to>> ] bi between? ; @@ -111,3 +112,15 @@ M: beginning-of-line class-member? ( obj class -- ? ) M: end-of-line class-member? ( obj class -- ? ) 2drop f ; + +TUPLE: or-class seq ; +C: or-class + +TUPLE: not-class class ; +C: not-class + +M: or-class class-member? + seq>> [ class-member? ] with any? ; + +M: not-class class-member? + class>> class-member? not ; diff --git a/basis/regexp/minimize/minimize.factor b/basis/regexp/minimize/minimize.factor index 52a852af50..163e87f2b4 100644 --- a/basis/regexp/minimize/minimize.factor +++ b/basis/regexp/minimize/minimize.factor @@ -1,20 +1,48 @@ ! Copyright (C) 2009 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. USING: kernel sequences regexp.transition-tables fry assocs -accessors locals math sorting arrays sets hashtables regexp.dfa ; +accessors locals math sorting arrays sets hashtables regexp.dfa +combinators.short-circuit ; IN: regexp.minimize +: number-transitions ( transitions numbering -- new-transitions ) + dup '[ + [ _ at ] + [ [ first _ at ] assoc-map ] bi* + ] assoc-map ; + +: table>state-numbers ( table -- assoc ) + transitions>> keys [ swap ] H{ } assoc-map-as ; + +: map-set ( assoc quot -- new-assoc ) + '[ drop @ dup ] assoc-map ; inline + +: rewrite-transitions ( transition-table assoc quot -- transition-table ) + [ + [ '[ _ at ] change-start-state ] + [ '[ [ _ at ] map-set ] change-final-states ] + [ ] tri + ] dip '[ _ @ ] change-transitions ; inline + +: number-states ( table -- newtable ) + dup table>state-numbers + [ number-transitions ] rewrite-transitions ; + +: initially-same? ( s1 s2 transition-table -- ? ) + { + [ drop <= ] + [ transitions>> '[ _ at keys ] bi@ set= ] + [ final-states>> '[ _ key? ] bi@ = ] + } 3&& ; + :: initialize-partitions ( transition-table -- partitions ) ! Partition table is sorted-array => ? H{ } clone :> out transition-table transitions>> keys :> states states [| s1 | states [| s2 | - s1 s2 <= [ - s1 s2 [ transition-table transitions>> at keys ] bi@ set= - s1 s2 [ transition-table final-states>> key? ] bi@ = and - [ t s1 s2 2array out set-at ] when - ] when + s1 s2 transition-table initially-same? + [ s1 s2 2array out conjoin ] when ] each ] each out ; @@ -29,7 +57,6 @@ IN: regexp.minimize '[ _ same-partition? ] assoc-all? ; : partition-more ( partitions transition-table -- partitions ) - ! This is horribly slow! over '[ drop first2 _ _ stay-same? ] assoc-filter ; : partition>classes ( partitions -- synonyms ) ! old-state => new-state @@ -40,7 +67,7 @@ IN: regexp.minimize : state-classes ( transition-table -- synonyms ) [ initialize-partitions ] keep - '[ _ partition-more ] [ ] while-changes + '[ _ partition-more ] [ assoc-size ] while-changes partition>classes ; : canonical-state? ( state state-classes -- ? ) @@ -52,33 +79,12 @@ IN: regexp.minimize : rewrite-duplicates ( new-transitions state-classes -- new-transitions ) '[ [ _ at ] assoc-map ] assoc-map ; -: map-set ( assoc quot -- new-assoc ) - '[ drop @ dup ] assoc-map ; inline +: combine-transitions ( transitions state-classes -- new-transitions ) + [ delete-duplicates ] [ rewrite-duplicates ] bi ; : combine-states ( table -- smaller-table ) dup state-classes - [ - '[ - _ [ delete-duplicates ] - [ rewrite-duplicates ] bi - ] change-transitions - ] - [ '[ [ _ at ] map-set ] change-final-states ] - [ '[ _ at ] change-start-state ] - tri ; - -: number-transitions ( transitions numbering -- new-transitions ) - [ - [ at ] - [ '[ first _ at ] assoc-map ] - bi-curry bi* - ] curry assoc-map ; - -: number-states ( table -- newtable ) - dup transitions>> keys [ swap ] H{ } assoc-map-as - [ '[ _ at ] change-start-state ] - [ '[ [ _ at ] map-set ] change-final-states ] - [ '[ _ number-transitions ] change-transitions ] tri ; + [ combine-transitions ] rewrite-transitions ; : minimize ( table -- minimal-table ) clone number-states combine-states ; diff --git a/basis/regexp/negation/negation-tests.factor b/basis/regexp/negation/negation-tests.factor new file mode 100644 index 0000000000..2dbca2e8d8 --- /dev/null +++ b/basis/regexp/negation/negation-tests.factor @@ -0,0 +1,27 @@ +! Copyright (C) 2009 Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: tools.test regexp.negation regexp.transition-tables regexp.classes ; +IN: regexp.negation.tests + +[ + ! R/ |[^a]|.+/ + T{ transition-table + { transitions H{ + { 0 H{ { CHAR: a 1 } { T{ not-class f T{ or-class f { CHAR: a } } } -1 } } } + { 1 H{ { T{ not-class f T{ or-class f { } } } -1 } } } + { -1 H{ { any-char -1 } } } + } } + { start-state 0 } + { final-states H{ { 0 0 } { -1 -1 } } } + } +] [ + ! R/ a/ + T{ transition-table + { transitions H{ + { 0 H{ { CHAR: a 1 } } } + { 1 H{ } } + } } + { start-state 0 } + { final-states H{ { 1 1 } } } + } negate-table +] unit-test diff --git a/basis/regexp/negation/negation.factor b/basis/regexp/negation/negation.factor new file mode 100644 index 0000000000..5a9f772581 --- /dev/null +++ b/basis/regexp/negation/negation.factor @@ -0,0 +1,36 @@ +! Copyright (C) 2009 Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: regexp.nfa regexp.dfa regexp.minimize kernel sequences +assocs regexp.classes hashtables accessors ; +IN: regexp.negation + +: ast>dfa ( parse-tree -- minimal-dfa ) + construct-nfa construct-dfa minimize ; + +CONSTANT: fail-state -1 + +: add-default-transition ( state's-transitions -- new-state's-transitions ) + clone dup + [ [ fail-state ] dip keys ] keep set-at ; + +: fail-state-recurses ( transitions -- new-transitions ) + clone dup + [ fail-state any-char associate fail-state ] dip set-at ; + +: add-fail-state ( transitions -- new-transitions ) + [ add-default-transition ] assoc-map + fail-state-recurses ; + +: assoc>set ( assoc -- keys-set ) + [ drop dup ] assoc-map ; + +: inverse-final-states ( transition-table -- final-states ) + [ transitions>> assoc>set ] [ final-states>> ] bi assoc-diff ; + +: negate-table ( transition-table -- transition-table ) + clone + [ add-fail-state ] change-transitions + dup inverse-final-states >>final-states ; + +! M: negation nfa-node ( node -- ) +! ast>dfa negate-table adjoin-dfa ; diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index 4ad5e0314d..c759ffdf98 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -3,15 +3,13 @@ USING: accessors arrays assocs grouping kernel locals math namespaces sequences fry quotations math.order math.ranges vectors unicode.categories -regexp.transition-tables words sets +regexp.transition-tables words sets hashtables unicode.case.private regexp.ast regexp.classes ; ! This uses unicode.case.private for ch>upper and ch>lower ! but case-insensitive matching should be done by case-folding everything ! before processing starts IN: regexp.nfa -ERROR: feature-is-broken feature ; - SYMBOL: negated? : negate ( -- ) @@ -21,14 +19,13 @@ SINGLETON: eps SYMBOL: option-stack -SYMBOL: combine-stack - SYMBOL: state : next-state ( -- state ) state [ get ] [ inc ] bi ; SYMBOL: nfa-table +: table ( -- table ) nfa-table get ; : set-each ( keys value hashtable -- ) '[ _ swap _ set-at ] each ; @@ -46,84 +43,56 @@ SYMBOL: nfa-table : option? ( obj -- ? ) option-stack get assoc-stack ; -: set-start-state ( -- nfa-table ) - nfa-table get - combine-stack get pop first >>start-state ; +GENERIC: nfa-node ( node -- start-state end-state ) -GENERIC: nfa-node ( node -- ) +:: add-simple-entry ( obj class -- start-state end-state ) + next-state :> s0 + next-state :> s1 + negated? get [ + s0 f obj class make-transition table add-transition + s0 s1 table add-transition + ] [ + s0 s1 obj class make-transition table add-transition + ] if + s0 s1 ; -:: add-simple-entry ( obj class -- ) - [let* | s0 [ next-state ] - s1 [ next-state ] - stack [ combine-stack get ] - table [ nfa-table get ] | - negated? get [ - s0 f obj class make-transition table add-transition - s0 s1 table add-transition - ] [ - s0 s1 obj class make-transition table add-transition - ] if - s0 s1 2array stack push - t s1 table final-states>> set-at ] ; +: epsilon-transition ( source target -- ) + eps table add-transition ; -:: concatenate-nodes ( -- ) - [let* | stack [ combine-stack get ] - table [ nfa-table get ] - s2 [ stack peek first ] - s3 [ stack pop second ] - s0 [ stack peek first ] - s1 [ stack pop second ] | - s1 s2 eps table add-transition - s1 table final-states>> delete-at - s0 s3 2array stack push ] ; +M:: star nfa-node ( node -- start end ) + node term>> nfa-node :> s1 :> s0 + next-state :> s2 + next-state :> s3 + s1 s0 epsilon-transition + s2 s0 epsilon-transition + s2 s3 epsilon-transition + s1 s3 epsilon-transition + s2 s3 ; -:: alternate-nodes ( -- ) - [let* | stack [ combine-stack get ] - table [ nfa-table get ] - s2 [ stack peek first ] - s3 [ stack pop second ] - s0 [ stack peek first ] - s1 [ stack pop second ] - s4 [ next-state ] - s5 [ next-state ] | - s4 s0 eps table add-transition - s4 s2 eps table add-transition - s1 s5 eps table add-transition - s3 s5 eps table add-transition - s1 table final-states>> delete-at - s3 table final-states>> delete-at - t s5 table final-states>> set-at - s4 s5 2array stack push ] ; +M: epsilon nfa-node + drop eps literal-transition add-simple-entry ; -M: star nfa-node ( node -- ) - term>> nfa-node - [let* | stack [ combine-stack get ] - s0 [ stack peek first ] - s1 [ stack pop second ] - s2 [ next-state ] - s3 [ next-state ] - table [ nfa-table get ] | - s1 table final-states>> delete-at - t s3 table final-states>> set-at - s1 s0 eps table add-transition - s2 s0 eps table add-transition - s2 s3 eps table add-transition - s1 s3 eps table add-transition - s2 s3 2array stack push ] ; +M: concatenation nfa-node ( node -- start end ) + [ first>> ] [ second>> ] bi + reversed-regexp option? [ swap ] when + [ nfa-node ] bi@ + [ epsilon-transition ] dip ; -M: concatenation nfa-node ( node -- ) - seq>> [ eps literal-transition add-simple-entry ] [ - reversed-regexp option? [ ] when - [ [ nfa-node ] each ] - [ length 1- [ concatenate-nodes ] times ] bi - ] if-empty ; +:: alternate-nodes ( s0 s1 s2 s3 -- start end ) + next-state :> s4 + next-state :> s5 + s4 s0 epsilon-transition + s4 s2 epsilon-transition + s1 s5 epsilon-transition + s3 s5 epsilon-transition + s4 s5 ; -M: alternation nfa-node ( node -- ) - seq>> - [ [ nfa-node ] each ] - [ length 1- [ alternate-nodes ] times ] bi ; +M: alternation nfa-node ( node -- start end ) + [ first>> ] [ second>> ] bi + [ nfa-node ] bi@ + alternate-nodes ; -M: integer nfa-node ( node -- ) +M: integer nfa-node ( node -- start end ) case-insensitive option? [ dup [ ch>lower ] [ ch>upper ] bi 2dup = [ @@ -131,26 +100,26 @@ M: integer nfa-node ( node -- ) literal-transition add-simple-entry ] [ [ literal-transition add-simple-entry ] bi@ - alternate-nodes drop + alternate-nodes [ nip ] dip ] if ] [ literal-transition add-simple-entry ] if ; -M: primitive-class nfa-node ( node -- ) +M: primitive-class nfa-node ( node -- start end ) class>> dup { letter-class LETTER-class } member? case-insensitive option? and [ drop Letter-class ] when class-transition add-simple-entry ; -M: any-char nfa-node ( node -- ) +M: any-char nfa-node ( node -- start end ) [ dotall option? ] dip any-char-no-nl ? class-transition add-simple-entry ; -M: negation nfa-node ( node -- ) +M: negation nfa-node ( node -- start end ) negate term>> nfa-node negate ; -M: range nfa-node ( node -- ) +M: range nfa-node ( node -- start end ) case-insensitive option? [ ! This should be implemented for Unicode by case-folding ! the input and all strings in the regexp. @@ -169,15 +138,16 @@ M: range nfa-node ( node -- ) class-transition add-simple-entry ] if ; -M: with-options nfa-node ( node -- ) +M: with-options nfa-node ( node -- start end ) dup options>> [ tree>> nfa-node ] using-options ; : construct-nfa ( ast -- nfa-table ) [ negated? off - V{ } clone combine-stack set 0 state set clone nfa-table set nfa-node - set-start-state + table + swap dup associate >>final-states + swap >>start-state ] with-scope ; diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index dbd37f2d8e..6b2f28dbf6 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -4,6 +4,7 @@ USING: peg.ebnf kernel math.parser sequences assocs arrays fry math combinators regexp.classes strings splitting peg locals accessors regexp.ast ; IN: regexp.parser + : allowed-char? ( ch -- ? ) ".()|[*+?" member? not ; @@ -130,6 +131,7 @@ Parenthized = "?:" Alternation:a => [[ a ]] | "?" Options:on "-"? Options:off ":" Alternation:a => [[ a on off parse-options ]] | "?#" [^)]* => [[ f ]] + | "?~" Alternation:a => [[ a ]] | Alternation Element = "(" Parenthized:p ")" => [[ p ]] diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index b6fd32a245..189d430d85 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -4,14 +4,15 @@ USING: accessors combinators kernel math sequences strings sets assocs prettyprint.backend prettyprint.custom make lexer namespaces parser arrays fry locals regexp.minimize regexp.parser regexp.nfa regexp.dfa regexp.traversal -regexp.transition-tables splitting sorting regexp.ast ; +regexp.transition-tables splitting sorting regexp.ast +regexp.negation ; IN: regexp TUPLE: regexp raw parse-tree options dfa ; : ( string options -- regexp ) [ dup parse-regexp ] [ string>options ] bi* - 2dup construct-nfa construct-dfa minimize + 2dup ast>dfa regexp boa ; : ( string -- regexp ) "" ; From 478c1d2928ca3eb6c78c04bb7f3a4d75e5bc4e5b Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Thu, 19 Feb 2009 16:50:55 -0600 Subject: [PATCH 009/125] Assocs stack effect fix --- core/assocs/assocs.factor | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/assocs/assocs.factor b/core/assocs/assocs.factor index e46bb7abb6..f2a04dc01b 100755 --- a/core/assocs/assocs.factor +++ b/core/assocs/assocs.factor @@ -38,7 +38,7 @@ M: assoc assoc-like drop ; : substituter ( assoc -- quot ) [ dupd at* [ nip ] [ drop ] if ] curry ; inline -: with-assoc ( assoc quot: ( value key -- assoc ) -- quot: ( key value -- ) ) +: with-assoc ( assoc quot: ( value key assoc -- ) -- quot: ( key value -- ) ) curry [ swap ] prepose ; inline PRIVATE> From f535b66aedc9d79fa0da69a36017356e16d6dc15 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Thu, 19 Feb 2009 18:28:54 -0600 Subject: [PATCH 010/125] Negation almost complete in regexp --- basis/regexp/ast/ast.factor | 7 ++---- basis/regexp/classes/classes.factor | 6 ++++++ basis/regexp/negation/negation.factor | 31 ++++++++++++++++++++++++--- basis/regexp/nfa/nfa.factor | 11 +++++----- basis/regexp/parser/parser.factor | 6 +++--- 5 files changed, 45 insertions(+), 16 deletions(-) diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor index ad67d76d12..e1308f0855 100644 --- a/basis/regexp/ast/ast.factor +++ b/basis/regexp/ast/ast.factor @@ -1,12 +1,9 @@ ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: kernel arrays accessors fry sequences ; +USING: kernel arrays accessors fry sequences regexp.classes ; FROM: math.ranges => [a,b] ; IN: regexp.ast -TUPLE: primitive-class class ; -C: primitive-class - TUPLE: negation term ; C: negation @@ -56,4 +53,4 @@ M: from-to [ n>> ] [ m>> ] bi [a,b] swap '[ _ repetition ] map ; : char-class ( ranges ? -- term ) - [ ] dip [ ] when ; + [ ] dip [ ] when ; diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index 44f33f9fcf..aaa650726c 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -119,8 +119,14 @@ C: or-class TUPLE: not-class class ; C: not-class +TUPLE: primitive-class class ; +C: primitive-class + M: or-class class-member? seq>> [ class-member? ] with any? ; M: not-class class-member? class>> class-member? not ; + +M: primitive-class class-member? + class>> class-member? ; diff --git a/basis/regexp/negation/negation.factor b/basis/regexp/negation/negation.factor index 5a9f772581..6b0e6b519e 100644 --- a/basis/regexp/negation/negation.factor +++ b/basis/regexp/negation/negation.factor @@ -1,7 +1,8 @@ ! Copyright (C) 2009 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: regexp.nfa regexp.dfa regexp.minimize kernel sequences -assocs regexp.classes hashtables accessors ; +assocs regexp.classes hashtables accessors fry vectors +regexp.ast regexp.transition-tables ; IN: regexp.negation : ast>dfa ( parse-tree -- minimal-dfa ) @@ -32,5 +33,29 @@ CONSTANT: fail-state -1 [ add-fail-state ] change-transitions dup inverse-final-states >>final-states ; -! M: negation nfa-node ( node -- ) -! ast>dfa negate-table adjoin-dfa ; +: renumber-transitions ( transitions numbering -- new-transitions ) + dup '[ + [ _ at ] + [ [ [ _ at ] map ] assoc-map ] bi* + ] assoc-map ; + +: renumber-states ( transition-table -- transition-table ) + dup transitions>> keys [ next-state ] H{ } map>assoc + [ renumber-transitions ] rewrite-transitions ; + +: box-transitions ( transition-table -- transition-table ) + [ [ [ 1vector ] assoc-map ] assoc-map ] change-transitions ; + +: unify-final-state ( transition-table -- transition-table ) + dup [ final-states>> keys ] keep + '[ -1 eps _ add-transition ] each + H{ { -1 -1 } } >>final-states ; + +: adjoin-dfa ( transition-table -- start end ) + box-transitions unify-final-state renumber-states + [ start-state>> ] + [ final-states>> keys first ] + [ table [ transitions>> ] bi@ swap update ] tri ; + +M: negation nfa-node ( node -- start end ) + term>> ast>dfa negate-table adjoin-dfa ; diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index c759ffdf98..6775124e60 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -102,9 +102,7 @@ M: integer nfa-node ( node -- start end ) [ literal-transition add-simple-entry ] bi@ alternate-nodes [ nip ] dip ] if - ] [ - literal-transition add-simple-entry - ] if ; + ] [ literal-transition add-simple-entry ] if ; M: primitive-class nfa-node ( node -- start end ) class>> dup @@ -112,12 +110,15 @@ M: primitive-class nfa-node ( node -- start end ) [ drop Letter-class ] when class-transition add-simple-entry ; +M: or-class nfa-node class-transition add-simple-entry ; +M: not-class nfa-node class-transition add-simple-entry ; + M: any-char nfa-node ( node -- start end ) [ dotall option? ] dip any-char-no-nl ? class-transition add-simple-entry ; -M: negation nfa-node ( node -- start end ) - negate term>> nfa-node negate ; +! M: negation nfa-node ( node -- start end ) +! negate term>> nfa-node negate ; M: range nfa-node ( node -- start end ) case-insensitive option? [ diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index 6b2f28dbf6..3a7ba12552 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -47,11 +47,11 @@ ERROR: bad-class name ; { CHAR: \\ [ CHAR: \\ ] } { CHAR: w [ c-identifier-class ] } - { CHAR: W [ c-identifier-class ] } + { CHAR: W [ c-identifier-class ] } { CHAR: s [ java-blank-class ] } - { CHAR: S [ java-blank-class ] } + { CHAR: S [ java-blank-class ] } { CHAR: d [ digit-class ] } - { CHAR: D [ digit-class ] } + { CHAR: D [ digit-class ] } [ ] } case ; From e41cdf5e8f6a848df14a015b70ca18612b630c35 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Fri, 20 Feb 2009 17:54:48 -0600 Subject: [PATCH 011/125] Various unfinshed regexp changes --- basis/regexp/ast/ast.factor | 8 +- basis/regexp/classes/classes.factor | 60 +++++++------ basis/regexp/dfa/dfa.factor | 31 ++++++- basis/regexp/nfa/nfa.factor | 126 +++++++++++++++------------- basis/regexp/parser/parser.factor | 6 +- basis/regexp/regexp-tests.factor | 16 ++++ 6 files changed, 153 insertions(+), 94 deletions(-) diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor index e1308f0855..65748005f4 100644 --- a/basis/regexp/ast/ast.factor +++ b/basis/regexp/ast/ast.factor @@ -18,7 +18,7 @@ SINGLETON: epsilon TUPLE: concatenation first second ; : ( seq -- concatenation ) - epsilon [ concatenation boa ] reduce ; + [ epsilon ] [ unclip [ concatenation boa ] reduce ] if-empty ; TUPLE: alternation first second ; @@ -54,3 +54,9 @@ M: from-to : char-class ( ranges ? -- term ) [ ] dip [ ] when ; + +TUPLE: lookahead term ; +C: lookahead + +TUPLE: lookbehind term ; +C: lookbehind diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index aaa650726c..516b6b4a1d 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -4,28 +4,6 @@ USING: accessors kernel math math.order words ascii unicode.categories combinators.short-circuit sequences ; IN: regexp.classes -: punct? ( ch -- ? ) - "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ; - -: c-identifier-char? ( ch -- ? ) - { [ alpha? ] [ CHAR: _ = ] } 1|| ; - -: java-blank? ( ch -- ? ) - { - CHAR: \s CHAR: \t CHAR: \n - HEX: b HEX: 7 CHAR: \r - } member? ; - -: java-printable? ( ch -- ? ) - [ [ alpha? ] [ punct? ] ] 1|| ; - -: hex-digit? ( ch -- ? ) - { - [ CHAR: A CHAR: F between? ] - [ CHAR: a CHAR: f between? ] - [ CHAR: 0 CHAR: 9 between? ] - } 1|| ; - SINGLETONS: any-char any-char-no-nl letter-class LETTER-class Letter-class digit-class alpha-class non-newline-blank-class @@ -70,16 +48,24 @@ M: ascii-class class-member? ( obj class -- ? ) M: digit-class class-member? ( obj class -- ? ) drop digit? ; +: c-identifier-char? ( ch -- ? ) + { [ alpha? ] [ CHAR: _ = ] } 1|| ; + M: c-identifier-class class-member? ( obj class -- ? ) - drop - { [ digit? ] [ Letter? ] [ CHAR: _ = ] } 1|| ; + drop c-identifier-char? ; M: alpha-class class-member? ( obj class -- ? ) drop alpha? ; +: punct? ( ch -- ? ) + "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ; + M: punctuation-class class-member? ( obj class -- ? ) drop punct? ; +: java-printable? ( ch -- ? ) + { [ alpha? ] [ punct? ] } 1|| ; + M: java-printable-class class-member? ( obj class -- ? ) drop java-printable? ; @@ -89,9 +75,22 @@ M: non-newline-blank-class class-member? ( obj class -- ? ) M: control-character-class class-member? ( obj class -- ? ) drop control? ; +: hex-digit? ( ch -- ? ) + { + [ CHAR: A CHAR: F between? ] + [ CHAR: a CHAR: f between? ] + [ CHAR: 0 CHAR: 9 between? ] + } 1|| ; + M: hex-digit-class class-member? ( obj class -- ? ) drop hex-digit? ; +: java-blank? ( ch -- ? ) + { + CHAR: \s CHAR: \t CHAR: \n + HEX: b HEX: 7 CHAR: \r + } member? ; + M: java-blank-class class-member? ( obj class -- ? ) drop java-blank? ; @@ -99,13 +98,7 @@ M: unmatchable-class class-member? ( obj class -- ? ) 2drop f ; M: terminator-class class-member? ( obj class -- ? ) - drop { - [ CHAR: \r = ] - [ CHAR: \n = ] - [ CHAR: \u000085 = ] - [ CHAR: \u002028 = ] - [ CHAR: \u002029 = ] - } 1|| ; + drop "\r\n\u000085\u002029\u002028" member? ; M: beginning-of-line class-member? ( obj class -- ? ) 2drop f ; @@ -119,6 +112,9 @@ C: or-class TUPLE: not-class class ; C: not-class +: ( classes -- class ) + [ ] map ; + TUPLE: primitive-class class ; C: primitive-class @@ -130,3 +126,5 @@ M: not-class class-member? M: primitive-class class-member? class>> class-member? ; + +UNION: class primitive-class not-class or-class range ; diff --git a/basis/regexp/dfa/dfa.factor b/basis/regexp/dfa/dfa.factor index 88e4e8f9ff..9834ca4ca0 100644 --- a/basis/regexp/dfa/dfa.factor +++ b/basis/regexp/dfa/dfa.factor @@ -2,7 +2,7 @@ ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays assocs combinators fry kernel locals math math.order regexp.nfa regexp.transition-tables sequences -sets sorting vectors sequences.deep ; +sets sorting vectors sequences.deep math.functions regexp.classes ; USING: io prettyprint threads ; IN: regexp.dfa @@ -17,6 +17,34 @@ IN: regexp.dfa : while-changes ( obj quot pred -- obj' ) 3dup nip call (while-changes) ; inline +TUPLE: parts in out ; + +: make-partition ( choices classes -- partition ) + zip [ first ] partition parts boa ; + +: powerset-partition ( classes -- partitions ) + ! Here is where class algebra will happen, when I implement it + [ length [ 2^ ] keep ] keep '[ + _ [ ] map-bits _ make-partition + ] map ; + +: partition>class ( parts -- class ) + [ in>> ] [ out>> ] bi + [ ] bi@ 2array ; + +: get-transitions ( partition state-transitions -- next-states ) + [ in>> ] dip '[ at ] gather ; + +: disambiguate-overlap ( nfa -- nfa' ) + [ + [ + [ keys powerset-partition ] keep '[ + [ partition>class ] + [ _ get-transitions ] bi + ] H{ } map>assoc + ] assoc-map + ] change-transitions ; + : find-delta ( states transition nfa -- new-states ) transitions>> '[ _ swap _ at at ] gather sift ; @@ -72,6 +100,7 @@ IN: regexp.dfa swap find-start-state >>start-state ; : construct-dfa ( nfa -- dfa ) + disambiguate-overlap dup initialize-dfa dup start-state>> 1vector H{ } clone diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index 6775124e60..370b354276 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -3,17 +3,26 @@ USING: accessors arrays assocs grouping kernel locals math namespaces sequences fry quotations math.order math.ranges vectors unicode.categories -regexp.transition-tables words sets hashtables +regexp.transition-tables words sets hashtables combinators.short-circuit unicode.case.private regexp.ast regexp.classes ; +IN: regexp.nfa + ! This uses unicode.case.private for ch>upper and ch>lower ! but case-insensitive matching should be done by case-folding everything ! before processing starts -IN: regexp.nfa -SYMBOL: negated? +GENERIC: remove-lookahead ( syntax-tree -- syntax-tree' ) +! This is unfinished and does nothing right now! -: negate ( -- ) - negated? [ not ] change ; +M: object remove-lookahead ; + +M: with-options remove-lookahead + [ tree>> remove-lookahead ] [ options>> ] bi ; + +M: alternation remove-lookahead + [ first>> ] [ second>> ] bi [ remove-lookahead ] bi@ ; + +M: concatenation remove-lookahead ; SINGLETON: eps @@ -45,16 +54,9 @@ SYMBOL: nfa-table GENERIC: nfa-node ( node -- start-state end-state ) -:: add-simple-entry ( obj class -- start-state end-state ) - next-state :> s0 - next-state :> s1 - negated? get [ - s0 f obj class make-transition table add-transition - s0 s1 table add-transition - ] [ - s0 s1 obj class make-transition table add-transition - ] if - s0 s1 ; +: add-simple-entry ( obj class -- start-state end-state ) + [ next-state next-state 2dup ] 2dip + make-transition table add-transition ; : epsilon-transition ( source target -- ) eps table add-transition ; @@ -92,62 +94,66 @@ M: alternation nfa-node ( node -- start end ) [ nfa-node ] bi@ alternate-nodes ; +GENERIC: modify-class ( char-class -- char-class' ) + +M: object modify-class ; + +M: integer modify-class + case-insensitive option? [ + dup Letter? [ + [ ch>lower ] [ ch>upper ] bi 2array + ] when + ] when ; + M: integer nfa-node ( node -- start end ) + modify-class dup class? + class-transition literal-transition ? + add-simple-entry ; + +M: primitive-class modify-class + class>> modify-class ; + +M: or-class modify-class + seq>> [ modify-class ] map ; + +M: not-class modify-class + class>> modify-class ; + +M: any-char modify-class + [ dotall option? ] dip any-char-no-nl ? ; + +: modify-letter-class ( class -- newclass ) + case-insensitive option? [ drop Letter-class ] when ; +M: letter-class modify-class modify-letter-class ; +M: LETTER-class modify-class modify-letter-class ; + +: cased-range? ( range -- ? ) + [ from>> ] [ to>> ] bi { + [ [ letter? ] bi@ and ] + [ [ LETTER? ] bi@ and ] + } 2|| ; + +M: range modify-class case-insensitive option? [ - dup [ ch>lower ] [ ch>upper ] bi - 2dup = [ - 2drop - literal-transition add-simple-entry - ] [ - [ literal-transition add-simple-entry ] bi@ - alternate-nodes [ nip ] dip - ] if - ] [ literal-transition add-simple-entry ] if ; - -M: primitive-class nfa-node ( node -- start end ) - class>> dup - { letter-class LETTER-class } member? case-insensitive option? and - [ drop Letter-class ] when - class-transition add-simple-entry ; - -M: or-class nfa-node class-transition add-simple-entry ; -M: not-class nfa-node class-transition add-simple-entry ; - -M: any-char nfa-node ( node -- start end ) - [ dotall option? ] dip any-char-no-nl ? - class-transition add-simple-entry ; - -! M: negation nfa-node ( node -- start end ) -! negate term>> nfa-node negate ; - -M: range nfa-node ( node -- start end ) - case-insensitive option? [ - ! This should be implemented for Unicode by case-folding - ! the input and all strings in the regexp. - dup [ from>> ] [ to>> ] bi - 2dup [ Letter? ] bi@ and [ - rot drop + dup cased-range? [ + [ from>> ] [ to>> ] bi [ [ ch>lower ] bi@ ] [ [ ch>upper ] bi@ ] 2bi - [ class-transition add-simple-entry ] bi@ - alternate-nodes - ] [ - 2drop - class-transition add-simple-entry - ] if - ] [ - class-transition add-simple-entry - ] if ; + 2array + ] when + ] when ; + +M: class nfa-node + modify-class class-transition add-simple-entry ; M: with-options nfa-node ( node -- start end ) dup options>> [ tree>> nfa-node ] using-options ; : construct-nfa ( ast -- nfa-table ) [ - negated? off 0 state set - clone nfa-table set - nfa-node + nfa-table set + remove-lookahead nfa-node table swap dup associate >>final-states swap >>start-state diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index 3a7ba12552..18b43674c4 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -132,11 +132,15 @@ Parenthized = "?:" Alternation:a => [[ a ]] => [[ a on off parse-options ]] | "?#" [^)]* => [[ f ]] | "?~" Alternation:a => [[ a ]] + | "?=" Alternation:a => [[ a ]] + | "?!" Alternation:a => [[ a ]] + | "?<=" Alternation:a => [[ a ]] + | "? [[ a ]] | Alternation Element = "(" Parenthized:p ")" => [[ p ]] | "[" CharClass:r "]" => [[ r ]] - | ".":d => [[ any-char ]] + | ".":d => [[ any-char ]] | Character Number = (!(","|"}").)* => [[ string>number ensure-number ]] diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index 4331eaa250..0d9ed129c8 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -317,6 +317,22 @@ IN: regexp-tests ! Bug in parsing word [ t ] [ "a" R' a' matches? ] unit-test +! Testing negation +[ f ] [ "a" R/ (?~a)/ matches? ] unit-test +[ t ] [ "aa" R/ (?~a)/ matches? ] unit-test +[ t ] [ "bb" R/ (?~a)/ matches? ] unit-test +[ t ] [ "" R/ (?~a)/ matches? ] unit-test + +[ f ] [ "a" R/ (?~a+|b)/ matches? ] unit-test +[ f ] [ "aa" R/ (?~a+|b)/ matches? ] unit-test +[ t ] [ "bb" R/ (?~a+|b)/ matches? ] unit-test +[ f ] [ "b" R/ (?~a+|b)/ matches? ] unit-test +[ t ] [ "" R/ (?~a+|b)/ matches? ] unit-test + +! Intersecting classes +[ t ] [ "ab" R/ ac|\p{Lower}b/ matches? ] unit-test +[ t ] [ "ab" R/ ac|[a-z]b/ matches? ] unit-test + ! [ t ] [ "a" R/ ^a/ matches? ] unit-test ! [ f ] [ "\na" R/ ^a/ matches? ] unit-test ! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test From 041be23cdc102582e9a78d7357bec7c13e3561b1 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Fri, 20 Feb 2009 18:45:24 -0600 Subject: [PATCH 012/125] trivial change in regexp --- basis/regexp/parser/parser.factor | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index 18b43674c4..56c6b1eb04 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -83,7 +83,7 @@ ERROR: bad-class name ; : options>string ( options -- string ) [ on>> ] [ off>> ] bi [ [ option>ch ] map ] bi@ - [ "-" swap 3append ] unless-empty + [ "-" glue ] unless-empty "" like ; ! TODO: add syntax for various parenthized things, From be177fefa0a2657a4fa468da2be69bba9789d7d3 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Sat, 21 Feb 2009 12:09:41 -0600 Subject: [PATCH 013/125] Disambiguation of overlapping regexp transitions --- basis/regexp/classes/classes.factor | 37 +++++++++++++++--- basis/regexp/dfa/dfa.factor | 35 ++--------------- basis/regexp/disambiguate/disambiguate.factor | 38 +++++++++++++++++++ basis/regexp/negation/negation.factor | 6 +-- basis/regexp/nfa/nfa.factor | 2 +- 5 files changed, 77 insertions(+), 41 deletions(-) create mode 100644 basis/regexp/disambiguate/disambiguate.factor diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index 516b6b4a1d..c7106c9154 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -1,6 +1,6 @@ ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors kernel math math.order words +USING: accessors kernel math math.order words combinators ascii unicode.categories combinators.short-circuit sequences ; IN: regexp.classes @@ -107,20 +107,47 @@ M: end-of-line class-member? ( obj class -- ? ) 2drop f ; TUPLE: or-class seq ; -C: or-class TUPLE: not-class class ; -C: not-class -: ( classes -- class ) - [ ] map ; +TUPLE: and-class seq ; TUPLE: primitive-class class ; C: primitive-class +: ( seq -- class ) + t swap remove + f over member? [ drop f ] [ + dup length { + { 0 [ drop t ] } + { 1 [ first ] } + [ drop and-class boa ] + } case + ] if ; + +M: and-class class-member? + seq>> [ class-member? ] with all? ; + +: ( seq -- class ) + f swap remove + t over member? [ drop t ] [ + dup length { + { 0 [ drop f ] } + { 1 [ first ] } + [ drop or-class boa ] + } case + ] if ; + M: or-class class-member? seq>> [ class-member? ] with any? ; +: ( class -- inverse ) + { + { t [ f ] } + { f [ t ] } + [ not-class boa ] + } case ; + M: not-class class-member? class>> class-member? not ; diff --git a/basis/regexp/dfa/dfa.factor b/basis/regexp/dfa/dfa.factor index 9834ca4ca0..8c2e995163 100644 --- a/basis/regexp/dfa/dfa.factor +++ b/basis/regexp/dfa/dfa.factor @@ -2,8 +2,7 @@ ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays assocs combinators fry kernel locals math math.order regexp.nfa regexp.transition-tables sequences -sets sorting vectors sequences.deep math.functions regexp.classes ; -USING: io prettyprint threads ; +sets sorting vectors ; IN: regexp.dfa :: (while-changes) ( obj quot: ( obj -- obj' ) comp: ( obj -- key ) old-key -- obj ) @@ -17,34 +16,6 @@ IN: regexp.dfa : while-changes ( obj quot pred -- obj' ) 3dup nip call (while-changes) ; inline -TUPLE: parts in out ; - -: make-partition ( choices classes -- partition ) - zip [ first ] partition parts boa ; - -: powerset-partition ( classes -- partitions ) - ! Here is where class algebra will happen, when I implement it - [ length [ 2^ ] keep ] keep '[ - _ [ ] map-bits _ make-partition - ] map ; - -: partition>class ( parts -- class ) - [ in>> ] [ out>> ] bi - [ ] bi@ 2array ; - -: get-transitions ( partition state-transitions -- next-states ) - [ in>> ] dip '[ at ] gather ; - -: disambiguate-overlap ( nfa -- nfa' ) - [ - [ - [ keys powerset-partition ] keep '[ - [ partition>class ] - [ _ get-transitions ] bi - ] H{ } map>assoc - ] assoc-map - ] change-transitions ; - : find-delta ( states transition nfa -- new-states ) transitions>> '[ _ swap _ at at ] gather sift ; @@ -85,7 +56,8 @@ TUPLE: parts in out ; : states ( hashtable -- array ) [ keys ] - [ values [ values concat ] map concat append ] bi ; + [ values [ values concat ] map concat ] bi + append ; : set-final-states ( nfa dfa -- ) [ @@ -100,7 +72,6 @@ TUPLE: parts in out ; swap find-start-state >>start-state ; : construct-dfa ( nfa -- dfa ) - disambiguate-overlap dup initialize-dfa dup start-state>> 1vector H{ } clone diff --git a/basis/regexp/disambiguate/disambiguate.factor b/basis/regexp/disambiguate/disambiguate.factor new file mode 100644 index 0000000000..2e26e43625 --- /dev/null +++ b/basis/regexp/disambiguate/disambiguate.factor @@ -0,0 +1,38 @@ +! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: kernel accessors regexp.classes math.bits assocs sequences +arrays sets regexp.dfa math fry regexp.minimize ; +IN: regexp.disambiguate + +TUPLE: parts in out ; + +: make-partition ( choices classes -- partition ) + zip [ first ] partition [ values ] bi@ parts boa ; + +: powerset-partition ( classes -- partitions ) + [ length [ 2^ ] keep ] keep '[ + _ _ make-partition + ] map ; + +: partition>class ( parts -- class ) + [ in>> ] [ out>> ] bi + [ ] bi@ 2array ; + +: get-transitions ( partition state-transitions -- next-states ) + [ in>> ] dip '[ _ at ] map prune ; + +: disambiguate ( dfa -- nfa ) + [ + [ + [ keys powerset-partition ] keep '[ + [ partition>class ] + [ _ get-transitions ] bi + ] H{ } map>assoc + [ drop ] assoc-filter + ] assoc-map + ] change-transitions ; + +: nfa>dfa ( nfa -- dfa ) + construct-dfa + minimize disambiguate + construct-dfa minimize ; diff --git a/basis/regexp/negation/negation.factor b/basis/regexp/negation/negation.factor index 6b0e6b519e..f235dc1bf5 100644 --- a/basis/regexp/negation/negation.factor +++ b/basis/regexp/negation/negation.factor @@ -1,12 +1,12 @@ ! Copyright (C) 2009 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: regexp.nfa regexp.dfa regexp.minimize kernel sequences +USING: regexp.nfa regexp.disambiguate kernel sequences assocs regexp.classes hashtables accessors fry vectors -regexp.ast regexp.transition-tables ; +regexp.ast regexp.transition-tables regexp.minimize ; IN: regexp.negation : ast>dfa ( parse-tree -- minimal-dfa ) - construct-nfa construct-dfa minimize ; + construct-nfa nfa>dfa ; CONSTANT: fail-state -1 diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index 370b354276..eff023c278 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -20,7 +20,7 @@ M: with-options remove-lookahead [ tree>> remove-lookahead ] [ options>> ] bi ; M: alternation remove-lookahead - [ first>> ] [ second>> ] bi [ remove-lookahead ] bi@ ; + [ first>> ] [ second>> ] bi [ remove-lookahead ] bi@ alternation boa ; M: concatenation remove-lookahead ; From 88f9b3ea9270d762567aa54b9882e81eeeff51f4 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Sat, 21 Feb 2009 17:13:11 -0600 Subject: [PATCH 014/125] Work on class algebra for regexp --- basis/regexp/classes/classes.factor | 76 ++++++++++++++----- basis/regexp/disambiguate/disambiguate.factor | 2 +- 2 files changed, 58 insertions(+), 20 deletions(-) diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index c7106c9154..8d235daf07 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -1,7 +1,8 @@ ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors kernel math math.order words combinators +USING: accessors kernel math math.order words combinators locals ascii unicode.categories combinators.short-circuit sequences ; +QUALIFIED-WITH: multi-methods m IN: regexp.classes SINGLETONS: any-char any-char-no-nl @@ -106,37 +107,74 @@ M: beginning-of-line class-member? ( obj class -- ? ) M: end-of-line class-member? ( obj class -- ? ) 2drop f ; +M: f class-member? 2drop f ; + +TUPLE: primitive-class class ; +C: primitive-class + TUPLE: or-class seq ; TUPLE: not-class class ; TUPLE: and-class seq ; -TUPLE: primitive-class class ; -C: primitive-class +m:GENERIC: combine-and ( class1 class2 -- combined ? ) + +m:METHOD: combine-and { object object } 2drop f f ; + +m:METHOD: combine-and { integer integer } + 2dup = [ drop t ] [ 2drop f t ] if ; + +m:METHOD: combine-and { t object } + nip t ; + +m:METHOD: combine-and { f object } + drop t ; + +m:METHOD: combine-and { integer object } + 2dup class-member? [ drop t ] [ 2drop f t ] if ; + +m:GENERIC: combine-or ( class1 class2 -- combined ? ) + +m:METHOD: combine-or { object object } 2drop f f ; + +m:METHOD: combine-or { integer integer } + 2dup = [ drop t ] [ 2drop f f ] if ; + +m:METHOD: combine-or { t object } + drop t ; + +m:METHOD: combine-or { f object } + nip t ; + +m:METHOD: combine-or { integer object } + 2dup class-member? [ nip t ] [ 2drop f f ] if ; + +: try-combine ( elt1 elt2 quot -- combined/f ? ) + 3dup call [ [ 3drop ] dip t ] [ drop swapd call ] if ; inline + +:: prefix-combining ( seq elt quot: ( elt1 elt2 -- combined/f ? ) -- newseq ) + f :> combined! + seq [ elt quot try-combine swap combined! ] find drop + [ seq remove-nth combined prefix ] + [ seq elt prefix ] if* ; inline + +:: combine ( seq quot: ( elt1 elt2 -- combined/f ? ) empty class -- newseq ) + seq { } [ quot prefix-combining ] reduce + dup length { + { 0 [ drop empty ] } + { 1 [ first ] } + [ drop class new swap >>seq ] + } case ; inline : ( seq -- class ) - t swap remove - f over member? [ drop f ] [ - dup length { - { 0 [ drop t ] } - { 1 [ first ] } - [ drop and-class boa ] - } case - ] if ; + [ combine-and ] t and-class combine ; M: and-class class-member? seq>> [ class-member? ] with all? ; : ( seq -- class ) - f swap remove - t over member? [ drop t ] [ - dup length { - { 0 [ drop f ] } - { 1 [ first ] } - [ drop or-class boa ] - } case - ] if ; + [ combine-or ] t or-class combine ; M: or-class class-member? seq>> [ class-member? ] with any? ; diff --git a/basis/regexp/disambiguate/disambiguate.factor b/basis/regexp/disambiguate/disambiguate.factor index 2e26e43625..1243ab7cc1 100644 --- a/basis/regexp/disambiguate/disambiguate.factor +++ b/basis/regexp/disambiguate/disambiguate.factor @@ -1,4 +1,4 @@ -! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. +! Copyright (C) 2009 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: kernel accessors regexp.classes math.bits assocs sequences arrays sets regexp.dfa math fry regexp.minimize ; From 2dcbd5b3db15e16464f4057dc5578900216dd056 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sun, 22 Feb 2009 21:26:16 -0600 Subject: [PATCH 015/125] fix docs for a word --- core/io/encodings/encodings-docs.factor | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/core/io/encodings/encodings-docs.factor b/core/io/encodings/encodings-docs.factor index 509757c68a..e13e05bf40 100644 --- a/core/io/encodings/encodings-docs.factor +++ b/core/io/encodings/encodings-docs.factor @@ -1,4 +1,4 @@ -USING: help.markup help.syntax io quotations ; +USING: help.markup help.syntax io quotations math ; IN: io.encodings HELP: @@ -71,6 +71,9 @@ HELP: with-encoded-output { $description "Creates a new encoder with the given encoding descriptor and calls the quotation using this encoder. The original encoder object is restored after the quotation returns and the stream is kept open for future output operations." } ; HELP: replacement-char +{ $values + { "value" integer } +} { $description "A code point that replaces input that could not be decoded. The presence of this character in the decoded data usually signifies an error." } ; ARTICLE: "encodings-descriptors" "Encoding descriptors" From a4817a0e1712f0b1c521dc3a22de84f45493398c Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Mon, 23 Feb 2009 08:37:38 -0600 Subject: [PATCH 016/125] dont run postgresql tests on win64 --- basis/db/errors/postgresql/postgresql-tests.factor | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/basis/db/errors/postgresql/postgresql-tests.factor b/basis/db/errors/postgresql/postgresql-tests.factor index 9dbebe0712..f6668031e5 100644 --- a/basis/db/errors/postgresql/postgresql-tests.factor +++ b/basis/db/errors/postgresql/postgresql-tests.factor @@ -5,7 +5,7 @@ db.errors.postgresql db.postgresql io.files.unique kernel namespaces tools.test db.tester continuations ; IN: db.errors.postgresql.tests -postgresql-test-db [ +[ [ "drop table foo;" sql-command ] ignore-errors [ "drop table ship;" sql-command ] ignore-errors @@ -29,4 +29,4 @@ postgresql-test-db [ sql-syntax-error? ] must-fail-with -] with-db +] test-postgresql From c3ef25f81c1a8b0a11b8ad5ac5c214f482a30dfd Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Mon, 23 Feb 2009 10:35:42 -0600 Subject: [PATCH 017/125] made editors.emacs load windows file on windows --- basis/editors/emacs/emacs.factor | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/basis/editors/emacs/emacs.factor b/basis/editors/emacs/emacs.factor index fa717a70fa..05b879770e 100644 --- a/basis/editors/emacs/emacs.factor +++ b/basis/editors/emacs/emacs.factor @@ -1,6 +1,6 @@ USING: definitions io.launcher kernel parser words sequences math math.parser namespaces editors make system combinators.short-circuit -fry threads ; +fry threads vocabs.loader ; IN: editors.emacs SYMBOL: emacsclient-path @@ -22,3 +22,5 @@ M: object default-emacsclient ( -- path ) "emacsclient" ; where first2 emacsclient ; [ emacsclient ] edit-hook set-global + +os windows? [ "editors.emacs.windows" require ] when From ba1ac44176858138cd81fe5d96b6e6dcac3a522e Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Mon, 23 Feb 2009 13:10:38 -0600 Subject: [PATCH 018/125] Disambiguation works completely in regexp --- basis/regexp/classes/classes-tests.factor | 25 +++++++++++++++++++ basis/regexp/classes/classes.factor | 20 ++++++++++----- basis/regexp/disambiguate/disambiguate.factor | 7 +++--- basis/regexp/negation/negation-tests.factor | 6 ++--- basis/regexp/negation/negation.factor | 8 +++--- basis/regexp/nfa/nfa.factor | 2 +- 6 files changed, 51 insertions(+), 17 deletions(-) create mode 100644 basis/regexp/classes/classes-tests.factor diff --git a/basis/regexp/classes/classes-tests.factor b/basis/regexp/classes/classes-tests.factor new file mode 100644 index 0000000000..4cbb2e7a57 --- /dev/null +++ b/basis/regexp/classes/classes-tests.factor @@ -0,0 +1,25 @@ +! Copyright (C) 2009 Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: regexp.classes tools.test arrays kernel ; +IN: regexp.classes.tests + +[ f ] [ { 1 2 } ] unit-test +[ T{ or-class f { 2 1 } } ] [ { 1 2 } ] unit-test +[ 3 ] [ { 1 2 } 3 2array ] unit-test +[ CHAR: A ] [ CHAR: A LETTER-class 2array ] unit-test +[ CHAR: A ] [ LETTER-class CHAR: A 2array ] unit-test +[ T{ primitive-class { class LETTER-class } } ] [ CHAR: A LETTER-class 2array ] unit-test +[ T{ primitive-class { class LETTER-class } } ] [ LETTER-class CHAR: A 2array ] unit-test +[ t ] [ { t 1 } ] unit-test +[ t ] [ { 1 t } ] unit-test +[ f ] [ { f 1 } ] unit-test +[ f ] [ { 1 f } ] unit-test +[ 1 ] [ { f 1 } ] unit-test +[ 1 ] [ { 1 f } ] unit-test +[ 1 ] [ { t 1 } ] unit-test +[ 1 ] [ { 1 t } ] unit-test +[ 1 ] [ 1 ] unit-test +[ 1 ] [ { 1 1 } ] unit-test +[ 1 ] [ { 1 1 } ] unit-test +[ T{ primitive-class { class letter-class } } ] [ letter-class dup 2array ] unit-test +[ T{ primitive-class { class letter-class } } ] [ letter-class dup 2array ] unit-test diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index 8d235daf07..6e68e9e0f6 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -20,8 +20,7 @@ C: range GENERIC: class-member? ( obj class -- ? ) -! When does t get put in? -M: t class-member? ( obj class -- ? ) 2drop f ; +M: t class-member? ( obj class -- ? ) 2drop t ; M: integer class-member? ( obj class -- ? ) = ; @@ -120,7 +119,10 @@ TUPLE: and-class seq ; m:GENERIC: combine-and ( class1 class2 -- combined ? ) -m:METHOD: combine-and { object object } 2drop f f ; +: replace-if-= ( object object -- object ? ) + over = ; + +m:METHOD: combine-and { object object } replace-if-= ; m:METHOD: combine-and { integer integer } 2dup = [ drop t ] [ 2drop f t ] if ; @@ -131,12 +133,15 @@ m:METHOD: combine-and { t object } m:METHOD: combine-and { f object } drop t ; +m:METHOD: combine-and { not-class object } + [ class>> ] dip = [ f t ] [ f f ] if ; + m:METHOD: combine-and { integer object } 2dup class-member? [ drop t ] [ 2drop f t ] if ; m:GENERIC: combine-or ( class1 class2 -- combined ? ) -m:METHOD: combine-or { object object } 2drop f f ; +m:METHOD: combine-or { object object } replace-if-= ; m:METHOD: combine-or { integer integer } 2dup = [ drop t ] [ 2drop f f ] if ; @@ -147,6 +152,9 @@ m:METHOD: combine-or { t object } m:METHOD: combine-or { f object } nip t ; +m:METHOD: combine-or { not-class object } + [ class>> ] dip = [ t t ] [ f f ] if ; + m:METHOD: combine-or { integer object } 2dup class-member? [ nip t ] [ 2drop f f ] if ; @@ -174,7 +182,7 @@ M: and-class class-member? seq>> [ class-member? ] with all? ; : ( seq -- class ) - [ combine-or ] t or-class combine ; + [ combine-or ] f or-class combine ; M: or-class class-member? seq>> [ class-member? ] with any? ; @@ -183,7 +191,7 @@ M: or-class class-member? { { t [ f ] } { f [ t ] } - [ not-class boa ] + [ dup not-class? [ class>> ] [ not-class boa ] if ] } case ; M: not-class class-member? diff --git a/basis/regexp/disambiguate/disambiguate.factor b/basis/regexp/disambiguate/disambiguate.factor index 1243ab7cc1..0b63351e0c 100644 --- a/basis/regexp/disambiguate/disambiguate.factor +++ b/basis/regexp/disambiguate/disambiguate.factor @@ -12,11 +12,12 @@ TUPLE: parts in out ; : powerset-partition ( classes -- partitions ) [ length [ 2^ ] keep ] keep '[ _ _ make-partition - ] map ; + ] map rest ; : partition>class ( parts -- class ) - [ in>> ] [ out>> ] bi - [ ] bi@ 2array ; + [ out>> [ ] map ] + [ in>> ] bi + prefix ; : get-transitions ( partition state-transitions -- next-states ) [ in>> ] dip '[ _ at ] map prune ; diff --git a/basis/regexp/negation/negation-tests.factor b/basis/regexp/negation/negation-tests.factor index 2dbca2e8d8..41dfe7f493 100644 --- a/basis/regexp/negation/negation-tests.factor +++ b/basis/regexp/negation/negation-tests.factor @@ -7,9 +7,9 @@ IN: regexp.negation.tests ! R/ |[^a]|.+/ T{ transition-table { transitions H{ - { 0 H{ { CHAR: a 1 } { T{ not-class f T{ or-class f { CHAR: a } } } -1 } } } - { 1 H{ { T{ not-class f T{ or-class f { } } } -1 } } } - { -1 H{ { any-char -1 } } } + { 0 H{ { CHAR: a 1 } { T{ not-class f CHAR: a } -1 } } } + { 1 H{ { t -1 } } } + { -1 H{ { t -1 } } } } } { start-state 0 } { final-states H{ { 0 0 } { -1 -1 } } } diff --git a/basis/regexp/negation/negation.factor b/basis/regexp/negation/negation.factor index f235dc1bf5..f5a43a2a5e 100644 --- a/basis/regexp/negation/negation.factor +++ b/basis/regexp/negation/negation.factor @@ -12,11 +12,11 @@ CONSTANT: fail-state -1 : add-default-transition ( state's-transitions -- new-state's-transitions ) clone dup - [ [ fail-state ] dip keys ] keep set-at ; + [ [ fail-state ] dip keys [ ] map ] keep set-at ; : fail-state-recurses ( transitions -- new-transitions ) clone dup - [ fail-state any-char associate fail-state ] dip set-at ; + [ fail-state t associate fail-state ] dip set-at ; : add-fail-state ( transitions -- new-transitions ) [ add-default-transition ] assoc-map @@ -48,8 +48,8 @@ CONSTANT: fail-state -1 : unify-final-state ( transition-table -- transition-table ) dup [ final-states>> keys ] keep - '[ -1 eps _ add-transition ] each - H{ { -1 -1 } } >>final-states ; + '[ -2 eps _ add-transition ] each + H{ { -2 -2 } } >>final-states ; : adjoin-dfa ( transition-table -- start end ) box-transitions unify-final-state renumber-states diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index eff023c278..72ce880f8b 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -120,7 +120,7 @@ M: not-class modify-class class>> modify-class ; M: any-char modify-class - [ dotall option? ] dip any-char-no-nl ? ; + drop dotall option? t any-char-no-nl ? ; : modify-letter-class ( class -- newclass ) case-insensitive option? [ drop Letter-class ] when ; From c708bfcbca96759c9049408b4922eb291d0207cb Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Wed, 25 Feb 2009 12:22:12 -0600 Subject: [PATCH 019/125] Various regexp changes, including the addition of regexp combinators --- basis/regexp/ast/ast.factor | 12 ++++- basis/regexp/classes/classes.factor | 7 ++- .../combinators/combinators-tests.factor | 29 +++++++++++ basis/regexp/combinators/combinators.factor | 48 +++++++++++++++++++ basis/regexp/dfa/dfa.factor | 6 +-- basis/regexp/negation/negation.factor | 6 +-- basis/regexp/nfa/nfa.factor | 15 +++--- basis/regexp/parser/parser.factor | 11 ++++- basis/regexp/regexp-tests.factor | 12 ++++- basis/regexp/regexp.factor | 21 ++++++-- 10 files changed, 139 insertions(+), 28 deletions(-) create mode 100644 basis/regexp/combinators/combinators-tests.factor create mode 100644 basis/regexp/combinators/combinators.factor diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor index 65748005f4..b804eacc09 100644 --- a/basis/regexp/ast/ast.factor +++ b/basis/regexp/ast/ast.factor @@ -13,7 +13,10 @@ C: from-to TUPLE: at-least n ; C: at-least -SINGLETON: epsilon +TUPLE: tagged-epsilon tag ; +C: tagged-epsilon + +CONSTANT: epsilon T{ tagged-epsilon } TUPLE: concatenation first second ; @@ -60,3 +63,10 @@ C: lookahead TUPLE: lookbehind term ; C: lookbehind + +TUPLE: possessive-star term ; +C: possessive-star + +: ( term -- term' ) + dup 2array ; + diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index 6e68e9e0f6..0990ac786b 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -12,8 +12,7 @@ ascii-class punctuation-class java-printable-class blank-class control-character-class hex-digit-class java-blank-class c-identifier-class unmatchable-class terminator-class word-boundary-class ; -SINGLETONS: beginning-of-input beginning-of-line -end-of-input end-of-line ; +SINGLETONS: beginning-of-input ^ end-of-input $ ; TUPLE: range from to ; C: range @@ -100,10 +99,10 @@ M: unmatchable-class class-member? ( obj class -- ? ) M: terminator-class class-member? ( obj class -- ? ) drop "\r\n\u000085\u002029\u002028" member? ; -M: beginning-of-line class-member? ( obj class -- ? ) +M: ^ class-member? ( obj class -- ? ) 2drop f ; -M: end-of-line class-member? ( obj class -- ? ) +M: $ class-member? ( obj class -- ? ) 2drop f ; M: f class-member? 2drop f ; diff --git a/basis/regexp/combinators/combinators-tests.factor b/basis/regexp/combinators/combinators-tests.factor new file mode 100644 index 0000000000..dc6b5a6567 --- /dev/null +++ b/basis/regexp/combinators/combinators-tests.factor @@ -0,0 +1,29 @@ +! Copyright (C) 2009 Daniel Ehrenberg +! See http://factorcode.org/license.txt for BSD license. +USING: regexp.combinators tools.test regexp kernel sequences ; +IN: regexp.combinators.tests + +: strings ( -- regexp ) + { "foo" "bar" "baz" } ; + +[ t t t ] [ "foo" "bar" "baz" [ strings matches? ] tri@ ] unit-test +[ f f f ] [ "food" "ibar" "ba" [ strings matches? ] tri@ ] unit-test + +: conj ( -- regexp ) + { R/ .*a/ R/ b.*/ } ; + +[ t ] [ "bljhasflsda" conj matches? ] unit-test +[ f ] [ "bsdfdfs" conj matches? ] unit-test ! why does this fail? +[ f ] [ "fsfa" conj matches? ] unit-test + +! For some reason, creating this DFA doesn't work +! [ f ] [ "bljhasflsda" conj matches? ] unit-test +! [ t ] [ "bsdfdfs" conj matches? ] unit-test +! [ t ] [ "fsfa" conj matches? ] unit-test + +[ f f ] [ "" "hi" [ matches? ] bi@ ] unit-test +[ t t ] [ "" "hi" [ matches? ] bi@ ] unit-test + +[ { t t t f } ] [ { "" "a" "aaaaa" "aab" } [ "a" matches? ] map ] unit-test +[ { f t t f } ] [ { "" "a" "aaaaa" "aab" } [ "a" matches? ] map ] unit-test +[ { t t f f } ] [ { "" "a" "aaaaa" "aab" } [ "a"