From 365334fc61d92351ff7de89bc669cc4359b31391 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Tue, 26 Aug 2008 20:24:14 -0500 Subject: [PATCH 1/2] move regexp2 to unfinished for binaries --- unfinished/regexp2/authors.txt | 1 + unfinished/regexp2/backend/backend.factor | 25 ++ unfinished/regexp2/classes/classes.factor | 55 +++ unfinished/regexp2/dfa/dfa.factor | 70 ++++ unfinished/regexp2/nfa/nfa.factor | 126 ++++++ unfinished/regexp2/parser/parser-tests.factor | 33 ++ unfinished/regexp2/parser/parser.factor | 391 ++++++++++++++++++ unfinished/regexp2/regexp2-docs.factor | 14 + unfinished/regexp2/regexp2-tests.factor | 263 ++++++++++++ unfinished/regexp2/regexp2.factor | 59 +++ unfinished/regexp2/summary.txt | 1 + unfinished/regexp2/tags.txt | 2 + .../transition-tables.factor | 44 ++ unfinished/regexp2/traversal/traversal.factor | 80 ++++ unfinished/regexp2/utils/utils.factor | 69 ++++ 15 files changed, 1233 insertions(+) create mode 100644 unfinished/regexp2/authors.txt create mode 100644 unfinished/regexp2/backend/backend.factor create mode 100644 unfinished/regexp2/classes/classes.factor create mode 100644 unfinished/regexp2/dfa/dfa.factor create mode 100644 unfinished/regexp2/nfa/nfa.factor create mode 100644 unfinished/regexp2/parser/parser-tests.factor create mode 100644 unfinished/regexp2/parser/parser.factor create mode 100644 unfinished/regexp2/regexp2-docs.factor create mode 100644 unfinished/regexp2/regexp2-tests.factor create mode 100644 unfinished/regexp2/regexp2.factor create mode 100644 unfinished/regexp2/summary.txt create mode 100644 unfinished/regexp2/tags.txt create mode 100644 unfinished/regexp2/transition-tables/transition-tables.factor create mode 100644 unfinished/regexp2/traversal/traversal.factor create mode 100644 unfinished/regexp2/utils/utils.factor diff --git a/unfinished/regexp2/authors.txt b/unfinished/regexp2/authors.txt new file mode 100644 index 0000000000..7c1b2f2279 --- /dev/null +++ b/unfinished/regexp2/authors.txt @@ -0,0 +1 @@ +Doug Coleman diff --git a/unfinished/regexp2/backend/backend.factor b/unfinished/regexp2/backend/backend.factor new file mode 100644 index 0000000000..c39d67e7b8 --- /dev/null +++ b/unfinished/regexp2/backend/backend.factor @@ -0,0 +1,25 @@ +! Copyright (C) 2008 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors hashtables kernel math state-tables vars vectors ; +IN: regexp2.backend + +TUPLE: regexp + raw + { stack vector } + parse-tree + { options hashtable } + nfa-table + dfa-table + minimized-table + { state integer } + { new-states vector } + { visited-states hashtable } ; + +: reset-regexp ( regexp -- regexp ) + 0 >>state + V{ } clone >>stack + V{ } clone >>new-states + H{ } clone >>options + H{ } clone >>visited-states ; + +SYMBOL: current-regexp diff --git a/unfinished/regexp2/classes/classes.factor b/unfinished/regexp2/classes/classes.factor new file mode 100644 index 0000000000..7737e02d40 --- /dev/null +++ b/unfinished/regexp2/classes/classes.factor @@ -0,0 +1,55 @@ +! Copyright (C) 2008 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors kernel math math.order symbols regexp2.parser +words regexp2.utils unicode.categories combinators.short-circuit ; +IN: regexp2.classes + +GENERIC: class-member? ( obj class -- ? ) + +M: word class-member? ( obj class -- ? ) 2drop f ; +M: integer class-member? ( obj class -- ? ) 2drop f ; + +M: character-class-range class-member? ( obj class -- ? ) + [ from>> ] [ to>> ] bi between? ; + +M: any-char class-member? ( obj class -- ? ) + 2drop t ; + +M: letter-class class-member? ( obj class -- ? ) + drop letter? ; + +M: LETTER-class class-member? ( obj class -- ? ) + drop LETTER? ; + +M: Letter-class class-member? ( obj class -- ? ) + drop Letter? ; + +M: ascii-class class-member? ( obj class -- ? ) + drop ascii? ; + +M: digit-class class-member? ( obj class -- ? ) + drop digit? ; + +M: alpha-class class-member? ( obj class -- ? ) + drop alpha? ; + +M: punctuation-class class-member? ( obj class -- ? ) + drop punct? ; + +M: java-printable-class class-member? ( obj class -- ? ) + drop java-printable? ; + +M: non-newline-blank-class class-member? ( obj class -- ? ) + drop { [ blank? ] [ CHAR: \n = not ] } 1&& ; + +M: control-character-class class-member? ( obj class -- ? ) + drop control-char? ; + +M: hex-digit-class class-member? ( obj class -- ? ) + drop hex-digit? ; + +M: java-blank-class class-member? ( obj class -- ? ) + drop java-blank? ; + +M: unmatchable-class class-member? ( obj class -- ? ) + 2drop f ; diff --git a/unfinished/regexp2/dfa/dfa.factor b/unfinished/regexp2/dfa/dfa.factor new file mode 100644 index 0000000000..0dcf6c4ab5 --- /dev/null +++ b/unfinished/regexp2/dfa/dfa.factor @@ -0,0 +1,70 @@ +! Copyright (C) 2008 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors arrays assocs combinators fry kernel locals +math math.order regexp2.nfa regexp2.transition-tables sequences +sets sorting vectors regexp2.utils sequences.lib ; +USING: io prettyprint threads ; +IN: regexp2.dfa + +: find-delta ( states transition regexp -- new-states ) + nfa-table>> transitions>> + rot [ swap at at ] with with map sift concat prune ; + +: (find-epsilon-closure) ( states regexp -- new-states ) + eps swap find-delta ; + +: find-epsilon-closure ( states regexp -- new-states ) + '[ dup , (find-epsilon-closure) union ] [ length ] while-changes + natural-sort ; + +: find-closure ( states transition regexp -- new-states ) + [ find-delta ] 2keep nip find-epsilon-closure ; + +: find-start-state ( regexp -- state ) + [ nfa-table>> start-state>> 1vector ] keep find-epsilon-closure ; + +: find-transitions ( seq1 regexp -- seq2 ) + nfa-table>> transitions>> + [ at keys ] curry map concat eps swap remove ; + +: add-todo-state ( state regexp -- ) + 2dup visited-states>> key? [ + 2drop + ] [ + [ visited-states>> conjoin ] + [ new-states>> push ] 2bi + ] if ; + +: new-transitions ( regexp -- ) + dup new-states>> [ + drop + ] [ + dupd pop dup pick find-transitions rot + [ + [ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep + >r swapd transition boa r> dfa-table>> add-transition + ] curry with each + new-transitions + ] if-empty ; + +: states ( hashtable -- array ) + [ keys ] + [ values [ values concat ] map concat append ] bi ; + +: set-final-states ( regexp -- ) + dup + [ nfa-table>> final-states>> keys ] + [ dfa-table>> transitions>> states ] bi + [ intersect empty? not ] with filter + + swap dfa-table>> final-states>> + [ conjoin ] curry each ; + +: set-initial-state ( regexp -- ) + dup + [ dfa-table>> ] [ find-start-state ] bi + [ >>start-state drop ] keep + 1vector >>new-states drop ; + +: construct-dfa ( regexp -- ) + [ set-initial-state ] [ new-transitions ] [ set-final-states ] tri ; diff --git a/unfinished/regexp2/nfa/nfa.factor b/unfinished/regexp2/nfa/nfa.factor new file mode 100644 index 0000000000..f87a2a7b52 --- /dev/null +++ b/unfinished/regexp2/nfa/nfa.factor @@ -0,0 +1,126 @@ +! Copyright (C) 2008 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors arrays assocs grouping kernel regexp2.backend +locals math namespaces regexp2.parser sequences state-tables fry +quotations math.order math.ranges vectors unicode.categories +regexp2.utils regexp2.transition-tables words sequences.lib ; +IN: regexp2.nfa + +SYMBOL: negation-mode +: negated? ( -- ? ) negation-mode get 0 or odd? ; + +SINGLETON: eps + +: next-state ( regexp -- state ) + [ state>> ] [ [ 1+ ] change-state drop ] bi ; + +: set-start-state ( regexp -- ) + dup stack>> [ + drop + ] [ + [ nfa-table>> ] [ pop first ] bi* >>start-state drop + ] if-empty ; + +GENERIC: nfa-node ( node -- ) + +:: add-simple-entry ( obj class -- ) + [let* | regexp [ current-regexp get ] + s0 [ regexp next-state ] + s1 [ regexp next-state ] + stack [ regexp stack>> ] + table [ regexp nfa-table>> ] | + negated? [ + s0 f obj class boa table add-transition + s0 s1 table add-transition + ] [ + s0 s1 obj class boa table add-transition + ] if + s0 s1 2array stack push + t s1 table final-states>> set-at ] ; + +:: concatenate-nodes ( -- ) + [let* | regexp [ current-regexp get ] + stack [ regexp stack>> ] + table [ regexp nfa-table>> ] + s2 [ stack peek first ] + s3 [ stack pop second ] + s0 [ stack peek first ] + s1 [ stack pop second ] | + s1 s2 eps table add-transition + s1 table final-states>> delete-at + s0 s3 2array stack push ] ; + +:: alternate-nodes ( -- ) + [let* | regexp [ current-regexp get ] + stack [ regexp stack>> ] + table [ regexp nfa-table>> ] + s2 [ stack peek first ] + s3 [ stack pop second ] + s0 [ stack peek first ] + s1 [ stack pop second ] + s4 [ regexp next-state ] + s5 [ regexp next-state ] | + s4 s0 eps table add-transition + s4 s2 eps table add-transition + s1 s5 eps table add-transition + s3 s5 eps table add-transition + s1 table final-states>> delete-at + s3 table final-states>> delete-at + t s5 table final-states>> set-at + s4 s5 2array stack push ] ; + +M: kleene-star nfa-node ( node -- ) + term>> nfa-node + [let* | regexp [ current-regexp get ] + stack [ regexp stack>> ] + s0 [ stack peek first ] + s1 [ stack pop second ] + s2 [ regexp next-state ] + s3 [ regexp next-state ] + table [ regexp nfa-table>> ] | + s1 table final-states>> delete-at + t s3 table final-states>> set-at + s1 s0 eps table add-transition + s2 s0 eps table add-transition + s2 s3 eps table add-transition + s1 s3 eps table add-transition + s2 s3 2array stack push ] ; + +M: concatenation nfa-node ( node -- ) + seq>> + [ [ nfa-node ] each ] + [ length 1- [ concatenate-nodes ] times ] bi ; + +M: alternation nfa-node ( node -- ) + seq>> + [ [ nfa-node ] each ] + [ length 1- [ alternate-nodes ] times ] bi ; + +M: constant nfa-node ( node -- ) + char>> literal-transition add-simple-entry ; + +M: epsilon nfa-node ( node -- ) + drop eps literal-transition add-simple-entry ; + +M: word nfa-node ( node -- ) + class-transition add-simple-entry ; + +M: character-class-range nfa-node ( node -- ) + class-transition add-simple-entry ; + +M: capture-group nfa-node ( node -- ) + term>> nfa-node ; + +M: negation nfa-node ( node -- ) + negation-mode inc + term>> nfa-node + negation-mode dec ; + +: construct-nfa ( regexp -- ) + [ + reset-regexp + negation-mode off + [ current-regexp set ] + [ parse-tree>> nfa-node ] + [ set-start-state ] tri + ] with-scope ; diff --git a/unfinished/regexp2/parser/parser-tests.factor b/unfinished/regexp2/parser/parser-tests.factor new file mode 100644 index 0000000000..9dc7dc7909 --- /dev/null +++ b/unfinished/regexp2/parser/parser-tests.factor @@ -0,0 +1,33 @@ +USING: kernel tools.test regexp2.backend regexp2 ; +IN: regexp2.parser + +: test-regexp ( string -- ) + default-regexp parse-regexp ; + +: test-regexp2 ( string -- regexp ) + default-regexp dup parse-regexp ; + +[ "(" ] [ unmatched-parentheses? ] must-fail-with + +[ ] [ "a|b" test-regexp ] unit-test +[ ] [ "a.b" test-regexp ] unit-test +[ ] [ "a|b|c" test-regexp ] unit-test +[ ] [ "abc|b" test-regexp ] unit-test +[ ] [ "a|bcd" test-regexp ] unit-test +[ ] [ "a|(b)" test-regexp ] unit-test +[ ] [ "(a)|b" test-regexp ] unit-test +[ ] [ "(a|b)" test-regexp ] unit-test +[ ] [ "((a)|(b))" test-regexp ] unit-test + +[ ] [ "(?:a)" test-regexp ] unit-test +[ ] [ "(?i:a)" test-regexp ] unit-test +[ ] [ "(?-i:a)" test-regexp ] unit-test +[ "(?z:a)" test-regexp ] [ bad-option? ] must-fail-with +[ "(?-z:a)" test-regexp ] [ bad-option? ] must-fail-with + +[ ] [ "(?=a)" test-regexp ] unit-test + +[ ] [ "[abc]" test-regexp ] unit-test +[ ] [ "[a-c]" test-regexp ] unit-test +[ ] [ "[^a-c]" test-regexp ] unit-test +[ "[^]" test-regexp ] must-fail diff --git a/unfinished/regexp2/parser/parser.factor b/unfinished/regexp2/parser/parser.factor new file mode 100644 index 0000000000..6eda3310d0 --- /dev/null +++ b/unfinished/regexp2/parser/parser.factor @@ -0,0 +1,391 @@ +! Copyright (C) 2008 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors arrays assocs combinators io io.streams.string +kernel math math.parser multi-methods namespaces qualified sets +quotations sequences sequences.lib splitting symbols vectors +dlists math.order combinators.lib unicode.categories strings +sequences.lib regexp2.backend regexp2.utils unicode.case ; +IN: regexp2.parser + +FROM: math.ranges => [a,b] ; + +MIXIN: node +TUPLE: concatenation seq ; INSTANCE: concatenation node +TUPLE: alternation seq ; INSTANCE: alternation node +TUPLE: kleene-star term ; INSTANCE: kleene-star node +TUPLE: question term ; INSTANCE: question node +TUPLE: negation term ; INSTANCE: negation node +TUPLE: constant char ; INSTANCE: constant node +TUPLE: range from to ; INSTANCE: range node +TUPLE: lookahead term ; INSTANCE: lookahead node +TUPLE: lookbehind term ; INSTANCE: lookbehind node +TUPLE: capture-group term ; INSTANCE: capture-group node +TUPLE: non-capture-group term ; INSTANCE: non-capture-group node +TUPLE: independent-group term ; INSTANCE: independent-group node +TUPLE: character-class-range from to ; INSTANCE: character-class-range node +SINGLETON: epsilon INSTANCE: epsilon node +SINGLETON: any-char INSTANCE: any-char node +SINGLETON: front-anchor INSTANCE: front-anchor node +SINGLETON: back-anchor INSTANCE: back-anchor node + +TUPLE: option-on option ; INSTANCE: option-on node +TUPLE: option-off option ; INSTANCE: option-off node +SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case reversed-regexp ; + +SINGLETONS: letter-class LETTER-class Letter-class digit-class +alpha-class non-newline-blank-class +ascii-class punctuation-class java-printable-class blank-class +control-character-class hex-digit-class java-blank-class c-identifier-class +unmatchable-class ; + +SINGLETONS: beginning-of-group end-of-group +beginning-of-character-class end-of-character-class +left-parenthesis pipe caret dash ; + +: get-option ( option -- ? ) current-regexp get options>> at ; +: get-unix-lines ( -- ? ) unix-lines get-option ; +: get-dotall ( -- ? ) dotall get-option ; +: get-multiline ( -- ? ) multiline get-option ; +: get-comments ( -- ? ) comments get-option ; +: get-case-insensitive ( -- ? ) case-insensitive get-option ; +: get-unicode-case ( -- ? ) unicode-case get-option ; +: get-reversed-regexp ( -- ? ) reversed-regexp get-option ; + +: ( obj -- negation ) negation boa ; +: ( seq -- concatenation ) + >vector get-reversed-regexp [ reverse ] when + concatenation boa ; +: ( seq -- alternation ) >vector alternation boa ; +: ( obj -- capture-group ) capture-group boa ; +: ( obj -- kleene-star ) kleene-star boa ; +: ( obj -- constant ) + dup Letter? get-case-insensitive and [ + [ ch>lower constant boa ] + [ ch>upper constant boa ] bi 2array + ] [ + constant boa + ] if ; + +: first|concatenation ( seq -- first/concatenation ) + dup length 1 = [ first ] [ ] if ; + +: first|alternation ( seq -- first/alternation ) + dup length 1 = [ first ] [ ] if ; + +: ( from to -- obj ) + 2dup [ Letter? ] bi@ or get-case-insensitive and [ + [ [ ch>lower ] bi@ character-class-range boa ] + [ [ ch>upper ] bi@ character-class-range boa ] 2bi + 2array [ [ from>> ] [ to>> ] bi < ] filter + [ unmatchable-class ] [ first|alternation ] if-empty + ] [ + 2dup < + [ character-class-range boa ] [ 2drop unmatchable-class ] if + ] if ; + +ERROR: unmatched-parentheses ; + +: make-positive-lookahead ( string -- ) + lookahead boa push-stack ; + +: make-negative-lookahead ( string -- ) + lookahead boa push-stack ; + +: make-independent-group ( string -- ) + #! no backtracking + independent-group boa push-stack ; + +: make-positive-lookbehind ( string -- ) + lookbehind boa push-stack ; + +: make-negative-lookbehind ( string -- ) + lookbehind boa push-stack ; + +DEFER: nested-parse-regexp +: make-non-capturing-group ( string -- ) + non-capture-group boa push-stack ; + +ERROR: bad-option ch ; + +: option ( ch -- singleton ) + { + { CHAR: i [ case-insensitive ] } + { CHAR: d [ unix-lines ] } + { CHAR: m [ multiline ] } + { CHAR: r [ reversed-regexp ] } + { CHAR: s [ dotall ] } + { CHAR: u [ unicode-case ] } + { CHAR: x [ comments ] } + [ bad-option ] + } case ; + +: option-on ( option -- ) current-regexp get options>> conjoin ; +: option-off ( option -- ) current-regexp get options>> delete-at ; + +: toggle-option ( ch ? -- ) [ option ] dip [ option-on ] [ option-off ] if ; +: (parse-options) ( string ? -- ) [ toggle-option ] curry each ; + +: parse-options ( string -- ) + "-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ; + +DEFER: (parse-regexp) +: parse-special-group ( -- ) + beginning-of-group push-stack + (parse-regexp) pop-stack make-non-capturing-group ; + +ERROR: bad-special-group string ; + +: (parse-special-group) ( -- ) + read1 { + { [ dup CHAR: : = ] + [ drop nested-parse-regexp pop-stack make-non-capturing-group ] } + { [ dup CHAR: = = ] + [ drop nested-parse-regexp pop-stack make-positive-lookahead ] } + { [ dup CHAR: = = ] + [ drop nested-parse-regexp pop-stack make-negative-lookahead ] } + { [ dup CHAR: > = ] + [ drop nested-parse-regexp pop-stack make-independent-group ] } + { [ dup CHAR: < = peek1 CHAR: = = and ] + [ drop read1 drop nested-parse-regexp pop-stack make-positive-lookbehind ] } + { [ dup CHAR: < = peek1 CHAR: ! = and ] + [ drop read1 drop nested-parse-regexp pop-stack make-negative-lookbehind ] } + [ + ":)" read-until + [ swap prefix ] dip + { + { CHAR: : [ parse-options parse-special-group ] } + { CHAR: ) [ parse-options ] } + [ drop bad-special-group ] + } case + ] + } cond ; + +: handle-left-parenthesis ( -- ) + peek1 CHAR: ? = + [ read1 drop (parse-special-group) ] + [ nested-parse-regexp ] if ; + +: handle-dot ( -- ) any-char push-stack ; +: handle-pipe ( -- ) pipe push-stack ; +: handle-star ( -- ) stack pop push-stack ; +: handle-question ( -- ) + stack pop epsilon 2array push-stack ; +: handle-plus ( -- ) + stack pop dup 2array push-stack ; + +ERROR: unmatched-brace ; +: parse-repetition ( -- start finish ? ) + "}" read-until [ unmatched-brace ] unless + [ "," split1 [ string>number ] bi@ ] + [ CHAR: , swap index >boolean ] bi ; + +: replicate/concatenate ( n obj -- obj' ) + over zero? [ 2drop epsilon ] + [ first|concatenation ] if ; + +: exactly-n ( n -- ) + stack pop replicate/concatenate push-stack ; + +: at-least-n ( n -- ) + stack pop + [ replicate/concatenate ] keep + 2array push-stack ; + +: at-most-n ( n -- ) + 1+ + stack pop + [ replicate/concatenate ] curry map push-stack ; + +: from-m-to-n ( m n -- ) + [a,b] + stack pop + [ replicate/concatenate ] curry map + push-stack ; + +ERROR: invalid-range a b ; + +: handle-left-brace ( -- ) + parse-repetition + >r 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ r> + [ + 2dup and [ from-m-to-n ] + [ [ nip at-most-n ] [ at-least-n ] if* ] if + ] [ drop 0 max exactly-n ] if ; + +: handle-front-anchor ( -- ) front-anchor push-stack ; +: handle-back-anchor ( -- ) back-anchor push-stack ; + +ERROR: bad-character-class obj ; +ERROR: expected-posix-class ; + +: parse-posix-class ( -- obj ) + read1 CHAR: { = [ expected-posix-class ] unless + "}" read-until [ bad-character-class ] unless + { + { "Lower" [ get-case-insensitive Letter-class letter-class ? ] } + { "Upper" [ get-case-insensitive Letter-class LETTER-class ? ] } + { "Alpha" [ Letter-class ] } + { "ASCII" [ ascii-class ] } + { "Digit" [ digit-class ] } + { "Alnum" [ alpha-class ] } + { "Punct" [ punctuation-class ] } + { "Graph" [ java-printable-class ] } + { "Print" [ java-printable-class ] } + { "Blank" [ non-newline-blank-class ] } + { "Cntrl" [ control-character-class ] } + { "XDigit" [ hex-digit-class ] } + { "Space" [ java-blank-class ] } + ! TODO: unicode-character-class, fallthrough in unicode is bad-char-clss + [ bad-character-class ] + } case ; + +: parse-octal ( -- n ) 3 read oct> check-octal ; +: parse-short-hex ( -- n ) 2 read hex> check-hex ; +: parse-long-hex ( -- n ) 6 read hex> check-hex ; +: parse-control-character ( -- n ) read1 ; + +ERROR: bad-escaped-literals seq ; +: parse-escaped-literals ( -- obj ) + "\\E" read-until [ bad-escaped-literals ] unless + read1 drop + [ epsilon ] [ + [ ] V{ } map-as + first|concatenation + ] if-empty ; + +: parse-escaped ( -- obj ) + read1 + { + { CHAR: \ [ CHAR: \ ] } + { CHAR: . [ CHAR: . ] } + { CHAR: t [ CHAR: \t ] } + { CHAR: n [ CHAR: \n ] } + { CHAR: r [ CHAR: \r ] } + { CHAR: f [ HEX: c ] } + { CHAR: a [ HEX: 7 ] } + { CHAR: e [ HEX: 1b ] } + + { CHAR: d [ digit-class ] } + { CHAR: D [ digit-class ] } + { CHAR: s [ java-blank-class ] } + { CHAR: S [ java-blank-class ] } + { CHAR: w [ c-identifier-class ] } + { CHAR: W [ c-identifier-class ] } + + { CHAR: p [ parse-posix-class ] } + { CHAR: P [ parse-posix-class ] } + { CHAR: x [ parse-short-hex ] } + { CHAR: u [ parse-long-hex ] } + { CHAR: 0 [ parse-octal ] } + { CHAR: c [ parse-control-character ] } + + ! { CHAR: b [ handle-word-boundary ] } + ! { CHAR: B [ handle-word-boundary ] } + ! { CHAR: A [ handle-beginning-of-input ] } + ! { CHAR: G [ end of previous match ] } + ! { CHAR: Z [ handle-end-of-input ] } + ! { CHAR: z [ handle-end-of-input ] } ! except for terminator + + { CHAR: Q [ parse-escaped-literals ] } + } case ; + +: handle-escape ( -- ) parse-escaped push-stack ; + +: handle-dash ( vector -- vector' ) + H{ { dash CHAR: - } } substitute ; + +: character-class>alternation ( seq -- alternation ) + [ dup number? [ ] when ] map first|alternation ; + +: handle-caret ( vector -- vector' ) + dup [ length 2 >= ] [ first caret eq? ] bi and [ + rest-slice character-class>alternation + ] [ + character-class>alternation + ] if ; + +: make-character-class ( -- character-class ) + [ beginning-of-character-class swap cut-stack ] change-whole-stack + handle-dash handle-caret ; + +: apply-dash ( -- ) + stack [ pop3 nip ] keep push ; + +: apply-dash? ( -- ? ) + stack dup length 3 >= + [ [ length 2 - ] keep nth dash eq? ] [ drop f ] if ; + +ERROR: empty-negated-character-class ; +DEFER: handle-left-bracket +: (parse-character-class) ( -- ) + read1 [ empty-negated-character-class ] unless* { + { CHAR: [ [ handle-left-bracket t ] } + { CHAR: ] [ make-character-class push-stack f ] } + { CHAR: - [ dash push-stack t ] } + { CHAR: \ [ parse-escaped push-stack t ] } + [ push-stack apply-dash? [ apply-dash ] when t ] + } case + [ (parse-character-class) ] when ; + +: parse-character-class-second ( -- ) + read1 { + { CHAR: [ [ CHAR: [ push-stack ] } + { CHAR: ] [ CHAR: ] push-stack ] } + { CHAR: - [ CHAR: - push-stack ] } + [ push1 ] + } case ; + +: parse-character-class-first ( -- ) + read1 { + { CHAR: ^ [ caret push-stack parse-character-class-second ] } + { CHAR: [ [ CHAR: [ push-stack ] } + { CHAR: ] [ CHAR: ] push-stack ] } + { CHAR: - [ CHAR: - push-stack ] } + [ push1 ] + } case ; + +: handle-left-bracket ( -- ) + beginning-of-character-class push-stack + parse-character-class-first (parse-character-class) ; + +: finish-regexp-parse ( stack -- obj ) + { pipe } split + [ first|concatenation ] map first|alternation ; + +: handle-right-parenthesis ( -- ) + stack beginning-of-group over last-index cut rest + [ current-regexp get swap >>stack drop ] + [ finish-regexp-parse push-stack ] bi* ; + +: nested-parse-regexp ( -- ) + beginning-of-group push-stack (parse-regexp) ; + +: ((parse-regexp)) ( token -- ) + { + { CHAR: . [ handle-dot ] } + { CHAR: ( [ handle-left-parenthesis ] } + { CHAR: ) [ handle-right-parenthesis ] } + { CHAR: | [ handle-pipe ] } + { CHAR: ? [ handle-question ] } + { CHAR: * [ handle-star ] } + { CHAR: + [ handle-plus ] } + { CHAR: { [ handle-left-brace ] } + { CHAR: [ [ handle-left-bracket ] } + { CHAR: ^ [ handle-front-anchor ] } + { CHAR: $ [ handle-back-anchor ] } + { CHAR: \ [ handle-escape ] } + [ push-stack ] + } case ; + +: (parse-regexp) ( -- ) + read1 [ ((parse-regexp)) (parse-regexp) ] when* ; + +: parse-regexp ( regexp -- ) + dup current-regexp [ + raw>> [ + [ (parse-regexp) ] with-input-stream + ] unless-empty + current-regexp get + stack finish-regexp-parse + >>parse-tree drop + ] with-variable ; diff --git a/unfinished/regexp2/regexp2-docs.factor b/unfinished/regexp2/regexp2-docs.factor new file mode 100644 index 0000000000..f903c14bc4 --- /dev/null +++ b/unfinished/regexp2/regexp2-docs.factor @@ -0,0 +1,14 @@ +! Copyright (C) 2008 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: kernel strings help.markup help.syntax regexp2.backend ; +IN: regexp2 + +HELP: +{ $values { "string" string } { "regexp" regexp } } +{ $description "Compiles a regular expression into a DFA and returns this object. Regular expressions only have to be compiled once and can then be used multiple times to match input strings." } ; + +HELP: +{ $values { "string" string } { "regexp" regexp } } +{ $description "Compiles a case-insensitive regular expression into a DFA and returns this object. Otherwise, exactly the same as " { $link } } ; + +{ } related-words diff --git a/unfinished/regexp2/regexp2-tests.factor b/unfinished/regexp2/regexp2-tests.factor new file mode 100644 index 0000000000..54626ea165 --- /dev/null +++ b/unfinished/regexp2/regexp2-tests.factor @@ -0,0 +1,263 @@ +USING: regexp2 tools.test kernel sequences regexp2.parser +regexp2.traversal ; +IN: regexp2-tests + +[ f ] [ "b" "a*" matches? ] unit-test +[ t ] [ "" "a*" matches? ] unit-test +[ t ] [ "a" "a*" matches? ] unit-test +[ t ] [ "aaaaaaa" "a*" matches? ] unit-test +[ f ] [ "ab" "a*" matches? ] unit-test + +[ t ] [ "abc" "abc" matches? ] unit-test +[ t ] [ "a" "a|b|c" matches? ] unit-test +[ t ] [ "b" "a|b|c" matches? ] unit-test +[ t ] [ "c" "a|b|c" matches? ] unit-test +[ f ] [ "c" "d|e|f" matches? ] unit-test + +[ f ] [ "aa" "a|b|c" matches? ] unit-test +[ f ] [ "bb" "a|b|c" matches? ] unit-test +[ f ] [ "cc" "a|b|c" matches? ] unit-test +[ f ] [ "cc" "d|e|f" matches? ] unit-test + +[ f ] [ "" "a+" matches? ] unit-test +[ t ] [ "a" "a+" matches? ] unit-test +[ t ] [ "aa" "a+" matches? ] unit-test + +[ t ] [ "" "a?" matches? ] unit-test +[ t ] [ "a" "a?" matches? ] unit-test +[ f ] [ "aa" "a?" matches? ] unit-test + +[ f ] [ "" "." matches? ] unit-test +[ t ] [ "a" "." matches? ] unit-test +[ t ] [ "." "." matches? ] unit-test +! [ f ] [ "\n" "." matches? ] unit-test + +[ f ] [ "" ".+" matches? ] unit-test +[ t ] [ "a" ".+" matches? ] unit-test +[ t ] [ "ab" ".+" matches? ] unit-test + + +[ t ] [ "" "a|b*|c+|d?" matches? ] unit-test +[ t ] [ "a" "a|b*|c+|d?" matches? ] unit-test +[ t ] [ "c" "a|b*|c+|d?" matches? ] unit-test +[ t ] [ "cc" "a|b*|c+|d?" matches? ] unit-test +[ f ] [ "ccd" "a|b*|c+|d?" matches? ] unit-test +[ t ] [ "d" "a|b*|c+|d?" matches? ] unit-test + +[ t ] [ "foo" "foo|bar" matches? ] unit-test +[ t ] [ "bar" "foo|bar" matches? ] unit-test +[ f ] [ "foobar" "foo|bar" matches? ] unit-test + +[ f ] [ "" "(a)" matches? ] unit-test +[ t ] [ "a" "(a)" matches? ] unit-test +[ f ] [ "aa" "(a)" matches? ] unit-test +[ t ] [ "aa" "(a*)" matches? ] unit-test + +[ f ] [ "aababaaabbac" "(a|b)+" matches? ] unit-test +[ t ] [ "ababaaabba" "(a|b)+" matches? ] unit-test + +[ f ] [ "" "a{1}" matches? ] unit-test +[ t ] [ "a" "a{1}" matches? ] unit-test +[ f ] [ "aa" "a{1}" matches? ] unit-test + +[ f ] [ "a" "a{2,}" matches? ] unit-test +[ t ] [ "aaa" "a{2,}" matches? ] unit-test +[ t ] [ "aaaa" "a{2,}" matches? ] unit-test +[ t ] [ "aaaaa" "a{2,}" matches? ] unit-test + +[ t ] [ "" "a{,2}" matches? ] unit-test +[ t ] [ "a" "a{,2}" matches? ] unit-test +[ t ] [ "aa" "a{,2}" matches? ] unit-test +[ f ] [ "aaa" "a{,2}" matches? ] unit-test +[ f ] [ "aaaa" "a{,2}" matches? ] unit-test +[ f ] [ "aaaaa" "a{,2}" matches? ] unit-test + +[ f ] [ "" "a{1,3}" matches? ] unit-test +[ t ] [ "a" "a{1,3}" matches? ] unit-test +[ t ] [ "aa" "a{1,3}" matches? ] unit-test +[ t ] [ "aaa" "a{1,3}" matches? ] unit-test +[ f ] [ "aaaa" "a{1,3}" matches? ] unit-test + +[ f ] [ "" "[a]" matches? ] unit-test +[ t ] [ "a" "[a]" matches? ] unit-test +[ t ] [ "a" "[abc]" matches? ] unit-test +[ f ] [ "b" "[a]" matches? ] unit-test +[ f ] [ "d" "[abc]" matches? ] unit-test +[ t ] [ "ab" "[abc]{1,2}" matches? ] unit-test +[ f ] [ "abc" "[abc]{1,2}" matches? ] unit-test + +[ f ] [ "" "[^a]" matches? ] unit-test +[ f ] [ "a" "[^a]" matches? ] unit-test +[ f ] [ "a" "[^abc]" matches? ] unit-test +[ t ] [ "b" "[^a]" matches? ] unit-test +[ t ] [ "d" "[^abc]" matches? ] unit-test +[ f ] [ "ab" "[^abc]{1,2}" matches? ] unit-test +[ f ] [ "abc" "[^abc]{1,2}" matches? ] unit-test + +[ t ] [ "]" "[]]" matches? ] unit-test +[ f ] [ "]" "[^]]" matches? ] unit-test +[ t ] [ "a" "[^]]" matches? ] unit-test + +[ "^" "[^]" matches? ] must-fail +[ t ] [ "^" "[]^]" matches? ] unit-test +[ t ] [ "]" "[]^]" matches? ] unit-test + +[ t ] [ "[" "[[]" matches? ] unit-test +[ f ] [ "^" "[^^]" matches? ] unit-test +[ t ] [ "a" "[^^]" matches? ] unit-test + +[ t ] [ "-" "[-]" matches? ] unit-test +[ f ] [ "a" "[-]" matches? ] unit-test +[ f ] [ "-" "[^-]" matches? ] unit-test +[ t ] [ "a" "[^-]" matches? ] unit-test + +[ t ] [ "-" "[-a]" matches? ] unit-test +[ t ] [ "a" "[-a]" matches? ] unit-test +[ t ] [ "-" "[a-]" matches? ] unit-test +[ t ] [ "a" "[a-]" matches? ] unit-test +[ f ] [ "b" "[a-]" matches? ] unit-test +[ f ] [ "-" "[^-]" matches? ] unit-test +[ t ] [ "a" "[^-]" matches? ] unit-test + +[ f ] [ "-" "[a-c]" matches? ] unit-test +[ t ] [ "-" "[^a-c]" matches? ] unit-test +[ t ] [ "b" "[a-c]" matches? ] unit-test +[ f ] [ "b" "[^a-c]" matches? ] unit-test + +[ t ] [ "-" "[a-c-]" matches? ] unit-test +[ f ] [ "-" "[^a-c-]" matches? ] unit-test + +[ t ] [ "\\" "[\\\\]" matches? ] unit-test +[ f ] [ "a" "[\\\\]" matches? ] unit-test +[ f ] [ "\\" "[^\\\\]" matches? ] unit-test +[ t ] [ "a" "[^\\\\]" matches? ] unit-test + +[ t ] [ "0" "[\\d]" matches? ] unit-test +[ f ] [ "a" "[\\d]" matches? ] unit-test +[ f ] [ "0" "[^\\d]" matches? ] unit-test +[ t ] [ "a" "[^\\d]" matches? ] unit-test + +[ t ] [ "a" "[a-z]{1,}|[A-Z]{2,4}|b*|c|(f|g)*" matches? ] unit-test +[ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}|b*|c|(f|g)*" matches? ] unit-test +[ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}" matches? ] unit-test + +[ t ] [ "1000" "\\d{4,6}" matches? ] unit-test +[ t ] [ "1000" "[0-9]{4,6}" matches? ] unit-test + +[ t ] [ "abc" "\\p{Lower}{3}" matches? ] unit-test +[ f ] [ "ABC" "\\p{Lower}{3}" matches? ] unit-test +[ t ] [ "ABC" "\\p{Upper}{3}" matches? ] unit-test +[ f ] [ "abc" "\\p{Upper}{3}" matches? ] unit-test +! +[ f ] [ "abc" "[\\p{Upper}]{3}" matches? ] unit-test +[ t ] [ "ABC" "[\\p{Upper}]{3}" matches? ] unit-test + +[ f ] [ "" "\\Q\\E" matches? ] unit-test +[ f ] [ "a" "\\Q\\E" matches? ] unit-test +[ t ] [ "|*+" "\\Q|*+\\E" matches? ] unit-test +[ f ] [ "abc" "\\Q|*+\\E" matches? ] unit-test +[ t ] [ "s" "\\Qs\\E" matches? ] unit-test + +[ t ] [ "S" "\\0123" matches? ] unit-test +[ t ] [ "SXY" "\\0123XY" matches? ] unit-test +[ t ] [ "x" "\\x78" matches? ] unit-test +[ f ] [ "y" "\\x78" matches? ] unit-test +[ t ] [ "x" "\\u000078" matches? ] unit-test +[ f ] [ "y" "\\u000078" matches? ] unit-test + +[ t ] [ "ab" "a+b" matches? ] unit-test +[ f ] [ "b" "a+b" matches? ] unit-test +[ t ] [ "aab" "a+b" matches? ] unit-test +[ f ] [ "abb" "a+b" matches? ] unit-test + +[ t ] [ "abbbb" "ab*" matches? ] unit-test +[ t ] [ "a" "ab*" matches? ] unit-test +[ f ] [ "abab" "ab*" matches? ] unit-test + +[ f ] [ "x" "\\." matches? ] unit-test +[ t ] [ "." "\\." matches? ] unit-test + +[ t ] [ "aaaab" "a+ab" matches? ] unit-test +[ f ] [ "aaaxb" "a+ab" matches? ] unit-test +[ t ] [ "aaacb" "a+cb" matches? ] unit-test +[ f ] [ "aaaab" "a++ab" matches? ] unit-test +[ t ] [ "aaacb" "a++cb" matches? ] unit-test + +[ 3 ] [ "aaacb" "a*" match-head ] unit-test +[ 1 ] [ "aaacb" "a+?" match-head ] unit-test +[ 2 ] [ "aaacb" "aa?" match-head ] unit-test +[ 1 ] [ "aaacb" "aa??" match-head ] unit-test +[ 3 ] [ "aacb" "aa?c" match-head ] unit-test +[ 3 ] [ "aacb" "aa??c" match-head ] unit-test + +! [ t ] [ "aaa" "AAA" t matches? ] unit-test +! [ f ] [ "aax" "AAA" t matches? ] unit-test +! [ t ] [ "aaa" "A*" t matches? ] unit-test +! [ f ] [ "aaba" "A*" t matches? ] unit-test +! [ t ] [ "b" "[AB]" t matches? ] unit-test +! [ f ] [ "c" "[AB]" t matches? ] unit-test +! [ t ] [ "c" "[A-Z]" t matches? ] unit-test +! [ f ] [ "3" "[A-Z]" t matches? ] unit-test + +[ ] [ + "(0[lL]?|[1-9]\\d{0,9}(\\d{0,9}[lL])?|0[xX]\\p{XDigit}{1,8}(\\p{XDigit}{0,8}[lL])?|0[0-7]{1,11}([0-7]{0,11}[lL])?|([0-9]+\\.[0-9]*|\\.[0-9]+)([eE][+-]?[0-9]+)?[fFdD]?|[0-9]+([eE][+-]?[0-9]+[fFdD]?|([eE][+-]?[0-9]+)?[fFdD]))" + drop +] unit-test + +[ "{Lower}" ] [ invalid-range? ] must-fail-with + +[ t ] [ "fxxbar" "(?!foo).{3}bar" matches? ] unit-test +[ f ] [ "foobar" "(?!foo).{3}bar" matches? ] unit-test + +! [ 3 ] [ "foobar" "foo(?=bar)" match-head ] unit-test +! [ f ] [ "foobxr" "foo(?=bar)" match-head ] unit-test + +! [ f ] [ "foobxr" "foo\\z" match-head ] unit-test +! [ 3 ] [ "foo" "foo\\z" match-head ] unit-test + +! [ 3 ] [ "foo bar" "foo\\b" match-head ] unit-test +! [ f ] [ "fooxbar" "foo\\b" matches? ] unit-test +! [ t ] [ "foo" "foo\\b" matches? ] unit-test +! [ t ] [ "foo bar" "foo\\b bar" matches? ] unit-test +! [ f ] [ "fooxbar" "foo\\bxbar" matches? ] unit-test +! [ f ] [ "foo" "foo\\bbar" matches? ] unit-test + +! [ f ] [ "foo bar" "foo\\B" matches? ] unit-test +! [ 3 ] [ "fooxbar" "foo\\B" match-head ] unit-test +! [ t ] [ "foo" "foo\\B" matches? ] unit-test +! [ f ] [ "foo bar" "foo\\B bar" matches? ] unit-test +! [ t ] [ "fooxbar" "foo\\Bxbar" matches? ] unit-test +! [ f ] [ "foo" "foo\\Bbar" matches? ] unit-test + +[ t ] [ "s@f" "[a-z.-]@[a-z]" matches? ] unit-test +[ f ] [ "a" "[a-z.-]@[a-z]" matches? ] unit-test +[ t ] [ ".o" "\\.[a-z]" matches? ] unit-test + +[ t ] [ "a" "(?i)a" matches? ] unit-test +[ t ] [ "a" "(?i)a" matches? ] unit-test +[ t ] [ "A" "(?i)a" matches? ] unit-test +[ t ] [ "A" "(?i)a" matches? ] unit-test + +[ t ] [ "a" "(?-i)a" matches? ] unit-test +[ t ] [ "a" "(?-i)a" matches? ] unit-test +[ f ] [ "A" "(?-i)a" matches? ] unit-test +[ f ] [ "A" "(?-i)a" matches? ] unit-test + +[ f ] [ "A" "[a-z]" matches? ] unit-test +[ t ] [ "A" "[a-z]" matches? ] unit-test + +[ f ] [ "A" "\\p{Lower}" matches? ] unit-test +[ t ] [ "A" "\\p{Lower}" matches? ] unit-test + +[ t ] [ "abc" "abc" matches? ] unit-test +[ t ] [ "abc" "a[bB][cC]" matches? ] unit-test +[ t ] [ "adcbe" "a(?r)(bcd)(?-r)e" matches? ] unit-test + +! Bug in parsing word +! [ t ] [ "a" R' a' matches? ] unit-test + +! ((A)(B(C))) +! 1. ((A)(B(C))) +! 2. (A) +! 3. (B(C)) +! 4. (C) diff --git a/unfinished/regexp2/regexp2.factor b/unfinished/regexp2/regexp2.factor new file mode 100644 index 0000000000..0b8994ca2b --- /dev/null +++ b/unfinished/regexp2/regexp2.factor @@ -0,0 +1,59 @@ +! Copyright (C) 2008 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors combinators kernel math math.ranges +sequences regexp2.backend regexp2.utils memoize sets +regexp2.parser regexp2.nfa regexp2.dfa regexp2.traversal +regexp2.transition-tables ; +IN: regexp2 + +: default-regexp ( string -- regexp ) + regexp new + swap >>raw + >>nfa-table + >>dfa-table + >>minimized-table + reset-regexp ; + +: construct-regexp ( regexp -- regexp' ) + { + [ parse-regexp ] + [ construct-nfa ] + [ construct-dfa ] + [ ] + } cleave ; + +: match ( string regexp -- pair ) + do-match return-match ; + +: matches? ( string regexp -- ? ) + dupd match [ [ length ] [ range-length 1- ] bi* = ] [ drop f ] if* ; + +: match-head ( string regexp -- end ) match length>> 1- ; + +: initial-option ( regexp option -- regexp' ) + over options>> conjoin ; + +: ( string -- regexp ) + default-regexp construct-regexp ; + +: ( string -- regexp ) + default-regexp + case-insensitive initial-option + construct-regexp ; + +: ( string -- regexp ) + default-regexp + reversed-regexp initial-option + construct-regexp ; + +: R! CHAR: ! ; parsing +: R" CHAR: " ; parsing +: R# CHAR: # ; parsing +: R' CHAR: ' ; parsing +: R( CHAR: ) ; parsing +: R/ CHAR: / ; parsing +: R@ CHAR: @ ; parsing +: R[ CHAR: ] ; parsing +: R` CHAR: ` ; parsing +: R{ CHAR: } ; parsing +: R| CHAR: | ; parsing diff --git a/unfinished/regexp2/summary.txt b/unfinished/regexp2/summary.txt new file mode 100644 index 0000000000..aa1e1c27a9 --- /dev/null +++ b/unfinished/regexp2/summary.txt @@ -0,0 +1 @@ +Regular expressions diff --git a/unfinished/regexp2/tags.txt b/unfinished/regexp2/tags.txt new file mode 100644 index 0000000000..65bc471f6b --- /dev/null +++ b/unfinished/regexp2/tags.txt @@ -0,0 +1,2 @@ +parsing +text diff --git a/unfinished/regexp2/transition-tables/transition-tables.factor b/unfinished/regexp2/transition-tables/transition-tables.factor new file mode 100644 index 0000000000..0547846655 --- /dev/null +++ b/unfinished/regexp2/transition-tables/transition-tables.factor @@ -0,0 +1,44 @@ +! Copyright (C) 2008 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors arrays assocs fry hashtables kernel sequences +vectors ; +IN: regexp2.transition-tables + +: insert-at ( value key hash -- ) + 2dup at* [ + 2nip push + ] [ + drop >r >r dup vector? [ 1vector ] unless r> r> set-at + ] if ; + +: ?insert-at ( value key hash/f -- hash ) + [ H{ } clone ] unless* [ insert-at ] keep ; + +TUPLE: transition from to obj ; +TUPLE: literal-transition < transition ; +TUPLE: class-transition < transition ; +TUPLE: default-transition < transition ; + +TUPLE: literal obj ; +TUPLE: class obj ; +TUPLE: default ; +: ( from to obj -- transition ) literal-transition boa ; +: ( from to obj -- transition ) class-transition boa ; +: ( from to -- transition ) t default-transition boa ; + +TUPLE: transition-table transitions + literals classes defaults + start-state final-states ; + +: ( -- transition-table ) + transition-table new + H{ } clone >>transitions + H{ } clone >>final-states ; + +: set-transition ( transition hash -- ) + >r [ to>> ] [ obj>> ] [ from>> ] tri r> + 2dup at* [ 2nip insert-at ] + [ drop >r >r H{ } clone [ insert-at ] keep r> r> set-at ] if ; + +: add-transition ( transition transition-table -- ) + transitions>> set-transition ; diff --git a/unfinished/regexp2/traversal/traversal.factor b/unfinished/regexp2/traversal/traversal.factor new file mode 100644 index 0000000000..94e96bb935 --- /dev/null +++ b/unfinished/regexp2/traversal/traversal.factor @@ -0,0 +1,80 @@ +! Copyright (C) 2008 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors assocs combinators combinators.lib kernel +math math.ranges quotations sequences regexp2.parser +regexp2.classes combinators.short-circuit assocs.lib +sequences.lib ; +IN: regexp2.traversal + +TUPLE: dfa-traverser + dfa-table + last-state current-state + text + start-index current-index + matches ; + +: ( text regexp -- match ) + dfa-table>> + dfa-traverser new + swap [ start-state>> >>current-state ] keep + >>dfa-table + swap >>text + 0 >>start-index + 0 >>current-index + V{ } clone >>matches ; + +: final-state? ( dfa-traverser -- ? ) + [ current-state>> ] [ dfa-table>> final-states>> ] bi + key? ; + +: text-finished? ( dfa-traverser -- ? ) + [ current-index>> ] [ text>> length ] bi >= ; + +: save-final-state ( dfa-straverser -- ) + [ current-index>> ] [ matches>> ] bi push ; + +: match-done? ( dfa-traverser -- ? ) + dup final-state? [ + dup save-final-state + ] when text-finished? ; + +: increment-state ( dfa-traverser state -- dfa-traverser ) + >r [ 1+ ] change-current-index + dup current-state>> >>last-state r> + first >>current-state ; + +: match-failed ( dfa-traverser -- dfa-traverser ) + V{ } clone >>matches ; + +: match-literal ( transition from-state table -- to-state/f ) + transitions>> [ at ] [ 2drop f ] if-at ; + +: assoc-with ( param assoc quot -- assoc curry ) + swapd [ [ -rot ] dip call ] 2curry ; inline + +: match-class ( transition from-state table -- to-state/f ) + transitions>> at* [ + [ drop class-member? ] assoc-with assoc-find [ nip ] [ drop ] if + ] [ drop ] if ; + +: match-default ( transition from-state table -- to-state/f ) + [ nip ] dip transitions>> + [ t swap [ drop f ] unless-at ] [ drop f ] if-at ; + +: match-transition ( obj from-state dfa -- to-state/f ) + { [ match-literal ] [ match-class ] [ match-default ] } 3|| ; + +: setup-match ( match -- obj state dfa-table ) + { current-index>> text>> current-state>> dfa-table>> } get-slots + [ nth ] 2dip ; + +: do-match ( dfa-traverser -- dfa-traverser ) + dup match-done? [ + dup setup-match match-transition + [ increment-state do-match ] when* + ] unless ; + +: return-match ( dfa-traverser -- interval/f ) + dup matches>> + [ drop f ] + [ [ start-index>> ] [ peek ] bi* 1 ] if-empty ; diff --git a/unfinished/regexp2/utils/utils.factor b/unfinished/regexp2/utils/utils.factor new file mode 100644 index 0000000000..0167e73005 --- /dev/null +++ b/unfinished/regexp2/utils/utils.factor @@ -0,0 +1,69 @@ +! Copyright (C) 2008 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors arrays assocs combinators.lib io kernel +math math.order namespaces regexp2.backend sequences +sequences.lib unicode.categories math.ranges fry +combinators.short-circuit ; +IN: regexp2.utils + +: (while-changes) ( obj quot pred pred-ret -- obj ) + ! quot: ( obj -- obj' ) + ! pred: ( obj -- <=> ) + >r >r dup slip r> pick over call r> dupd = + [ 3drop ] [ (while-changes) ] if ; inline + +: while-changes ( obj quot pred -- obj' ) + pick over call (while-changes) ; inline + +: last-state ( regexp -- range ) stack>> peek first2 [a,b] ; +: push1 ( obj -- ) input-stream get stream>> push ; +: peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ; +: pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ; + +: stack ( -- obj ) current-regexp get stack>> ; +: change-whole-stack ( quot -- ) + current-regexp get + [ stack>> swap call ] keep (>>stack) ; inline +: push-stack ( obj -- ) stack push ; +: pop-stack ( -- obj ) stack pop ; +: cut-out ( vector n -- vector' vector ) cut rest ; +ERROR: cut-stack-error ; +: cut-stack ( obj vector -- vector' vector ) + tuck last-index [ cut-stack-error ] unless* cut-out swap ; + +ERROR: bad-octal number ; +ERROR: bad-hex number ; +: check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ; +: check-hex ( hex -- hex ) dup number? [ bad-hex ] unless ; + +: ascii? ( n -- ? ) 0 HEX: 7f between? ; +: octal-digit? ( n -- ? ) CHAR: 0 CHAR: 7 between? ; +: decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ; + +: hex-digit? ( n -- ? ) + [ + [ decimal-digit? ] + [ CHAR: a CHAR: f between? ] + [ CHAR: A CHAR: F between? ] + ] 1|| ; + +: control-char? ( n -- ? ) + [ + [ 0 HEX: 1f between? ] + [ HEX: 7f = ] + ] 1|| ; + +: punct? ( n -- ? ) + "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ; + +: c-identifier-char? ( ch -- ? ) + [ [ alpha? ] [ CHAR: _ = ] ] 1|| ; + +: java-blank? ( n -- ? ) + { + CHAR: \s CHAR: \t CHAR: \n + HEX: b HEX: 7 CHAR: \r + } member? ; + +: java-printable? ( n -- ? ) + [ [ alpha? ] [ punct? ] ] 1|| ; From c9ea133b16a8f33a28946b2212a0736487ed8a51 Mon Sep 17 00:00:00 2001 From: Slava Pestov Date: Wed, 27 Aug 2008 00:02:53 -0500 Subject: [PATCH 2/2] Remove --- extra/regexp2/authors.txt | 1 - extra/regexp2/backend/backend.factor | 25 -- extra/regexp2/classes/classes.factor | 55 --- extra/regexp2/dfa/dfa.factor | 70 ---- extra/regexp2/nfa/nfa.factor | 126 ------ extra/regexp2/parser/parser-tests.factor | 33 -- extra/regexp2/parser/parser.factor | 391 ------------------ extra/regexp2/regexp2-docs.factor | 14 - extra/regexp2/regexp2-tests.factor | 263 ------------ extra/regexp2/regexp2.factor | 59 --- extra/regexp2/summary.txt | 1 - extra/regexp2/tags.txt | 2 - .../transition-tables.factor | 44 -- extra/regexp2/traversal/traversal.factor | 80 ---- extra/regexp2/utils/utils.factor | 69 ---- 15 files changed, 1233 deletions(-) delete mode 100644 extra/regexp2/authors.txt delete mode 100644 extra/regexp2/backend/backend.factor delete mode 100644 extra/regexp2/classes/classes.factor delete mode 100644 extra/regexp2/dfa/dfa.factor delete mode 100644 extra/regexp2/nfa/nfa.factor delete mode 100644 extra/regexp2/parser/parser-tests.factor delete mode 100644 extra/regexp2/parser/parser.factor delete mode 100644 extra/regexp2/regexp2-docs.factor delete mode 100644 extra/regexp2/regexp2-tests.factor delete mode 100644 extra/regexp2/regexp2.factor delete mode 100644 extra/regexp2/summary.txt delete mode 100644 extra/regexp2/tags.txt delete mode 100644 extra/regexp2/transition-tables/transition-tables.factor delete mode 100644 extra/regexp2/traversal/traversal.factor delete mode 100644 extra/regexp2/utils/utils.factor diff --git a/extra/regexp2/authors.txt b/extra/regexp2/authors.txt deleted file mode 100644 index 7c1b2f2279..0000000000 --- a/extra/regexp2/authors.txt +++ /dev/null @@ -1 +0,0 @@ -Doug Coleman diff --git a/extra/regexp2/backend/backend.factor b/extra/regexp2/backend/backend.factor deleted file mode 100644 index c39d67e7b8..0000000000 --- a/extra/regexp2/backend/backend.factor +++ /dev/null @@ -1,25 +0,0 @@ -! Copyright (C) 2008 Doug Coleman. -! See http://factorcode.org/license.txt for BSD license. -USING: accessors hashtables kernel math state-tables vars vectors ; -IN: regexp2.backend - -TUPLE: regexp - raw - { stack vector } - parse-tree - { options hashtable } - nfa-table - dfa-table - minimized-table - { state integer } - { new-states vector } - { visited-states hashtable } ; - -: reset-regexp ( regexp -- regexp ) - 0 >>state - V{ } clone >>stack - V{ } clone >>new-states - H{ } clone >>options - H{ } clone >>visited-states ; - -SYMBOL: current-regexp diff --git a/extra/regexp2/classes/classes.factor b/extra/regexp2/classes/classes.factor deleted file mode 100644 index 7737e02d40..0000000000 --- a/extra/regexp2/classes/classes.factor +++ /dev/null @@ -1,55 +0,0 @@ -! Copyright (C) 2008 Doug Coleman. -! See http://factorcode.org/license.txt for BSD license. -USING: accessors kernel math math.order symbols regexp2.parser -words regexp2.utils unicode.categories combinators.short-circuit ; -IN: regexp2.classes - -GENERIC: class-member? ( obj class -- ? ) - -M: word class-member? ( obj class -- ? ) 2drop f ; -M: integer class-member? ( obj class -- ? ) 2drop f ; - -M: character-class-range class-member? ( obj class -- ? ) - [ from>> ] [ to>> ] bi between? ; - -M: any-char class-member? ( obj class -- ? ) - 2drop t ; - -M: letter-class class-member? ( obj class -- ? ) - drop letter? ; - -M: LETTER-class class-member? ( obj class -- ? ) - drop LETTER? ; - -M: Letter-class class-member? ( obj class -- ? ) - drop Letter? ; - -M: ascii-class class-member? ( obj class -- ? ) - drop ascii? ; - -M: digit-class class-member? ( obj class -- ? ) - drop digit? ; - -M: alpha-class class-member? ( obj class -- ? ) - drop alpha? ; - -M: punctuation-class class-member? ( obj class -- ? ) - drop punct? ; - -M: java-printable-class class-member? ( obj class -- ? ) - drop java-printable? ; - -M: non-newline-blank-class class-member? ( obj class -- ? ) - drop { [ blank? ] [ CHAR: \n = not ] } 1&& ; - -M: control-character-class class-member? ( obj class -- ? ) - drop control-char? ; - -M: hex-digit-class class-member? ( obj class -- ? ) - drop hex-digit? ; - -M: java-blank-class class-member? ( obj class -- ? ) - drop java-blank? ; - -M: unmatchable-class class-member? ( obj class -- ? ) - 2drop f ; diff --git a/extra/regexp2/dfa/dfa.factor b/extra/regexp2/dfa/dfa.factor deleted file mode 100644 index 0dcf6c4ab5..0000000000 --- a/extra/regexp2/dfa/dfa.factor +++ /dev/null @@ -1,70 +0,0 @@ -! Copyright (C) 2008 Doug Coleman. -! See http://factorcode.org/license.txt for BSD license. -USING: accessors arrays assocs combinators fry kernel locals -math math.order regexp2.nfa regexp2.transition-tables sequences -sets sorting vectors regexp2.utils sequences.lib ; -USING: io prettyprint threads ; -IN: regexp2.dfa - -: find-delta ( states transition regexp -- new-states ) - nfa-table>> transitions>> - rot [ swap at at ] with with map sift concat prune ; - -: (find-epsilon-closure) ( states regexp -- new-states ) - eps swap find-delta ; - -: find-epsilon-closure ( states regexp -- new-states ) - '[ dup , (find-epsilon-closure) union ] [ length ] while-changes - natural-sort ; - -: find-closure ( states transition regexp -- new-states ) - [ find-delta ] 2keep nip find-epsilon-closure ; - -: find-start-state ( regexp -- state ) - [ nfa-table>> start-state>> 1vector ] keep find-epsilon-closure ; - -: find-transitions ( seq1 regexp -- seq2 ) - nfa-table>> transitions>> - [ at keys ] curry map concat eps swap remove ; - -: add-todo-state ( state regexp -- ) - 2dup visited-states>> key? [ - 2drop - ] [ - [ visited-states>> conjoin ] - [ new-states>> push ] 2bi - ] if ; - -: new-transitions ( regexp -- ) - dup new-states>> [ - drop - ] [ - dupd pop dup pick find-transitions rot - [ - [ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep - >r swapd transition boa r> dfa-table>> add-transition - ] curry with each - new-transitions - ] if-empty ; - -: states ( hashtable -- array ) - [ keys ] - [ values [ values concat ] map concat append ] bi ; - -: set-final-states ( regexp -- ) - dup - [ nfa-table>> final-states>> keys ] - [ dfa-table>> transitions>> states ] bi - [ intersect empty? not ] with filter - - swap dfa-table>> final-states>> - [ conjoin ] curry each ; - -: set-initial-state ( regexp -- ) - dup - [ dfa-table>> ] [ find-start-state ] bi - [ >>start-state drop ] keep - 1vector >>new-states drop ; - -: construct-dfa ( regexp -- ) - [ set-initial-state ] [ new-transitions ] [ set-final-states ] tri ; diff --git a/extra/regexp2/nfa/nfa.factor b/extra/regexp2/nfa/nfa.factor deleted file mode 100644 index f87a2a7b52..0000000000 --- a/extra/regexp2/nfa/nfa.factor +++ /dev/null @@ -1,126 +0,0 @@ -! Copyright (C) 2008 Doug Coleman. -! See http://factorcode.org/license.txt for BSD license. -USING: accessors arrays assocs grouping kernel regexp2.backend -locals math namespaces regexp2.parser sequences state-tables fry -quotations math.order math.ranges vectors unicode.categories -regexp2.utils regexp2.transition-tables words sequences.lib ; -IN: regexp2.nfa - -SYMBOL: negation-mode -: negated? ( -- ? ) negation-mode get 0 or odd? ; - -SINGLETON: eps - -: next-state ( regexp -- state ) - [ state>> ] [ [ 1+ ] change-state drop ] bi ; - -: set-start-state ( regexp -- ) - dup stack>> [ - drop - ] [ - [ nfa-table>> ] [ pop first ] bi* >>start-state drop - ] if-empty ; - -GENERIC: nfa-node ( node -- ) - -:: add-simple-entry ( obj class -- ) - [let* | regexp [ current-regexp get ] - s0 [ regexp next-state ] - s1 [ regexp next-state ] - stack [ regexp stack>> ] - table [ regexp nfa-table>> ] | - negated? [ - s0 f obj class boa table add-transition - s0 s1 table add-transition - ] [ - s0 s1 obj class boa table add-transition - ] if - s0 s1 2array stack push - t s1 table final-states>> set-at ] ; - -:: concatenate-nodes ( -- ) - [let* | regexp [ current-regexp get ] - stack [ regexp stack>> ] - table [ regexp nfa-table>> ] - s2 [ stack peek first ] - s3 [ stack pop second ] - s0 [ stack peek first ] - s1 [ stack pop second ] | - s1 s2 eps table add-transition - s1 table final-states>> delete-at - s0 s3 2array stack push ] ; - -:: alternate-nodes ( -- ) - [let* | regexp [ current-regexp get ] - stack [ regexp stack>> ] - table [ regexp nfa-table>> ] - s2 [ stack peek first ] - s3 [ stack pop second ] - s0 [ stack peek first ] - s1 [ stack pop second ] - s4 [ regexp next-state ] - s5 [ regexp next-state ] | - s4 s0 eps table add-transition - s4 s2 eps table add-transition - s1 s5 eps table add-transition - s3 s5 eps table add-transition - s1 table final-states>> delete-at - s3 table final-states>> delete-at - t s5 table final-states>> set-at - s4 s5 2array stack push ] ; - -M: kleene-star nfa-node ( node -- ) - term>> nfa-node - [let* | regexp [ current-regexp get ] - stack [ regexp stack>> ] - s0 [ stack peek first ] - s1 [ stack pop second ] - s2 [ regexp next-state ] - s3 [ regexp next-state ] - table [ regexp nfa-table>> ] | - s1 table final-states>> delete-at - t s3 table final-states>> set-at - s1 s0 eps table add-transition - s2 s0 eps table add-transition - s2 s3 eps table add-transition - s1 s3 eps table add-transition - s2 s3 2array stack push ] ; - -M: concatenation nfa-node ( node -- ) - seq>> - [ [ nfa-node ] each ] - [ length 1- [ concatenate-nodes ] times ] bi ; - -M: alternation nfa-node ( node -- ) - seq>> - [ [ nfa-node ] each ] - [ length 1- [ alternate-nodes ] times ] bi ; - -M: constant nfa-node ( node -- ) - char>> literal-transition add-simple-entry ; - -M: epsilon nfa-node ( node -- ) - drop eps literal-transition add-simple-entry ; - -M: word nfa-node ( node -- ) - class-transition add-simple-entry ; - -M: character-class-range nfa-node ( node -- ) - class-transition add-simple-entry ; - -M: capture-group nfa-node ( node -- ) - term>> nfa-node ; - -M: negation nfa-node ( node -- ) - negation-mode inc - term>> nfa-node - negation-mode dec ; - -: construct-nfa ( regexp -- ) - [ - reset-regexp - negation-mode off - [ current-regexp set ] - [ parse-tree>> nfa-node ] - [ set-start-state ] tri - ] with-scope ; diff --git a/extra/regexp2/parser/parser-tests.factor b/extra/regexp2/parser/parser-tests.factor deleted file mode 100644 index 9dc7dc7909..0000000000 --- a/extra/regexp2/parser/parser-tests.factor +++ /dev/null @@ -1,33 +0,0 @@ -USING: kernel tools.test regexp2.backend regexp2 ; -IN: regexp2.parser - -: test-regexp ( string -- ) - default-regexp parse-regexp ; - -: test-regexp2 ( string -- regexp ) - default-regexp dup parse-regexp ; - -[ "(" ] [ unmatched-parentheses? ] must-fail-with - -[ ] [ "a|b" test-regexp ] unit-test -[ ] [ "a.b" test-regexp ] unit-test -[ ] [ "a|b|c" test-regexp ] unit-test -[ ] [ "abc|b" test-regexp ] unit-test -[ ] [ "a|bcd" test-regexp ] unit-test -[ ] [ "a|(b)" test-regexp ] unit-test -[ ] [ "(a)|b" test-regexp ] unit-test -[ ] [ "(a|b)" test-regexp ] unit-test -[ ] [ "((a)|(b))" test-regexp ] unit-test - -[ ] [ "(?:a)" test-regexp ] unit-test -[ ] [ "(?i:a)" test-regexp ] unit-test -[ ] [ "(?-i:a)" test-regexp ] unit-test -[ "(?z:a)" test-regexp ] [ bad-option? ] must-fail-with -[ "(?-z:a)" test-regexp ] [ bad-option? ] must-fail-with - -[ ] [ "(?=a)" test-regexp ] unit-test - -[ ] [ "[abc]" test-regexp ] unit-test -[ ] [ "[a-c]" test-regexp ] unit-test -[ ] [ "[^a-c]" test-regexp ] unit-test -[ "[^]" test-regexp ] must-fail diff --git a/extra/regexp2/parser/parser.factor b/extra/regexp2/parser/parser.factor deleted file mode 100644 index 6eda3310d0..0000000000 --- a/extra/regexp2/parser/parser.factor +++ /dev/null @@ -1,391 +0,0 @@ -! Copyright (C) 2008 Doug Coleman. -! See http://factorcode.org/license.txt for BSD license. -USING: accessors arrays assocs combinators io io.streams.string -kernel math math.parser multi-methods namespaces qualified sets -quotations sequences sequences.lib splitting symbols vectors -dlists math.order combinators.lib unicode.categories strings -sequences.lib regexp2.backend regexp2.utils unicode.case ; -IN: regexp2.parser - -FROM: math.ranges => [a,b] ; - -MIXIN: node -TUPLE: concatenation seq ; INSTANCE: concatenation node -TUPLE: alternation seq ; INSTANCE: alternation node -TUPLE: kleene-star term ; INSTANCE: kleene-star node -TUPLE: question term ; INSTANCE: question node -TUPLE: negation term ; INSTANCE: negation node -TUPLE: constant char ; INSTANCE: constant node -TUPLE: range from to ; INSTANCE: range node -TUPLE: lookahead term ; INSTANCE: lookahead node -TUPLE: lookbehind term ; INSTANCE: lookbehind node -TUPLE: capture-group term ; INSTANCE: capture-group node -TUPLE: non-capture-group term ; INSTANCE: non-capture-group node -TUPLE: independent-group term ; INSTANCE: independent-group node -TUPLE: character-class-range from to ; INSTANCE: character-class-range node -SINGLETON: epsilon INSTANCE: epsilon node -SINGLETON: any-char INSTANCE: any-char node -SINGLETON: front-anchor INSTANCE: front-anchor node -SINGLETON: back-anchor INSTANCE: back-anchor node - -TUPLE: option-on option ; INSTANCE: option-on node -TUPLE: option-off option ; INSTANCE: option-off node -SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case reversed-regexp ; - -SINGLETONS: letter-class LETTER-class Letter-class digit-class -alpha-class non-newline-blank-class -ascii-class punctuation-class java-printable-class blank-class -control-character-class hex-digit-class java-blank-class c-identifier-class -unmatchable-class ; - -SINGLETONS: beginning-of-group end-of-group -beginning-of-character-class end-of-character-class -left-parenthesis pipe caret dash ; - -: get-option ( option -- ? ) current-regexp get options>> at ; -: get-unix-lines ( -- ? ) unix-lines get-option ; -: get-dotall ( -- ? ) dotall get-option ; -: get-multiline ( -- ? ) multiline get-option ; -: get-comments ( -- ? ) comments get-option ; -: get-case-insensitive ( -- ? ) case-insensitive get-option ; -: get-unicode-case ( -- ? ) unicode-case get-option ; -: get-reversed-regexp ( -- ? ) reversed-regexp get-option ; - -: ( obj -- negation ) negation boa ; -: ( seq -- concatenation ) - >vector get-reversed-regexp [ reverse ] when - concatenation boa ; -: ( seq -- alternation ) >vector alternation boa ; -: ( obj -- capture-group ) capture-group boa ; -: ( obj -- kleene-star ) kleene-star boa ; -: ( obj -- constant ) - dup Letter? get-case-insensitive and [ - [ ch>lower constant boa ] - [ ch>upper constant boa ] bi 2array - ] [ - constant boa - ] if ; - -: first|concatenation ( seq -- first/concatenation ) - dup length 1 = [ first ] [ ] if ; - -: first|alternation ( seq -- first/alternation ) - dup length 1 = [ first ] [ ] if ; - -: ( from to -- obj ) - 2dup [ Letter? ] bi@ or get-case-insensitive and [ - [ [ ch>lower ] bi@ character-class-range boa ] - [ [ ch>upper ] bi@ character-class-range boa ] 2bi - 2array [ [ from>> ] [ to>> ] bi < ] filter - [ unmatchable-class ] [ first|alternation ] if-empty - ] [ - 2dup < - [ character-class-range boa ] [ 2drop unmatchable-class ] if - ] if ; - -ERROR: unmatched-parentheses ; - -: make-positive-lookahead ( string -- ) - lookahead boa push-stack ; - -: make-negative-lookahead ( string -- ) - lookahead boa push-stack ; - -: make-independent-group ( string -- ) - #! no backtracking - independent-group boa push-stack ; - -: make-positive-lookbehind ( string -- ) - lookbehind boa push-stack ; - -: make-negative-lookbehind ( string -- ) - lookbehind boa push-stack ; - -DEFER: nested-parse-regexp -: make-non-capturing-group ( string -- ) - non-capture-group boa push-stack ; - -ERROR: bad-option ch ; - -: option ( ch -- singleton ) - { - { CHAR: i [ case-insensitive ] } - { CHAR: d [ unix-lines ] } - { CHAR: m [ multiline ] } - { CHAR: r [ reversed-regexp ] } - { CHAR: s [ dotall ] } - { CHAR: u [ unicode-case ] } - { CHAR: x [ comments ] } - [ bad-option ] - } case ; - -: option-on ( option -- ) current-regexp get options>> conjoin ; -: option-off ( option -- ) current-regexp get options>> delete-at ; - -: toggle-option ( ch ? -- ) [ option ] dip [ option-on ] [ option-off ] if ; -: (parse-options) ( string ? -- ) [ toggle-option ] curry each ; - -: parse-options ( string -- ) - "-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ; - -DEFER: (parse-regexp) -: parse-special-group ( -- ) - beginning-of-group push-stack - (parse-regexp) pop-stack make-non-capturing-group ; - -ERROR: bad-special-group string ; - -: (parse-special-group) ( -- ) - read1 { - { [ dup CHAR: : = ] - [ drop nested-parse-regexp pop-stack make-non-capturing-group ] } - { [ dup CHAR: = = ] - [ drop nested-parse-regexp pop-stack make-positive-lookahead ] } - { [ dup CHAR: = = ] - [ drop nested-parse-regexp pop-stack make-negative-lookahead ] } - { [ dup CHAR: > = ] - [ drop nested-parse-regexp pop-stack make-independent-group ] } - { [ dup CHAR: < = peek1 CHAR: = = and ] - [ drop read1 drop nested-parse-regexp pop-stack make-positive-lookbehind ] } - { [ dup CHAR: < = peek1 CHAR: ! = and ] - [ drop read1 drop nested-parse-regexp pop-stack make-negative-lookbehind ] } - [ - ":)" read-until - [ swap prefix ] dip - { - { CHAR: : [ parse-options parse-special-group ] } - { CHAR: ) [ parse-options ] } - [ drop bad-special-group ] - } case - ] - } cond ; - -: handle-left-parenthesis ( -- ) - peek1 CHAR: ? = - [ read1 drop (parse-special-group) ] - [ nested-parse-regexp ] if ; - -: handle-dot ( -- ) any-char push-stack ; -: handle-pipe ( -- ) pipe push-stack ; -: handle-star ( -- ) stack pop push-stack ; -: handle-question ( -- ) - stack pop epsilon 2array push-stack ; -: handle-plus ( -- ) - stack pop dup 2array push-stack ; - -ERROR: unmatched-brace ; -: parse-repetition ( -- start finish ? ) - "}" read-until [ unmatched-brace ] unless - [ "," split1 [ string>number ] bi@ ] - [ CHAR: , swap index >boolean ] bi ; - -: replicate/concatenate ( n obj -- obj' ) - over zero? [ 2drop epsilon ] - [ first|concatenation ] if ; - -: exactly-n ( n -- ) - stack pop replicate/concatenate push-stack ; - -: at-least-n ( n -- ) - stack pop - [ replicate/concatenate ] keep - 2array push-stack ; - -: at-most-n ( n -- ) - 1+ - stack pop - [ replicate/concatenate ] curry map push-stack ; - -: from-m-to-n ( m n -- ) - [a,b] - stack pop - [ replicate/concatenate ] curry map - push-stack ; - -ERROR: invalid-range a b ; - -: handle-left-brace ( -- ) - parse-repetition - >r 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ r> - [ - 2dup and [ from-m-to-n ] - [ [ nip at-most-n ] [ at-least-n ] if* ] if - ] [ drop 0 max exactly-n ] if ; - -: handle-front-anchor ( -- ) front-anchor push-stack ; -: handle-back-anchor ( -- ) back-anchor push-stack ; - -ERROR: bad-character-class obj ; -ERROR: expected-posix-class ; - -: parse-posix-class ( -- obj ) - read1 CHAR: { = [ expected-posix-class ] unless - "}" read-until [ bad-character-class ] unless - { - { "Lower" [ get-case-insensitive Letter-class letter-class ? ] } - { "Upper" [ get-case-insensitive Letter-class LETTER-class ? ] } - { "Alpha" [ Letter-class ] } - { "ASCII" [ ascii-class ] } - { "Digit" [ digit-class ] } - { "Alnum" [ alpha-class ] } - { "Punct" [ punctuation-class ] } - { "Graph" [ java-printable-class ] } - { "Print" [ java-printable-class ] } - { "Blank" [ non-newline-blank-class ] } - { "Cntrl" [ control-character-class ] } - { "XDigit" [ hex-digit-class ] } - { "Space" [ java-blank-class ] } - ! TODO: unicode-character-class, fallthrough in unicode is bad-char-clss - [ bad-character-class ] - } case ; - -: parse-octal ( -- n ) 3 read oct> check-octal ; -: parse-short-hex ( -- n ) 2 read hex> check-hex ; -: parse-long-hex ( -- n ) 6 read hex> check-hex ; -: parse-control-character ( -- n ) read1 ; - -ERROR: bad-escaped-literals seq ; -: parse-escaped-literals ( -- obj ) - "\\E" read-until [ bad-escaped-literals ] unless - read1 drop - [ epsilon ] [ - [ ] V{ } map-as - first|concatenation - ] if-empty ; - -: parse-escaped ( -- obj ) - read1 - { - { CHAR: \ [ CHAR: \ ] } - { CHAR: . [ CHAR: . ] } - { CHAR: t [ CHAR: \t ] } - { CHAR: n [ CHAR: \n ] } - { CHAR: r [ CHAR: \r ] } - { CHAR: f [ HEX: c ] } - { CHAR: a [ HEX: 7 ] } - { CHAR: e [ HEX: 1b ] } - - { CHAR: d [ digit-class ] } - { CHAR: D [ digit-class ] } - { CHAR: s [ java-blank-class ] } - { CHAR: S [ java-blank-class ] } - { CHAR: w [ c-identifier-class ] } - { CHAR: W [ c-identifier-class ] } - - { CHAR: p [ parse-posix-class ] } - { CHAR: P [ parse-posix-class ] } - { CHAR: x [ parse-short-hex ] } - { CHAR: u [ parse-long-hex ] } - { CHAR: 0 [ parse-octal ] } - { CHAR: c [ parse-control-character ] } - - ! { CHAR: b [ handle-word-boundary ] } - ! { CHAR: B [ handle-word-boundary ] } - ! { CHAR: A [ handle-beginning-of-input ] } - ! { CHAR: G [ end of previous match ] } - ! { CHAR: Z [ handle-end-of-input ] } - ! { CHAR: z [ handle-end-of-input ] } ! except for terminator - - { CHAR: Q [ parse-escaped-literals ] } - } case ; - -: handle-escape ( -- ) parse-escaped push-stack ; - -: handle-dash ( vector -- vector' ) - H{ { dash CHAR: - } } substitute ; - -: character-class>alternation ( seq -- alternation ) - [ dup number? [ ] when ] map first|alternation ; - -: handle-caret ( vector -- vector' ) - dup [ length 2 >= ] [ first caret eq? ] bi and [ - rest-slice character-class>alternation - ] [ - character-class>alternation - ] if ; - -: make-character-class ( -- character-class ) - [ beginning-of-character-class swap cut-stack ] change-whole-stack - handle-dash handle-caret ; - -: apply-dash ( -- ) - stack [ pop3 nip ] keep push ; - -: apply-dash? ( -- ? ) - stack dup length 3 >= - [ [ length 2 - ] keep nth dash eq? ] [ drop f ] if ; - -ERROR: empty-negated-character-class ; -DEFER: handle-left-bracket -: (parse-character-class) ( -- ) - read1 [ empty-negated-character-class ] unless* { - { CHAR: [ [ handle-left-bracket t ] } - { CHAR: ] [ make-character-class push-stack f ] } - { CHAR: - [ dash push-stack t ] } - { CHAR: \ [ parse-escaped push-stack t ] } - [ push-stack apply-dash? [ apply-dash ] when t ] - } case - [ (parse-character-class) ] when ; - -: parse-character-class-second ( -- ) - read1 { - { CHAR: [ [ CHAR: [ push-stack ] } - { CHAR: ] [ CHAR: ] push-stack ] } - { CHAR: - [ CHAR: - push-stack ] } - [ push1 ] - } case ; - -: parse-character-class-first ( -- ) - read1 { - { CHAR: ^ [ caret push-stack parse-character-class-second ] } - { CHAR: [ [ CHAR: [ push-stack ] } - { CHAR: ] [ CHAR: ] push-stack ] } - { CHAR: - [ CHAR: - push-stack ] } - [ push1 ] - } case ; - -: handle-left-bracket ( -- ) - beginning-of-character-class push-stack - parse-character-class-first (parse-character-class) ; - -: finish-regexp-parse ( stack -- obj ) - { pipe } split - [ first|concatenation ] map first|alternation ; - -: handle-right-parenthesis ( -- ) - stack beginning-of-group over last-index cut rest - [ current-regexp get swap >>stack drop ] - [ finish-regexp-parse push-stack ] bi* ; - -: nested-parse-regexp ( -- ) - beginning-of-group push-stack (parse-regexp) ; - -: ((parse-regexp)) ( token -- ) - { - { CHAR: . [ handle-dot ] } - { CHAR: ( [ handle-left-parenthesis ] } - { CHAR: ) [ handle-right-parenthesis ] } - { CHAR: | [ handle-pipe ] } - { CHAR: ? [ handle-question ] } - { CHAR: * [ handle-star ] } - { CHAR: + [ handle-plus ] } - { CHAR: { [ handle-left-brace ] } - { CHAR: [ [ handle-left-bracket ] } - { CHAR: ^ [ handle-front-anchor ] } - { CHAR: $ [ handle-back-anchor ] } - { CHAR: \ [ handle-escape ] } - [ push-stack ] - } case ; - -: (parse-regexp) ( -- ) - read1 [ ((parse-regexp)) (parse-regexp) ] when* ; - -: parse-regexp ( regexp -- ) - dup current-regexp [ - raw>> [ - [ (parse-regexp) ] with-input-stream - ] unless-empty - current-regexp get - stack finish-regexp-parse - >>parse-tree drop - ] with-variable ; diff --git a/extra/regexp2/regexp2-docs.factor b/extra/regexp2/regexp2-docs.factor deleted file mode 100644 index f903c14bc4..0000000000 --- a/extra/regexp2/regexp2-docs.factor +++ /dev/null @@ -1,14 +0,0 @@ -! Copyright (C) 2008 Doug Coleman. -! See http://factorcode.org/license.txt for BSD license. -USING: kernel strings help.markup help.syntax regexp2.backend ; -IN: regexp2 - -HELP: -{ $values { "string" string } { "regexp" regexp } } -{ $description "Compiles a regular expression into a DFA and returns this object. Regular expressions only have to be compiled once and can then be used multiple times to match input strings." } ; - -HELP: -{ $values { "string" string } { "regexp" regexp } } -{ $description "Compiles a case-insensitive regular expression into a DFA and returns this object. Otherwise, exactly the same as " { $link } } ; - -{ } related-words diff --git a/extra/regexp2/regexp2-tests.factor b/extra/regexp2/regexp2-tests.factor deleted file mode 100644 index 54626ea165..0000000000 --- a/extra/regexp2/regexp2-tests.factor +++ /dev/null @@ -1,263 +0,0 @@ -USING: regexp2 tools.test kernel sequences regexp2.parser -regexp2.traversal ; -IN: regexp2-tests - -[ f ] [ "b" "a*" matches? ] unit-test -[ t ] [ "" "a*" matches? ] unit-test -[ t ] [ "a" "a*" matches? ] unit-test -[ t ] [ "aaaaaaa" "a*" matches? ] unit-test -[ f ] [ "ab" "a*" matches? ] unit-test - -[ t ] [ "abc" "abc" matches? ] unit-test -[ t ] [ "a" "a|b|c" matches? ] unit-test -[ t ] [ "b" "a|b|c" matches? ] unit-test -[ t ] [ "c" "a|b|c" matches? ] unit-test -[ f ] [ "c" "d|e|f" matches? ] unit-test - -[ f ] [ "aa" "a|b|c" matches? ] unit-test -[ f ] [ "bb" "a|b|c" matches? ] unit-test -[ f ] [ "cc" "a|b|c" matches? ] unit-test -[ f ] [ "cc" "d|e|f" matches? ] unit-test - -[ f ] [ "" "a+" matches? ] unit-test -[ t ] [ "a" "a+" matches? ] unit-test -[ t ] [ "aa" "a+" matches? ] unit-test - -[ t ] [ "" "a?" matches? ] unit-test -[ t ] [ "a" "a?" matches? ] unit-test -[ f ] [ "aa" "a?" matches? ] unit-test - -[ f ] [ "" "." matches? ] unit-test -[ t ] [ "a" "." matches? ] unit-test -[ t ] [ "." "." matches? ] unit-test -! [ f ] [ "\n" "." matches? ] unit-test - -[ f ] [ "" ".+" matches? ] unit-test -[ t ] [ "a" ".+" matches? ] unit-test -[ t ] [ "ab" ".+" matches? ] unit-test - - -[ t ] [ "" "a|b*|c+|d?" matches? ] unit-test -[ t ] [ "a" "a|b*|c+|d?" matches? ] unit-test -[ t ] [ "c" "a|b*|c+|d?" matches? ] unit-test -[ t ] [ "cc" "a|b*|c+|d?" matches? ] unit-test -[ f ] [ "ccd" "a|b*|c+|d?" matches? ] unit-test -[ t ] [ "d" "a|b*|c+|d?" matches? ] unit-test - -[ t ] [ "foo" "foo|bar" matches? ] unit-test -[ t ] [ "bar" "foo|bar" matches? ] unit-test -[ f ] [ "foobar" "foo|bar" matches? ] unit-test - -[ f ] [ "" "(a)" matches? ] unit-test -[ t ] [ "a" "(a)" matches? ] unit-test -[ f ] [ "aa" "(a)" matches? ] unit-test -[ t ] [ "aa" "(a*)" matches? ] unit-test - -[ f ] [ "aababaaabbac" "(a|b)+" matches? ] unit-test -[ t ] [ "ababaaabba" "(a|b)+" matches? ] unit-test - -[ f ] [ "" "a{1}" matches? ] unit-test -[ t ] [ "a" "a{1}" matches? ] unit-test -[ f ] [ "aa" "a{1}" matches? ] unit-test - -[ f ] [ "a" "a{2,}" matches? ] unit-test -[ t ] [ "aaa" "a{2,}" matches? ] unit-test -[ t ] [ "aaaa" "a{2,}" matches? ] unit-test -[ t ] [ "aaaaa" "a{2,}" matches? ] unit-test - -[ t ] [ "" "a{,2}" matches? ] unit-test -[ t ] [ "a" "a{,2}" matches? ] unit-test -[ t ] [ "aa" "a{,2}" matches? ] unit-test -[ f ] [ "aaa" "a{,2}" matches? ] unit-test -[ f ] [ "aaaa" "a{,2}" matches? ] unit-test -[ f ] [ "aaaaa" "a{,2}" matches? ] unit-test - -[ f ] [ "" "a{1,3}" matches? ] unit-test -[ t ] [ "a" "a{1,3}" matches? ] unit-test -[ t ] [ "aa" "a{1,3}" matches? ] unit-test -[ t ] [ "aaa" "a{1,3}" matches? ] unit-test -[ f ] [ "aaaa" "a{1,3}" matches? ] unit-test - -[ f ] [ "" "[a]" matches? ] unit-test -[ t ] [ "a" "[a]" matches? ] unit-test -[ t ] [ "a" "[abc]" matches? ] unit-test -[ f ] [ "b" "[a]" matches? ] unit-test -[ f ] [ "d" "[abc]" matches? ] unit-test -[ t ] [ "ab" "[abc]{1,2}" matches? ] unit-test -[ f ] [ "abc" "[abc]{1,2}" matches? ] unit-test - -[ f ] [ "" "[^a]" matches? ] unit-test -[ f ] [ "a" "[^a]" matches? ] unit-test -[ f ] [ "a" "[^abc]" matches? ] unit-test -[ t ] [ "b" "[^a]" matches? ] unit-test -[ t ] [ "d" "[^abc]" matches? ] unit-test -[ f ] [ "ab" "[^abc]{1,2}" matches? ] unit-test -[ f ] [ "abc" "[^abc]{1,2}" matches? ] unit-test - -[ t ] [ "]" "[]]" matches? ] unit-test -[ f ] [ "]" "[^]]" matches? ] unit-test -[ t ] [ "a" "[^]]" matches? ] unit-test - -[ "^" "[^]" matches? ] must-fail -[ t ] [ "^" "[]^]" matches? ] unit-test -[ t ] [ "]" "[]^]" matches? ] unit-test - -[ t ] [ "[" "[[]" matches? ] unit-test -[ f ] [ "^" "[^^]" matches? ] unit-test -[ t ] [ "a" "[^^]" matches? ] unit-test - -[ t ] [ "-" "[-]" matches? ] unit-test -[ f ] [ "a" "[-]" matches? ] unit-test -[ f ] [ "-" "[^-]" matches? ] unit-test -[ t ] [ "a" "[^-]" matches? ] unit-test - -[ t ] [ "-" "[-a]" matches? ] unit-test -[ t ] [ "a" "[-a]" matches? ] unit-test -[ t ] [ "-" "[a-]" matches? ] unit-test -[ t ] [ "a" "[a-]" matches? ] unit-test -[ f ] [ "b" "[a-]" matches? ] unit-test -[ f ] [ "-" "[^-]" matches? ] unit-test -[ t ] [ "a" "[^-]" matches? ] unit-test - -[ f ] [ "-" "[a-c]" matches? ] unit-test -[ t ] [ "-" "[^a-c]" matches? ] unit-test -[ t ] [ "b" "[a-c]" matches? ] unit-test -[ f ] [ "b" "[^a-c]" matches? ] unit-test - -[ t ] [ "-" "[a-c-]" matches? ] unit-test -[ f ] [ "-" "[^a-c-]" matches? ] unit-test - -[ t ] [ "\\" "[\\\\]" matches? ] unit-test -[ f ] [ "a" "[\\\\]" matches? ] unit-test -[ f ] [ "\\" "[^\\\\]" matches? ] unit-test -[ t ] [ "a" "[^\\\\]" matches? ] unit-test - -[ t ] [ "0" "[\\d]" matches? ] unit-test -[ f ] [ "a" "[\\d]" matches? ] unit-test -[ f ] [ "0" "[^\\d]" matches? ] unit-test -[ t ] [ "a" "[^\\d]" matches? ] unit-test - -[ t ] [ "a" "[a-z]{1,}|[A-Z]{2,4}|b*|c|(f|g)*" matches? ] unit-test -[ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}|b*|c|(f|g)*" matches? ] unit-test -[ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}" matches? ] unit-test - -[ t ] [ "1000" "\\d{4,6}" matches? ] unit-test -[ t ] [ "1000" "[0-9]{4,6}" matches? ] unit-test - -[ t ] [ "abc" "\\p{Lower}{3}" matches? ] unit-test -[ f ] [ "ABC" "\\p{Lower}{3}" matches? ] unit-test -[ t ] [ "ABC" "\\p{Upper}{3}" matches? ] unit-test -[ f ] [ "abc" "\\p{Upper}{3}" matches? ] unit-test -! -[ f ] [ "abc" "[\\p{Upper}]{3}" matches? ] unit-test -[ t ] [ "ABC" "[\\p{Upper}]{3}" matches? ] unit-test - -[ f ] [ "" "\\Q\\E" matches? ] unit-test -[ f ] [ "a" "\\Q\\E" matches? ] unit-test -[ t ] [ "|*+" "\\Q|*+\\E" matches? ] unit-test -[ f ] [ "abc" "\\Q|*+\\E" matches? ] unit-test -[ t ] [ "s" "\\Qs\\E" matches? ] unit-test - -[ t ] [ "S" "\\0123" matches? ] unit-test -[ t ] [ "SXY" "\\0123XY" matches? ] unit-test -[ t ] [ "x" "\\x78" matches? ] unit-test -[ f ] [ "y" "\\x78" matches? ] unit-test -[ t ] [ "x" "\\u000078" matches? ] unit-test -[ f ] [ "y" "\\u000078" matches? ] unit-test - -[ t ] [ "ab" "a+b" matches? ] unit-test -[ f ] [ "b" "a+b" matches? ] unit-test -[ t ] [ "aab" "a+b" matches? ] unit-test -[ f ] [ "abb" "a+b" matches? ] unit-test - -[ t ] [ "abbbb" "ab*" matches? ] unit-test -[ t ] [ "a" "ab*" matches? ] unit-test -[ f ] [ "abab" "ab*" matches? ] unit-test - -[ f ] [ "x" "\\." matches? ] unit-test -[ t ] [ "." "\\." matches? ] unit-test - -[ t ] [ "aaaab" "a+ab" matches? ] unit-test -[ f ] [ "aaaxb" "a+ab" matches? ] unit-test -[ t ] [ "aaacb" "a+cb" matches? ] unit-test -[ f ] [ "aaaab" "a++ab" matches? ] unit-test -[ t ] [ "aaacb" "a++cb" matches? ] unit-test - -[ 3 ] [ "aaacb" "a*" match-head ] unit-test -[ 1 ] [ "aaacb" "a+?" match-head ] unit-test -[ 2 ] [ "aaacb" "aa?" match-head ] unit-test -[ 1 ] [ "aaacb" "aa??" match-head ] unit-test -[ 3 ] [ "aacb" "aa?c" match-head ] unit-test -[ 3 ] [ "aacb" "aa??c" match-head ] unit-test - -! [ t ] [ "aaa" "AAA" t matches? ] unit-test -! [ f ] [ "aax" "AAA" t matches? ] unit-test -! [ t ] [ "aaa" "A*" t matches? ] unit-test -! [ f ] [ "aaba" "A*" t matches? ] unit-test -! [ t ] [ "b" "[AB]" t matches? ] unit-test -! [ f ] [ "c" "[AB]" t matches? ] unit-test -! [ t ] [ "c" "[A-Z]" t matches? ] unit-test -! [ f ] [ "3" "[A-Z]" t matches? ] unit-test - -[ ] [ - "(0[lL]?|[1-9]\\d{0,9}(\\d{0,9}[lL])?|0[xX]\\p{XDigit}{1,8}(\\p{XDigit}{0,8}[lL])?|0[0-7]{1,11}([0-7]{0,11}[lL])?|([0-9]+\\.[0-9]*|\\.[0-9]+)([eE][+-]?[0-9]+)?[fFdD]?|[0-9]+([eE][+-]?[0-9]+[fFdD]?|([eE][+-]?[0-9]+)?[fFdD]))" - drop -] unit-test - -[ "{Lower}" ] [ invalid-range? ] must-fail-with - -[ t ] [ "fxxbar" "(?!foo).{3}bar" matches? ] unit-test -[ f ] [ "foobar" "(?!foo).{3}bar" matches? ] unit-test - -! [ 3 ] [ "foobar" "foo(?=bar)" match-head ] unit-test -! [ f ] [ "foobxr" "foo(?=bar)" match-head ] unit-test - -! [ f ] [ "foobxr" "foo\\z" match-head ] unit-test -! [ 3 ] [ "foo" "foo\\z" match-head ] unit-test - -! [ 3 ] [ "foo bar" "foo\\b" match-head ] unit-test -! [ f ] [ "fooxbar" "foo\\b" matches? ] unit-test -! [ t ] [ "foo" "foo\\b" matches? ] unit-test -! [ t ] [ "foo bar" "foo\\b bar" matches? ] unit-test -! [ f ] [ "fooxbar" "foo\\bxbar" matches? ] unit-test -! [ f ] [ "foo" "foo\\bbar" matches? ] unit-test - -! [ f ] [ "foo bar" "foo\\B" matches? ] unit-test -! [ 3 ] [ "fooxbar" "foo\\B" match-head ] unit-test -! [ t ] [ "foo" "foo\\B" matches? ] unit-test -! [ f ] [ "foo bar" "foo\\B bar" matches? ] unit-test -! [ t ] [ "fooxbar" "foo\\Bxbar" matches? ] unit-test -! [ f ] [ "foo" "foo\\Bbar" matches? ] unit-test - -[ t ] [ "s@f" "[a-z.-]@[a-z]" matches? ] unit-test -[ f ] [ "a" "[a-z.-]@[a-z]" matches? ] unit-test -[ t ] [ ".o" "\\.[a-z]" matches? ] unit-test - -[ t ] [ "a" "(?i)a" matches? ] unit-test -[ t ] [ "a" "(?i)a" matches? ] unit-test -[ t ] [ "A" "(?i)a" matches? ] unit-test -[ t ] [ "A" "(?i)a" matches? ] unit-test - -[ t ] [ "a" "(?-i)a" matches? ] unit-test -[ t ] [ "a" "(?-i)a" matches? ] unit-test -[ f ] [ "A" "(?-i)a" matches? ] unit-test -[ f ] [ "A" "(?-i)a" matches? ] unit-test - -[ f ] [ "A" "[a-z]" matches? ] unit-test -[ t ] [ "A" "[a-z]" matches? ] unit-test - -[ f ] [ "A" "\\p{Lower}" matches? ] unit-test -[ t ] [ "A" "\\p{Lower}" matches? ] unit-test - -[ t ] [ "abc" "abc" matches? ] unit-test -[ t ] [ "abc" "a[bB][cC]" matches? ] unit-test -[ t ] [ "adcbe" "a(?r)(bcd)(?-r)e" matches? ] unit-test - -! Bug in parsing word -! [ t ] [ "a" R' a' matches? ] unit-test - -! ((A)(B(C))) -! 1. ((A)(B(C))) -! 2. (A) -! 3. (B(C)) -! 4. (C) diff --git a/extra/regexp2/regexp2.factor b/extra/regexp2/regexp2.factor deleted file mode 100644 index 0b8994ca2b..0000000000 --- a/extra/regexp2/regexp2.factor +++ /dev/null @@ -1,59 +0,0 @@ -! Copyright (C) 2008 Doug Coleman. -! See http://factorcode.org/license.txt for BSD license. -USING: accessors combinators kernel math math.ranges -sequences regexp2.backend regexp2.utils memoize sets -regexp2.parser regexp2.nfa regexp2.dfa regexp2.traversal -regexp2.transition-tables ; -IN: regexp2 - -: default-regexp ( string -- regexp ) - regexp new - swap >>raw - >>nfa-table - >>dfa-table - >>minimized-table - reset-regexp ; - -: construct-regexp ( regexp -- regexp' ) - { - [ parse-regexp ] - [ construct-nfa ] - [ construct-dfa ] - [ ] - } cleave ; - -: match ( string regexp -- pair ) - do-match return-match ; - -: matches? ( string regexp -- ? ) - dupd match [ [ length ] [ range-length 1- ] bi* = ] [ drop f ] if* ; - -: match-head ( string regexp -- end ) match length>> 1- ; - -: initial-option ( regexp option -- regexp' ) - over options>> conjoin ; - -: ( string -- regexp ) - default-regexp construct-regexp ; - -: ( string -- regexp ) - default-regexp - case-insensitive initial-option - construct-regexp ; - -: ( string -- regexp ) - default-regexp - reversed-regexp initial-option - construct-regexp ; - -: R! CHAR: ! ; parsing -: R" CHAR: " ; parsing -: R# CHAR: # ; parsing -: R' CHAR: ' ; parsing -: R( CHAR: ) ; parsing -: R/ CHAR: / ; parsing -: R@ CHAR: @ ; parsing -: R[ CHAR: ] ; parsing -: R` CHAR: ` ; parsing -: R{ CHAR: } ; parsing -: R| CHAR: | ; parsing diff --git a/extra/regexp2/summary.txt b/extra/regexp2/summary.txt deleted file mode 100644 index aa1e1c27a9..0000000000 --- a/extra/regexp2/summary.txt +++ /dev/null @@ -1 +0,0 @@ -Regular expressions diff --git a/extra/regexp2/tags.txt b/extra/regexp2/tags.txt deleted file mode 100644 index 65bc471f6b..0000000000 --- a/extra/regexp2/tags.txt +++ /dev/null @@ -1,2 +0,0 @@ -parsing -text diff --git a/extra/regexp2/transition-tables/transition-tables.factor b/extra/regexp2/transition-tables/transition-tables.factor deleted file mode 100644 index 0547846655..0000000000 --- a/extra/regexp2/transition-tables/transition-tables.factor +++ /dev/null @@ -1,44 +0,0 @@ -! Copyright (C) 2008 Doug Coleman. -! See http://factorcode.org/license.txt for BSD license. -USING: accessors arrays assocs fry hashtables kernel sequences -vectors ; -IN: regexp2.transition-tables - -: insert-at ( value key hash -- ) - 2dup at* [ - 2nip push - ] [ - drop >r >r dup vector? [ 1vector ] unless r> r> set-at - ] if ; - -: ?insert-at ( value key hash/f -- hash ) - [ H{ } clone ] unless* [ insert-at ] keep ; - -TUPLE: transition from to obj ; -TUPLE: literal-transition < transition ; -TUPLE: class-transition < transition ; -TUPLE: default-transition < transition ; - -TUPLE: literal obj ; -TUPLE: class obj ; -TUPLE: default ; -: ( from to obj -- transition ) literal-transition boa ; -: ( from to obj -- transition ) class-transition boa ; -: ( from to -- transition ) t default-transition boa ; - -TUPLE: transition-table transitions - literals classes defaults - start-state final-states ; - -: ( -- transition-table ) - transition-table new - H{ } clone >>transitions - H{ } clone >>final-states ; - -: set-transition ( transition hash -- ) - >r [ to>> ] [ obj>> ] [ from>> ] tri r> - 2dup at* [ 2nip insert-at ] - [ drop >r >r H{ } clone [ insert-at ] keep r> r> set-at ] if ; - -: add-transition ( transition transition-table -- ) - transitions>> set-transition ; diff --git a/extra/regexp2/traversal/traversal.factor b/extra/regexp2/traversal/traversal.factor deleted file mode 100644 index 94e96bb935..0000000000 --- a/extra/regexp2/traversal/traversal.factor +++ /dev/null @@ -1,80 +0,0 @@ -! Copyright (C) 2008 Doug Coleman. -! See http://factorcode.org/license.txt for BSD license. -USING: accessors assocs combinators combinators.lib kernel -math math.ranges quotations sequences regexp2.parser -regexp2.classes combinators.short-circuit assocs.lib -sequences.lib ; -IN: regexp2.traversal - -TUPLE: dfa-traverser - dfa-table - last-state current-state - text - start-index current-index - matches ; - -: ( text regexp -- match ) - dfa-table>> - dfa-traverser new - swap [ start-state>> >>current-state ] keep - >>dfa-table - swap >>text - 0 >>start-index - 0 >>current-index - V{ } clone >>matches ; - -: final-state? ( dfa-traverser -- ? ) - [ current-state>> ] [ dfa-table>> final-states>> ] bi - key? ; - -: text-finished? ( dfa-traverser -- ? ) - [ current-index>> ] [ text>> length ] bi >= ; - -: save-final-state ( dfa-straverser -- ) - [ current-index>> ] [ matches>> ] bi push ; - -: match-done? ( dfa-traverser -- ? ) - dup final-state? [ - dup save-final-state - ] when text-finished? ; - -: increment-state ( dfa-traverser state -- dfa-traverser ) - >r [ 1+ ] change-current-index - dup current-state>> >>last-state r> - first >>current-state ; - -: match-failed ( dfa-traverser -- dfa-traverser ) - V{ } clone >>matches ; - -: match-literal ( transition from-state table -- to-state/f ) - transitions>> [ at ] [ 2drop f ] if-at ; - -: assoc-with ( param assoc quot -- assoc curry ) - swapd [ [ -rot ] dip call ] 2curry ; inline - -: match-class ( transition from-state table -- to-state/f ) - transitions>> at* [ - [ drop class-member? ] assoc-with assoc-find [ nip ] [ drop ] if - ] [ drop ] if ; - -: match-default ( transition from-state table -- to-state/f ) - [ nip ] dip transitions>> - [ t swap [ drop f ] unless-at ] [ drop f ] if-at ; - -: match-transition ( obj from-state dfa -- to-state/f ) - { [ match-literal ] [ match-class ] [ match-default ] } 3|| ; - -: setup-match ( match -- obj state dfa-table ) - { current-index>> text>> current-state>> dfa-table>> } get-slots - [ nth ] 2dip ; - -: do-match ( dfa-traverser -- dfa-traverser ) - dup match-done? [ - dup setup-match match-transition - [ increment-state do-match ] when* - ] unless ; - -: return-match ( dfa-traverser -- interval/f ) - dup matches>> - [ drop f ] - [ [ start-index>> ] [ peek ] bi* 1 ] if-empty ; diff --git a/extra/regexp2/utils/utils.factor b/extra/regexp2/utils/utils.factor deleted file mode 100644 index 0167e73005..0000000000 --- a/extra/regexp2/utils/utils.factor +++ /dev/null @@ -1,69 +0,0 @@ -! Copyright (C) 2008 Doug Coleman. -! See http://factorcode.org/license.txt for BSD license. -USING: accessors arrays assocs combinators.lib io kernel -math math.order namespaces regexp2.backend sequences -sequences.lib unicode.categories math.ranges fry -combinators.short-circuit ; -IN: regexp2.utils - -: (while-changes) ( obj quot pred pred-ret -- obj ) - ! quot: ( obj -- obj' ) - ! pred: ( obj -- <=> ) - >r >r dup slip r> pick over call r> dupd = - [ 3drop ] [ (while-changes) ] if ; inline - -: while-changes ( obj quot pred -- obj' ) - pick over call (while-changes) ; inline - -: last-state ( regexp -- range ) stack>> peek first2 [a,b] ; -: push1 ( obj -- ) input-stream get stream>> push ; -: peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ; -: pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ; - -: stack ( -- obj ) current-regexp get stack>> ; -: change-whole-stack ( quot -- ) - current-regexp get - [ stack>> swap call ] keep (>>stack) ; inline -: push-stack ( obj -- ) stack push ; -: pop-stack ( -- obj ) stack pop ; -: cut-out ( vector n -- vector' vector ) cut rest ; -ERROR: cut-stack-error ; -: cut-stack ( obj vector -- vector' vector ) - tuck last-index [ cut-stack-error ] unless* cut-out swap ; - -ERROR: bad-octal number ; -ERROR: bad-hex number ; -: check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ; -: check-hex ( hex -- hex ) dup number? [ bad-hex ] unless ; - -: ascii? ( n -- ? ) 0 HEX: 7f between? ; -: octal-digit? ( n -- ? ) CHAR: 0 CHAR: 7 between? ; -: decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ; - -: hex-digit? ( n -- ? ) - [ - [ decimal-digit? ] - [ CHAR: a CHAR: f between? ] - [ CHAR: A CHAR: F between? ] - ] 1|| ; - -: control-char? ( n -- ? ) - [ - [ 0 HEX: 1f between? ] - [ HEX: 7f = ] - ] 1|| ; - -: punct? ( n -- ? ) - "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ; - -: c-identifier-char? ( ch -- ? ) - [ [ alpha? ] [ CHAR: _ = ] ] 1|| ; - -: java-blank? ( n -- ? ) - { - CHAR: \s CHAR: \t CHAR: \n - HEX: b HEX: 7 CHAR: \r - } member? ; - -: java-printable? ( n -- ? ) - [ [ alpha? ] [ punct? ] ] 1|| ;