From e41cdf5e8f6a848df14a015b70ca18612b630c35 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Fri, 20 Feb 2009 17:54:48 -0600 Subject: [PATCH] Various unfinshed regexp changes --- basis/regexp/ast/ast.factor | 8 +- basis/regexp/classes/classes.factor | 60 +++++++------ basis/regexp/dfa/dfa.factor | 31 ++++++- basis/regexp/nfa/nfa.factor | 126 +++++++++++++++------------- basis/regexp/parser/parser.factor | 6 +- basis/regexp/regexp-tests.factor | 16 ++++ 6 files changed, 153 insertions(+), 94 deletions(-) diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor index e1308f0855..65748005f4 100644 --- a/basis/regexp/ast/ast.factor +++ b/basis/regexp/ast/ast.factor @@ -18,7 +18,7 @@ SINGLETON: epsilon TUPLE: concatenation first second ; : ( seq -- concatenation ) - epsilon [ concatenation boa ] reduce ; + [ epsilon ] [ unclip [ concatenation boa ] reduce ] if-empty ; TUPLE: alternation first second ; @@ -54,3 +54,9 @@ M: from-to : char-class ( ranges ? -- term ) [ ] dip [ ] when ; + +TUPLE: lookahead term ; +C: lookahead + +TUPLE: lookbehind term ; +C: lookbehind diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index aaa650726c..516b6b4a1d 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -4,28 +4,6 @@ USING: accessors kernel math math.order words ascii unicode.categories combinators.short-circuit sequences ; IN: regexp.classes -: punct? ( ch -- ? ) - "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ; - -: c-identifier-char? ( ch -- ? ) - { [ alpha? ] [ CHAR: _ = ] } 1|| ; - -: java-blank? ( ch -- ? ) - { - CHAR: \s CHAR: \t CHAR: \n - HEX: b HEX: 7 CHAR: \r - } member? ; - -: java-printable? ( ch -- ? ) - [ [ alpha? ] [ punct? ] ] 1|| ; - -: hex-digit? ( ch -- ? ) - { - [ CHAR: A CHAR: F between? ] - [ CHAR: a CHAR: f between? ] - [ CHAR: 0 CHAR: 9 between? ] - } 1|| ; - SINGLETONS: any-char any-char-no-nl letter-class LETTER-class Letter-class digit-class alpha-class non-newline-blank-class @@ -70,16 +48,24 @@ M: ascii-class class-member? ( obj class -- ? ) M: digit-class class-member? ( obj class -- ? ) drop digit? ; +: c-identifier-char? ( ch -- ? ) + { [ alpha? ] [ CHAR: _ = ] } 1|| ; + M: c-identifier-class class-member? ( obj class -- ? ) - drop - { [ digit? ] [ Letter? ] [ CHAR: _ = ] } 1|| ; + drop c-identifier-char? ; M: alpha-class class-member? ( obj class -- ? ) drop alpha? ; +: punct? ( ch -- ? ) + "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ; + M: punctuation-class class-member? ( obj class -- ? ) drop punct? ; +: java-printable? ( ch -- ? ) + { [ alpha? ] [ punct? ] } 1|| ; + M: java-printable-class class-member? ( obj class -- ? ) drop java-printable? ; @@ -89,9 +75,22 @@ M: non-newline-blank-class class-member? ( obj class -- ? ) M: control-character-class class-member? ( obj class -- ? ) drop control? ; +: hex-digit? ( ch -- ? ) + { + [ CHAR: A CHAR: F between? ] + [ CHAR: a CHAR: f between? ] + [ CHAR: 0 CHAR: 9 between? ] + } 1|| ; + M: hex-digit-class class-member? ( obj class -- ? ) drop hex-digit? ; +: java-blank? ( ch -- ? ) + { + CHAR: \s CHAR: \t CHAR: \n + HEX: b HEX: 7 CHAR: \r + } member? ; + M: java-blank-class class-member? ( obj class -- ? ) drop java-blank? ; @@ -99,13 +98,7 @@ M: unmatchable-class class-member? ( obj class -- ? ) 2drop f ; M: terminator-class class-member? ( obj class -- ? ) - drop { - [ CHAR: \r = ] - [ CHAR: \n = ] - [ CHAR: \u000085 = ] - [ CHAR: \u002028 = ] - [ CHAR: \u002029 = ] - } 1|| ; + drop "\r\n\u000085\u002029\u002028" member? ; M: beginning-of-line class-member? ( obj class -- ? ) 2drop f ; @@ -119,6 +112,9 @@ C: or-class TUPLE: not-class class ; C: not-class +: ( classes -- class ) + [ ] map ; + TUPLE: primitive-class class ; C: primitive-class @@ -130,3 +126,5 @@ M: not-class class-member? M: primitive-class class-member? class>> class-member? ; + +UNION: class primitive-class not-class or-class range ; diff --git a/basis/regexp/dfa/dfa.factor b/basis/regexp/dfa/dfa.factor index 88e4e8f9ff..9834ca4ca0 100644 --- a/basis/regexp/dfa/dfa.factor +++ b/basis/regexp/dfa/dfa.factor @@ -2,7 +2,7 @@ ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays assocs combinators fry kernel locals math math.order regexp.nfa regexp.transition-tables sequences -sets sorting vectors sequences.deep ; +sets sorting vectors sequences.deep math.functions regexp.classes ; USING: io prettyprint threads ; IN: regexp.dfa @@ -17,6 +17,34 @@ IN: regexp.dfa : while-changes ( obj quot pred -- obj' ) 3dup nip call (while-changes) ; inline +TUPLE: parts in out ; + +: make-partition ( choices classes -- partition ) + zip [ first ] partition parts boa ; + +: powerset-partition ( classes -- partitions ) + ! Here is where class algebra will happen, when I implement it + [ length [ 2^ ] keep ] keep '[ + _ [ ] map-bits _ make-partition + ] map ; + +: partition>class ( parts -- class ) + [ in>> ] [ out>> ] bi + [ ] bi@ 2array ; + +: get-transitions ( partition state-transitions -- next-states ) + [ in>> ] dip '[ at ] gather ; + +: disambiguate-overlap ( nfa -- nfa' ) + [ + [ + [ keys powerset-partition ] keep '[ + [ partition>class ] + [ _ get-transitions ] bi + ] H{ } map>assoc + ] assoc-map + ] change-transitions ; + : find-delta ( states transition nfa -- new-states ) transitions>> '[ _ swap _ at at ] gather sift ; @@ -72,6 +100,7 @@ IN: regexp.dfa swap find-start-state >>start-state ; : construct-dfa ( nfa -- dfa ) + disambiguate-overlap dup initialize-dfa dup start-state>> 1vector H{ } clone diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index 6775124e60..370b354276 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -3,17 +3,26 @@ USING: accessors arrays assocs grouping kernel locals math namespaces sequences fry quotations math.order math.ranges vectors unicode.categories -regexp.transition-tables words sets hashtables +regexp.transition-tables words sets hashtables combinators.short-circuit unicode.case.private regexp.ast regexp.classes ; +IN: regexp.nfa + ! This uses unicode.case.private for ch>upper and ch>lower ! but case-insensitive matching should be done by case-folding everything ! before processing starts -IN: regexp.nfa -SYMBOL: negated? +GENERIC: remove-lookahead ( syntax-tree -- syntax-tree' ) +! This is unfinished and does nothing right now! -: negate ( -- ) - negated? [ not ] change ; +M: object remove-lookahead ; + +M: with-options remove-lookahead + [ tree>> remove-lookahead ] [ options>> ] bi ; + +M: alternation remove-lookahead + [ first>> ] [ second>> ] bi [ remove-lookahead ] bi@ ; + +M: concatenation remove-lookahead ; SINGLETON: eps @@ -45,16 +54,9 @@ SYMBOL: nfa-table GENERIC: nfa-node ( node -- start-state end-state ) -:: add-simple-entry ( obj class -- start-state end-state ) - next-state :> s0 - next-state :> s1 - negated? get [ - s0 f obj class make-transition table add-transition - s0 s1 table add-transition - ] [ - s0 s1 obj class make-transition table add-transition - ] if - s0 s1 ; +: add-simple-entry ( obj class -- start-state end-state ) + [ next-state next-state 2dup ] 2dip + make-transition table add-transition ; : epsilon-transition ( source target -- ) eps table add-transition ; @@ -92,62 +94,66 @@ M: alternation nfa-node ( node -- start end ) [ nfa-node ] bi@ alternate-nodes ; +GENERIC: modify-class ( char-class -- char-class' ) + +M: object modify-class ; + +M: integer modify-class + case-insensitive option? [ + dup Letter? [ + [ ch>lower ] [ ch>upper ] bi 2array + ] when + ] when ; + M: integer nfa-node ( node -- start end ) + modify-class dup class? + class-transition literal-transition ? + add-simple-entry ; + +M: primitive-class modify-class + class>> modify-class ; + +M: or-class modify-class + seq>> [ modify-class ] map ; + +M: not-class modify-class + class>> modify-class ; + +M: any-char modify-class + [ dotall option? ] dip any-char-no-nl ? ; + +: modify-letter-class ( class -- newclass ) + case-insensitive option? [ drop Letter-class ] when ; +M: letter-class modify-class modify-letter-class ; +M: LETTER-class modify-class modify-letter-class ; + +: cased-range? ( range -- ? ) + [ from>> ] [ to>> ] bi { + [ [ letter? ] bi@ and ] + [ [ LETTER? ] bi@ and ] + } 2|| ; + +M: range modify-class case-insensitive option? [ - dup [ ch>lower ] [ ch>upper ] bi - 2dup = [ - 2drop - literal-transition add-simple-entry - ] [ - [ literal-transition add-simple-entry ] bi@ - alternate-nodes [ nip ] dip - ] if - ] [ literal-transition add-simple-entry ] if ; - -M: primitive-class nfa-node ( node -- start end ) - class>> dup - { letter-class LETTER-class } member? case-insensitive option? and - [ drop Letter-class ] when - class-transition add-simple-entry ; - -M: or-class nfa-node class-transition add-simple-entry ; -M: not-class nfa-node class-transition add-simple-entry ; - -M: any-char nfa-node ( node -- start end ) - [ dotall option? ] dip any-char-no-nl ? - class-transition add-simple-entry ; - -! M: negation nfa-node ( node -- start end ) -! negate term>> nfa-node negate ; - -M: range nfa-node ( node -- start end ) - case-insensitive option? [ - ! This should be implemented for Unicode by case-folding - ! the input and all strings in the regexp. - dup [ from>> ] [ to>> ] bi - 2dup [ Letter? ] bi@ and [ - rot drop + dup cased-range? [ + [ from>> ] [ to>> ] bi [ [ ch>lower ] bi@ ] [ [ ch>upper ] bi@ ] 2bi - [ class-transition add-simple-entry ] bi@ - alternate-nodes - ] [ - 2drop - class-transition add-simple-entry - ] if - ] [ - class-transition add-simple-entry - ] if ; + 2array + ] when + ] when ; + +M: class nfa-node + modify-class class-transition add-simple-entry ; M: with-options nfa-node ( node -- start end ) dup options>> [ tree>> nfa-node ] using-options ; : construct-nfa ( ast -- nfa-table ) [ - negated? off 0 state set - clone nfa-table set - nfa-node + nfa-table set + remove-lookahead nfa-node table swap dup associate >>final-states swap >>start-state diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index 3a7ba12552..18b43674c4 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -132,11 +132,15 @@ Parenthized = "?:" Alternation:a => [[ a ]] => [[ a on off parse-options ]] | "?#" [^)]* => [[ f ]] | "?~" Alternation:a => [[ a ]] + | "?=" Alternation:a => [[ a ]] + | "?!" Alternation:a => [[ a ]] + | "?<=" Alternation:a => [[ a ]] + | "? [[ a ]] | Alternation Element = "(" Parenthized:p ")" => [[ p ]] | "[" CharClass:r "]" => [[ r ]] - | ".":d => [[ any-char ]] + | ".":d => [[ any-char ]] | Character Number = (!(","|"}").)* => [[ string>number ensure-number ]] diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index 4331eaa250..0d9ed129c8 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -317,6 +317,22 @@ IN: regexp-tests ! Bug in parsing word [ t ] [ "a" R' a' matches? ] unit-test +! Testing negation +[ f ] [ "a" R/ (?~a)/ matches? ] unit-test +[ t ] [ "aa" R/ (?~a)/ matches? ] unit-test +[ t ] [ "bb" R/ (?~a)/ matches? ] unit-test +[ t ] [ "" R/ (?~a)/ matches? ] unit-test + +[ f ] [ "a" R/ (?~a+|b)/ matches? ] unit-test +[ f ] [ "aa" R/ (?~a+|b)/ matches? ] unit-test +[ t ] [ "bb" R/ (?~a+|b)/ matches? ] unit-test +[ f ] [ "b" R/ (?~a+|b)/ matches? ] unit-test +[ t ] [ "" R/ (?~a+|b)/ matches? ] unit-test + +! Intersecting classes +[ t ] [ "ab" R/ ac|\p{Lower}b/ matches? ] unit-test +[ t ] [ "ab" R/ ac|[a-z]b/ matches? ] unit-test + ! [ t ] [ "a" R/ ^a/ matches? ] unit-test ! [ f ] [ "\na" R/ ^a/ matches? ] unit-test ! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test