Various unfinshed regexp changes

db4
Daniel Ehrenberg 2009-02-20 17:54:48 -06:00
parent f535b66aed
commit e41cdf5e8f
6 changed files with 153 additions and 94 deletions

View File

@ -18,7 +18,7 @@ SINGLETON: epsilon
TUPLE: concatenation first second ; TUPLE: concatenation first second ;
: <concatenation> ( seq -- concatenation ) : <concatenation> ( seq -- concatenation )
epsilon [ concatenation boa ] reduce ; [ epsilon ] [ unclip [ concatenation boa ] reduce ] if-empty ;
TUPLE: alternation first second ; TUPLE: alternation first second ;
@ -54,3 +54,9 @@ M: from-to <times>
: char-class ( ranges ? -- term ) : char-class ( ranges ? -- term )
[ <or-class> ] dip [ <not-class> ] when ; [ <or-class> ] dip [ <not-class> ] when ;
TUPLE: lookahead term ;
C: <lookahead> lookahead
TUPLE: lookbehind term ;
C: <lookbehind> lookbehind

View File

@ -4,28 +4,6 @@ USING: accessors kernel math math.order words
ascii unicode.categories combinators.short-circuit sequences ; ascii unicode.categories combinators.short-circuit sequences ;
IN: regexp.classes IN: regexp.classes
: punct? ( ch -- ? )
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
: c-identifier-char? ( ch -- ? )
{ [ alpha? ] [ CHAR: _ = ] } 1|| ;
: java-blank? ( ch -- ? )
{
CHAR: \s CHAR: \t CHAR: \n
HEX: b HEX: 7 CHAR: \r
} member? ;
: java-printable? ( ch -- ? )
[ [ alpha? ] [ punct? ] ] 1|| ;
: hex-digit? ( ch -- ? )
{
[ CHAR: A CHAR: F between? ]
[ CHAR: a CHAR: f between? ]
[ CHAR: 0 CHAR: 9 between? ]
} 1|| ;
SINGLETONS: any-char any-char-no-nl SINGLETONS: any-char any-char-no-nl
letter-class LETTER-class Letter-class digit-class letter-class LETTER-class Letter-class digit-class
alpha-class non-newline-blank-class alpha-class non-newline-blank-class
@ -70,16 +48,24 @@ M: ascii-class class-member? ( obj class -- ? )
M: digit-class class-member? ( obj class -- ? ) M: digit-class class-member? ( obj class -- ? )
drop digit? ; drop digit? ;
: c-identifier-char? ( ch -- ? )
{ [ alpha? ] [ CHAR: _ = ] } 1|| ;
M: c-identifier-class class-member? ( obj class -- ? ) M: c-identifier-class class-member? ( obj class -- ? )
drop drop c-identifier-char? ;
{ [ digit? ] [ Letter? ] [ CHAR: _ = ] } 1|| ;
M: alpha-class class-member? ( obj class -- ? ) M: alpha-class class-member? ( obj class -- ? )
drop alpha? ; drop alpha? ;
: punct? ( ch -- ? )
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
M: punctuation-class class-member? ( obj class -- ? ) M: punctuation-class class-member? ( obj class -- ? )
drop punct? ; drop punct? ;
: java-printable? ( ch -- ? )
{ [ alpha? ] [ punct? ] } 1|| ;
M: java-printable-class class-member? ( obj class -- ? ) M: java-printable-class class-member? ( obj class -- ? )
drop java-printable? ; drop java-printable? ;
@ -89,9 +75,22 @@ M: non-newline-blank-class class-member? ( obj class -- ? )
M: control-character-class class-member? ( obj class -- ? ) M: control-character-class class-member? ( obj class -- ? )
drop control? ; drop control? ;
: hex-digit? ( ch -- ? )
{
[ CHAR: A CHAR: F between? ]
[ CHAR: a CHAR: f between? ]
[ CHAR: 0 CHAR: 9 between? ]
} 1|| ;
M: hex-digit-class class-member? ( obj class -- ? ) M: hex-digit-class class-member? ( obj class -- ? )
drop hex-digit? ; drop hex-digit? ;
: java-blank? ( ch -- ? )
{
CHAR: \s CHAR: \t CHAR: \n
HEX: b HEX: 7 CHAR: \r
} member? ;
M: java-blank-class class-member? ( obj class -- ? ) M: java-blank-class class-member? ( obj class -- ? )
drop java-blank? ; drop java-blank? ;
@ -99,13 +98,7 @@ M: unmatchable-class class-member? ( obj class -- ? )
2drop f ; 2drop f ;
M: terminator-class class-member? ( obj class -- ? ) M: terminator-class class-member? ( obj class -- ? )
drop { drop "\r\n\u000085\u002029\u002028" member? ;
[ CHAR: \r = ]
[ CHAR: \n = ]
[ CHAR: \u000085 = ]
[ CHAR: \u002028 = ]
[ CHAR: \u002029 = ]
} 1|| ;
M: beginning-of-line class-member? ( obj class -- ? ) M: beginning-of-line class-member? ( obj class -- ? )
2drop f ; 2drop f ;
@ -119,6 +112,9 @@ C: <or-class> or-class
TUPLE: not-class class ; TUPLE: not-class class ;
C: <not-class> not-class C: <not-class> not-class
: <and-class> ( classes -- class )
[ <not-class> ] map <or-class> <not-class> ;
TUPLE: primitive-class class ; TUPLE: primitive-class class ;
C: <primitive-class> primitive-class C: <primitive-class> primitive-class
@ -130,3 +126,5 @@ M: not-class class-member?
M: primitive-class class-member? M: primitive-class class-member?
class>> class-member? ; class>> class-member? ;
UNION: class primitive-class not-class or-class range ;

View File

@ -2,7 +2,7 @@
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs combinators fry kernel locals USING: accessors arrays assocs combinators fry kernel locals
math math.order regexp.nfa regexp.transition-tables sequences math math.order regexp.nfa regexp.transition-tables sequences
sets sorting vectors sequences.deep ; sets sorting vectors sequences.deep math.functions regexp.classes ;
USING: io prettyprint threads ; USING: io prettyprint threads ;
IN: regexp.dfa IN: regexp.dfa
@ -17,6 +17,34 @@ IN: regexp.dfa
: while-changes ( obj quot pred -- obj' ) : while-changes ( obj quot pred -- obj' )
3dup nip call (while-changes) ; inline 3dup nip call (while-changes) ; inline
TUPLE: parts in out ;
: make-partition ( choices classes -- partition )
zip [ first ] partition parts boa ;
: powerset-partition ( classes -- partitions )
! Here is where class algebra will happen, when I implement it
[ length [ 2^ ] keep ] keep '[
_ [ ] map-bits _ make-partition
] map ;
: partition>class ( parts -- class )
[ in>> ] [ out>> ] bi
[ <or-class> ] bi@ <not-class> 2array <and-class> ;
: get-transitions ( partition state-transitions -- next-states )
[ in>> ] dip '[ at ] gather ;
: disambiguate-overlap ( nfa -- nfa' )
[
[
[ keys powerset-partition ] keep '[
[ partition>class ]
[ _ get-transitions ] bi
] H{ } map>assoc
] assoc-map
] change-transitions ;
: find-delta ( states transition nfa -- new-states ) : find-delta ( states transition nfa -- new-states )
transitions>> '[ _ swap _ at at ] gather sift ; transitions>> '[ _ swap _ at at ] gather sift ;
@ -72,6 +100,7 @@ IN: regexp.dfa
swap find-start-state >>start-state ; swap find-start-state >>start-state ;
: construct-dfa ( nfa -- dfa ) : construct-dfa ( nfa -- dfa )
disambiguate-overlap
dup initialize-dfa dup initialize-dfa
dup start-state>> 1vector dup start-state>> 1vector
H{ } clone H{ } clone

View File

@ -3,17 +3,26 @@
USING: accessors arrays assocs grouping kernel USING: accessors arrays assocs grouping kernel
locals math namespaces sequences fry quotations locals math namespaces sequences fry quotations
math.order math.ranges vectors unicode.categories math.order math.ranges vectors unicode.categories
regexp.transition-tables words sets hashtables regexp.transition-tables words sets hashtables combinators.short-circuit
unicode.case.private regexp.ast regexp.classes ; unicode.case.private regexp.ast regexp.classes ;
IN: regexp.nfa
! This uses unicode.case.private for ch>upper and ch>lower ! This uses unicode.case.private for ch>upper and ch>lower
! but case-insensitive matching should be done by case-folding everything ! but case-insensitive matching should be done by case-folding everything
! before processing starts ! before processing starts
IN: regexp.nfa
SYMBOL: negated? GENERIC: remove-lookahead ( syntax-tree -- syntax-tree' )
! This is unfinished and does nothing right now!
: negate ( -- ) M: object remove-lookahead ;
negated? [ not ] change ;
M: with-options remove-lookahead
[ tree>> remove-lookahead ] [ options>> ] bi <with-options> ;
M: alternation remove-lookahead
[ first>> ] [ second>> ] bi [ remove-lookahead ] bi@ ;
M: concatenation remove-lookahead ;
SINGLETON: eps SINGLETON: eps
@ -45,16 +54,9 @@ SYMBOL: nfa-table
GENERIC: nfa-node ( node -- start-state end-state ) GENERIC: nfa-node ( node -- start-state end-state )
:: add-simple-entry ( obj class -- start-state end-state ) : add-simple-entry ( obj class -- start-state end-state )
next-state :> s0 [ next-state next-state 2dup ] 2dip
next-state :> s1 make-transition table add-transition ;
negated? get [
s0 f obj class make-transition table add-transition
s0 s1 <default-transition> table add-transition
] [
s0 s1 obj class make-transition table add-transition
] if
s0 s1 ;
: epsilon-transition ( source target -- ) : epsilon-transition ( source target -- )
eps <literal-transition> table add-transition ; eps <literal-transition> table add-transition ;
@ -92,62 +94,66 @@ M: alternation nfa-node ( node -- start end )
[ nfa-node ] bi@ [ nfa-node ] bi@
alternate-nodes ; alternate-nodes ;
GENERIC: modify-class ( char-class -- char-class' )
M: object modify-class ;
M: integer modify-class
case-insensitive option? [
dup Letter? [
[ ch>lower ] [ ch>upper ] bi 2array <or-class>
] when
] when ;
M: integer nfa-node ( node -- start end ) M: integer nfa-node ( node -- start end )
modify-class dup class?
class-transition literal-transition ?
add-simple-entry ;
M: primitive-class modify-class
class>> modify-class <primitive-class> ;
M: or-class modify-class
seq>> [ modify-class ] map <or-class> ;
M: not-class modify-class
class>> modify-class <not-class> ;
M: any-char modify-class
[ dotall option? ] dip any-char-no-nl ? ;
: modify-letter-class ( class -- newclass )
case-insensitive option? [ drop Letter-class ] when ;
M: letter-class modify-class modify-letter-class ;
M: LETTER-class modify-class modify-letter-class ;
: cased-range? ( range -- ? )
[ from>> ] [ to>> ] bi {
[ [ letter? ] bi@ and ]
[ [ LETTER? ] bi@ and ]
} 2|| ;
M: range modify-class
case-insensitive option? [ case-insensitive option? [
dup [ ch>lower ] [ ch>upper ] bi dup cased-range? [
2dup = [ [ from>> ] [ to>> ] bi
2drop
literal-transition add-simple-entry
] [
[ literal-transition add-simple-entry ] bi@
alternate-nodes [ nip ] dip
] if
] [ literal-transition add-simple-entry ] if ;
M: primitive-class nfa-node ( node -- start end )
class>> dup
{ letter-class LETTER-class } member? case-insensitive option? and
[ drop Letter-class ] when
class-transition add-simple-entry ;
M: or-class nfa-node class-transition add-simple-entry ;
M: not-class nfa-node class-transition add-simple-entry ;
M: any-char nfa-node ( node -- start end )
[ dotall option? ] dip any-char-no-nl ?
class-transition add-simple-entry ;
! M: negation nfa-node ( node -- start end )
! negate term>> nfa-node negate ;
M: range nfa-node ( node -- start end )
case-insensitive option? [
! This should be implemented for Unicode by case-folding
! the input and all strings in the regexp.
dup [ from>> ] [ to>> ] bi
2dup [ Letter? ] bi@ and [
rot drop
[ [ ch>lower ] bi@ <range> ] [ [ ch>lower ] bi@ <range> ]
[ [ ch>upper ] bi@ <range> ] 2bi [ [ ch>upper ] bi@ <range> ] 2bi
[ class-transition add-simple-entry ] bi@ 2array <or-class>
alternate-nodes ] when
] [ ] when ;
2drop
class-transition add-simple-entry M: class nfa-node
] if modify-class class-transition add-simple-entry ;
] [
class-transition add-simple-entry
] if ;
M: with-options nfa-node ( node -- start end ) M: with-options nfa-node ( node -- start end )
dup options>> [ tree>> nfa-node ] using-options ; dup options>> [ tree>> nfa-node ] using-options ;
: construct-nfa ( ast -- nfa-table ) : construct-nfa ( ast -- nfa-table )
[ [
negated? off
0 state set 0 state set
<transition-table> clone nfa-table set <transition-table> nfa-table set
nfa-node remove-lookahead nfa-node
table table
swap dup associate >>final-states swap dup associate >>final-states
swap >>start-state swap >>start-state

View File

@ -132,11 +132,15 @@ Parenthized = "?:" Alternation:a => [[ a ]]
=> [[ a on off parse-options <with-options> ]] => [[ a on off parse-options <with-options> ]]
| "?#" [^)]* => [[ f ]] | "?#" [^)]* => [[ f ]]
| "?~" Alternation:a => [[ a <negation> ]] | "?~" Alternation:a => [[ a <negation> ]]
| "?=" Alternation:a => [[ a <lookahead> ]]
| "?!" Alternation:a => [[ a <negation> <lookahead> ]]
| "?<=" Alternation:a => [[ a <lookbehind> ]]
| "?<!" Alternation:a => [[ a <negation> <lookbehind> ]]
| Alternation | Alternation
Element = "(" Parenthized:p ")" => [[ p ]] Element = "(" Parenthized:p ")" => [[ p ]]
| "[" CharClass:r "]" => [[ r ]] | "[" CharClass:r "]" => [[ r ]]
| ".":d => [[ any-char ]] | ".":d => [[ any-char <primitive-class> ]]
| Character | Character
Number = (!(","|"}").)* => [[ string>number ensure-number ]] Number = (!(","|"}").)* => [[ string>number ensure-number ]]

View File

@ -317,6 +317,22 @@ IN: regexp-tests
! Bug in parsing word ! Bug in parsing word
[ t ] [ "a" R' a' matches? ] unit-test [ t ] [ "a" R' a' matches? ] unit-test
! Testing negation
[ f ] [ "a" R/ (?~a)/ matches? ] unit-test
[ t ] [ "aa" R/ (?~a)/ matches? ] unit-test
[ t ] [ "bb" R/ (?~a)/ matches? ] unit-test
[ t ] [ "" R/ (?~a)/ matches? ] unit-test
[ f ] [ "a" R/ (?~a+|b)/ matches? ] unit-test
[ f ] [ "aa" R/ (?~a+|b)/ matches? ] unit-test
[ t ] [ "bb" R/ (?~a+|b)/ matches? ] unit-test
[ f ] [ "b" R/ (?~a+|b)/ matches? ] unit-test
[ t ] [ "" R/ (?~a+|b)/ matches? ] unit-test
! Intersecting classes
[ t ] [ "ab" R/ ac|\p{Lower}b/ matches? ] unit-test
[ t ] [ "ab" R/ ac|[a-z]b/ matches? ] unit-test
! [ t ] [ "a" R/ ^a/ matches? ] unit-test ! [ t ] [ "a" R/ ^a/ matches? ] unit-test
! [ f ] [ "\na" R/ ^a/ matches? ] unit-test ! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test ! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test