Various unfinshed regexp changes

db4
Daniel Ehrenberg 2009-02-20 17:54:48 -06:00
parent f535b66aed
commit e41cdf5e8f
6 changed files with 153 additions and 94 deletions

View File

@ -18,7 +18,7 @@ SINGLETON: epsilon
TUPLE: concatenation first second ;
: <concatenation> ( seq -- concatenation )
epsilon [ concatenation boa ] reduce ;
[ epsilon ] [ unclip [ concatenation boa ] reduce ] if-empty ;
TUPLE: alternation first second ;
@ -54,3 +54,9 @@ M: from-to <times>
: char-class ( ranges ? -- term )
[ <or-class> ] dip [ <not-class> ] when ;
TUPLE: lookahead term ;
C: <lookahead> lookahead
TUPLE: lookbehind term ;
C: <lookbehind> lookbehind

View File

@ -4,28 +4,6 @@ USING: accessors kernel math math.order words
ascii unicode.categories combinators.short-circuit sequences ;
IN: regexp.classes
: punct? ( ch -- ? )
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
: c-identifier-char? ( ch -- ? )
{ [ alpha? ] [ CHAR: _ = ] } 1|| ;
: java-blank? ( ch -- ? )
{
CHAR: \s CHAR: \t CHAR: \n
HEX: b HEX: 7 CHAR: \r
} member? ;
: java-printable? ( ch -- ? )
[ [ alpha? ] [ punct? ] ] 1|| ;
: hex-digit? ( ch -- ? )
{
[ CHAR: A CHAR: F between? ]
[ CHAR: a CHAR: f between? ]
[ CHAR: 0 CHAR: 9 between? ]
} 1|| ;
SINGLETONS: any-char any-char-no-nl
letter-class LETTER-class Letter-class digit-class
alpha-class non-newline-blank-class
@ -70,16 +48,24 @@ M: ascii-class class-member? ( obj class -- ? )
M: digit-class class-member? ( obj class -- ? )
drop digit? ;
: c-identifier-char? ( ch -- ? )
{ [ alpha? ] [ CHAR: _ = ] } 1|| ;
M: c-identifier-class class-member? ( obj class -- ? )
drop
{ [ digit? ] [ Letter? ] [ CHAR: _ = ] } 1|| ;
drop c-identifier-char? ;
M: alpha-class class-member? ( obj class -- ? )
drop alpha? ;
: punct? ( ch -- ? )
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
M: punctuation-class class-member? ( obj class -- ? )
drop punct? ;
: java-printable? ( ch -- ? )
{ [ alpha? ] [ punct? ] } 1|| ;
M: java-printable-class class-member? ( obj class -- ? )
drop java-printable? ;
@ -89,9 +75,22 @@ M: non-newline-blank-class class-member? ( obj class -- ? )
M: control-character-class class-member? ( obj class -- ? )
drop control? ;
: hex-digit? ( ch -- ? )
{
[ CHAR: A CHAR: F between? ]
[ CHAR: a CHAR: f between? ]
[ CHAR: 0 CHAR: 9 between? ]
} 1|| ;
M: hex-digit-class class-member? ( obj class -- ? )
drop hex-digit? ;
: java-blank? ( ch -- ? )
{
CHAR: \s CHAR: \t CHAR: \n
HEX: b HEX: 7 CHAR: \r
} member? ;
M: java-blank-class class-member? ( obj class -- ? )
drop java-blank? ;
@ -99,13 +98,7 @@ M: unmatchable-class class-member? ( obj class -- ? )
2drop f ;
M: terminator-class class-member? ( obj class -- ? )
drop {
[ CHAR: \r = ]
[ CHAR: \n = ]
[ CHAR: \u000085 = ]
[ CHAR: \u002028 = ]
[ CHAR: \u002029 = ]
} 1|| ;
drop "\r\n\u000085\u002029\u002028" member? ;
M: beginning-of-line class-member? ( obj class -- ? )
2drop f ;
@ -119,6 +112,9 @@ C: <or-class> or-class
TUPLE: not-class class ;
C: <not-class> not-class
: <and-class> ( classes -- class )
[ <not-class> ] map <or-class> <not-class> ;
TUPLE: primitive-class class ;
C: <primitive-class> primitive-class
@ -130,3 +126,5 @@ M: not-class class-member?
M: primitive-class class-member?
class>> class-member? ;
UNION: class primitive-class not-class or-class range ;

View File

@ -2,7 +2,7 @@
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs combinators fry kernel locals
math math.order regexp.nfa regexp.transition-tables sequences
sets sorting vectors sequences.deep ;
sets sorting vectors sequences.deep math.functions regexp.classes ;
USING: io prettyprint threads ;
IN: regexp.dfa
@ -17,6 +17,34 @@ IN: regexp.dfa
: while-changes ( obj quot pred -- obj' )
3dup nip call (while-changes) ; inline
TUPLE: parts in out ;
: make-partition ( choices classes -- partition )
zip [ first ] partition parts boa ;
: powerset-partition ( classes -- partitions )
! Here is where class algebra will happen, when I implement it
[ length [ 2^ ] keep ] keep '[
_ [ ] map-bits _ make-partition
] map ;
: partition>class ( parts -- class )
[ in>> ] [ out>> ] bi
[ <or-class> ] bi@ <not-class> 2array <and-class> ;
: get-transitions ( partition state-transitions -- next-states )
[ in>> ] dip '[ at ] gather ;
: disambiguate-overlap ( nfa -- nfa' )
[
[
[ keys powerset-partition ] keep '[
[ partition>class ]
[ _ get-transitions ] bi
] H{ } map>assoc
] assoc-map
] change-transitions ;
: find-delta ( states transition nfa -- new-states )
transitions>> '[ _ swap _ at at ] gather sift ;
@ -72,6 +100,7 @@ IN: regexp.dfa
swap find-start-state >>start-state ;
: construct-dfa ( nfa -- dfa )
disambiguate-overlap
dup initialize-dfa
dup start-state>> 1vector
H{ } clone

View File

@ -3,17 +3,26 @@
USING: accessors arrays assocs grouping kernel
locals math namespaces sequences fry quotations
math.order math.ranges vectors unicode.categories
regexp.transition-tables words sets hashtables
regexp.transition-tables words sets hashtables combinators.short-circuit
unicode.case.private regexp.ast regexp.classes ;
IN: regexp.nfa
! This uses unicode.case.private for ch>upper and ch>lower
! but case-insensitive matching should be done by case-folding everything
! before processing starts
IN: regexp.nfa
SYMBOL: negated?
GENERIC: remove-lookahead ( syntax-tree -- syntax-tree' )
! This is unfinished and does nothing right now!
: negate ( -- )
negated? [ not ] change ;
M: object remove-lookahead ;
M: with-options remove-lookahead
[ tree>> remove-lookahead ] [ options>> ] bi <with-options> ;
M: alternation remove-lookahead
[ first>> ] [ second>> ] bi [ remove-lookahead ] bi@ ;
M: concatenation remove-lookahead ;
SINGLETON: eps
@ -45,16 +54,9 @@ SYMBOL: nfa-table
GENERIC: nfa-node ( node -- start-state end-state )
:: add-simple-entry ( obj class -- start-state end-state )
next-state :> s0
next-state :> s1
negated? get [
s0 f obj class make-transition table add-transition
s0 s1 <default-transition> table add-transition
] [
s0 s1 obj class make-transition table add-transition
] if
s0 s1 ;
: add-simple-entry ( obj class -- start-state end-state )
[ next-state next-state 2dup ] 2dip
make-transition table add-transition ;
: epsilon-transition ( source target -- )
eps <literal-transition> table add-transition ;
@ -92,62 +94,66 @@ M: alternation nfa-node ( node -- start end )
[ nfa-node ] bi@
alternate-nodes ;
GENERIC: modify-class ( char-class -- char-class' )
M: object modify-class ;
M: integer modify-class
case-insensitive option? [
dup Letter? [
[ ch>lower ] [ ch>upper ] bi 2array <or-class>
] when
] when ;
M: integer nfa-node ( node -- start end )
modify-class dup class?
class-transition literal-transition ?
add-simple-entry ;
M: primitive-class modify-class
class>> modify-class <primitive-class> ;
M: or-class modify-class
seq>> [ modify-class ] map <or-class> ;
M: not-class modify-class
class>> modify-class <not-class> ;
M: any-char modify-class
[ dotall option? ] dip any-char-no-nl ? ;
: modify-letter-class ( class -- newclass )
case-insensitive option? [ drop Letter-class ] when ;
M: letter-class modify-class modify-letter-class ;
M: LETTER-class modify-class modify-letter-class ;
: cased-range? ( range -- ? )
[ from>> ] [ to>> ] bi {
[ [ letter? ] bi@ and ]
[ [ LETTER? ] bi@ and ]
} 2|| ;
M: range modify-class
case-insensitive option? [
dup [ ch>lower ] [ ch>upper ] bi
2dup = [
2drop
literal-transition add-simple-entry
] [
[ literal-transition add-simple-entry ] bi@
alternate-nodes [ nip ] dip
] if
] [ literal-transition add-simple-entry ] if ;
M: primitive-class nfa-node ( node -- start end )
class>> dup
{ letter-class LETTER-class } member? case-insensitive option? and
[ drop Letter-class ] when
class-transition add-simple-entry ;
M: or-class nfa-node class-transition add-simple-entry ;
M: not-class nfa-node class-transition add-simple-entry ;
M: any-char nfa-node ( node -- start end )
[ dotall option? ] dip any-char-no-nl ?
class-transition add-simple-entry ;
! M: negation nfa-node ( node -- start end )
! negate term>> nfa-node negate ;
M: range nfa-node ( node -- start end )
case-insensitive option? [
! This should be implemented for Unicode by case-folding
! the input and all strings in the regexp.
dup [ from>> ] [ to>> ] bi
2dup [ Letter? ] bi@ and [
rot drop
dup cased-range? [
[ from>> ] [ to>> ] bi
[ [ ch>lower ] bi@ <range> ]
[ [ ch>upper ] bi@ <range> ] 2bi
[ class-transition add-simple-entry ] bi@
alternate-nodes
] [
2drop
class-transition add-simple-entry
] if
] [
class-transition add-simple-entry
] if ;
2array <or-class>
] when
] when ;
M: class nfa-node
modify-class class-transition add-simple-entry ;
M: with-options nfa-node ( node -- start end )
dup options>> [ tree>> nfa-node ] using-options ;
: construct-nfa ( ast -- nfa-table )
[
negated? off
0 state set
<transition-table> clone nfa-table set
nfa-node
<transition-table> nfa-table set
remove-lookahead nfa-node
table
swap dup associate >>final-states
swap >>start-state

View File

@ -132,11 +132,15 @@ Parenthized = "?:" Alternation:a => [[ a ]]
=> [[ a on off parse-options <with-options> ]]
| "?#" [^)]* => [[ f ]]
| "?~" Alternation:a => [[ a <negation> ]]
| "?=" Alternation:a => [[ a <lookahead> ]]
| "?!" Alternation:a => [[ a <negation> <lookahead> ]]
| "?<=" Alternation:a => [[ a <lookbehind> ]]
| "?<!" Alternation:a => [[ a <negation> <lookbehind> ]]
| Alternation
Element = "(" Parenthized:p ")" => [[ p ]]
| "[" CharClass:r "]" => [[ r ]]
| ".":d => [[ any-char ]]
| ".":d => [[ any-char <primitive-class> ]]
| Character
Number = (!(","|"}").)* => [[ string>number ensure-number ]]

View File

@ -317,6 +317,22 @@ IN: regexp-tests
! Bug in parsing word
[ t ] [ "a" R' a' matches? ] unit-test
! Testing negation
[ f ] [ "a" R/ (?~a)/ matches? ] unit-test
[ t ] [ "aa" R/ (?~a)/ matches? ] unit-test
[ t ] [ "bb" R/ (?~a)/ matches? ] unit-test
[ t ] [ "" R/ (?~a)/ matches? ] unit-test
[ f ] [ "a" R/ (?~a+|b)/ matches? ] unit-test
[ f ] [ "aa" R/ (?~a+|b)/ matches? ] unit-test
[ t ] [ "bb" R/ (?~a+|b)/ matches? ] unit-test
[ f ] [ "b" R/ (?~a+|b)/ matches? ] unit-test
[ t ] [ "" R/ (?~a+|b)/ matches? ] unit-test
! Intersecting classes
[ t ] [ "ab" R/ ac|\p{Lower}b/ matches? ] unit-test
[ t ] [ "ab" R/ ac|[a-z]b/ matches? ] unit-test
! [ t ] [ "a" R/ ^a/ matches? ] unit-test
! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test