Various unfinshed regexp changes
parent
f535b66aed
commit
e41cdf5e8f
|
@ -18,7 +18,7 @@ SINGLETON: epsilon
|
|||
TUPLE: concatenation first second ;
|
||||
|
||||
: <concatenation> ( seq -- concatenation )
|
||||
epsilon [ concatenation boa ] reduce ;
|
||||
[ epsilon ] [ unclip [ concatenation boa ] reduce ] if-empty ;
|
||||
|
||||
TUPLE: alternation first second ;
|
||||
|
||||
|
@ -54,3 +54,9 @@ M: from-to <times>
|
|||
|
||||
: char-class ( ranges ? -- term )
|
||||
[ <or-class> ] dip [ <not-class> ] when ;
|
||||
|
||||
TUPLE: lookahead term ;
|
||||
C: <lookahead> lookahead
|
||||
|
||||
TUPLE: lookbehind term ;
|
||||
C: <lookbehind> lookbehind
|
||||
|
|
|
@ -4,28 +4,6 @@ USING: accessors kernel math math.order words
|
|||
ascii unicode.categories combinators.short-circuit sequences ;
|
||||
IN: regexp.classes
|
||||
|
||||
: punct? ( ch -- ? )
|
||||
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
|
||||
|
||||
: c-identifier-char? ( ch -- ? )
|
||||
{ [ alpha? ] [ CHAR: _ = ] } 1|| ;
|
||||
|
||||
: java-blank? ( ch -- ? )
|
||||
{
|
||||
CHAR: \s CHAR: \t CHAR: \n
|
||||
HEX: b HEX: 7 CHAR: \r
|
||||
} member? ;
|
||||
|
||||
: java-printable? ( ch -- ? )
|
||||
[ [ alpha? ] [ punct? ] ] 1|| ;
|
||||
|
||||
: hex-digit? ( ch -- ? )
|
||||
{
|
||||
[ CHAR: A CHAR: F between? ]
|
||||
[ CHAR: a CHAR: f between? ]
|
||||
[ CHAR: 0 CHAR: 9 between? ]
|
||||
} 1|| ;
|
||||
|
||||
SINGLETONS: any-char any-char-no-nl
|
||||
letter-class LETTER-class Letter-class digit-class
|
||||
alpha-class non-newline-blank-class
|
||||
|
@ -70,16 +48,24 @@ M: ascii-class class-member? ( obj class -- ? )
|
|||
M: digit-class class-member? ( obj class -- ? )
|
||||
drop digit? ;
|
||||
|
||||
: c-identifier-char? ( ch -- ? )
|
||||
{ [ alpha? ] [ CHAR: _ = ] } 1|| ;
|
||||
|
||||
M: c-identifier-class class-member? ( obj class -- ? )
|
||||
drop
|
||||
{ [ digit? ] [ Letter? ] [ CHAR: _ = ] } 1|| ;
|
||||
drop c-identifier-char? ;
|
||||
|
||||
M: alpha-class class-member? ( obj class -- ? )
|
||||
drop alpha? ;
|
||||
|
||||
: punct? ( ch -- ? )
|
||||
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
|
||||
|
||||
M: punctuation-class class-member? ( obj class -- ? )
|
||||
drop punct? ;
|
||||
|
||||
: java-printable? ( ch -- ? )
|
||||
{ [ alpha? ] [ punct? ] } 1|| ;
|
||||
|
||||
M: java-printable-class class-member? ( obj class -- ? )
|
||||
drop java-printable? ;
|
||||
|
||||
|
@ -89,9 +75,22 @@ M: non-newline-blank-class class-member? ( obj class -- ? )
|
|||
M: control-character-class class-member? ( obj class -- ? )
|
||||
drop control? ;
|
||||
|
||||
: hex-digit? ( ch -- ? )
|
||||
{
|
||||
[ CHAR: A CHAR: F between? ]
|
||||
[ CHAR: a CHAR: f between? ]
|
||||
[ CHAR: 0 CHAR: 9 between? ]
|
||||
} 1|| ;
|
||||
|
||||
M: hex-digit-class class-member? ( obj class -- ? )
|
||||
drop hex-digit? ;
|
||||
|
||||
: java-blank? ( ch -- ? )
|
||||
{
|
||||
CHAR: \s CHAR: \t CHAR: \n
|
||||
HEX: b HEX: 7 CHAR: \r
|
||||
} member? ;
|
||||
|
||||
M: java-blank-class class-member? ( obj class -- ? )
|
||||
drop java-blank? ;
|
||||
|
||||
|
@ -99,13 +98,7 @@ M: unmatchable-class class-member? ( obj class -- ? )
|
|||
2drop f ;
|
||||
|
||||
M: terminator-class class-member? ( obj class -- ? )
|
||||
drop {
|
||||
[ CHAR: \r = ]
|
||||
[ CHAR: \n = ]
|
||||
[ CHAR: \u000085 = ]
|
||||
[ CHAR: \u002028 = ]
|
||||
[ CHAR: \u002029 = ]
|
||||
} 1|| ;
|
||||
drop "\r\n\u000085\u002029\u002028" member? ;
|
||||
|
||||
M: beginning-of-line class-member? ( obj class -- ? )
|
||||
2drop f ;
|
||||
|
@ -119,6 +112,9 @@ C: <or-class> or-class
|
|||
TUPLE: not-class class ;
|
||||
C: <not-class> not-class
|
||||
|
||||
: <and-class> ( classes -- class )
|
||||
[ <not-class> ] map <or-class> <not-class> ;
|
||||
|
||||
TUPLE: primitive-class class ;
|
||||
C: <primitive-class> primitive-class
|
||||
|
||||
|
@ -130,3 +126,5 @@ M: not-class class-member?
|
|||
|
||||
M: primitive-class class-member?
|
||||
class>> class-member? ;
|
||||
|
||||
UNION: class primitive-class not-class or-class range ;
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: accessors arrays assocs combinators fry kernel locals
|
||||
math math.order regexp.nfa regexp.transition-tables sequences
|
||||
sets sorting vectors sequences.deep ;
|
||||
sets sorting vectors sequences.deep math.functions regexp.classes ;
|
||||
USING: io prettyprint threads ;
|
||||
IN: regexp.dfa
|
||||
|
||||
|
@ -17,6 +17,34 @@ IN: regexp.dfa
|
|||
: while-changes ( obj quot pred -- obj' )
|
||||
3dup nip call (while-changes) ; inline
|
||||
|
||||
TUPLE: parts in out ;
|
||||
|
||||
: make-partition ( choices classes -- partition )
|
||||
zip [ first ] partition parts boa ;
|
||||
|
||||
: powerset-partition ( classes -- partitions )
|
||||
! Here is where class algebra will happen, when I implement it
|
||||
[ length [ 2^ ] keep ] keep '[
|
||||
_ [ ] map-bits _ make-partition
|
||||
] map ;
|
||||
|
||||
: partition>class ( parts -- class )
|
||||
[ in>> ] [ out>> ] bi
|
||||
[ <or-class> ] bi@ <not-class> 2array <and-class> ;
|
||||
|
||||
: get-transitions ( partition state-transitions -- next-states )
|
||||
[ in>> ] dip '[ at ] gather ;
|
||||
|
||||
: disambiguate-overlap ( nfa -- nfa' )
|
||||
[
|
||||
[
|
||||
[ keys powerset-partition ] keep '[
|
||||
[ partition>class ]
|
||||
[ _ get-transitions ] bi
|
||||
] H{ } map>assoc
|
||||
] assoc-map
|
||||
] change-transitions ;
|
||||
|
||||
: find-delta ( states transition nfa -- new-states )
|
||||
transitions>> '[ _ swap _ at at ] gather sift ;
|
||||
|
||||
|
@ -72,6 +100,7 @@ IN: regexp.dfa
|
|||
swap find-start-state >>start-state ;
|
||||
|
||||
: construct-dfa ( nfa -- dfa )
|
||||
disambiguate-overlap
|
||||
dup initialize-dfa
|
||||
dup start-state>> 1vector
|
||||
H{ } clone
|
||||
|
|
|
@ -3,17 +3,26 @@
|
|||
USING: accessors arrays assocs grouping kernel
|
||||
locals math namespaces sequences fry quotations
|
||||
math.order math.ranges vectors unicode.categories
|
||||
regexp.transition-tables words sets hashtables
|
||||
regexp.transition-tables words sets hashtables combinators.short-circuit
|
||||
unicode.case.private regexp.ast regexp.classes ;
|
||||
IN: regexp.nfa
|
||||
|
||||
! This uses unicode.case.private for ch>upper and ch>lower
|
||||
! but case-insensitive matching should be done by case-folding everything
|
||||
! before processing starts
|
||||
IN: regexp.nfa
|
||||
|
||||
SYMBOL: negated?
|
||||
GENERIC: remove-lookahead ( syntax-tree -- syntax-tree' )
|
||||
! This is unfinished and does nothing right now!
|
||||
|
||||
: negate ( -- )
|
||||
negated? [ not ] change ;
|
||||
M: object remove-lookahead ;
|
||||
|
||||
M: with-options remove-lookahead
|
||||
[ tree>> remove-lookahead ] [ options>> ] bi <with-options> ;
|
||||
|
||||
M: alternation remove-lookahead
|
||||
[ first>> ] [ second>> ] bi [ remove-lookahead ] bi@ ;
|
||||
|
||||
M: concatenation remove-lookahead ;
|
||||
|
||||
SINGLETON: eps
|
||||
|
||||
|
@ -45,16 +54,9 @@ SYMBOL: nfa-table
|
|||
|
||||
GENERIC: nfa-node ( node -- start-state end-state )
|
||||
|
||||
:: add-simple-entry ( obj class -- start-state end-state )
|
||||
next-state :> s0
|
||||
next-state :> s1
|
||||
negated? get [
|
||||
s0 f obj class make-transition table add-transition
|
||||
s0 s1 <default-transition> table add-transition
|
||||
] [
|
||||
s0 s1 obj class make-transition table add-transition
|
||||
] if
|
||||
s0 s1 ;
|
||||
: add-simple-entry ( obj class -- start-state end-state )
|
||||
[ next-state next-state 2dup ] 2dip
|
||||
make-transition table add-transition ;
|
||||
|
||||
: epsilon-transition ( source target -- )
|
||||
eps <literal-transition> table add-transition ;
|
||||
|
@ -92,62 +94,66 @@ M: alternation nfa-node ( node -- start end )
|
|||
[ nfa-node ] bi@
|
||||
alternate-nodes ;
|
||||
|
||||
GENERIC: modify-class ( char-class -- char-class' )
|
||||
|
||||
M: object modify-class ;
|
||||
|
||||
M: integer modify-class
|
||||
case-insensitive option? [
|
||||
dup Letter? [
|
||||
[ ch>lower ] [ ch>upper ] bi 2array <or-class>
|
||||
] when
|
||||
] when ;
|
||||
|
||||
M: integer nfa-node ( node -- start end )
|
||||
modify-class dup class?
|
||||
class-transition literal-transition ?
|
||||
add-simple-entry ;
|
||||
|
||||
M: primitive-class modify-class
|
||||
class>> modify-class <primitive-class> ;
|
||||
|
||||
M: or-class modify-class
|
||||
seq>> [ modify-class ] map <or-class> ;
|
||||
|
||||
M: not-class modify-class
|
||||
class>> modify-class <not-class> ;
|
||||
|
||||
M: any-char modify-class
|
||||
[ dotall option? ] dip any-char-no-nl ? ;
|
||||
|
||||
: modify-letter-class ( class -- newclass )
|
||||
case-insensitive option? [ drop Letter-class ] when ;
|
||||
M: letter-class modify-class modify-letter-class ;
|
||||
M: LETTER-class modify-class modify-letter-class ;
|
||||
|
||||
: cased-range? ( range -- ? )
|
||||
[ from>> ] [ to>> ] bi {
|
||||
[ [ letter? ] bi@ and ]
|
||||
[ [ LETTER? ] bi@ and ]
|
||||
} 2|| ;
|
||||
|
||||
M: range modify-class
|
||||
case-insensitive option? [
|
||||
dup [ ch>lower ] [ ch>upper ] bi
|
||||
2dup = [
|
||||
2drop
|
||||
literal-transition add-simple-entry
|
||||
] [
|
||||
[ literal-transition add-simple-entry ] bi@
|
||||
alternate-nodes [ nip ] dip
|
||||
] if
|
||||
] [ literal-transition add-simple-entry ] if ;
|
||||
|
||||
M: primitive-class nfa-node ( node -- start end )
|
||||
class>> dup
|
||||
{ letter-class LETTER-class } member? case-insensitive option? and
|
||||
[ drop Letter-class ] when
|
||||
class-transition add-simple-entry ;
|
||||
|
||||
M: or-class nfa-node class-transition add-simple-entry ;
|
||||
M: not-class nfa-node class-transition add-simple-entry ;
|
||||
|
||||
M: any-char nfa-node ( node -- start end )
|
||||
[ dotall option? ] dip any-char-no-nl ?
|
||||
class-transition add-simple-entry ;
|
||||
|
||||
! M: negation nfa-node ( node -- start end )
|
||||
! negate term>> nfa-node negate ;
|
||||
|
||||
M: range nfa-node ( node -- start end )
|
||||
case-insensitive option? [
|
||||
! This should be implemented for Unicode by case-folding
|
||||
! the input and all strings in the regexp.
|
||||
dup [ from>> ] [ to>> ] bi
|
||||
2dup [ Letter? ] bi@ and [
|
||||
rot drop
|
||||
dup cased-range? [
|
||||
[ from>> ] [ to>> ] bi
|
||||
[ [ ch>lower ] bi@ <range> ]
|
||||
[ [ ch>upper ] bi@ <range> ] 2bi
|
||||
[ class-transition add-simple-entry ] bi@
|
||||
alternate-nodes
|
||||
] [
|
||||
2drop
|
||||
class-transition add-simple-entry
|
||||
] if
|
||||
] [
|
||||
class-transition add-simple-entry
|
||||
] if ;
|
||||
2array <or-class>
|
||||
] when
|
||||
] when ;
|
||||
|
||||
M: class nfa-node
|
||||
modify-class class-transition add-simple-entry ;
|
||||
|
||||
M: with-options nfa-node ( node -- start end )
|
||||
dup options>> [ tree>> nfa-node ] using-options ;
|
||||
|
||||
: construct-nfa ( ast -- nfa-table )
|
||||
[
|
||||
negated? off
|
||||
0 state set
|
||||
<transition-table> clone nfa-table set
|
||||
nfa-node
|
||||
<transition-table> nfa-table set
|
||||
remove-lookahead nfa-node
|
||||
table
|
||||
swap dup associate >>final-states
|
||||
swap >>start-state
|
||||
|
|
|
@ -132,11 +132,15 @@ Parenthized = "?:" Alternation:a => [[ a ]]
|
|||
=> [[ a on off parse-options <with-options> ]]
|
||||
| "?#" [^)]* => [[ f ]]
|
||||
| "?~" Alternation:a => [[ a <negation> ]]
|
||||
| "?=" Alternation:a => [[ a <lookahead> ]]
|
||||
| "?!" Alternation:a => [[ a <negation> <lookahead> ]]
|
||||
| "?<=" Alternation:a => [[ a <lookbehind> ]]
|
||||
| "?<!" Alternation:a => [[ a <negation> <lookbehind> ]]
|
||||
| Alternation
|
||||
|
||||
Element = "(" Parenthized:p ")" => [[ p ]]
|
||||
| "[" CharClass:r "]" => [[ r ]]
|
||||
| ".":d => [[ any-char ]]
|
||||
| ".":d => [[ any-char <primitive-class> ]]
|
||||
| Character
|
||||
|
||||
Number = (!(","|"}").)* => [[ string>number ensure-number ]]
|
||||
|
|
|
@ -317,6 +317,22 @@ IN: regexp-tests
|
|||
! Bug in parsing word
|
||||
[ t ] [ "a" R' a' matches? ] unit-test
|
||||
|
||||
! Testing negation
|
||||
[ f ] [ "a" R/ (?~a)/ matches? ] unit-test
|
||||
[ t ] [ "aa" R/ (?~a)/ matches? ] unit-test
|
||||
[ t ] [ "bb" R/ (?~a)/ matches? ] unit-test
|
||||
[ t ] [ "" R/ (?~a)/ matches? ] unit-test
|
||||
|
||||
[ f ] [ "a" R/ (?~a+|b)/ matches? ] unit-test
|
||||
[ f ] [ "aa" R/ (?~a+|b)/ matches? ] unit-test
|
||||
[ t ] [ "bb" R/ (?~a+|b)/ matches? ] unit-test
|
||||
[ f ] [ "b" R/ (?~a+|b)/ matches? ] unit-test
|
||||
[ t ] [ "" R/ (?~a+|b)/ matches? ] unit-test
|
||||
|
||||
! Intersecting classes
|
||||
[ t ] [ "ab" R/ ac|\p{Lower}b/ matches? ] unit-test
|
||||
[ t ] [ "ab" R/ ac|[a-z]b/ matches? ] unit-test
|
||||
|
||||
! [ t ] [ "a" R/ ^a/ matches? ] unit-test
|
||||
! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
|
||||
! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
|
||||
|
|
Loading…
Reference in New Issue