Various unfinshed regexp changes
parent
f535b66aed
commit
e41cdf5e8f
|
@ -18,7 +18,7 @@ SINGLETON: epsilon
|
||||||
TUPLE: concatenation first second ;
|
TUPLE: concatenation first second ;
|
||||||
|
|
||||||
: <concatenation> ( seq -- concatenation )
|
: <concatenation> ( seq -- concatenation )
|
||||||
epsilon [ concatenation boa ] reduce ;
|
[ epsilon ] [ unclip [ concatenation boa ] reduce ] if-empty ;
|
||||||
|
|
||||||
TUPLE: alternation first second ;
|
TUPLE: alternation first second ;
|
||||||
|
|
||||||
|
@ -54,3 +54,9 @@ M: from-to <times>
|
||||||
|
|
||||||
: char-class ( ranges ? -- term )
|
: char-class ( ranges ? -- term )
|
||||||
[ <or-class> ] dip [ <not-class> ] when ;
|
[ <or-class> ] dip [ <not-class> ] when ;
|
||||||
|
|
||||||
|
TUPLE: lookahead term ;
|
||||||
|
C: <lookahead> lookahead
|
||||||
|
|
||||||
|
TUPLE: lookbehind term ;
|
||||||
|
C: <lookbehind> lookbehind
|
||||||
|
|
|
@ -4,28 +4,6 @@ USING: accessors kernel math math.order words
|
||||||
ascii unicode.categories combinators.short-circuit sequences ;
|
ascii unicode.categories combinators.short-circuit sequences ;
|
||||||
IN: regexp.classes
|
IN: regexp.classes
|
||||||
|
|
||||||
: punct? ( ch -- ? )
|
|
||||||
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
|
|
||||||
|
|
||||||
: c-identifier-char? ( ch -- ? )
|
|
||||||
{ [ alpha? ] [ CHAR: _ = ] } 1|| ;
|
|
||||||
|
|
||||||
: java-blank? ( ch -- ? )
|
|
||||||
{
|
|
||||||
CHAR: \s CHAR: \t CHAR: \n
|
|
||||||
HEX: b HEX: 7 CHAR: \r
|
|
||||||
} member? ;
|
|
||||||
|
|
||||||
: java-printable? ( ch -- ? )
|
|
||||||
[ [ alpha? ] [ punct? ] ] 1|| ;
|
|
||||||
|
|
||||||
: hex-digit? ( ch -- ? )
|
|
||||||
{
|
|
||||||
[ CHAR: A CHAR: F between? ]
|
|
||||||
[ CHAR: a CHAR: f between? ]
|
|
||||||
[ CHAR: 0 CHAR: 9 between? ]
|
|
||||||
} 1|| ;
|
|
||||||
|
|
||||||
SINGLETONS: any-char any-char-no-nl
|
SINGLETONS: any-char any-char-no-nl
|
||||||
letter-class LETTER-class Letter-class digit-class
|
letter-class LETTER-class Letter-class digit-class
|
||||||
alpha-class non-newline-blank-class
|
alpha-class non-newline-blank-class
|
||||||
|
@ -70,16 +48,24 @@ M: ascii-class class-member? ( obj class -- ? )
|
||||||
M: digit-class class-member? ( obj class -- ? )
|
M: digit-class class-member? ( obj class -- ? )
|
||||||
drop digit? ;
|
drop digit? ;
|
||||||
|
|
||||||
|
: c-identifier-char? ( ch -- ? )
|
||||||
|
{ [ alpha? ] [ CHAR: _ = ] } 1|| ;
|
||||||
|
|
||||||
M: c-identifier-class class-member? ( obj class -- ? )
|
M: c-identifier-class class-member? ( obj class -- ? )
|
||||||
drop
|
drop c-identifier-char? ;
|
||||||
{ [ digit? ] [ Letter? ] [ CHAR: _ = ] } 1|| ;
|
|
||||||
|
|
||||||
M: alpha-class class-member? ( obj class -- ? )
|
M: alpha-class class-member? ( obj class -- ? )
|
||||||
drop alpha? ;
|
drop alpha? ;
|
||||||
|
|
||||||
|
: punct? ( ch -- ? )
|
||||||
|
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
|
||||||
|
|
||||||
M: punctuation-class class-member? ( obj class -- ? )
|
M: punctuation-class class-member? ( obj class -- ? )
|
||||||
drop punct? ;
|
drop punct? ;
|
||||||
|
|
||||||
|
: java-printable? ( ch -- ? )
|
||||||
|
{ [ alpha? ] [ punct? ] } 1|| ;
|
||||||
|
|
||||||
M: java-printable-class class-member? ( obj class -- ? )
|
M: java-printable-class class-member? ( obj class -- ? )
|
||||||
drop java-printable? ;
|
drop java-printable? ;
|
||||||
|
|
||||||
|
@ -89,9 +75,22 @@ M: non-newline-blank-class class-member? ( obj class -- ? )
|
||||||
M: control-character-class class-member? ( obj class -- ? )
|
M: control-character-class class-member? ( obj class -- ? )
|
||||||
drop control? ;
|
drop control? ;
|
||||||
|
|
||||||
|
: hex-digit? ( ch -- ? )
|
||||||
|
{
|
||||||
|
[ CHAR: A CHAR: F between? ]
|
||||||
|
[ CHAR: a CHAR: f between? ]
|
||||||
|
[ CHAR: 0 CHAR: 9 between? ]
|
||||||
|
} 1|| ;
|
||||||
|
|
||||||
M: hex-digit-class class-member? ( obj class -- ? )
|
M: hex-digit-class class-member? ( obj class -- ? )
|
||||||
drop hex-digit? ;
|
drop hex-digit? ;
|
||||||
|
|
||||||
|
: java-blank? ( ch -- ? )
|
||||||
|
{
|
||||||
|
CHAR: \s CHAR: \t CHAR: \n
|
||||||
|
HEX: b HEX: 7 CHAR: \r
|
||||||
|
} member? ;
|
||||||
|
|
||||||
M: java-blank-class class-member? ( obj class -- ? )
|
M: java-blank-class class-member? ( obj class -- ? )
|
||||||
drop java-blank? ;
|
drop java-blank? ;
|
||||||
|
|
||||||
|
@ -99,13 +98,7 @@ M: unmatchable-class class-member? ( obj class -- ? )
|
||||||
2drop f ;
|
2drop f ;
|
||||||
|
|
||||||
M: terminator-class class-member? ( obj class -- ? )
|
M: terminator-class class-member? ( obj class -- ? )
|
||||||
drop {
|
drop "\r\n\u000085\u002029\u002028" member? ;
|
||||||
[ CHAR: \r = ]
|
|
||||||
[ CHAR: \n = ]
|
|
||||||
[ CHAR: \u000085 = ]
|
|
||||||
[ CHAR: \u002028 = ]
|
|
||||||
[ CHAR: \u002029 = ]
|
|
||||||
} 1|| ;
|
|
||||||
|
|
||||||
M: beginning-of-line class-member? ( obj class -- ? )
|
M: beginning-of-line class-member? ( obj class -- ? )
|
||||||
2drop f ;
|
2drop f ;
|
||||||
|
@ -119,6 +112,9 @@ C: <or-class> or-class
|
||||||
TUPLE: not-class class ;
|
TUPLE: not-class class ;
|
||||||
C: <not-class> not-class
|
C: <not-class> not-class
|
||||||
|
|
||||||
|
: <and-class> ( classes -- class )
|
||||||
|
[ <not-class> ] map <or-class> <not-class> ;
|
||||||
|
|
||||||
TUPLE: primitive-class class ;
|
TUPLE: primitive-class class ;
|
||||||
C: <primitive-class> primitive-class
|
C: <primitive-class> primitive-class
|
||||||
|
|
||||||
|
@ -130,3 +126,5 @@ M: not-class class-member?
|
||||||
|
|
||||||
M: primitive-class class-member?
|
M: primitive-class class-member?
|
||||||
class>> class-member? ;
|
class>> class-member? ;
|
||||||
|
|
||||||
|
UNION: class primitive-class not-class or-class range ;
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: accessors arrays assocs combinators fry kernel locals
|
USING: accessors arrays assocs combinators fry kernel locals
|
||||||
math math.order regexp.nfa regexp.transition-tables sequences
|
math math.order regexp.nfa regexp.transition-tables sequences
|
||||||
sets sorting vectors sequences.deep ;
|
sets sorting vectors sequences.deep math.functions regexp.classes ;
|
||||||
USING: io prettyprint threads ;
|
USING: io prettyprint threads ;
|
||||||
IN: regexp.dfa
|
IN: regexp.dfa
|
||||||
|
|
||||||
|
@ -17,6 +17,34 @@ IN: regexp.dfa
|
||||||
: while-changes ( obj quot pred -- obj' )
|
: while-changes ( obj quot pred -- obj' )
|
||||||
3dup nip call (while-changes) ; inline
|
3dup nip call (while-changes) ; inline
|
||||||
|
|
||||||
|
TUPLE: parts in out ;
|
||||||
|
|
||||||
|
: make-partition ( choices classes -- partition )
|
||||||
|
zip [ first ] partition parts boa ;
|
||||||
|
|
||||||
|
: powerset-partition ( classes -- partitions )
|
||||||
|
! Here is where class algebra will happen, when I implement it
|
||||||
|
[ length [ 2^ ] keep ] keep '[
|
||||||
|
_ [ ] map-bits _ make-partition
|
||||||
|
] map ;
|
||||||
|
|
||||||
|
: partition>class ( parts -- class )
|
||||||
|
[ in>> ] [ out>> ] bi
|
||||||
|
[ <or-class> ] bi@ <not-class> 2array <and-class> ;
|
||||||
|
|
||||||
|
: get-transitions ( partition state-transitions -- next-states )
|
||||||
|
[ in>> ] dip '[ at ] gather ;
|
||||||
|
|
||||||
|
: disambiguate-overlap ( nfa -- nfa' )
|
||||||
|
[
|
||||||
|
[
|
||||||
|
[ keys powerset-partition ] keep '[
|
||||||
|
[ partition>class ]
|
||||||
|
[ _ get-transitions ] bi
|
||||||
|
] H{ } map>assoc
|
||||||
|
] assoc-map
|
||||||
|
] change-transitions ;
|
||||||
|
|
||||||
: find-delta ( states transition nfa -- new-states )
|
: find-delta ( states transition nfa -- new-states )
|
||||||
transitions>> '[ _ swap _ at at ] gather sift ;
|
transitions>> '[ _ swap _ at at ] gather sift ;
|
||||||
|
|
||||||
|
@ -72,6 +100,7 @@ IN: regexp.dfa
|
||||||
swap find-start-state >>start-state ;
|
swap find-start-state >>start-state ;
|
||||||
|
|
||||||
: construct-dfa ( nfa -- dfa )
|
: construct-dfa ( nfa -- dfa )
|
||||||
|
disambiguate-overlap
|
||||||
dup initialize-dfa
|
dup initialize-dfa
|
||||||
dup start-state>> 1vector
|
dup start-state>> 1vector
|
||||||
H{ } clone
|
H{ } clone
|
||||||
|
|
|
@ -3,17 +3,26 @@
|
||||||
USING: accessors arrays assocs grouping kernel
|
USING: accessors arrays assocs grouping kernel
|
||||||
locals math namespaces sequences fry quotations
|
locals math namespaces sequences fry quotations
|
||||||
math.order math.ranges vectors unicode.categories
|
math.order math.ranges vectors unicode.categories
|
||||||
regexp.transition-tables words sets hashtables
|
regexp.transition-tables words sets hashtables combinators.short-circuit
|
||||||
unicode.case.private regexp.ast regexp.classes ;
|
unicode.case.private regexp.ast regexp.classes ;
|
||||||
|
IN: regexp.nfa
|
||||||
|
|
||||||
! This uses unicode.case.private for ch>upper and ch>lower
|
! This uses unicode.case.private for ch>upper and ch>lower
|
||||||
! but case-insensitive matching should be done by case-folding everything
|
! but case-insensitive matching should be done by case-folding everything
|
||||||
! before processing starts
|
! before processing starts
|
||||||
IN: regexp.nfa
|
|
||||||
|
|
||||||
SYMBOL: negated?
|
GENERIC: remove-lookahead ( syntax-tree -- syntax-tree' )
|
||||||
|
! This is unfinished and does nothing right now!
|
||||||
|
|
||||||
: negate ( -- )
|
M: object remove-lookahead ;
|
||||||
negated? [ not ] change ;
|
|
||||||
|
M: with-options remove-lookahead
|
||||||
|
[ tree>> remove-lookahead ] [ options>> ] bi <with-options> ;
|
||||||
|
|
||||||
|
M: alternation remove-lookahead
|
||||||
|
[ first>> ] [ second>> ] bi [ remove-lookahead ] bi@ ;
|
||||||
|
|
||||||
|
M: concatenation remove-lookahead ;
|
||||||
|
|
||||||
SINGLETON: eps
|
SINGLETON: eps
|
||||||
|
|
||||||
|
@ -45,16 +54,9 @@ SYMBOL: nfa-table
|
||||||
|
|
||||||
GENERIC: nfa-node ( node -- start-state end-state )
|
GENERIC: nfa-node ( node -- start-state end-state )
|
||||||
|
|
||||||
:: add-simple-entry ( obj class -- start-state end-state )
|
: add-simple-entry ( obj class -- start-state end-state )
|
||||||
next-state :> s0
|
[ next-state next-state 2dup ] 2dip
|
||||||
next-state :> s1
|
make-transition table add-transition ;
|
||||||
negated? get [
|
|
||||||
s0 f obj class make-transition table add-transition
|
|
||||||
s0 s1 <default-transition> table add-transition
|
|
||||||
] [
|
|
||||||
s0 s1 obj class make-transition table add-transition
|
|
||||||
] if
|
|
||||||
s0 s1 ;
|
|
||||||
|
|
||||||
: epsilon-transition ( source target -- )
|
: epsilon-transition ( source target -- )
|
||||||
eps <literal-transition> table add-transition ;
|
eps <literal-transition> table add-transition ;
|
||||||
|
@ -92,62 +94,66 @@ M: alternation nfa-node ( node -- start end )
|
||||||
[ nfa-node ] bi@
|
[ nfa-node ] bi@
|
||||||
alternate-nodes ;
|
alternate-nodes ;
|
||||||
|
|
||||||
|
GENERIC: modify-class ( char-class -- char-class' )
|
||||||
|
|
||||||
|
M: object modify-class ;
|
||||||
|
|
||||||
|
M: integer modify-class
|
||||||
|
case-insensitive option? [
|
||||||
|
dup Letter? [
|
||||||
|
[ ch>lower ] [ ch>upper ] bi 2array <or-class>
|
||||||
|
] when
|
||||||
|
] when ;
|
||||||
|
|
||||||
M: integer nfa-node ( node -- start end )
|
M: integer nfa-node ( node -- start end )
|
||||||
|
modify-class dup class?
|
||||||
|
class-transition literal-transition ?
|
||||||
|
add-simple-entry ;
|
||||||
|
|
||||||
|
M: primitive-class modify-class
|
||||||
|
class>> modify-class <primitive-class> ;
|
||||||
|
|
||||||
|
M: or-class modify-class
|
||||||
|
seq>> [ modify-class ] map <or-class> ;
|
||||||
|
|
||||||
|
M: not-class modify-class
|
||||||
|
class>> modify-class <not-class> ;
|
||||||
|
|
||||||
|
M: any-char modify-class
|
||||||
|
[ dotall option? ] dip any-char-no-nl ? ;
|
||||||
|
|
||||||
|
: modify-letter-class ( class -- newclass )
|
||||||
|
case-insensitive option? [ drop Letter-class ] when ;
|
||||||
|
M: letter-class modify-class modify-letter-class ;
|
||||||
|
M: LETTER-class modify-class modify-letter-class ;
|
||||||
|
|
||||||
|
: cased-range? ( range -- ? )
|
||||||
|
[ from>> ] [ to>> ] bi {
|
||||||
|
[ [ letter? ] bi@ and ]
|
||||||
|
[ [ LETTER? ] bi@ and ]
|
||||||
|
} 2|| ;
|
||||||
|
|
||||||
|
M: range modify-class
|
||||||
case-insensitive option? [
|
case-insensitive option? [
|
||||||
dup [ ch>lower ] [ ch>upper ] bi
|
dup cased-range? [
|
||||||
2dup = [
|
[ from>> ] [ to>> ] bi
|
||||||
2drop
|
|
||||||
literal-transition add-simple-entry
|
|
||||||
] [
|
|
||||||
[ literal-transition add-simple-entry ] bi@
|
|
||||||
alternate-nodes [ nip ] dip
|
|
||||||
] if
|
|
||||||
] [ literal-transition add-simple-entry ] if ;
|
|
||||||
|
|
||||||
M: primitive-class nfa-node ( node -- start end )
|
|
||||||
class>> dup
|
|
||||||
{ letter-class LETTER-class } member? case-insensitive option? and
|
|
||||||
[ drop Letter-class ] when
|
|
||||||
class-transition add-simple-entry ;
|
|
||||||
|
|
||||||
M: or-class nfa-node class-transition add-simple-entry ;
|
|
||||||
M: not-class nfa-node class-transition add-simple-entry ;
|
|
||||||
|
|
||||||
M: any-char nfa-node ( node -- start end )
|
|
||||||
[ dotall option? ] dip any-char-no-nl ?
|
|
||||||
class-transition add-simple-entry ;
|
|
||||||
|
|
||||||
! M: negation nfa-node ( node -- start end )
|
|
||||||
! negate term>> nfa-node negate ;
|
|
||||||
|
|
||||||
M: range nfa-node ( node -- start end )
|
|
||||||
case-insensitive option? [
|
|
||||||
! This should be implemented for Unicode by case-folding
|
|
||||||
! the input and all strings in the regexp.
|
|
||||||
dup [ from>> ] [ to>> ] bi
|
|
||||||
2dup [ Letter? ] bi@ and [
|
|
||||||
rot drop
|
|
||||||
[ [ ch>lower ] bi@ <range> ]
|
[ [ ch>lower ] bi@ <range> ]
|
||||||
[ [ ch>upper ] bi@ <range> ] 2bi
|
[ [ ch>upper ] bi@ <range> ] 2bi
|
||||||
[ class-transition add-simple-entry ] bi@
|
2array <or-class>
|
||||||
alternate-nodes
|
] when
|
||||||
] [
|
] when ;
|
||||||
2drop
|
|
||||||
class-transition add-simple-entry
|
M: class nfa-node
|
||||||
] if
|
modify-class class-transition add-simple-entry ;
|
||||||
] [
|
|
||||||
class-transition add-simple-entry
|
|
||||||
] if ;
|
|
||||||
|
|
||||||
M: with-options nfa-node ( node -- start end )
|
M: with-options nfa-node ( node -- start end )
|
||||||
dup options>> [ tree>> nfa-node ] using-options ;
|
dup options>> [ tree>> nfa-node ] using-options ;
|
||||||
|
|
||||||
: construct-nfa ( ast -- nfa-table )
|
: construct-nfa ( ast -- nfa-table )
|
||||||
[
|
[
|
||||||
negated? off
|
|
||||||
0 state set
|
0 state set
|
||||||
<transition-table> clone nfa-table set
|
<transition-table> nfa-table set
|
||||||
nfa-node
|
remove-lookahead nfa-node
|
||||||
table
|
table
|
||||||
swap dup associate >>final-states
|
swap dup associate >>final-states
|
||||||
swap >>start-state
|
swap >>start-state
|
||||||
|
|
|
@ -132,11 +132,15 @@ Parenthized = "?:" Alternation:a => [[ a ]]
|
||||||
=> [[ a on off parse-options <with-options> ]]
|
=> [[ a on off parse-options <with-options> ]]
|
||||||
| "?#" [^)]* => [[ f ]]
|
| "?#" [^)]* => [[ f ]]
|
||||||
| "?~" Alternation:a => [[ a <negation> ]]
|
| "?~" Alternation:a => [[ a <negation> ]]
|
||||||
|
| "?=" Alternation:a => [[ a <lookahead> ]]
|
||||||
|
| "?!" Alternation:a => [[ a <negation> <lookahead> ]]
|
||||||
|
| "?<=" Alternation:a => [[ a <lookbehind> ]]
|
||||||
|
| "?<!" Alternation:a => [[ a <negation> <lookbehind> ]]
|
||||||
| Alternation
|
| Alternation
|
||||||
|
|
||||||
Element = "(" Parenthized:p ")" => [[ p ]]
|
Element = "(" Parenthized:p ")" => [[ p ]]
|
||||||
| "[" CharClass:r "]" => [[ r ]]
|
| "[" CharClass:r "]" => [[ r ]]
|
||||||
| ".":d => [[ any-char ]]
|
| ".":d => [[ any-char <primitive-class> ]]
|
||||||
| Character
|
| Character
|
||||||
|
|
||||||
Number = (!(","|"}").)* => [[ string>number ensure-number ]]
|
Number = (!(","|"}").)* => [[ string>number ensure-number ]]
|
||||||
|
|
|
@ -317,6 +317,22 @@ IN: regexp-tests
|
||||||
! Bug in parsing word
|
! Bug in parsing word
|
||||||
[ t ] [ "a" R' a' matches? ] unit-test
|
[ t ] [ "a" R' a' matches? ] unit-test
|
||||||
|
|
||||||
|
! Testing negation
|
||||||
|
[ f ] [ "a" R/ (?~a)/ matches? ] unit-test
|
||||||
|
[ t ] [ "aa" R/ (?~a)/ matches? ] unit-test
|
||||||
|
[ t ] [ "bb" R/ (?~a)/ matches? ] unit-test
|
||||||
|
[ t ] [ "" R/ (?~a)/ matches? ] unit-test
|
||||||
|
|
||||||
|
[ f ] [ "a" R/ (?~a+|b)/ matches? ] unit-test
|
||||||
|
[ f ] [ "aa" R/ (?~a+|b)/ matches? ] unit-test
|
||||||
|
[ t ] [ "bb" R/ (?~a+|b)/ matches? ] unit-test
|
||||||
|
[ f ] [ "b" R/ (?~a+|b)/ matches? ] unit-test
|
||||||
|
[ t ] [ "" R/ (?~a+|b)/ matches? ] unit-test
|
||||||
|
|
||||||
|
! Intersecting classes
|
||||||
|
[ t ] [ "ab" R/ ac|\p{Lower}b/ matches? ] unit-test
|
||||||
|
[ t ] [ "ab" R/ ac|[a-z]b/ matches? ] unit-test
|
||||||
|
|
||||||
! [ t ] [ "a" R/ ^a/ matches? ] unit-test
|
! [ t ] [ "a" R/ ^a/ matches? ] unit-test
|
||||||
! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
|
! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
|
||||||
! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
|
! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
|
||||||
|
|
Loading…
Reference in New Issue