Almost done with regexp cleanup
parent
242cfb5c19
commit
b8845cb87e
|
@ -0,0 +1,53 @@
|
||||||
|
! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: kernel arrays accessors fry sequences ;
|
||||||
|
FROM: math.ranges => [a,b] ;
|
||||||
|
IN: regexp.ast
|
||||||
|
|
||||||
|
TUPLE: primitive-class class ;
|
||||||
|
C: <primitive-class> primitive-class
|
||||||
|
|
||||||
|
TUPLE: negation term ;
|
||||||
|
C: <negation> negation
|
||||||
|
|
||||||
|
TUPLE: from-to n m ;
|
||||||
|
C: <from-to> from-to
|
||||||
|
|
||||||
|
TUPLE: at-least n ;
|
||||||
|
C: <at-least> at-least
|
||||||
|
|
||||||
|
TUPLE: concatenation seq ;
|
||||||
|
C: <concatenation> concatenation
|
||||||
|
|
||||||
|
TUPLE: alternation seq ;
|
||||||
|
C: <alternation> alternation
|
||||||
|
|
||||||
|
TUPLE: star term ;
|
||||||
|
C: <star> star
|
||||||
|
|
||||||
|
TUPLE: with-options tree options ;
|
||||||
|
C: <with-options> with-options
|
||||||
|
|
||||||
|
TUPLE: options on off ;
|
||||||
|
C: <options> options
|
||||||
|
|
||||||
|
SINGLETONS: unix-lines dotall multiline comments case-insensitive
|
||||||
|
unicode-case reversed-regexp ;
|
||||||
|
|
||||||
|
: <maybe> ( term -- term' )
|
||||||
|
f <concatenation> 2array <alternation> ;
|
||||||
|
|
||||||
|
: <plus> ( term -- term' )
|
||||||
|
dup <star> 2array <concatenation> ;
|
||||||
|
|
||||||
|
: repetition ( n term -- term' )
|
||||||
|
<array> <concatenation> ;
|
||||||
|
|
||||||
|
GENERIC: <times> ( term times -- term' )
|
||||||
|
M: at-least <times>
|
||||||
|
n>> swap [ repetition ] [ <star> ] bi 2array <concatenation> ;
|
||||||
|
M: from-to <times>
|
||||||
|
[ n>> ] [ m>> ] bi [a,b] swap '[ _ repetition ] map <alternation> ;
|
||||||
|
|
||||||
|
: char-class ( ranges ? -- term )
|
||||||
|
[ <alternation> ] dip [ <negation> ] when ;
|
|
@ -1,27 +0,0 @@
|
||||||
! Copyright (C) 2008 Doug Coleman.
|
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
|
||||||
USING: accessors hashtables kernel math vectors ;
|
|
||||||
IN: regexp.backend
|
|
||||||
|
|
||||||
TUPLE: regexp
|
|
||||||
raw
|
|
||||||
{ options hashtable }
|
|
||||||
stack
|
|
||||||
parse-tree
|
|
||||||
nfa-table
|
|
||||||
dfa-table
|
|
||||||
minimized-table
|
|
||||||
matchers
|
|
||||||
{ nfa-traversal-flags hashtable }
|
|
||||||
{ dfa-traversal-flags hashtable }
|
|
||||||
{ state integer }
|
|
||||||
{ new-states vector }
|
|
||||||
{ visited-states hashtable } ;
|
|
||||||
|
|
||||||
: reset-regexp ( regexp -- regexp )
|
|
||||||
0 >>state
|
|
||||||
V{ } clone >>stack
|
|
||||||
V{ } clone >>new-states
|
|
||||||
H{ } clone >>visited-states ;
|
|
||||||
|
|
||||||
SYMBOL: current-regexp
|
|
|
@ -1,9 +1,31 @@
|
||||||
! Copyright (C) 2008 Doug Coleman.
|
! Copyright (C) 2008 Doug Coleman.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: accessors kernel math math.order words regexp.utils
|
USING: accessors kernel math math.order words
|
||||||
ascii unicode.categories combinators.short-circuit ;
|
ascii unicode.categories combinators.short-circuit sequences ;
|
||||||
IN: regexp.classes
|
IN: regexp.classes
|
||||||
|
|
||||||
|
: punct? ( ch -- ? )
|
||||||
|
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
|
||||||
|
|
||||||
|
: c-identifier-char? ( ch -- ? )
|
||||||
|
{ [ alpha? ] [ CHAR: _ = ] } 1|| ;
|
||||||
|
|
||||||
|
: java-blank? ( ch -- ? )
|
||||||
|
{
|
||||||
|
CHAR: \s CHAR: \t CHAR: \n
|
||||||
|
HEX: b HEX: 7 CHAR: \r
|
||||||
|
} member? ;
|
||||||
|
|
||||||
|
: java-printable? ( ch -- ? )
|
||||||
|
[ [ alpha? ] [ punct? ] ] 1|| ;
|
||||||
|
|
||||||
|
: hex-digit? ( ch -- ? )
|
||||||
|
{
|
||||||
|
[ CHAR: A CHAR: F between? ]
|
||||||
|
[ CHAR: a CHAR: f between? ]
|
||||||
|
[ CHAR: 0 CHAR: 9 between? ]
|
||||||
|
} 1|| ;
|
||||||
|
|
||||||
SINGLETONS: any-char any-char-no-nl
|
SINGLETONS: any-char any-char-no-nl
|
||||||
letter-class LETTER-class Letter-class digit-class
|
letter-class LETTER-class Letter-class digit-class
|
||||||
alpha-class non-newline-blank-class
|
alpha-class non-newline-blank-class
|
||||||
|
@ -14,8 +36,8 @@ unmatchable-class terminator-class word-boundary-class ;
|
||||||
SINGLETONS: beginning-of-input beginning-of-line
|
SINGLETONS: beginning-of-input beginning-of-line
|
||||||
end-of-input end-of-line ;
|
end-of-input end-of-line ;
|
||||||
|
|
||||||
MIXIN: node
|
TUPLE: range from to ;
|
||||||
TUPLE: character-class-range from to ; INSTANCE: character-class-range node
|
C: <range> range
|
||||||
|
|
||||||
GENERIC: class-member? ( obj class -- ? )
|
GENERIC: class-member? ( obj class -- ? )
|
||||||
|
|
||||||
|
@ -23,7 +45,7 @@ M: t class-member? ( obj class -- ? ) 2drop f ;
|
||||||
|
|
||||||
M: integer class-member? ( obj class -- ? ) 2drop f ;
|
M: integer class-member? ( obj class -- ? ) 2drop f ;
|
||||||
|
|
||||||
M: character-class-range class-member? ( obj class -- ? )
|
M: range class-member? ( obj class -- ? )
|
||||||
[ from>> ] [ to>> ] bi between? ;
|
[ from>> ] [ to>> ] bi between? ;
|
||||||
|
|
||||||
M: any-char class-member? ( obj class -- ? )
|
M: any-char class-member? ( obj class -- ? )
|
||||||
|
|
|
@ -2,83 +2,74 @@
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: accessors arrays assocs combinators fry kernel locals
|
USING: accessors arrays assocs combinators fry kernel locals
|
||||||
math math.order regexp.nfa regexp.transition-tables sequences
|
math math.order regexp.nfa regexp.transition-tables sequences
|
||||||
sets sorting vectors regexp.utils sequences.deep ;
|
sets sorting vectors sequences.deep ;
|
||||||
USING: io prettyprint threads ;
|
USING: io prettyprint threads ;
|
||||||
IN: regexp.dfa
|
IN: regexp.dfa
|
||||||
|
|
||||||
: find-delta ( states transition regexp -- new-states )
|
: (while-changes) ( obj quot: ( obj -- obj' ) pred: ( obj -- <=> ) pred-ret -- obj )
|
||||||
nfa-table>> transitions>>
|
[ [ dup slip ] dip pick over call ] dip dupd =
|
||||||
rot [ swap at at ] with with gather sift ;
|
[ 3drop ] [ (while-changes) ] if ; inline recursive
|
||||||
|
|
||||||
: (find-epsilon-closure) ( states regexp -- new-states )
|
: while-changes ( obj quot pred -- obj' )
|
||||||
|
3dup nip call (while-changes) ; inline
|
||||||
|
|
||||||
|
: find-delta ( states transition nfa -- new-states )
|
||||||
|
transitions>> '[ _ swap _ at at ] gather sift ;
|
||||||
|
|
||||||
|
: (find-epsilon-closure) ( states nfa -- new-states )
|
||||||
eps swap find-delta ;
|
eps swap find-delta ;
|
||||||
|
|
||||||
: find-epsilon-closure ( states regexp -- new-states )
|
: find-epsilon-closure ( states nfa -- new-states )
|
||||||
'[ dup _ (find-epsilon-closure) union ] [ length ] while-changes
|
'[ dup _ (find-epsilon-closure) union ] [ length ] while-changes
|
||||||
natural-sort ;
|
natural-sort ;
|
||||||
|
|
||||||
: find-closure ( states transition regexp -- new-states )
|
: find-closure ( states transition nfa -- new-states )
|
||||||
[ find-delta ] 2keep nip find-epsilon-closure ;
|
[ find-delta ] keep find-epsilon-closure ;
|
||||||
|
|
||||||
: find-start-state ( regexp -- state )
|
: find-start-state ( nfa -- state )
|
||||||
[ nfa-table>> start-state>> 1vector ] keep find-epsilon-closure ;
|
[ start-state>> 1vector ] keep find-epsilon-closure ;
|
||||||
|
|
||||||
: find-transitions ( seq1 regexp -- seq2 )
|
: find-transitions ( dfa-state nfa -- next-dfa-state )
|
||||||
nfa-table>> transitions>>
|
transitions>>
|
||||||
[ at keys ] curry gather
|
'[ _ at keys ] gather
|
||||||
eps swap remove ;
|
eps swap remove ;
|
||||||
|
|
||||||
: add-todo-state ( state regexp -- )
|
: add-todo-state ( state visited-states new-states -- )
|
||||||
2dup visited-states>> key? [
|
3dup drop key? [ 3drop ] [
|
||||||
2drop
|
[ conjoin ] [ push ] bi-curry* bi
|
||||||
] [
|
|
||||||
[ visited-states>> conjoin ]
|
|
||||||
[ new-states>> push ] 2bi
|
|
||||||
] if ;
|
] if ;
|
||||||
|
|
||||||
: new-transitions ( regexp -- )
|
:: new-transitions ( nfa dfa new-states visited-states -- nfa dfa )
|
||||||
dup new-states>> [
|
new-states [ nfa dfa ] [
|
||||||
drop
|
new-states pop :> state
|
||||||
] [
|
state nfa-table find-transitions
|
||||||
dupd pop dup pick find-transitions rot
|
[| trans |
|
||||||
[
|
state trans nfa find-closure :> new-state
|
||||||
[ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep
|
state visited-states new-state add-todo-state
|
||||||
[ swapd transition make-transition ] dip
|
state new-state trans transition make-transition dfa add-transition
|
||||||
dfa-table>> add-transition
|
] each
|
||||||
] curry with each
|
nfa dfa new-states visited-states new-transitions
|
||||||
new-transitions
|
|
||||||
] if-empty ;
|
] if-empty ;
|
||||||
|
|
||||||
: states ( hashtable -- array )
|
: states ( hashtable -- array )
|
||||||
[ keys ]
|
[ keys ]
|
||||||
[ values [ values concat ] map concat append ] bi ;
|
[ values [ values concat ] map concat append ] bi ;
|
||||||
|
|
||||||
: set-final-states ( regexp -- )
|
: set-final-states ( nfa dfa -- )
|
||||||
dup
|
[
|
||||||
[ nfa-table>> final-states>> keys ]
|
[ final-states>> keys ]
|
||||||
[ dfa-table>> transitions>> states ] bi
|
[ transitions>> states ] bi*
|
||||||
[ intersects? ] with filter
|
[ intersects? ] with filter
|
||||||
|
] [ final-states>> ] bi
|
||||||
swap dfa-table>> final-states>>
|
|
||||||
[ conjoin ] curry each ;
|
[ conjoin ] curry each ;
|
||||||
|
|
||||||
: set-initial-state ( regexp -- )
|
: initialize-dfa ( nfa -- dfa )
|
||||||
dup
|
<transition-table>
|
||||||
[ dfa-table>> ] [ find-start-state ] bi
|
swap find-start-state >>start-state ;
|
||||||
[ >>start-state drop ] keep
|
|
||||||
1vector >>new-states drop ;
|
|
||||||
|
|
||||||
: set-traversal-flags ( regexp -- )
|
: construct-dfa ( nfa -- dfa )
|
||||||
dup
|
dup initialize-dfa
|
||||||
[ nfa-traversal-flags>> ]
|
dup start-state>> 1vector
|
||||||
[ dfa-table>> transitions>> keys ] bi
|
H{ } clone
|
||||||
[ [ nip ] [ [ swap at ] with map concat ] 2bi ] with H{ } map>assoc
|
new-transitions
|
||||||
>>dfa-traversal-flags drop ;
|
[ set-final-states ] keep ;
|
||||||
|
|
||||||
: construct-dfa ( regexp -- )
|
|
||||||
{
|
|
||||||
[ set-initial-state ]
|
|
||||||
[ new-transitions ]
|
|
||||||
[ set-final-states ]
|
|
||||||
[ set-traversal-flags ]
|
|
||||||
} cleave ;
|
|
||||||
|
|
|
@ -1,9 +1,10 @@
|
||||||
! Copyright (C) 2008 Doug Coleman.
|
! Copyright (C) 2008 Doug Coleman.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: accessors arrays assocs grouping kernel regexp.backend
|
USING: accessors arrays assocs grouping kernel
|
||||||
locals math namespaces regexp.parser sequences fry quotations
|
locals math namespaces sequences fry quotations
|
||||||
math.order math.ranges vectors unicode.categories regexp.utils
|
math.order math.ranges vectors unicode.categories
|
||||||
regexp.transition-tables words sets regexp.classes unicode.case.private ;
|
regexp.transition-tables words sets
|
||||||
|
unicode.case.private regexp.ast regexp.classes ;
|
||||||
! This uses unicode.case.private for ch>upper and ch>lower
|
! This uses unicode.case.private for ch>upper and ch>lower
|
||||||
! but case-insensitive matching should be done by case-folding everything
|
! but case-insensitive matching should be done by case-folding everything
|
||||||
! before processing starts
|
! before processing starts
|
||||||
|
@ -13,34 +14,49 @@ ERROR: feature-is-broken feature ;
|
||||||
|
|
||||||
SYMBOL: negated?
|
SYMBOL: negated?
|
||||||
|
|
||||||
|
: negate ( -- )
|
||||||
|
negated? [ not ] change ;
|
||||||
|
|
||||||
SINGLETON: eps
|
SINGLETON: eps
|
||||||
|
|
||||||
: options ( -- obj ) current-regexp get options>> ;
|
SYMBOL: option-stack
|
||||||
|
|
||||||
: option? ( obj -- ? ) options key? ;
|
SYMBOL: combine-stack
|
||||||
|
|
||||||
: option-on ( obj -- ) options conjoin ;
|
SYMBOL: state
|
||||||
|
|
||||||
: option-off ( obj -- ) options delete-at ;
|
: next-state ( -- state )
|
||||||
|
state [ get ] [ inc ] bi ;
|
||||||
|
|
||||||
: next-state ( regexp -- state )
|
SYMBOL: nfa-table
|
||||||
[ state>> ] [ [ 1+ ] change-state drop ] bi ;
|
|
||||||
|
|
||||||
: set-start-state ( regexp -- )
|
: set-each ( keys value hashtable -- )
|
||||||
dup stack>> [
|
'[ _ swap _ set-at ] each ;
|
||||||
drop
|
|
||||||
] [
|
: options>hash ( options -- hashtable )
|
||||||
[ nfa-table>> ] [ pop first ] bi* >>start-state drop
|
H{ } clone [
|
||||||
] if-empty ;
|
[ [ on>> t ] dip set-each ]
|
||||||
|
[ [ off>> f ] dip set-each ] 2bi
|
||||||
|
] keep ;
|
||||||
|
|
||||||
|
: using-options ( options quot -- )
|
||||||
|
[ options>hash option-stack [ ?push ] change ] dip
|
||||||
|
call option-stack get pop* ; inline
|
||||||
|
|
||||||
|
: option? ( obj -- ? )
|
||||||
|
option-stack get assoc-stack ;
|
||||||
|
|
||||||
|
: set-start-state ( -- nfa-table )
|
||||||
|
nfa-table get
|
||||||
|
combine-stack get pop first >>start-state ;
|
||||||
|
|
||||||
GENERIC: nfa-node ( node -- )
|
GENERIC: nfa-node ( node -- )
|
||||||
|
|
||||||
:: add-simple-entry ( obj class -- )
|
:: add-simple-entry ( obj class -- )
|
||||||
[let* | regexp [ current-regexp get ]
|
[let* | s0 [ next-state ]
|
||||||
s0 [ regexp next-state ]
|
s1 [ next-state ]
|
||||||
s1 [ regexp next-state ]
|
stack [ combine-stack get ]
|
||||||
stack [ regexp stack>> ]
|
table [ nfa-table get ] |
|
||||||
table [ regexp nfa-table>> ] |
|
|
||||||
negated? get [
|
negated? get [
|
||||||
s0 f obj class make-transition table add-transition
|
s0 f obj class make-transition table add-transition
|
||||||
s0 s1 <default-transition> table add-transition
|
s0 s1 <default-transition> table add-transition
|
||||||
|
@ -51,9 +67,8 @@ GENERIC: nfa-node ( node -- )
|
||||||
t s1 table final-states>> set-at ] ;
|
t s1 table final-states>> set-at ] ;
|
||||||
|
|
||||||
:: concatenate-nodes ( -- )
|
:: concatenate-nodes ( -- )
|
||||||
[let* | regexp [ current-regexp get ]
|
[let* | stack [ combine-stack get ]
|
||||||
stack [ regexp stack>> ]
|
table [ nfa-table get ]
|
||||||
table [ regexp nfa-table>> ]
|
|
||||||
s2 [ stack peek first ]
|
s2 [ stack peek first ]
|
||||||
s3 [ stack pop second ]
|
s3 [ stack pop second ]
|
||||||
s0 [ stack peek first ]
|
s0 [ stack peek first ]
|
||||||
|
@ -63,15 +78,14 @@ GENERIC: nfa-node ( node -- )
|
||||||
s0 s3 2array stack push ] ;
|
s0 s3 2array stack push ] ;
|
||||||
|
|
||||||
:: alternate-nodes ( -- )
|
:: alternate-nodes ( -- )
|
||||||
[let* | regexp [ current-regexp get ]
|
[let* | stack [ combine-stack get ]
|
||||||
stack [ regexp stack>> ]
|
table [ nfa-table get ]
|
||||||
table [ regexp nfa-table>> ]
|
|
||||||
s2 [ stack peek first ]
|
s2 [ stack peek first ]
|
||||||
s3 [ stack pop second ]
|
s3 [ stack pop second ]
|
||||||
s0 [ stack peek first ]
|
s0 [ stack peek first ]
|
||||||
s1 [ stack pop second ]
|
s1 [ stack pop second ]
|
||||||
s4 [ regexp next-state ]
|
s4 [ next-state ]
|
||||||
s5 [ regexp next-state ] |
|
s5 [ next-state ] |
|
||||||
s4 s0 eps <literal-transition> table add-transition
|
s4 s0 eps <literal-transition> table add-transition
|
||||||
s4 s2 eps <literal-transition> table add-transition
|
s4 s2 eps <literal-transition> table add-transition
|
||||||
s1 s5 eps <literal-transition> table add-transition
|
s1 s5 eps <literal-transition> table add-transition
|
||||||
|
@ -83,13 +97,12 @@ GENERIC: nfa-node ( node -- )
|
||||||
|
|
||||||
M: star nfa-node ( node -- )
|
M: star nfa-node ( node -- )
|
||||||
term>> nfa-node
|
term>> nfa-node
|
||||||
[let* | regexp [ current-regexp get ]
|
[let* | stack [ combine-stack get ]
|
||||||
stack [ regexp stack>> ]
|
|
||||||
s0 [ stack peek first ]
|
s0 [ stack peek first ]
|
||||||
s1 [ stack pop second ]
|
s1 [ stack pop second ]
|
||||||
s2 [ regexp next-state ]
|
s2 [ next-state ]
|
||||||
s3 [ regexp next-state ]
|
s3 [ next-state ]
|
||||||
table [ regexp nfa-table>> ] |
|
table [ nfa-table get ] |
|
||||||
s1 table final-states>> delete-at
|
s1 table final-states>> delete-at
|
||||||
t s3 table final-states>> set-at
|
t s3 table final-states>> set-at
|
||||||
s1 s0 eps <literal-transition> table add-transition
|
s1 s0 eps <literal-transition> table add-transition
|
||||||
|
@ -99,58 +112,53 @@ M: star nfa-node ( node -- )
|
||||||
s2 s3 2array stack push ] ;
|
s2 s3 2array stack push ] ;
|
||||||
|
|
||||||
M: concatenation nfa-node ( node -- )
|
M: concatenation nfa-node ( node -- )
|
||||||
seq>>
|
seq>> [ eps literal-transition add-simple-entry ] [
|
||||||
reversed-regexp option? [ <reversed> ] when
|
reversed-regexp option? [ <reversed> ] when
|
||||||
[ [ nfa-node ] each ]
|
[ [ nfa-node ] each ]
|
||||||
[ length 1- [ concatenate-nodes ] times ] bi ;
|
[ length 1- [ concatenate-nodes ] times ] bi
|
||||||
|
] if-empty ;
|
||||||
|
|
||||||
M: alternation nfa-node ( node -- )
|
M: alternation nfa-node ( node -- )
|
||||||
seq>>
|
seq>>
|
||||||
[ [ nfa-node ] each ]
|
[ [ nfa-node ] each ]
|
||||||
[ length 1- [ alternate-nodes ] times ] bi ;
|
[ length 1- [ alternate-nodes ] times ] bi ;
|
||||||
|
|
||||||
M: constant nfa-node ( node -- )
|
M: integer nfa-node ( node -- )
|
||||||
case-insensitive option? [
|
case-insensitive option? [
|
||||||
dup char>> [ ch>lower ] [ ch>upper ] bi
|
dup [ ch>lower ] [ ch>upper ] bi
|
||||||
2dup = [
|
2dup = [
|
||||||
2drop
|
2drop
|
||||||
char>> literal-transition add-simple-entry
|
literal-transition add-simple-entry
|
||||||
] [
|
] [
|
||||||
[ literal-transition add-simple-entry ] bi@
|
[ literal-transition add-simple-entry ] bi@
|
||||||
alternate-nodes drop
|
alternate-nodes drop
|
||||||
] if
|
] if
|
||||||
] [
|
] [
|
||||||
char>> literal-transition add-simple-entry
|
literal-transition add-simple-entry
|
||||||
] if ;
|
] if ;
|
||||||
|
|
||||||
M: word nfa-node ( node -- ) class-transition add-simple-entry ;
|
M: primitive-class nfa-node ( node -- )
|
||||||
|
class>> dup
|
||||||
|
{ letter-class LETTER-class } member? case-insensitive option? and
|
||||||
|
[ drop Letter-class ] when
|
||||||
|
class-transition add-simple-entry ;
|
||||||
|
|
||||||
M: any-char nfa-node ( node -- )
|
M: any-char nfa-node ( node -- )
|
||||||
[ dotall option? ] dip any-char-no-nl ?
|
[ dotall option? ] dip any-char-no-nl ?
|
||||||
class-transition add-simple-entry ;
|
class-transition add-simple-entry ;
|
||||||
|
|
||||||
M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
|
M: negation nfa-node ( node -- )
|
||||||
|
negate term>> nfa-node negate ;
|
||||||
|
|
||||||
M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
|
M: range nfa-node ( node -- )
|
||||||
|
|
||||||
: choose-letter-class ( node -- node' )
|
|
||||||
case-insensitive option? Letter-class rot ? ;
|
|
||||||
|
|
||||||
M: letter-class nfa-node ( node -- )
|
|
||||||
choose-letter-class class-transition add-simple-entry ;
|
|
||||||
|
|
||||||
M: LETTER-class nfa-node ( node -- )
|
|
||||||
choose-letter-class class-transition add-simple-entry ;
|
|
||||||
|
|
||||||
M: character-class-range nfa-node ( node -- )
|
|
||||||
case-insensitive option? [
|
case-insensitive option? [
|
||||||
! This should be implemented for Unicode by case-folding
|
! This should be implemented for Unicode by case-folding
|
||||||
! the input and all strings in the regexp.
|
! the input and all strings in the regexp.
|
||||||
dup [ from>> ] [ to>> ] bi
|
dup [ from>> ] [ to>> ] bi
|
||||||
2dup [ Letter? ] bi@ and [
|
2dup [ Letter? ] bi@ and [
|
||||||
rot drop
|
rot drop
|
||||||
[ [ ch>lower ] bi@ character-class-range boa ]
|
[ [ ch>lower ] bi@ <range> ]
|
||||||
[ [ ch>upper ] bi@ character-class-range boa ] 2bi
|
[ [ ch>upper ] bi@ <range> ] 2bi
|
||||||
[ class-transition add-simple-entry ] bi@
|
[ class-transition add-simple-entry ] bi@
|
||||||
alternate-nodes
|
alternate-nodes
|
||||||
] [
|
] [
|
||||||
|
@ -161,14 +169,15 @@ M: character-class-range nfa-node ( node -- )
|
||||||
class-transition add-simple-entry
|
class-transition add-simple-entry
|
||||||
] if ;
|
] if ;
|
||||||
|
|
||||||
M: option nfa-node ( node -- )
|
M: with-options nfa-node ( node -- )
|
||||||
[ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if
|
dup options>> [ tree>> nfa-node ] using-options ;
|
||||||
eps literal-transition add-simple-entry ;
|
|
||||||
|
|
||||||
: construct-nfa ( regexp -- )
|
: construct-nfa ( ast -- nfa-table )
|
||||||
[
|
[
|
||||||
reset-regexp
|
negated? off
|
||||||
[ current-regexp set ]
|
V{ } clone combine-stack set
|
||||||
[ parse-tree>> nfa-node ]
|
0 state set
|
||||||
[ set-start-state ] tri
|
<transition-table> clone nfa-table set
|
||||||
|
nfa-node
|
||||||
|
set-start-state
|
||||||
] with-scope ;
|
] with-scope ;
|
||||||
|
|
|
@ -1,28 +1,9 @@
|
||||||
! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
|
! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: peg.ebnf kernel math.parser sequences assocs arrays
|
USING: peg.ebnf kernel math.parser sequences assocs arrays fry math
|
||||||
combinators regexp.classes strings splitting peg locals ;
|
combinators regexp.classes strings splitting peg locals accessors
|
||||||
|
regexp.ast ;
|
||||||
IN: regexp.parser
|
IN: regexp.parser
|
||||||
|
|
||||||
TUPLE: range from to ;
|
|
||||||
TUPLE: char-class ranges ;
|
|
||||||
TUPLE: primitive-class class ;
|
|
||||||
TUPLE: not-char-class ranges ;
|
|
||||||
TUPLE: not-primitive-class class ;
|
|
||||||
TUPLE: from-to n m ;
|
|
||||||
TUPLE: at-least n ;
|
|
||||||
TUPLE: up-to n ;
|
|
||||||
TUPLE: exactly n ;
|
|
||||||
TUPLE: times expression times ;
|
|
||||||
TUPLE: concatenation seq ;
|
|
||||||
TUPLE: alternation seq ;
|
|
||||||
TUPLE: maybe term ;
|
|
||||||
TUPLE: star term ;
|
|
||||||
TUPLE: plus term ;
|
|
||||||
TUPLE: with-options tree options ;
|
|
||||||
TUPLE: ast ^? $? tree ;
|
|
||||||
SINGLETON: any-char
|
|
||||||
|
|
||||||
: allowed-char? ( ch -- ? )
|
: allowed-char? ( ch -- ? )
|
||||||
".()|[*+?" member? not ;
|
".()|[*+?" member? not ;
|
||||||
|
|
||||||
|
@ -64,21 +45,16 @@ ERROR: bad-class name ;
|
||||||
{ CHAR: e [ HEX: 1b ] }
|
{ CHAR: e [ HEX: 1b ] }
|
||||||
{ CHAR: \\ [ CHAR: \\ ] }
|
{ CHAR: \\ [ CHAR: \\ ] }
|
||||||
|
|
||||||
{ CHAR: w [ c-identifier-class primitive-class boa ] }
|
{ CHAR: w [ c-identifier-class <primitive-class> ] }
|
||||||
{ CHAR: W [ c-identifier-class not-primitive-class boa ] }
|
{ CHAR: W [ c-identifier-class <primitive-class> <negation> ] }
|
||||||
{ CHAR: s [ java-blank-class primitive-class boa ] }
|
{ CHAR: s [ java-blank-class <primitive-class> ] }
|
||||||
{ CHAR: S [ java-blank-class not-primitive-class boa ] }
|
{ CHAR: S [ java-blank-class <primitive-class> <negation> ] }
|
||||||
{ CHAR: d [ digit-class primitive-class boa ] }
|
{ CHAR: d [ digit-class <primitive-class> ] }
|
||||||
{ CHAR: D [ digit-class not-primitive-class boa ] }
|
{ CHAR: D [ digit-class <primitive-class> <negation> ] }
|
||||||
|
|
||||||
[ ]
|
[ ]
|
||||||
} case ;
|
} case ;
|
||||||
|
|
||||||
TUPLE: options on off ;
|
|
||||||
|
|
||||||
SINGLETONS: unix-lines dotall multiline comments case-insensitive
|
|
||||||
unicode-case reversed-regexp ;
|
|
||||||
|
|
||||||
: options-assoc ( -- assoc )
|
: options-assoc ( -- assoc )
|
||||||
H{
|
H{
|
||||||
{ CHAR: i case-insensitive }
|
{ CHAR: i case-insensitive }
|
||||||
|
@ -98,19 +74,30 @@ unicode-case reversed-regexp ;
|
||||||
options-assoc value-at ;
|
options-assoc value-at ;
|
||||||
|
|
||||||
: parse-options ( on off -- options )
|
: parse-options ( on off -- options )
|
||||||
[ [ ch>option ] map ] bi@ options boa ;
|
[ [ ch>option ] { } map-as ] bi@ <options> ;
|
||||||
|
|
||||||
! TODO: make range syntax better (negation, and, etc),
|
: string>options ( string -- options )
|
||||||
! add syntax for various parenthized things,
|
"-" split1 parse-options ;
|
||||||
|
|
||||||
|
: options>string ( options -- string )
|
||||||
|
[ on>> ] [ off>> ] bi
|
||||||
|
[ [ option>ch ] map ] bi@
|
||||||
|
[ "-" swap 3append ] unless-empty
|
||||||
|
"" like ;
|
||||||
|
|
||||||
|
! TODO: add syntax for various parenthized things,
|
||||||
! add greedy and nongreedy forms of matching
|
! add greedy and nongreedy forms of matching
|
||||||
! (once it's all implemented)
|
! (once it's all implemented)
|
||||||
|
|
||||||
EBNF: (parse-regexp)
|
EBNF: parse-regexp
|
||||||
|
|
||||||
CharacterInBracket = !("}") Character
|
CharacterInBracket = !("}") Character
|
||||||
|
|
||||||
Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class primitive-class boa ]]
|
QuotedCharacter = !("\\E") .
|
||||||
| "P{" CharacterInBracket*:s "}" => [[ s >string name>class not-primitive-class boa ]]
|
|
||||||
|
Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> ]]
|
||||||
|
| "P{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> <negation> ]]
|
||||||
|
| "Q" QuotedCharacter*:s "\\E" => [[ s <concatenation> ]]
|
||||||
| "u" Character:a Character:b Character:c Character:d
|
| "u" Character:a Character:b Character:c Character:d
|
||||||
=> [[ { a b c d } hex> ensure-number ]]
|
=> [[ { a b c d } hex> ensure-number ]]
|
||||||
| "x" Character:a Character:b
|
| "x" Character:a Character:b
|
||||||
|
@ -119,30 +106,30 @@ Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class primitive-cla
|
||||||
=> [[ { a b c } oct> ensure-number ]]
|
=> [[ { a b c } oct> ensure-number ]]
|
||||||
| . => [[ lookup-escape ]]
|
| . => [[ lookup-escape ]]
|
||||||
|
|
||||||
Character = "\\" Escape:e => [[ e ]]
|
EscapeSequence = "\\" Escape:e => [[ e ]]
|
||||||
| . ?[ allowed-char? ]?
|
|
||||||
|
|
||||||
AnyRangeCharacter = Character | "["
|
Character = EscapeSequence | . ?[ allowed-char? ]?
|
||||||
|
|
||||||
|
AnyRangeCharacter = EscapeSequence | .
|
||||||
|
|
||||||
RangeCharacter = !("]") AnyRangeCharacter
|
RangeCharacter = !("]") AnyRangeCharacter
|
||||||
|
|
||||||
Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]]
|
Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
|
||||||
| RangeCharacter
|
| RangeCharacter
|
||||||
|
|
||||||
StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]]
|
StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
|
||||||
| AnyRangeCharacter
|
| AnyRangeCharacter
|
||||||
|
|
||||||
Ranges = StartRange:s Range*:r => [[ r s prefix ]]
|
Ranges = StartRange:s Range*:r => [[ r s prefix ]]
|
||||||
|
|
||||||
CharClass = "^" Ranges:e => [[ e not-char-class boa ]]
|
CharClass = "^"?:n Ranges:e => [[ e n char-class ]]
|
||||||
| Ranges:e => [[ e char-class boa ]]
|
|
||||||
|
|
||||||
Options = [idmsux]*
|
Options = [idmsux]*
|
||||||
|
|
||||||
Parenthized = "?:" Alternation:a => [[ a ]]
|
Parenthized = "?:" Alternation:a => [[ a ]]
|
||||||
| "?" Options:on "-"? Options:off ":" Alternation:a
|
| "?" Options:on "-"? Options:off ":" Alternation:a
|
||||||
=> [[ a on off parse-options with-options boa ]]
|
=> [[ a on off parse-options <with-options> ]]
|
||||||
| "?#" [^)]* => [[ ignore ]]
|
| "?#" [^)]* => [[ f ]]
|
||||||
| Alternation
|
| Alternation
|
||||||
|
|
||||||
Element = "(" Parenthized:p ")" => [[ p ]]
|
Element = "(" Parenthized:p ")" => [[ p ]]
|
||||||
|
@ -152,32 +139,24 @@ Element = "(" Parenthized:p ")" => [[ p ]]
|
||||||
|
|
||||||
Number = (!(","|"}").)* => [[ string>number ensure-number ]]
|
Number = (!(","|"}").)* => [[ string>number ensure-number ]]
|
||||||
|
|
||||||
Times = "," Number:n "}" => [[ n up-to boa ]]
|
Times = "," Number:n "}" => [[ 0 n <from-to> ]]
|
||||||
| Number:n ",}" => [[ n at-least boa ]]
|
| Number:n ",}" => [[ n <at-least> ]]
|
||||||
| Number:n "}" => [[ n exactly boa ]]
|
| Number:n "}" => [[ n n <from-to> ]]
|
||||||
| "}" => [[ bad-number ]]
|
| "}" => [[ bad-number ]]
|
||||||
| Number:n "," Number:m "}" => [[ n m from-to boa ]]
|
| Number:n "," Number:m "}" => [[ n m <from-to> ]]
|
||||||
|
|
||||||
Repeated = Element:e "{" Times:t => [[ e t times boa ]]
|
Repeated = Element:e "{" Times:t => [[ e t <times> ]]
|
||||||
| Element:e "?" => [[ e maybe boa ]]
|
| Element:e "?" => [[ e <maybe> ]]
|
||||||
| Element:e "*" => [[ e star boa ]]
|
| Element:e "*" => [[ e <star> ]]
|
||||||
| Element:e "+" => [[ e plus boa ]]
|
| Element:e "+" => [[ e <plus> ]]
|
||||||
| Element
|
| Element
|
||||||
|
|
||||||
Concatenation = Repeated*:r => [[ r concatenation boa ]]
|
Concatenation = Repeated*:r => [[ r sift <concatenation> ]]
|
||||||
|
|
||||||
Alternation = Concatenation:c ("|" Concatenation)*:a
|
Alternation = Concatenation:c ("|" Concatenation)*:a
|
||||||
=> [[ a empty? [ c ] [ a values c prefix alternation boa ] if ]]
|
=> [[ a empty? [ c ] [ a values c prefix <alternation> ] if ]]
|
||||||
|
|
||||||
End = !(.)
|
End = !(.)
|
||||||
|
|
||||||
Main = Alternation End
|
Main = Alternation End
|
||||||
;EBNF
|
;EBNF
|
||||||
|
|
||||||
: parse-regexp ( string -- regexp )
|
|
||||||
! Hack because I want $ allowable in regexps,
|
|
||||||
! but with special behavior at the end
|
|
||||||
! This fails if the regexp is stupid, though...
|
|
||||||
dup first CHAR: ^ = tuck [ rest ] when
|
|
||||||
dup peek CHAR: $ = tuck [ but-last ] when
|
|
||||||
(parse-regexp) ast boa ;
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
! Copyright (C) 2008 Doug Coleman.
|
! Copyright (C) 2008 Doug Coleman.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: kernel strings help.markup help.syntax regexp.backend ;
|
USING: kernel strings help.markup help.syntax ;
|
||||||
IN: regexp
|
IN: regexp
|
||||||
|
|
||||||
HELP: <regexp>
|
HELP: <regexp>
|
||||||
|
|
|
@ -189,8 +189,8 @@ IN: regexp-tests
|
||||||
[ t ] [ "SXY" "\\0123XY" <regexp> matches? ] unit-test
|
[ t ] [ "SXY" "\\0123XY" <regexp> matches? ] unit-test
|
||||||
[ t ] [ "x" "\\x78" <regexp> matches? ] unit-test
|
[ t ] [ "x" "\\x78" <regexp> matches? ] unit-test
|
||||||
[ f ] [ "y" "\\x78" <regexp> matches? ] unit-test
|
[ f ] [ "y" "\\x78" <regexp> matches? ] unit-test
|
||||||
[ t ] [ "x" "\\u000078" <regexp> matches? ] unit-test
|
[ t ] [ "x" "\\u0078" <regexp> matches? ] unit-test
|
||||||
[ f ] [ "y" "\\u000078" <regexp> matches? ] unit-test
|
[ f ] [ "y" "\\u0078" <regexp> matches? ] unit-test
|
||||||
|
|
||||||
[ t ] [ "ab" "a+b" <regexp> matches? ] unit-test
|
[ t ] [ "ab" "a+b" <regexp> matches? ] unit-test
|
||||||
[ f ] [ "b" "a+b" <regexp> matches? ] unit-test
|
[ f ] [ "b" "a+b" <regexp> matches? ] unit-test
|
||||||
|
@ -317,16 +317,6 @@ IN: regexp-tests
|
||||||
! Bug in parsing word
|
! Bug in parsing word
|
||||||
[ t ] [ "a" R' a' matches? ] unit-test
|
[ t ] [ "a" R' a' matches? ] unit-test
|
||||||
|
|
||||||
! Convert to lowercase until E
|
|
||||||
[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test
|
|
||||||
[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test
|
|
||||||
|
|
||||||
! Convert to uppercase until E
|
|
||||||
[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test
|
|
||||||
[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test
|
|
||||||
|
|
||||||
! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
|
|
||||||
|
|
||||||
! [ t ] [ "a" R/ ^a/ matches? ] unit-test
|
! [ t ] [ "a" R/ ^a/ matches? ] unit-test
|
||||||
! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
|
! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
|
||||||
! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
|
! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
|
||||||
|
@ -370,10 +360,10 @@ IN: regexp-tests
|
||||||
! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test
|
! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test
|
||||||
! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test
|
! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test
|
||||||
|
|
||||||
! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test
|
! [ t ] [ "a" R/ a$/m matches? ] unit-test
|
||||||
! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test
|
! [ t ] [ "a\n" R/ a$/m matches? ] unit-test
|
||||||
! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test
|
! [ t ] [ "a\r" R/ a$/m matches? ] unit-test
|
||||||
! [ t ] [ "a\r\n" "a$" R/ a$/m matches? ] unit-test
|
! [ t ] [ "a\r\n" R/ a$/m matches? ] unit-test
|
||||||
|
|
||||||
! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
|
! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
|
||||||
! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test
|
! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test
|
||||||
|
|
|
@ -2,33 +2,15 @@
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: accessors combinators kernel math sequences strings sets
|
USING: accessors combinators kernel math sequences strings sets
|
||||||
assocs prettyprint.backend prettyprint.custom make lexer
|
assocs prettyprint.backend prettyprint.custom make lexer
|
||||||
namespaces parser arrays fry regexp.backend regexp.utils
|
namespaces parser arrays fry locals
|
||||||
regexp.parser regexp.nfa regexp.dfa regexp.traversal
|
regexp.parser regexp.nfa regexp.dfa regexp.traversal
|
||||||
regexp.transition-tables splitting sorting ;
|
regexp.transition-tables splitting sorting regexp.ast ;
|
||||||
IN: regexp
|
IN: regexp
|
||||||
|
|
||||||
: default-regexp ( string -- regexp )
|
TUPLE: regexp raw options parse-tree dfa ;
|
||||||
regexp new
|
|
||||||
swap >>raw
|
|
||||||
<transition-table> >>nfa-table
|
|
||||||
<transition-table> >>dfa-table
|
|
||||||
<transition-table> >>minimized-table
|
|
||||||
H{ } clone >>nfa-traversal-flags
|
|
||||||
H{ } clone >>dfa-traversal-flags
|
|
||||||
H{ } clone >>options
|
|
||||||
H{ } clone >>matchers
|
|
||||||
reset-regexp ;
|
|
||||||
|
|
||||||
: construct-regexp ( regexp -- regexp' )
|
|
||||||
{
|
|
||||||
[ dup raw>> parse-regexp >>parse-tree drop ]
|
|
||||||
[ construct-nfa ]
|
|
||||||
[ construct-dfa ]
|
|
||||||
[ ]
|
|
||||||
} cleave ;
|
|
||||||
|
|
||||||
: (match) ( string regexp -- dfa-traverser )
|
: (match) ( string regexp -- dfa-traverser )
|
||||||
<dfa-traverser> do-match ; inline
|
dfa>> <dfa-traverser> do-match ; inline
|
||||||
|
|
||||||
: match ( string regexp -- slice/f )
|
: match ( string regexp -- slice/f )
|
||||||
(match) return-match ;
|
(match) return-match ;
|
||||||
|
@ -94,17 +76,17 @@ IN: regexp
|
||||||
{ "R| " "|" }
|
{ "R| " "|" }
|
||||||
} swap [ subseq? not nip ] curry assoc-find drop ;
|
} swap [ subseq? not nip ] curry assoc-find drop ;
|
||||||
|
|
||||||
: string>options ( string -- options )
|
|
||||||
[ ch>option dup ] H{ } map>assoc ;
|
|
||||||
|
|
||||||
: options>string ( options -- string )
|
|
||||||
keys [ option>ch ] map natural-sort >string ;
|
|
||||||
|
|
||||||
PRIVATE>
|
PRIVATE>
|
||||||
|
|
||||||
: <optioned-regexp> ( string option-string -- regexp )
|
:: <optioned-regexp> ( string options -- regexp )
|
||||||
[ default-regexp ] [ string>options ] bi* >>options
|
string parse-regexp :> tree
|
||||||
construct-regexp ;
|
options parse-options :> opt
|
||||||
|
tree opt <with-options> :> ast
|
||||||
|
regexp new
|
||||||
|
string >>raw
|
||||||
|
opt >>options
|
||||||
|
tree >>parse-tree
|
||||||
|
tree opt <with-options> construct-nfa construct-dfa >>dfa ;
|
||||||
|
|
||||||
: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
|
: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
! Copyright (C) 2008 Doug Coleman.
|
! Copyright (C) 2008 Doug Coleman.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: accessors arrays assocs fry hashtables kernel sequences
|
USING: accessors arrays assocs fry hashtables kernel sequences
|
||||||
vectors regexp.utils ;
|
vectors ;
|
||||||
IN: regexp.transition-tables
|
IN: regexp.transition-tables
|
||||||
|
|
||||||
TUPLE: transition from to obj ;
|
TUPLE: transition from to obj ;
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
! Copyright (C) 2008 Doug Coleman.
|
! Copyright (C) 2008 Doug Coleman.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: accessors assocs combinators kernel math
|
USING: accessors assocs combinators kernel math
|
||||||
quotations sequences regexp.parser regexp.classes fry arrays
|
quotations sequences regexp.classes fry arrays
|
||||||
combinators.short-circuit regexp.utils prettyprint regexp.nfa ;
|
combinators.short-circuit prettyprint regexp.nfa ;
|
||||||
IN: regexp.traversal
|
IN: regexp.traversal
|
||||||
|
|
||||||
TUPLE: dfa-traverser
|
TUPLE: dfa-traverser
|
||||||
|
@ -13,8 +13,7 @@ TUPLE: dfa-traverser
|
||||||
start-index current-index
|
start-index current-index
|
||||||
matches ;
|
matches ;
|
||||||
|
|
||||||
: <dfa-traverser> ( text regexp -- match )
|
: <dfa-traverser> ( text dfa -- match )
|
||||||
dfa-table>>
|
|
||||||
dfa-traverser new
|
dfa-traverser new
|
||||||
swap [ start-state>> >>current-state ] [ >>dfa-table ] bi
|
swap [ start-state>> >>current-state ] [ >>dfa-table ] bi
|
||||||
swap >>text
|
swap >>text
|
||||||
|
|
|
@ -1,4 +0,0 @@
|
||||||
USING: regexp.utils tools.test ;
|
|
||||||
IN: regexp.utils.tests
|
|
||||||
|
|
||||||
[ [ ] [ ] while-changes ] must-infer
|
|
|
@ -1,42 +0,0 @@
|
||||||
! Copyright (C) 2008 Doug Coleman.
|
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
|
||||||
USING: accessors arrays assocs io kernel math math.order
|
|
||||||
namespaces regexp.backend sequences unicode.categories
|
|
||||||
math.ranges fry combinators.short-circuit vectors ;
|
|
||||||
IN: regexp.utils
|
|
||||||
|
|
||||||
: (while-changes) ( obj quot: ( obj -- obj' ) pred: ( obj -- <=> ) pred-ret -- obj )
|
|
||||||
[ [ dup slip ] dip pick over call ] dip dupd =
|
|
||||||
[ 3drop ] [ (while-changes) ] if ; inline recursive
|
|
||||||
|
|
||||||
: while-changes ( obj quot pred -- obj' )
|
|
||||||
pick over call (while-changes) ; inline
|
|
||||||
|
|
||||||
ERROR: bad-octal number ;
|
|
||||||
ERROR: bad-hex number ;
|
|
||||||
: check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ;
|
|
||||||
: check-hex ( hex -- hex ) dup number? [ bad-hex ] unless ;
|
|
||||||
|
|
||||||
: decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ;
|
|
||||||
|
|
||||||
: hex-digit? ( n -- ? )
|
|
||||||
{
|
|
||||||
[ decimal-digit? ]
|
|
||||||
[ CHAR: a CHAR: f between? ]
|
|
||||||
[ CHAR: A CHAR: F between? ]
|
|
||||||
} 1|| ;
|
|
||||||
|
|
||||||
: punct? ( n -- ? )
|
|
||||||
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
|
|
||||||
|
|
||||||
: c-identifier-char? ( ch -- ? )
|
|
||||||
{ [ alpha? ] [ CHAR: _ = ] } 1|| ;
|
|
||||||
|
|
||||||
: java-blank? ( n -- ? )
|
|
||||||
{
|
|
||||||
CHAR: \s CHAR: \t CHAR: \n
|
|
||||||
HEX: b HEX: 7 CHAR: \r
|
|
||||||
} member? ;
|
|
||||||
|
|
||||||
: java-printable? ( n -- ? )
|
|
||||||
[ [ alpha? ] [ punct? ] ] 1|| ;
|
|
Loading…
Reference in New Issue