factor/unmaintained/regexp/regexp.factor

502 lines
14 KiB
Factor

USING: arrays errors generic assocs io kernel math
memoize namespaces kernel sequences strings tables
vectors ;
USE: interpreter
USE: prettyprint
USE: test
IN: regexp-internals
SYMBOL: trans-table
SYMBOL: eps
SYMBOL: start-state
SYMBOL: final-state
SYMBOL: paren-count
SYMBOL: currentstate
SYMBOL: stack
SYMBOL: bot
SYMBOL: eot
SYMBOL: alternation
SYMBOL: lparen
SYMBOL: rparen
: regexp-init ( -- )
0 paren-count set
-1 currentstate set
V{ } clone stack set
<vector-table> final-state over add-column trans-table set ;
: paren-underflow? ( -- )
paren-count get 0 < [ "too many rparen" throw ] when ;
: unbalanced-paren? ( -- )
paren-count get 0 > [ "neesds closing paren" throw ] when ;
: inc-paren-count ( -- )
paren-count [ 1+ ] change ;
: dec-paren-count ( -- )
paren-count [ 1- ] change paren-underflow? ;
: push-stack ( n -- ) stack get push ;
: next-state ( -- n )
currentstate [ 1+ ] change currentstate get ;
: current-state ( -- n ) currentstate get ;
: set-trans-table ( row col data -- )
<entry> trans-table get set-value ;
: add-trans-table ( row col data -- )
<entry> trans-table get add-value ;
: data-stack-slice ( token -- seq )
stack get reverse [ index ] keep cut reverse dup pop* stack set reverse ;
: find-start-state ( table -- n )
start-state t rot find-by-column first ;
: find-final-state ( table -- n )
final-state t rot find-by-column first ;
: final-state? ( row table -- ? )
get-row final-state swap key? ;
: switch-rows ( r1 r2 -- )
[ 2array [ trans-table get get-row ] each ] 2keep
2array [ trans-table get set-row ] each ;
: set-table-prop ( prop s table -- )
pick over add-column table-rows
[
pick rot member? [
pick t swap rot set-at
] [
drop
] if
] assoc-each 2drop ;
: add-numbers ( n obj -- obj )
dup sequence? [
[ + ] map-with
] [
dup number? [ + ] [ nip ] if
] if ;
: increment-cols ( n row -- )
! n row
dup [ >r pick r> add-numbers swap pick set-at ] assoc-each 2drop ;
: complex-count ( c -- ci-cr+1 )
>rect swap - 1+ ;
: copy-rows ( c1 -- )
#! copy rows to the bottom with a new row-name c1_range higher
[ complex-count ] keep trans-table get table-rows ! 2 C{ 0 1 } rows
[ drop [ over real >= ] keep pick imaginary <= and ] assoc-subset nip
[ clone [ >r over r> increment-cols ] keep swap pick + trans-table get set-row ] assoc-each ! 2
currentstate get 1+ dup pick + 1- rect> push-stack
currentstate [ + ] change ;
! s1 final f ! s1 eps s2 ! output s0,s3
: apply-concat ( seq -- )
! "Concat: " write dup .
dup pop over pop swap
over imaginary final-state f set-trans-table
2dup >r imaginary eps r> real add-trans-table
>r real r> imaginary rect> swap push ;
! swap 0, 4 so 0 is incoming
! ! s1 final f ! s3 final f ! s4 e s0 ! s4 e s2 ! s1 e s5 ! s3 e s5
! ! s5 final t ! s4,s5 push
SYMBOL: saved-state
: apply-alternation ( seq -- )
! "Alternation: " print
dup pop over pop* over pop swap
next-state trans-table get add-row
>r >rect >r saved-state set current-state r> rect> r>
! 4,1 2,3
over real saved-state get trans-table get swap-rows
saved-state get start-state t set-trans-table
over real start-state f set-trans-table
over imaginary final-state f set-trans-table
dup imaginary final-state f set-trans-table
over real saved-state get eps rot add-trans-table
dup real saved-state get eps rot add-trans-table
imaginary eps next-state add-trans-table
imaginary eps current-state add-trans-table
current-state final-state t set-trans-table
saved-state get current-state rect> swap push ;
! s1 final f ! s1 e s0 ! s2 e s0 ! s2 e s3 ! s1 e s3 ! s3 final t
: apply-kleene-closure ( -- )
! "Apply kleene closure" print
stack get pop
next-state trans-table get add-row
>rect >r [ saved-state set ] keep current-state
[ trans-table get swap-rows ] keep r> rect>
dup imaginary final-state f set-trans-table
dup imaginary eps pick real add-trans-table
saved-state get eps pick real add-trans-table
saved-state get eps next-state add-trans-table
imaginary eps current-state add-trans-table
current-state final-state t add-trans-table
saved-state get current-state rect> push-stack ;
: apply-plus-closure ( -- )
! "Apply plus closure" print
stack get peek copy-rows
apply-kleene-closure stack get apply-concat ;
: apply-alternation? ( seq -- ? )
dup length dup 3 < [
2drop f
] [
2 - swap nth alternation =
] if ;
: apply-concat? ( seq -- ? )
dup length dup 2 < [
2drop f
] [
2 - swap nth complex?
] if ;
: (apply) ( slice -- slice )
dup length 1 > [
{
{ [ dup apply-alternation? ]
[ [ apply-alternation ] keep (apply) ] }
{ [ dup apply-concat? ]
[ [ apply-concat ] keep (apply) ] }
} cond
] when ;
: apply-til-last ( tokens -- slice )
data-stack-slice (apply) ;
: maybe-concat ( -- )
stack get apply-concat? [ stack get apply-concat ] when ;
: maybe-concat-loop ( -- )
stack get length maybe-concat stack get length > [
maybe-concat-loop
] when ;
: create-nontoken-nfa ( tok -- )
next-state swap next-state <entry>
[ trans-table get set-value ] keep
entry-value final-state t set-trans-table
current-state [ 1- ] keep rect> push-stack ;
! stack gets: alternation C{ 0 1 }
: apply-question-closure ( -- )
alternation push-stack
eps create-nontoken-nfa stack get apply-alternation ;
! {2} exactly twice, {2,} 2 or more, {2,4} exactly 2,3,4 times
! : apply-bracket-closure ( c1 -- )
! ;
SYMBOL: character-class
SYMBOL: brace
SYMBOL: escaped-character
SYMBOL: octal
SYMBOL: hex
SYMBOL: control
SYMBOL: posix
: addto-character-class ( char -- )
;
: make-escaped ( char -- )
{
! TODO: POSIX character classes (US-ASCII only)
! TODO: Classes for Unicode blocks and categories
! { CHAR: { [ ] } ! left brace
{ CHAR: \\ [ ] } ! backaslash
{ CHAR: 0 [ ] } ! octal \0n \0nn \0mnn (0 <= m <= 3, 0 <= n <= 7)
{ CHAR: x [ ] } ! \xhh
{ CHAR: u [ ] } ! \uhhhh
{ CHAR: t [ ] } ! tab \u0009
{ CHAR: n [ ] } ! newline \u000a
{ CHAR: r [ ] } ! carriage-return \u000d
{ CHAR: f [ ] } ! form-feed \u000c
{ CHAR: a [ ] } ! alert (bell) \u0007
{ CHAR: e [ ] } ! escape \u001b
{ CHAR: c [ ] } ! control character corresoding to X in \cX
{ CHAR: d [ ] } ! [0-9]
{ CHAR: D [ ] } ! [^0-9]
{ CHAR: s [ ] } ! [ \t\n\x0B\f\r]
{ CHAR: S [ ] } ! [^\s]
{ CHAR: w [ ] } ! [a-zA-Z_0-9]
{ CHAR: W [ ] } ! [^\w]
{ CHAR: b [ ] } ! a word boundary
{ CHAR: B [ ] } ! a non-word boundary
{ CHAR: A [ ] } ! the beginning of input
{ CHAR: G [ ] } ! the end of the previous match
{ CHAR: Z [ ] } ! the end of the input but for the
! final terminator, if any
{ CHAR: z [ ] } ! the end of the input
} case ;
: handle-character-class ( char -- )
{
{ [ \ escaped-character get ] [ make-escaped \ escaped-character off ] }
{ [ dup CHAR: ] = ] [ \ character-class off ] }
{ [ t ] [ addto-character-class ] }
} cond ;
: parse-token ( char -- )
{
! { [ \ character-class get ] [ ] }
! { [ \ escaped-character get ] [ ] }
! { [ dup CHAR: [ = ] [ \ character-class on ] }
! { [ dup CHAR: \\ = ] [ drop \ escaped-character on ] }
! { [ dup CHAR: ^ = ] [ ] }
! { [ dup CHAR: $ = ] [ ] }
! { [ dup CHAR: { = ] [ ] }
! { [ dup CHAR: } = ] [ ] }
{ [ dup CHAR: | = ]
[ drop maybe-concat-loop alternation push-stack ] }
{ [ dup CHAR: * = ]
[ drop apply-kleene-closure ] }
{ [ dup CHAR: + = ]
[ drop apply-plus-closure ] }
{ [ dup CHAR: ? = ]
[ drop apply-question-closure ] }
{ [ dup CHAR: ( = ]
[ drop inc-paren-count lparen push-stack ] }
{ [ dup CHAR: ) = ]
[
drop dec-paren-count lparen apply-til-last
stack get push-all
] } ! apply
{ [ dup bot = ] [ push-stack ] }
{ [ dup eot = ]
[
drop unbalanced-paren? maybe-concat-loop bot apply-til-last
dup length 1 = [
pop real start-state t set-trans-table
] [
drop
] if
] }
{ [ t ] [ create-nontoken-nfa ] }
} cond ;
: cut-at-index ( i string ch -- i subseq )
-rot [ index* ] 2keep >r >r [ 1+ ] keep r> swap r> subseq ;
: parse-character-class ( index string -- new-index obj )
2dup >r 1+ r> nth CHAR: ] = [ >r 1+ r> ] when
cut-at-index ;
: (parse-regexp) ( str -- )
dup length [
2dup swap character-class get [
parse-character-class
"CHARACTER CLASS: " write .
character-class off
nip ! adjust index
] [
nth parse-token
] if
] repeat ;
: parse-regexp ( str -- )
bot parse-token
! [ "parsing: " write dup ch>string . parse-token ] each
[ parse-token ] each
! (parse-regexp)
eot parse-token ;
: push-all-diff ( seq seq -- diff )
[ swap seq-diff ] 2keep push-all ;
: prune-sort ( vec -- vec )
prune natural-sort >vector ;
SYMBOL: ttable
SYMBOL: transition
SYMBOL: check-list
SYMBOL: initial-check-list
SYMBOL: result
: init-find ( data state table -- )
ttable set
dup sequence? [ clone >vector ] [ V{ } clone [ push ] keep ] if
[ check-list set ] keep clone initial-check-list set
V{ } clone result set
transition set ;
: (find-next-state) ( -- )
check-list get [
[
ttable get get-row transition get swap at*
[ dup sequence? [ % ] [ , ] if ] [ drop ] if
] each
] { } make
result get push-all-diff
check-list set
result get prune-sort result set ;
: (find-next-state-recursive) ( -- )
check-list get empty? [ (find-next-state) (find-next-state-recursive) ] unless ;
: find-epsilon-closure ( state table -- vec )
eps -rot init-find
(find-next-state-recursive) result get initial-check-list get append natural-sort ;
: find-next-state ( data state table -- vec )
find-epsilon-closure check-list set
V{ } clone result set transition set
(find-next-state) result get ttable get find-epsilon-closure ;
: filter-cols ( vec -- vec )
#! remove info columns state-state, eps, final
clone start-state over delete-at eps over delete-at
final-state over delete-at ;
SYMBOL: old-table
SYMBOL: new-table
SYMBOL: todo-states
SYMBOL: transitions
: init-nfa>dfa ( table -- )
<vector-table> new-table set
[ table-columns clone filter-cols keys transitions set ] keep
dup [ find-start-state ] keep find-epsilon-closure
V{ } clone [ push ] keep todo-states set
old-table set ;
: create-row ( state table -- )
2dup row-exists?
[ 2drop ] [ [ add-row ] 2keep drop todo-states get push ] if ;
: (nfa>dfa) ( -- )
todo-states get dup empty? [
pop transitions get [
2dup swap old-table get find-next-state
dup empty? [
3drop
] [
dup new-table get create-row
<entry> new-table get set-value
] if
] each-with
] unless* todo-states get empty? [ (nfa>dfa) ] unless ;
: nfa>dfa ( table -- table )
init-nfa>dfa
(nfa>dfa)
start-state old-table get find-start-state
new-table get set-table-prop
final-state old-table get find-final-state
new-table get [ set-table-prop ] keep ;
SYMBOL: regexp
SYMBOL: text
SYMBOL: matches
SYMBOL: partial-matches
TUPLE: partial-match index row count ;
! a state is a vector
! state is a key in a hashtable. the value is a hashtable of transition states
: save-partial-match ( index row -- )
1 <partial-match> dup partial-match-index
\ partial-matches get set-at ;
: inc-partial-match ( partial-match -- )
[ partial-match-count 1+ ] keep set-partial-match-count ;
: check-final-state ( partial-match -- )
dup partial-match-row regexp get final-state? [
clone dup partial-match-index matches get set-at
] [
drop
] if ;
: check-trivial-match ( row regexp -- )
dupd final-state? [
>r 0 r> 0 <partial-match>
0 matches get set-at
] [
drop
] if ;
: update-partial-match ( char partial-match -- )
tuck partial-match-row regexp get get-row at* [
over set-partial-match-row
inc-partial-match
] [
drop
partial-match-index partial-matches get delete-at
] if ;
: regexp-step ( index char start-state -- )
! check partial-matches
over \ partial-matches get
[ nip update-partial-match ] assoc-each-with
! check new match
at* [
save-partial-match
] [
2drop
] if
partial-matches get values [ check-final-state ] each ;
: regexp-match ( text regexp -- seq )
#! text is the haystack
#! regexp is a table describing the needle
H{ } clone \ matches set
H{ } clone \ partial-matches set
dup regexp set
>r dup text set r>
[ find-start-state ] keep
2dup check-trivial-match
get-row
swap [ length ] keep
[ pick regexp-step ] 2each drop
matches get values [
[ partial-match-index ] keep
partial-match-count dupd + text get <slice>
] map ;
IN: regexp
MEMO: make-regexp ( str -- table )
[
regexp-init
parse-regexp
trans-table get nfa>dfa
] with-scope ;
! TODO: make compatible with
! http://java.sun.com/j2se/1.4.2/docs/api/java/util/regex/Pattern.html
! Greedy
! Match the longest possible string, default
! a+
! Reluctant
! Match on shortest possible string
! / in vi does this (find next)
! a+?
! Possessive
! Match only when the entire text string matches
! a++