502 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Factor
		
	
	
			
		
		
	
	
			502 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Factor
		
	
	
| USING: arrays errors generic assocs io kernel math
 | |
| memoize namespaces kernel sequences strings tables
 | |
| vectors ;
 | |
| USE: interpreter
 | |
| USE: prettyprint
 | |
| USE: test
 | |
| 
 | |
| IN: regexp-internals
 | |
| 
 | |
| SYMBOL: trans-table
 | |
| SYMBOL: eps
 | |
| SYMBOL: start-state
 | |
| SYMBOL: final-state
 | |
| 
 | |
| SYMBOL: paren-count
 | |
| SYMBOL: currentstate
 | |
| SYMBOL: stack
 | |
| 
 | |
| SYMBOL: bot
 | |
| SYMBOL: eot
 | |
| SYMBOL: alternation
 | |
| SYMBOL: lparen
 | |
| SYMBOL: rparen
 | |
| 
 | |
| : regexp-init ( -- )
 | |
|     0 paren-count set
 | |
|     -1 currentstate set
 | |
|     V{ } clone stack set
 | |
|     <vector-table> final-state over add-column trans-table set ;
 | |
| 
 | |
| : paren-underflow? ( -- )
 | |
|     paren-count get 0 < [ "too many rparen" throw ] when ;
 | |
| 
 | |
| : unbalanced-paren? ( -- )
 | |
|     paren-count get 0 > [ "neesds closing paren" throw ] when ;
 | |
| 
 | |
| : inc-paren-count ( -- )
 | |
|     paren-count [ 1+ ] change ;
 | |
| 
 | |
| : dec-paren-count ( -- )
 | |
|     paren-count [ 1- ] change paren-underflow? ;
 | |
| 
 | |
| : push-stack ( n -- ) stack get push ;
 | |
| : next-state ( -- n )
 | |
|     currentstate [ 1+ ] change currentstate get ;
 | |
| : current-state ( -- n ) currentstate get ;
 | |
| 
 | |
| : set-trans-table ( row col data -- )
 | |
|     <entry> trans-table get set-value ;
 | |
| 
 | |
| : add-trans-table ( row col data -- )
 | |
|     <entry> trans-table get add-value ;
 | |
| 
 | |
| : data-stack-slice ( token -- seq )
 | |
|     stack get reverse [ index ] keep cut reverse dup pop* stack set reverse ;
 | |
| 
 | |
| : find-start-state ( table -- n )
 | |
|     start-state t rot find-by-column first ;
 | |
| 
 | |
| : find-final-state ( table -- n )
 | |
|     final-state t rot find-by-column first ;
 | |
| 
 | |
| : final-state? ( row table -- ? )
 | |
|     get-row final-state swap key? ;
 | |
| 
 | |
| : switch-rows ( r1 r2 -- )
 | |
|     [ 2array [ trans-table get get-row ] each ] 2keep
 | |
|     2array [ trans-table get set-row ] each ;
 | |
| 
 | |
| : set-table-prop ( prop s table -- )
 | |
|     pick over add-column table-rows
 | |
|     [
 | |
|         pick rot member? [
 | |
|             pick t swap rot set-at
 | |
|         ] [
 | |
|             drop
 | |
|         ] if
 | |
|     ] assoc-each 2drop ;
 | |
| 
 | |
| : add-numbers ( n obj -- obj )
 | |
|     dup sequence? [ 
 | |
|         [ + ] map-with
 | |
|     ] [
 | |
|         dup number? [ + ] [ nip ] if
 | |
|     ] if ;
 | |
| 
 | |
| : increment-cols ( n row -- )
 | |
|     ! n row
 | |
|     dup [ >r pick r> add-numbers swap pick set-at ] assoc-each 2drop ;
 | |
| 
 | |
| : complex-count ( c -- ci-cr+1 )
 | |
|     >rect swap - 1+ ;
 | |
| 
 | |
| : copy-rows ( c1 -- )
 | |
|     #! copy rows to the bottom with a new row-name c1_range higher
 | |
|     [ complex-count ] keep trans-table get table-rows ! 2 C{ 0 1 } rows
 | |
|     [ drop [ over real >= ] keep pick imaginary <= and ] assoc-subset nip
 | |
|     [ clone [ >r over r> increment-cols ] keep swap pick + trans-table get set-row ] assoc-each ! 2
 | |
|     currentstate get 1+ dup pick + 1- rect> push-stack
 | |
|     currentstate [ + ] change ;
 | |
| 
 | |
| 
 | |
| ! s1 final f ! s1 eps s2 ! output s0,s3
 | |
| : apply-concat ( seq -- )
 | |
|     ! "Concat: " write dup .
 | |
|     dup pop over pop swap
 | |
|     over imaginary final-state f set-trans-table
 | |
|     2dup >r imaginary eps r> real add-trans-table
 | |
|     >r real r> imaginary rect> swap push ; 
 | |
| 
 | |
| ! swap 0, 4 so 0 is incoming
 | |
| ! ! s1 final f ! s3 final f ! s4 e s0 ! s4 e s2 ! s1 e s5 ! s3 e s5
 | |
| ! ! s5 final t ! s4,s5 push
 | |
| 
 | |
| SYMBOL: saved-state
 | |
| : apply-alternation ( seq -- )
 | |
|     ! "Alternation: " print
 | |
|     dup pop over pop* over pop swap
 | |
|     next-state trans-table get add-row
 | |
|     >r >rect >r saved-state set current-state r> rect> r> 
 | |
|     ! 4,1 2,3
 | |
|     over real saved-state get trans-table get swap-rows
 | |
|     saved-state get start-state t set-trans-table
 | |
|     over real start-state f set-trans-table
 | |
|     over imaginary final-state f set-trans-table
 | |
|     dup imaginary final-state f set-trans-table
 | |
|     over real saved-state get eps rot add-trans-table
 | |
|     dup real saved-state get eps rot add-trans-table
 | |
|     imaginary eps next-state add-trans-table
 | |
|     imaginary eps current-state add-trans-table
 | |
|     current-state final-state t set-trans-table
 | |
|     saved-state get current-state rect> swap push ;
 | |
| 
 | |
| ! s1 final f ! s1 e s0 ! s2 e s0 ! s2 e s3 ! s1 e s3 ! s3 final t
 | |
| : apply-kleene-closure ( -- )
 | |
|     ! "Apply kleene closure" print
 | |
|     stack get pop
 | |
|     next-state trans-table get add-row
 | |
|     >rect >r [ saved-state set ] keep current-state 
 | |
|         [ trans-table get swap-rows ] keep r> rect>
 | |
| 
 | |
|     dup imaginary final-state f set-trans-table
 | |
|     dup imaginary eps pick real add-trans-table
 | |
|     saved-state get eps pick real add-trans-table
 | |
|     saved-state get eps next-state add-trans-table
 | |
|     imaginary eps current-state add-trans-table
 | |
|     current-state final-state t add-trans-table
 | |
|     saved-state get current-state rect> push-stack ;
 | |
| 
 | |
| : apply-plus-closure ( -- )
 | |
|     ! "Apply plus closure" print
 | |
|     stack get peek copy-rows
 | |
|     apply-kleene-closure stack get apply-concat ;
 | |
| 
 | |
| : apply-alternation? ( seq -- ? )
 | |
|     dup length dup 3 < [
 | |
|         2drop f
 | |
|     ] [
 | |
|         2 - swap nth alternation =
 | |
|     ] if ; 
 | |
| 
 | |
| : apply-concat? ( seq -- ? )
 | |
|     dup length dup 2 < [
 | |
|         2drop f
 | |
|     ] [
 | |
|         2 - swap nth complex?
 | |
|     ] if ;
 | |
| 
 | |
| : (apply) ( slice -- slice )
 | |
|     dup length 1 > [
 | |
|         {
 | |
|             { [ dup apply-alternation? ]
 | |
|                 [ [ apply-alternation ] keep (apply) ] }
 | |
|             { [ dup apply-concat? ]
 | |
|                 [ [ apply-concat ] keep (apply) ] }
 | |
|         } cond
 | |
|     ] when ;
 | |
| 
 | |
| : apply-til-last ( tokens -- slice )
 | |
|     data-stack-slice (apply) ;
 | |
| 
 | |
| : maybe-concat ( -- )
 | |
|     stack get apply-concat? [ stack get apply-concat ] when ;
 | |
| 
 | |
| : maybe-concat-loop ( -- )
 | |
|     stack get length maybe-concat stack get length > [
 | |
|         maybe-concat-loop
 | |
|     ] when ;
 | |
| 
 | |
| : create-nontoken-nfa ( tok -- )
 | |
|     next-state swap next-state <entry>
 | |
|     [ trans-table get set-value ] keep
 | |
|     entry-value final-state t set-trans-table
 | |
|     current-state [ 1- ] keep rect> push-stack ;
 | |
| 
 | |
| ! stack gets:  alternation C{ 0 1 }
 | |
| : apply-question-closure ( -- )
 | |
|     alternation push-stack
 | |
|     eps create-nontoken-nfa stack get apply-alternation ;
 | |
| 
 | |
| ! {2}  exactly twice,  {2,} 2 or more,  {2,4} exactly 2,3,4 times
 | |
| ! : apply-bracket-closure ( c1 -- )
 | |
|     ! ;
 | |
| SYMBOL: character-class
 | |
| SYMBOL: brace
 | |
| SYMBOL: escaped-character
 | |
| SYMBOL: octal
 | |
| SYMBOL: hex
 | |
| SYMBOL: control
 | |
| SYMBOL: posix
 | |
| 
 | |
| : addto-character-class ( char -- )
 | |
|     ;
 | |
| 
 | |
| : make-escaped ( char -- )
 | |
|     {
 | |
|         ! TODO: POSIX character classes (US-ASCII only)
 | |
|         ! TODO: Classes for Unicode blocks and categories
 | |
| 
 | |
|         ! { CHAR: { [ ] } ! left brace
 | |
|         { CHAR: \\ [ ] } ! backaslash
 | |
| 
 | |
|         { CHAR: 0 [ ] } ! octal \0n \0nn \0mnn (0 <= m <= 3, 0 <= n <= 7)
 | |
|         { CHAR: x [ ] } ! \xhh
 | |
|         { CHAR: u [ ] } ! \uhhhh
 | |
|         { CHAR: t [ ] } ! tab \u0009
 | |
|         { CHAR: n [ ] } ! newline \u000a
 | |
|         { CHAR: r [ ] } ! carriage-return \u000d
 | |
|         { CHAR: f [ ] } ! form-feed \u000c
 | |
|         { CHAR: a [ ] } ! alert (bell) \u0007
 | |
|         { CHAR: e [ ] } ! escape \u001b
 | |
|         { CHAR: c [ ] } ! control character corresoding to X in \cX
 | |
| 
 | |
|         { CHAR: d [ ] } ! [0-9]
 | |
|         { CHAR: D [ ] } ! [^0-9]
 | |
|         { CHAR: s [ ] } ! [ \t\n\x0B\f\r]
 | |
|         { CHAR: S [ ] } ! [^\s]
 | |
|         { CHAR: w [ ] } ! [a-zA-Z_0-9]
 | |
|         { CHAR: W [ ] } ! [^\w]
 | |
| 
 | |
|         { CHAR: b [ ] } ! a word boundary
 | |
|         { CHAR: B [ ] } ! a non-word boundary
 | |
|         { CHAR: A [ ] } ! the beginning of input
 | |
|         { CHAR: G [ ] } ! the end of the previous match
 | |
|         { CHAR: Z [ ] } ! the end of the input but for the
 | |
|                         ! final terminator, if any
 | |
|         { CHAR: z [ ] } ! the end of the input
 | |
|     } case ;
 | |
| 
 | |
| : handle-character-class ( char -- )
 | |
|     {
 | |
|         { [ \ escaped-character get ] [ make-escaped \ escaped-character off ] }
 | |
|         { [ dup CHAR: ] = ] [ \ character-class off ] }
 | |
|         { [ t ] [ addto-character-class ] }
 | |
|     } cond ;
 | |
| 
 | |
| : parse-token ( char -- )
 | |
|     {
 | |
|         ! { [ \ character-class get ] [ ] }
 | |
|         ! { [ \ escaped-character get ] [ ] }
 | |
|         ! { [ dup CHAR: [ = ] [ \ character-class on ] }
 | |
|         ! { [ dup CHAR: \\ = ] [ drop \ escaped-character on ] }
 | |
| 
 | |
|         ! { [ dup CHAR: ^ = ] [ ] }
 | |
|         ! { [ dup CHAR: $ = ] [ ] }
 | |
|         ! { [ dup CHAR: { = ] [ ] }
 | |
|         ! { [ dup CHAR: } = ] [ ] }
 | |
| 
 | |
|         { [ dup CHAR: | = ]
 | |
|             [ drop maybe-concat-loop alternation push-stack ] }
 | |
|         { [ dup CHAR: * = ]
 | |
|             [ drop apply-kleene-closure ] }
 | |
|         { [ dup CHAR: + = ]
 | |
|             [ drop apply-plus-closure ] }
 | |
|         { [ dup CHAR: ? = ]
 | |
|             [ drop apply-question-closure ] }
 | |
| 
 | |
|         { [ dup CHAR: ( = ]
 | |
|             [ drop inc-paren-count lparen push-stack ] }
 | |
|         { [ dup CHAR: ) = ]
 | |
|             [
 | |
|                 drop dec-paren-count lparen apply-til-last
 | |
|                 stack get push-all
 | |
|             ] } ! apply
 | |
| 
 | |
| 
 | |
|         { [ dup bot = ] [ push-stack ] }
 | |
|         { [ dup eot = ]
 | |
|             [
 | |
|                 drop unbalanced-paren? maybe-concat-loop bot apply-til-last
 | |
|                 dup length 1 = [
 | |
|                     pop real start-state t set-trans-table
 | |
|                 ] [
 | |
|                     drop
 | |
|                 ] if
 | |
|             ] }
 | |
|         { [ t ] [ create-nontoken-nfa ] }
 | |
|     } cond ;
 | |
| 
 | |
| : cut-at-index ( i string ch -- i subseq )
 | |
|     -rot [ index* ] 2keep >r >r [ 1+ ] keep r> swap r> subseq ;
 | |
| 
 | |
| : parse-character-class ( index string -- new-index obj )
 | |
|     2dup >r 1+ r> nth CHAR: ] = [ >r 1+ r> ] when
 | |
|     cut-at-index ;
 | |
| 
 | |
| : (parse-regexp) ( str -- )
 | |
|     dup length [
 | |
|         2dup swap character-class get [
 | |
|             parse-character-class
 | |
|             "CHARACTER CLASS: " write .
 | |
|             character-class off
 | |
|             nip ! adjust index
 | |
|         ] [
 | |
|             nth parse-token
 | |
|         ] if
 | |
|     ] repeat ;
 | |
| 
 | |
| : parse-regexp ( str -- )
 | |
|     bot parse-token
 | |
|     ! [ "parsing: " write dup ch>string . parse-token ] each
 | |
|     [ parse-token ] each
 | |
|     ! (parse-regexp)
 | |
|     eot parse-token ;
 | |
| 
 | |
| : push-all-diff ( seq seq -- diff )
 | |
|     [ swap seq-diff ] 2keep push-all ;
 | |
| 
 | |
| : prune-sort ( vec -- vec )
 | |
|     prune natural-sort >vector ;
 | |
| 
 | |
| SYMBOL: ttable
 | |
| SYMBOL: transition
 | |
| SYMBOL: check-list
 | |
| SYMBOL: initial-check-list
 | |
| SYMBOL: result
 | |
| 
 | |
| : init-find ( data state table -- )
 | |
|     ttable set
 | |
|     dup sequence? [ clone >vector ] [ V{ } clone [ push ] keep ] if
 | |
|     [ check-list set ] keep clone initial-check-list set
 | |
|     V{ } clone result set
 | |
|     transition set ;
 | |
| 
 | |
| : (find-next-state) ( -- )
 | |
|     check-list get [
 | |
|         [
 | |
|             ttable get get-row transition get swap at*
 | |
|                 [ dup sequence? [ % ] [ , ] if ] [ drop ] if
 | |
|         ] each
 | |
|     ] { } make
 | |
|     result get push-all-diff
 | |
|     check-list set
 | |
|     result get prune-sort result set ;
 | |
| 
 | |
| : (find-next-state-recursive) ( -- )
 | |
|     check-list get empty? [ (find-next-state) (find-next-state-recursive) ] unless ;
 | |
| 
 | |
| : find-epsilon-closure ( state table -- vec )
 | |
|     eps -rot init-find
 | |
|     (find-next-state-recursive) result get initial-check-list get append natural-sort ;
 | |
| 
 | |
| : find-next-state ( data state table -- vec )
 | |
|     find-epsilon-closure check-list set
 | |
|     V{ } clone result set transition set
 | |
|     (find-next-state) result get ttable get find-epsilon-closure ;
 | |
| 
 | |
| : filter-cols ( vec -- vec )
 | |
|     #! remove info columns state-state, eps, final
 | |
|     clone start-state over delete-at eps over delete-at
 | |
|     final-state over delete-at ;
 | |
| 
 | |
| SYMBOL: old-table
 | |
| SYMBOL: new-table
 | |
| SYMBOL: todo-states
 | |
| SYMBOL: transitions
 | |
| 
 | |
| : init-nfa>dfa ( table -- )
 | |
|     <vector-table> new-table set
 | |
|     [ table-columns clone filter-cols keys transitions set ] keep
 | |
|     dup [ find-start-state ] keep find-epsilon-closure
 | |
|     V{ } clone [ push ] keep todo-states set
 | |
|     old-table set ;
 | |
| 
 | |
| : create-row ( state table -- )
 | |
|     2dup row-exists?
 | |
|     [ 2drop ] [ [ add-row ] 2keep drop todo-states get push ] if ;
 | |
| 
 | |
| : (nfa>dfa) ( -- )
 | |
|     todo-states get dup empty? [
 | |
|         pop transitions get [
 | |
|             2dup swap old-table get find-next-state
 | |
|             dup empty? [
 | |
|                 3drop
 | |
|             ] [
 | |
|                 dup new-table get create-row
 | |
|                 <entry> new-table get set-value
 | |
|             ] if
 | |
|         ] each-with 
 | |
|     ] unless* todo-states get empty? [ (nfa>dfa) ] unless ;
 | |
| 
 | |
| : nfa>dfa ( table -- table )
 | |
|     init-nfa>dfa
 | |
|     (nfa>dfa)
 | |
|     start-state old-table get find-start-state
 | |
|     new-table get set-table-prop
 | |
|     final-state old-table get find-final-state
 | |
|     new-table get [ set-table-prop ] keep ;
 | |
| 
 | |
| SYMBOL: regexp
 | |
| SYMBOL: text
 | |
| SYMBOL: matches
 | |
| SYMBOL: partial-matches
 | |
| TUPLE: partial-match index row count ;
 | |
| ! a state is a vector
 | |
| ! state is a key in a hashtable. the value is a hashtable of transition states
 | |
| 
 | |
| : save-partial-match ( index row -- )
 | |
|     1 <partial-match> dup partial-match-index
 | |
|     \ partial-matches get set-at ;
 | |
| 
 | |
| : inc-partial-match ( partial-match -- )
 | |
|     [ partial-match-count 1+ ] keep set-partial-match-count ;
 | |
| 
 | |
| : check-final-state ( partial-match -- )
 | |
|     dup partial-match-row regexp get final-state? [
 | |
|         clone dup partial-match-index matches get set-at
 | |
|     ] [
 | |
|         drop
 | |
|     ] if ;
 | |
| 
 | |
| : check-trivial-match ( row regexp -- )
 | |
|     dupd final-state? [
 | |
|         >r 0 r> 0 <partial-match>
 | |
|         0 matches get set-at
 | |
|     ] [
 | |
|         drop
 | |
|     ] if ;
 | |
| 
 | |
| : update-partial-match ( char partial-match -- )
 | |
|     tuck partial-match-row regexp get get-row at* [
 | |
|         over set-partial-match-row
 | |
|         inc-partial-match
 | |
|     ] [
 | |
|         drop
 | |
|         partial-match-index partial-matches get delete-at
 | |
|     ] if ;
 | |
| 
 | |
| : regexp-step ( index char start-state -- )
 | |
|     ! check partial-matches
 | |
|     over \ partial-matches get
 | |
|     [ nip update-partial-match ] assoc-each-with
 | |
| 
 | |
|     ! check new match
 | |
|     at* [
 | |
|         save-partial-match
 | |
|     ] [
 | |
|         2drop
 | |
|     ] if
 | |
|     partial-matches get values [ check-final-state ] each ;
 | |
| 
 | |
| : regexp-match ( text regexp -- seq )
 | |
|     #! text is the haystack
 | |
|     #! regexp is a table describing the needle
 | |
|     H{ } clone \ matches set
 | |
|     H{ } clone \ partial-matches set
 | |
|     dup regexp set
 | |
|     >r dup text set r>
 | |
|     [ find-start-state ] keep
 | |
|     2dup check-trivial-match
 | |
|     get-row
 | |
|     swap [ length ] keep
 | |
|     [ pick regexp-step ] 2each drop
 | |
|     matches get values [
 | |
|         [ partial-match-index ] keep
 | |
|         partial-match-count dupd + text get <slice>
 | |
|     ] map ;
 | |
| 
 | |
| IN: regexp
 | |
| MEMO: make-regexp ( str -- table )
 | |
|     [
 | |
|         regexp-init
 | |
|         parse-regexp
 | |
|         trans-table get nfa>dfa
 | |
|     ] with-scope ;
 | |
| 
 | |
| ! TODO: make compatible with
 | |
| ! http://java.sun.com/j2se/1.4.2/docs/api/java/util/regex/Pattern.html
 | |
| 
 | |
| ! Greedy
 | |
| ! Match the longest possible string, default
 | |
| ! a+
 | |
| 
 | |
| ! Reluctant
 | |
| ! Match on shortest possible string
 | |
| ! / in vi does this (find next)
 | |
| ! a+?
 | |
| 
 | |
| ! Possessive
 | |
| ! Match only when the entire text string matches
 | |
| ! a++
 |