| 
									
										
										
										
											2008-08-26 21:24:14 -04:00
										 |  |  | ! Copyright (C) 2008 Doug Coleman. | 
					
						
							|  |  |  | ! See http://factorcode.org/license.txt for BSD license. | 
					
						
							| 
									
										
										
										
											2008-09-18 15:42:16 -04:00
										 |  |  | USING: accessors arrays assocs grouping kernel regexp.backend | 
					
						
							| 
									
										
										
										
											2008-11-24 13:59:29 -05:00
										 |  |  | locals math namespaces regexp.parser sequences fry quotations | 
					
						
							|  |  |  | math.order math.ranges vectors unicode.categories regexp.utils | 
					
						
							| 
									
										
										
										
											2009-01-08 20:07:46 -05:00
										 |  |  | regexp.transition-tables words sets regexp.classes unicode.case.private ;
 | 
					
						
							|  |  |  | ! This uses unicode.case.private for ch>upper and ch>lower | 
					
						
							|  |  |  | ! but case-insensitive matching should be done by case-folding everything | 
					
						
							|  |  |  | ! before processing starts | 
					
						
							| 
									
										
										
										
											2008-09-18 15:42:16 -04:00
										 |  |  | IN: regexp.nfa | 
					
						
							| 
									
										
										
										
											2008-08-26 21:24:14 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-20 12:53:50 -05:00
										 |  |  | ERROR: feature-is-broken feature ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-26 21:24:14 -04:00
										 |  |  | SYMBOL: negation-mode | 
					
						
							|  |  |  | : negated? ( -- ? ) negation-mode get 0 or odd? ;  | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | SINGLETON: eps | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-28 14:45:04 -04:00
										 |  |  | MIXIN: traversal-flag | 
					
						
							|  |  |  | SINGLETON: lookahead-on INSTANCE: lookahead-on traversal-flag | 
					
						
							|  |  |  | SINGLETON: lookahead-off INSTANCE: lookahead-off traversal-flag | 
					
						
							| 
									
										
										
										
											2008-09-22 14:37:27 -04:00
										 |  |  | SINGLETON: lookbehind-on INSTANCE: lookbehind-on traversal-flag | 
					
						
							|  |  |  | SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag | 
					
						
							| 
									
										
										
										
											2008-08-28 14:45:04 -04:00
										 |  |  | SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag | 
					
						
							|  |  |  | SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag | 
					
						
							| 
									
										
										
										
											2008-11-24 01:18:27 -05:00
										 |  |  | SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag | 
					
						
							|  |  |  | SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag | 
					
						
							|  |  |  | SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag | 
					
						
							| 
									
										
										
										
											2008-08-28 14:45:04 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  | : options ( -- obj ) current-regexp get options>> ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : option? ( obj -- ? ) options key? ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : option-on ( obj -- ) options conjoin ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : option-off ( obj -- ) options delete-at ;
 | 
					
						
							| 
									
										
										
										
											2008-11-24 13:59:29 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-26 21:24:14 -04:00
										 |  |  | : next-state ( regexp -- state )
 | 
					
						
							|  |  |  |     [ state>> ] [ [ 1+ ] change-state drop ] bi ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : set-start-state ( regexp -- )
 | 
					
						
							|  |  |  |     dup stack>> [ | 
					
						
							|  |  |  |         drop
 | 
					
						
							|  |  |  |     ] [ | 
					
						
							|  |  |  |         [ nfa-table>> ] [ pop first ] bi* >>start-state drop
 | 
					
						
							|  |  |  |     ] if-empty ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | GENERIC: nfa-node ( node -- )
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | :: add-simple-entry ( obj class -- )
 | 
					
						
							|  |  |  |     [let* | regexp [ current-regexp get ] | 
					
						
							|  |  |  |             s0 [ regexp next-state ] | 
					
						
							|  |  |  |             s1 [ regexp next-state ] | 
					
						
							|  |  |  |             stack [ regexp stack>> ] | 
					
						
							|  |  |  |             table [ regexp nfa-table>> ] | | 
					
						
							|  |  |  |         negated? [ | 
					
						
							| 
									
										
										
										
											2008-08-27 17:22:34 -04:00
										 |  |  |             s0 f obj class make-transition table add-transition | 
					
						
							| 
									
										
										
										
											2008-08-26 21:24:14 -04:00
										 |  |  |             s0 s1 <default-transition> table add-transition | 
					
						
							|  |  |  |         ] [ | 
					
						
							| 
									
										
										
										
											2008-08-27 17:22:34 -04:00
										 |  |  |             s0 s1 obj class make-transition table add-transition | 
					
						
							| 
									
										
										
										
											2008-08-26 21:24:14 -04:00
										 |  |  |         ] if
 | 
					
						
							|  |  |  |         s0 s1 2array stack push
 | 
					
						
							|  |  |  |         t s1 table final-states>> set-at ] ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-28 14:45:04 -04:00
										 |  |  | : add-traversal-flag ( flag -- )
 | 
					
						
							|  |  |  |     stack peek second
 | 
					
						
							| 
									
										
										
										
											2008-08-28 23:08:54 -04:00
										 |  |  |     current-regexp get nfa-traversal-flags>> push-at ;
 | 
					
						
							| 
									
										
										
										
											2008-08-28 14:45:04 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-26 21:24:14 -04:00
										 |  |  | :: concatenate-nodes ( -- )
 | 
					
						
							|  |  |  |     [let* | regexp [ current-regexp get ] | 
					
						
							|  |  |  |             stack [ regexp stack>> ] | 
					
						
							|  |  |  |             table [ regexp nfa-table>> ] | 
					
						
							|  |  |  |             s2 [ stack peek first ] | 
					
						
							|  |  |  |             s3 [ stack pop second ] | 
					
						
							|  |  |  |             s0 [ stack peek first ] | 
					
						
							|  |  |  |             s1 [ stack pop second ] | | 
					
						
							|  |  |  |         s1 s2 eps <literal-transition> table add-transition | 
					
						
							|  |  |  |         s1 table final-states>> delete-at
 | 
					
						
							|  |  |  |         s0 s3 2array stack push ] ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | :: alternate-nodes ( -- )
 | 
					
						
							|  |  |  |     [let* | regexp [ current-regexp get ] | 
					
						
							|  |  |  |             stack [ regexp stack>> ] | 
					
						
							|  |  |  |             table [ regexp nfa-table>> ] | 
					
						
							|  |  |  |             s2 [ stack peek first ] | 
					
						
							|  |  |  |             s3 [ stack pop second ] | 
					
						
							|  |  |  |             s0 [ stack peek first ] | 
					
						
							|  |  |  |             s1 [ stack pop second ] | 
					
						
							|  |  |  |             s4 [ regexp next-state ] | 
					
						
							|  |  |  |             s5 [ regexp next-state ] | | 
					
						
							|  |  |  |         s4 s0 eps <literal-transition> table add-transition | 
					
						
							|  |  |  |         s4 s2 eps <literal-transition> table add-transition | 
					
						
							|  |  |  |         s1 s5 eps <literal-transition> table add-transition | 
					
						
							|  |  |  |         s3 s5 eps <literal-transition> table add-transition | 
					
						
							|  |  |  |         s1 table final-states>> delete-at
 | 
					
						
							|  |  |  |         s3 table final-states>> delete-at
 | 
					
						
							|  |  |  |         t s5 table final-states>> set-at
 | 
					
						
							|  |  |  |         s4 s5 2array stack push ] ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: kleene-star nfa-node ( node -- )
 | 
					
						
							|  |  |  |     term>> nfa-node | 
					
						
							|  |  |  |     [let* | regexp [ current-regexp get ] | 
					
						
							|  |  |  |             stack [ regexp stack>> ] | 
					
						
							|  |  |  |             s0 [ stack peek first ] | 
					
						
							|  |  |  |             s1 [ stack pop second ] | 
					
						
							|  |  |  |             s2 [ regexp next-state ] | 
					
						
							|  |  |  |             s3 [ regexp next-state ] | 
					
						
							|  |  |  |             table [ regexp nfa-table>> ] | | 
					
						
							|  |  |  |         s1 table final-states>> delete-at
 | 
					
						
							|  |  |  |         t s3 table final-states>> set-at
 | 
					
						
							|  |  |  |         s1 s0 eps <literal-transition> table add-transition | 
					
						
							|  |  |  |         s2 s0 eps <literal-transition> table add-transition | 
					
						
							|  |  |  |         s2 s3 eps <literal-transition> table add-transition | 
					
						
							|  |  |  |         s1 s3 eps <literal-transition> table add-transition | 
					
						
							|  |  |  |         s2 s3 2array stack push ] ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: concatenation nfa-node ( node -- )
 | 
					
						
							|  |  |  |     seq>> | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  |     reversed-regexp option? [ <reversed> ] when
 | 
					
						
							| 
									
										
										
										
											2008-08-26 21:24:14 -04:00
										 |  |  |     [ [ nfa-node ] each ] | 
					
						
							|  |  |  |     [ length 1- [ concatenate-nodes ] times ] bi ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: alternation nfa-node ( node -- )
 | 
					
						
							|  |  |  |     seq>> | 
					
						
							|  |  |  |     [ [ nfa-node ] each ] | 
					
						
							|  |  |  |     [ length 1- [ alternate-nodes ] times ] bi ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: constant nfa-node ( node -- )
 | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  |     case-insensitive option? [ | 
					
						
							|  |  |  |         dup char>> [ ch>lower ] [ ch>upper ] bi
 | 
					
						
							|  |  |  |         2dup = [ | 
					
						
							|  |  |  |             2drop
 | 
					
						
							|  |  |  |             char>> literal-transition add-simple-entry | 
					
						
							|  |  |  |         ] [ | 
					
						
							|  |  |  |             [ literal-transition add-simple-entry ] bi@
 | 
					
						
							|  |  |  |             alternate-nodes drop
 | 
					
						
							|  |  |  |         ] if
 | 
					
						
							|  |  |  |     ] [ | 
					
						
							|  |  |  |         char>> literal-transition add-simple-entry | 
					
						
							|  |  |  |     ] if ;
 | 
					
						
							| 
									
										
										
										
											2008-08-26 21:24:14 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | M: epsilon nfa-node ( node -- )
 | 
					
						
							|  |  |  |     drop eps literal-transition add-simple-entry ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  | M: word nfa-node ( node -- ) class-transition add-simple-entry ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: any-char nfa-node ( node -- )
 | 
					
						
							|  |  |  |     [ dotall option? ] dip any-char-no-nl ?
 | 
					
						
							| 
									
										
										
										
											2008-08-26 21:24:14 -04:00
										 |  |  |     class-transition add-simple-entry ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  | ! M: beginning-of-text nfa-node ( node -- ) ; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : choose-letter-class ( node -- node' )
 | 
					
						
							|  |  |  |     case-insensitive option? Letter-class rot ? ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: letter-class nfa-node ( node -- )
 | 
					
						
							|  |  |  |     choose-letter-class class-transition add-simple-entry ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: LETTER-class nfa-node ( node -- )
 | 
					
						
							|  |  |  |     choose-letter-class class-transition add-simple-entry ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-26 21:24:14 -04:00
										 |  |  | M: character-class-range nfa-node ( node -- )
 | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  |     case-insensitive option? [ | 
					
						
							| 
									
										
										
										
											2009-01-08 20:07:46 -05:00
										 |  |  |         ! This should be implemented for Unicode by case-folding | 
					
						
							|  |  |  |         ! the input and all strings in the regexp. | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  |         dup [ from>> ] [ to>> ] bi
 | 
					
						
							|  |  |  |         2dup [ Letter? ] bi@ and [ | 
					
						
							|  |  |  |             rot drop
 | 
					
						
							|  |  |  |             [ [ ch>lower ] bi@ character-class-range boa ] | 
					
						
							|  |  |  |             [ [ ch>upper ] bi@ character-class-range boa ] 2bi  | 
					
						
							|  |  |  |             [ class-transition add-simple-entry ] bi@
 | 
					
						
							|  |  |  |             alternate-nodes | 
					
						
							|  |  |  |         ] [ | 
					
						
							|  |  |  |             2drop
 | 
					
						
							|  |  |  |             class-transition add-simple-entry | 
					
						
							|  |  |  |         ] if
 | 
					
						
							|  |  |  |     ] [ | 
					
						
							|  |  |  |         class-transition add-simple-entry | 
					
						
							|  |  |  |     ] if ;
 | 
					
						
							| 
									
										
										
										
											2008-08-26 21:24:14 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | M: capture-group nfa-node ( node -- )
 | 
					
						
							| 
									
										
										
										
											2009-01-20 12:53:50 -05:00
										 |  |  |     "capture-groups" feature-is-broken | 
					
						
							| 
									
										
										
										
											2008-09-22 21:09:42 -04:00
										 |  |  |     eps literal-transition add-simple-entry | 
					
						
							|  |  |  |     capture-group-on add-traversal-flag | 
					
						
							|  |  |  |     term>> nfa-node | 
					
						
							|  |  |  |     eps literal-transition add-simple-entry | 
					
						
							|  |  |  |     capture-group-off add-traversal-flag | 
					
						
							|  |  |  |     2 [ concatenate-nodes ] times ;
 | 
					
						
							| 
									
										
										
										
											2008-08-26 21:24:14 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-09-18 15:42:16 -04:00
										 |  |  | ! xyzzy | 
					
						
							|  |  |  | M: non-capture-group nfa-node ( node -- )
 | 
					
						
							|  |  |  |     term>> nfa-node ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | M: reluctant-kleene-star nfa-node ( node -- )
 | 
					
						
							|  |  |  |     term>> <kleene-star> nfa-node ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-26 21:24:14 -04:00
										 |  |  | M: negation nfa-node ( node -- )
 | 
					
						
							|  |  |  |     negation-mode inc
 | 
					
						
							|  |  |  |     term>> nfa-node  | 
					
						
							|  |  |  |     negation-mode dec ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-28 14:45:04 -04:00
										 |  |  | M: lookahead nfa-node ( node -- )
 | 
					
						
							| 
									
										
										
										
											2009-01-20 12:53:50 -05:00
										 |  |  |     "lookahead" feature-is-broken | 
					
						
							| 
									
										
										
										
											2008-08-28 14:45:04 -04:00
										 |  |  |     eps literal-transition add-simple-entry | 
					
						
							|  |  |  |     lookahead-on add-traversal-flag | 
					
						
							|  |  |  |     term>> nfa-node | 
					
						
							|  |  |  |     eps literal-transition add-simple-entry | 
					
						
							|  |  |  |     lookahead-off add-traversal-flag | 
					
						
							|  |  |  |     2 [ concatenate-nodes ] times ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-09-22 14:37:27 -04:00
										 |  |  | M: lookbehind nfa-node ( node -- )
 | 
					
						
							| 
									
										
										
										
											2009-01-20 12:53:50 -05:00
										 |  |  |     "lookbehind" feature-is-broken | 
					
						
							| 
									
										
										
										
											2008-09-22 14:37:27 -04:00
										 |  |  |     eps literal-transition add-simple-entry | 
					
						
							|  |  |  |     lookbehind-on add-traversal-flag | 
					
						
							|  |  |  |     term>> nfa-node | 
					
						
							|  |  |  |     eps literal-transition add-simple-entry | 
					
						
							|  |  |  |     lookbehind-off add-traversal-flag | 
					
						
							|  |  |  |     2 [ concatenate-nodes ] times ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  | M: option nfa-node ( node -- )
 | 
					
						
							|  |  |  |     [ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if
 | 
					
						
							|  |  |  |     eps literal-transition add-simple-entry ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-26 21:24:14 -04:00
										 |  |  | : construct-nfa ( regexp -- )
 | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         reset-regexp | 
					
						
							|  |  |  |         negation-mode off
 | 
					
						
							|  |  |  |         [ current-regexp set ] | 
					
						
							|  |  |  |         [ parse-tree>> nfa-node ] | 
					
						
							|  |  |  |         [ set-start-state ] tri
 | 
					
						
							|  |  |  |     ] with-scope ;
 |