| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | ! Copyright (C) 2008 Doug Coleman. | 
					
						
							|  |  |  | ! See http://factorcode.org/license.txt for BSD license. | 
					
						
							|  |  |  | USING: accessors arrays assocs combinators io io.streams.string | 
					
						
							| 
									
										
										
										
											2008-09-22 11:48:01 -04:00
										 |  |  | kernel math math.parser namespaces qualified sets | 
					
						
							| 
									
										
										
										
											2008-09-18 15:42:16 -04:00
										 |  |  | quotations sequences splitting symbols vectors math.order | 
					
						
							|  |  |  | unicode.categories strings regexp.backend regexp.utils | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  | unicode.case words locals regexp.classes ;
 | 
					
						
							| 
									
										
										
										
											2008-09-18 15:42:16 -04:00
										 |  |  | IN: regexp.parser | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | FROM: math.ranges => [a,b] ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | TUPLE: concatenation seq ; INSTANCE: concatenation node | 
					
						
							|  |  |  | TUPLE: alternation seq ; INSTANCE: alternation node | 
					
						
							| 
									
										
										
										
											2008-08-22 17:45:33 -04:00
										 |  |  | TUPLE: kleene-star term ; INSTANCE: kleene-star node | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ! !!!!!!!! | 
					
						
							|  |  |  | TUPLE: possessive-question term ; INSTANCE: possessive-question node | 
					
						
							|  |  |  | TUPLE: possessive-kleene-star term ; INSTANCE: possessive-kleene-star node | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ! !!!!!!!! | 
					
						
							|  |  |  | TUPLE: reluctant-question term ; INSTANCE: reluctant-question node | 
					
						
							|  |  |  | TUPLE: reluctant-kleene-star term ; INSTANCE: reluctant-kleene-star node | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | TUPLE: negation term ; INSTANCE: negation node | 
					
						
							|  |  |  | TUPLE: constant char ; INSTANCE: constant node | 
					
						
							|  |  |  | TUPLE: range from to ; INSTANCE: range node | 
					
						
							| 
									
										
										
										
											2008-09-22 11:48:01 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | MIXIN: parentheses-group | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | TUPLE: lookahead term ; INSTANCE: lookahead node | 
					
						
							| 
									
										
										
										
											2008-09-22 11:48:01 -04:00
										 |  |  | INSTANCE: lookahead parentheses-group | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | TUPLE: lookbehind term ; INSTANCE: lookbehind node | 
					
						
							| 
									
										
										
										
											2008-09-22 11:48:01 -04:00
										 |  |  | INSTANCE: lookbehind parentheses-group | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | TUPLE: capture-group term ; INSTANCE: capture-group node | 
					
						
							| 
									
										
										
										
											2008-09-22 11:48:01 -04:00
										 |  |  | INSTANCE: capture-group parentheses-group | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | TUPLE: non-capture-group term ; INSTANCE: non-capture-group node | 
					
						
							| 
									
										
										
										
											2008-09-22 11:48:01 -04:00
										 |  |  | INSTANCE: non-capture-group parentheses-group | 
					
						
							| 
									
										
										
										
											2008-08-22 17:49:48 -04:00
										 |  |  | TUPLE: independent-group term ; INSTANCE: independent-group node ! atomic group | 
					
						
							| 
									
										
										
										
											2008-09-22 11:48:01 -04:00
										 |  |  | INSTANCE: independent-group parentheses-group | 
					
						
							|  |  |  | TUPLE: comment-group term ; INSTANCE: comment-group node | 
					
						
							|  |  |  | INSTANCE: comment-group parentheses-group | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | SINGLETON: epsilon INSTANCE: epsilon node | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | TUPLE: option option on? ; INSTANCE: option node | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-11-24 01:18:27 -05:00
										 |  |  | SINGLETONS: unix-lines dotall multiline comments case-insensitive | 
					
						
							|  |  |  | unicode-case reversed-regexp ;
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  | SINGLETONS: beginning-of-character-class end-of-character-class | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | left-parenthesis pipe caret dash ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  | : push1 ( obj -- ) input-stream get stream>> push ;
 | 
					
						
							|  |  |  | : peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ;
 | 
					
						
							|  |  |  | : pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ;
 | 
					
						
							|  |  |  | : drop1 ( -- ) read1 drop ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : stack ( -- obj ) current-regexp get stack>> ;
 | 
					
						
							|  |  |  | : change-whole-stack ( quot -- )
 | 
					
						
							|  |  |  |     current-regexp get
 | 
					
						
							|  |  |  |     [ stack>> swap call ] keep (>>stack) ; inline
 | 
					
						
							|  |  |  | : push-stack ( obj -- ) stack push ;
 | 
					
						
							|  |  |  | : pop-stack ( -- obj ) stack pop ;
 | 
					
						
							|  |  |  | : cut-out ( vector n -- vector' vector ) cut rest ;
 | 
					
						
							|  |  |  | ERROR: cut-stack-error ;
 | 
					
						
							|  |  |  | : cut-stack ( obj vector -- vector' vector )
 | 
					
						
							|  |  |  |     tuck last-index [ cut-stack-error ] unless* cut-out swap ;
 | 
					
						
							| 
									
										
										
										
											2008-08-21 18:12:26 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-22 17:45:33 -04:00
										 |  |  | : <possessive-kleene-star> ( obj -- kleene ) possessive-kleene-star boa ;
 | 
					
						
							|  |  |  | : <reluctant-kleene-star> ( obj -- kleene ) reluctant-kleene-star boa ;
 | 
					
						
							|  |  |  | : <possessive-question> ( obj -- kleene ) possessive-question boa ;
 | 
					
						
							|  |  |  | : <reluctant-question> ( obj -- kleene ) reluctant-question boa ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | : <negation> ( obj -- negation ) negation boa ;
 | 
					
						
							| 
									
										
										
										
											2008-08-21 20:16:56 -04:00
										 |  |  | : <concatenation> ( seq -- concatenation )
 | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  |     >vector [ epsilon ] [ concatenation boa ] if-empty ;
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | : <alternation> ( seq -- alternation ) >vector alternation boa ;
 | 
					
						
							|  |  |  | : <capture-group> ( obj -- capture-group ) capture-group boa ;
 | 
					
						
							|  |  |  | : <kleene-star> ( obj -- kleene-star ) kleene-star boa ;
 | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  | : <constant> ( obj -- constant ) constant boa ;
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : first|concatenation ( seq -- first/concatenation )
 | 
					
						
							|  |  |  |     dup length 1 = [ first ] [ <concatenation> ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : first|alternation ( seq -- first/alternation )
 | 
					
						
							|  |  |  |     dup length 1 = [ first ] [ <alternation> ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-21 18:55:25 -04:00
										 |  |  | : <character-class-range> ( from to -- obj )
 | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  |     2dup <
 | 
					
						
							|  |  |  |     [ character-class-range boa ] [ 2drop unmatchable-class ] if ;
 | 
					
						
							| 
									
										
										
										
											2008-08-21 18:55:25 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | ERROR: unmatched-parentheses ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  | ERROR: unknown-regexp-option option ;
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  | : ch>option ( ch -- singleton )
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  |     { | 
					
						
							|  |  |  |         { CHAR: i [ case-insensitive ] } | 
					
						
							|  |  |  |         { CHAR: d [ unix-lines ] } | 
					
						
							|  |  |  |         { CHAR: m [ multiline ] } | 
					
						
							| 
									
										
										
										
											2008-08-22 19:03:22 -04:00
										 |  |  |         { CHAR: n [ multiline ] } | 
					
						
							| 
									
										
										
										
											2008-08-21 20:16:56 -04:00
										 |  |  |         { CHAR: r [ reversed-regexp ] } | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  |         { CHAR: s [ dotall ] } | 
					
						
							|  |  |  |         { CHAR: u [ unicode-case ] } | 
					
						
							|  |  |  |         { CHAR: x [ comments ] } | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  |         [ unknown-regexp-option ] | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  |     } case ;
 | 
					
						
							| 
									
										
										
										
											2008-08-21 18:12:26 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  | : option>ch ( option -- string )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { case-insensitive [ CHAR: i ] } | 
					
						
							|  |  |  |         { multiline [ CHAR: m ] } | 
					
						
							|  |  |  |         { reversed-regexp [ CHAR: r ] } | 
					
						
							|  |  |  |         { dotall [ CHAR: s ] } | 
					
						
							|  |  |  |         [ unknown-regexp-option ] | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : toggle-option ( ch ? -- )  | 
					
						
							|  |  |  |     [ ch>option ] dip option boa push-stack ;
 | 
					
						
							| 
									
										
										
										
											2008-08-21 18:12:26 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | : (parse-options) ( string ? -- ) [ toggle-option ] curry each ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : parse-options ( string -- )
 | 
					
						
							|  |  |  |     "-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ERROR: bad-special-group string ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-09-22 12:45:36 -04:00
										 |  |  | DEFER: (parse-regexp) | 
					
						
							| 
									
										
										
										
											2008-09-22 11:48:01 -04:00
										 |  |  | : nested-parse-regexp ( token ? -- )
 | 
					
						
							|  |  |  |     [ push-stack (parse-regexp) pop-stack ] dip
 | 
					
						
							| 
									
										
										
										
											2008-11-17 22:42:59 -05:00
										 |  |  |     [ <negation> ] when pop-stack new swap >>term push-stack ;
 | 
					
						
							| 
									
										
										
										
											2008-09-22 11:48:01 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | ! non-capturing groups | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | : (parse-special-group) ( -- )
 | 
					
						
							|  |  |  |     read1 { | 
					
						
							| 
									
										
										
										
											2008-09-22 11:48:01 -04:00
										 |  |  |         { [ dup CHAR: # = ] ! comment | 
					
						
							|  |  |  |             [ drop comment-group f nested-parse-regexp pop-stack drop ] } | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  |         { [ dup CHAR: : = ] | 
					
						
							| 
									
										
										
										
											2008-09-22 11:48:01 -04:00
										 |  |  |             [ drop non-capture-group f nested-parse-regexp ] } | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  |         { [ dup CHAR: = = ] | 
					
						
							| 
									
										
										
										
											2008-09-22 11:48:01 -04:00
										 |  |  |             [ drop lookahead f nested-parse-regexp ] } | 
					
						
							| 
									
										
										
										
											2008-08-27 16:27:54 -04:00
										 |  |  |         { [ dup CHAR: ! = ] | 
					
						
							| 
									
										
										
										
											2008-09-22 11:48:01 -04:00
										 |  |  |             [ drop lookahead t nested-parse-regexp ] } | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  |         { [ dup CHAR: > = ] | 
					
						
							| 
									
										
										
										
											2008-09-22 11:48:01 -04:00
										 |  |  |             [ drop non-capture-group f nested-parse-regexp ] } | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  |         { [ dup CHAR: < = peek1 CHAR: = = and ] | 
					
						
							| 
									
										
										
										
											2008-09-22 11:48:01 -04:00
										 |  |  |             [ drop drop1 lookbehind f nested-parse-regexp ] } | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  |         { [ dup CHAR: < = peek1 CHAR: ! = and ] | 
					
						
							| 
									
										
										
										
											2008-09-22 11:48:01 -04:00
										 |  |  |             [ drop drop1 lookbehind t nested-parse-regexp ] } | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  |         [ | 
					
						
							| 
									
										
										
										
											2008-08-21 18:12:26 -04:00
										 |  |  |             ":)" read-until
 | 
					
						
							|  |  |  |             [ swap prefix ] dip
 | 
					
						
							|  |  |  |             { | 
					
						
							| 
									
										
										
										
											2008-09-22 12:45:36 -04:00
										 |  |  |                 { CHAR: : [ parse-options non-capture-group f nested-parse-regexp ] } | 
					
						
							| 
									
										
										
										
											2008-08-21 18:12:26 -04:00
										 |  |  |                 { CHAR: ) [ parse-options ] } | 
					
						
							|  |  |  |                 [ drop bad-special-group ] | 
					
						
							|  |  |  |             } case
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  |         ] | 
					
						
							|  |  |  |     } cond ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : handle-left-parenthesis ( -- )
 | 
					
						
							|  |  |  |     peek1 CHAR: ? =
 | 
					
						
							| 
									
										
										
										
											2008-08-22 17:45:33 -04:00
										 |  |  |     [ drop1 (parse-special-group) ] | 
					
						
							| 
									
										
										
										
											2008-09-22 11:48:01 -04:00
										 |  |  |     [ capture-group f nested-parse-regexp ] if ;
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  | : handle-dot ( -- ) any-char push-stack ;
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | : handle-pipe ( -- ) pipe push-stack ;
 | 
					
						
							| 
									
										
										
										
											2008-08-22 17:45:33 -04:00
										 |  |  | : (handle-star) ( obj -- kleene-star )
 | 
					
						
							|  |  |  |     peek1 { | 
					
						
							|  |  |  |         { CHAR: + [ drop1 <possessive-kleene-star> ] } | 
					
						
							|  |  |  |         { CHAR: ? [ drop1 <reluctant-kleene-star> ] } | 
					
						
							|  |  |  |         [ drop <kleene-star> ] | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | : handle-star ( -- ) stack pop (handle-star) push-stack ;
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | : handle-question ( -- )
 | 
					
						
							| 
									
										
										
										
											2008-08-22 17:45:33 -04:00
										 |  |  |     stack pop peek1 { | 
					
						
							|  |  |  |         { CHAR: + [ drop1 <possessive-question> ] } | 
					
						
							|  |  |  |         { CHAR: ? [ drop1 <reluctant-question> ] } | 
					
						
							|  |  |  |         [ drop epsilon 2array <alternation> ] | 
					
						
							|  |  |  |     } case push-stack ;
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | : handle-plus ( -- )
 | 
					
						
							| 
									
										
										
										
											2008-08-22 17:45:33 -04:00
										 |  |  |     stack pop dup (handle-star) | 
					
						
							|  |  |  |     2array <concatenation> push-stack ;
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | ERROR: unmatched-brace ;
 | 
					
						
							|  |  |  | : parse-repetition ( -- start finish ? )
 | 
					
						
							|  |  |  |     "}" read-until [ unmatched-brace ] unless
 | 
					
						
							|  |  |  |     [ "," split1 [ string>number ] bi@ ] | 
					
						
							|  |  |  |     [ CHAR: , swap index >boolean ] bi ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : replicate/concatenate ( n obj -- obj' )
 | 
					
						
							|  |  |  |     over zero? [ 2drop epsilon ] | 
					
						
							|  |  |  |     [ <repetition> first|concatenation ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : exactly-n ( n -- )
 | 
					
						
							|  |  |  |     stack pop replicate/concatenate push-stack ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : at-least-n ( n -- )
 | 
					
						
							|  |  |  |     stack pop
 | 
					
						
							|  |  |  |     [ replicate/concatenate ] keep
 | 
					
						
							|  |  |  |     <kleene-star> 2array <concatenation> push-stack ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : at-most-n ( n -- )
 | 
					
						
							|  |  |  |     1+ | 
					
						
							|  |  |  |     stack pop
 | 
					
						
							|  |  |  |     [ replicate/concatenate ] curry map <alternation> push-stack ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : from-m-to-n ( m n -- )
 | 
					
						
							|  |  |  |     [a,b] | 
					
						
							|  |  |  |     stack pop
 | 
					
						
							|  |  |  |     [ replicate/concatenate ] curry map
 | 
					
						
							|  |  |  |     <alternation> push-stack ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ERROR: invalid-range a b ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : handle-left-brace ( -- )
 | 
					
						
							|  |  |  |     parse-repetition | 
					
						
							| 
									
										
										
										
											2008-11-23 00:01:24 -05:00
										 |  |  |     [ 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ ] dip
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  |     [ | 
					
						
							|  |  |  |         2dup and [ from-m-to-n ] | 
					
						
							|  |  |  |         [ [ nip at-most-n ] [ at-least-n ] if* ] if
 | 
					
						
							|  |  |  |     ] [ drop 0 max exactly-n ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  | : handle-front-anchor ( -- ) beginning-of-line push-stack ;
 | 
					
						
							|  |  |  | : handle-back-anchor ( -- ) end-of-line push-stack ;
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | ERROR: bad-character-class obj ;
 | 
					
						
							|  |  |  | ERROR: expected-posix-class ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : parse-posix-class ( -- obj )
 | 
					
						
							|  |  |  |     read1 CHAR: { = [ expected-posix-class ] unless
 | 
					
						
							|  |  |  |     "}" read-until [ bad-character-class ] unless
 | 
					
						
							|  |  |  |     { | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  |         { "Lower" [ letter-class ] } | 
					
						
							|  |  |  |         { "Upper" [ LETTER-class ] } | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  |         { "Alpha" [ Letter-class ] } | 
					
						
							| 
									
										
										
										
											2008-08-21 18:55:25 -04:00
										 |  |  |         { "ASCII" [ ascii-class ] } | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  |         { "Digit" [ digit-class ] } | 
					
						
							|  |  |  |         { "Alnum" [ alpha-class ] } | 
					
						
							|  |  |  |         { "Punct" [ punctuation-class ] } | 
					
						
							|  |  |  |         { "Graph" [ java-printable-class ] } | 
					
						
							|  |  |  |         { "Print" [ java-printable-class ] } | 
					
						
							|  |  |  |         { "Blank" [ non-newline-blank-class ] } | 
					
						
							|  |  |  |         { "Cntrl" [ control-character-class ] } | 
					
						
							|  |  |  |         { "XDigit" [ hex-digit-class ] } | 
					
						
							|  |  |  |         { "Space" [ java-blank-class ] } | 
					
						
							|  |  |  |         ! TODO: unicode-character-class, fallthrough in unicode is bad-char-clss | 
					
						
							|  |  |  |         [ bad-character-class ] | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : parse-octal ( -- n ) 3 read oct> check-octal ;
 | 
					
						
							|  |  |  | : parse-short-hex ( -- n ) 2 read hex> check-hex ;
 | 
					
						
							|  |  |  | : parse-long-hex ( -- n ) 6 read hex> check-hex ;
 | 
					
						
							|  |  |  | : parse-control-character ( -- n ) read1 ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ERROR: bad-escaped-literals seq ;
 | 
					
						
							| 
									
										
										
										
											2008-11-24 01:18:27 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : parse-til-E ( -- obj )
 | 
					
						
							|  |  |  |     "\\E" read-until [ bad-escaped-literals ] unless ;
 | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  | :: (parse-escaped-literals) ( quot: ( obj -- obj' ) -- obj )
 | 
					
						
							|  |  |  |     parse-til-E | 
					
						
							| 
									
										
										
										
											2008-08-22 17:45:33 -04:00
										 |  |  |     drop1 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  |     [ epsilon ] [ | 
					
						
							| 
									
										
										
										
											2008-11-24 01:18:27 -05:00
										 |  |  |         [ quot call <constant> ] V{ } map-as
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  |         first|concatenation | 
					
						
							| 
									
										
										
										
											2008-11-24 01:18:27 -05:00
										 |  |  |     ] if-empty ; inline
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : parse-escaped-literals ( -- obj )
 | 
					
						
							|  |  |  |     [ ] (parse-escaped-literals) ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : lower-case-literals ( -- obj )
 | 
					
						
							|  |  |  |     [ ch>lower ] (parse-escaped-literals) ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : upper-case-literals ( -- obj )
 | 
					
						
							|  |  |  |     [ ch>upper ] (parse-escaped-literals) ;
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : parse-escaped ( -- obj )
 | 
					
						
							|  |  |  |     read1
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { CHAR: t [ CHAR: \t <constant> ] } | 
					
						
							|  |  |  |         { CHAR: n [ CHAR: \n <constant> ] } | 
					
						
							|  |  |  |         { CHAR: r [ CHAR: \r <constant> ] } | 
					
						
							|  |  |  |         { CHAR: f [ HEX: c <constant> ] } | 
					
						
							|  |  |  |         { CHAR: a [ HEX: 7 <constant> ] } | 
					
						
							|  |  |  |         { CHAR: e [ HEX: 1b <constant> ] } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         { CHAR: w [ c-identifier-class ] } | 
					
						
							|  |  |  |         { CHAR: W [ c-identifier-class <negation> ] } | 
					
						
							| 
									
										
										
										
											2008-11-24 01:18:27 -05:00
										 |  |  |         { CHAR: s [ java-blank-class ] } | 
					
						
							|  |  |  |         { CHAR: S [ java-blank-class <negation> ] } | 
					
						
							|  |  |  |         { CHAR: d [ digit-class ] } | 
					
						
							|  |  |  |         { CHAR: D [ digit-class <negation> ] } | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  |         { CHAR: p [ parse-posix-class ] } | 
					
						
							|  |  |  |         { CHAR: P [ parse-posix-class <negation> ] } | 
					
						
							|  |  |  |         { CHAR: x [ parse-short-hex <constant> ] } | 
					
						
							|  |  |  |         { CHAR: u [ parse-long-hex <constant> ] } | 
					
						
							|  |  |  |         { CHAR: 0 [ parse-octal <constant> ] } | 
					
						
							|  |  |  |         { CHAR: c [ parse-control-character ] } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-11-24 01:18:27 -05:00
										 |  |  |         { CHAR: Q [ parse-escaped-literals ] } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         ! { CHAR: b [ word-boundary-class ] } | 
					
						
							|  |  |  |         ! { CHAR: B [ word-boundary-class <negation> ] } | 
					
						
							| 
									
										
										
										
											2008-08-21 18:55:25 -04:00
										 |  |  |         ! { CHAR: A [ handle-beginning-of-input ] } | 
					
						
							| 
									
										
										
										
											2008-11-24 01:18:27 -05:00
										 |  |  |         ! { CHAR: z [ handle-end-of-input ] } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         ! { CHAR: Z [ handle-end-of-input ] } ! plus a final terminator | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         ! m//g mode | 
					
						
							| 
									
										
										
										
											2008-08-21 18:55:25 -04:00
										 |  |  |         ! { CHAR: G [ end of previous match ] } | 
					
						
							| 
									
										
										
										
											2008-09-22 14:37:27 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-11-24 01:18:27 -05:00
										 |  |  |         ! Group capture | 
					
						
							| 
									
										
										
										
											2008-09-22 14:37:27 -04:00
										 |  |  |         ! { CHAR: 1 [ CHAR: 1 <constant> ] } | 
					
						
							|  |  |  |         ! { CHAR: 2 [ CHAR: 2 <constant> ] } | 
					
						
							|  |  |  |         ! { CHAR: 3 [ CHAR: 3 <constant> ] } | 
					
						
							|  |  |  |         ! { CHAR: 4 [ CHAR: 4 <constant> ] } | 
					
						
							|  |  |  |         ! { CHAR: 5 [ CHAR: 5 <constant> ] } | 
					
						
							|  |  |  |         ! { CHAR: 6 [ CHAR: 6 <constant> ] } | 
					
						
							|  |  |  |         ! { CHAR: 7 [ CHAR: 7 <constant> ] } | 
					
						
							|  |  |  |         ! { CHAR: 8 [ CHAR: 8 <constant> ] } | 
					
						
							|  |  |  |         ! { CHAR: 9 [ CHAR: 9 <constant> ] } | 
					
						
							| 
									
										
										
										
											2008-08-21 18:55:25 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-11-24 01:18:27 -05:00
										 |  |  |         ! Perl extensions | 
					
						
							|  |  |  |         ! can't do \l and \u because \u is already a 4-hex | 
					
						
							|  |  |  |         { CHAR: L [ lower-case-literals ] } | 
					
						
							|  |  |  |         { CHAR: U [ upper-case-literals ] } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-11-22 17:50:40 -05:00
										 |  |  |         [ <constant> ] | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : handle-escape ( -- ) parse-escaped push-stack ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : handle-dash ( vector -- vector' )
 | 
					
						
							|  |  |  |     H{ { dash CHAR: - } } substitute ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : character-class>alternation ( seq -- alternation )
 | 
					
						
							|  |  |  |     [ dup number? [ <constant> ] when ] map first|alternation ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : handle-caret ( vector -- vector' )
 | 
					
						
							|  |  |  |     dup [ length 2 >= ] [ first caret eq? ] bi and [ | 
					
						
							|  |  |  |         rest-slice character-class>alternation <negation> | 
					
						
							|  |  |  |     ] [ | 
					
						
							|  |  |  |         character-class>alternation | 
					
						
							|  |  |  |     ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : make-character-class ( -- character-class )
 | 
					
						
							|  |  |  |     [ beginning-of-character-class swap cut-stack ] change-whole-stack | 
					
						
							|  |  |  |     handle-dash handle-caret ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : apply-dash ( -- )
 | 
					
						
							| 
									
										
										
										
											2008-08-21 18:55:25 -04:00
										 |  |  |     stack [ pop3 nip <character-class-range> ] keep push ;
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : apply-dash? ( -- ? )
 | 
					
						
							|  |  |  |     stack dup length 3 >=
 | 
					
						
							|  |  |  |     [ [ length 2 - ] keep nth dash eq? ] [ drop f ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ERROR: empty-negated-character-class ;
 | 
					
						
							|  |  |  | DEFER: handle-left-bracket | 
					
						
							|  |  |  | : (parse-character-class) ( -- )
 | 
					
						
							|  |  |  |     read1 [ empty-negated-character-class ] unless* { | 
					
						
							|  |  |  |         { CHAR: [ [ handle-left-bracket t ] } | 
					
						
							|  |  |  |         { CHAR: ] [ make-character-class push-stack f ] } | 
					
						
							|  |  |  |         { CHAR: - [ dash push-stack t ] } | 
					
						
							|  |  |  |         { CHAR: \ [ parse-escaped push-stack t ] } | 
					
						
							|  |  |  |         [ push-stack apply-dash? [ apply-dash ] when t ] | 
					
						
							|  |  |  |     } case
 | 
					
						
							|  |  |  |     [ (parse-character-class) ] when ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-11-24 13:59:29 -05:00
										 |  |  | : push-constant ( ch -- ) <constant> push-stack ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | : parse-character-class-second ( -- )
 | 
					
						
							|  |  |  |     read1 { | 
					
						
							| 
									
										
										
										
											2008-11-24 13:59:29 -05:00
										 |  |  |         { CHAR: [ [ CHAR: [ push-constant ] } | 
					
						
							|  |  |  |         { CHAR: ] [ CHAR: ] push-constant ] } | 
					
						
							|  |  |  |         { CHAR: - [ CHAR: - push-constant ] } | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  |         [ push1 ] | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : parse-character-class-first ( -- )
 | 
					
						
							|  |  |  |     read1 { | 
					
						
							|  |  |  |         { CHAR: ^ [ caret push-stack parse-character-class-second ] } | 
					
						
							| 
									
										
										
										
											2008-11-24 13:59:29 -05:00
										 |  |  |         { CHAR: [ [ CHAR: [ push-constant ] } | 
					
						
							|  |  |  |         { CHAR: ] [ CHAR: ] push-constant ] } | 
					
						
							|  |  |  |         { CHAR: - [ CHAR: - push-constant ] } | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  |         [ push1 ] | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : handle-left-bracket ( -- )
 | 
					
						
							|  |  |  |     beginning-of-character-class push-stack | 
					
						
							|  |  |  |     parse-character-class-first (parse-character-class) ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : finish-regexp-parse ( stack -- obj )
 | 
					
						
							| 
									
										
										
										
											2008-08-21 20:16:56 -04:00
										 |  |  |     { pipe } split | 
					
						
							|  |  |  |     [ first|concatenation ] map first|alternation ;
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : handle-right-parenthesis ( -- )
 | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  |     stack dup [ parentheses-group "members" word-prop member? ] find-last
 | 
					
						
							|  |  |  |     -rot cut rest
 | 
					
						
							| 
									
										
										
										
											2008-09-22 11:48:01 -04:00
										 |  |  |     [ [ push ] keep current-regexp get (>>stack) ] | 
					
						
							|  |  |  |     [ finish-regexp-parse push-stack ] bi* ;
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-09-22 11:48:01 -04:00
										 |  |  | : parse-regexp-token ( token -- ? )
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  |     { | 
					
						
							| 
									
										
										
										
											2008-11-06 17:53:00 -05:00
										 |  |  |         { CHAR: ( [ handle-left-parenthesis t ] } ! handle (?..) at beginning? | 
					
						
							| 
									
										
										
										
											2008-08-27 16:59:07 -04:00
										 |  |  |         { CHAR: ) [ handle-right-parenthesis f ] } | 
					
						
							| 
									
										
										
										
											2008-11-06 17:53:00 -05:00
										 |  |  |         { CHAR: . [ handle-dot t ] } | 
					
						
							| 
									
										
										
										
											2008-08-27 16:59:07 -04:00
										 |  |  |         { CHAR: | [ handle-pipe t ] } | 
					
						
							|  |  |  |         { CHAR: ? [ handle-question t ] } | 
					
						
							|  |  |  |         { CHAR: * [ handle-star t ] } | 
					
						
							|  |  |  |         { CHAR: + [ handle-plus t ] } | 
					
						
							|  |  |  |         { CHAR: { [ handle-left-brace t ] } | 
					
						
							|  |  |  |         { CHAR: [ [ handle-left-bracket t ] } | 
					
						
							|  |  |  |         { CHAR: \ [ handle-escape t ] } | 
					
						
							| 
									
										
										
										
											2008-11-06 17:53:00 -05:00
										 |  |  |         [ | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  |             dup CHAR: $ = peek1 f = and
 | 
					
						
							|  |  |  |             [ drop handle-back-anchor f ] | 
					
						
							|  |  |  |             [ push-constant t ] if
 | 
					
						
							| 
									
										
										
										
											2008-11-06 17:53:00 -05:00
										 |  |  |         ] | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : (parse-regexp) ( -- )
 | 
					
						
							| 
									
										
										
										
											2008-09-22 11:48:01 -04:00
										 |  |  |     read1 [ parse-regexp-token [ (parse-regexp) ] when ] when* ;
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-11-06 17:53:00 -05:00
										 |  |  | : parse-regexp-beginning ( -- )
 | 
					
						
							|  |  |  |     peek1 CHAR: ^ = [ drop1 handle-front-anchor ] when ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | : parse-regexp ( regexp -- )
 | 
					
						
							|  |  |  |     dup current-regexp [ | 
					
						
							|  |  |  |         raw>> [ | 
					
						
							| 
									
										
										
										
											2008-11-06 17:53:00 -05:00
										 |  |  |             <string-reader> [ | 
					
						
							|  |  |  |                 parse-regexp-beginning (parse-regexp) | 
					
						
							|  |  |  |             ] with-input-stream
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  |         ] unless-empty
 | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  |         current-regexp get [ finish-regexp-parse ] change-stack | 
					
						
							|  |  |  |         dup stack>> >>parse-tree drop
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  |     ] with-variable ;
 |