| 
									
										
										
										
											2009-02-16 21:23:00 -05:00
										 |  |  | ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | ! See http://factorcode.org/license.txt for BSD license. | 
					
						
							| 
									
										
										
										
											2009-02-18 13:27:07 -05:00
										 |  |  | USING: peg.ebnf kernel math.parser sequences assocs arrays fry math | 
					
						
							|  |  |  | combinators regexp.classes strings splitting peg locals accessors | 
					
						
							|  |  |  | regexp.ast ;
 | 
					
						
							| 
									
										
										
										
											2008-09-18 15:42:16 -04:00
										 |  |  | IN: regexp.parser | 
					
						
							| 
									
										
										
										
											2009-02-19 17:48:46 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-02-16 21:23:00 -05:00
										 |  |  | : allowed-char? ( ch -- ? )
 | 
					
						
							| 
									
										
										
										
											2009-02-25 13:22:12 -05:00
										 |  |  |     ".()|[*+?$^" member? not ;
 | 
					
						
							| 
									
										
										
										
											2009-02-16 21:23:00 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | ERROR: bad-number ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : ensure-number ( n -- n )
 | 
					
						
							|  |  |  |     [ bad-number ] unless* ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | :: at-error ( key assoc quot: ( key -- replacement ) -- value )
 | 
					
						
							|  |  |  |     key assoc at* [ drop key quot call ] unless ; inline
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ERROR: bad-class name ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : name>class ( name -- class )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { "Lower" letter-class } | 
					
						
							|  |  |  |         { "Upper" LETTER-class } | 
					
						
							|  |  |  |         { "Alpha" Letter-class } | 
					
						
							|  |  |  |         { "ASCII" ascii-class } | 
					
						
							|  |  |  |         { "Digit" digit-class } | 
					
						
							|  |  |  |         { "Alnum" alpha-class } | 
					
						
							|  |  |  |         { "Punct" punctuation-class } | 
					
						
							|  |  |  |         { "Graph" java-printable-class } | 
					
						
							|  |  |  |         { "Print" java-printable-class } | 
					
						
							|  |  |  |         { "Blank" non-newline-blank-class } | 
					
						
							|  |  |  |         { "Cntrl" control-character-class } | 
					
						
							|  |  |  |         { "XDigit" hex-digit-class } | 
					
						
							|  |  |  |         { "Space" java-blank-class } | 
					
						
							|  |  |  |         ! TODO: unicode-character-class | 
					
						
							|  |  |  |     } [ bad-class ] at-error ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : lookup-escape ( char -- ast )
 | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         { CHAR: t [ CHAR: \t ] } | 
					
						
							|  |  |  |         { CHAR: n [ CHAR: \n ] } | 
					
						
							|  |  |  |         { CHAR: r [ CHAR: \r ] } | 
					
						
							|  |  |  |         { CHAR: f [ HEX: c ] } | 
					
						
							|  |  |  |         { CHAR: a [ HEX: 7 ] } | 
					
						
							|  |  |  |         { CHAR: e [ HEX: 1b ] } | 
					
						
							|  |  |  |         { CHAR: \\ [ CHAR: \\ ] } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-02-18 13:27:07 -05:00
										 |  |  |         { CHAR: w [ c-identifier-class <primitive-class> ] } | 
					
						
							| 
									
										
										
										
											2009-02-19 19:28:54 -05:00
										 |  |  |         { CHAR: W [ c-identifier-class <primitive-class> <not-class> ] } | 
					
						
							| 
									
										
										
										
											2009-02-18 13:27:07 -05:00
										 |  |  |         { CHAR: s [ java-blank-class <primitive-class> ] } | 
					
						
							| 
									
										
										
										
											2009-02-19 19:28:54 -05:00
										 |  |  |         { CHAR: S [ java-blank-class <primitive-class> <not-class> ] } | 
					
						
							| 
									
										
										
										
											2009-02-18 13:27:07 -05:00
										 |  |  |         { CHAR: d [ digit-class <primitive-class> ] } | 
					
						
							| 
									
										
										
										
											2009-02-19 19:28:54 -05:00
										 |  |  |         { CHAR: D [ digit-class <primitive-class> <not-class> ] } | 
					
						
							| 
									
										
										
										
											2009-02-16 21:23:00 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-02-25 13:22:12 -05:00
										 |  |  |         { CHAR: z [ end-of-input <tagged-epsilon> ] } | 
					
						
							| 
									
										
										
										
											2009-03-05 17:34:04 -05:00
										 |  |  |         { CHAR: Z [ end-of-file <tagged-epsilon> ] } | 
					
						
							| 
									
										
										
										
											2009-02-25 13:22:12 -05:00
										 |  |  |         { CHAR: A [ beginning-of-input <tagged-epsilon> ] } | 
					
						
							| 
									
										
										
										
											2009-02-16 21:23:00 -05:00
										 |  |  |         [ ] | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-02-16 21:23:00 -05:00
! Table relating option flag characters to option singletons.
! It is read in both directions: ch>option looks up by key and
! option>ch looks up by value.
! NOTE(review): multiline is listed under two keys (m and n), so the
! reverse lookup for multiline has two candidate answers; which one
! option>ch returns is not determined by anything visible here.
! NOTE(review): the Options rule of the grammar in this file only
! accepts the letters idmsux, so n and r appear to be reachable only
! through string>options - confirm that this is intended.
: options-assoc ( -- assoc )
    H{
        { CHAR: i case-insensitive }
        { CHAR: d unix-lines }
        { CHAR: m multiline }
        { CHAR: n multiline }
        { CHAR: r reversed-regexp }
        { CHAR: s dotall }
        { CHAR: u unicode-case }
        { CHAR: x comments }
    } ;
 | 
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-02-16 21:23:00 -05:00
										 |  |  | : ch>option ( ch -- singleton )
 | 
					
						
							|  |  |  |     options-assoc at ;
 | 
					
						
							| 
									
										
										
										
											2008-11-24 23:17:47 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-02-16 21:23:00 -05:00
										 |  |  | : option>ch ( option -- string )
 | 
					
						
							|  |  |  |     options-assoc value-at ;
 | 
					
						
							| 
									
										
										
										
											2008-08-21 18:12:26 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-02-16 21:23:00 -05:00
										 |  |  | : parse-options ( on off -- options )
 | 
					
						
							| 
									
										
										
										
											2009-02-18 13:27:07 -05:00
										 |  |  |     [ [ ch>option ] { } map-as ] bi@ <options> ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : string>options ( string -- options )
 | 
					
						
							|  |  |  |     "-" split1 parse-options ;
 | 
					
						
							|  |  |  |   | 
					
						
							|  |  |  | : options>string ( options -- string )
 | 
					
						
							|  |  |  |     [ on>> ] [ off>> ] bi
 | 
					
						
							|  |  |  |     [ [ option>ch ] map ] bi@
 | 
					
						
							| 
									
										
										
										
											2009-02-20 19:45:24 -05:00
										 |  |  |     [ "-" glue ] unless-empty
 | 
					
						
							| 
									
										
										
										
											2009-02-18 13:27:07 -05:00
										 |  |  |     "" like ;
 | 
					
						
							|  |  |  | 
 | 
					
						
! TODO: add syntax for various parenthesized things,
!       add greedy and nongreedy forms of matching
! (once it's all implemented)
					
						
							| 
									
										
										
										
											2008-08-18 12:24:18 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-02-18 13:27:07 -05:00
! parse-regexp: grammar turning regexp source text into an AST.
!
! Main is "Alternation End", and End is !(.), which only succeeds when
! no input remains - so the whole string has to be consumed.
!
! Rule overview (outermost first):
!   Alternation    Concatenations separated by "|"; a lone
!                  Concatenation is returned as is, without an
!                  <alternation> wrapper.
!   Concatenation  zero or more Repeated; f results are removed with
!                  sift before <concatenation> is built.
!   Repeated       an Element followed by {...}, ?, *, + or one of
!                  the two-character forms ??, *?, +?. The
!                  two-character forms are listed before the
!                  one-character ones and build exactly the same
!                  nodes (<maybe>, <star>, <plus>).
!   Times          body of a {...} repetition after the "{":
!                  ",n}" "n,}" "n}" "n,m}"; a bare "}" is bad-number.
!   Number         everything up to "," or "}", converted with
!                  string>number and checked by ensure-number.
!   Element        "(" Parenthized ")", "[" CharClass "]", "." or a
!                  Character.
!   Parenthized    "?:" group, "?on-off:" option group, "?#" comment
!                  (yields f), "?~" negation, "?=" "?!" lookahead,
!                  "?<=" "?<!" lookbehind, or a plain Alternation.
!   CharClass      optional "^" followed by Ranges, passed to
!                  char-class. StartRange (first item) is built from
!                  AnyRangeCharacter, which does not exclude "]";
!                  later items use RangeCharacter, which does.
!   Character      an EscapeSequence, "$", "^", or any character
!                  accepted by allowed-char?.
!   Escape         what may follow a backslash: p{Name} / P{Name}
!                  named classes, Q...\E quoting, u / x / 0 numeric
!                  escapes, otherwise lookup-escape.
!
! NOTE(review): the alternatives of each rule rely on being tried in
! the order written (e.g. "??" before "?"); do not reorder them
! without checking peg.ebnf's choice semantics.
EBNF: parse-regexp

CharacterInBracket = !("}") Character

QuotedCharacter = !("\\E") .

Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> ]]
       | "P{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> <negation> ]]
       | "Q" QuotedCharacter*:s "\\E" => [[ s <concatenation> ]]
       | "u" Character:a Character:b Character:c Character:d
            => [[ { a b c d } hex> ensure-number ]]
       | "x" Character:a Character:b
            => [[ { a b } hex> ensure-number ]]
       | "0" Character:a Character:b Character:c
            => [[ { a b c } oct> ensure-number ]]
       | . => [[ lookup-escape ]]

EscapeSequence = "\\" Escape:e => [[ e ]]

Character = EscapeSequence
          | "$" => [[ $ <tagged-epsilon> ]]
          | "^" => [[ ^ <tagged-epsilon> ]]
          | . ?[ allowed-char? ]?

AnyRangeCharacter = EscapeSequence | .

RangeCharacter = !("]") AnyRangeCharacter

Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
      | RangeCharacter

StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
           | AnyRangeCharacter

Ranges = StartRange:s Range*:r => [[ r s prefix ]]

CharClass = "^"?:n Ranges:e => [[ e n char-class ]]

Options = [idmsux]*

Parenthized = "?:" Alternation:a => [[ a ]]
            | "?" Options:on "-"? Options:off ":" Alternation:a
                => [[ a on off parse-options <with-options> ]]
            | "?#" [^)]* => [[ f ]]
            | "?~" Alternation:a => [[ a <negation> ]]
            | "?=" Alternation:a => [[ a t <lookahead> <tagged-epsilon> ]]
            | "?!" Alternation:a => [[ a f <lookahead> <tagged-epsilon> ]]
            | "?<=" Alternation:a => [[ a t <lookbehind> <tagged-epsilon> ]]
            | "?<!" Alternation:a => [[ a f <lookbehind> <tagged-epsilon> ]]
            | Alternation

Element = "(" Parenthized:p ")" => [[ p ]]
        | "[" CharClass:r "]" => [[ r ]]
        | ".":d => [[ any-char <primitive-class> ]]
        | Character

Number = (!(","|"}").)* => [[ string>number ensure-number ]]

Times = "," Number:n "}" => [[ 0 n <from-to> ]]
      | Number:n ",}" => [[ n <at-least> ]]
      | Number:n "}" => [[ n n <from-to> ]]
      | "}" => [[ bad-number ]]
      | Number:n "," Number:m "}" => [[ n m <from-to> ]]

Repeated = Element:e "{" Times:t => [[ e t <times> ]]
         | Element:e "??" => [[ e <maybe> ]]
         | Element:e "*?" => [[ e <star> ]]
         | Element:e "+?" => [[ e <plus> ]]
         | Element:e "?" => [[ e <maybe> ]]
         | Element:e "*" => [[ e <star> ]]
         | Element:e "+" => [[ e <plus> ]]
         | Element

Concatenation = Repeated*:r => [[ r sift <concatenation> ]]

Alternation = Concatenation:c ("|" Concatenation)*:a
                => [[ a empty? [ c ] [ a values c prefix <alternation> ] if ]]

End = !(.)

Main = Alternation End
;EBNF