factor/basis/regexp/parser/parser.factor

177 lines
5.6 KiB
Factor
Raw Normal View History

2009-02-16 21:23:00 -05:00
! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
2009-02-18 13:27:07 -05:00
USING: peg.ebnf kernel math.parser sequences assocs arrays fry math
combinators regexp.classes strings splitting peg locals accessors
regexp.ast ;
IN: regexp.parser
2009-02-16 21:23:00 -05:00
! True when ch may appear as an ordinary literal character, i.e. it is
! not one of the metacharacters . ( ) | [ * + ? $ ^
: allowed-char? ( ch -- ? )
    ".()|[*+?$^" index not ;
2009-02-16 21:23:00 -05:00
! Error thrown when a value that has to be a number turns out to be f.
ERROR: bad-number ;

! Leaves n untouched when it is not f; throws bad-number when it is f.
: ensure-number ( n -- n )
    dup [ bad-number ] unless ;
! Looks key up in assoc. When the key is present its value is returned;
! otherwise quot is called with the key and whatever it leaves becomes
! the result (quot may also throw).
: at-error ( key assoc quot: ( key -- replacement ) -- value )
    [ dupd at* ] dip swap          ! -- key value quot ?
    [ drop nip ] [ nip call ] if ; inline
! Error thrown by name>class; carries the name that was not recognised.
ERROR: bad-class name ;

! Maps a class name to the character-class word listed for it in the
! table below. A name with no entry throws bad-class with that name.
: name>class ( name -- class )
    dup {
        { "Lower" letter-class }
        { "Upper" LETTER-class }
        { "Alpha" Letter-class }
        { "ASCII" ascii-class }
        { "Digit" digit-class }
        { "Alnum" alpha-class }
        { "Punct" punctuation-class }
        { "Graph" java-printable-class }
        { "Print" java-printable-class }
        { "Blank" non-newline-blank-class }
        { "Cntrl" control-character-class }
        { "XDigit" hex-digit-class }
        { "Space" java-blank-class }
        ! TODO: unicode-character-class
    } at*                          ! -- name class/f ?
    [ nip ] [ drop bad-class ] if ;
! Gives the meaning of the character that follows a backslash:
!   - a control-character code for t n r f a e,
!   - a backslash for an escaped backslash,
!   - a primitive character class (wrapped in <not-class> for the
!     upper-case variants) for w W s S d D,
!   - an anchor wrapped in <tagged-epsilon> for z Z A.
! Any other character falls through to the default branch and is
! returned unchanged, so it stands for itself.
: lookup-escape ( char -- ast )
    {
        ! Control characters
        { CHAR: t [ CHAR: \t ] }
        { CHAR: n [ CHAR: \n ] }
        { CHAR: r [ CHAR: \r ] }
        { CHAR: f [ HEX: c ] }
        { CHAR: a [ HEX: 7 ] }
        { CHAR: e [ HEX: 1b ] }
        { CHAR: \\ [ CHAR: \\ ] }

        ! Character classes and their single-character complements
        { CHAR: w [ c-identifier-class <primitive-class> ] }
        { CHAR: W [ c-identifier-class <primitive-class> <not-class> ] }
        { CHAR: s [ java-blank-class <primitive-class> ] }
        { CHAR: S [ java-blank-class <primitive-class> <not-class> ] }
        { CHAR: d [ digit-class <primitive-class> ] }
        { CHAR: D [ digit-class <primitive-class> <not-class> ] }

        ! Anchors
        { CHAR: z [ end-of-input <tagged-epsilon> ] }
        { CHAR: Z [ end-of-file <tagged-epsilon> ] }
        { CHAR: A [ beginning-of-input <tagged-epsilon> ] }

        ! Default: the character itself is left on the stack
        [ ]
    } case ;
2009-02-16 21:23:00 -05:00
! Table of regexp option flags: each key is a flag character and each
! value is the option it selects.
! NOTE(review): both `m` and `n` map to multiline, so a reverse lookup of
! multiline by value has two candidate keys. Entry order is left as-is
! because it may influence which one such a lookup finds.
: options-assoc ( -- assoc )
H{
{ CHAR: i case-insensitive }
{ CHAR: d unix-lines }
{ CHAR: m multiline }
{ CHAR: n multiline }
{ CHAR: r reversed-regexp }
{ CHAR: s dotall }
{ CHAR: u unicode-case }
{ CHAR: x comments }
} ;
2009-02-16 21:23:00 -05:00
! Looks up the option selected by flag character ch in options-assoc.
! A character with no entry yields f.
: ch>option ( ch -- singleton )
    options-assoc at* drop ;
2009-02-16 21:23:00 -05:00
! Reverse of ch>option: finds a key of options-assoc whose value is the
! given option. The keys of that table are CHAR: literals, so the result
! is a character code (or f when nothing matches), even though the
! stack effect names it `string`.
: option>ch ( option -- string )
    options-assoc value-at* drop ;
2008-08-21 18:12:26 -04:00
2009-02-16 21:23:00 -05:00
! Builds an options object from two sequences of flag characters.
! Each sequence (flags to switch on, flags to switch off) is mapped
! through ch>option into an array, and the two arrays are handed to
! <options>.
: parse-options ( on off -- options )
    [ [ ch>option ] { } map-as ] bi@ <options> ;
! Parses option text of the form "on-off": the part before the first
! "-" lists flags to switch on, the part after it flags to switch off.
! Both halves are converted with ch>option and wrapped by <options>
! (the same steps parse-options performs).
: string>options ( string -- options )
    "-" split1
    [ [ ch>option ] { } map-as ] bi@
    <options> ;
! Renders an options object back to flag text: the characters for the
! on>> options come first; only when off>> is non-empty are they
! followed by "-" and the characters for the off>> options. The result
! is coerced to a string with `"" like`.
: options>string ( options -- string )
    [ on>> ] [ off>> ] bi
    [ [ option>ch ] map ] bi@
    [ "-" glue ] unless-empty
    "" like ;
! TODO: add syntax for various parenthesized things,
! add greedy and nongreedy forms of matching
! (once it's all implemented)
2009-02-18 13:27:07 -05:00
! PEG grammar for regular-expression source text. The generated word
! parse-regexp starts at the Main rule (an Alternation that must reach
! end of input); the rule actions build the result from the regexp.ast
! and regexp.classes constructors and from the helper words defined
! earlier in this vocabulary (name>class, lookup-escape, ensure-number,
! allowed-char?, parse-options).
!
! \P{Name} denotes the complement of a character class, so it is built
! with <not-class>, exactly as lookup-escape builds \W, \S and \D.
! <negation> is used only for whole sub-expressions: (?~...), (?!...)
! and (?<!...).
EBNF: parse-regexp

CharacterInBracket = !("}") Character

QuotedCharacter = !("\\E") .

Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> ]]
       | "P{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> <not-class> ]]
       | "Q" QuotedCharacter*:s "\\E" => [[ s <concatenation> ]]
       | "u" Character:a Character:b Character:c Character:d
            => [[ { a b c d } hex> ensure-number ]]
       | "x" Character:a Character:b
            => [[ { a b } hex> ensure-number ]]
       | "0" Character:a Character:b Character:c
            => [[ { a b c } oct> ensure-number ]]
       | . => [[ lookup-escape ]]

EscapeSequence = "\\" Escape:e => [[ e ]]

Character = EscapeSequence
          | "$" => [[ $ <tagged-epsilon> ]]
          | "^" => [[ ^ <tagged-epsilon> ]]
          | . ?[ allowed-char? ]?

AnyRangeCharacter = EscapeSequence | .

RangeCharacter = !("]") AnyRangeCharacter

Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
      | RangeCharacter

StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
           | AnyRangeCharacter

Ranges = StartRange:s Range*:r => [[ r s prefix ]]

CharClass = "^"?:n Ranges:e => [[ e n char-class ]]

Options = [idmsux]*

Parenthized = "?:" Alternation:a => [[ a ]]
            | "?" Options:on "-"? Options:off ":" Alternation:a
                => [[ a on off parse-options <with-options> ]]
            | "?#" [^)]* => [[ f ]]
            | "?~" Alternation:a => [[ a <negation> ]]
            | "?=" Alternation:a => [[ a <lookahead> <tagged-epsilon> ]]
            | "?!" Alternation:a => [[ a <negation> <lookahead> <tagged-epsilon> ]]
            | "?<=" Alternation:a => [[ a <lookbehind> <tagged-epsilon> ]]
            | "?<!" Alternation:a => [[ a <negation> <lookbehind> <tagged-epsilon> ]]
            | Alternation

Element = "(" Parenthized:p ")" => [[ p ]]
        | "[" CharClass:r "]" => [[ r ]]
        | ".":d => [[ any-char <primitive-class> ]]
        | Character

Number = (!(","|"}").)* => [[ string>number ensure-number ]]

Times = "," Number:n "}" => [[ 0 n <from-to> ]]
      | Number:n ",}" => [[ n <at-least> ]]
      | Number:n "}" => [[ n n <from-to> ]]
      | "}" => [[ bad-number ]]
      | Number:n "," Number:m "}" => [[ n m <from-to> ]]

Repeated = Element:e "{" Times:t => [[ e t <times> ]]
         | Element:e "*+" => [[ e <possessive-star> ]]
         | Element:e "++" => [[ e <possessive-plus> ]]
         | Element:e "?" => [[ e <maybe> ]]
         | Element:e "*" => [[ e <star> ]]
         | Element:e "+" => [[ e <plus> ]]
         | Element

Concatenation = Repeated*:r => [[ r sift <concatenation> ]]

Alternation = Concatenation:c ("|" Concatenation)*:a
                => [[ a empty? [ c ] [ a values c prefix <alternation> ] if ]]

End = !(.)

Main = Alternation End

;EBNF