factor/basis/regexp/parser/parser.factor

! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: peg.ebnf kernel math.parser sequences assocs arrays
combinators regexp.classes strings splitting peg locals ;
IN: regexp.parser

TUPLE: range from to ;
TUPLE: char-class ranges ;
TUPLE: primitive-class class ;
TUPLE: not-char-class ranges ;
TUPLE: not-primitive-class class ;
TUPLE: from-to n m ;
TUPLE: at-least n ;
TUPLE: up-to n ;
TUPLE: exactly n ;
TUPLE: times expression times ;
TUPLE: concatenation seq ;
TUPLE: alternation seq ;
TUPLE: maybe term ;
TUPLE: star term ;
TUPLE: plus term ;
TUPLE: with-options tree options ;
TUPLE: ast ^? $? tree ;
SINGLETON: any-char

: allowed-char? ( ch -- ? )
    ".()|[*+?" member? not ;

ERROR: bad-number ;

: ensure-number ( n -- n )
    [ bad-number ] unless* ;

:: at-error ( key assoc quot: ( key -- replacement ) -- value )
    key assoc at* [ drop key quot call ] unless ; inline

ERROR: bad-class name ;

: name>class ( name -- class )
    {
        { "Lower" letter-class }
        { "Upper" LETTER-class }
        { "Alpha" Letter-class }
        { "ASCII" ascii-class }
        { "Digit" digit-class }
        { "Alnum" alpha-class }
        { "Punct" punctuation-class }
        { "Graph" java-printable-class }
        { "Print" java-printable-class }
        { "Blank" non-newline-blank-class }
        { "Cntrl" control-character-class }
        { "XDigit" hex-digit-class }
        { "Space" java-blank-class }
        ! TODO: unicode-character-class
    } [ bad-class ] at-error ;

: lookup-escape ( char -- ast )
    {
        { CHAR: t [ CHAR: \t ] }
        { CHAR: n [ CHAR: \n ] }
        { CHAR: r [ CHAR: \r ] }
        { CHAR: f [ HEX: c ] }
        { CHAR: a [ HEX: 7 ] }
        { CHAR: e [ HEX: 1b ] }
        { CHAR: \\ [ CHAR: \\ ] }

        { CHAR: w [ c-identifier-class primitive-class boa ] }
        { CHAR: W [ c-identifier-class not-primitive-class boa ] }
        { CHAR: s [ java-blank-class primitive-class boa ] }
        { CHAR: S [ java-blank-class not-primitive-class boa ] }
        { CHAR: d [ digit-class primitive-class boa ] }
        { CHAR: D [ digit-class not-primitive-class boa ] }

        [ ]
    } case ;

TUPLE: options on off ;

SINGLETONS: unix-lines dotall multiline comments case-insensitive
unicode-case reversed-regexp ;

: options-assoc ( -- assoc )
    H{
        { CHAR: i case-insensitive }
        { CHAR: d unix-lines }
        { CHAR: m multiline }
        { CHAR: n multiline }
        { CHAR: r reversed-regexp }
        { CHAR: s dotall }
        { CHAR: u unicode-case }
        { CHAR: x comments }
    } ;

: ch>option ( ch -- singleton )
    options-assoc at ;

: option>ch ( option -- string )
    options-assoc value-at ;

: parse-options ( on off -- options )
    [ [ ch>option ] map ] bi@ options boa ;

! TODO: make range syntax better (negation, and, etc),
!       add syntax for various parenthized things,
!       add greedy and nongreedy forms of matching
! (once it's all implemented)

EBNF: (parse-regexp)

CharacterInBracket = !("}") Character

Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class primitive-class boa ]]
       | "P{" CharacterInBracket*:s "}" => [[ s >string name>class not-primitive-class boa ]]
       | "u" Character:a Character:b Character:c Character:d
            => [[ { a b c d } hex> ensure-number ]]
       | "x" Character:a Character:b
            => [[ { a b } hex> ensure-number ]]
       | "0" Character:a Character:b Character:c
            => [[ { a b c } oct> ensure-number ]]
       | . => [[ lookup-escape ]]

Character = "\\" Escape:e => [[ e ]]
          | . ?[ allowed-char? ]?

AnyRangeCharacter = Character | "["

RangeCharacter = !("]") AnyRangeCharacter

Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]]
      | RangeCharacter

StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]]
           | AnyRangeCharacter

Ranges = StartRange:s Range*:r => [[ r s prefix ]]

CharClass = "^" Ranges:e => [[ e not-char-class boa ]]
          | Ranges:e => [[ e char-class boa ]]

Options = [idmsux]*

Parenthized = "?:" Alternation:a => [[ a ]]
            | "?" Options:on "-"? Options:off ":" Alternation:a
                => [[ a on off parse-options with-options boa ]]
            | "?#" [^)]* => [[ ignore ]]
            | Alternation

Element = "(" Parenthized:p ")" => [[ p ]]
        | "[" CharClass:r "]" => [[ r ]]
        | ".":d => [[ any-char ]]
        | Character

Number = (!(","|"}").)* => [[ string>number ensure-number ]]

Times = "," Number:n "}" => [[ n up-to boa ]]
      | Number:n ",}" => [[ n at-least boa ]]
      | Number:n "}" => [[ n exactly boa ]]
      | "}" => [[ bad-number ]]
      | Number:n "," Number:m "}" => [[ n m from-to boa ]]

Repeated = Element:e "{" Times:t => [[ e t times boa ]]
         | Element:e "?" => [[ e maybe boa ]]
         | Element:e "*" => [[ e star boa ]]
         | Element:e "+" => [[ e plus boa ]]
         | Element

Concatenation = Repeated*:r => [[ r concatenation boa ]]

Alternation = Concatenation:c ("|" Concatenation)*:a
                => [[ a empty? [ c ] [ a values c prefix alternation boa ] if ]]

End = !(.)

Main = Alternation End
;EBNF

: parse-regexp ( string -- regexp )
    ! Hack because I want $ allowable in regexps,
    ! but with special behavior at the end
    ! This fails if the regexp is stupid, though...
    dup first CHAR: ^ = tuck [ rest ] when
    dup peek CHAR: $ = tuck [ but-last ] when
    (parse-regexp) ast boa ;
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.`
put the regexp library into git it still needs these fixes: empty \Q\E, lookahead, group captures, unicode character classes, reluctant matching if possible 2008-08-18 12:24:18 -04:00			`! See http://factorcode.org/license.txt for BSD license.`
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`USING: peg.ebnf kernel math.parser sequences assocs arrays`
			`combinators regexp.classes strings splitting peg locals ;`
move regexp2 to regexp, bug fixes, more tests 2008-09-18 15:42:16 -04:00			`IN: regexp.parser`
put the regexp library into git it still needs these fixes: empty \Q\E, lookahead, group captures, unicode character classes, reluctant matching if possible 2008-08-18 12:24:18 -04:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`TUPLE: range from to ;`
			`TUPLE: char-class ranges ;`
			`TUPLE: primitive-class class ;`
			`TUPLE: not-char-class ranges ;`
			`TUPLE: not-primitive-class class ;`
			`TUPLE: from-to n m ;`
			`TUPLE: at-least n ;`
			`TUPLE: up-to n ;`
			`TUPLE: exactly n ;`
			`TUPLE: times expression times ;`
			`TUPLE: concatenation seq ;`
			`TUPLE: alternation seq ;`
			`TUPLE: maybe term ;`
			`TUPLE: star term ;`
			`TUPLE: plus term ;`
			`TUPLE: with-options tree options ;`
			`TUPLE: ast ^? $? tree ;`
			`SINGLETON: any-char`

			`: allowed-char? ( ch -- ? )`
			`".()\|[*+?" member? not ;`

			`ERROR: bad-number ;`

			`: ensure-number ( n -- n )`
			`[ bad-number ] unless* ;`

			`:: at-error ( key assoc quot: ( key -- replacement ) -- value )`
			`key assoc at* [ drop key quot call ] unless ; inline`

			`ERROR: bad-class name ;`

			`: name>class ( name -- class )`
			`{`
			`{ "Lower" letter-class }`
			`{ "Upper" LETTER-class }`
			`{ "Alpha" Letter-class }`
			`{ "ASCII" ascii-class }`
			`{ "Digit" digit-class }`
			`{ "Alnum" alpha-class }`
			`{ "Punct" punctuation-class }`
			`{ "Graph" java-printable-class }`
			`{ "Print" java-printable-class }`
			`{ "Blank" non-newline-blank-class }`
			`{ "Cntrl" control-character-class }`
			`{ "XDigit" hex-digit-class }`
			`{ "Space" java-blank-class }`
			`! TODO: unicode-character-class`
			`} [ bad-class ] at-error ;`

			`: lookup-escape ( char -- ast )`
			`{`
			`{ CHAR: t [ CHAR: \t ] }`
			`{ CHAR: n [ CHAR: \n ] }`
			`{ CHAR: r [ CHAR: \r ] }`
			`{ CHAR: f [ HEX: c ] }`
			`{ CHAR: a [ HEX: 7 ] }`
			`{ CHAR: e [ HEX: 1b ] }`
			`{ CHAR: \\ [ CHAR: \\ ] }`

			`{ CHAR: w [ c-identifier-class primitive-class boa ] }`
			`{ CHAR: W [ c-identifier-class not-primitive-class boa ] }`
			`{ CHAR: s [ java-blank-class primitive-class boa ] }`
			`{ CHAR: S [ java-blank-class not-primitive-class boa ] }`
			`{ CHAR: d [ digit-class primitive-class boa ] }`
			`{ CHAR: D [ digit-class not-primitive-class boa ] }`

			`[ ]`
			`} case ;`
move things around. the parser no longer adjusts nodes based on options, instead opting for nfa to handle it (case-insensitive, multiline, dotall, reversed..) 2008-11-24 23:17:47 -05:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`TUPLE: options on off ;`
move things around. the parser no longer adjusts nodes based on options, instead opting for nfa to handle it (case-insensitive, multiline, dotall, reversed..) 2008-11-24 23:17:47 -05:00
better literal syntax for regexps. support ^ and $ (not in multiline mode yet) but not yet in multiline mode. support for \A and \Z \z is next. removed some crud from the parser, and added more commented out unit tests... 2008-11-24 01:18:27 -05:00			`SINGLETONS: unix-lines dotall multiline comments case-insensitive`
			`unicode-case reversed-regexp ;`
put the regexp library into git it still needs these fixes: empty \Q\E, lookahead, group captures, unicode character classes, reluctant matching if possible 2008-08-18 12:24:18 -04:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`: options-assoc ( -- assoc )`
			`H{`
			`{ CHAR: i case-insensitive }`
			`{ CHAR: d unix-lines }`
			`{ CHAR: m multiline }`
			`{ CHAR: n multiline }`
			`{ CHAR: r reversed-regexp }`
			`{ CHAR: s dotall }`
			`{ CHAR: u unicode-case }`
			`{ CHAR: x comments }`
			`} ;`
put the regexp library into git it still needs these fixes: empty \Q\E, lookahead, group captures, unicode character classes, reluctant matching if possible 2008-08-18 12:24:18 -04:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`: ch>option ( ch -- singleton )`
			`options-assoc at ;`
move things around. the parser no longer adjusts nodes based on options, instead opting for nfa to handle it (case-insensitive, multiline, dotall, reversed..) 2008-11-24 23:17:47 -05:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`: option>ch ( option -- string )`
			`options-assoc value-at ;`
case-insensitive for constants 2008-08-21 18:12:26 -04:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`: parse-options ( on off -- options )`
			`[ [ ch>option ] map ] bi@ options boa ;`
read1 drop -> drop1 add possessive/reluctant parsing 2008-08-22 17:45:33 -04:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`! TODO: make range syntax better (negation, and, etc),`
			`! add syntax for various parenthized things,`
			`! add greedy and nongreedy forms of matching`
			`! (once it's all implemented)`
put the regexp library into git it still needs these fixes: empty \Q\E, lookahead, group captures, unicode character classes, reluctant matching if possible 2008-08-18 12:24:18 -04:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`EBNF: (parse-regexp)`
put the regexp library into git it still needs these fixes: empty \Q\E, lookahead, group captures, unicode character classes, reluctant matching if possible 2008-08-18 12:24:18 -04:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`CharacterInBracket = !("}") Character`
put the regexp library into git it still needs these fixes: empty \Q\E, lookahead, group captures, unicode character classes, reluctant matching if possible 2008-08-18 12:24:18 -04:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class primitive-class boa ]]`
			`\| "P{" CharacterInBracket*:s "}" => [[ s >string name>class not-primitive-class boa ]]`
			`\| "u" Character:a Character:b Character:c Character:d`
			`=> [[ { a b c d } hex> ensure-number ]]`
			`\| "x" Character:a Character:b`
			`=> [[ { a b } hex> ensure-number ]]`
			`\| "0" Character:a Character:b Character:c`
			`=> [[ { a b c } oct> ensure-number ]]`
			`\| . => [[ lookup-escape ]]`
case insensitive works 2008-08-21 18:55:25 -04:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`Character = "\\" Escape:e => [[ e ]]`
			`\| . ?[ allowed-char? ]?`
put the regexp library into git it still needs these fixes: empty \Q\E, lookahead, group captures, unicode character classes, reluctant matching if possible 2008-08-18 12:24:18 -04:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`AnyRangeCharacter = Character \| "["`
put the regexp library into git it still needs these fixes: empty \Q\E, lookahead, group captures, unicode character classes, reluctant matching if possible 2008-08-18 12:24:18 -04:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`RangeCharacter = !("]") AnyRangeCharacter`
case-insensitive for constants 2008-08-21 18:12:26 -04:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]]`
			`\| RangeCharacter`
move things around. the parser no longer adjusts nodes based on options, instead opting for nfa to handle it (case-insensitive, multiline, dotall, reversed..) 2008-11-24 23:17:47 -05:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]]`
			`\| AnyRangeCharacter`
put the regexp library into git it still needs these fixes: empty \Q\E, lookahead, group captures, unicode character classes, reluctant matching if possible 2008-08-18 12:24:18 -04:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`Ranges = StartRange:s Range*:r => [[ r s prefix ]]`
put the regexp library into git it still needs these fixes: empty \Q\E, lookahead, group captures, unicode character classes, reluctant matching if possible 2008-08-18 12:24:18 -04:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`CharClass = "^" Ranges:e => [[ e not-char-class boa ]]`
			`\| Ranges:e => [[ e char-class boa ]]`
better literal syntax for regexps. support ^ and $ (not in multiline mode yet) but not yet in multiline mode. support for \A and \Z \z is next. removed some crud from the parser, and added more commented out unit tests... 2008-11-24 01:18:27 -05:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`Options = [idmsux]*`
better literal syntax for regexps. support ^ and $ (not in multiline mode yet) but not yet in multiline mode. support for \A and \Z \z is next. removed some crud from the parser, and added more commented out unit tests... 2008-11-24 01:18:27 -05:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`Parenthized = "?:" Alternation:a => [[ a ]]`
			`\| "?" Options:on "-"? Options:off ":" Alternation:a`
			`=> [[ a on off parse-options with-options boa ]]`
			`\| "?#" [^)]* => [[ ignore ]]`
			`\| Alternation`
better literal syntax for regexps. support ^ and $ (not in multiline mode yet) but not yet in multiline mode. support for \A and \Z \z is next. removed some crud from the parser, and added more commented out unit tests... 2008-11-24 01:18:27 -05:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`Element = "(" Parenthized:p ")" => [[ p ]]`
			`\| "[" CharClass:r "]" => [[ r ]]`
			`\| ".":d => [[ any-char ]]`
			`\| Character`
better literal syntax for regexps. support ^ and $ (not in multiline mode yet) but not yet in multiline mode. support for \A and \Z \z is next. removed some crud from the parser, and added more commented out unit tests... 2008-11-24 01:18:27 -05:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`Number = (!(","\|"}").)* => [[ string>number ensure-number ]]`
put the regexp library into git it still needs these fixes: empty \Q\E, lookahead, group captures, unicode character classes, reluctant matching if possible 2008-08-18 12:24:18 -04:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`Times = "," Number:n "}" => [[ n up-to boa ]]`
			`\| Number:n ",}" => [[ n at-least boa ]]`
			`\| Number:n "}" => [[ n exactly boa ]]`
			`\| "}" => [[ bad-number ]]`
			`\| Number:n "," Number:m "}" => [[ n m from-to boa ]]`
put the regexp library into git it still needs these fixes: empty \Q\E, lookahead, group captures, unicode character classes, reluctant matching if possible 2008-08-18 12:24:18 -04:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`Repeated = Element:e "{" Times:t => [[ e t times boa ]]`
			`\| Element:e "?" => [[ e maybe boa ]]`
			`\| Element:e "*" => [[ e star boa ]]`
			`\| Element:e "+" => [[ e plus boa ]]`
			`\| Element`
put the regexp library into git it still needs these fixes: empty \Q\E, lookahead, group captures, unicode character classes, reluctant matching if possible 2008-08-18 12:24:18 -04:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`Concatenation = Repeated*:r => [[ r concatenation boa ]]`
put the regexp library into git it still needs these fixes: empty \Q\E, lookahead, group captures, unicode character classes, reluctant matching if possible 2008-08-18 12:24:18 -04:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`Alternation = Concatenation:c ("\|" Concatenation)*:a`
			`=> [[ a empty? [ c ] [ a values c prefix alternation boa ] if ]]`
put the regexp library into git it still needs these fixes: empty \Q\E, lookahead, group captures, unicode character classes, reluctant matching if possible 2008-08-18 12:24:18 -04:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`End = !(.)`
put the regexp library into git it still needs these fixes: empty \Q\E, lookahead, group captures, unicode character classes, reluctant matching if possible 2008-08-18 12:24:18 -04:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`Main = Alternation End`
			`;EBNF`
put the regexp library into git it still needs these fixes: empty \Q\E, lookahead, group captures, unicode character classes, reluctant matching if possible 2008-08-18 12:24:18 -04:00
Rewriting regexp parser 2009-02-16 21:23:00 -05:00			`: parse-regexp ( string -- regexp )`
			`! Hack because I want $ allowable in regexps,`
			`! but with special behavior at the end`
			`! This fails if the regexp is stupid, though...`
			`dup first CHAR: ^ = tuck [ rest ] when`
			`dup peek CHAR: $ = tuck [ but-last ] when`
			`(parse-regexp) ast boa ;`