! Copyright (C) 2007 Chris Double. ! See http://factorcode.org/license.txt for BSD license. USING: kernel compiler.units words arrays strings math.parser sequences quotations vectors namespaces math assocs continuations peg peg.parsers unicode.categories multiline combinators combinators.lib splitting accessors effects sequences.deep peg.search inference io.streams.string io prettyprint parser ; IN: peg.ebnf : rule ( name word -- parser ) #! Given an EBNF word produced from EBNF: return the EBNF rule "ebnf-parser" word-prop at ; TUPLE: tokenizer any one many ; : default-tokenizer ( -- tokenizer ) T{ tokenizer f [ [ drop t ] satisfy ] [ token ] [ [ = ] curry satisfy ] } ; : parser-tokenizer ( parser -- tokenizer ) 1quotation [ [ = ] curry satisfy ] dup tokenizer boa ; : rule-tokenizer ( name word -- tokenizer ) rule parser-tokenizer ; : tokenizer ( -- word ) \ tokenizer get-global [ default-tokenizer ] unless* ; : reset-tokenizer ( -- ) default-tokenizer \ tokenizer set-global ; : TOKENIZER: scan search [ "Tokenizer not found" throw ] unless* execute \ tokenizer set-global ; parsing TUPLE: ebnf-non-terminal symbol ; TUPLE: ebnf-terminal symbol ; TUPLE: ebnf-foreign word rule ; TUPLE: ebnf-any-character ; TUPLE: ebnf-range pattern ; TUPLE: ebnf-ensure group ; TUPLE: ebnf-ensure-not group ; TUPLE: ebnf-choice options ; TUPLE: ebnf-sequence elements ; TUPLE: ebnf-repeat0 group ; TUPLE: ebnf-repeat1 group ; TUPLE: ebnf-optional group ; TUPLE: ebnf-whitespace group ; TUPLE: ebnf-tokenizer elements ; TUPLE: ebnf-rule symbol elements ; TUPLE: ebnf-action parser code ; TUPLE: ebnf-var parser name ; TUPLE: ebnf-semantic parser code ; TUPLE: ebnf rules ; C: ebnf-non-terminal C: ebnf-terminal C: ebnf-foreign C: ebnf-any-character C: ebnf-range C: ebnf-ensure C: ebnf-ensure-not C: ebnf-choice C: ebnf-sequence C: ebnf-repeat0 C: ebnf-repeat1 C: ebnf-optional C: ebnf-whitespace C: ebnf-tokenizer C: ebnf-rule C: ebnf-action C: ebnf-var C: ebnf-semantic C: ebnf : filter-hidden ( seq -- seq ) #! Remove elements that produce no AST from sequence [ ebnf-ensure-not? not ] filter [ ebnf-ensure? not ] filter ; : syntax ( string -- parser ) #! Parses the string, ignoring white space, and #! does not put the result in the AST. token sp hide ; : syntax-pack ( begin parser end -- parser ) #! Parse 'parser' surrounded by syntax elements #! begin and end. [ syntax ] 2dip syntax pack ; #! Don't want to use 'replace' in an action since replace doesn't infer. #! Do the compilation of the peg at parse time and call (replace). PEG: escaper ( string -- ast ) [ "\\t" token [ drop "\t" ] action , "\\n" token [ drop "\n" ] action , "\\r" token [ drop "\r" ] action , ] choice* any-char-parser 2array choice repeat0 ; : replace-escapes ( string -- string ) escaper sift [ [ tree-write ] each ] with-string-writer ; : insert-escapes ( string -- string ) [ "\t" token [ drop "\\t" ] action , "\n" token [ drop "\\n" ] action , "\r" token [ drop "\\r" ] action , ] choice* replace ; : 'identifier' ( -- parser ) #! Return a parser that parses an identifer delimited by #! a quotation character. The quotation can be single #! or double quotes. The AST produced is the identifier #! between the quotes. [ [ CHAR: " = not ] satisfy repeat1 "\"" "\"" surrounded-by , [ CHAR: ' = not ] satisfy repeat1 "'" "'" surrounded-by , ] choice* [ >string replace-escapes ] action ; : 'non-terminal' ( -- parser ) #! A non-terminal is the name of another rule. It can #! be any non-blank character except for characters used #! in the EBNF syntax itself. [ { [ dup blank? ] [ dup CHAR: " = ] [ dup CHAR: ' = ] [ dup CHAR: | = ] [ dup CHAR: { = ] [ dup CHAR: } = ] [ dup CHAR: = = ] [ dup CHAR: ) = ] [ dup CHAR: ( = ] [ dup CHAR: ] = ] [ dup CHAR: [ = ] [ dup CHAR: . = ] [ dup CHAR: ! = ] [ dup CHAR: & = ] [ dup CHAR: * = ] [ dup CHAR: + = ] [ dup CHAR: ? = ] [ dup CHAR: : = ] [ dup CHAR: ~ = ] [ dup CHAR: < = ] [ dup CHAR: > = ] } 0|| not nip ] satisfy repeat1 [ >string ] action ; : 'terminal' ( -- parser ) #! A terminal is an identifier enclosed in quotations #! and it represents the literal value of the identifier. 'identifier' [ ] action ; : 'foreign-name' ( -- parser ) #! Parse a valid foreign parser name [ { [ dup blank? ] [ dup CHAR: > = ] } 0|| not nip ] satisfy repeat1 [ >string ] action ; : 'foreign' ( -- parser ) #! A foreign call is a call to a rule in another ebnf grammar [ "" syntax , ] seq* [ first2 ] action ; : 'any-character' ( -- parser ) #! A parser to match the symbol for any character match. [ CHAR: . = ] satisfy [ drop ] action ; : 'range-parser' ( -- parser ) #! Match the syntax for declaring character ranges [ [ "[" syntax , "[" token ensure-not , ] seq* hide , [ CHAR: ] = not ] satisfy repeat1 , "]" syntax , ] seq* [ first >string ] action ; : ('element') ( -- parser ) #! An element of a rule. It can be a terminal or a #! non-terminal but must not be followed by a "=". #! The latter indicates that it is the beginning of a #! new rule. [ [ [ 'non-terminal' , 'terminal' , 'foreign' , 'range-parser' , 'any-character' , ] choice* [ dup , "*" token hide , ] seq* [ first ] action , [ dup , "+" token hide , ] seq* [ first ] action , [ dup , "?[" token ensure-not , "?" token hide , ] seq* [ first ] action , , ] choice* , [ "=" syntax ensure-not , "=>" syntax ensure , ] choice* , ] seq* [ first ] action ; DEFER: 'action' : 'element' ( -- parser ) [ [ ('element') , ":" syntax , "a-zA-Z" range-pattern repeat1 [ >string ] action , ] seq* [ first2 ] action , ('element') , ] choice* ; DEFER: 'choice' : grouped ( quot suffix -- parser ) #! Parse a group of choices, with a suffix indicating #! the type of group (repeat0, repeat1, etc) and #! an quot that is the action that produces the AST. 2dup [ "(" [ 'choice' sp ] delay ")" syntax-pack swap 2seq [ first ] rot compose action , "{" [ 'choice' sp ] delay "}" syntax-pack swap 2seq [ first ] rot compose action , ] choice* ; : 'group' ( -- parser ) #! A grouping with no suffix. Used for precedence. [ ] [ "*" token sp ensure-not , "+" token sp ensure-not , "?" token sp ensure-not , ] seq* hide grouped ; : 'repeat0' ( -- parser ) [ ] "*" syntax grouped ; : 'repeat1' ( -- parser ) [ ] "+" syntax grouped ; : 'optional' ( -- parser ) [ ] "?" syntax grouped ; : 'factor-code' ( -- parser ) [ "]]" token ensure-not , "]?" token ensure-not , [ drop t ] satisfy , ] seq* [ first ] action repeat0 [ >string ] action ; : 'ensure-not' ( -- parser ) #! Parses the '!' syntax to ensure that #! something that matches the following elements do #! not exist in the parse stream. [ "!" syntax , 'group' sp , ] seq* [ first ] action ; : 'ensure' ( -- parser ) #! Parses the '&' syntax to ensure that #! something that matches the following elements does #! exist in the parse stream. [ "&" syntax , 'group' sp , ] seq* [ first ] action ; : ('sequence') ( -- parser ) #! A sequence of terminals and non-terminals, including #! groupings of those. [ [ 'ensure-not' sp , 'ensure' sp , 'element' sp , 'group' sp , 'repeat0' sp , 'repeat1' sp , 'optional' sp , ] choice* [ dup , ":" syntax , "a-zA-Z" range-pattern repeat1 [ >string ] action , ] seq* [ first2 ] action , , ] choice* ; : 'action' ( -- parser ) "[[" 'factor-code' "]]" syntax-pack ; : 'semantic' ( -- parser ) "?[" 'factor-code' "]?" syntax-pack ; : 'sequence' ( -- parser ) #! A sequence of terminals and non-terminals, including #! groupings of those. [ [ ('sequence') , 'action' , ] seq* [ first2 ] action , [ ('sequence') , 'semantic' , ] seq* [ first2 ] action , ('sequence') , ] choice* repeat1 [ dup length 1 = [ first ] [ ] if ] action ; : 'actioned-sequence' ( -- parser ) [ [ 'sequence' , "=>" syntax , 'action' , ] seq* [ first2 ] action , 'sequence' , ] choice* ; : 'choice' ( -- parser ) 'actioned-sequence' sp repeat1 [ dup length 1 = [ first ] [ ] if ] action "|" token sp list-of [ dup length 1 = [ first ] [ ] if ] action ; : 'tokenizer' ( -- parser ) [ "tokenizer" syntax , "=" syntax , ">" token ensure-not , [ "default" token sp , 'choice' , ] choice* , ] seq* [ first ] action ; : 'rule' ( -- parser ) [ "tokenizer" token ensure-not , 'non-terminal' [ symbol>> ] action , "=" syntax , ">" token ensure-not , 'choice' , ] seq* [ first2 ] action ; : 'ebnf' ( -- parser ) [ 'tokenizer' sp , 'rule' sp , ] choice* repeat1 [ ] action ; GENERIC: (transform) ( ast -- parser ) SYMBOL: parser SYMBOL: main SYMBOL: ignore-ws : transform ( ast -- object ) H{ } clone dup dup [ f ignore-ws set parser set swap (transform) main set ] bind ; M: ebnf (transform) ( ast -- parser ) rules>> [ (transform) ] map peek ; M: ebnf-tokenizer (transform) ( ast -- parser ) elements>> dup "default" = [ drop default-tokenizer \ tokenizer set-global any-char ] [ (transform) dup parser-tokenizer \ tokenizer set-global ] if ; M: ebnf-rule (transform) ( ast -- parser ) dup elements>> (transform) [ swap symbol>> dup get { [ tuple? ] [ delegate parser? ] } 1&& [ "Rule '" over append "' defined more than once" append throw ] [ set ] if ] keep ; M: ebnf-sequence (transform) ( ast -- parser ) #! If ignore-ws is set then each element of the sequence #! ignores leading whitespace. This is not inherited by #! subelements of the sequence. elements>> [ f ignore-ws [ (transform) ] with-variable ignore-ws get [ sp ] when ] map seq [ dup length 1 = [ first ] when ] action ; M: ebnf-choice (transform) ( ast -- parser ) options>> [ (transform) ] map choice ; M: ebnf-any-character (transform) ( ast -- parser ) drop tokenizer any>> call ; M: ebnf-range (transform) ( ast -- parser ) pattern>> range-pattern ; : transform-group ( ast -- parser ) #! convert a ast node with groups to a parser for that group group>> (transform) ; M: ebnf-ensure (transform) ( ast -- parser ) transform-group ensure ; M: ebnf-ensure-not (transform) ( ast -- parser ) transform-group ensure-not ; M: ebnf-repeat0 (transform) ( ast -- parser ) transform-group repeat0 ; M: ebnf-repeat1 (transform) ( ast -- parser ) transform-group repeat1 ; M: ebnf-optional (transform) ( ast -- parser ) transform-group optional ; M: ebnf-whitespace (transform) ( ast -- parser ) t ignore-ws [ transform-group ] with-variable ; GENERIC: build-locals ( code ast -- code ) M: ebnf-sequence build-locals ( code ast -- code ) #! Note the need to filter out this ebnf items that #! leave nothing in the AST elements>> filter-hidden dup length 1 = [ first build-locals ] [ dup [ ebnf-var? ] filter empty? [ drop ] [ [ "USING: locals sequences ; [let* | " % dup length swap [ dup ebnf-var? [ name>> % " [ " % # " over nth ] " % ] [ 2drop ] if ] 2each " | " % % " nip ]" % ] "" make ] if ] if ; M: ebnf-var build-locals ( code ast -- ) [ "USING: locals kernel ; [let* | " % name>> % " [ dup ] " % " | " % % " nip ]" % ] "" make ; M: object build-locals ( code ast -- ) drop ; : check-action-effect ( quot -- quot ) dup infer { { [ dup (( a -- b )) effect<= ] [ drop ] } { [ dup (( -- b )) effect<= ] [ drop [ drop ] prepose ] } [ [ "Bad effect: " write effect>string write " for quotation " write pprint ] with-string-writer throw ] } cond ; M: ebnf-action (transform) ( ast -- parser ) [ parser>> (transform) ] [ code>> insert-escapes ] [ parser>> ] tri build-locals string-lines parse-lines check-action-effect action ; M: ebnf-semantic (transform) ( ast -- parser ) [ parser>> (transform) ] [ code>> insert-escapes ] [ parser>> ] tri build-locals string-lines parse-lines semantic ; M: ebnf-var (transform) ( ast -- parser ) parser>> (transform) ; M: ebnf-terminal (transform) ( ast -- parser ) symbol>> tokenizer one>> call ; M: ebnf-foreign (transform) ( ast -- parser ) dup word>> search [ "Foreign word '" swap word>> append "' not found" append throw ] unless* swap rule>> [ main ] unless* dupd swap rule [ nip ] [ execute ] if* ; : parser-not-found ( name -- * ) [ "Parser '" % % "' not found." % ] "" make throw ; M: ebnf-non-terminal (transform) ( ast -- parser ) symbol>> [ , \ dup , parser get , \ at , [ parser-not-found ] , \ unless* , \ nip , ] [ ] make box ; : transform-ebnf ( string -- object ) 'ebnf' parse parse-result-ast transform ; : check-parse-result ( result -- result ) dup [ dup parse-result-remaining [ blank? ] trim empty? [ [ "Unable to fully parse EBNF. Left to parse was: " % parse-result-remaining % ] "" make throw ] unless ] [ "Could not parse EBNF" throw ] if ; : ebnf>quot ( string -- hashtable quot ) 'ebnf' parse check-parse-result parse-result-ast transform dup dup parser [ main swap at compile ] with-variable [ compiled-parse ] curry [ with-scope ] curry ; : [EBNF scan { { "+" [ scan-word execute "" swap ] } [ " " append default-tokenizer ] } case \ tokenizer set-global [ "EBNF]" parse-multiline-string ] [ drop "" ] recover append ebnf>quot nip parsed reset-tokenizer ; parsing : EBNF: CREATE-WORD scan { { "+" [ scan-word execute "" swap ] } [ " " append default-tokenizer ] } case \ tokenizer set-global dupd [ ";EBNF" parse-multiline-string ] [ drop "" ] recover append ebnf>quot swapd 1 1 define-declared "ebnf-parser" set-word-prop reset-tokenizer ; parsing