! Copyright (C) 2007 Chris Double. ! See http://factorcode.org/license.txt for BSD license. USING: kernel words arrays strings math.parser sequences quotations vectors namespaces make math assocs continuations peg peg.parsers unicode.categories multiline splitting accessors effects sequences.deep peg.search combinators.short-circuit lexer io.streams.string stack-checker io combinators parser summary ; FROM: compiler.units => with-compilation-unit ; FROM: vocabs.parser => search ; IN: peg.ebnf : rule ( name word -- parser ) #! Given an EBNF word produced from EBNF: return the EBNF rule "ebnf-parser" word-prop at ; ERROR: no-rule rule parser ; : lookup-rule ( rule parser -- rule' ) 2dup rule [ 2nip ] [ no-rule ] if* ; TUPLE: tokenizer any one many ; : default-tokenizer ( -- tokenizer ) T{ tokenizer f [ any-char ] [ token ] [ [ = ] curry any-char swap semantic ] } ; : parser-tokenizer ( parser -- tokenizer ) [ 1quotation ] keep [ swap [ = ] curry semantic ] curry dup \ tokenizer boa ; : rule-tokenizer ( name word -- tokenizer ) rule parser-tokenizer ; : tokenizer ( -- word ) \ tokenizer get-global [ default-tokenizer ] unless* ; : reset-tokenizer ( -- ) default-tokenizer \ tokenizer set-global ; ERROR: no-tokenizer name ; M: no-tokenizer summary drop "Tokenizer not found" ; SYNTAX: TOKENIZER: scan dup search [ nip ] [ no-tokenizer ] if* execute( -- tokenizer ) \ tokenizer set-global ; TUPLE: ebnf-non-terminal symbol ; TUPLE: ebnf-terminal symbol ; TUPLE: ebnf-foreign word rule ; TUPLE: ebnf-any-character ; TUPLE: ebnf-range pattern ; TUPLE: ebnf-ensure group ; TUPLE: ebnf-ensure-not group ; TUPLE: ebnf-choice options ; TUPLE: ebnf-sequence elements ; TUPLE: ebnf-repeat0 group ; TUPLE: ebnf-repeat1 group ; TUPLE: ebnf-optional group ; TUPLE: ebnf-whitespace group ; TUPLE: ebnf-tokenizer elements ; TUPLE: ebnf-rule symbol elements ; TUPLE: ebnf-action parser code ; TUPLE: ebnf-var parser name ; TUPLE: ebnf-semantic parser code ; TUPLE: ebnf rules ; C: ebnf-non-terminal C: ebnf-terminal C: ebnf-foreign C: ebnf-any-character C: ebnf-range C: ebnf-ensure C: ebnf-ensure-not C: ebnf-choice C: ebnf-sequence C: ebnf-repeat0 C: ebnf-repeat1 C: ebnf-optional C: ebnf-whitespace C: ebnf-tokenizer C: ebnf-rule C: ebnf-action C: ebnf-var C: ebnf-semantic C: ebnf : filter-hidden ( seq -- seq ) #! Remove elements that produce no AST from sequence [ ebnf-ensure-not? not ] filter [ ebnf-ensure? not ] filter ; : syntax ( string -- parser ) #! Parses the string, ignoring white space, and #! does not put the result in the AST. token sp hide ; : syntax-pack ( begin parser end -- parser ) #! Parse 'parser' surrounded by syntax elements #! begin and end. [ syntax ] 2dip syntax pack ; #! Don't want to use 'replace' in an action since replace doesn't infer. #! Do the compilation of the peg at parse time and call (replace). PEG: escaper ( string -- ast ) [ "\\t" token [ drop "\t" ] action , "\\n" token [ drop "\n" ] action , "\\r" token [ drop "\r" ] action , "\\\\" token [ drop "\\" ] action , ] choice* any-char-parser 2array choice repeat0 ; : replace-escapes ( string -- string ) escaper sift [ [ tree-write ] each ] with-string-writer ; : insert-escapes ( string -- string ) [ "\t" token [ drop "\\t" ] action , "\n" token [ drop "\\n" ] action , "\r" token [ drop "\\r" ] action , ] choice* replace ; : 'identifier' ( -- parser ) #! Return a parser that parses an identifer delimited by #! a quotation character. The quotation can be single #! or double quotes. The AST produced is the identifier #! between the quotes. [ [ CHAR: " = not ] satisfy repeat1 "\"" "\"" surrounded-by , [ CHAR: ' = not ] satisfy repeat1 "'" "'" surrounded-by , ] choice* [ >string replace-escapes ] action ; : 'non-terminal' ( -- parser ) #! A non-terminal is the name of another rule. It can #! be any non-blank character except for characters used #! in the EBNF syntax itself. [ { [ blank? ] [ CHAR: " = ] [ CHAR: ' = ] [ CHAR: | = ] [ CHAR: { = ] [ CHAR: } = ] [ CHAR: = = ] [ CHAR: ) = ] [ CHAR: ( = ] [ CHAR: ] = ] [ CHAR: [ = ] [ CHAR: . = ] [ CHAR: ! = ] [ CHAR: & = ] [ CHAR: * = ] [ CHAR: + = ] [ CHAR: ? = ] [ CHAR: : = ] [ CHAR: ~ = ] [ CHAR: < = ] [ CHAR: > = ] } 1|| not ] satisfy repeat1 [ >string ] action ; : 'terminal' ( -- parser ) #! A terminal is an identifier enclosed in quotations #! and it represents the literal value of the identifier. 'identifier' [ ] action ; : 'foreign-name' ( -- parser ) #! Parse a valid foreign parser name [ { [ blank? ] [ CHAR: > = ] } 1|| not ] satisfy repeat1 [ >string ] action ; : 'foreign' ( -- parser ) #! A foreign call is a call to a rule in another ebnf grammar [ "" syntax , ] seq* [ first2 ] action ; : 'any-character' ( -- parser ) #! A parser to match the symbol for any character match. [ CHAR: . = ] satisfy [ drop ] action ; : 'range-parser' ( -- parser ) #! Match the syntax for declaring character ranges [ [ "[" syntax , "[" token ensure-not , ] seq* hide , [ CHAR: ] = not ] satisfy repeat1 , "]" syntax , ] seq* [ first >string ] action ; : ('element') ( -- parser ) #! An element of a rule. It can be a terminal or a #! non-terminal but must not be followed by a "=". #! The latter indicates that it is the beginning of a #! new rule. [ [ [ 'non-terminal' , 'terminal' , 'foreign' , 'range-parser' , 'any-character' , ] choice* [ dup , "*" token hide , ] seq* [ first ] action , [ dup , "+" token hide , ] seq* [ first ] action , [ dup , "?[" token ensure-not , "?" token hide , ] seq* [ first ] action , , ] choice* , [ "=" syntax ensure-not , "=>" syntax ensure , ] choice* , ] seq* [ first ] action ; DEFER: 'action' : 'element' ( -- parser ) [ [ ('element') , ":" syntax , "a-zA-Z" range-pattern repeat1 [ >string ] action , ] seq* [ first2 ] action , ('element') , ] choice* ; DEFER: 'choice' : grouped ( quot suffix -- parser ) #! Parse a group of choices, with a suffix indicating #! the type of group (repeat0, repeat1, etc) and #! an quot that is the action that produces the AST. 2dup [ "(" [ 'choice' sp ] delay ")" syntax-pack swap 2seq [ first ] rot compose action , "{" [ 'choice' sp ] delay "}" syntax-pack swap 2seq [ first ] rot compose action , ] choice* ; : 'group' ( -- parser ) #! A grouping with no suffix. Used for precedence. [ ] [ "*" token sp ensure-not , "+" token sp ensure-not , "?" token sp ensure-not , ] seq* hide grouped ; : 'repeat0' ( -- parser ) [ ] "*" syntax grouped ; : 'repeat1' ( -- parser ) [ ] "+" syntax grouped ; : 'optional' ( -- parser ) [ ] "?" syntax grouped ; : 'factor-code' ( -- parser ) [ "]]" token ensure-not , "]?" token ensure-not , [ drop t ] satisfy , ] seq* repeat0 [ concat >string ] action ; : 'ensure-not' ( -- parser ) #! Parses the '!' syntax to ensure that #! something that matches the following elements do #! not exist in the parse stream. [ "!" syntax , 'group' sp , ] seq* [ first ] action ; : 'ensure' ( -- parser ) #! Parses the '&' syntax to ensure that #! something that matches the following elements does #! exist in the parse stream. [ "&" syntax , 'group' sp , ] seq* [ first ] action ; : ('sequence') ( -- parser ) #! A sequence of terminals and non-terminals, including #! groupings of those. [ [ 'ensure-not' sp , 'ensure' sp , 'element' sp , 'group' sp , 'repeat0' sp , 'repeat1' sp , 'optional' sp , ] choice* [ dup , ":" syntax , "a-zA-Z" range-pattern repeat1 [ >string ] action , ] seq* [ first2 ] action , , ] choice* ; : 'action' ( -- parser ) "[[" 'factor-code' "]]" syntax-pack ; : 'semantic' ( -- parser ) "?[" 'factor-code' "]?" syntax-pack ; : 'sequence' ( -- parser ) #! A sequence of terminals and non-terminals, including #! groupings of those. [ [ ('sequence') , 'action' , ] seq* [ first2 ] action , [ ('sequence') , 'semantic' , ] seq* [ first2 ] action , ('sequence') , ] choice* repeat1 [ dup length 1 = [ first ] [ ] if ] action ; : 'actioned-sequence' ( -- parser ) [ [ 'sequence' , "=>" syntax , 'action' , ] seq* [ first2 ] action , 'sequence' , ] choice* ; : 'choice' ( -- parser ) 'actioned-sequence' sp repeat1 [ dup length 1 = [ first ] [ ] if ] action "|" token sp list-of [ dup length 1 = [ first ] [ ] if ] action ; : 'tokenizer' ( -- parser ) [ "tokenizer" syntax , "=" syntax , ">" token ensure-not , [ "default" token sp , 'choice' , ] choice* , ] seq* [ first ] action ; : 'rule' ( -- parser ) [ "tokenizer" token ensure-not , 'non-terminal' [ symbol>> ] action , "=" syntax , ">" token ensure-not , 'choice' , ] seq* [ first2 ] action ; : 'ebnf' ( -- parser ) [ 'tokenizer' sp , 'rule' sp , ] choice* repeat1 [ ] action ; GENERIC: (transform) ( ast -- parser ) SYMBOL: parser SYMBOL: main SYMBOL: ignore-ws : transform ( ast -- object ) H{ } clone dup dup [ f ignore-ws set parser set swap (transform) main set ] bind ; M: ebnf (transform) ( ast -- parser ) rules>> [ (transform) ] map peek ; M: ebnf-tokenizer (transform) ( ast -- parser ) elements>> dup "default" = [ drop default-tokenizer \ tokenizer set-global any-char ] [ (transform) dup parser-tokenizer \ tokenizer set-global ] if ; ERROR: redefined-rule name ; M: redefined-rule summary name>> "Rule '" "' defined more than once" surround ; M: ebnf-rule (transform) ( ast -- parser ) dup elements>> (transform) [ swap symbol>> dup get parser? [ redefined-rule ] [ set ] if ] keep ; M: ebnf-sequence (transform) ( ast -- parser ) #! If ignore-ws is set then each element of the sequence #! ignores leading whitespace. This is not inherited by #! subelements of the sequence. elements>> [ f ignore-ws [ (transform) ] with-variable ignore-ws get [ sp ] when ] map seq [ dup length 1 = [ first ] when ] action ; M: ebnf-choice (transform) ( ast -- parser ) options>> [ (transform) ] map choice ; M: ebnf-any-character (transform) ( ast -- parser ) drop tokenizer any>> call( -- parser ) ; M: ebnf-range (transform) ( ast -- parser ) pattern>> range-pattern ; : transform-group ( ast -- parser ) #! convert a ast node with groups to a parser for that group group>> (transform) ; M: ebnf-ensure (transform) ( ast -- parser ) transform-group ensure ; M: ebnf-ensure-not (transform) ( ast -- parser ) transform-group ensure-not ; M: ebnf-repeat0 (transform) ( ast -- parser ) transform-group repeat0 ; M: ebnf-repeat1 (transform) ( ast -- parser ) transform-group repeat1 ; M: ebnf-optional (transform) ( ast -- parser ) transform-group optional ; M: ebnf-whitespace (transform) ( ast -- parser ) t ignore-ws [ transform-group ] with-variable ; GENERIC: build-locals ( code ast -- code ) M: ebnf-sequence build-locals ( code ast -- code ) #! Note the need to filter out this ebnf items that #! leave nothing in the AST elements>> filter-hidden dup length 1 = [ first build-locals ] [ dup [ ebnf-var? ] filter empty? [ drop ] [ [ "FROM: locals => [let* ; FROM: sequences => nth ; [let* | " % dup length swap [ dup ebnf-var? [ name>> % " [ " % # " over nth ] " % ] [ 2drop ] if ] 2each " | " % % " nip ]" % ] "" make ] if ] if ; M: ebnf-var build-locals ( code ast -- ) [ "FROM: locals => [let* ; FROM: kernel => dup nip ; [let* | " % name>> % " [ dup ] " % " | " % % " nip ]" % ] "" make ; M: object build-locals ( code ast -- ) drop ; ERROR: bad-effect quot effect ; : check-action-effect ( quot -- quot ) dup infer { { [ dup (( a -- b )) effect<= ] [ drop ] } { [ dup (( -- b )) effect<= ] [ drop [ drop ] prepose ] } [ bad-effect ] } cond ; : ebnf-transform ( ast -- parser quot ) [ parser>> (transform) ] [ code>> insert-escapes ] [ parser>> ] tri build-locals [ string-lines parse-lines ] call( string -- quot ) ; M: ebnf-action (transform) ( ast -- parser ) ebnf-transform check-action-effect action ; M: ebnf-semantic (transform) ( ast -- parser ) ebnf-transform semantic ; M: ebnf-var (transform) ( ast -- parser ) parser>> (transform) ; M: ebnf-terminal (transform) ( ast -- parser ) symbol>> tokenizer one>> call( symbol -- parser ) ; ERROR: ebnf-foreign-not-found name ; M: ebnf-foreign-not-found summary name>> "Foreign word '" "' not found" surround ; M: ebnf-foreign (transform) ( ast -- parser ) dup word>> search [ word>> ebnf-foreign-not-found ] unless* swap rule>> [ main ] unless* over rule [ nip ] [ execute( -- parser ) ] if* ; ERROR: parser-not-found name ; M: ebnf-non-terminal (transform) ( ast -- parser ) symbol>> [ , \ dup , parser get , \ at , [ parser-not-found ] , \ unless* , \ nip , ] [ ] make box ; : transform-ebnf ( string -- object ) 'ebnf' parse transform ; : check-parse-result ( result -- result ) [ dup remaining>> [ blank? ] trim [ [ "Unable to fully parse EBNF. Left to parse was: " % remaining>> % ] "" make throw ] unless-empty ] [ "Could not parse EBNF" throw ] if* ; : parse-ebnf ( string -- hashtable ) 'ebnf' (parse) check-parse-result ast>> transform ; : ebnf>quot ( string -- hashtable quot ) parse-ebnf dup dup parser [ main swap at compile ] with-variable [ compiled-parse ] curry [ with-scope ast>> ] curry ; SYNTAX: " reset-tokenizer parse-multiline-string parse-ebnf main swap at parsed reset-tokenizer ; SYNTAX: [EBNF "EBNF]" reset-tokenizer parse-multiline-string ebnf>quot nip parsed \ call parsed reset-tokenizer ; SYNTAX: EBNF: reset-tokenizer CREATE-WORD dup ";EBNF" parse-multiline-string ebnf>quot swapd (( input -- ast )) define-declared "ebnf-parser" set-word-prop reset-tokenizer ;