542 lines
15 KiB
Factor
542 lines
15 KiB
Factor
! Copyright (C) 2007 Chris Double.
|
|
! See http://factorcode.org/license.txt for BSD license.
|
|
USING: kernel compiler.units words arrays strings math.parser sequences
|
|
quotations vectors namespaces math assocs continuations peg
|
|
peg.parsers unicode.categories multiline combinators combinators.lib
|
|
splitting accessors effects sequences.deep peg.search inference
|
|
io.streams.string io prettyprint parser ;
|
|
IN: peg.ebnf
|
|
|
|
: rule ( name word -- parser )
|
|
#! Given an EBNF word produced from EBNF: return the EBNF rule
|
|
"ebnf-parser" word-prop at ;
|
|
|
|
TUPLE: tokenizer any one many ;
|
|
|
|
: default-tokenizer ( -- tokenizer )
|
|
T{ tokenizer f
|
|
[ [ drop t ] satisfy ]
|
|
[ token ]
|
|
[ [ = ] curry satisfy ]
|
|
} ;
|
|
|
|
: parser-tokenizer ( parser -- tokenizer )
|
|
1quotation [ [ = ] curry satisfy ] dup tokenizer boa ;
|
|
|
|
: rule-tokenizer ( name word -- tokenizer )
|
|
rule parser-tokenizer ;
|
|
|
|
: tokenizer ( -- word )
|
|
\ tokenizer get-global [ default-tokenizer ] unless* ;
|
|
|
|
: reset-tokenizer ( -- )
|
|
default-tokenizer \ tokenizer set-global ;
|
|
|
|
: TOKENIZER:
|
|
scan search [ "Tokenizer not found" throw ] unless*
|
|
execute \ tokenizer set-global ; parsing
|
|
|
|
TUPLE: ebnf-non-terminal symbol ;
|
|
TUPLE: ebnf-terminal symbol ;
|
|
TUPLE: ebnf-foreign word rule ;
|
|
TUPLE: ebnf-any-character ;
|
|
TUPLE: ebnf-range pattern ;
|
|
TUPLE: ebnf-ensure group ;
|
|
TUPLE: ebnf-ensure-not group ;
|
|
TUPLE: ebnf-choice options ;
|
|
TUPLE: ebnf-sequence elements ;
|
|
TUPLE: ebnf-repeat0 group ;
|
|
TUPLE: ebnf-repeat1 group ;
|
|
TUPLE: ebnf-optional group ;
|
|
TUPLE: ebnf-whitespace group ;
|
|
TUPLE: ebnf-tokenizer elements ;
|
|
TUPLE: ebnf-rule symbol elements ;
|
|
TUPLE: ebnf-action parser code ;
|
|
TUPLE: ebnf-var parser name ;
|
|
TUPLE: ebnf-semantic parser code ;
|
|
TUPLE: ebnf rules ;
|
|
|
|
C: <ebnf-non-terminal> ebnf-non-terminal
|
|
C: <ebnf-terminal> ebnf-terminal
|
|
C: <ebnf-foreign> ebnf-foreign
|
|
C: <ebnf-any-character> ebnf-any-character
|
|
C: <ebnf-range> ebnf-range
|
|
C: <ebnf-ensure> ebnf-ensure
|
|
C: <ebnf-ensure-not> ebnf-ensure-not
|
|
C: <ebnf-choice> ebnf-choice
|
|
C: <ebnf-sequence> ebnf-sequence
|
|
C: <ebnf-repeat0> ebnf-repeat0
|
|
C: <ebnf-repeat1> ebnf-repeat1
|
|
C: <ebnf-optional> ebnf-optional
|
|
C: <ebnf-whitespace> ebnf-whitespace
|
|
C: <ebnf-tokenizer> ebnf-tokenizer
|
|
C: <ebnf-rule> ebnf-rule
|
|
C: <ebnf-action> ebnf-action
|
|
C: <ebnf-var> ebnf-var
|
|
C: <ebnf-semantic> ebnf-semantic
|
|
C: <ebnf> ebnf
|
|
|
|
: filter-hidden ( seq -- seq )
|
|
#! Remove elements that produce no AST from sequence
|
|
[ ebnf-ensure-not? not ] filter [ ebnf-ensure? not ] filter ;
|
|
|
|
: syntax ( string -- parser )
|
|
#! Parses the string, ignoring white space, and
|
|
#! does not put the result in the AST.
|
|
token sp hide ;
|
|
|
|
: syntax-pack ( begin parser end -- parser )
|
|
#! Parse 'parser' surrounded by syntax elements
|
|
#! begin and end.
|
|
[ syntax ] 2dip syntax pack ;
|
|
|
|
#! Don't want to use 'replace' in an action since replace doesn't infer.
|
|
#! Do the compilation of the peg at parse time and call (replace).
|
|
PEG: escaper ( string -- ast )
|
|
[
|
|
"\\t" token [ drop "\t" ] action ,
|
|
"\\n" token [ drop "\n" ] action ,
|
|
"\\r" token [ drop "\r" ] action ,
|
|
] choice* any-char-parser 2array choice repeat0 ;
|
|
|
|
: replace-escapes ( string -- string )
|
|
escaper sift [ [ tree-write ] each ] with-string-writer ;
|
|
|
|
: insert-escapes ( string -- string )
|
|
[
|
|
"\t" token [ drop "\\t" ] action ,
|
|
"\n" token [ drop "\\n" ] action ,
|
|
"\r" token [ drop "\\r" ] action ,
|
|
] choice* replace ;
|
|
|
|
: 'identifier' ( -- parser )
|
|
#! Return a parser that parses an identifer delimited by
|
|
#! a quotation character. The quotation can be single
|
|
#! or double quotes. The AST produced is the identifier
|
|
#! between the quotes.
|
|
[
|
|
[ CHAR: " = not ] satisfy repeat1 "\"" "\"" surrounded-by ,
|
|
[ CHAR: ' = not ] satisfy repeat1 "'" "'" surrounded-by ,
|
|
] choice* [ >string replace-escapes ] action ;
|
|
|
|
: 'non-terminal' ( -- parser )
|
|
#! A non-terminal is the name of another rule. It can
|
|
#! be any non-blank character except for characters used
|
|
#! in the EBNF syntax itself.
|
|
[
|
|
{
|
|
[ dup blank? ]
|
|
[ dup CHAR: " = ]
|
|
[ dup CHAR: ' = ]
|
|
[ dup CHAR: | = ]
|
|
[ dup CHAR: { = ]
|
|
[ dup CHAR: } = ]
|
|
[ dup CHAR: = = ]
|
|
[ dup CHAR: ) = ]
|
|
[ dup CHAR: ( = ]
|
|
[ dup CHAR: ] = ]
|
|
[ dup CHAR: [ = ]
|
|
[ dup CHAR: . = ]
|
|
[ dup CHAR: ! = ]
|
|
[ dup CHAR: & = ]
|
|
[ dup CHAR: * = ]
|
|
[ dup CHAR: + = ]
|
|
[ dup CHAR: ? = ]
|
|
[ dup CHAR: : = ]
|
|
[ dup CHAR: ~ = ]
|
|
[ dup CHAR: < = ]
|
|
[ dup CHAR: > = ]
|
|
} 0|| not nip
|
|
] satisfy repeat1 [ >string <ebnf-non-terminal> ] action ;
|
|
|
|
: 'terminal' ( -- parser )
|
|
#! A terminal is an identifier enclosed in quotations
|
|
#! and it represents the literal value of the identifier.
|
|
'identifier' [ <ebnf-terminal> ] action ;
|
|
|
|
: 'foreign-name' ( -- parser )
|
|
#! Parse a valid foreign parser name
|
|
[
|
|
{
|
|
[ dup blank? ]
|
|
[ dup CHAR: > = ]
|
|
} 0|| not nip
|
|
] satisfy repeat1 [ >string ] action ;
|
|
|
|
: 'foreign' ( -- parser )
|
|
#! A foreign call is a call to a rule in another ebnf grammar
|
|
[
|
|
"<foreign" syntax ,
|
|
'foreign-name' sp ,
|
|
'foreign-name' sp optional ,
|
|
">" syntax ,
|
|
] seq* [ first2 <ebnf-foreign> ] action ;
|
|
|
|
: 'any-character' ( -- parser )
|
|
#! A parser to match the symbol for any character match.
|
|
[ CHAR: . = ] satisfy [ drop <ebnf-any-character> ] action ;
|
|
|
|
: 'range-parser' ( -- parser )
|
|
#! Match the syntax for declaring character ranges
|
|
[
|
|
[ "[" syntax , "[" token ensure-not , ] seq* hide ,
|
|
[ CHAR: ] = not ] satisfy repeat1 ,
|
|
"]" syntax ,
|
|
] seq* [ first >string <ebnf-range> ] action ;
|
|
|
|
: ('element') ( -- parser )
|
|
#! An element of a rule. It can be a terminal or a
|
|
#! non-terminal but must not be followed by a "=".
|
|
#! The latter indicates that it is the beginning of a
|
|
#! new rule.
|
|
[
|
|
[
|
|
[
|
|
'non-terminal' ,
|
|
'terminal' ,
|
|
'foreign' ,
|
|
'range-parser' ,
|
|
'any-character' ,
|
|
] choice*
|
|
[ dup , "*" token hide , ] seq* [ first <ebnf-repeat0> ] action ,
|
|
[ dup , "+" token hide , ] seq* [ first <ebnf-repeat1> ] action ,
|
|
[ dup , "?[" token ensure-not , "?" token hide , ] seq* [ first <ebnf-optional> ] action ,
|
|
,
|
|
] choice* ,
|
|
[
|
|
"=" syntax ensure-not ,
|
|
"=>" syntax ensure ,
|
|
] choice* ,
|
|
] seq* [ first ] action ;
|
|
|
|
DEFER: 'action'
|
|
|
|
: 'element' ( -- parser )
|
|
[
|
|
[ ('element') , ":" syntax , "a-zA-Z" range-pattern repeat1 [ >string ] action , ] seq* [ first2 <ebnf-var> ] action ,
|
|
('element') ,
|
|
] choice* ;
|
|
|
|
DEFER: 'choice'
|
|
|
|
: grouped ( quot suffix -- parser )
|
|
#! Parse a group of choices, with a suffix indicating
|
|
#! the type of group (repeat0, repeat1, etc) and
|
|
#! an quot that is the action that produces the AST.
|
|
2dup
|
|
[
|
|
"(" [ 'choice' sp ] delay ")" syntax-pack
|
|
swap 2seq
|
|
[ first ] rot compose action ,
|
|
"{" [ 'choice' sp ] delay "}" syntax-pack
|
|
swap 2seq
|
|
[ first <ebnf-whitespace> ] rot compose action ,
|
|
] choice* ;
|
|
|
|
: 'group' ( -- parser )
|
|
#! A grouping with no suffix. Used for precedence.
|
|
[ ] [
|
|
"*" token sp ensure-not ,
|
|
"+" token sp ensure-not ,
|
|
"?" token sp ensure-not ,
|
|
] seq* hide grouped ;
|
|
|
|
: 'repeat0' ( -- parser )
|
|
[ <ebnf-repeat0> ] "*" syntax grouped ;
|
|
|
|
: 'repeat1' ( -- parser )
|
|
[ <ebnf-repeat1> ] "+" syntax grouped ;
|
|
|
|
: 'optional' ( -- parser )
|
|
[ <ebnf-optional> ] "?" syntax grouped ;
|
|
|
|
: 'factor-code' ( -- parser )
|
|
[
|
|
"]]" token ensure-not ,
|
|
"]?" token ensure-not ,
|
|
[ drop t ] satisfy ,
|
|
] seq* [ first ] action repeat0 [ >string ] action ;
|
|
|
|
: 'ensure-not' ( -- parser )
|
|
#! Parses the '!' syntax to ensure that
|
|
#! something that matches the following elements do
|
|
#! not exist in the parse stream.
|
|
[
|
|
"!" syntax ,
|
|
'group' sp ,
|
|
] seq* [ first <ebnf-ensure-not> ] action ;
|
|
|
|
: 'ensure' ( -- parser )
|
|
#! Parses the '&' syntax to ensure that
|
|
#! something that matches the following elements does
|
|
#! exist in the parse stream.
|
|
[
|
|
"&" syntax ,
|
|
'group' sp ,
|
|
] seq* [ first <ebnf-ensure> ] action ;
|
|
|
|
: ('sequence') ( -- parser )
|
|
#! A sequence of terminals and non-terminals, including
|
|
#! groupings of those.
|
|
[
|
|
[
|
|
'ensure-not' sp ,
|
|
'ensure' sp ,
|
|
'element' sp ,
|
|
'group' sp ,
|
|
'repeat0' sp ,
|
|
'repeat1' sp ,
|
|
'optional' sp ,
|
|
] choice*
|
|
[ dup , ":" syntax , "a-zA-Z" range-pattern repeat1 [ >string ] action , ] seq* [ first2 <ebnf-var> ] action ,
|
|
,
|
|
] choice* ;
|
|
|
|
: 'action' ( -- parser )
|
|
"[[" 'factor-code' "]]" syntax-pack ;
|
|
|
|
: 'semantic' ( -- parser )
|
|
"?[" 'factor-code' "]?" syntax-pack ;
|
|
|
|
: 'sequence' ( -- parser )
|
|
#! A sequence of terminals and non-terminals, including
|
|
#! groupings of those.
|
|
[
|
|
[ ('sequence') , 'action' , ] seq* [ first2 <ebnf-action> ] action ,
|
|
[ ('sequence') , 'semantic' , ] seq* [ first2 <ebnf-semantic> ] action ,
|
|
('sequence') ,
|
|
] choice* repeat1 [
|
|
dup length 1 = [ first ] [ <ebnf-sequence> ] if
|
|
] action ;
|
|
|
|
: 'actioned-sequence' ( -- parser )
|
|
[
|
|
[ 'sequence' , "=>" syntax , 'action' , ] seq* [ first2 <ebnf-action> ] action ,
|
|
'sequence' ,
|
|
] choice* ;
|
|
|
|
: 'choice' ( -- parser )
|
|
'actioned-sequence' sp repeat1 [ dup length 1 = [ first ] [ <ebnf-sequence> ] if ] action "|" token sp list-of [
|
|
dup length 1 = [ first ] [ <ebnf-choice> ] if
|
|
] action ;
|
|
|
|
: 'tokenizer' ( -- parser )
|
|
[
|
|
"tokenizer" syntax ,
|
|
"=" syntax ,
|
|
">" token ensure-not ,
|
|
[ "default" token sp , 'choice' , ] choice* ,
|
|
] seq* [ first <ebnf-tokenizer> ] action ;
|
|
|
|
: 'rule' ( -- parser )
|
|
[
|
|
"tokenizer" token ensure-not ,
|
|
'non-terminal' [ symbol>> ] action ,
|
|
"=" syntax ,
|
|
">" token ensure-not ,
|
|
'choice' ,
|
|
] seq* [ first2 <ebnf-rule> ] action ;
|
|
|
|
: 'ebnf' ( -- parser )
|
|
[ 'tokenizer' sp , 'rule' sp , ] choice* repeat1 [ <ebnf> ] action ;
|
|
|
|
GENERIC: (transform) ( ast -- parser )
|
|
|
|
SYMBOL: parser
|
|
SYMBOL: main
|
|
SYMBOL: ignore-ws
|
|
|
|
: transform ( ast -- object )
|
|
H{ } clone dup dup [
|
|
f ignore-ws set
|
|
parser set
|
|
swap (transform)
|
|
main set
|
|
] bind ;
|
|
|
|
M: ebnf (transform) ( ast -- parser )
|
|
rules>> [ (transform) ] map peek ;
|
|
|
|
M: ebnf-tokenizer (transform) ( ast -- parser )
|
|
elements>> dup "default" = [
|
|
drop default-tokenizer \ tokenizer set-global any-char
|
|
] [
|
|
(transform)
|
|
dup parser-tokenizer \ tokenizer set-global
|
|
] if ;
|
|
|
|
M: ebnf-rule (transform) ( ast -- parser )
|
|
dup elements>>
|
|
(transform) [
|
|
swap symbol>> dup get { [ tuple? ] [ delegate parser? ] } 1&& [
|
|
"Rule '" over append "' defined more than once" append throw
|
|
] [
|
|
set
|
|
] if
|
|
] keep ;
|
|
|
|
M: ebnf-sequence (transform) ( ast -- parser )
|
|
#! If ignore-ws is set then each element of the sequence
|
|
#! ignores leading whitespace. This is not inherited by
|
|
#! subelements of the sequence.
|
|
elements>> [
|
|
f ignore-ws [ (transform) ] with-variable
|
|
ignore-ws get [ sp ] when
|
|
] map seq [ dup length 1 = [ first ] when ] action ;
|
|
|
|
M: ebnf-choice (transform) ( ast -- parser )
|
|
options>> [ (transform) ] map choice ;
|
|
|
|
M: ebnf-any-character (transform) ( ast -- parser )
|
|
drop tokenizer any>> call ;
|
|
|
|
M: ebnf-range (transform) ( ast -- parser )
|
|
pattern>> range-pattern ;
|
|
|
|
: transform-group ( ast -- parser )
|
|
#! convert a ast node with groups to a parser for that group
|
|
group>> (transform) ;
|
|
|
|
M: ebnf-ensure (transform) ( ast -- parser )
|
|
transform-group ensure ;
|
|
|
|
M: ebnf-ensure-not (transform) ( ast -- parser )
|
|
transform-group ensure-not ;
|
|
|
|
M: ebnf-repeat0 (transform) ( ast -- parser )
|
|
transform-group repeat0 ;
|
|
|
|
M: ebnf-repeat1 (transform) ( ast -- parser )
|
|
transform-group repeat1 ;
|
|
|
|
M: ebnf-optional (transform) ( ast -- parser )
|
|
transform-group optional ;
|
|
|
|
M: ebnf-whitespace (transform) ( ast -- parser )
|
|
t ignore-ws [ transform-group ] with-variable ;
|
|
|
|
GENERIC: build-locals ( code ast -- code )
|
|
|
|
M: ebnf-sequence build-locals ( code ast -- code )
|
|
#! Note the need to filter out this ebnf items that
|
|
#! leave nothing in the AST
|
|
elements>> filter-hidden dup length 1 = [
|
|
first build-locals
|
|
] [
|
|
dup [ ebnf-var? ] filter empty? [
|
|
drop
|
|
] [
|
|
[
|
|
"USING: locals sequences ; [let* | " %
|
|
dup length swap [
|
|
dup ebnf-var? [
|
|
name>> %
|
|
" [ " % # " over nth ] " %
|
|
] [
|
|
2drop
|
|
] if
|
|
] 2each
|
|
" | " %
|
|
%
|
|
" nip ]" %
|
|
] "" make
|
|
] if
|
|
] if ;
|
|
|
|
M: ebnf-var build-locals ( code ast -- )
|
|
[
|
|
"USING: locals kernel ; [let* | " %
|
|
name>> % " [ dup ] " %
|
|
" | " %
|
|
%
|
|
" nip ]" %
|
|
] "" make ;
|
|
|
|
M: object build-locals ( code ast -- )
|
|
drop ;
|
|
|
|
: check-action-effect ( quot -- quot )
|
|
dup infer {
|
|
{ [ dup (( a -- b )) effect<= ] [ drop ] }
|
|
{ [ dup (( -- b )) effect<= ] [ drop [ drop ] prepose ] }
|
|
[
|
|
[
|
|
"Bad effect: " write effect>string write
|
|
" for quotation " write pprint
|
|
] with-string-writer throw
|
|
]
|
|
} cond ;
|
|
|
|
M: ebnf-action (transform) ( ast -- parser )
|
|
[ parser>> (transform) ] [ code>> insert-escapes ] [ parser>> ] tri build-locals
|
|
string-lines parse-lines check-action-effect action ;
|
|
|
|
M: ebnf-semantic (transform) ( ast -- parser )
|
|
[ parser>> (transform) ] [ code>> insert-escapes ] [ parser>> ] tri build-locals
|
|
string-lines parse-lines semantic ;
|
|
|
|
M: ebnf-var (transform) ( ast -- parser )
|
|
parser>> (transform) ;
|
|
|
|
M: ebnf-terminal (transform) ( ast -- parser )
|
|
symbol>> tokenizer one>> call ;
|
|
|
|
M: ebnf-foreign (transform) ( ast -- parser )
|
|
dup word>> search
|
|
[ "Foreign word '" swap word>> append "' not found" append throw ] unless*
|
|
swap rule>> [ main ] unless* dupd swap rule [
|
|
nip
|
|
] [
|
|
execute
|
|
] if* ;
|
|
|
|
: parser-not-found ( name -- * )
|
|
[
|
|
"Parser '" % % "' not found." %
|
|
] "" make throw ;
|
|
|
|
M: ebnf-non-terminal (transform) ( ast -- parser )
|
|
symbol>> [
|
|
, \ dup , parser get , \ at , [ parser-not-found ] , \ unless* , \ nip ,
|
|
] [ ] make box ;
|
|
|
|
: transform-ebnf ( string -- object )
|
|
'ebnf' parse parse-result-ast transform ;
|
|
|
|
: check-parse-result ( result -- result )
|
|
dup [
|
|
dup parse-result-remaining [ blank? ] trim empty? [
|
|
[
|
|
"Unable to fully parse EBNF. Left to parse was: " %
|
|
parse-result-remaining %
|
|
] "" make throw
|
|
] unless
|
|
] [
|
|
"Could not parse EBNF" throw
|
|
] if ;
|
|
|
|
: ebnf>quot ( string -- hashtable quot )
|
|
'ebnf' parse check-parse-result
|
|
parse-result-ast transform dup dup parser [ main swap at compile ] with-variable
|
|
[ compiled-parse ] curry [ with-scope ] curry ;
|
|
|
|
: [EBNF
|
|
scan {
|
|
{ "+" [ scan-word execute "" swap ] }
|
|
[ " " append default-tokenizer ]
|
|
} case \ tokenizer set-global
|
|
[ "EBNF]" parse-multiline-string ] [ drop "" ] recover append ebnf>quot nip parsed
|
|
reset-tokenizer ; parsing
|
|
|
|
: EBNF:
|
|
CREATE-WORD scan {
|
|
{ "+" [ scan-word execute "" swap ] }
|
|
[ " " append default-tokenizer ]
|
|
} case \ tokenizer set-global
|
|
dupd [ ";EBNF" parse-multiline-string ] [ drop "" ] recover append
|
|
ebnf>quot swapd 1 1 <effect> define-declared "ebnf-parser" set-word-prop
|
|
reset-tokenizer ; parsing
|
|
|
|
|
|
|