factor/basis/peg/ebnf/ebnf.factor

! Copyright (C) 2007 Chris Double.
! See http://factorcode.org/license.txt for BSD license.
USING: kernel words arrays strings math.parser
sequences quotations vectors namespaces make math assocs
continuations peg peg.parsers unicode.categories multiline
splitting accessors effects sequences.deep peg.search
combinators.short-circuit lexer io.streams.string stack-checker
io combinators parser summary ;
FROM: compiler.units => with-compilation-unit ;
FROM: vocabs.parser => search ;
IN: peg.ebnf

: rule ( name word -- parser )
  #! Given an EBNF word produced from EBNF: return the EBNF rule
  "ebnf-parser" word-prop at ;

ERROR: no-rule rule parser ;

: lookup-rule ( rule parser -- rule' )
    2dup rule [ 2nip ] [ no-rule ] if* ;

TUPLE: tokenizer any one many ;

: default-tokenizer ( -- tokenizer )
  T{ tokenizer f
    [ any-char ]
    [ token ]
    [ [ = ] curry any-char swap semantic ]
  } ;

: parser-tokenizer ( parser -- tokenizer )
  [ 1quotation ] keep
  [ swap [ = ] curry semantic ] curry dup \ tokenizer boa ;

: rule-tokenizer ( name word -- tokenizer )
  rule parser-tokenizer ;

: tokenizer ( -- word )
  \ tokenizer get-global [ default-tokenizer ] unless* ;

: reset-tokenizer ( -- )
  default-tokenizer \ tokenizer set-global ;

ERROR: no-tokenizer name ;

M: no-tokenizer summary
    drop "Tokenizer not found" ;

SYNTAX: TOKENIZER:
  scan dup search [ nip ] [ no-tokenizer ] if*
  execute( -- tokenizer ) \ tokenizer set-global ;

TUPLE: ebnf-non-terminal symbol ;
TUPLE: ebnf-terminal symbol ;
TUPLE: ebnf-foreign word rule ;
TUPLE: ebnf-any-character ;
TUPLE: ebnf-range pattern ;
TUPLE: ebnf-ensure group ;
TUPLE: ebnf-ensure-not group ;
TUPLE: ebnf-choice options ;
TUPLE: ebnf-sequence elements ;
TUPLE: ebnf-repeat0 group ;
TUPLE: ebnf-repeat1 group ;
TUPLE: ebnf-optional group ;
TUPLE: ebnf-whitespace group ;
TUPLE: ebnf-tokenizer elements ;
TUPLE: ebnf-rule symbol elements ;
TUPLE: ebnf-action parser code ;
TUPLE: ebnf-var parser name ;
TUPLE: ebnf-semantic parser code ;
TUPLE: ebnf rules ;

C: <ebnf-non-terminal> ebnf-non-terminal
C: <ebnf-terminal> ebnf-terminal
C: <ebnf-foreign> ebnf-foreign
C: <ebnf-any-character> ebnf-any-character
C: <ebnf-range> ebnf-range
C: <ebnf-ensure> ebnf-ensure
C: <ebnf-ensure-not> ebnf-ensure-not
C: <ebnf-choice> ebnf-choice
C: <ebnf-sequence> ebnf-sequence
C: <ebnf-repeat0> ebnf-repeat0
C: <ebnf-repeat1> ebnf-repeat1
C: <ebnf-optional> ebnf-optional
C: <ebnf-whitespace> ebnf-whitespace
C: <ebnf-tokenizer> ebnf-tokenizer
C: <ebnf-rule> ebnf-rule
C: <ebnf-action> ebnf-action
C: <ebnf-var> ebnf-var
C: <ebnf-semantic> ebnf-semantic
C: <ebnf> ebnf

: filter-hidden ( seq -- seq )
  #! Remove elements that produce no AST from sequence
  [ ebnf-ensure-not? not ] filter [ ebnf-ensure? not ] filter ;

: syntax ( string -- parser )
  #! Parses the string, ignoring white space, and
  #! does not put the result in the AST.
  token sp hide ;

: syntax-pack ( begin parser end -- parser )
  #! Parse 'parser' surrounded by syntax elements
  #! begin and end.
  [ syntax ] 2dip syntax pack ;

#! Don't want to use 'replace' in an action since replace doesn't infer.
#! Do the compilation of the peg at parse time and call (replace).
PEG: escaper ( string -- ast )
  [
    "\\t" token [ drop "\t" ] action ,
    "\\n" token [ drop "\n" ] action ,
    "\\r" token [ drop "\r" ] action ,
    "\\\\" token [ drop "\\" ] action ,
  ] choice* any-char-parser 2array choice repeat0 ;

: replace-escapes ( string -- string )
  escaper sift [ [ tree-write ] each ] with-string-writer ;

: insert-escapes ( string -- string )
  [
    "\t" token [ drop "\\t" ] action ,
    "\n" token [ drop "\\n" ] action ,
    "\r" token [ drop "\\r" ] action ,
  ] choice* replace ;

: 'identifier' ( -- parser )
  #! Return a parser that parses an identifer delimited by
  #! a quotation character. The quotation can be single
  #! or double quotes. The AST produced is the identifier
  #! between the quotes.
  [
    [ CHAR: " = not ] satisfy repeat1 "\"" "\"" surrounded-by ,
    [ CHAR: ' = not ] satisfy repeat1 "'" "'" surrounded-by ,
  ] choice* [ >string replace-escapes ] action ;

: 'non-terminal' ( -- parser )
  #! A non-terminal is the name of another rule. It can
  #! be any non-blank character except for characters used
  #! in the EBNF syntax itself.
  [
    {
      [ blank?    ]
      [ CHAR: " = ]
      [ CHAR: ' = ]
      [ CHAR: | = ]
      [ CHAR: { = ]
      [ CHAR: } = ]
      [ CHAR: = = ]
      [ CHAR: ) = ]
      [ CHAR: ( = ]
      [ CHAR: ] = ]
      [ CHAR: [ = ]
      [ CHAR: . = ]
      [ CHAR: ! = ]
      [ CHAR: & = ]
      [ CHAR: * = ]
      [ CHAR: + = ]
      [ CHAR: ? = ]
      [ CHAR: : = ]
      [ CHAR: ~ = ]
      [ CHAR: < = ]
      [ CHAR: > = ]
    } 1|| not
  ] satisfy repeat1 [ >string <ebnf-non-terminal> ] action ;

: 'terminal' ( -- parser )
  #! A terminal is an identifier enclosed in quotations
  #! and it represents the literal value of the identifier.
  'identifier' [ <ebnf-terminal> ] action ;

: 'foreign-name' ( -- parser )
  #! Parse a valid foreign parser name
  [
    {
      [ blank?    ]
      [ CHAR: > = ]
    } 1|| not
  ] satisfy repeat1 [ >string ] action ;

: 'foreign' ( -- parser )
  #! A foreign call is a call to a rule in another ebnf grammar
  [
    "<foreign" syntax ,
    'foreign-name' sp ,
    'foreign-name' sp optional ,
    ">" syntax ,
  ] seq* [ first2 <ebnf-foreign> ] action ;

: 'any-character' ( -- parser )
  #! A parser to match the symbol for any character match.
  [ CHAR: . = ] satisfy [ drop <ebnf-any-character> ] action ;

: 'range-parser' ( -- parser )
  #! Match the syntax for declaring character ranges
  [
    [ "[" syntax , "[" token ensure-not , ] seq* hide ,
    [ CHAR: ] = not ] satisfy repeat1 ,
    "]" syntax ,
  ] seq* [ first >string <ebnf-range> ] action ;

: ('element') ( -- parser )
  #! An element of a rule. It can be a terminal or a
  #! non-terminal but must not be followed by a "=".
  #! The latter indicates that it is the beginning of a
  #! new rule.
  [
    [
      [
        'non-terminal' ,
        'terminal' ,
        'foreign' ,
        'range-parser' ,
        'any-character' ,
      ] choice*
      [ dup , "*" token hide , ] seq* [ first <ebnf-repeat0> ] action ,
      [ dup , "+" token hide , ] seq* [ first <ebnf-repeat1> ] action ,
      [ dup , "?[" token ensure-not , "?" token hide , ] seq* [ first <ebnf-optional> ] action ,
      ,
    ] choice* ,
    [
      "=" syntax ensure-not ,
      "=>" syntax ensure ,
    ] choice* ,
  ] seq* [ first ] action ;

DEFER: 'action'

: 'element' ( -- parser )
  [
    [ ('element') , ":" syntax , "a-zA-Z" range-pattern repeat1 [ >string ] action , ] seq* [ first2 <ebnf-var> ] action ,
    ('element') ,
  ] choice* ;

DEFER: 'choice'

: grouped ( quot suffix  -- parser )
  #! Parse a group of choices, with a suffix indicating
  #! the type of group (repeat0, repeat1, etc) and
  #! an quot that is the action that produces the AST.
  2dup
  [
    "(" [ 'choice' sp ] delay ")" syntax-pack
    swap 2seq
    [ first ] rot compose action ,
    "{" [ 'choice' sp ] delay "}" syntax-pack
    swap 2seq
    [ first <ebnf-whitespace> ] rot compose action ,
  ] choice* ;

: 'group' ( -- parser )
  #! A grouping with no suffix. Used for precedence.
  [ ] [
    "*" token sp ensure-not ,
    "+" token sp ensure-not ,
    "?" token sp ensure-not ,
  ] seq* hide grouped ;

: 'repeat0' ( -- parser )
  [ <ebnf-repeat0> ] "*" syntax grouped ;

: 'repeat1' ( -- parser )
  [ <ebnf-repeat1> ] "+" syntax grouped ;

: 'optional' ( -- parser )
  [ <ebnf-optional> ] "?" syntax grouped ;

: 'factor-code' ( -- parser )
  [
    "]]" token ensure-not ,
    "]?" token ensure-not ,
    [ drop t ] satisfy ,
  ] seq* repeat0 [ concat >string ] action ;

: 'ensure-not' ( -- parser )
  #! Parses the '!' syntax to ensure that
  #! something that matches the following elements do
  #! not exist in the parse stream.
  [
    "!" syntax ,
    'group' sp ,
  ] seq* [ first <ebnf-ensure-not> ] action ;

: 'ensure' ( -- parser )
  #! Parses the '&' syntax to ensure that
  #! something that matches the following elements does
  #! exist in the parse stream.
  [
    "&" syntax ,
    'group' sp ,
  ] seq* [ first <ebnf-ensure> ] action ;

: ('sequence') ( -- parser )
  #! A sequence of terminals and non-terminals, including
  #! groupings of those.
  [
    [
      'ensure-not' sp ,
      'ensure' sp ,
      'element' sp ,
      'group' sp ,
      'repeat0' sp ,
      'repeat1' sp ,
      'optional' sp ,
    ] choice*
    [ dup  , ":" syntax , "a-zA-Z" range-pattern repeat1 [ >string ] action , ] seq* [ first2 <ebnf-var> ] action ,
    ,
  ] choice* ;

: 'action' ( -- parser )
   "[[" 'factor-code' "]]" syntax-pack ;

: 'semantic' ( -- parser )
   "?[" 'factor-code' "]?" syntax-pack ;

: 'sequence' ( -- parser )
  #! A sequence of terminals and non-terminals, including
  #! groupings of those.
  [
    [ ('sequence') , 'action' , ] seq* [ first2 <ebnf-action> ] action ,
    [ ('sequence') , 'semantic' , ] seq* [ first2 <ebnf-semantic> ] action ,
    ('sequence') ,
  ] choice* repeat1 [
     dup length 1 = [ first ] [ <ebnf-sequence> ] if
  ] action ;

: 'actioned-sequence' ( -- parser )
  [
    [ 'sequence' , "=>" syntax , 'action' , ] seq* [ first2 <ebnf-action> ] action ,
    'sequence' ,
  ] choice* ;

: 'choice' ( -- parser )
  'actioned-sequence' sp repeat1 [ dup length 1 = [ first ] [ <ebnf-sequence> ] if  ] action "|" token sp list-of [
    dup length 1 = [ first ] [ <ebnf-choice> ] if
  ] action ;

: 'tokenizer' ( -- parser )
  [
    "tokenizer" syntax ,
    "=" syntax ,
    ">" token ensure-not ,
    [ "default" token sp , 'choice' , ] choice* ,
  ] seq* [ first <ebnf-tokenizer> ] action ;

: 'rule' ( -- parser )
  [
    "tokenizer" token ensure-not ,
    'non-terminal' [ symbol>> ] action  ,
    "=" syntax  ,
    ">" token ensure-not ,
    'choice' ,
  ] seq* [ first2 <ebnf-rule> ] action ;

: 'ebnf' ( -- parser )
  [ 'tokenizer' sp , 'rule' sp , ] choice* repeat1 [ <ebnf> ] action ;

GENERIC: (transform) ( ast -- parser )

SYMBOL: parser
SYMBOL: main
SYMBOL: ignore-ws

: transform ( ast -- object )
  H{ } clone dup dup [
    f ignore-ws set
    parser set
    swap (transform)
    main set
  ] bind ;

M: ebnf (transform) ( ast -- parser )
  rules>> [ (transform) ] map last ;

M: ebnf-tokenizer (transform) ( ast -- parser )
  elements>> dup "default" = [
    drop default-tokenizer \ tokenizer set-global any-char
  ] [
  (transform)
  dup parser-tokenizer \ tokenizer set-global
  ] if ;

ERROR: redefined-rule name ;

M: redefined-rule summary
  name>> "Rule '" "' defined more than once" surround ;

M: ebnf-rule (transform) ( ast -- parser )
  dup elements>>
  (transform) [
    swap symbol>> dup get parser? [ redefined-rule ] [ set ] if
  ] keep ;

M: ebnf-sequence (transform) ( ast -- parser )
  #! If ignore-ws is set then each element of the sequence
  #! ignores leading whitespace. This is not inherited by
  #! subelements of the sequence.
  elements>> [
    f ignore-ws [ (transform) ] with-variable
    ignore-ws get [ sp ] when
  ] map seq [ dup length 1 = [ first ] when ] action ;

M: ebnf-choice (transform) ( ast -- parser )
  options>> [ (transform) ] map choice ;

M: ebnf-any-character (transform) ( ast -- parser )
  drop tokenizer any>> call( -- parser ) ;

M: ebnf-range (transform) ( ast -- parser )
  pattern>> range-pattern ;

: transform-group ( ast -- parser )
  #! convert a ast node with groups to a parser for that group
  group>> (transform) ;

M: ebnf-ensure (transform) ( ast -- parser )
  transform-group ensure ;

M: ebnf-ensure-not (transform) ( ast -- parser )
  transform-group ensure-not ;

M: ebnf-repeat0 (transform) ( ast -- parser )
  transform-group repeat0 ;

M: ebnf-repeat1 (transform) ( ast -- parser )
  transform-group repeat1 ;

M: ebnf-optional (transform) ( ast -- parser )
  transform-group optional ;

M: ebnf-whitespace (transform) ( ast -- parser )
  t ignore-ws [ transform-group ] with-variable ;

GENERIC: build-locals ( code ast -- code )

M: ebnf-sequence build-locals ( code ast -- code )
  #! Note the need to filter out this ebnf items that
  #! leave nothing in the AST
  elements>> filter-hidden dup length 1 = [
    first build-locals
  ]  [
    dup [ ebnf-var? ] filter empty? [
      drop
    ] [
      [
        "FROM: locals => [let* ; FROM: sequences => nth ; [let* | " %
          dup length swap [
            dup ebnf-var? [
              name>> %
              " [ " % # " over nth ] " %
            ] [
              2drop
            ] if
          ] 2each
          " | " %
          %
          " nip ]" %
      ] "" make
    ] if
  ] if ;

M: ebnf-var build-locals ( code ast -- )
  [
    "FROM: locals => [let* ; FROM: kernel => dup nip ; [let* | " %
    name>> % " [ dup ] " %
    " | " %
    %
    " nip ]" %
  ] "" make ;

M: object build-locals ( code ast -- )
  drop ;

ERROR: bad-effect quot effect ;

: check-action-effect ( quot -- quot )
  dup infer {
    { [ dup (( a -- b )) effect<= ] [ drop ] }
    { [ dup (( -- b )) effect<= ] [ drop [ drop ] prepose ] }
    [ bad-effect ]
  } cond ;

: ebnf-transform ( ast -- parser quot )
  [ parser>> (transform) ]
  [ code>> insert-escapes ]
  [ parser>> ] tri build-locals
  [ string-lines parse-lines ] call( string -- quot ) ;

M: ebnf-action (transform) ( ast -- parser )
  ebnf-transform check-action-effect action ;

M: ebnf-semantic (transform) ( ast -- parser )
  ebnf-transform semantic ;

M: ebnf-var (transform) ( ast -- parser )
  parser>> (transform) ;

M: ebnf-terminal (transform) ( ast -- parser )
  symbol>> tokenizer one>> call( symbol -- parser ) ;

ERROR: ebnf-foreign-not-found name ;

M: ebnf-foreign-not-found summary
  name>> "Foreign word '" "' not found" surround ;

M: ebnf-foreign (transform) ( ast -- parser )
  dup word>> search [ word>> ebnf-foreign-not-found ] unless*
  swap rule>> [ main ] unless* over rule [
    nip
  ] [
    execute( -- parser )
  ] if* ;

ERROR: parser-not-found name ;

M: ebnf-non-terminal (transform) ( ast -- parser )
  symbol>>  [
    , \ dup , parser get , \ at , [ parser-not-found ] , \ unless* , \ nip ,
  ] [ ] make box ;

: transform-ebnf ( string -- object )
  'ebnf' parse transform ;

: check-parse-result ( result -- result )
  [
    dup remaining>> [ blank? ] trim [
      [
        "Unable to fully parse EBNF. Left to parse was: " %
        remaining>> %
      ] "" make throw
    ] unless-empty
  ] [
    "Could not parse EBNF" throw
  ] if* ;

: parse-ebnf ( string -- hashtable )
  'ebnf' (parse) check-parse-result ast>> transform ;

: ebnf>quot ( string -- hashtable quot )
  parse-ebnf dup dup parser [ main swap at compile ] with-variable
  [ compiled-parse ] curry [ with-scope ast>> ] curry ;

SYNTAX: <EBNF
  "EBNF>"
  reset-tokenizer parse-multiline-string parse-ebnf main swap at
  parsed reset-tokenizer ;

SYNTAX: [EBNF
  "EBNF]"
  reset-tokenizer parse-multiline-string ebnf>quot nip
  parsed \ call parsed reset-tokenizer ;

SYNTAX: EBNF:
  reset-tokenizer CREATE-WORD dup ";EBNF" parse-multiline-string
  ebnf>quot swapd
  (( input -- ast )) define-declared "ebnf-parser" set-word-prop
  reset-tokenizer ;