peg.ebnf syntax for tokenizers
parent b6b5f12732
commit eca8260799
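This commit adds a "tokenizer=" rule to the EBNF syntax: assigning it a parsing expression makes "." (and string terminals) consume whole tokens produced by that expression for the rules that follow, while "tokenizer=default" switches back to plain character-level parsing. A minimal usage sketch, lifted from the unit tests added in this diff (the commented result is the value the corresponding test asserts):

    USING: peg peg.ebnf ;

    ! After the tokenizer= rule, each "." in main matches one whole token:
    ! a digit or an operator, with the surrounding spaces ignored.
    "5 + 2" [EBNF
            space=(" " | "\n")
            number=[0-9]
            operator=("*" | "+")
            spaces=space* => [[ ignore ]]
            tokenizer=spaces (number | operator)
            main= . . .
    EBNF] call ast>>
    ! => V{ CHAR: 5 "+" CHAR: 2 }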
@@ -2,7 +2,8 @@
 ! See http://factorcode.org/license.txt for BSD license.
 !
 USING: kernel tools.test peg peg.ebnf words math math.parser
-sequences accessors peg.parsers parser namespaces ;
+sequences accessors peg.parsers parser namespaces arrays
+strings ;
 IN: peg.ebnf.tests
 
 { T{ ebnf-non-terminal f "abc" } } [
@@ -451,9 +452,63 @@ foo=<foreign any-char> 'd'
 "USING: peg.ebnf ; [EBNF foo='a' foo='b' EBNF]" eval drop
 ] must-fail
 
 { t } [
     #! Rule lookup occurs in a namespace. This causes an incorrect duplicate rule
     #! if a var in a namespace is set. This unit test is to remind me to fix this.
     [ "fail" "foo" set "foo='a'" 'ebnf' parse ast>> transform drop t ] with-scope
 ] unit-test
 
+#! Tokenizer tests
+{ V{ "a" CHAR: b } } [
+    "ab" [EBNF tokenizer=default foo="a" . EBNF] call ast>>
+] unit-test
+
+TUPLE: ast-number value ;
+
+EBNF: a-tokenizer
+Letter = [a-zA-Z]
+Digit = [0-9]
+Digits = Digit+
+SingleLineComment = "//" (!("\n") .)* "\n" => [[ ignore ]]
+MultiLineComment = "/*" (!("*/") .)* "*/" => [[ ignore ]]
+Space = " " | "\t" | "\r" | "\n" | SingleLineComment | MultiLineComment
+Spaces = Space* => [[ ignore ]]
+Number = Digits:ws '.' Digits:fs => [[ ws "." fs 3array concat >string string>number ast-number boa ]]
+       | Digits => [[ >string string>number ast-number boa ]]
+Special = "(" | ")" | "{" | "}" | "[" | "]" | "," | ";"
+        | "?" | ":" | "!==" | "!=" | "===" | "==" | "=" | ">="
+        | ">" | "<=" | "<" | "++" | "+=" | "+" | "--" | "-="
+        | "-" | "*=" | "*" | "/=" | "/" | "%=" | "%" | "&&="
+        | "&&" | "||=" | "||" | "." | "!"
+Tok = Spaces (Number | Special )
+;EBNF
+
+{ V{ CHAR: 1 T{ ast-number f 23 } ";" CHAR: x } } [
+    "123;x" [EBNF bar = .
+              tokenizer = <foreign a-tokenizer Tok> foo=.
+              tokenizer=default baz=.
+              main = bar foo foo baz
+    EBNF] call ast>>
+] unit-test
+
+{ V{ CHAR: 5 "+" CHAR: 2 } } [
+    "5+2" [EBNF
+            space=(" " | "\n")
+            number=[0-9]
+            operator=("*" | "+")
+            spaces=space* => [[ ignore ]]
+            tokenizer=spaces (number | operator)
+            main= . . .
+    EBNF] call ast>>
+] unit-test
+
+{ V{ CHAR: 5 "+" CHAR: 2 } } [
+    "5 + 2" [EBNF
+            space=(" " | "\n")
+            number=[0-9]
+            operator=("*" | "+")
+            spaces=space* => [[ ignore ]]
+            tokenizer=spaces (number | operator)
+            main= . . .
+    EBNF] call ast>>
+] unit-test
@@ -27,7 +27,7 @@ TUPLE: tokenizer any one many ;
     rule parser-tokenizer ;
 
 : tokenizer ( -- word )
-    \ tokenizer get [ default-tokenizer ] unless* ;
+    \ tokenizer get-global [ default-tokenizer ] unless* ;
 
 : reset-tokenizer ( -- )
     default-tokenizer \ tokenizer set-global ;
@@ -49,6 +49,7 @@ TUPLE: ebnf-repeat0 group ;
 TUPLE: ebnf-repeat1 group ;
 TUPLE: ebnf-optional group ;
 TUPLE: ebnf-whitespace group ;
+TUPLE: ebnf-tokenizer elements ;
 TUPLE: ebnf-rule symbol elements ;
 TUPLE: ebnf-action parser code ;
 TUPLE: ebnf-var parser name ;
@@ -68,6 +69,7 @@ C: <ebnf-repeat0> ebnf-repeat0
 C: <ebnf-repeat1> ebnf-repeat1
 C: <ebnf-optional> ebnf-optional
 C: <ebnf-whitespace> ebnf-whitespace
+C: <ebnf-tokenizer> ebnf-tokenizer
 C: <ebnf-rule> ebnf-rule
 C: <ebnf-action> ebnf-action
 C: <ebnf-var> ebnf-var
@@ -318,8 +320,17 @@ DEFER: 'choice'
         dup length 1 = [ first ] [ <ebnf-choice> ] if
     ] action ;
 
+: 'tokenizer' ( -- parser )
+    [
+        "tokenizer" syntax ,
+        "=" syntax ,
+        ">" token ensure-not ,
+        [ "default" token sp , 'choice' , ] choice* ,
+    ] seq* [ first <ebnf-tokenizer> ] action ;
+
 : 'rule' ( -- parser )
     [
+        "tokenizer" token ensure-not ,
         'non-terminal' [ symbol>> ] action ,
         "=" syntax ,
         ">" token ensure-not ,
@@ -327,7 +338,7 @@ DEFER: 'choice'
     ] seq* [ first2 <ebnf-rule> ] action ;
 
 : 'ebnf' ( -- parser )
-    'rule' sp repeat1 [ <ebnf> ] action ;
+    [ 'tokenizer' sp , 'rule' sp , ] choice* repeat1 [ <ebnf> ] action ;
 
 GENERIC: (transform) ( ast -- parser )
 
@@ -346,6 +357,14 @@ SYMBOL: ignore-ws
 M: ebnf (transform) ( ast -- parser )
     rules>> [ (transform) ] map peek ;
 
+M: ebnf-tokenizer (transform) ( ast -- parser )
+    elements>> dup "default" = [
+        drop default-tokenizer \ tokenizer set-global any-char
+    ] [
+        (transform)
+        dup parser-tokenizer \ tokenizer set-global
+    ] if ;
+
 M: ebnf-rule (transform) ( ast -- parser )
     dup elements>>
     (transform) [
@@ -369,7 +388,7 @@ M: ebnf-choice (transform) ( ast -- parser )
     options>> [ (transform) ] map choice ;
 
 M: ebnf-any-character (transform) ( ast -- parser )
-    drop [ tokenizer any>> call ] box ;
+    drop tokenizer any>> call ;
 
 M: ebnf-range (transform) ( ast -- parser )
     pattern>> range-pattern ;
@@ -460,7 +479,7 @@ M: ebnf-var (transform) ( ast -- parser )
     parser>> (transform) ;
 
 M: ebnf-terminal (transform) ( ast -- parser )
-    symbol>> [ tokenizer one>> call ] curry box ;
+    symbol>> tokenizer one>> call ;
 
 M: ebnf-foreign (transform) ( ast -- parser )
     dup word>> search
@@ -505,18 +524,18 @@ M: ebnf-non-terminal (transform) ( ast -- parser )
     scan {
         { "+" [ scan-word execute "" swap ] }
        [ " " append default-tokenizer ]
-    } case \ tokenizer [
+    } case \ tokenizer set-global
     [ "EBNF]" parse-multiline-string ] [ drop "" ] recover append ebnf>quot nip parsed
-    ] with-variable ; parsing
+    reset-tokenizer ; parsing
 
 : EBNF:
     CREATE-WORD scan {
        { "+" [ scan-word execute "" swap ] }
        [ " " append default-tokenizer ]
-    } case \ tokenizer [
+    } case \ tokenizer set-global
     dupd [ ";EBNF" parse-multiline-string ] [ drop "" ] recover append
     ebnf>quot swapd 1 1 <effect> define-declared "ebnf-parser" set-word-prop
-    ] with-variable ; parsing
+    reset-tokenizer ; parsing
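The "123;x" test above also exercises the other half of the feature: a stand-alone tokenizer defined with EBNF: whose rule is plugged into another grammar via <foreign ...>, then swapped back with tokenizer=default. A condensed sketch of that pattern (num-tokenizer is a made-up name, the action is simplified to yield a plain integer instead of an ast-number tuple, and the commented result is what this grammar should produce by analogy with the test, not an asserted value):

    USING: math.parser peg peg.ebnf strings ;

    ! A reusable tokenizer grammar: Tok yields either a number or a ";",
    ! with leading spaces ignored.
    EBNF: num-tokenizer
    Digit = [0-9]
    Digits = Digit+
    Space = " "
    Spaces = Space* => [[ ignore ]]
    Number = Digits => [[ >string string>number ]]
    Tok = Spaces (Number | ";")
    ;EBNF

    ! bar reads one character (default tokenizer), foo reads whole tokens
    ! from num-tokenizer's Tok rule, baz switches back to characters.
    "123;x" [EBNF bar = .
              tokenizer = <foreign num-tokenizer Tok> foo=.
              tokenizer=default baz=.
              main = bar foo foo baz
    EBNF] call ast>>
    ! => something like V{ CHAR: 1 23 ";" CHAR: x }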