105 lines
		
	
	
		
			3.9 KiB
		
	
	
	
		
			Lua
		
	
	
		
		
			
		
	
	
			105 lines
		
	
	
		
			3.9 KiB
		
	
	
	
		
			Lua
		
	
	
|  | --[[ | ||
|  | Lexer for the Factor programming language (http://factorcode.org) | ||
|  | Copyright 2013 Michael T. Richter <ttmrichter@gmail.com> | ||
|  | 
 | ||
|  | This program is free software and comes without any warranty, express nor | ||
|  | implied.  It is, in short, warranted to do absolutely nothing but (possibly) | ||
|  | occupy storage space.  You can redistribute it and/or modify it under the terms | ||
|  | of the Do What The Fuck You Want To Public License, Version 2, as published by | ||
|  | Sam Hocevar.  Consult http://www.wtfpl.net/txt/copying for full legal details. | ||
|  | 
 | ||
|  | BUGS | ||
|  | ==== | ||
|  | At this time the lexer is usable, but not perfect.  Problems include: | ||
|  |  * identifiers like (foo) get treated and coloured like stack declarations | ||
|  |  * other as-yet unknown display bugs  :-) | ||
|  | 
 | ||
|  | These make a few source files less than lovely and will be fixed as possible. | ||
|  | (Making syntax highlighting for a language as syntactically flexible as Factor | ||
|  | turns out to be a non-trivial task!) | ||
|  | ]] | ||
|  | 
 | ||
|  | local l = lexer | ||
|  | local token, style, color, word_match = l.token, l.style, l.color, l.word_match | ||
|  | local P, R, S = lpeg.P, lpeg.R, lpeg.S | ||
|  | 
 | ||
-- Module table returned at the end of this file; `_NAME` carries the
-- lexer's name.
local M = {}
M._NAME = 'factor'
|  | 
 | ||
-- General building blocks.
-- `pre` is a run of one or more upper-case ASCII letters; it is glued onto
-- delimiters and punctuation further down (string prefixes, keyword words,
-- constructor prefixes).  The `opt_` variants make that run optional, and the
-- `post` names are aliases used where the run follows the punctuation instead
-- of preceding it.
local pre = R('AZ')^1
local opt_pre = pre^-1
local post, opt_post = pre, opt_pre
|  | 
 | ||
-- Whitespace: any run of one or more whitespace characters is one token.
local ws = token(l.WHITESPACE, l.space^1)

-- Comments: introduced by either '#!' or a bare '!', and extending over the
-- remaining non-newline characters.
local comment = token(l.COMMENT, (P'#!' + P'!') * l.nonnewline^0)
|  | 
 | ||
-- Strings.
-- Two forms are recognised:
--   * triple-quoted:  """ ... """
--   * double-quoted:  " ... ", optionally glued to an upper-case prefix word
-- In both forms a backslash escapes the character that follows it.
--
-- The triple-quoted form is spelled out by hand because it needs a
-- three-character delimiter, and it must be tried first: LPeg's choice is
-- ordered, so otherwise the leading `""` of `"""` would be accepted as an
-- empty double-quoted string and the triple-quoted branch never reached.
--
-- Note: the local name `string` shadows the standard `string` library table
-- for the rest of this file.
local dq3_str = P'"""' * (P'\\' * P(1) + (P(1) - P'"""'))^0 * P'"""'
local dq1_str = opt_pre * l.delimited_range('"', '\\')
local string = token(l.STRING, dq3_str + dq1_str)
|  | 
 | ||
-- Numbers.
-- Note that complex literals like C{ 1/3 27.3 } are not covered by this lexer.
-- The C{ ... } notation is treated as an operator--to be specific a
-- "constructor" (for want of a better term).
--
-- A leading '-' is only accepted where a sign makes sense: in front of the
-- whole literal and in front of an exponent.  Fractional parts and ratio
-- denominators are plain digit runs, so text such as `1.-5` or `1/-2` is not
-- coloured as a number.
local sign        = P'-'^-1
local digits      = R'09'^1
local hex_digits  = R('09', 'af', 'AF')^1

local binary      = sign * P'0b' * S'01'^1
local octal       = sign * P'0o' * R'07'^1
local decimal     = sign * digits
local hexadecimal = sign * P'0x' * hex_digits
local integer     = binary + octal + hexadecimal + decimal

local ratio       = decimal * P'/' * digits

local dfloat_component = decimal * P'.' * digits^-1
local hfloat_component = hexadecimal * (P'.' * hex_digits^-1)^-1
-- The last alternative (a ratio followed by '.') also covers the special
-- literals `1/0.`, `-1/0.` and `0/0.`; a bare `0/0` is matched by `ratio`.
local float = (dfloat_component * (S'eE' * decimal)^-1) +
              (hfloat_component * S'pP' * decimal)      +
              (ratio * P'.')

-- A number must be followed by whitespace or by the end of the input.  The
-- end-of-input case matters for a literal on the last line of a file that has
-- no trailing newline.  The lookahead uses the bare character class rather
-- than the `ws` token because only the character test is needed here.
local number = token(l.NUMBER, (float + ratio + integer) *
                               (#l.space + -P(1)))
|  | 
 | ||
-- Keywords.
-- Things like NAN:, USE:, USING:, POSTPONE:, etc. are considered keywords,
-- as are similar words that end in #.  Patterns like <<WORD ... WORD>> are
-- similarly considered to be "keywords" (for want of a better term).
--
-- colon_words: an upper-case word followed by ':' or '#', or a bare run of
--              ':' / ';' characters.
-- angle_words: one or more '<' followed by an upper-case word, or an
--              upper-case word followed by one or more '>'.
local colon_words = pre * S':#' + S':;'^1
local angle_words = (P'<'^1 * post) +
                    (pre * P'>'^1)
-- A keyword must be followed by whitespace or by the end of the input; the
-- end-of-input case lets a closing `;` on an unterminated last line match.
local keyword = token(l.KEYWORD, (colon_words + angle_words) *
                                 (#l.space + -P(1)))
|  | 
 | ||
-- Operators.
-- The usual suspects like braces, brackets, angle brackets, parens, etc. are
-- considered to be operators.  They may, however, have prefixes like C{ ... }.
--
-- Opening braces, brackets and angle brackets take an optional upper-case
-- prefix; an opening paren only counts as a constructor when it has one.
local constructor_words = opt_pre * P'{' + P'}' +
                          opt_pre * P'[' + P']' +
                          opt_pre * P'<' + P'>' +
                          pre     * P'(' + P')'
-- A stack declaration is a parenthesised range whose opening paren is
-- immediately followed by whitespace, as in `( a b -- c )`.  Requiring that
-- whitespace keeps words such as `(foo)` from being swallowed as a
-- declaration; they fall through to the rules listed after this one.
local stack_declaration = #(P'(' * l.space) * l.delimited_range('()')
local other_operators = S'+-*/<>'
-- An operator must be followed by whitespace or by the end of the input.
local operator = token(l.OPERATOR, (stack_declaration +
                                   constructor_words +
                                   other_operators) * (#l.space + -P(1)))
|  | 
 | ||
-- Identifiers.
-- Identifiers can be practically anything but whitespace.
-- An identifier is a run of alphanumerics and the punctuation listed in
-- `symbols`, followed by whitespace or by the end of the input.
-- NOTE(review): '.', '\'', '"', '\\' and '|' are not in the set, so words
-- containing them are not matched by this rule -- confirm whether that is
-- intended.
local symbols = S'`~!@#$%^&*()_-+={[<>]}:;,?/'
local identifier = token(l.IDENTIFIER, (l.alnum + symbols)^1 *
                                       (#l.space + -P(1)))
|  | 
 | ||
-- Rule table: each entry pairs a rule name with its token pattern.
-- NOTE(review): the rules are presumably tried in the order listed, which is
-- why `keyword` precedes everything else and `any_char` comes last -- confirm
-- against the lexer framework before reordering.
local rules = {
  { 'keyword',    keyword    },
  { 'whitespace', ws         },
  { 'string',     string     },
  { 'comment',    comment    },
  { 'number',     number     },
  { 'operator',   operator   },
  { 'identifier', identifier },
  { 'any_char',   l.any_char },
}
M._rules = rules

return M