--[[
Lexer for the Factor programming language (http://factorcode.org)
Copyright 2013 Michael T. Richter <ttmrichter@gmail.com>

This program is free software and comes without any warranty, express nor
implied.  It is, in short, warranted to do absolutely nothing but (possibly)
occupy storage space.  You can redistribute it and/or modify it under the terms
of the Do What The Fuck You Want To Public License, Version 2, as published by
Sam Hocevar.  Consult http://www.wtfpl.net/txt/copying for full legal details.

BUGS
====
At this time the lexer is usable, but not perfect.  Problems include:
 * identifiers like (foo) get treated and coloured like stack declarations
 * other as-yet unknown display bugs  :-)

These make a few source files less than lovely and will be fixed as possible.
(Making syntax highlighting for a language as syntactically flexible as Factor
turns out to be a non-trivial task!)
]]

-- Local aliases for the global `lexer` module, its helper functions, and the
-- LPeg pattern constructors used throughout this file.
local l = lexer
local token, style = l.token, l.style
local color, word_match = l.color, l.word_match
local P, R, S = lpeg.P, lpeg.R, lpeg.S

-- Module table; it is filled in below and returned at the end of the file.
local M = { _NAME = 'factor' }

-- General building blocks.
-- `pre` and `post` both match a run of one or more uppercase ASCII letters;
-- `opt_pre` and `opt_post` match the same run or nothing at all.
local upper_run = R('AZ')^1
local pre, post = upper_run, upper_run
local maybe_upper_run = upper_run^-1
local opt_pre, opt_post = maybe_upper_run, maybe_upper_run

-- Whitespace.
-- A run of one or more `l.space` characters becomes a single WHITESPACE token.
local blank_run = l.space^1
local ws = token(l.WHITESPACE, blank_run)

-- Comments.
-- A comment opens with `!` (optionally preceded by a single `#`, i.e. `#!`)
-- and continues over any following run of `l.nonnewline` characters.
local comment_start = P('#')^-1 * P('!')
local comment_body = l.nonnewline^0
local comment = token(l.COMMENT, comment_start * comment_body)

-- Strings.
-- Two forms are recognised:
--   * triple-quoted strings, """ ... """, which may contain lone `"`
--     characters and backslash escapes;
--   * ordinary double-quoted strings with backslash escapes, optionally
--     introduced by an uppercase prefix word (e.g. SBUF" ..." or URL" ...").
--
-- The triple-quoted form is written out by hand because
-- l.delimited_range() only looks at the first one or two characters of its
-- delimiter argument, so passing '"""' to it yields an ordinary
-- single-quote range.  It must also be tried *before* the single-quoted
-- form: LPeg's choice is ordered, and the single-quoted pattern would
-- otherwise match the leading `""` as an empty string.
--
-- The closing """ is optional so that a string still being typed is
-- highlighted up to the end of the input.
local dq3_char = P'\\' * P(1) + (P(1) - P'"""')
local dq3_str = P'"""' * dq3_char^0 * P'"""'^-1
local dq1_str = opt_pre * l.delimited_range('"', '\\')
local string = token(l.STRING, dq3_str + dq1_str)

-- Numbers.
-- Note that complex literals like C{ 1/3 27.3 } are not covered by this lexer.
-- The C{ ... } notation is treated as an operator--to be specific a
-- "constructor" (for want of a better term).
--
-- Recognised forms (each with an optional leading `-`):
--   integers  : 0b101, 0o17, 0x1F, 42
--   ratios    : 1/3
--   floats    : 1.5, 1.5e10, 0x1.8p3, 1/3., and the special literals
--               1/0. (infinity), -1/0. and 0/0. (NaN)
local hex_digits       = R('09', 'af', 'AF')^1
local binary           = P'-'^-1 * P'0b' * S'01'^1
local octal            = P'-'^-1 * P'0o' * R'07'^1
local decimal          = P'-'^-1 * R'09'^1
-- hex_digits already matches one or more digits; no extra repetition needed.
local hexadecimal      = P'-'^-1 * P'0x' * hex_digits
local integer          = binary + octal + hexadecimal + decimal
local ratio            = decimal * P'/' * decimal
local dfloat_component = decimal * P'.' * decimal^-1
local hfloat_component = hexadecimal * (P'.' * hex_digits^-1)^-1
local float            = (dfloat_component * (S'eE' * decimal)^-1) +
                         (hfloat_component * S'pP' * decimal)      +
                         (ratio * P'.')                            +
                         (P'-'^-1 * P'1/0.')                       +
                         (P'0/0.')

-- A number only counts when it is a complete word: it must be followed by
-- whitespace or by the end of the input.  The end-of-input case (`-P(1)`)
-- keeps the final token of a file without a trailing newline highlighted.
-- `float` is tried first so that `1/3.` is not cut short as the ratio `1/3`.
local number_end = #l.space + -P(1)
local number = token(l.NUMBER, (float + ratio + integer) * number_end)

-- Keywords.
-- Things like NAN:, USE:, USING:, POSTPONE:, etc. are considered keywords,
-- as are similar words that end in #.  Patterns like <<WORD ... WORD>> are
-- similarly considered to be "keywords" (for want of a better term).
--
-- colon_words : an uppercase word followed by `:` or `#`, or a bare run of
--               `:` / `;` characters (e.g. `:`, `;`, `::`).
-- angle_words : one or more `<` followed by an uppercase word, or an
--               uppercase word followed by one or more `>`.
local colon_words = pre * S':#' + S':;'^1
local angle_words = (P'<'^1 * post) +
                    (pre * P'>'^1)

-- A keyword must be a complete word: followed by whitespace or by the end of
-- the input.  Accepting end of input matters in practice because Factor
-- definitions end in `;`, which is often the very last character of a file.
local keyword_end = #l.space + -P(1)
local keyword = token(l.KEYWORD, (colon_words + angle_words) * keyword_end)

-- Operators.
-- The usual suspects like braces, brackets, angle brackets, parens, etc. are
-- considered to be operators.  They may, however, have prefixes like C{ ... }.
local constructor_words = opt_pre * P'{' + P'}' +
                          opt_pre * P'[' + P']' +
                          opt_pre * P'<' + P'>' +
                          pre     * P'(' + P')'

-- A stack declaration such as `( a b -- c )` is styled as one operator token.
-- In Factor the opening `(` is a word of its own and is therefore always
-- followed by whitespace; requiring that here stops ordinary words that
-- merely contain parentheses, like `(foo)`, from being coloured as stack
-- declarations (they fall through to the identifier rule instead).
local stack_declaration = #(P'(' * l.space) * l.delimited_range('()')

local other_operators = S'+-*/<>'

-- An operator must be a complete word: followed by whitespace or by the end
-- of the input (so a closing bracket at the very end of a file still counts).
local operator_end = #l.space + -P(1)
local operator = token(l.OPERATOR, (stack_declaration +
                                   constructor_words +
                                   other_operators) * operator_end)

-- Identifiers.
-- Identifiers can be practically anything but whitespace, so an identifier is
-- simply a maximal run of non-whitespace characters.  This covers words the
-- previous hand-written symbol list missed (for example `.`, `'[`, `|` and
-- `\`) as well as words containing non-ASCII bytes.
--
-- Because the run is greedy it always stops at whitespace or at the end of
-- the input, so no separate look-ahead is required.  This rule is meant to be
-- tried after the more specific ones (keywords, strings, comments, numbers,
-- operators), acting as the fallback for every other word.
local identifier = token(l.IDENTIFIER, (P(1) - l.space)^1)

-- Rule table: each entry pairs a rule name with the pattern defined above.
-- The entries are kept in their original order, from keywords down to the
-- catch-all `l.any_char` rule (presumably the host lexer tries them in this
-- order -- confirm against the lexer module before reordering).
M._rules = {
  { 'keyword',    keyword    },
  { 'whitespace', ws         },
  { 'string',     string     },
  { 'comment',    comment    },
  { 'number',     number     },
  { 'operator',   operator   },
  { 'identifier', identifier },
  { 'any_char',   l.any_char },
}

return M