105 lines
		
	
	
		
			3.9 KiB
		
	
	
	
		
			Lua
		
	
	
		
		
			
		
	
	
			105 lines
		
	
	
		
			3.9 KiB
		
	
	
	
		
			Lua
		
	
	
| 
								 | 
							
								--[[
							 | 
						||
| 
								 | 
							
								Lexer for the Factor programming language (http://factorcode.org)
							 | 
						||
| 
								 | 
							
								Copyright 2013 Michael T. Richter <ttmrichter@gmail.com>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								This program is free software and comes without any warranty, express or
							 | 
						||
| 
								 | 
							
								implied.  It is, in short, warranted to do absolutely nothing but (possibly)
							 | 
						||
| 
								 | 
							
								occupy storage space.  You can redistribute it and/or modify it under the terms
							 | 
						||
| 
								 | 
							
								of the Do What The Fuck You Want To Public License, Version 2, as published by
							 | 
						||
| 
								 | 
							
								Sam Hocevar.  Consult http://www.wtfpl.net/txt/copying for full legal details.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								BUGS
							 | 
						||
| 
								 | 
							
								====
							 | 
						||
| 
								 | 
							
								At this time the lexer is usable, but not perfect.  Problems include:
							 | 
						||
| 
								 | 
							
								 * identifiers like (foo) get treated and coloured like stack declarations
							 | 
						||
| 
								 | 
							
								 * other as-yet unknown display bugs  :-)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								These make a few source files less than lovely and will be fixed as possible.
							 | 
						||
| 
								 | 
							
								(Making syntax highlighting for a language as syntactically flexible as Factor
							 | 
						||
| 
								 | 
							
								turns out to be a non-trivial task!)
							 | 
						||
| 
								 | 
							
								]]
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								-- Local aliases for the lexer module and the LPeg constructors used below.
								local l = lexer
								local token, style = l.token, l.style
								local color, word_match = l.color, l.word_match
								local P = lpeg.P
								local R = lpeg.R
								local S = lpeg.S

								-- Module table; it is filled in below and returned at the end of the file.
								local M = {}
								M._NAME = 'factor'
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								-- General building blocks.
								-- `pre` is a run of one or more ASCII capital letters and `post` is the same
								-- pattern under a second name.  The `opt_` variants also accept no letters.
								local pre = R('AZ')^1
								local post, opt_pre = pre, pre^-1
								local opt_post = opt_pre
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								-- Whitespace.
								-- One or more repetitions of l.space, tagged with the whitespace token name.
								local blank_run = l.space^1
								local ws = token(l.WHITESPACE, blank_run)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								-- Comments.
								-- An optional '#', a mandatory '!', then as many l.nonnewline matches as follow.
								local comment_start = P('#')^-1 * P('!')
								local comment_body = l.nonnewline^0
								local comment = token(l.COMMENT, comment_start * comment_body)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								-- Strings.
								-- Ordinary double-quoted strings, optionally prefixed by an upper-case word,
								-- with backslash as the escape character.
								local dq1_str = opt_pre * l.delimited_range('"', '\\')
								-- Triple-quoted strings, spelled out by hand.
								-- NOTE(review): l.delimited_range appears to use only single-character
								-- start/end delimiters, so passing '"""' would behave like plain '"' --
								-- confirm against the Scintillua version in use.
								local triple_quote = P('"""')
								local dq3_str = triple_quote *
								                (P('\\') * P(1) + (P(1) - triple_quote))^0 *
								                triple_quote
								-- dq3_str must be tried first: otherwise dq1_str matches the leading `""` of a
								-- triple-quoted string as an empty string and dq3_str is never reached.  When
								-- no closing `"""` is found, matching falls back to dq1_str as before.
								-- NOTE: this local shadows Lua's `string` library for the rest of the file; the
								-- name is kept because M._rules refers to it.
								local string = token(l.STRING, dq3_str + dq1_str)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								-- Numbers.
								-- Note that complex literals like C{ 1/3 27.3 } are not covered by this lexer.
								-- The C{ ... } notation is treated as an operator--to be specific a
								-- "constructor" (for want of a better term).
								local hex_digits       = R('09', 'af', 'AF')^1
								local binary           = P'-'^-1 * P'0b' * S'01'^1
								local octal            = P'-'^-1 * P'0o' * R'07'^1
								local decimal          = P'-'^-1 * R'09'^1
								-- hex_digits already repeats, so no further ^1 is needed here.
								local hexadecimal      = P'-'^-1 * P'0x' * hex_digits
								-- Prefixed forms are tried before plain decimal.
								local integer          = binary + octal + hexadecimal + decimal
								local ratio            = decimal * P'/' * decimal
								local dfloat_component = decimal * P'.' * decimal^-1
								local hfloat_component = hexadecimal * (P'.' * hex_digits^-1)^-1
								-- The last two alternatives spell out the special values 1/0., -1/0. and 0/0.
								-- for readability; `ratio * P'.'` above them already matches the same text.
								local float            = (dfloat_component * (S'eE' * decimal)^-1) +
								                         (hfloat_component * S'pP' * decimal)      +
								                         (ratio * P'.')                            +
								                         (P'-'^-1 * P'1/0.')                       +
								                         (P'0/0.')

								-- A number must be followed by whitespace or by the end of input; without the
								-- end-of-input alternative a number that ends the buffer is never tokenised.
								local number = token(l.NUMBER, (float + ratio + integer) * (#ws + -P(1)))
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								-- Keywords.
								-- Things like NAN:, USE:, USING:, POSTPONE:, etc. are considered keywords,
								-- as are similar words that end in #.  Patterns like <<WORD ... WORD>> are
								-- similarly considered to be "keywords" (for want of a better term).
								-- colon_words: an upper-case word ending in ':' or '#', or a bare run of
								-- ':' / ';' characters.
								local colon_words = pre * S':#' + S':;'^1
								-- angle_words: one or more '<' followed by an upper-case word, or an
								-- upper-case word followed by one or more '>'.
								local angle_words = (P'<'^1 * post) +
								                    (pre * P'>'^1)
								-- A keyword must be followed by whitespace or by the end of input; without the
								-- end-of-input alternative a keyword that ends the buffer is never tokenised.
								local keyword = token(l.KEYWORD,
								                      (colon_words + angle_words) * (#ws + -P(1)))
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								-- Operators.
								-- The usual suspects like braces, brackets, angle brackets, parens, etc. are
								-- considered to be operators.  They may, however, have prefixes like C{ ... }.
								-- Opening '{', '[' and '<' take an optional upper-case prefix; an opening '('
								-- requires one.  Closing delimiters stand alone.
								local constructor_words = opt_pre * P'{' + P'}' +
								                          opt_pre * P'[' + P']' +
								                          opt_pre * P'<' + P'>' +
								                          pre     * P'(' + P')'
								-- A range delimited by '(' and ')', as defined by l.delimited_range.
								local stack_declaration = l.delimited_range('()')
								local other_operators = S'+-*/<>'
								-- An operator must be followed by whitespace or by the end of input; without
								-- the end-of-input alternative an operator that ends the buffer is never
								-- tokenised.
								local operator = token(l.OPERATOR, (stack_declaration +
								                                   constructor_words +
								                                   other_operators) * (#ws + -P(1)))
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								-- Identifiers.
								-- Identifiers can be practically anything but whitespace.
								-- The punctuation set now also includes '.', "'", '|' and '\', which were
								-- missing although they occur in Factor word names (e.g. the word `.`).  The
								-- former stray 'X' was dropped; l.alnum already covers it.
								local symbols = S'`~!@#$%^&*()_-+={[<>]}:;,?/' + S".'|\\"
								-- An identifier must be followed by whitespace or by the end of input; without
								-- the end-of-input alternative a word that ends the buffer is never tokenised.
								local identifier = token(l.IDENTIFIER,
								                         (l.alnum + symbols)^1 * (#ws + -P(1)))
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								-- Build the rule list one entry at a time; each entry is a {name, pattern}
								-- pair and the entries keep the order in which they are added.
								local rules = {}
								local function add_rule(name, patt)
								  rules[#rules + 1] = {name, patt}
								end

								add_rule('keyword', keyword)
								add_rule('whitespace', ws)
								add_rule('string', string)
								add_rule('comment', comment)
								add_rule('number', number)
								add_rule('operator', operator)
								add_rule('identifier', identifier)
								add_rule('any_char', l.any_char)

								M._rules = rules

								return M
							 |