lua-lexer.lua 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431
  1. -- MIT License
  2. --
  3. -- Copyright (c) 2018 LoganDark
  4. --
  5. -- Permission is hereby granted, free of charge, to any person obtaining a copy
  6. -- of this software and associated documentation files (the "Software"), to deal
  7. -- in the Software without restriction, including without limitation the rights
  8. -- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. -- copies of the Software, and to permit persons to whom the Software is
  10. -- furnished to do so, subject to the following conditions:
  11. --
  12. -- The above copyright notice and this permission notice shall be included in all
  13. -- copies or substantial portions of the Software.
  14. --
  15. -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. -- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. -- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. -- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. -- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. -- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. -- SOFTWARE.
  -- Builds (or extends) a set-style lookup table.
  --
  -- src:  a string, in which case every single character becomes a key, or an
  --       array-like table, in which case every element at 1..#src becomes a
  --       key. Any other type adds nothing.
  -- list: optional table to add the keys to; a new table is created when it
  --       is omitted.
  -- Returns the table that received the keys, each mapped to `true`.
  --
  -- Note: this is declared without `local`, so it is a global function.
  function lookupify(src, list)
      local lookup = list or {}
      local kind = type(src)

      if kind == 'string' then
          -- one key per character
          local index = 1
          local length = #src
          while index <= length do
              lookup[string.sub(src, index, index)] = true
              index = index + 1
          end
      elseif kind == 'table' then
          -- one key per array element
          local index = 1
          local count = #src
          while index <= count do
              lookup[src[index]] = true
              index = index + 1
          end
      end

      return lookup
  end
  -- Character classes shared by the tables below.
  local base_ident = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'
  local base_digits = '0123456789'
  local base_operators = '+-*/^%#'

  -- Single-character lookup sets used by the lexer. Some sets also carry
  -- named sub-sets under multi-character keys (`start`, `hex`, `equality`,
  -- `operators`); those keys cannot collide with single-character lookups.
  local chars = {}

  chars.whitespace = lookupify(' \n\t\r')

  -- characters accepted directly after a backslash inside a quoted string
  chars.validEscapes = lookupify('abfnrtv"\'\\')

  -- identifier characters; `start` holds the ones allowed in first position
  chars.ident = lookupify(base_ident .. base_digits, {
      start = lookupify(base_ident)
  })

  -- decimal digits; `hex` additionally holds a-f / A-F
  chars.digits = lookupify(base_digits, {
      hex = lookupify(base_digits .. 'abcdefABCDEF')
  })

  -- punctuation; `equality` holds characters that may be followed by '=',
  -- `operators` holds the ones the lexer reports as operators
  chars.symbols = lookupify(base_operators .. ',{}[]();.:', {
      equality = lookupify('~=><'),
      operators = lookupify(base_operators)
  })

  -- Reserved words, split into structural keywords and literal values.
  local keywords = {}

  keywords.structure = lookupify({
      'and', 'break', 'do', 'else', 'elseif', 'end', 'for', 'function',
      'goto', 'if', 'in', 'local', 'not', 'or', 'repeat', 'return', 'then',
      'until', 'while'
  })

  keywords.values = lookupify({
      'true', 'false', 'nil'
  })
  -- Lexer entry point: splits Lua source into per-line lists of tokens.
  --
  -- text: string containing the Lua source to tokenize.
  --
  -- Returns an array with one entry per source line (lines are split at
  -- '\n'). Each entry is an array of token tables with the fields:
  --   type     - one of 'whitespace', 'comment', 'string_start', 'string',
  --              'escape', 'string_end', 'keyword', 'value', 'ident',
  --              'number', 'symbol', 'vararg', 'operator', 'label_start',
  --              'label', 'label_end' or 'unidentified'
  --   data     - the source text covered by the token
  --   posFirst - token start index minus the total length of the text
  --              pushed on earlier lines
  --   posLast  - token end index, computed the same way
  -- Newline characters never appear as tokens in the result, and adjacent
  -- tokens of the same type on one line are merged into a single token.
  return function(text)
      local pos = 1       -- index of the next unread character
      local start = 1     -- index where the token being built begins
      local buffer = {}   -- tokens collected for the current line
      local lines = {}    -- completed lines

      -- Returns the character `delta` positions after `pos` ('' when that
      -- index is beyond the end of the text).
      local function look(delta)
          delta = pos + (delta or 0)
          return text:sub(delta, delta)
      end

      -- Consumes one character and returns it.
      local function get()
          pos = pos + 1
          return look(-1)
      end

      -- Called right after a '[' was consumed. If the upcoming characters are
      -- zero or more '=' followed by '[', consumes them and returns the
      -- number of '='; otherwise consumes nothing and returns nil.
      local function getDataLevel()
          local num = 0
          while look(num) == '=' do
              num = num + 1
          end
          if look(num) == '[' then
              pos = pos + num + 1
              return num
          end
      end

      -- Text between the start of the pending token and the read position.
      local function getCurrentTokenText()
          return text:sub(start, pos - 1)
      end

      local currentLineLength = 0  -- length of the text pushed on this line
      local lineoffset = 0         -- length of the text pushed on earlier lines

      -- Emits the pending text (or `data` when given) as a token of type
      -- `kind`. If the last token on the line has the same type the text is
      -- appended to it instead. Empty tokens are created but not stored.
      -- Returns the token table that was created or extended.
      local function pushToken(kind, data)
          data = data or getCurrentTokenText()
          local tk = buffer[#buffer]
          if not tk or tk.type ~= kind then
              tk = {
                  type = kind,
                  data = data,
                  posFirst = start - lineoffset,
                  posLast = pos - 1 - lineoffset
              }
              if tk.data ~= '' then
                  buffer[#buffer + 1] = tk
              end
          else
              tk.data = tk.data .. data
              tk.posLast = tk.posLast + data:len()
          end
          currentLineLength = currentLineLength + data:len()
          start = pos
          return tk
      end

      -- Finishes the current line: stores its tokens, consumes the newline
      -- character at `pos` and starts an empty buffer for the next line.
      local function newline()
          lines[#lines + 1] = buffer
          buffer = {}
          get()
          -- Pushing the newline keeps `start` and the length bookkeeping in
          -- sync; the token itself is then dropped from the new line.
          pushToken('newline')
          buffer[1] = nil
          lineoffset = lineoffset + currentLineLength
          currentLineLength = 0
      end

      -- Reads the body of a long bracket ([[...]], [=[...]=], ...) of the
      -- given level, emitting one `kind` token per line it crosses. On
      -- return `pos` sits on the first ']' of the closing bracket, or past
      -- the end of the text when the bracket is never closed.
      local function getData(level, kind)
          while true do
              local char = get()
              if char == '' then
                  return
              elseif char == '\n' then
                  pos = pos - 1
                  pushToken(kind)
                  newline()
              elseif char == ']' then
                  local valid = true
                  for _ = 1, level do
                      if look() == '=' then
                          pos = pos + 1
                      else
                          valid = false
                          break
                      end
                  end
                  if valid and look() == ']' then
                      -- rewind to the first ']' so the caller decides which
                      -- token the closing bracket belongs to
                      pos = pos - level - 1
                      return
                  end
              end
          end
      end

      -- Consumes whitespace (including line breaks) and emits it as
      -- 'whitespace' tokens.
      local function chompWhitespace()
          while true do
              local char = look()
              if char == '\n' then
                  pushToken('whitespace')
                  newline()
              elseif chars.whitespace[char] then
                  pos = pos + 1
              else
                  break
              end
          end
          pushToken('whitespace')
      end

      -- Consumes the rest of the current line as a 'comment' token and, when
      -- the line ends in '\n', moves on to the next line.
      local function lineComment()
          while true do
              local char2 = get()
              if char2 == '' or char2 == '\n' then
                  pos = pos - 1
                  pushToken('comment')
                  if char2 == '\n' then
                      newline()
                  end
                  break
              end
          end
      end

      while true do
          chompWhitespace()
          local char = get()
          if char == '' then
              break
          elseif char == '-' and look() == '-' then
              -- comment: either a long-bracket block or the rest of the line
              pos = pos + 1
              local level
              if look() == '[' then
                  pos = pos + 1
                  level = getDataLevel()
              end
              if level then
                  getData(level, 'comment')
                  pos = pos + level + 2  -- include the closing bracket
                  pushToken('comment')
              else
                  lineComment()
              end
          elseif char == '\'' or char == '"' then
              -- quoted string, split into string / escape pieces
              pushToken('string_start')
              while true do
                  local char2 = get()
                  if char2 == '\\' then
                      pos = pos - 1
                      pushToken('string')
                      get()
                      local char3 = get()
                      if chars.digits[char3] then
                          -- decimal escape: up to three digits in total
                          for _ = 1, 2 do
                              if chars.digits[look()] then
                                  pos = pos + 1
                              end
                          end
                      elseif char3 == 'x' then
                          -- hex escape needs exactly two hex digits
                          if chars.digits.hex[look()] and chars.digits.hex[look(1)] then
                              pos = pos + 2
                          else
                              pushToken('unidentified')
                          end
                      elseif char3 == '\n' then
                          -- backslash followed by a line break
                          pos = pos - 1
                          pushToken('escape')
                          newline()
                      elseif not chars.validEscapes[char3] then
                          pushToken('unidentified')
                      end
                      pushToken('escape')
                  elseif char2 == '\n' then
                      -- unterminated string: stop at the end of the line
                      pos = pos - 1
                      pushToken('string')
                      newline()
                      break
                  elseif char2 == char or char2 == '' then
                      pos = pos - 1
                      pushToken('string')
                      get()
                      break
                  end
              end
              pushToken('string_end')
          elseif chars.ident.start[char] then
              while chars.ident[look()] do
                  pos = pos + 1
              end
              local word = getCurrentTokenText()
              if keywords.structure[word] then
                  pushToken('keyword')
              elseif keywords.values[word] then
                  pushToken('value')
              else
                  pushToken('ident')
              end
          elseif chars.digits[char] or (char == '.' and chars.digits[look()]) then
              -- Lua accepts both '0x' and '0X' as the hexadecimal prefix.
              if char == '0' and look():lower() == 'x' then
                  pos = pos + 1
                  while chars.digits.hex[look()] do
                      pos = pos + 1
                  end
              else
                  while chars.digits[look()] do
                      pos = pos + 1
                  end
                  if look() == '.' then
                      pos = pos + 1
                      while chars.digits[look()] do
                          pos = pos + 1
                      end
                  end
                  if look():lower() == 'e' then
                      pos = pos + 1
                      -- the exponent may carry either sign (1e-5, 1e+5)
                      local sign = look()
                      if sign == '-' or sign == '+' then
                          pos = pos + 1
                      end
                      while chars.digits[look()] do
                          pos = pos + 1
                      end
                  end
              end
              pushToken('number')
          elseif char == '[' then
              local level = getDataLevel()
              if level then
                  -- long-bracket string
                  pushToken('string_start')
                  getData(level, 'string')
                  pushToken('string')
                  pos = pos + level + 2
                  pushToken('string_end')
              else
                  pushToken('symbol')
              end
          elseif char == '.' then
              if look() == '.' then
                  pos = pos + 1
                  if look() == '.' then
                      pos = pos + 1
                  end
              end
              if getCurrentTokenText():len() == 3 then
                  pushToken('vararg')
              else
                  pushToken('symbol')
              end
          elseif char == ':' and look() == ':' then
              -- label: '::' name '::'
              get()
              pushToken('label_start')
              chompWhitespace()
              if chars.ident.start[look()] then
                  get()
                  while chars.ident[look()] do
                      get()
                  end
                  pushToken('label')
                  chompWhitespace()
                  if look() == ':' and look(1) == ':' then
                      get()
                      get()
                      pushToken('label_end')
                  end
              end
          elseif chars.symbols.equality[char] then
              if look() == '=' then
                  pos = pos + 1
              end
              pushToken('operator')
          elseif chars.symbols[char] then
              if chars.symbols.operators[char] then
                  pushToken('operator')
              else
                  pushToken('symbol')
              end
          else
              pushToken('unidentified')
          end
      end

      lines[#lines + 1] = buffer
      return lines
  end