// Written in the D programming language

/**
 * $(H2 Summary)
 * This module contains a range-based compile-time _lexer generator.
 *
 * $(H2 Overview)
 * The _lexer generator consists of a template mixin, $(LREF Lexer), along with
 * several helper templates for generating such things as token identifiers.
 *
 * To write a _lexer using this API:
 * $(OL
 *     $(LI Create the string array constants for your language.
 *         $(UL
 *             $(LI $(LINK2 #.staticTokens, staticTokens))
 *             $(LI $(LINK2 #.dynamicTokens, dynamicTokens))
 *             $(LI $(LINK2 #.possibleDefaultTokens, possibleDefaultTokens))
 *             $(LI $(LINK2 #.tokenHandlers, tokenHandlers))
 *         ))
 *     $(LI Create aliases for the various token and token identifier types
 *         specific to your language.
 *         $(UL
 *             $(LI $(LREF TokenIdType))
 *             $(LI $(LREF tokenStringRepresentation))
 *             $(LI $(LREF TokenStructure))
 *             $(LI $(LREF TokenId))
 *         ))
 *     $(LI Create a struct that mixes in the Lexer template mixin and
 *         implements the necessary functions.
 *         $(UL
 *             $(LI $(LREF Lexer))
 *         ))
 * )
 * Examples:
 * $(UL
 * $(LI A _lexer for D is available $(LINK2 https://github.com/Hackerpilot/Dscanner/blob/master/std/d/lexer.d, here).)
 * $(LI A _lexer for Lua is available $(LINK2 https://github.com/Hackerpilot/lexer-demo/blob/master/lualexer.d, here).)
 * $(LI A _lexer for JSON is available $(LINK2 https://github.com/Hackerpilot/lexer-demo/blob/master/jsonlexer.d, here).)
 * )
 * $(DDOC_ANCHOR TemplateParameters) $(H2 Template Parameter Definitions)
 * $(DL
 * $(DT $(DDOC_ANCHOR defaultTokenFunction) $(B defaultTokenFunction))
 * $(DD A function that serves as the default token lexing function. For most
 *     languages this will be the identifier lexing function.)
 * $(DT $(DDOC_ANCHOR tokenSeparatingFunction) $(B tokenSeparatingFunction))
 * $(DD A function that is able to determine if an identifier/keyword has come
 *     to an end. This function must return $(D_KEYWORD bool) and take a single
 *     $(D_KEYWORD size_t) argument representing the number of bytes to skip
 *     over before looking for a separating character.)
 * $(DT $(DDOC_ANCHOR staticTokens) $(B staticTokens))
 * $(DD A listing of the tokens whose exact value never changes and which cannot
 *     possibly be a token handled by the default token lexing function. The
 *     most common example of this kind of token is an operator such as
 *     $(D_STRING "*"), or $(D_STRING "-") in a programming language.)
 * $(DT $(DDOC_ANCHOR dynamicTokens) $(B dynamicTokens))
 * $(DD A listing of tokens whose value is variable, such as whitespace,
 *     identifiers, number literals, and string literals.)
 * $(DT $(DDOC_ANCHOR possibleDefaultTokens) $(B possibleDefaultTokens))
 * $(DD A listing of tokens that could possibly be one of the tokens handled by
 *     the default token handling function. A common example of this is
 *     a keyword such as $(D_STRING "for"), which looks like the beginning of
 *     the identifier $(D_STRING "fortunate"). $(B tokenSeparatingFunction) is
 *     called to determine if the character after the $(D_STRING 'r') separates
 *     the identifier, indicating that the token is $(D_STRING "for"), or if
 *     lexing should be turned over to the $(B defaultTokenFunction).)
 * $(DT $(DDOC_ANCHOR tokenHandlers) $(B tokenHandlers))
 * $(DD A mapping of prefixes to custom token handling function names. The
 *     generated _lexer will search for the even-index elements of this array,
 *     and then call the function whose name is the element immediately after
 *     the even-indexed element. This is used for lexing complex tokens whose
 *     prefix is fixed.)
 * )
 *
 * Here are some example constants for a simple calculator _lexer:
 * ---
 * // There are a near infinite number of valid number literals, so numbers are
 * // dynamic tokens.
 * enum string[] dynamicTokens = ["numberLiteral", "whitespace"];
 *
 * // The operators are always the same, and cannot start a numberLiteral, so
 * // they are staticTokens
 * enum string[] staticTokens = ["-", "+", "*", "/"];
 *
 * // In this simple example there are no keywords or other tokens that could
 * // look like dynamic tokens, so this is blank.
 * enum string[] possibleDefaultTokens = [];
 *
 * // If any whitespace character or digit is encountered, pass lexing over to
 * // our custom handler functions. These will be demonstrated in an example
 * // later on.
 * enum string[] tokenHandlers = [
 *     "0", "lexNumber",
 *     "1", "lexNumber",
 *     "2", "lexNumber",
 *     "3", "lexNumber",
 *     "4", "lexNumber",
 *     "5", "lexNumber",
 *     "6", "lexNumber",
 *     "7", "lexNumber",
 *     "8", "lexNumber",
 *     "9", "lexNumber",
 *     " ", "lexWhitespace",
 *     "\n", "lexWhitespace",
 *     "\t", "lexWhitespace",
 *     "\r", "lexWhitespace"
 * ];
 * ---
 *
 * Copyright: Brian Schott 2013
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Authors: Brian Schott, with ideas shamelessly stolen from Andrei Alexandrescu
 * Source: $(PHOBOSSRC std/experimental/_lexer.d)
 */

module std.experimental.lexer;

/**
 * Template for determining the type used for a token type.
 *
 * Selects the smallest unsigned integral type that is able to hold the value
 * staticTokens.length + dynamicTokens.length + possibleDefaultTokens.length.
 * For example if there are 20 static tokens, 30 dynamic tokens,
 * and 10 possible default tokens, this template will alias itself to ubyte,
 * as 20 + 30 + 10 < $(D_KEYWORD ubyte).max.
 * Examples:
 * ---
 * // In our calculator example this means that IdType is an alias for ubyte.
 * alias IdType = TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens);
 * ---
 */
template TokenIdType(alias staticTokens, alias dynamicTokens,
    alias possibleDefaultTokens)
{
    // The +1 reserves an identifier for the end-of-input token ("\0").
    immutable tokenCount = staticTokens.length + dynamicTokens.length
        + possibleDefaultTokens.length + 1;
    static if (tokenCount <= ubyte.max)
        alias TokenIdType = ubyte;
    else static if (tokenCount <= ushort.max)
        alias TokenIdType = ushort;
    else static if (tokenCount <= uint.max)
        alias TokenIdType = uint;
    else
        static assert (false, "The number of tokens must be less than uint.max");
}

/**
 * Looks up the string representation of the given token type.
 *
 * This is the opposite of the function of the TokenId template.
 * Params: type = the token type identifier
 * Examples:
 * ---
 * alias str = tokenStringRepresentation!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens);
 * assert (str(tok!"*") == "*");
 * ---
 * See_also: $(LREF TokenId)
 */
string tokenStringRepresentation(IdType, alias staticTokens, alias dynamicTokens,
    alias possibleDefaultTokens)(IdType type) pure nothrow @property @nogc @safe
{
    // hax: force the concatenation to happen once, at compile time, in a
    // @trusted context so the result can be stored as a static immutable.
    static auto f() pure nothrow @trusted
    {
        return cast(immutable) staticTokens ~ dynamicTokens ~ possibleDefaultTokens;
    }

    static immutable tokens = f();

    if (type == 0)
        return "!ERROR!";                // id 0 is reserved for the error token
    else if (type < tokens.length + 1)
        return tokens[type - 1];         // ids are offset by 1 (0 is the error token)
    else
        return null;                     // out of range, including the "\0" end token
}

unittest
{
    alias IdType = TokenIdType!(["foo"], ["bar"], ["doo"]);
    enum tok(string token) = TokenId!(IdType, ["foo"], ["bar"], ["doo"], token);
    alias str = tokenStringRepresentation!(IdType, ["foo"], ["bar"], ["doo"]);

    static assert (str(tok!"foo") == "foo");
    static assert (str(tok!"bar") == "bar");
    static assert (str(tok!"doo") == "doo");
}

/**
 * Generates the token type identifier for
the given symbol.
 *
 * There are two special cases:
 * $(UL
 *     $(LI If symbol is $(D_STRING ""), then the token identifier will be 0)
 *     $(LI If symbol is $(D_STRING "\0"), then the token identifier will be the
 *         maximum valid token type identifier)
 * )
 * In all cases this template will alias itself to a constant of type IdType.
 * This template will fail at compile time if $(D_PARAM symbol) is not one of
 * the staticTokens, dynamicTokens, or possibleDefaultTokens.
 * Examples:
 * ---
 * template tok(string symbol)
 * {
 *     alias tok = TokenId!(IdType, staticTokens, dynamicTokens,
 *         possibleDefaultTokens, symbol);
 * }
 * // num and plus are of type ubyte.
 * IdType plus = tok!"+";
 * IdType num = tok!"numberLiteral";
 * ---
 */
template TokenId(IdType, alias staticTokens, alias dynamicTokens,
    alias possibleDefaultTokens, string symbol)
{
    import std.algorithm : countUntil;

    // The concatenation order below defines the numeric identifier of every
    // token: static tokens first, then dynamic, then possible-default.
    enum tokens = staticTokens ~ dynamicTokens ~ possibleDefaultTokens;

    static if (symbol == "")
    {
        // Identifier 0 is reserved for the error token.
        enum id = 0;
        alias TokenId = id;
    }
    else static if (symbol == "\0")
    {
        // The maximum identifier is reserved for the end-of-input token.
        enum id = 1 + tokens.length;
        alias TokenId = id;
    }
    else
    {
        enum position = tokens.countUntil(symbol);
        static if (position != -1)
        {
            // Offset by one so that identifier 0 stays free for the error token.
            enum id = position + 1;
            static assert (id >= 0 && id < IdType.max, "Invalid token: " ~ symbol);
            alias TokenId = id;
        }
        else
            static assert (0, "Invalid token: " ~ symbol);
    }
}

/**
 * The token that is returned by the lexer.
 * Params:
 *     IdType = The D type of the "type" token type field.
 *     extraFields = A string containing D code for any extra fields that should
 *         be included in the token structure body. This string is passed
 *         directly to a mixin statement.
 * Examples:
 * ---
 * // No extra struct fields are desired in this example, so leave it blank.
252 * alias Token = TokenStructure!(IdType, ""); 253 * Token minusToken = Token(tok!"-"); 254 * --- 255 */ 256 struct TokenStructure(IdType, string extraFields = "") 257 { 258 public pure nothrow @safe @nogc 259 { 260 261 bool opEquals(ref const typeof(this) other) const 262 { 263 return this.type == other.type && this.text == other.text; 264 } 265 266 /** 267 * Returns: true if the token has the given type, false otherwise. 268 */ 269 bool opEquals(IdType type) const 270 { 271 return this.type == type; 272 } 273 274 /** 275 * Constructs a token from a token type. 276 * Params: type = the token type 277 */ 278 this(IdType type) 279 { 280 this.type = type; 281 } 282 283 /** 284 * Constructs a token. 285 * Params: 286 * type = the token type 287 * text = the text of the token, which may be null 288 * line = the line number at which this token occurs 289 * column = the column number at which this token occurs 290 * index = the byte offset from the beginning of the input at which this 291 * token occurs 292 */ 293 this(IdType type, string text, size_t line, size_t column, size_t index) 294 { 295 this.text = text; 296 this.line = line; 297 this.column = column; 298 this.type = type; 299 this.index = index; 300 } 301 302 /** 303 * The _text of the token. 304 */ 305 string text; 306 307 /** 308 * The _line number at which this token occurs. 309 */ 310 size_t line; 311 312 /** 313 * The _column number at which this token occurs. This is measured in bytes 314 * and may not be correct when tab characters are involved. 315 */ 316 size_t column; 317 318 /** 319 * The byte offset from the beginning of the input at which this token 320 * occurs. 321 */ 322 size_t index; 323 324 /** 325 * The token type. 326 */ 327 IdType type; 328 329 } 330 331 mixin (extraFields); 332 } 333 334 /** 335 * The implementation of the _lexer is contained within this mixin template. 336 * 337 * To use it, this template should be mixed in to a struct that represents the 338 * _lexer for your language. 
This struct should implement the following methods: 339 * $(UL 340 * $(LI popFront, which should call this mixin's _popFront() and 341 * additionally perform any token filtering or shuffling you deem 342 * necessary. For example, you can implement popFront to skip comment or 343 * tokens.) 344 * $(LI A function that serves as the default token lexing function. For 345 * most languages this will be the identifier lexing function. This 346 * should then be passed to the $(LREF Lexer) template mixin as the 347 * $(LINK2 #.defaultTokenFunction defaultTokenFunction) template 348 * parameter.) 349 * $(LI A function that is able to determine if an identifier/keyword has 350 * come to an end. This function must return $(D_KEYWORD bool) and take 351 * a single $(D_KEYWORD size_t) argument representing the number of 352 * bytes to skip over before looking for a separating character.) 353 * $(LI Any functions referred to in the tokenHandlers template paramater. 354 * These functions must be marked $(D_KEYWORD pure nothrow), take no 355 * arguments, and return a token) 356 * $(LI A constructor that initializes the range field as well as calls 357 * popFront() exactly once (to initialize the _front field).) 
 * )
 * Params:
 *     Token = $(LREF TokenStructure)
 *     defaultTokenFunction = $(LINK2 #.defaultTokenFunction, defaultTokenFunction)
 *     tokenSeparatingFunction = $(LINK2 #.tokenSeparatingFunction, tokenSeparatingFunction)
 *     staticTokens = $(LINK2 #.staticTokens, staticTokens)
 *     dynamicTokens = $(LINK2 #.dynamicTokens, dynamicTokens)
 *     possibleDefaultTokens = $(LINK2 #.possibleDefaultTokens, possibleDefaultTokens)
 *     tokenHandlers = $(LINK2 #.tokenHandlers, tokenHandlers)
 * Examples:
 * ---
 * struct CalculatorLexer
 * {
 *     mixin Lexer!(Token, defaultTokenFunction, isSeparating,
 *         staticTokens, dynamicTokens, possibleDefaultTokens, tokenHandlers);
 *
 *     this (ubyte[] bytes)
 *     {
 *         this.range = LexerRange(bytes);
 *         popFront();
 *     }
 *
 *     void popFront() pure
 *     {
 *         _popFront();
 *     }
 *
 *     Token lexNumber() pure nothrow @safe
 *     {
 *         // implementation goes here
 *     }
 *
 *     Token lexWhitespace() pure nothrow @safe
 *     {
 *         // implementation goes here
 *     }
 *
 *     Token defaultTokenFunction() pure nothrow @safe
 *     {
 *         // There is no default token in the example calculator language, so
 *         // this is always an error.
 *         range.popFront();
 *         return Token(tok!"");
 *     }
 *
 *     bool isSeparating(size_t offset) pure nothrow @safe
 *     {
 *         // For this example language, always return true.
406 * return true; 407 * } 408 * } 409 * --- 410 */ 411 mixin template Lexer(Token, alias defaultTokenFunction, 412 alias tokenSeparatingFunction, alias staticTokens, alias dynamicTokens, 413 alias possibleDefaultTokens, alias tokenHandlers) 414 { 415 private alias _IDType = typeof(Token.type); 416 private enum _tok(string symbol) = TokenId!(_IDType, staticTokens, dynamicTokens, possibleDefaultTokens, symbol); 417 418 static assert (tokenHandlers.length % 2 == 0, "Each pseudo-token must" 419 ~ " have a corresponding handler function name."); 420 421 static string generateMask(const ubyte[] arr) 422 { 423 import std.string : format; 424 ulong u; 425 for (size_t i = 0; i < arr.length && i < 8; i++) 426 { 427 u |= (cast(ulong) arr[i]) << (i * 8); 428 } 429 return format("0x%016x", u); 430 } 431 432 private static string generateByteMask(size_t l) 433 { 434 import std.string : format; 435 return format("0x%016x", ulong.max >> ((8 - l) * 8)); 436 } 437 438 private static size_t calcSplitCount(size_t a, size_t b) pure nothrow 439 { 440 int i; 441 while (true) 442 { 443 i++; 444 a /= 2; 445 if (a < b) 446 break; 447 } 448 return i; 449 } 450 451 private static char[] getBeginningChars(string[] allTokens) 452 { 453 char[] beginningChars; 454 for (size_t i = 0; i < allTokens.length; i++) 455 { 456 if (allTokens[i].length == 0) 457 continue; 458 beginningChars ~= allTokens[i][0]; 459 size_t j = i + 1; 460 while (j < allTokens.length && allTokens[i][0] == allTokens[j][0]) 461 j++; 462 i = j - 1; 463 } 464 return beginningChars; 465 } 466 467 private static string generateStatements() 468 { 469 import std.algorithm : sort; 470 import std.range : stride; 471 472 string[] pseudoTokens = array(tokenHandlers.stride(2)); 473 string[] allTokens = array(sort(staticTokens ~ possibleDefaultTokens ~ pseudoTokens).uniq()); 474 // Array consisting of a sorted list of the first characters of the 475 // tokens. 
476 char[] beginningChars = getBeginningChars(allTokens); 477 size_t i = calcSplitCount(beginningChars.length, 8); 478 return generateStatementsStep(allTokens, pseudoTokens, beginningChars, i); 479 } 480 481 private static string generateStatementsStep(string[] allTokens, 482 string[] pseudoTokens, char[] chars, size_t i, string indent = "") 483 { 484 import std.string : format; 485 string code; 486 if (i > 0) 487 { 488 size_t p = chars.length / 2; 489 code ~= indent ~ format("if (f < 0x%02x) // %s \n%s{\n", chars[p], chars[p], indent); 490 code ~= generateStatementsStep(allTokens, pseudoTokens, chars[0 .. p], i - 1, indent ~ " "); 491 code ~= indent ~ "}\n" ~ indent ~ "else\n" ~ indent ~ "{\n"; 492 code ~= generateStatementsStep(allTokens, pseudoTokens, chars[p .. $], i - 1, indent ~ " "); 493 code ~= indent ~ "}\n"; 494 } 495 else 496 { 497 code ~= indent ~ "switch (f)\n" ~ indent ~ "{\n"; 498 foreach (char c; chars) 499 { 500 size_t begin; 501 size_t end; 502 for (size_t j = 0; j < allTokens.length; j++) 503 { 504 if (allTokens[j].length == 0 || allTokens[j][0] != c) 505 continue; 506 begin = j; 507 end = j + 1; 508 while (end < allTokens.length && allTokens[begin][0] == allTokens[end][0]) 509 end++; 510 break; 511 } 512 code ~= format("%scase 0x%02x:\n", indent, c); 513 code ~= printCase(allTokens[begin .. 
end], pseudoTokens, indent ~ " "); 514 } 515 code ~= indent ~ "default: goto _defaultTokenFunction;\n"; 516 code ~= indent ~ "}\n"; 517 } 518 519 return code; 520 } 521 522 private static string printCase(string[] tokens, string[] pseudoTokens, string indent) 523 { 524 import std.array : array; 525 import std.algorithm : countUntil; 526 import std.conv : text; 527 string[] sortedTokens = array(sort!"a.length > b.length"(tokens)); 528 529 if (tokens.length == 1 && tokens[0].length == 1) 530 { 531 if (pseudoTokens.countUntil(tokens[0]) >= 0) 532 { 533 return indent ~ tokenHandlers[tokenHandlers.countUntil(tokens[0]) + 1] 534 ~ "(token);\n" ~ indent ~ "return;\n"; 535 } 536 else if (staticTokens.countUntil(tokens[0]) >= 0) 537 { 538 return indent ~ "range.index++; range.column++;\n" 539 ~ indent ~ "token = Token(_tok!\"" ~ escape(tokens[0]) ~ "\", null, line, column, index);\n" 540 ~ indent ~ "return;\n"; 541 } 542 else if (pseudoTokens.countUntil(tokens[0]) >= 0) 543 { 544 return indent ~ tokenHandlers[tokenHandlers.countUntil(tokens[0]) + 1] 545 ~ "(token);\n" ~ indent ~ "return;\n"; 546 547 } 548 } 549 550 string code; 551 552 bool insertTrailingGoto = true; 553 foreach (i, token; sortedTokens) 554 { 555 immutable mask = generateMask(cast (const ubyte[]) token); 556 if (token.length >= 8) 557 code ~= indent ~ "if (frontBytes == " ~ mask ~ ")\n"; 558 else if (token.length != 1) 559 code ~= indent ~ "if ((frontBytes & " ~ generateByteMask(token.length) ~ ") == " ~ mask ~ ")\n"; 560 if (token.length != 1) 561 code ~= indent ~ "{\n"; 562 if (pseudoTokens.countUntil(token) >= 0) 563 { 564 if (token.length <= 8) 565 { 566 if (token.length == 1) 567 insertTrailingGoto = false; 568 code ~= (token.length == 1 ? indent : indent ~ " ") 569 ~ tokenHandlers[tokenHandlers.countUntil(token) + 1] 570 ~ "(token);\n"; 571 code ~= (token.length == 1 ? 
indent : indent ~ " ") ~ "return;\n"; 572 } 573 else 574 { 575 code ~= indent ~ " if (range.startsWith(cast (ubyte[]) \"" ~ escape(token) ~ "\")\n"; 576 code ~= indent ~ " " 577 ~ tokenHandlers[tokenHandlers.countUntil(token) + 1] 578 ~ "();\n"; 579 code ~= indent ~ " return;\n"; 580 } 581 } 582 else if (staticTokens.countUntil(token) >= 0) 583 { 584 if (token.length <= 8) 585 { 586 if (token.length == 1) 587 insertTrailingGoto = false; 588 code ~= indent ~ (token.length != 1 ? " " : "") ~ "range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n"; 589 code ~= indent ~ (token.length != 1 ? " " : "") ~ "token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; 590 code ~= indent ~ (token.length != 1 ? " " : "") ~ "return;\n"; 591 } 592 else 593 { 594 code ~= indent ~ " pragma(msg, \"long static tokens not supported\"); // " ~ escape(token) ~ "\n"; 595 } 596 } 597 else 598 { 599 // possible default 600 if (token.length <= 8) 601 { 602 code ~= indent ~ " if (tokenSeparatingFunction(" ~ text(token.length) ~ "))\n"; 603 code ~= indent ~ " {\n"; 604 code ~= indent ~ " range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n"; 605 code ~= indent ~ " token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; 606 code ~= indent ~ " return;\n"; 607 code ~= indent ~ " }\n"; 608 code ~= indent ~ " else\n"; 609 code ~= indent ~ " goto _defaultTokenFunction;\n"; 610 } 611 else 612 { 613 code ~= indent ~ " if (range.startsWith(cast (ubyte[]) \"" ~ escape(token) ~"\") && isSeparating(" ~ text(token.length) ~ "))\n"; 614 code ~= indent ~ " {\n"; 615 code ~= indent ~ " range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n"; 616 code ~= indent ~ " token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n"; 617 code ~= indent ~ " return;\n"; 618 code ~= indent ~ " }\n"; 619 code ~= indent ~ " else\n"; 620 code ~= indent ~ " goto 
_defaultTokenFunction;\n"; 621 } 622 } 623 if (token.length != 1) 624 { 625 code ~= indent ~ "}\n"; 626 } 627 } 628 if (insertTrailingGoto) 629 code ~= indent ~ "goto _defaultTokenFunction;\n"; 630 return code; 631 } 632 633 /** 634 * Implements the range primitive _front. 635 */ 636 ref const(Token) front()() pure nothrow const @property @safe 637 { 638 return _front; 639 } 640 641 /** 642 * Advances the lexer to the next token and stores the new current token in 643 * the _front variable. 644 */ 645 void _popFront()() pure nothrow @safe 646 { 647 advance(_front); 648 } 649 650 /** 651 * Implements the range primitive _empty. 652 */ 653 bool empty()() pure const nothrow @property @safe @nogc 654 { 655 return _front.type == _tok!"\0"; 656 } 657 658 static string escape(string input) pure @trusted 659 { 660 string retVal; 661 foreach (ubyte c; cast(ubyte[]) input) 662 { 663 switch (c) 664 { 665 case '\\': retVal ~= `\\`; break; 666 case '"': retVal ~= `\"`; break; 667 case '\'': retVal ~= `\'`; break; 668 case '\t': retVal ~= `\t`; break; 669 case '\n': retVal ~= `\n`; break; 670 case '\r': retVal ~= `\r`; break; 671 default: retVal ~= c; break; 672 } 673 } 674 return retVal; 675 } 676 677 enum tokenSearch = generateStatements(); 678 679 static ulong getFront(const ubyte[] arr) pure nothrow @trusted 680 { 681 static union ByteArr { ulong l; ubyte[8] arr; } 682 static assert(ByteArr.sizeof == ulong.sizeof); 683 ByteArr b; 684 b.l = ulong.max; 685 b.arr[0 .. arr.length] = arr[]; 686 return b.l; 687 } 688 689 void advance(ref Token token) pure nothrow @trusted 690 { 691 if (range.index >= range.bytes.length) 692 { 693 token.type = _tok!"\0"; 694 return; 695 } 696 immutable size_t index = range.index; 697 immutable size_t column = range.column; 698 immutable size_t line = range.line; 699 immutable ulong frontBytes = range.index + 8 <= range.bytes.length 700 ? getFront(range.bytes[range.index .. range.index + 8]) 701 : getFront(range.bytes[range.index .. 
$]); 702 ubyte f = cast(ubyte) frontBytes; 703 // pragma(msg, tokenSearch); 704 mixin(tokenSearch); 705 _defaultTokenFunction: 706 defaultTokenFunction(token); 707 } 708 709 /** 710 * The lexer input. 711 */ 712 LexerRange range; 713 714 /** 715 * The token that is currently at the front of the range. 716 */ 717 Token _front; 718 } 719 720 /** 721 * Range structure that wraps the _lexer's input. 722 */ 723 struct LexerRange 724 { 725 // TODO: When D gets @forceinline the template inline hack (i.e 726 // `void front()() { ... }` )should be removed. 727 728 public nothrow pure @safe @nogc: 729 /** 730 * Params: 731 * bytes = the _lexer input 732 * index = the initial offset from the beginning of $(D_PARAM bytes) 733 * column = the initial _column number 734 * line = the initial _line number 735 */ 736 this(const(ubyte)[] bytes, size_t index = 0, size_t column = 1, size_t line = 1) 737 { 738 this.bytes = bytes; 739 this.index = index; 740 this.column = column; 741 this.line = line; 742 } 743 744 /** 745 * Returns: a mark at the current position that can then be used with slice. 746 */ 747 size_t mark()() const 748 { 749 return index; 750 } 751 752 /** 753 * Sets the range to the given position. 754 * Params: m = the position to seek to 755 */ 756 void seek()(size_t m) 757 { 758 index = m; 759 } 760 761 /** 762 * Returns a slice of the input byte array between the given mark and the 763 * current position. 764 * Params m = the beginning index of the slice to return 765 */ 766 const(ubyte)[] slice()(size_t m) const 767 { 768 return bytes[m .. index]; 769 } 770 771 /** 772 * Implements the range primitive _empty. 773 */ 774 bool empty()() const 775 { 776 return index >= bytes.length; 777 } 778 779 /** 780 * Implements the range primitive _front. 781 */ 782 ubyte front()() const 783 { 784 return bytes[index]; 785 } 786 787 /** 788 * Returns: the current item as well as the items $(D_PARAM p) items ahead. 
789 */ 790 const(ubyte)[] peek(size_t p) const 791 { 792 return index + p + 1 > bytes.length 793 ? bytes[index .. $] 794 : bytes[index .. index + p + 1]; 795 } 796 797 /** 798 * Returns: true if the range starts with the given byte sequence 799 */ 800 bool startsWith(const(ubyte[]) needle) const 801 { 802 if (needle.length + index > bytes.length) 803 return false; 804 foreach (i; 0 .. needle.length) 805 if (needle[i] != bytes[index + i]) 806 return false; 807 return true; 808 } 809 810 /** 811 * 812 */ 813 ubyte peekAt()(size_t offset) const 814 { 815 return bytes[index + offset]; 816 } 817 818 /** 819 * Returns: true if it is possible to peek $(D_PARAM p) bytes ahead. 820 */ 821 bool canPeek()(size_t p) const 822 { 823 return index + p < bytes.length; 824 } 825 826 /** 827 * Implements the range primitive _popFront. 828 */ 829 void popFront()() 830 { 831 index++; 832 column++; 833 } 834 835 /** 836 * Implements the algorithm _popFrontN more efficiently. This function does 837 * not detect or handle newlines. 838 */ 839 void popFrontN()(size_t n) 840 { 841 index += n; 842 column += n; 843 } 844 845 /** 846 * Increments the range's line number and resets the column counter. 847 */ 848 void incrementLine()(size_t i = 1) 849 { 850 column = 1; 851 line += i; 852 } 853 854 /** 855 * The input _bytes. 856 */ 857 const(ubyte)[] bytes; 858 859 /** 860 * The range's current position. 861 */ 862 size_t index; 863 864 /** 865 * The current _column number. 866 */ 867 size_t column; 868 869 /** 870 * The current _line number. 871 */ 872 size_t line; 873 }