1 module dparse.lexer; 2 3 import std.typecons; 4 import std.typetuple; 5 import std.array; 6 import std.algorithm; 7 import std.range; 8 import std.experimental.lexer; 9 import core.cpuid : sse42; 10 version (D_InlineAsm_X86_64) 11 { 12 version (Windows) {} 13 else version = iasm64NotWindows; 14 } 15 16 /// Operators 17 private enum operators = [ 18 ",", ".", "..", "...", "/", "/=", "!", "!<", "!<=", "!<>", "!<>=", "!=", 19 "!>", "!>=", "$", "%", "%=", "&", "&&", "&=", "(", ")", "*", "*=", "+", "++", 20 "+=", "-", "--", "-=", ":", ";", "<", "<<", "<<=", "<=", "<>", "<>=", "=", 21 "==", "=>", ">", ">=", ">>", ">>=", ">>>", ">>>=", "?", "@", "[", "]", "^", 22 "^=", "^^", "^^=", "{", "|", "|=", "||", "}", "~", "~=" 23 ]; 24 25 /// Keywords 26 private enum keywords = [ 27 "abstract", "alias", "align", "asm", "assert", "auto", "body", "bool", 28 "break", "byte", "case", "cast", "catch", "cdouble", "cent", "cfloat", 29 "char", "class", "const", "continue", "creal", "dchar", "debug", "default", 30 "delegate", "delete", "deprecated", "do", "double", "else", "enum", 31 "export", "extern", "false", "final", "finally", "float", "for", "foreach", 32 "foreach_reverse", "function", "goto", "idouble", "if", "ifloat", 33 "immutable", "import", "in", "inout", "int", "interface", "invariant", 34 "ireal", "is", "lazy", "long", "macro", "mixin", "module", "new", "nothrow", 35 "null", "out", "override", "package", "pragma", "private", "protected", 36 "public", "pure", "real", "ref", "return", "scope", "shared", "short", 37 "static", "struct", "super", "switch", "synchronized", "template", "this", 38 "throw", "true", "try", "typedef", "typeid", "typeof", "ubyte", "ucent", 39 "uint", "ulong", "union", "unittest", "ushort", "version", "void", 40 "volatile", "wchar", "while", "with", "__DATE__", "__EOF__", "__FILE__", 41 "__FUNCTION__", "__gshared", "__LINE__", "__MODULE__", "__parameters", 42 "__PRETTY_FUNCTION__", "__TIME__", "__TIMESTAMP__", "__traits", "__vector", 43 "__VENDOR__", "__VERSION__" 44 ]; 45 46 /// Other tokens 47 private enum dynamicTokens = [ 48 "specialTokenSequence", "comment", "identifier", "scriptLine", 49 "whitespace", "doubleLiteral", "floatLiteral", "idoubleLiteral", 50 "ifloatLiteral", "intLiteral", "longLiteral", "realLiteral", 51 "irealLiteral", "uintLiteral", "ulongLiteral", "characterLiteral", 52 "dstringLiteral", "stringLiteral", "wstringLiteral" 53 ]; 54 55 private enum pseudoTokenHandlers = [ 56 "\"", "lexStringLiteral", 57 "`", "lexWysiwygString", 58 "//", "lexSlashSlashComment", 59 "/*", "lexSlashStarComment", 60 "/+", "lexSlashPlusComment", 61 ".", "lexDot", 62 "'", "lexCharacterLiteral", 63 "0", "lexNumber", 64 "1", "lexDecimal", 65 "2", "lexDecimal", 66 "3", "lexDecimal", 67 "4", "lexDecimal", 68 "5", "lexDecimal", 69 "6", "lexDecimal", 70 "7", "lexDecimal", 71 "8", "lexDecimal", 72 "9", "lexDecimal", 73 "q\"", "lexDelimitedString", 74 "q{", "lexTokenString", 75 "r\"", "lexWysiwygString", 76 "x\"", "lexHexString", 77 " ", "lexWhitespace", 78 "\t", "lexWhitespace", 79 "\r", "lexWhitespace", 80 "\n", "lexWhitespace", 81 "\v", "lexWhitespace", 82 "\f", "lexWhitespace", 83 "\u2028", "lexLongNewline", 84 "\u2029", "lexLongNewline", 85 "#!", "lexScriptLine", 86 "#line", "lexSpecialTokenSequence" 87 ]; 88 89 /// Token ID type for the D lexer. 90 public alias IdType = TokenIdType!(operators, dynamicTokens, keywords); 91 92 /** 93 * Function used for converting an IdType to a string.
94 * 95 * Examples: 96 * --- 97 * IdType c = tok!"case"; 98 * assert (str(c) == "case"); 99 * --- 100 */ 101 public alias str = tokenStringRepresentation!(IdType, operators, dynamicTokens, keywords); 102 103 /** 104 * Template used to refer to D token types. 105 * 106 * See the $(B operators), $(B keywords), and $(B dynamicTokens) enums for 107 * values that can be passed to this template. 108 * Example: 109 * --- 110 * import dparse.lexer; 111 * IdType t = tok!"floatLiteral"; 112 * --- 113 */ 114 public template tok(string token) 115 { 116 alias tok = TokenId!(IdType, operators, dynamicTokens, keywords, token); 117 } 118 119 private enum extraFields = q{ 120 string comment; 121 string trailingComment; 122 123 int opCmp(size_t i) const pure nothrow @safe { 124 if (index < i) return -1; 125 if (index > i) return 1; 126 return 0; 127 } 128 129 int opCmp(ref const typeof(this) other) const pure nothrow @safe { 130 return opCmp(other.index); 131 } 132 }; 133 134 /// The token type in the D lexer 135 public alias Token = std.experimental.lexer.TokenStructure!(IdType, extraFields); 136 137 /** 138 * Configure whitespace handling 139 */ 140 public enum WhitespaceBehavior : ubyte 141 { 142 include = 0b0000_0000, 143 skip = 0b0000_0001, 144 } 145 146 /** 147 * Configure string lexing behavior 148 */ 149 public enum StringBehavior : ubyte 150 { 151 /// Do not include quote characters, process escape sequences 152 compiler = 0b0000_0000, 153 /// Opening quotes, closing quotes, and string suffixes are included in the 154 /// string token 155 includeQuoteChars = 0b0000_0001, 156 /// String escape sequences are not replaced 157 notEscaped = 0b0000_0010, 158 /// Not modified at all. Useful for formatters or highlighters 159 source = includeQuoteChars | notEscaped 160 } 161 162 public enum CommentBehavior : bool 163 { 164 intern = true, 165 noIntern = false 166 } 167 /** 168 * Lexer configuration struct 169 */ 170 public struct LexerConfig 171 { 172 string fileName; 173 StringBehavior stringBehavior; 174 WhitespaceBehavior whitespaceBehavior; 175 CommentBehavior commentBehavior = CommentBehavior.intern; 176 } 177 178 /** 179 * Basic type token types. 180 */ 181 public alias BasicTypes = AliasSeq!(tok!"int", tok!"bool", tok!"byte", 182 tok!"cdouble", tok!"cent", tok!"cfloat", tok!"char", tok!"creal", 183 tok!"dchar", tok!"double", tok!"float", tok!"idouble", 184 tok!"ifloat", tok!"ireal", tok!"long", tok!"real", tok!"short", 185 tok!"ubyte", tok!"ucent", tok!"uint", tok!"ulong", tok!"ushort", 186 tok!"void", tok!"wchar"); 187 188 /** 189 * Returns: true if the given ID is for a basic type. 190 */ 191 public bool isBasicType(IdType type) nothrow pure @safe @nogc 192 { 193 switch (type) 194 { 195 foreach (T; BasicTypes) 196 { 197 case T: 198 return true; 199 } 200 default: 201 return false; 202 } 203 } 204 205 /** 206 * Number literal token types. 207 */ 208 public alias NumberLiterals = AliasSeq!(tok!"doubleLiteral", 209 tok!"floatLiteral", tok!"idoubleLiteral", tok!"ifloatLiteral", 210 tok!"intLiteral", tok!"longLiteral", tok!"realLiteral", 211 tok!"irealLiteral", tok!"uintLiteral", tok!"ulongLiteral"); 212 213 /** 214 * Returns: true if the given ID type is for a number literal. 215 */ 216 public bool isNumberLiteral(IdType type) nothrow pure @safe @nogc 217 { 218 switch (type) 219 { 220 foreach (T; NumberLiterals) 221 { 222 case T: 223 return true; 224 } 225 default: 226 return false; 227 } 228 } 229 230 /** 231 * Operator token types. 
232 */ 233 public alias Operators = AliasSeq!(tok!",", tok!".", tok!"..", tok!"...", 234 tok!"/", tok!"/=", tok!"!", tok!"!<", tok!"!<=", tok!"!<>", 235 tok!"!<>=", tok!"!=", tok!"!>", tok!"!>=", tok!"$", tok!"%", 236 tok!"%=", tok!"&", tok!"&&", tok!"&=", tok!"(", tok!")", 237 tok!"*", tok!"*=", tok!"+", tok!"++", tok!"+=", tok!"-", 238 tok!"--", tok!"-=", tok!":", tok!";", tok!"<", tok!"<<", 239 tok!"<<=", tok!"<=", tok!"<>", tok!"<>=", tok!"=", tok!"==", 240 tok!"=>", tok!">", tok!">=", tok!">>", tok!">>=", tok!">>>", 241 tok!">>>=", tok!"?", tok!"@", tok!"[", tok!"]", tok!"^", 242 tok!"^=", tok!"^^", tok!"^^=", tok!"{", tok!"|", tok!"|=", 243 tok!"||", tok!"}", tok!"~", tok!"~="); 244 245 /** 246 * Returns: true if the given ID type is for an operator. 247 */ 248 public bool isOperator(IdType type) nothrow pure @safe @nogc 249 { 250 switch (type) 251 { 252 foreach (T; Operators) 253 { 254 case T: 255 return true; 256 } 257 default: 258 return false; 259 } 260 } 261 262 /** 263 * Keyword token types. 264 */ 265 public alias Keywords = AliasSeq!(tok!"abstract", tok!"alias", tok!"align", 266 tok!"asm", tok!"assert", tok!"auto", tok!"body", tok!"break", 267 tok!"case", tok!"cast", tok!"catch", tok!"class", tok!"const", 268 tok!"continue", tok!"debug", tok!"default", tok!"delegate", 269 tok!"delete", tok!"deprecated", tok!"do", tok!"else", tok!"enum", 270 tok!"export", tok!"extern", tok!"false", tok!"final", tok!"finally", 271 tok!"for", tok!"foreach", tok!"foreach_reverse", tok!"function", 272 tok!"goto", tok!"if", tok!"immutable", tok!"import", tok!"in", 273 tok!"inout", tok!"interface", tok!"invariant", tok!"is", 274 tok!"lazy", tok!"macro", tok!"mixin", tok!"module", tok!"new", 275 tok!"nothrow", tok!"null", tok!"out", tok!"override", tok!"package", 276 tok!"pragma", tok!"private", tok!"protected", tok!"public", 277 tok!"pure", tok!"ref", tok!"return", tok!"scope", tok!"shared", 278 tok!"static", tok!"struct", tok!"super", tok!"switch", tok!"synchronized", 279 tok!"template", tok!"this", tok!"throw", tok!"true", tok!"try", 280 tok!"typedef", tok!"typeid", tok!"typeof", tok!"union", tok!"unittest", 281 tok!"version", tok!"volatile", tok!"while", tok!"with", tok!"__DATE__", 282 tok!"__EOF__", tok!"__FILE__", tok!"__FUNCTION__", tok!"__gshared", 283 tok!"__LINE__", tok!"__MODULE__", tok!"__parameters", tok!"__PRETTY_FUNCTION__", 284 tok!"__TIME__", tok!"__TIMESTAMP__", tok!"__traits", tok!"__vector", 285 tok!"__VENDOR__", tok!"__VERSION__"); 286 287 /** 288 * Returns: true if the given ID type is for a keyword. 289 */ 290 public bool isKeyword(IdType type) pure nothrow @safe @nogc 291 { 292 switch (type) 293 { 294 foreach (T; Keywords) 295 { 296 case T: 297 return true; 298 } 299 default: 300 return false; 301 } 302 } 303 304 /** 305 * String literal token types 306 */ 307 public alias StringLiterals = AliasSeq!(tok!"dstringLiteral", 308 tok!"stringLiteral", tok!"wstringLiteral"); 309 310 /** 311 * Returns: true if the given ID type is for a string literal. 312 */ 313 public bool isStringLiteral(IdType type) pure nothrow @safe @nogc 314 { 315 switch (type) 316 { 317 foreach (T; StringLiterals) 318 { 319 case T: 320 return true; 321 } 322 default: 323 return false; 324 } 325 } 326 327 /** 328 * Protection token types. 329 */ 330 public alias Protections = AliasSeq!(tok!"export", tok!"package", 331 tok!"private", tok!"public", tok!"protected"); 332 333 /** 334 * Returns: true if the given ID type is for a protection attribute. 
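 *
 * A minimal usage sketch (values chosen only for illustration):
 * ---
 * assert (isProtection(tok!"private"));
 * assert (!isProtection(tok!"class"));
 * ---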
335 */ 336 public bool isProtection(IdType type) pure nothrow @safe @nogc 337 { 338 switch (type) 339 { 340 foreach (T; Protections) 341 { 342 case T: 343 return true; 344 } 345 default: 346 return false; 347 } 348 } 349 350 public alias SpecialTokens = AliasSeq!(tok!"__DATE__", tok!"__TIME__", tok!"__TIMESTAMP__", 351 tok!"__VENDOR__", tok!"__VERSION__", tok!"__FILE__", tok!"__LINE__", 352 tok!"__MODULE__", tok!"__FUNCTION__", tok!"__PRETTY_FUNCTION__"); 353 354 public bool isSpecialToken(IdType type) pure nothrow @safe @nogc 355 { 356 switch (type) 357 { 358 foreach (T; SpecialTokens) 359 { 360 case T: 361 return true; 362 } 363 default: 364 return false; 365 } 366 } 367 368 public alias Literals = AliasSeq!(StringLiterals, NumberLiterals, tok!"characterLiteral", 369 SpecialTokens, tok!"true", tok!"false", tok!"null", tok!"$"); 370 371 public bool isLiteral(IdType type) pure nothrow @safe @nogc 372 { 373 switch (type) 374 { 375 foreach (T; Literals) 376 { 377 case T: 378 return true; 379 } 380 default: 381 return false; 382 } 383 } 384 385 /** 386 * Returns: an array of tokens lexed from the given source code. All whitespace 387 * tokens are skipped and comments are attached to the token nearest 388 * to them. 389 */ 390 const(Token)[] getTokensForParser(ubyte[] sourceCode, LexerConfig config, 391 StringCache* cache) 392 { 393 enum CommentType : ubyte 394 { 395 notDoc, 396 line, 397 block 398 } 399 400 static CommentType commentType(string comment) pure nothrow @safe 401 { 402 if (comment.length < 3) 403 return CommentType.notDoc; 404 if (comment[0 .. 3] == "///") 405 return CommentType.line; 406 if (comment[0 .. 3] == "/++" || comment[0 .. 3] == "/**") 407 return CommentType.block; 408 return CommentType.notDoc; 409 } 410 411 config.whitespaceBehavior = WhitespaceBehavior.skip; 412 config.commentBehavior = CommentBehavior.noIntern; 413 414 auto leadingCommentAppender = appender!(char[])(); 415 leadingCommentAppender.reserve(1024); 416 auto trailingCommentAppender = appender!(char[])(); 417 trailingCommentAppender.reserve(1024); 418 bool hadDdoc; 419 string empty = cache.intern(""); 420 auto output = appender!(typeof(return))(); 421 auto lexer = DLexer(sourceCode, config, cache); 422 size_t tokenCount; 423 loop: while (!lexer.empty) switch (lexer.front.type) 424 { 425 case tok!"specialTokenSequence": 426 case tok!"whitespace": 427 lexer.popFront(); 428 break; 429 case tok!"comment": 430 final switch (commentType(lexer.front.text)) 431 { 432 case CommentType.block: 433 case CommentType.line: 434 if (tokenCount > 0 && lexer.front.line == output.data[tokenCount - 1].line) 435 { 436 if (!trailingCommentAppender.data.empty) 437 trailingCommentAppender.put('\n'); 438 unDecorateComment(lexer.front.text, trailingCommentAppender); 439 hadDdoc = true; 440 } 441 else 442 { 443 if (!leadingCommentAppender.data.empty) 444 leadingCommentAppender.put('\n'); 445 unDecorateComment(lexer.front.text, leadingCommentAppender); 446 hadDdoc = true; 447 } 448 lexer.popFront(); 449 break; 450 case CommentType.notDoc: 451 lexer.popFront(); 452 break; 453 } 454 break; 455 case tok!"__EOF__": 456 if (!trailingCommentAppender.data.empty) 457 (cast() output.data[$ - 1].trailingComment) = cache.intern(cast(string) trailingCommentAppender.data); 458 break loop; 459 default: 460 Token t = lexer.front; 461 lexer.popFront(); 462 tokenCount++; 463 if (!trailingCommentAppender.data.empty) 464 (cast() output.data[$ - 1].trailingComment) = cache.intern(cast(string) trailingCommentAppender.data); 465 t.comment
= leadingCommentAppender.data.length > 0 466 ? cache.intern(cast(string) leadingCommentAppender.data) : (hadDdoc ? empty : null); 467 468 leadingCommentAppender.clear(); 469 trailingCommentAppender.clear(); 470 hadDdoc = false; 471 output.put(t); 472 break; 473 } 474 return output.data; 475 } 476 477 /** 478 * The D lexer struct. 479 */ 480 public struct DLexer 481 { 482 mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens, 483 keywords, pseudoTokenHandlers); 484 485 /// 486 @disable this(); 487 488 /** 489 * Params: 490 * range = the bytes that compose the source code that will be lexed. 491 * config = the lexer configuration to use. 492 * cache = the string interning cache for de-duplicating identifiers and 493 * other token text. 494 */ 495 this(ubyte[] range, const LexerConfig config, StringCache* cache, 496 bool haveSSE42 = sse42()) pure nothrow @safe 497 { 498 this.haveSSE42 = haveSSE42; 499 auto r = (range.length >= 3 && range[0] == 0xef && range[1] == 0xbb && range[2] == 0xbf) 500 ? range[3 .. $] : range; 501 this.range = LexerRange(r); 502 this.config = config; 503 this.cache = cache; 504 popFront(); 505 } 506 507 /// 508 public void popFront()() pure nothrow @safe 509 { 510 do 511 _popFront(); 512 while (config.whitespaceBehavior == WhitespaceBehavior.skip 513 && _front.type == tok!"whitespace"); 514 } 515 516 private pure nothrow @safe: 517 518 bool isWhitespace() 519 { 520 switch (range.bytes[range.index]) 521 { 522 case ' ': 523 case '\r': 524 case '\n': 525 case '\t': 526 case '\v': 527 case '\f': 528 return true; 529 case 0xe2: 530 auto peek = range.peek(2); 531 return peek.length == 2 532 && peek[0] == 0x80 533 && (peek[1] == 0xa8 || peek[1] == 0xa9); 534 default: 535 return false; 536 } 537 } 538 539 void popFrontWhitespaceAware() 540 { 541 switch (range.bytes[range.index]) 542 { 543 case '\r': 544 range.popFront(); 545 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n') 546 { 547 range.popFront(); 548 range.incrementLine(); 549 } 550 else 551 range.incrementLine(); 552 return; 553 case '\n': 554 range.popFront(); 555 range.incrementLine(); 556 return; 557 case 0xe2: 558 auto lookahead = range.peek(3); 559 if (lookahead.length == 3 && lookahead[1] == 0x80 560 && (lookahead[2] == 0xa8 || lookahead[2] == 0xa9)) 561 { 562 range.index+=3; 563 range.column+=3; 564 range.incrementLine(); 565 return; 566 } 567 else 568 { 569 range.popFront(); 570 return; 571 } 572 default: 573 range.popFront(); 574 return; 575 } 576 } 577 578 void lexWhitespace(ref Token token) @trusted 579 { 580 mixin (tokenStart); 581 loop: do 582 { 583 version (iasm64NotWindows) 584 { 585 if (haveSSE42 && range.index + 16 < range.bytes.length) 586 { 587 skip!(true, '\t', ' ', '\v', '\f')(range.bytes.ptr + range.index, 588 &range.index, &range.column); 589 } 590 } 591 switch (range.bytes[range.index]) 592 { 593 case '\r': 594 range.popFront(); 595 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n') 596 { 597 range.popFront(); 598 } 599 range.column = 1; 600 range.line += 1; 601 break; 602 case '\n': 603 range.popFront(); 604 range.column = 1; 605 range.line += 1; 606 break; 607 case ' ': 608 case '\t': 609 case '\v': 610 case '\f': 611 range.popFront(); 612 break; 613 case 0xe2: 614 if (range.index + 2 >= range.bytes.length) 615 break loop; 616 if (range.bytes[range.index + 1] != 0x80) 617 break loop; 618 if (range.bytes[range.index + 2] == 0xa8 || range.bytes[range.index + 2] == 0xa9) 619 { 620 range.index += 3; 621 range.column += 3; 622 
range.column = 1; 623 range.line += 1; 624 break; 625 } 626 break loop; 627 default: 628 break loop; 629 } 630 } while (!(range.index >= range.bytes.length)); 631 string text = config.whitespaceBehavior == WhitespaceBehavior.include 632 ? cache.intern(range.slice(mark)) : ""; 633 token = Token(tok!"whitespace", text, line, column, index); 634 } 635 636 void lexNumber(ref Token token) 637 { 638 mixin (tokenStart); 639 if (range.bytes[range.index] == '0' && range.index + 1 < range.bytes.length) 640 { 641 immutable ahead = range.bytes[range.index + 1]; 642 switch (ahead) 643 { 644 case 'x': 645 case 'X': 646 range.index += 2; 647 range.column += 2; 648 lexHex(token, mark, line, column, index); 649 return; 650 case 'b': 651 case 'B': 652 range.index += 2; 653 range.column += 2; 654 lexBinary(token, mark, line, column, index); 655 return; 656 default: 657 lexDecimal(token, mark, line, column, index); 658 return; 659 } 660 } 661 else 662 lexDecimal(token, mark, line, column, index); 663 } 664 665 void lexHex(ref Token token) 666 { 667 mixin (tokenStart); 668 lexHex(token, mark, line, column, index); 669 } 670 671 void lexHex(ref Token token, size_t mark, size_t line, size_t column, 672 size_t index) @trusted 673 { 674 IdType type = tok!"intLiteral"; 675 bool foundDot; 676 hexLoop: while (!(range.index >= range.bytes.length)) 677 { 678 switch (range.bytes[range.index]) 679 { 680 case 'a': .. case 'f': 681 case 'A': .. case 'F': 682 case '0': .. case '9': 683 case '_': 684 version (iasm64NotWindows) 685 { 686 if (haveSSE42 && range.index + 16 < range.bytes.length) 687 { 688 immutable ulong i = rangeMatch!(false, '0', '9', 'a', 'f', 'A', 'F', '_', '_') 689 (range.bytes.ptr + range.index); 690 range.column += i; 691 range.index += i; 692 } 693 else 694 range.popFront(); 695 } 696 else 697 range.popFront(); 698 break; 699 case 'u': 700 case 'U': 701 lexIntSuffix(type); 702 break hexLoop; 703 case 'i': 704 if (foundDot) 705 lexFloatSuffix(type); 706 break hexLoop; 707 case 'L': 708 if (foundDot) 709 lexFloatSuffix(type); 710 else 711 lexIntSuffix(type); 712 break hexLoop; 713 case 'p': 714 case 'P': 715 lexExponent(type); 716 break hexLoop; 717 case '.': 718 if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.') 719 break hexLoop; 720 else 721 { 722 // The following bit of silliness tries to tell the 723 // difference between "int dot identifier" and 724 // "double identifier". 725 if (range.index + 1 < range.bytes.length) 726 { 727 switch (range.peekAt(1)) 728 { 729 case '0': .. case '9': 730 case 'A': .. case 'F': 731 case 'a': .. 
case 'f': 732 goto doubleLiteral; 733 default: 734 break hexLoop; 735 } 736 } 737 else 738 { 739 doubleLiteral: 740 range.popFront(); 741 foundDot = true; 742 type = tok!"doubleLiteral"; 743 } 744 } 745 break; 746 default: 747 break hexLoop; 748 } 749 } 750 token = Token(type, cache.intern(range.slice(mark)), line, column, 751 index); 752 } 753 754 void lexBinary(ref Token token) 755 { 756 mixin (tokenStart); 757 return lexBinary(token, mark, line, column, index); 758 } 759 760 void lexBinary(ref Token token, size_t mark, size_t line, size_t column, 761 size_t index) @trusted 762 { 763 IdType type = tok!"intLiteral"; 764 binaryLoop: while (!(range.index >= range.bytes.length)) 765 { 766 switch (range.bytes[range.index]) 767 { 768 case '0': 769 case '1': 770 case '_': 771 version (iasm64NotWindows) 772 { 773 if (haveSSE42 && range.index + 16 < range.bytes.length) 774 { 775 immutable ulong i = rangeMatch!(false, '0', '1', '_', '_')( 776 range.bytes.ptr + range.index); 777 range.column += i; 778 range.index += i; 779 } 780 else 781 range.popFront(); 782 } 783 else 784 range.popFront(); 785 break; 786 case 'u': 787 case 'U': 788 case 'L': 789 lexIntSuffix(type); 790 break binaryLoop; 791 default: 792 break binaryLoop; 793 } 794 } 795 token = Token(type, cache.intern(range.slice(mark)), line, column, 796 index); 797 } 798 799 void lexDecimal(ref Token token) 800 { 801 mixin (tokenStart); 802 lexDecimal(token, mark, line, column, index); 803 } 804 805 void lexDecimal(ref Token token, size_t mark, size_t line, size_t column, 806 size_t index) @trusted 807 { 808 bool foundDot = range.bytes[range.index] == '.'; 809 IdType type = tok!"intLiteral"; 810 if (foundDot) 811 { 812 range.popFront(); 813 type = tok!"doubleLiteral"; 814 } 815 816 decimalLoop: while (!(range.index >= range.bytes.length)) 817 { 818 switch (range.bytes[range.index]) 819 { 820 case '0': .. case '9': 821 case '_': 822 version (iasm64NotWindows) 823 { 824 if (haveSSE42 && range.index + 16 < range.bytes.length) 825 { 826 immutable ulong i = rangeMatch!(false, '0', '9', '_', '_')(range.bytes.ptr + range.index); 827 range.column += i; 828 range.index += i; 829 } 830 else 831 range.popFront(); 832 } 833 else 834 range.popFront(); 835 break; 836 case 'u': 837 case 'U': 838 if (!foundDot) 839 lexIntSuffix(type); 840 break decimalLoop; 841 case 'i': 842 lexFloatSuffix(type); 843 break decimalLoop; 844 case 'L': 845 if (foundDot) 846 lexFloatSuffix(type); 847 else 848 lexIntSuffix(type); 849 break decimalLoop; 850 case 'f': 851 case 'F': 852 lexFloatSuffix(type); 853 break decimalLoop; 854 case 'e': 855 case 'E': 856 lexExponent(type); 857 break decimalLoop; 858 case '.': 859 if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.') 860 break decimalLoop; 861 else 862 { 863 // The following bit of silliness tries to tell the 864 // difference between "int dot identifier" and 865 // "double identifier". 
866 if (range.index + 1 < range.bytes.length) 867 { 868 immutable ch = range.peekAt(1); 869 if (ch <= 0x2f 870 || (ch >= '0' && ch <= '9') 871 || (ch >= ':' && ch <= '@') 872 || (ch >= '[' && ch <= '^') 873 || (ch >= '{' && ch <= '~') 874 || ch == '`' || ch == '_') 875 { 876 goto doubleLiteral; 877 } 878 else 879 break decimalLoop; 880 } 881 else 882 { 883 doubleLiteral: 884 range.popFront(); 885 foundDot = true; 886 type = tok!"doubleLiteral"; 887 } 888 } 889 break; 890 default: 891 break decimalLoop; 892 } 893 } 894 token = Token(type, cache.intern(range.slice(mark)), line, column, 895 index); 896 } 897 898 void lexIntSuffix(ref IdType type) pure nothrow @safe 899 { 900 bool secondPass; 901 if (range.bytes[range.index] == 'u' || range.bytes[range.index] == 'U') 902 { 903 U: 904 if (type == tok!"intLiteral") 905 type = tok!"uintLiteral"; 906 else 907 type = tok!"ulongLiteral"; 908 range.popFront(); 909 if (secondPass) 910 return; 911 if (range.index < range.bytes.length 912 && (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l')) 913 goto L; 914 goto I; 915 } 916 if (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l') 917 { 918 L: 919 if (type == tok!"uintLiteral") 920 type = tok!"ulongLiteral"; 921 else 922 type = tok!"longLiteral"; 923 range.popFront(); 924 if (range.index < range.bytes.length 925 && (range.bytes[range.index] == 'U' || range.bytes[range.index] == 'u')) 926 { 927 secondPass = true; 928 goto U; 929 } 930 goto I; 931 } 932 I: 933 if (range.index < range.bytes.length && range.bytes[range.index] == 'i') 934 { 935 warning("Complex number literals are deprecated"); 936 range.popFront(); 937 if (type == tok!"longLiteral" || type == tok!"ulongLiteral") 938 type = tok!"idoubleLiteral"; 939 else 940 type = tok!"ifloatLiteral"; 941 } 942 } 943 944 void lexFloatSuffix(ref IdType type) pure nothrow @safe 945 { 946 switch (range.bytes[range.index]) 947 { 948 case 'L': 949 range.popFront(); 950 type = tok!"doubleLiteral"; 951 break; 952 case 'f': 953 case 'F': 954 range.popFront(); 955 type = tok!"floatLiteral"; 956 break; 957 default: 958 break; 959 } 960 if (range.index < range.bytes.length && range.bytes[range.index] == 'i') 961 { 962 warning("Complex number literals are deprecated"); 963 range.popFront(); 964 if (type == tok!"floatLiteral") 965 type = tok!"ifloatLiteral"; 966 else 967 type = tok!"idoubleLiteral"; 968 } 969 } 970 971 void lexExponent(ref IdType type) pure nothrow @safe 972 { 973 range.popFront(); 974 bool foundSign = false; 975 bool foundDigit = false; 976 while (range.index < range.bytes.length) 977 { 978 switch (range.bytes[range.index]) 979 { 980 case '-': 981 case '+': 982 if (foundSign) 983 { 984 if (!foundDigit) 985 error("Expected an exponent"); 986 return; 987 } 988 foundSign = true; 989 range.popFront(); 990 break; 991 case '0': .. 
case '9': 992 case '_': 993 foundDigit = true; 994 range.popFront(); 995 break; 996 case 'L': 997 case 'f': 998 case 'F': 999 case 'i': 1000 lexFloatSuffix(type); 1001 return; 1002 default: 1003 if (!foundDigit) 1004 error("Expected an exponent"); 1005 return; 1006 } 1007 } 1008 } 1009 1010 void lexScriptLine(ref Token token) 1011 { 1012 mixin (tokenStart); 1013 while (!(range.index >= range.bytes.length) && !isNewline) 1014 { 1015 range.popFront(); 1016 } 1017 token = Token(tok!"scriptLine", cache.intern(range.slice(mark)), 1018 line, column, index); 1019 } 1020 1021 void lexSpecialTokenSequence(ref Token token) 1022 { 1023 mixin (tokenStart); 1024 while (!(range.index >= range.bytes.length) && !isNewline) 1025 { 1026 range.popFront(); 1027 } 1028 token = Token(tok!"specialTokenSequence", cache.intern(range.slice(mark)), 1029 line, column, index); 1030 } 1031 1032 void lexSlashStarComment(ref Token token) @trusted 1033 { 1034 mixin (tokenStart); 1035 IdType type = tok!"comment"; 1036 range.popFrontN(2); 1037 while (range.index < range.bytes.length) 1038 { 1039 version (iasm64NotWindows) 1040 { 1041 if (haveSSE42 && range.index + 16 < range.bytes.length) 1042 skip!(false, '\r', '\n', '/', '*', 0xe2)(range.bytes.ptr + range.index, 1043 &range.index, &range.column); 1044 } 1045 if (range.bytes[range.index] == '*') 1046 { 1047 range.popFront(); 1048 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/') 1049 { 1050 range.popFront(); 1051 break; 1052 } 1053 } 1054 else 1055 popFrontWhitespaceAware(); 1056 } 1057 if (config.commentBehavior == CommentBehavior.intern) 1058 token = Token(type, cache.intern(range.slice(mark)), line, column, index); 1059 else 1060 token = Token(type, cast(string) range.slice(mark), line, column, index); 1061 } 1062 1063 void lexSlashSlashComment(ref Token token) @trusted 1064 { 1065 mixin (tokenStart); 1066 IdType type = tok!"comment"; 1067 range.popFrontN(2); 1068 while (range.index < range.bytes.length) 1069 { 1070 version (iasm64NotWindows) 1071 { 1072 if (haveSSE42 && range.index + 16 < range.bytes.length) 1073 { 1074 skip!(false, '\r', '\n', 0xe2)(range.bytes.ptr + range.index, 1075 &range.index, &range.column); 1076 } 1077 } 1078 if (range.bytes[range.index] == '\r' || range.bytes[range.index] == '\n') 1079 break; 1080 range.popFront(); 1081 } 1082 if (config.commentBehavior == CommentBehavior.intern) 1083 token = Token(type, cache.intern(range.slice(mark)), line, column, index); 1084 else 1085 token = Token(type, cast(string) range.slice(mark), line, column, index); 1086 } 1087 1088 void lexSlashPlusComment(ref Token token) @trusted 1089 { 1090 mixin (tokenStart); 1091 IdType type = tok!"comment"; 1092 range.index += 2; 1093 range.column += 2; 1094 int depth = 1; 1095 while (depth > 0 && !(range.index >= range.bytes.length)) 1096 { 1097 version (iasm64NotWindows) 1098 { 1099 if (haveSSE42 && range.index + 16 < range.bytes.length) 1100 { 1101 skip!(false, '+', '/', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index, 1102 &range.index, &range.column); 1103 } 1104 } 1105 if (range.bytes[range.index] == '+') 1106 { 1107 range.popFront(); 1108 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/') 1109 { 1110 range.popFront(); 1111 depth--; 1112 } 1113 } 1114 else if (range.bytes[range.index] == '/') 1115 { 1116 range.popFront(); 1117 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '+') 1118 { 1119 range.popFront(); 1120 depth++; 1121 } 1122 } 1123 else 1124 popFrontWhitespaceAware(); 1125 } 
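        // Note: with CommentBehavior.noIntern the token text below is a slice of the
        // original source buffer rather than a copy owned by the string cache.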
1126 if (config.commentBehavior == CommentBehavior.intern) 1127 token = Token(type, cache.intern(range.slice(mark)), line, column, index); 1128 else 1129 token = Token(type, cast(string) range.slice(mark), line, column, index); 1130 } 1131 1132 void lexStringLiteral(ref Token token) @trusted 1133 { 1134 mixin (tokenStart); 1135 range.popFront(); 1136 while (true) 1137 { 1138 if (range.index >= range.bytes.length) 1139 { 1140 error("Error: unterminated string literal"); 1141 token = Token(tok!""); 1142 return; 1143 } 1144 version (iasm64NotWindows) 1145 { 1146 if (haveSSE42 && range.index + 16 < range.bytes.length) 1147 { 1148 skip!(false, '"', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index, 1149 &range.index, &range.column); 1150 } 1151 } 1152 if (range.bytes[range.index] == '"') 1153 { 1154 range.popFront(); 1155 break; 1156 } 1157 else if (range.bytes[range.index] == '\\') 1158 { 1159 lexEscapeSequence(); 1160 } 1161 else 1162 popFrontWhitespaceAware(); 1163 } 1164 IdType type = tok!"stringLiteral"; 1165 lexStringSuffix(type); 1166 token = Token(type, cache.intern(range.slice(mark)), line, column, 1167 index); 1168 } 1169 1170 void lexWysiwygString(ref Token token) @trusted 1171 { 1172 mixin (tokenStart); 1173 IdType type = tok!"stringLiteral"; 1174 immutable bool backtick = range.bytes[range.index] == '`'; 1175 if (backtick) 1176 { 1177 range.popFront(); 1178 while (true) 1179 { 1180 if (range.index >= range.bytes.length) 1181 { 1182 error("Error: unterminated string literal"); 1183 token = Token(tok!""); 1184 return; 1185 } 1186 version (iasm64NotWindows) 1187 { 1188 if (haveSSE42 && range.index + 16 < range.bytes.length) 1189 { 1190 skip!(false, '\r', '\n', 0xe2, '`')(range.bytes.ptr + range.index, 1191 &range.index, &range.column); 1192 } 1193 } 1194 if (range.bytes[range.index] == '`') 1195 { 1196 range.popFront(); 1197 break; 1198 } 1199 else 1200 popFrontWhitespaceAware(); 1201 } 1202 } 1203 else 1204 { 1205 range.popFront(); 1206 if (range.index >= range.bytes.length) 1207 { 1208 error("Error: unterminated string literal"); 1209 token = Token(tok!""); 1210 return; 1211 } 1212 range.popFront(); 1213 while (true) 1214 { 1215 if (range.index >= range.bytes.length) 1216 { 1217 error("Error: unterminated string literal"); 1218 token = Token(tok!""); 1219 return; 1220 } 1221 else if (range.bytes[range.index] == '"') 1222 { 1223 range.popFront(); 1224 break; 1225 } 1226 else 1227 popFrontWhitespaceAware(); 1228 } 1229 } 1230 lexStringSuffix(type); 1231 token = Token(type, cache.intern(range.slice(mark)), line, column, 1232 index); 1233 } 1234 1235 private ubyte lexStringSuffix(ref IdType type) pure nothrow @safe 1236 { 1237 if (range.index >= range.bytes.length) 1238 { 1239 type = tok!"stringLiteral"; 1240 return 0; 1241 } 1242 else 1243 { 1244 switch (range.bytes[range.index]) 1245 { 1246 case 'w': range.popFront(); type = tok!"wstringLiteral"; return 'w'; 1247 case 'd': range.popFront(); type = tok!"dstringLiteral"; return 'd'; 1248 case 'c': range.popFront(); type = tok!"stringLiteral"; return 'c'; 1249 default: type = tok!"stringLiteral"; return 0; 1250 } 1251 } 1252 } 1253 1254 void lexDelimitedString(ref Token token) 1255 { 1256 mixin (tokenStart); 1257 range.index += 2; 1258 range.column += 2; 1259 ubyte open; 1260 ubyte close; 1261 switch (range.bytes[range.index]) 1262 { 1263 case '<': 1264 open = '<'; 1265 close = '>'; 1266 range.popFront(); 1267 lexNormalDelimitedString(token, mark, line, column, index, open, close); 1268 break; 1269 case '{': 1270 open = '{'; 1271 
close = '}'; 1272 range.popFront(); 1273 lexNormalDelimitedString(token, mark, line, column, index, open, close); 1274 break; 1275 case '[': 1276 open = '['; 1277 close = ']'; 1278 range.popFront(); 1279 lexNormalDelimitedString(token, mark, line, column, index, open, close); 1280 break; 1281 case '(': 1282 open = '('; 1283 close = ')'; 1284 range.popFront(); 1285 lexNormalDelimitedString(token, mark, line, column, index, open, close); 1286 break; 1287 default: 1288 lexHeredocString(token, mark, line, column, index); 1289 break; 1290 } 1291 } 1292 1293 void lexNormalDelimitedString(ref Token token, size_t mark, size_t line, size_t column, 1294 size_t index, ubyte open, ubyte close) 1295 { 1296 int depth = 1; 1297 while (!(range.index >= range.bytes.length) && depth > 0) 1298 { 1299 if (range.bytes[range.index] == open) 1300 { 1301 depth++; 1302 range.popFront(); 1303 } 1304 else if (range.bytes[range.index] == close) 1305 { 1306 depth--; 1307 range.popFront(); 1308 if (depth <= 0) 1309 { 1310 if (range.bytes[range.index] == '"') 1311 { 1312 range.popFront(); 1313 } 1314 else 1315 { 1316 error("Error: \" expected to end delimited string literal"); 1317 token = Token(tok!""); 1318 return; 1319 } 1320 } 1321 } 1322 else 1323 popFrontWhitespaceAware(); 1324 } 1325 IdType type = tok!"stringLiteral"; 1326 lexStringSuffix(type); 1327 token = Token(type, cache.intern(range.slice(mark)), line, column, index); 1328 } 1329 1330 void lexHeredocString(ref Token token, size_t mark, size_t line, size_t column, size_t index) 1331 { 1332 Token ident; 1333 lexIdentifier(ident); 1334 if (isNewline()) 1335 popFrontWhitespaceAware(); 1336 else 1337 error("Newline expected"); 1338 while (!(range.index >= range.bytes.length)) 1339 { 1340 if (isNewline()) 1341 { 1342 popFrontWhitespaceAware(); 1343 if (!range.canPeek(ident.text.length)) 1344 { 1345 error(ident.text ~ " expected"); 1346 break; 1347 } 1348 if (range.peek(ident.text.length - 1) == ident.text) 1349 { 1350 range.popFrontN(ident.text.length); 1351 break; 1352 } 1353 } 1354 else 1355 { 1356 range.popFront(); 1357 } 1358 } 1359 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '"') 1360 { 1361 range.popFront(); 1362 } 1363 else 1364 error(`" expected`); 1365 IdType type = tok!"stringLiteral"; 1366 lexStringSuffix(type); 1367 token = Token(type, cache.intern(range.slice(mark)), line, column, index); 1368 } 1369 1370 void lexTokenString(ref Token token) 1371 { 1372 mixin (tokenStart); 1373 assert (range.bytes[range.index] == 'q'); 1374 range.popFront(); 1375 assert (range.bytes[range.index] == '{'); 1376 range.popFront(); 1377 auto app = appender!string(); 1378 app.put("q{"); 1379 int depth = 1; 1380 1381 immutable WhitespaceBehavior oldWhitespace = config.whitespaceBehavior; 1382 immutable StringBehavior oldString = config.stringBehavior; 1383 config.whitespaceBehavior = WhitespaceBehavior.include; 1384 config.stringBehavior = StringBehavior.source; 1385 scope (exit) 1386 { 1387 config.whitespaceBehavior = oldWhitespace; 1388 config.stringBehavior = oldString; 1389 } 1390 1391 advance(_front); 1392 while (depth > 0 && !empty) 1393 { 1394 auto t = front(); 1395 if (t.text is null) 1396 app.put(str(t.type)); 1397 else 1398 app.put(t.text); 1399 if (t.type == tok!"}") 1400 { 1401 depth--; 1402 if (depth > 0) 1403 popFront(); 1404 } 1405 else if (t.type == tok!"{") 1406 { 1407 depth++; 1408 popFront(); 1409 } 1410 else 1411 popFront(); 1412 } 1413 IdType type = tok!"stringLiteral"; 1414 auto b = lexStringSuffix(type); 1415 if (b != 0) 
1416 app.put(b); 1417 token = Token(type, cache.intern(cast(const(ubyte)[]) app.data), line, 1418 column, index); 1419 } 1420 1421 void lexHexString(ref Token token) 1422 { 1423 mixin (tokenStart); 1424 range.index += 2; 1425 range.column += 2; 1426 1427 loop: while (true) 1428 { 1429 if (range.index >= range.bytes.length) 1430 { 1431 error("Error: unterminated hex string literal"); 1432 token = Token(tok!""); 1433 return; 1434 } 1435 else if (isWhitespace()) 1436 popFrontWhitespaceAware(); 1437 else switch (range.bytes[range.index]) 1438 { 1439 case '0': .. case '9': 1440 case 'A': .. case 'F': 1441 case 'a': .. case 'f': 1442 range.popFront(); 1443 break; 1444 case '"': 1445 range.popFront(); 1446 break loop; 1447 default: 1448 error("Error: invalid character in hex string"); 1449 token = Token(tok!""); 1450 return; 1451 } 1452 } 1453 1454 IdType type = tok!"stringLiteral"; 1455 lexStringSuffix(type); 1456 token = Token(type, cache.intern(range.slice(mark)), line, column, 1457 index); 1458 } 1459 1460 bool lexEscapeSequence() 1461 { 1462 range.popFront(); 1463 if (range.index >= range.bytes.length) 1464 { 1465 error("Error: non-terminated character escape sequence."); 1466 return false; 1467 } 1468 switch (range.bytes[range.index]) 1469 { 1470 case '\'': 1471 case '"': 1472 case '?': 1473 case '\\': 1474 case 'a': 1475 case 'b': 1476 case 'f': 1477 case 'n': 1478 case 'r': 1479 case 't': 1480 case 'v': 1481 range.popFront(); 1482 break; 1483 case 'x': 1484 range.popFront(); 1485 foreach (i; 0 .. 2) 1486 { 1487 if (range.index >= range.bytes.length) 1488 { 1489 error("Error: 2 hex digits expected."); 1490 return false; 1491 } 1492 switch (range.bytes[range.index]) 1493 { 1494 case '0': .. case '9': 1495 case 'a': .. case 'f': 1496 case 'A': .. case 'F': 1497 range.popFront(); 1498 break; 1499 default: 1500 error("Error: 2 hex digits expected."); 1501 return false; 1502 } 1503 } 1504 break; 1505 case '0': 1506 if (!(range.index + 1 < range.bytes.length) 1507 || ((range.index + 1 < range.bytes.length) && range.peekAt(1) == '\'')) 1508 { 1509 range.popFront(); 1510 break; 1511 } 1512 goto case; 1513 case '1': .. case '7': 1514 for (size_t i = 0; i < 3 && !(range.index >= range.bytes.length) 1515 && range.bytes[range.index] >= '0' && range.bytes[range.index] <= '7'; i++) 1516 range.popFront(); 1517 break; 1518 case 'u': 1519 range.popFront(); 1520 foreach (i; 0 .. 4) 1521 { 1522 if (range.index >= range.bytes.length) 1523 { 1524 error("Error: at least 4 hex digits expected."); 1525 return false; 1526 } 1527 switch (range.bytes[range.index]) 1528 { 1529 case '0': .. case '9': 1530 case 'a': .. case 'f': 1531 case 'A': .. case 'F': 1532 range.popFront(); 1533 break; 1534 default: 1535 error("Error: at least 4 hex digits expected."); 1536 return false; 1537 } 1538 } 1539 break; 1540 case 'U': 1541 range.popFront(); 1542 foreach (i; 0 .. 8) 1543 { 1544 if (range.index >= range.bytes.length) 1545 { 1546 error("Error: at least 8 hex digits expected."); 1547 return false; 1548 } 1549 switch (range.bytes[range.index]) 1550 { 1551 case '0': .. case '9': 1552 case 'a': .. case 'f': 1553 case 'A': .. 
case 'F': 1554 range.popFront(); 1555 break; 1556 default: 1557 error("Error: at least 8 hex digits expected."); 1558 return false; 1559 } 1560 } 1561 break; 1562 default: 1563 while (true) 1564 { 1565 if (range.index >= range.bytes.length) 1566 { 1567 error("Error: non-terminated character escape sequence."); 1568 return false; 1569 } 1570 if (range.bytes[range.index] == ';') 1571 { 1572 range.popFront(); 1573 break; 1574 } 1575 else 1576 { 1577 range.popFront(); 1578 } 1579 } 1580 } 1581 return true; 1582 } 1583 1584 void lexCharacterLiteral(ref Token token) 1585 { 1586 mixin (tokenStart); 1587 range.popFront(); 1588 if (range.bytes[range.index] == '\\') 1589 lexEscapeSequence(); 1590 else if (range.bytes[range.index] == '\'') 1591 { 1592 range.popFront(); 1593 token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)), 1594 line, column, index); 1595 } 1596 else if (range.bytes[range.index] & 0x80) 1597 { 1598 while (range.bytes[range.index] & 0x80) 1599 range.popFront(); 1600 } 1601 else 1602 popFrontWhitespaceAware(); 1603 1604 if (range.index < range.bytes.length && range.bytes[range.index] == '\'') 1605 { 1606 range.popFront(); 1607 token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)), 1608 line, column, index); 1609 } 1610 else 1611 { 1612 error("Error: Expected ' to end character literal"); 1613 token = Token(tok!""); 1614 } 1615 } 1616 1617 void lexIdentifier(ref Token token) @trusted 1618 { 1619 mixin (tokenStart); 1620 if (isSeparating(0)) 1621 { 1622 error("Invalid identifier"); 1623 range.popFront(); 1624 } 1625 while (true) 1626 { 1627 version (iasm64NotWindows) 1628 { 1629 if (haveSSE42 && range.index + 16 < range.bytes.length) 1630 { 1631 immutable ulong i = rangeMatch!(false, 'a', 'z', 'A', 'Z', '_', '_') 1632 (range.bytes.ptr + range.index); 1633 range.column += i; 1634 range.index += i; 1635 } 1636 } 1637 if (isSeparating(0)) 1638 break; 1639 else 1640 range.popFront(); 1641 } 1642 token = Token(tok!"identifier", cache.intern(range.slice(mark)), line, 1643 column, index); 1644 } 1645 1646 void lexDot(ref Token token) 1647 { 1648 mixin (tokenStart); 1649 if (!(range.index + 1 < range.bytes.length)) 1650 { 1651 range.popFront(); 1652 token = Token(tok!".", null, line, column, index); 1653 return; 1654 } 1655 switch (range.peekAt(1)) 1656 { 1657 case '0': .. case '9': 1658 lexNumber(token); 1659 return; 1660 case '.': 1661 range.popFront(); 1662 range.popFront(); 1663 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '.') 1664 { 1665 range.popFront(); 1666 token = Token(tok!"...", null, line, column, index); 1667 } 1668 else 1669 token = Token(tok!"..", null, line, column, index); 1670 return; 1671 default: 1672 range.popFront(); 1673 token = Token(tok!".", null, line, column, index); 1674 return; 1675 } 1676 } 1677 1678 void lexLongNewline(ref Token token) @nogc 1679 { 1680 mixin (tokenStart); 1681 range.popFront(); 1682 range.popFront(); 1683 range.popFront(); 1684 range.incrementLine(); 1685 string text = config.whitespaceBehavior == WhitespaceBehavior.include 1686 ? 
cache.intern(range.slice(mark)) : ""; 1687 token = Token(tok!"whitespace", text, line, 1688 column, index); 1689 } 1690 1691 bool isNewline() @nogc 1692 { 1693 if (range.bytes[range.index] == '\n') return true; 1694 if (range.bytes[range.index] == '\r') return true; 1695 return (range.bytes[range.index] & 0x80) && (range.index + 2 < range.bytes.length) 1696 && (range.peek(2) == "\u2028" || range.peek(2) == "\u2029"); 1697 } 1698 1699 bool isSeparating(size_t offset) @nogc 1700 { 1701 enum : ubyte 1702 { 1703 n, y, m // no, yes, maybe 1704 } 1705 1706 if (range.index + offset >= range.bytes.length) 1707 return true; 1708 auto c = range.bytes[range.index + offset]; 1709 static immutable ubyte[256] LOOKUP_TABLE = [ 1710 y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, 1711 y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, 1712 y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, 1713 n, n, n, n, n, n, n, n, n, n, y, y, y, y, y, y, 1714 y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, 1715 n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, n, 1716 y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, 1717 n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, y, 1718 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1719 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1720 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1721 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1722 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1723 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1724 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1725 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m 1726 ]; 1727 immutable ubyte result = LOOKUP_TABLE[c]; 1728 if (result == n) 1729 return false; 1730 if (result == y) 1731 return true; 1732 if (result == m) 1733 { 1734 auto r = range; 1735 range.popFrontN(offset); 1736 return (r.canPeek(2) && (r.peek(2) == "\u2028" 1737 || r.peek(2) == "\u2029")); 1738 } 1739 assert (false); 1740 } 1741 1742 1743 1744 enum tokenStart = q{ 1745 size_t index = range.index; 1746 size_t column = range.column; 1747 size_t line = range.line; 1748 auto mark = range.mark(); 1749 }; 1750 1751 void error(string message) 1752 { 1753 messages ~= Message(range.line, range.column, message, true); 1754 } 1755 1756 void warning(string message) 1757 { 1758 messages ~= Message(range.line, range.column, message, false); 1759 assert (messages.length > 0); 1760 } 1761 1762 static struct Message 1763 { 1764 size_t line; 1765 size_t column; 1766 string message; 1767 bool isError; 1768 } 1769 1770 Message[] messages; 1771 StringCache* cache; 1772 LexerConfig config; 1773 bool haveSSE42; 1774 } 1775 1776 /** 1777 * Creates a token range from the given source code. Creates a default lexer 1778 * configuration and a GC-managed string cache. 1779 */ 1780 public auto byToken(ubyte[] range) 1781 { 1782 LexerConfig config; 1783 StringCache* cache = new StringCache(StringCache.defaultBucketCount); 1784 return DLexer(range, config, cache); 1785 } 1786 1787 /** 1788 * Creates a token range from the given source code. Uses the given string 1789 * cache. 1790 */ 1791 public auto byToken(ubyte[] range, StringCache* cache) 1792 { 1793 LexerConfig config; 1794 return DLexer(range, config, cache); 1795 } 1796 1797 /** 1798 * Creates a token range from the given source code. Uses the provided lexer 1799 * configuration and string cache. 
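 *
 * A minimal usage sketch (the source text and configuration are purely illustrative):
 * ---
 * import std.stdio : writeln;
 * StringCache cache = StringCache(StringCache.defaultBucketCount);
 * LexerConfig config;
 * config.stringBehavior = StringBehavior.source;
 * foreach (t; byToken(cast(ubyte[]) "void main() {}", config, &cache))
 *     writeln(str(t.type), " ", t.text);
 * ---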
1800 */ 1801 public auto byToken(ubyte[] range, const LexerConfig config, StringCache* cache) 1802 { 1803 return DLexer(range, config, cache); 1804 } 1805 1806 /** 1807 * Removes "decoration" such as leading whitespace, leading + and * characters, 1808 * and places the result into the given output range 1809 */ 1810 public void unDecorateComment(T)(string comment, auto ref T outputRange) 1811 if (isOutputRange!(T, string)) 1812 in 1813 { 1814 assert (comment.length >= 3); 1815 } 1816 body 1817 { 1818 import std.string : lineSplitter, stripRight; 1819 1820 static void adjustBeginningAndEnd(string s, ref size_t a, ref size_t b) pure nothrow @nogc @safe 1821 { 1822 immutable char c = s[1]; 1823 while (a < b && s[a] == c) a++; 1824 while (b > a && s[b] == c) b--; 1825 b++; 1826 } 1827 1828 string leadingChars; 1829 size_t i = 3; 1830 size_t j; 1831 bool hasOutput = false; 1832 bool lastWasBlank = false; 1833 switch (comment[0 .. 3]) 1834 { 1835 case "///": 1836 j = comment.length; 1837 1838 foreach (line; lineSplitter(comment)) 1839 { 1840 auto l = line[3 .. $]; 1841 if (leadingChars.empty) 1842 { 1843 size_t k = 0; 1844 while (k < l.length && (l[k] == ' ' || l[k] == '\t')) k++; 1845 leadingChars = l[0 .. k]; 1846 } 1847 immutable string stripped = l.stripRight(); 1848 if (hasOutput) 1849 outputRange.put('\n'); 1850 else 1851 hasOutput = true; 1852 if (stripped.length >= leadingChars.length && stripped.startsWith(leadingChars)) 1853 outputRange.put(stripped[leadingChars.length .. $]); 1854 else 1855 outputRange.put(stripped); 1856 } 1857 break; 1858 case "/++": 1859 case "/**": 1860 j = comment.length - 2; 1861 // Skip beginning and ending stars and plusses 1862 adjustBeginningAndEnd(comment, i, j); 1863 foreach (line; lineSplitter(comment[i .. j])) 1864 { 1865 immutable string stripped = line.stripRight(); 1866 if (leadingChars.empty) 1867 { 1868 size_t k = 0; 1869 while (k < line.length && (line[k] == ' ' || line[k] == '\t')) k++; 1870 if (k < line.length && line[k] == comment[1]) 1871 { 1872 k++; 1873 while (k < line.length && (line[k] == ' ' || line[k] == '\t')) k++; 1874 } 1875 if (k == stripped.length) 1876 continue; 1877 leadingChars = line[0 .. k]; 1878 } 1879 1880 if (stripped.startsWith(leadingChars)) 1881 { 1882 if (stripped.length > leadingChars.length) 1883 { 1884 if (hasOutput) 1885 outputRange.put('\n'); 1886 hasOutput = true; 1887 if (lastWasBlank) 1888 outputRange.put('\n'); 1889 lastWasBlank = false; 1890 outputRange.put(stripped[leadingChars.length .. 
$]); 1891 } 1892 } 1893 else if (hasOutput && stripped.length == leadingChars.stripRight().length) 1894 lastWasBlank = true; 1895 else if (!stripped.empty && !leadingChars.startsWith(stripped)) 1896 { 1897 if (hasOutput) 1898 outputRange.put('\n'); 1899 hasOutput = true; 1900 if (lastWasBlank) 1901 outputRange.put('\n'); 1902 lastWasBlank = false; 1903 outputRange.put(stripped); 1904 } 1905 else 1906 lastWasBlank = false; 1907 } 1908 break; 1909 default: 1910 outputRange.put(comment); 1911 break; 1912 } 1913 } 1914 1915 /// 1916 unittest 1917 { 1918 import std.array:array, appender; 1919 import std.stdio:stderr; 1920 stderr.writeln("Running unittest for unDecorateComment..."); 1921 1922 1923 string[] inputs = [ 1924 "/***************\n*******************/", 1925 "/***************\n *\n ******************/", 1926 "/**\n*/", 1927 "/** */", 1928 "/***/", 1929 "/** abcde */", 1930 "/// abcde\n/// abcde", 1931 "/**\n * stuff\n */", 1932 "/**\n *\n * stuff\n */", 1933 "/**\n *\n * stuff\n *\n */", 1934 "/**\n *\n * stuff\n *\n*/", 1935 "/**\n * abcde\n * abcde \n */", 1936 "/**\n * abcde\n *\n * abcde\n */", 1937 ]; 1938 string[] outputs = [ 1939 "", 1940 "", 1941 "", 1942 "", 1943 "", 1944 "abcde", 1945 "abcde\nabcde", 1946 "stuff", 1947 "stuff", 1948 "stuff", 1949 "stuff", 1950 "abcde\n abcde", 1951 "abcde\n\nabcde" 1952 ]; 1953 assert(inputs.length == outputs.length); 1954 foreach (pair; zip(inputs, outputs)) 1955 { 1956 foreach (b; [true, false]) 1957 { 1958 auto app = appender!string(); 1959 unDecorateComment(b ? pair[0] : pair[0].replace("*", "+"), app); 1960 assert(pair[1] == app.data, "[[" ~ pair[0] ~ "]] => [[" ~ app.data ~ "]]"); 1961 } 1962 } 1963 stderr.writeln("Unittest for unDecorateComment passed."); 1964 } 1965 1966 1967 /** 1968 * The string cache is used for string interning. 1969 * 1970 * It will only store a single copy of any string that it is asked to hold. 1971 * Interned strings can be compared for equality by comparing their $(B .ptr) 1972 * field. 1973 * 1974 * Default and postblit constructors are disabled. When a StringCache goes out 1975 * of scope, the memory held by it is freed. 1976 * 1977 * See_also: $(LINK http://en.wikipedia.org/wiki/String_interning) 1978 */ 1979 struct StringCache 1980 { 1981 public pure nothrow @nogc: 1982 1983 @disable this(); 1984 @disable this(this); 1985 1986 /** 1987 * Params: bucketCount = the initial number of buckets. Must be a 1988 * power of two 1989 */ 1990 this(size_t bucketCount) nothrow @trusted @nogc 1991 in 1992 { 1993 import core.bitop : popcnt; 1994 static if (size_t.sizeof == 8) 1995 { 1996 immutable low = popcnt(cast(uint) bucketCount); 1997 immutable high = popcnt(cast(uint) (bucketCount >> 32)); 1998 assert ((low == 0 && high == 1) || (low == 1 && high == 0)); 1999 } 2000 else 2001 { 2002 static assert (size_t.sizeof == 4); 2003 assert (popcnt(cast(uint) bucketCount) == 1); 2004 } 2005 } 2006 body 2007 { 2008 buckets = (cast(Node**) calloc((Node*).sizeof, bucketCount))[0 .. 
bucketCount]; 2009 } 2010 2011 ~this() 2012 { 2013 Block* current = rootBlock; 2014 while (current !is null) 2015 { 2016 Block* prev = current; 2017 current = current.next; 2018 free(cast(void*) prev); 2019 } 2020 foreach (nodePointer; buckets) 2021 { 2022 Node* currentNode = nodePointer; 2023 while (currentNode !is null) 2024 { 2025 if (currentNode.mallocated) 2026 free(currentNode.str.ptr); 2027 Node* prev = currentNode; 2028 currentNode = currentNode.next; 2029 free(prev); 2030 } 2031 } 2032 rootBlock = null; 2033 free(buckets.ptr); 2034 buckets = null; 2035 } 2036 2037 /** 2038 * Caches a string. 2039 */ 2040 string intern(const(ubyte)[] str) @safe 2041 { 2042 if (str is null || str.length == 0) 2043 return ""; 2044 return _intern(str); 2045 } 2046 2047 /** 2048 * ditto 2049 */ 2050 string intern(string str) @trusted 2051 { 2052 return intern(cast(ubyte[]) str); 2053 } 2054 2055 /** 2056 * The default bucket count for the string cache. 2057 */ 2058 static enum defaultBucketCount = 4096; 2059 2060 private: 2061 2062 string _intern(const(ubyte)[] bytes) @trusted 2063 { 2064 immutable uint hash = hashBytes(bytes); 2065 immutable size_t index = hash & (buckets.length - 1); 2066 Node* s = find(bytes, hash); 2067 if (s !is null) 2068 return cast(string) s.str; 2069 ubyte[] mem = void; 2070 bool mallocated = bytes.length > BIG_STRING; 2071 if (mallocated) 2072 mem = (cast(ubyte*) malloc(bytes.length))[0 .. bytes.length]; 2073 else 2074 mem = allocate(bytes.length); 2075 mem[] = bytes[]; 2076 Node* node = cast(Node*) malloc(Node.sizeof); 2077 node.str = mem; 2078 node.hash = hash; 2079 node.next = buckets[index]; 2080 node.mallocated = mallocated; 2081 buckets[index] = node; 2082 return cast(string) mem; 2083 } 2084 2085 Node* find(const(ubyte)[] bytes, uint hash) @trusted 2086 { 2087 import std.algorithm : equal; 2088 immutable size_t index = hash & (buckets.length - 1); 2089 Node* node = buckets[index]; 2090 while (node !is null) 2091 { 2092 if (node.hash == hash && bytes == cast(ubyte[]) node.str) 2093 return node; 2094 node = node.next; 2095 } 2096 return node; 2097 } 2098 2099 static uint hashBytes(const(ubyte)[] data) pure nothrow @trusted @nogc 2100 in 2101 { 2102 assert (data !is null); 2103 assert (data.length > 0); 2104 } 2105 body 2106 { 2107 immutable uint m = 0x5bd1e995; 2108 immutable int r = 24; 2109 uint h = cast(uint) data.length; 2110 while (data.length >= 4) 2111 { 2112 uint k = (cast(ubyte) data[3]) << 24 2113 | (cast(ubyte) data[2]) << 16 2114 | (cast(ubyte) data[1]) << 8 2115 | (cast(ubyte) data[0]); 2116 k *= m; 2117 k ^= k >> r; 2118 k *= m; 2119 h *= m; 2120 h ^= k; 2121 data = data[4 .. $]; 2122 } 2123 switch (data.length & 3) 2124 { 2125 case 3: 2126 h ^= data[2] << 16; 2127 goto case; 2128 case 2: 2129 h ^= data[1] << 8; 2130 goto case; 2131 case 1: 2132 h ^= data[0]; 2133 h *= m; 2134 break; 2135 default: 2136 break; 2137 } 2138 h ^= h >> 13; 2139 h *= m; 2140 h ^= h >> 15; 2141 return h; 2142 } 2143 2144 ubyte[] allocate(size_t numBytes) pure nothrow @trusted @nogc 2145 in 2146 { 2147 assert (numBytes != 0); 2148 } 2149 out (result) 2150 { 2151 assert (result.length == numBytes); 2152 } 2153 body 2154 { 2155 Block* r = rootBlock; 2156 size_t i = 0; 2157 while (i <= 3 && r !is null) 2158 { 2159 immutable size_t available = r.bytes.length; 2160 immutable size_t oldUsed = r.used; 2161 immutable size_t newUsed = oldUsed + numBytes; 2162 if (newUsed <= available) 2163 { 2164 r.used = newUsed; 2165 return r.bytes[oldUsed .. 
newUsed]; 2166 } 2167 i++; 2168 r = r.next; 2169 } 2170 Block* b = cast(Block*) calloc(Block.sizeof, 1); 2171 b.used = numBytes; 2172 b.next = rootBlock; 2173 rootBlock = b; 2174 return b.bytes[0 .. numBytes]; 2175 } 2176 2177 static struct Node 2178 { 2179 ubyte[] str = void; 2180 Node* next = void; 2181 uint hash = void; 2182 bool mallocated = void; 2183 } 2184 2185 static struct Block 2186 { 2187 Block* next; 2188 size_t used; 2189 enum BLOCK_CAPACITY = BLOCK_SIZE - size_t.sizeof - (void*).sizeof; 2190 ubyte[BLOCK_CAPACITY] bytes; 2191 } 2192 2193 static assert (BLOCK_SIZE == Block.sizeof); 2194 2195 enum BLOCK_SIZE = 1024 * 16; 2196 2197 // If a string would take up more than 1/4 of a block, allocate it outside 2198 // of the block. 2199 enum BIG_STRING = BLOCK_SIZE / 4; 2200 2201 Node*[] buckets; 2202 Block* rootBlock; 2203 } 2204 2205 private extern(C) void* calloc(size_t, size_t) nothrow pure @nogc @trusted; 2206 private extern(C) void* malloc(size_t) nothrow pure @nogc @trusted; 2207 private extern(C) void free(void*) nothrow pure @nogc @trusted; 2208 2209 unittest 2210 { 2211 auto source = cast(ubyte[]) q{ import std.stdio;}}; 2212 auto tokens = getTokensForParser(source, LexerConfig(), 2213 new StringCache(StringCache.defaultBucketCount)); 2214 assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".", 2215 tok!"identifier", tok!";"])); 2216 } 2217 2218 /// Test \x char sequence 2219 unittest 2220 { 2221 auto toks = (string s) => byToken(cast(ubyte[])s); 2222 2223 // valid 2224 immutable hex = ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f','A','B','C','D','E','F']; 2225 auto source = ""; 2226 foreach (h1; hex) 2227 foreach (h2; hex) 2228 source ~= "'\\x" ~ h1 ~ h2 ~ "'"; 2229 assert (toks(source).filter!(t => t.type != tok!"characterLiteral").empty); 2230 2231 // invalid 2232 assert (toks(`'\x'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true)); 2233 assert (toks(`'\x_'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true)); 2234 assert (toks(`'\xA'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true)); 2235 assert (toks(`'\xAY'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true)); 2236 assert (toks(`'\xXX'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true)); 2237 } 2238 2239 version (iasm64NotWindows) 2240 { 2241 /** 2242 * Returns: 2243 */ 2244 ushort newlineMask(const ubyte*) pure nothrow @trusted @nogc 2245 { 2246 asm pure nothrow @nogc 2247 { 2248 naked; 2249 movdqu XMM1, [RDI]; 2250 mov RAX, 3; 2251 mov RDX, 16; 2252 mov R8, 0x0d0d0d0d0d0d0d0dL; 2253 movq XMM2, R8; 2254 shufpd XMM2, XMM2, 0; 2255 pcmpeqb XMM2, XMM1; 2256 mov R9, 0x0a0a0a0a0a0a0a0aL; 2257 movq XMM3, R9; 2258 shufpd XMM3, XMM3, 0; 2259 pcmpeqb XMM3, XMM1; 2260 mov R10, 0xe280a8L; 2261 movq XMM4, R10; 2262 pcmpestrm XMM4, XMM1, 0b01001100; 2263 movdqa XMM4, XMM0; 2264 mov R11, 0xe280a9L; 2265 movq XMM5, R11; 2266 pcmpestrm XMM5, XMM1, 0b01001100; 2267 movdqa XMM5, XMM0; 2268 mov RCX, 0x0a0d; 2269 dec RAX; 2270 movq XMM6, RCX; 2271 pcmpestrm XMM6, XMM1, 0b01001100; 2272 movdqa XMM6, XMM0; 2273 movdqa XMM7, XMM6; 2274 pslldq XMM7, 1; 2275 movdqa XMM0, XMM4; 2276 por XMM0, XMM5; 2277 por XMM7, XMM6; 2278 movdqa XMM1, XMM2; 2279 por XMM1, XMM3; 2280 pxor XMM7, XMM1; 2281 por XMM7, XMM0; 2282 por XMM7, XMM6; 2283 pmovmskb RAX, XMM7; 2284 and RAX, 0b0011_1111_1111_1111; 2285 ret; 2286 } 2287 } 2288 2289 /** 2290 * Skips between 0 and 16 bytes that match (or do not match) 
one of the 2291 * given $(B chars). 2292 */ 2293 void skip(bool matching, chars...)(const ubyte*, ulong*, ulong*) pure nothrow 2294 @trusted @nogc if (chars.length <= 8) 2295 { 2296 enum constant = ByteCombine!chars; 2297 enum charsLength = chars.length; 2298 static if (matching) 2299 enum flags = 0b0001_0000; 2300 else 2301 enum flags = 0b0000_0000; 2302 asm pure nothrow @nogc 2303 { 2304 naked; 2305 movdqu XMM1, [RDX]; 2306 mov R10, constant; 2307 movq XMM2, R10; 2308 mov RAX, charsLength; 2309 mov RDX, 16; 2310 pcmpestri XMM2, XMM1, flags; 2311 add [RSI], RCX; 2312 add [RDI], RCX; 2313 ret; 2314 } 2315 } 2316 2317 /** 2318 * Returns: the number of bytes starting at the given location that match 2319 * (or do not match if $(B invert) is true) the byte ranges in $(B chars). 2320 */ 2321 ulong rangeMatch(bool invert, chars...)(const ubyte*) pure nothrow @trusted @nogc 2322 { 2323 static assert (chars.length % 2 == 0); 2324 enum constant = ByteCombine!chars; 2325 static if (invert) 2326 enum rangeMatchFlags = 0b0000_0100; 2327 else 2328 enum rangeMatchFlags = 0b0001_0100; 2329 enum charsLength = chars.length; 2330 asm pure nothrow @nogc 2331 { 2332 naked; 2333 movdqu XMM1, [RDI]; 2334 mov R10, constant; 2335 movq XMM2, R10; 2336 mov RAX, charsLength; 2337 mov RDX, 16; 2338 pcmpestri XMM2, XMM1, rangeMatchFlags; 2339 mov RAX, RCX; 2340 ret; 2341 } 2342 } 2343 2344 template ByteCombine(c...) 2345 { 2346 static assert (c.length <= 8); 2347 static if (c.length > 1) 2348 enum ulong ByteCombine = c[0] | (ByteCombine!(c[1..$]) << 8); 2349 else 2350 enum ulong ByteCombine = c[0]; 2351 } 2352 }
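
// A small usage sketch (expected values follow from the behavior documented above):
// getTokensForParser skips whitespace and attaches a leading ddoc comment to the
// token that follows it.
unittest
{
    auto source = cast(ubyte[]) "/// A function.\nvoid f() {}";
    auto cache = new StringCache(StringCache.defaultBucketCount);
    auto tokens = getTokensForParser(source, LexerConfig(), cache);
    assert (tokens.length == 6);
    assert (tokens[0].type == tok!"void");
    assert (tokens[0].comment == "A function.");
}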