module dparse.lexer;

import std.typecons;
// NOTE(review): std.typetuple is deprecated; std.meta provides the AliasSeq
// used below -- consider migrating once the minimum supported compiler allows.
import std.typetuple;
import std.array;
import std.algorithm;
import std.range;
import std.experimental.lexer;
import std.traits;
import core.cpuid : sse42;
// Enable the hand-written SSE4.2 fast paths only where x86-64 inline assembly
// is available and the target is not Windows.
version (D_InlineAsm_X86_64)
{
    version (Windows) {}
    else version = iasm64NotWindows;
}

public import dparse.trivia;

/// Operators
private enum operators = [
    ",", ".", "..", "...", "/", "/=", "!", "!<", "!<=", "!<>", "!<>=", "!=",
    "!>", "!>=", "$", "%", "%=", "&", "&&", "&=", "(", ")", "*", "*=", "+", "++",
    "+=", "-", "--", "-=", ":", ";", "<", "<<", "<<=", "<=", "<>", "<>=", "=",
    "==", "=>", ">", ">=", ">>", ">>=", ">>>", ">>>=", "?", "@", "[", "]", "^",
    "^=", "^^", "^^=", "{", "|", "|=", "||", "}", "~", "~="
];

/// Keywords
private enum keywords = [
    "abstract", "alias", "align", "asm", "assert", "auto", "bool",
    "break", "byte", "case", "cast", "catch", "cdouble", "cent", "cfloat",
    "char", "class", "const", "continue", "creal", "dchar", "debug", "default",
    "delegate", "delete", "deprecated", "do", "double", "else", "enum",
    "export", "extern", "false", "final", "finally", "float", "for", "foreach",
    "foreach_reverse", "function", "goto", "idouble", "if", "ifloat",
    "immutable", "import", "in", "inout", "int", "interface", "invariant",
    "ireal", "is", "lazy", "long", "macro", "mixin", "module", "new", "nothrow",
    "null", "out", "override", "package", "pragma", "private", "protected",
    "public", "pure", "real", "ref", "return", "scope", "shared", "short",
    "static", "struct", "super", "switch", "synchronized", "template", "this",
    "throw", "true", "try", "typedef", "typeid", "typeof", "ubyte", "ucent",
    "uint", "ulong", "union", "unittest", "ushort", "version", "void",
    "wchar", "while", "with", "__DATE__", "__EOF__", "__FILE__",
    "__FILE_FULL_PATH__", "__FUNCTION__", "__gshared", "__LINE__", "__MODULE__",
    "__parameters", "__PRETTY_FUNCTION__", "__TIME__", "__TIMESTAMP__", "__traits",
    "__vector", "__VENDOR__", "__VERSION__"
];

/// Other tokens (token kinds whose text varies per occurrence).
private enum dynamicTokens = [
    "specialTokenSequence", "comment", "identifier", "scriptLine",
    "whitespace", "doubleLiteral", "floatLiteral", "idoubleLiteral",
    "ifloatLiteral", "intLiteral", "longLiteral", "realLiteral",
    "irealLiteral", "uintLiteral", "ulongLiteral", "characterLiteral",
    "dstringLiteral", "stringLiteral", "wstringLiteral"
];

// Flat (prefix, handler-name) pairs: when the generated lexer sees the prefix
// it dispatches to the named member function of DLexer to produce the token.
private enum pseudoTokenHandlers = [
    "\"", "lexStringLiteral",
    "`", "lexWysiwygString",
    "//", "lexSlashSlashComment",
    "/*", "lexSlashStarComment",
    "/+", "lexSlashPlusComment",
    ".", "lexDot",
    "'", "lexCharacterLiteral",
    "0", "lexNumber",
    "1", "lexDecimal",
    "2", "lexDecimal",
    "3", "lexDecimal",
    "4", "lexDecimal",
    "5", "lexDecimal",
    "6", "lexDecimal",
    "7", "lexDecimal",
    "8", "lexDecimal",
    "9", "lexDecimal",
    "q\"", "lexDelimitedString",
    "q{", "lexTokenString",
    "r\"", "lexWysiwygString",
    "x\"", "lexHexString",
    " ", "lexWhitespace",
    "\t", "lexWhitespace",
    "\r", "lexWhitespace",
    "\n", "lexWhitespace",
    "\v", "lexWhitespace",
    "\f", "lexWhitespace",
    "\u2028", "lexLongNewline",
    "\u2029", "lexLongNewline",
    "#!", "lexScriptLine",
    "#line", "lexSpecialTokenSequence"
];

/// Token ID type for the D lexer.
public alias IdType = TokenIdType!(operators, dynamicTokens, keywords);

/**
 * Function used for converting an IdType to a string.
 *
 * Examples:
 * ---
 * IdType c = tok!"case";
 * assert (str(c) == "case");
 * ---
 */
public alias str = tokenStringRepresentation!(IdType, operators, dynamicTokens, keywords);

/**
 * Template used to refer to D token types.
 *
 * See the $(B operators), $(B keywords), and $(B dynamicTokens) enums for
 * values that can be passed to this template.
 * Example:
 * ---
 * import dparse.lexer;
 * IdType t = tok!"floatLiteral";
 * ---
 */
public template tok(string token)
{
    alias tok = TokenId!(IdType, operators, dynamicTokens, keywords, token);
}

// Extra fields mixed into every Token instance (see `extraFields` below).
mixin template TokenTriviaFields()
{
    /**
     * Whitespace and comment tokens attached to this token.
     *
     * All trivia tokens must have the text property set to the text with
     * which they identify with. This means you can map all trivia tokens to
     * their .text property and join them together to get the source code back
     * without any loss of information.
     *
     * Trivia is only included when calling getTokensForParser. When iterating
     * over DLexer all tokens will be in their raw form and none will be
     * converted to trivia.
     *
     * Note: in the future you might need to explicitly pass
     * WhitespaceBehavior.include (or keep the default) as getTokensForParser
     * currently overrides it to include.
     *
     * Contains: `comment`, `whitespace`, `specialTokenSequence`
     */
    immutable(typeof(this))[] leadingTrivia;
    /// ditto
    immutable(typeof(this))[] trailingTrivia;

    // Lazily-filled caches for the `comment` / `trailingComment` properties.
    string memoizedLeadingComment = null;
    string memoizedTrailingComment = null;

    /// Legacy property to get documentation comments, with comment border
    /// stripped off, which is attached to this token.
    string comment() const pure nothrow @safe @property {
        import dparse.trivia : extractLeadingDdoc;
        if (memoizedLeadingComment !is null)
            return memoizedLeadingComment;
        // cast() drops const so the computed value can be memoized on a const
        // token; the cache only ever transitions from null to a value.
        return (cast()memoizedLeadingComment) = this.extractLeadingDdoc;
    }

    /// ditto
    string trailingComment() const pure nothrow @safe @property {
        import dparse.trivia : extractTrailingDdoc;
        if (memoizedTrailingComment !is null)
            return memoizedTrailingComment;
        // Same const-dropping memoization as in comment() above.
        return (cast()memoizedTrailingComment) = this.extractTrailingDdoc;
    }

    /// Orders tokens by their byte index into the source.
    int opCmp(size_t i) const pure nothrow @safe @nogc {
        if (index < i) return -1;
        if (index > i) return 1;
        return 0;
    }

    /// ditto
    int opCmp(ref const typeof(this) other) const pure nothrow @safe @nogc {
        return opCmp(other.index);
    }
}

// mixin in from dparse.lexer to make error messages more managable size as the
// entire string is dumped when there is a type mismatch.
private enum extraFields = "import dparse.lexer:TokenTriviaFields; mixin TokenTriviaFields;";

/// The token type in the D lexer
public alias Token = std.experimental.lexer.TokenStructure!(IdType, extraFields);

/**
 * Configure whitespace handling
 */
public enum WhitespaceBehavior : ubyte
{
    include = 0b0000_0000,
    skip = 0b0000_0001,
}

// Deprecation text shared by the no-longer-working StringBehavior options.
private enum stringBehaviorNotWorking = "Automatic string parsing is not "
    ~ "supported and was previously not working. To unescape strings use the "
    ~ "`dparse.strings : unescapeString` function on the token texts instead.";

/**
 * Configure string lexing behavior
 */
// was enum, but struct now for deprecations and support with old compilers
public struct StringBehavior
{
    /// Do not include quote characters, process escape sequences
    deprecated(stringBehaviorNotWorking) static immutable StringBehavior compiler = StringBehavior(0b0000_0000);
    /// Opening quotes, closing quotes, and string suffixes are included in
    /// the string token
    deprecated(stringBehaviorNotWorking) static immutable StringBehavior includeQuoteChars = StringBehavior(0b0000_0001);
    /// String escape sequences are not replaced
    deprecated(stringBehaviorNotWorking) static immutable StringBehavior notEscaped = StringBehavior(0b0000_0010);
    /// Not modified at all. Useful for formatters or highlighters
    static immutable StringBehavior source = StringBehavior(0b0000_0011);

    ubyte behavior;
    alias behavior this;
}

/// Whether comment token text is interned in the string cache or sliced as-is.
public enum CommentBehavior : bool
{
    intern = true,
    noIntern = false
}

/**
 * Lexer configuration struct
 */
public struct LexerConfig
{
    string fileName;
    StringBehavior stringBehavior;
    WhitespaceBehavior whitespaceBehavior;
    CommentBehavior commentBehavior = CommentBehavior.intern;
}

/**
 * Basic type token types.
 */
public alias BasicTypes = AliasSeq!(tok!"int", tok!"bool", tok!"byte",
    tok!"cdouble", tok!"cent", tok!"cfloat", tok!"char", tok!"creal",
    tok!"dchar", tok!"double", tok!"float", tok!"idouble",
    tok!"ifloat", tok!"ireal", tok!"long", tok!"real", tok!"short",
    tok!"ubyte", tok!"ucent", tok!"uint", tok!"ulong", tok!"ushort",
    tok!"void", tok!"wchar");

/**
 * Returns: true if the given ID is for a basic type.
245 */ 246 public bool isBasicType(IdType type) nothrow pure @safe @nogc 247 { 248 switch (type) 249 { 250 foreach (T; BasicTypes) 251 { 252 case T: 253 return true; 254 } 255 default: 256 return false; 257 } 258 } 259 260 /** 261 * Number literal token types. 262 */ 263 public alias NumberLiterals = AliasSeq!(tok!"doubleLiteral", 264 tok!"floatLiteral", tok!"idoubleLiteral", tok!"ifloatLiteral", 265 tok!"intLiteral", tok!"longLiteral", tok!"realLiteral", 266 tok!"irealLiteral", tok!"uintLiteral", tok!"ulongLiteral"); 267 268 /** 269 * Returns: true if the given ID type is for a number literal. 270 */ 271 public bool isNumberLiteral(IdType type) nothrow pure @safe @nogc 272 { 273 switch (type) 274 { 275 foreach (T; NumberLiterals) 276 { 277 case T: 278 return true; 279 } 280 default: 281 return false; 282 } 283 } 284 285 /** 286 * Number literal token types. 287 */ 288 public alias IntegerLiterals = AliasSeq!(tok!"intLiteral", tok!"longLiteral", 289 tok!"uintLiteral", tok!"ulongLiteral"); 290 291 /** 292 * Returns: true if the given ID type is for a integer literal. 293 */ 294 public bool isIntegerLiteral(IdType type) nothrow pure @safe @nogc 295 { 296 switch (type) 297 { 298 foreach (T; IntegerLiterals) 299 { 300 case T: 301 return true; 302 } 303 default: 304 return false; 305 } 306 } 307 308 /** 309 * Operator token types. 
310 */ 311 public alias Operators = AliasSeq!(tok!",", tok!".", tok!"..", tok!"...", 312 tok!"/", tok!"/=", tok!"!", tok!"!<", tok!"!<=", tok!"!<>", 313 tok!"!<>=", tok!"!=", tok!"!>", tok!"!>=", tok!"$", tok!"%", 314 tok!"%=", tok!"&", tok!"&&", tok!"&=", tok!"(", tok!")", 315 tok!"*", tok!"*=", tok!"+", tok!"++", tok!"+=", tok!"-", 316 tok!"--", tok!"-=", tok!":", tok!";", tok!"<", tok!"<<", 317 tok!"<<=", tok!"<=", tok!"<>", tok!"<>=", tok!"=", tok!"==", 318 tok!"=>", tok!">", tok!">=", tok!">>", tok!">>=", tok!">>>", 319 tok!">>>=", tok!"?", tok!"@", tok!"[", tok!"]", tok!"^", 320 tok!"^=", tok!"^^", tok!"^^=", tok!"{", tok!"|", tok!"|=", 321 tok!"||", tok!"}", tok!"~", tok!"~="); 322 323 /** 324 * Returns: true if the given ID type is for an operator. 325 */ 326 public bool isOperator(IdType type) nothrow pure @safe @nogc 327 { 328 switch (type) 329 { 330 foreach (T; Operators) 331 { 332 case T: 333 return true; 334 } 335 default: 336 return false; 337 } 338 } 339 340 /** 341 * Keyword token types. 
342 */ 343 public alias Keywords = AliasSeq!(tok!"abstract", tok!"alias", tok!"align", 344 tok!"asm", tok!"assert", tok!"auto", tok!"break", 345 tok!"case", tok!"cast", tok!"catch", tok!"class", tok!"const", 346 tok!"continue", tok!"debug", tok!"default", tok!"delegate", 347 tok!"delete", tok!"deprecated", tok!"do", tok!"else", tok!"enum", 348 tok!"export", tok!"extern", tok!"false", tok!"final", tok!"finally", 349 tok!"for", tok!"foreach", tok!"foreach_reverse", tok!"function", 350 tok!"goto", tok!"if", tok!"immutable", tok!"import", tok!"in", 351 tok!"inout", tok!"interface", tok!"invariant", tok!"is", 352 tok!"lazy", tok!"macro", tok!"mixin", tok!"module", tok!"new", 353 tok!"nothrow", tok!"null", tok!"out", tok!"override", tok!"package", 354 tok!"pragma", tok!"private", tok!"protected", tok!"public", 355 tok!"pure", tok!"ref", tok!"return", tok!"scope", tok!"shared", 356 tok!"static", tok!"struct", tok!"super", tok!"switch", tok!"synchronized", 357 tok!"template", tok!"this", tok!"throw", tok!"true", tok!"try", 358 tok!"typedef", tok!"typeid", tok!"typeof", tok!"union", tok!"unittest", 359 tok!"version", tok!"while", tok!"with", tok!"__DATE__", 360 tok!"__EOF__", tok!"__FILE__", tok!"__FILE_FULL_PATH__", tok!"__FUNCTION__", 361 tok!"__gshared", tok!"__LINE__", tok!"__MODULE__", tok!"__parameters", 362 tok!"__PRETTY_FUNCTION__", tok!"__TIME__", tok!"__TIMESTAMP__", 363 tok!"__traits", tok!"__vector", tok!"__VENDOR__", tok!"__VERSION__"); 364 365 /** 366 * Returns: true if the given ID type is for a keyword. 367 */ 368 public bool isKeyword(IdType type) pure nothrow @safe @nogc 369 { 370 switch (type) 371 { 372 foreach (T; Keywords) 373 { 374 case T: 375 return true; 376 } 377 default: 378 return false; 379 } 380 } 381 382 /** 383 * String literal token types 384 */ 385 public alias StringLiterals = AliasSeq!(tok!"dstringLiteral", 386 tok!"stringLiteral", tok!"wstringLiteral"); 387 388 /** 389 * Returns: true if the given ID type is for a string literal. 
390 */ 391 public bool isStringLiteral(IdType type) pure nothrow @safe @nogc 392 { 393 switch (type) 394 { 395 foreach (T; StringLiterals) 396 { 397 case T: 398 return true; 399 } 400 default: 401 return false; 402 } 403 } 404 405 /** 406 * Protection token types. 407 */ 408 public alias Protections = AliasSeq!(tok!"export", tok!"package", 409 tok!"private", tok!"public", tok!"protected"); 410 411 /** 412 * Returns: true if the given ID type is for a protection attribute. 413 */ 414 public bool isProtection(IdType type) pure nothrow @safe @nogc 415 { 416 switch (type) 417 { 418 foreach (T; Protections) 419 { 420 case T: 421 return true; 422 } 423 default: 424 return false; 425 } 426 } 427 428 public alias SpecialTokens = AliasSeq!(tok!"__DATE__", tok!"__TIME__", 429 tok!"__TIMESTAMP__", tok!"__VENDOR__", tok!"__VERSION__", tok!"__FILE__", 430 tok!"__FILE_FULL_PATH__", tok!"__LINE__", tok!"__MODULE__", 431 tok!"__FUNCTION__", tok!"__PRETTY_FUNCTION__"); 432 433 public bool isSpecialToken(IdType type) pure nothrow @safe @nogc 434 { 435 switch (type) 436 { 437 foreach (T; SpecialTokens) 438 { 439 case T: 440 return true; 441 } 442 default: 443 return false; 444 } 445 } 446 447 public alias Literals = AliasSeq!(StringLiterals, NumberLiterals, tok!"characterLiteral", 448 SpecialTokens, tok!"true", tok!"false", tok!"null", tok!"$"); 449 450 public bool isLiteral(IdType type) pure nothrow @safe @nogc 451 { 452 switch (type) 453 { 454 foreach (T; Literals) 455 { 456 case T: 457 return true; 458 } 459 default: 460 return false; 461 } 462 } 463 464 /** 465 * Returns: an array of tokens lexed from the given source code to the output 466 * range. All whitespace, comment and specialTokenSequence tokens (trivia) are 467 * attached to the token nearest to them. 
 *
 * Trivia is put on the last token as `trailingTrivia` if it is on the same
 * line as the trivia, otherwise it will be attached to the next token in the
 * `leadingTrivia` until there is the EOF, where it will be attached as
 * `trailingTrivia` again.
 */
const(Token)[] getTokensForParser(R)(R sourceCode, LexerConfig config, StringCache* cache)
if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
{
    // Trivia collection needs whitespace tokens and raw (non-interned)
    // comment text, so the caller-supplied settings are overridden here.
    config.whitespaceBehavior = WhitespaceBehavior.include;
    config.commentBehavior = CommentBehavior.noIntern;

    auto leadingTriviaAppender = appender!(Token[])();
    leadingTriviaAppender.reserve(128);
    auto trailingTriviaAppender = appender!(Token[])();
    trailingTriviaAppender.reserve(128);

    auto output = appender!(typeof(return))();
    auto lexer = DLexer(sourceCode, config, cache);
    loop: while (!lexer.empty) switch (lexer.front.type)
    {
    case tok!"specialTokenSequence":
    case tok!"whitespace":
    case tok!"comment":
        // Trivia on the same line as the previously emitted token trails it;
        // otherwise it is buffered to lead the next real token.
        if (!output.data.empty && lexer.front.line == output.data[$ - 1].line)
            trailingTriviaAppender.put(lexer.front);
        else
            leadingTriviaAppender.put(lexer.front);
        lexer.popFront();
        break;
    case tok!"__EOF__":
        break loop;
    default:
        Token t = lexer.front;
        lexer.popFront();

        // cast() drops the immutability of the already-emitted token so its
        // trailing trivia can be attached; the array itself is freshly idup'd.
        if (!output.data.empty && !trailingTriviaAppender.data.empty)
            (cast() output.data[$ - 1].trailingTrivia) = trailingTriviaAppender.data.idup;
        t.leadingTrivia = leadingTriviaAppender.data.idup;
        leadingTriviaAppender.clear();
        trailingTriviaAppender.clear();

        output.put(t);
        break;
    }

    // Any trivia still buffered at EOF becomes trailing trivia of the last
    // emitted token.
    if (!output.data.empty)
    {
        trailingTriviaAppender.put(leadingTriviaAppender.data);
        (cast() output.data[$ - 1].trailingTrivia) = trailingTriviaAppender.data.idup;
    }

    return output.data;
}

/**
 * The D lexer struct.
 */
public struct DLexer
{
    // Generates the token-dispatch machinery from the tables declared at the
    // top of this module; pseudoTokenHandlers name member functions below.
    mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens,
        keywords, pseudoTokenHandlers);

    ///
    @disable this();

    /**
     * Params:
     *     range = the bytes that compose the source code that will be lexed.
     *     config = the lexer configuration to use.
     *     cache = the string interning cache for de-duplicating identifiers and
     *         other token text.
     *     haveSSE42 = Parse streaming SIMD Extensions 4.2 in inline assembly
     */
    this(R)(R range, const LexerConfig config, StringCache* cache,
        bool haveSSE42 = sse42()) pure nothrow @safe
    if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
    {
        this.haveSSE42 = haveSSE42;
        // Skip a UTF-8 byte order mark (EF BB BF) if the source starts with one.
        auto r = (range.length >= 3 && range[0] == 0xef && range[1] == 0xbb && range[2] == 0xbf)
            ? range[3 .. $] : range;
        // Mutable input is duplicated so token slices remain valid after the
        // caller's buffer changes; immutable input is sliced directly.
        static if (is(ElementEncodingType!R == immutable))
            this.range = LexerRange(cast(const(ubyte)[]) r);
        else
            this.range = LexerRange(cast(const(ubyte)[]) r.idup);
        this.config = config;
        this.cache = cache;
        // Prime the first token so front is valid immediately.
        popFront();
    }

    ///
    public void popFront()() pure nothrow @safe
    {
        // When whitespace is configured to be skipped, keep lexing until a
        // non-whitespace token becomes the front element.
        do
            _popFront();
        while (config.whitespaceBehavior == WhitespaceBehavior.skip
            && _front.type == tok!"whitespace");
    }

    /**
     * Lexer error/warning message.
     */
    static struct Message
    {
        /// 1-based line number
        size_t line;
        /// 1-based byte offset
        size_t column;
        /// Text of the message
        string message;
        /// `true` for an error, `false` for a warning
        bool isError;
    }

    /**
     * Returns: An array of all of the warnings and errors generated so far
     * during lexing. It may make sense to only check this when `empty`
     * returns `true`.
586 */ 587 const(Message[]) messages() const @property 588 { 589 return _messages; 590 } 591 592 private pure nothrow @safe: 593 594 bool isWhitespace() 595 { 596 switch (range.bytes[range.index]) 597 { 598 case ' ': 599 case '\r': 600 case '\n': 601 case '\t': 602 case '\v': 603 case '\f': 604 return true; 605 case 0xe2: 606 auto peek = range.peek(2); 607 return peek.length == 2 608 && peek[0] == 0x80 609 && (peek[1] == 0xa8 || peek[1] == 0xa9); 610 default: 611 return false; 612 } 613 } 614 615 void popFrontWhitespaceAware() 616 { 617 switch (range.bytes[range.index]) 618 { 619 case '\r': 620 range.popFront(); 621 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n') 622 { 623 range.popFront(); 624 range.incrementLine(); 625 } 626 else 627 range.incrementLine(); 628 return; 629 case '\n': 630 range.popFront(); 631 range.incrementLine(); 632 return; 633 case 0xe2: 634 auto lookahead = range.peek(3); 635 if (lookahead.length == 3 && lookahead[1] == 0x80 636 && (lookahead[2] == 0xa8 || lookahead[2] == 0xa9)) 637 { 638 range.index+=3; 639 range.column+=3; 640 range.incrementLine(); 641 return; 642 } 643 else 644 { 645 range.popFront(); 646 return; 647 } 648 default: 649 range.popFront(); 650 return; 651 } 652 } 653 654 void lexWhitespace(ref Token token) @trusted 655 { 656 mixin (tokenStart); 657 loop: do 658 { 659 version (iasm64NotWindows) 660 { 661 if (haveSSE42 && range.index + 16 < range.bytes.length) 662 { 663 skip!(true, '\t', ' ', '\v', '\f')(range.bytes.ptr + range.index, 664 &range.index, &range.column); 665 } 666 } 667 switch (range.bytes[range.index]) 668 { 669 case '\r': 670 range.popFront(); 671 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n') 672 { 673 range.popFront(); 674 } 675 range.column = 1; 676 range.line += 1; 677 break; 678 case '\n': 679 range.popFront(); 680 range.column = 1; 681 range.line += 1; 682 break; 683 case ' ': 684 case '\t': 685 case '\v': 686 case '\f': 687 range.popFront(); 
688 break; 689 case 0xe2: 690 if (range.index + 2 >= range.bytes.length) 691 break loop; 692 if (range.bytes[range.index + 1] != 0x80) 693 break loop; 694 if (range.bytes[range.index + 2] == 0xa8 || range.bytes[range.index + 2] == 0xa9) 695 { 696 range.index += 3; 697 range.column += 3; 698 range.column = 1; 699 range.line += 1; 700 break; 701 } 702 break loop; 703 default: 704 break loop; 705 } 706 } while (!(range.index >= range.bytes.length)); 707 string text = config.whitespaceBehavior == WhitespaceBehavior.include 708 ? cache.intern(range.slice(mark)) : ""; 709 token = Token(tok!"whitespace", text, line, column, index); 710 } 711 712 void lexNumber(ref Token token) 713 { 714 mixin (tokenStart); 715 if (range.bytes[range.index] == '0' && range.index + 1 < range.bytes.length) 716 { 717 immutable ahead = range.bytes[range.index + 1]; 718 switch (ahead) 719 { 720 case 'x': 721 case 'X': 722 range.index += 2; 723 range.column += 2; 724 lexHex(token, mark, line, column, index); 725 return; 726 case 'b': 727 case 'B': 728 range.index += 2; 729 range.column += 2; 730 lexBinary(token, mark, line, column, index); 731 return; 732 default: 733 lexDecimal(token, mark, line, column, index); 734 return; 735 } 736 } 737 else 738 lexDecimal(token, mark, line, column, index); 739 } 740 741 void lexHex(ref Token token) 742 { 743 mixin (tokenStart); 744 lexHex(token, mark, line, column, index); 745 } 746 747 void lexHex(ref Token token, size_t mark, size_t line, size_t column, 748 size_t index) @trusted 749 { 750 IdType type = tok!"intLiteral"; 751 bool foundDot; 752 hexLoop: while (!(range.index >= range.bytes.length)) 753 { 754 switch (range.bytes[range.index]) 755 { 756 case 'a': .. case 'f': 757 case 'A': .. case 'F': 758 case '0': .. 
case '9': 759 case '_': 760 version (iasm64NotWindows) 761 { 762 if (haveSSE42 && range.index + 16 < range.bytes.length) 763 { 764 immutable ulong i = rangeMatch!(false, '0', '9', 'a', 'f', 'A', 'F', '_', '_') 765 (range.bytes.ptr + range.index); 766 range.column += i; 767 range.index += i; 768 } 769 else 770 range.popFront(); 771 } 772 else 773 range.popFront(); 774 break; 775 case 'u': 776 case 'U': 777 lexIntSuffix(type); 778 break hexLoop; 779 case 'i': 780 if (foundDot) 781 lexFloatSuffix(type); 782 break hexLoop; 783 case 'L': 784 if (foundDot) 785 lexFloatSuffix(type); 786 else 787 lexIntSuffix(type); 788 break hexLoop; 789 case 'p': 790 case 'P': 791 lexExponent(type); 792 break hexLoop; 793 case '.': 794 if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.') 795 break hexLoop; 796 else 797 { 798 // The following bit of silliness tries to tell the 799 // difference between "int dot identifier" and 800 // "double identifier". 801 if (range.index + 1 < range.bytes.length) 802 { 803 switch (range.peekAt(1)) 804 { 805 case '0': .. case '9': 806 case 'A': .. case 'F': 807 case 'a': .. 
case 'f': 808 goto doubleLiteral; 809 default: 810 break hexLoop; 811 } 812 } 813 else 814 { 815 doubleLiteral: 816 range.popFront(); 817 foundDot = true; 818 type = tok!"doubleLiteral"; 819 } 820 } 821 break; 822 default: 823 break hexLoop; 824 } 825 } 826 token = Token(type, cache.intern(range.slice(mark)), line, column, 827 index); 828 } 829 830 void lexBinary(ref Token token) 831 { 832 mixin (tokenStart); 833 return lexBinary(token, mark, line, column, index); 834 } 835 836 void lexBinary(ref Token token, size_t mark, size_t line, size_t column, 837 size_t index) @trusted 838 { 839 IdType type = tok!"intLiteral"; 840 binaryLoop: while (!(range.index >= range.bytes.length)) 841 { 842 switch (range.bytes[range.index]) 843 { 844 case '0': 845 case '1': 846 case '_': 847 version (iasm64NotWindows) 848 { 849 if (haveSSE42 && range.index + 16 < range.bytes.length) 850 { 851 immutable ulong i = rangeMatch!(false, '0', '1', '_', '_')( 852 range.bytes.ptr + range.index); 853 range.column += i; 854 range.index += i; 855 } 856 else 857 range.popFront(); 858 } 859 else 860 range.popFront(); 861 break; 862 case 'u': 863 case 'U': 864 case 'L': 865 lexIntSuffix(type); 866 break binaryLoop; 867 default: 868 break binaryLoop; 869 } 870 } 871 token = Token(type, cache.intern(range.slice(mark)), line, column, 872 index); 873 } 874 875 void lexDecimal(ref Token token) 876 { 877 mixin (tokenStart); 878 lexDecimal(token, mark, line, column, index); 879 } 880 881 void lexDecimal(ref Token token, size_t mark, size_t line, size_t column, 882 size_t index) @trusted 883 { 884 bool foundDot = range.bytes[range.index] == '.'; 885 IdType type = tok!"intLiteral"; 886 if (foundDot) 887 { 888 range.popFront(); 889 type = tok!"doubleLiteral"; 890 } 891 892 decimalLoop: while (!(range.index >= range.bytes.length)) 893 { 894 switch (range.bytes[range.index]) 895 { 896 case '0': .. 
case '9': 897 case '_': 898 version (iasm64NotWindows) 899 { 900 if (haveSSE42 && range.index + 16 < range.bytes.length) 901 { 902 immutable ulong i = rangeMatch!(false, '0', '9', '_', '_')(range.bytes.ptr + range.index); 903 range.column += i; 904 range.index += i; 905 } 906 else 907 range.popFront(); 908 } 909 else 910 range.popFront(); 911 break; 912 case 'u': 913 case 'U': 914 if (!foundDot) 915 lexIntSuffix(type); 916 break decimalLoop; 917 case 'i': 918 lexFloatSuffix(type); 919 break decimalLoop; 920 case 'L': 921 if (foundDot) 922 lexFloatSuffix(type); 923 else 924 lexIntSuffix(type); 925 break decimalLoop; 926 case 'f': 927 case 'F': 928 lexFloatSuffix(type); 929 break decimalLoop; 930 case 'e': 931 case 'E': 932 lexExponent(type); 933 break decimalLoop; 934 case '.': 935 if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.') 936 break decimalLoop; 937 else 938 { 939 // The following bit of silliness tries to tell the 940 // difference between "int dot identifier" and 941 // "double identifier". 
942 if (range.index + 1 < range.bytes.length) 943 { 944 immutable ch = range.peekAt(1); 945 if (ch <= 0x2f 946 || (ch >= '0' && ch <= '9') 947 || (ch >= ':' && ch <= '@') 948 || (ch >= '[' && ch <= '^') 949 || (ch >= '{' && ch <= '~') 950 || ch == '`' || ch == '_') 951 { 952 goto doubleLiteral; 953 } 954 else 955 break decimalLoop; 956 } 957 else 958 { 959 doubleLiteral: 960 range.popFront(); 961 foundDot = true; 962 type = tok!"doubleLiteral"; 963 } 964 } 965 break; 966 default: 967 break decimalLoop; 968 } 969 } 970 token = Token(type, cache.intern(range.slice(mark)), line, column, 971 index); 972 } 973 974 void lexIntSuffix(ref IdType type) pure nothrow @safe 975 { 976 bool secondPass; 977 if (range.bytes[range.index] == 'u' || range.bytes[range.index] == 'U') 978 { 979 U: 980 if (type == tok!"intLiteral") 981 type = tok!"uintLiteral"; 982 else 983 type = tok!"ulongLiteral"; 984 range.popFront(); 985 if (secondPass) 986 return; 987 if (range.index < range.bytes.length 988 && (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l')) 989 goto L; 990 goto I; 991 } 992 if (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l') 993 { 994 L: 995 if (type == tok!"uintLiteral") 996 type = tok!"ulongLiteral"; 997 else 998 type = tok!"longLiteral"; 999 range.popFront(); 1000 if (range.index < range.bytes.length 1001 && (range.bytes[range.index] == 'U' || range.bytes[range.index] == 'u')) 1002 { 1003 secondPass = true; 1004 goto U; 1005 } 1006 goto I; 1007 } 1008 I: 1009 if (range.index < range.bytes.length && range.bytes[range.index] == 'i') 1010 { 1011 warning("Complex number literals are deprecated"); 1012 range.popFront(); 1013 if (type == tok!"longLiteral" || type == tok!"ulongLiteral") 1014 type = tok!"idoubleLiteral"; 1015 else 1016 type = tok!"ifloatLiteral"; 1017 } 1018 } 1019 1020 void lexFloatSuffix(ref IdType type) pure nothrow @safe 1021 { 1022 switch (range.bytes[range.index]) 1023 { 1024 case 'L': 1025 range.popFront(); 1026 type = 
tok!"doubleLiteral"; 1027 break; 1028 case 'f': 1029 case 'F': 1030 range.popFront(); 1031 type = tok!"floatLiteral"; 1032 break; 1033 default: 1034 break; 1035 } 1036 if (range.index < range.bytes.length && range.bytes[range.index] == 'i') 1037 { 1038 warning("Complex number literals are deprecated"); 1039 range.popFront(); 1040 if (type == tok!"floatLiteral") 1041 type = tok!"ifloatLiteral"; 1042 else 1043 type = tok!"idoubleLiteral"; 1044 } 1045 } 1046 1047 void lexExponent(ref IdType type) pure nothrow @safe 1048 { 1049 range.popFront(); 1050 bool foundSign = false; 1051 bool foundDigit = false; 1052 while (range.index < range.bytes.length) 1053 { 1054 switch (range.bytes[range.index]) 1055 { 1056 case '-': 1057 case '+': 1058 if (foundSign) 1059 { 1060 if (!foundDigit) 1061 error("Expected an exponent"); 1062 return; 1063 } 1064 foundSign = true; 1065 range.popFront(); 1066 break; 1067 case '0': .. case '9': 1068 case '_': 1069 foundDigit = true; 1070 range.popFront(); 1071 break; 1072 case 'L': 1073 case 'f': 1074 case 'F': 1075 case 'i': 1076 lexFloatSuffix(type); 1077 return; 1078 default: 1079 if (!foundDigit) 1080 error("Expected an exponent"); 1081 return; 1082 } 1083 } 1084 } 1085 1086 void lexScriptLine(ref Token token) 1087 { 1088 mixin (tokenStart); 1089 while (!(range.index >= range.bytes.length) && !isNewline) 1090 { 1091 range.popFront(); 1092 } 1093 token = Token(tok!"scriptLine", cache.intern(range.slice(mark)), 1094 line, column, index); 1095 } 1096 1097 void lexSpecialTokenSequence(ref Token token) 1098 { 1099 mixin (tokenStart); 1100 while (!(range.index >= range.bytes.length) && !isNewline) 1101 { 1102 range.popFront(); 1103 } 1104 token = Token(tok!"specialTokenSequence", cache.intern(range.slice(mark)), 1105 line, column, index); 1106 } 1107 1108 void lexSlashStarComment(ref Token token) @trusted 1109 { 1110 mixin (tokenStart); 1111 IdType type = tok!"comment"; 1112 range.popFrontN(2); 1113 while (range.index < range.bytes.length) 1114 { 
1115 version (iasm64NotWindows) 1116 { 1117 if (haveSSE42 && range.index + 16 < range.bytes.length) 1118 skip!(false, '\r', '\n', '/', '*', 0xe2)(range.bytes.ptr + range.index, 1119 &range.index, &range.column); 1120 } 1121 if (range.bytes[range.index] == '*') 1122 { 1123 range.popFront(); 1124 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/') 1125 { 1126 range.popFront(); 1127 break; 1128 } 1129 } 1130 else 1131 popFrontWhitespaceAware(); 1132 } 1133 if (config.commentBehavior == CommentBehavior.intern) 1134 token = Token(type, cache.intern(range.slice(mark)), line, column, index); 1135 else 1136 token = Token(type, cast(string) range.slice(mark), line, column, index); 1137 } 1138 1139 void lexSlashSlashComment(ref Token token) @trusted 1140 { 1141 mixin (tokenStart); 1142 IdType type = tok!"comment"; 1143 range.popFrontN(2); 1144 while (range.index < range.bytes.length) 1145 { 1146 version (iasm64NotWindows) 1147 { 1148 if (haveSSE42 && range.index + 16 < range.bytes.length) 1149 { 1150 skip!(false, '\r', '\n', 0xe2)(range.bytes.ptr + range.index, 1151 &range.index, &range.column); 1152 } 1153 } 1154 if (range.bytes[range.index] == '\r' || range.bytes[range.index] == '\n') 1155 break; 1156 range.popFront(); 1157 } 1158 if (config.commentBehavior == CommentBehavior.intern) 1159 token = Token(type, cache.intern(range.slice(mark)), line, column, index); 1160 else 1161 token = Token(type, cast(string) range.slice(mark), line, column, index); 1162 } 1163 1164 void lexSlashPlusComment(ref Token token) @trusted 1165 { 1166 mixin (tokenStart); 1167 IdType type = tok!"comment"; 1168 range.index += 2; 1169 range.column += 2; 1170 int depth = 1; 1171 while (depth > 0 && !(range.index >= range.bytes.length)) 1172 { 1173 version (iasm64NotWindows) 1174 { 1175 if (haveSSE42 && range.index + 16 < range.bytes.length) 1176 { 1177 skip!(false, '+', '/', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index, 1178 &range.index, &range.column); 1179 } 1180 
} 1181 if (range.bytes[range.index] == '+') 1182 { 1183 range.popFront(); 1184 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/') 1185 { 1186 range.popFront(); 1187 depth--; 1188 } 1189 } 1190 else if (range.bytes[range.index] == '/') 1191 { 1192 range.popFront(); 1193 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '+') 1194 { 1195 range.popFront(); 1196 depth++; 1197 } 1198 } 1199 else 1200 popFrontWhitespaceAware(); 1201 } 1202 if (config.commentBehavior == CommentBehavior.intern) 1203 token = Token(type, cache.intern(range.slice(mark)), line, column, index); 1204 else 1205 token = Token(type, cast(string) range.slice(mark), line, column, index); 1206 } 1207 1208 void lexStringLiteral(ref Token token) @trusted 1209 { 1210 mixin (tokenStart); 1211 range.popFront(); 1212 while (true) 1213 { 1214 if (range.index >= range.bytes.length) 1215 { 1216 error("Error: unterminated string literal"); 1217 token = Token(tok!""); 1218 return; 1219 } 1220 version (iasm64NotWindows) 1221 { 1222 if (haveSSE42 && range.index + 16 < range.bytes.length) 1223 { 1224 skip!(false, '"', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index, 1225 &range.index, &range.column); 1226 } 1227 } 1228 if (range.bytes[range.index] == '"') 1229 { 1230 range.popFront(); 1231 break; 1232 } 1233 else if (range.bytes[range.index] == '\\') 1234 { 1235 if (!lexEscapeSequence()) 1236 { 1237 token = Token.init; 1238 return; 1239 } 1240 } 1241 else 1242 popFrontWhitespaceAware(); 1243 } 1244 IdType type = tok!"stringLiteral"; 1245 lexStringSuffix(type); 1246 token = Token(type, cache.intern(range.slice(mark)), line, column, 1247 index); 1248 } 1249 1250 void lexWysiwygString(ref Token token) @trusted 1251 { 1252 mixin (tokenStart); 1253 IdType type = tok!"stringLiteral"; 1254 immutable bool backtick = range.bytes[range.index] == '`'; 1255 if (backtick) 1256 { 1257 range.popFront(); 1258 while (true) 1259 { 1260 if (range.index >= range.bytes.length) 1261 { 
1262 error("Error: unterminated string literal"); 1263 token = Token(tok!""); 1264 return; 1265 } 1266 version (iasm64NotWindows) 1267 { 1268 if (haveSSE42 && range.index + 16 < range.bytes.length) 1269 { 1270 skip!(false, '\r', '\n', 0xe2, '`')(range.bytes.ptr + range.index, 1271 &range.index, &range.column); 1272 } 1273 } 1274 if (range.bytes[range.index] == '`') 1275 { 1276 range.popFront(); 1277 break; 1278 } 1279 else 1280 popFrontWhitespaceAware(); 1281 } 1282 } 1283 else 1284 { 1285 range.popFront(); 1286 if (range.index >= range.bytes.length) 1287 { 1288 error("Error: unterminated string literal"); 1289 token = Token(tok!""); 1290 return; 1291 } 1292 range.popFront(); 1293 while (true) 1294 { 1295 if (range.index >= range.bytes.length) 1296 { 1297 error("Error: unterminated string literal"); 1298 token = Token(tok!""); 1299 return; 1300 } 1301 else if (range.bytes[range.index] == '"') 1302 { 1303 range.popFront(); 1304 break; 1305 } 1306 else 1307 popFrontWhitespaceAware(); 1308 } 1309 } 1310 lexStringSuffix(type); 1311 token = Token(type, cache.intern(range.slice(mark)), line, column, 1312 index); 1313 } 1314 1315 private ubyte lexStringSuffix(ref IdType type) pure nothrow @safe 1316 { 1317 if (range.index >= range.bytes.length) 1318 { 1319 type = tok!"stringLiteral"; 1320 return 0; 1321 } 1322 else 1323 { 1324 switch (range.bytes[range.index]) 1325 { 1326 case 'w': range.popFront(); type = tok!"wstringLiteral"; return 'w'; 1327 case 'd': range.popFront(); type = tok!"dstringLiteral"; return 'd'; 1328 case 'c': range.popFront(); type = tok!"stringLiteral"; return 'c'; 1329 default: type = tok!"stringLiteral"; return 0; 1330 } 1331 } 1332 } 1333 1334 void lexDelimitedString(ref Token token) 1335 { 1336 mixin (tokenStart); 1337 range.index += 2; 1338 range.column += 2; 1339 ubyte open; 1340 ubyte close; 1341 switch (range.bytes[range.index]) 1342 { 1343 case '<': 1344 open = '<'; 1345 close = '>'; 1346 range.popFront(); 1347 lexNormalDelimitedString(token, 
mark, line, column, index, open, close); 1348 break; 1349 case '{': 1350 open = '{'; 1351 close = '}'; 1352 range.popFront(); 1353 lexNormalDelimitedString(token, mark, line, column, index, open, close); 1354 break; 1355 case '[': 1356 open = '['; 1357 close = ']'; 1358 range.popFront(); 1359 lexNormalDelimitedString(token, mark, line, column, index, open, close); 1360 break; 1361 case '(': 1362 open = '('; 1363 close = ')'; 1364 range.popFront(); 1365 lexNormalDelimitedString(token, mark, line, column, index, open, close); 1366 break; 1367 default: 1368 lexHeredocString(token, mark, line, column, index); 1369 break; 1370 } 1371 } 1372 1373 void lexNormalDelimitedString(ref Token token, size_t mark, size_t line, size_t column, 1374 size_t index, ubyte open, ubyte close) 1375 { 1376 int depth = 1; 1377 while (!(range.index >= range.bytes.length) && depth > 0) 1378 { 1379 if (range.bytes[range.index] == open) 1380 { 1381 depth++; 1382 range.popFront(); 1383 } 1384 else if (range.bytes[range.index] == close) 1385 { 1386 depth--; 1387 range.popFront(); 1388 if (depth <= 0) 1389 { 1390 if (range.bytes[range.index] == '"') 1391 { 1392 range.popFront(); 1393 } 1394 else 1395 { 1396 error("Error: `\"` expected to end delimited string literal"); 1397 token = Token(tok!""); 1398 return; 1399 } 1400 } 1401 } 1402 else 1403 popFrontWhitespaceAware(); 1404 } 1405 IdType type = tok!"stringLiteral"; 1406 lexStringSuffix(type); 1407 token = Token(type, cache.intern(range.slice(mark)), line, column, index); 1408 } 1409 1410 void lexHeredocString(ref Token token, size_t mark, size_t line, size_t column, size_t index) 1411 { 1412 Token ident; 1413 lexIdentifier(ident); 1414 if (isNewline()) 1415 popFrontWhitespaceAware(); 1416 else 1417 error("Newline expected"); 1418 while (!(range.index >= range.bytes.length)) 1419 { 1420 if (isNewline()) 1421 { 1422 popFrontWhitespaceAware(); 1423 if (!range.canPeek(ident.text.length)) 1424 { 1425 error(ident.text ~ " expected"); 1426 break; 1427 
} 1428 if (range.peek(ident.text.length - 1) == ident.text) 1429 { 1430 range.popFrontN(ident.text.length); 1431 break; 1432 } 1433 } 1434 else 1435 { 1436 range.popFront(); 1437 } 1438 } 1439 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '"') 1440 { 1441 range.popFront(); 1442 } 1443 else 1444 error("`\"` expected"); 1445 IdType type = tok!"stringLiteral"; 1446 lexStringSuffix(type); 1447 token = Token(type, cache.intern(range.slice(mark)), line, column, index); 1448 } 1449 1450 void lexTokenString(ref Token token) 1451 { 1452 mixin (tokenStart); 1453 assert (range.bytes[range.index] == 'q'); 1454 range.popFront(); 1455 assert (range.bytes[range.index] == '{'); 1456 range.popFront(); 1457 auto app = appender!string(); 1458 app.put("q{"); 1459 int depth = 1; 1460 1461 immutable WhitespaceBehavior oldWhitespace = config.whitespaceBehavior; 1462 immutable StringBehavior oldString = config.stringBehavior; 1463 config.whitespaceBehavior = WhitespaceBehavior.include; 1464 config.stringBehavior = StringBehavior.source; 1465 scope (exit) 1466 { 1467 config.whitespaceBehavior = oldWhitespace; 1468 config.stringBehavior = oldString; 1469 } 1470 1471 advance(_front); 1472 while (depth > 0 && !empty) 1473 { 1474 auto t = front(); 1475 if (t.text is null) 1476 app.put(str(t.type)); 1477 else 1478 app.put(t.text); 1479 if (t.type == tok!"}") 1480 { 1481 depth--; 1482 if (depth > 0) 1483 popFront(); 1484 } 1485 else if (t.type == tok!"{") 1486 { 1487 depth++; 1488 popFront(); 1489 } 1490 else 1491 popFront(); 1492 } 1493 IdType type = tok!"stringLiteral"; 1494 auto b = lexStringSuffix(type); 1495 if (b != 0) 1496 app.put(b); 1497 token = Token(type, cache.intern(cast(const(ubyte)[]) app.data), line, 1498 column, index); 1499 } 1500 1501 void lexHexString(ref Token token) 1502 { 1503 mixin (tokenStart); 1504 range.index += 2; 1505 range.column += 2; 1506 1507 loop: while (true) 1508 { 1509 if (range.index >= range.bytes.length) 1510 { 1511 error("Error: 
unterminated hex string literal"); 1512 token = Token(tok!""); 1513 return; 1514 } 1515 else if (isWhitespace()) 1516 popFrontWhitespaceAware(); 1517 else switch (range.bytes[range.index]) 1518 { 1519 case '0': .. case '9': 1520 case 'A': .. case 'F': 1521 case 'a': .. case 'f': 1522 range.popFront(); 1523 break; 1524 case '"': 1525 range.popFront(); 1526 break loop; 1527 default: 1528 error("Error: invalid character in hex string"); 1529 token = Token(tok!""); 1530 return; 1531 } 1532 } 1533 1534 IdType type = tok!"stringLiteral"; 1535 lexStringSuffix(type); 1536 token = Token(type, cache.intern(range.slice(mark)), line, column, 1537 index); 1538 } 1539 1540 bool lexNamedEntity() 1541 in { assert (range.bytes[range.index] == '&'); } 1542 do 1543 { 1544 Token t; 1545 range.popFront(); 1546 lexIdentifier(t, true); 1547 if (t.type != tok!"identifier" || range.empty || range.bytes[range.index] != ';') 1548 { 1549 error("Error: invalid named character entity"); 1550 return false; 1551 } 1552 range.popFront(); 1553 return true; 1554 } 1555 1556 bool lexEscapeSequence() 1557 { 1558 range.popFront(); 1559 if (range.index >= range.bytes.length) 1560 { 1561 error("Error: non-terminated character escape sequence."); 1562 return false; 1563 } 1564 switch (range.bytes[range.index]) 1565 { 1566 case '&': return lexNamedEntity(); 1567 case '\'': 1568 case '"': 1569 case '?': 1570 case '\\': 1571 case 'a': 1572 case 'b': 1573 case 'f': 1574 case 'n': 1575 case 'r': 1576 case 't': 1577 case 'v': 1578 range.popFront(); 1579 break; 1580 case 'x': 1581 range.popFront(); 1582 foreach (i; 0 .. 2) 1583 { 1584 if (range.index >= range.bytes.length) 1585 { 1586 error("Error: 2 hex digits expected."); 1587 return false; 1588 } 1589 switch (range.bytes[range.index]) 1590 { 1591 case '0': .. case '9': 1592 case 'a': .. case 'f': 1593 case 'A': .. 
case 'F': 1594 range.popFront(); 1595 break; 1596 default: 1597 error("Error: 2 hex digits expected."); 1598 return false; 1599 } 1600 } 1601 break; 1602 case '0': 1603 if (!(range.index + 1 < range.bytes.length) 1604 || ((range.index + 1 < range.bytes.length) && range.peekAt(1) == '\'')) 1605 { 1606 range.popFront(); 1607 break; 1608 } 1609 goto case; 1610 case '1': .. case '7': 1611 for (size_t i = 0; i < 3 && !(range.index >= range.bytes.length) 1612 && range.bytes[range.index] >= '0' && range.bytes[range.index] <= '7'; i++) 1613 range.popFront(); 1614 break; 1615 case 'u': 1616 range.popFront(); 1617 foreach (i; 0 .. 4) 1618 { 1619 if (range.index >= range.bytes.length) 1620 { 1621 error("Error: at least 4 hex digits expected."); 1622 return false; 1623 } 1624 switch (range.bytes[range.index]) 1625 { 1626 case '0': .. case '9': 1627 case 'a': .. case 'f': 1628 case 'A': .. case 'F': 1629 range.popFront(); 1630 break; 1631 default: 1632 error("Error: at least 4 hex digits expected."); 1633 return false; 1634 } 1635 } 1636 break; 1637 case 'U': 1638 range.popFront(); 1639 foreach (i; 0 .. 8) 1640 { 1641 if (range.index >= range.bytes.length) 1642 { 1643 error("Error: at least 8 hex digits expected."); 1644 return false; 1645 } 1646 switch (range.bytes[range.index]) 1647 { 1648 case '0': .. case '9': 1649 case 'a': .. case 'f': 1650 case 'A': .. 
case 'F': 1651 range.popFront(); 1652 break; 1653 default: 1654 error("Error: at least 8 hex digits expected."); 1655 return false; 1656 } 1657 } 1658 break; 1659 default: 1660 error("Invalid escape sequence"); 1661 while (true) 1662 { 1663 if (range.index >= range.bytes.length) 1664 { 1665 error("Error: non-terminated character escape sequence."); 1666 break; 1667 } 1668 if (range.bytes[range.index] == ';') 1669 { 1670 range.popFront(); 1671 break; 1672 } 1673 else 1674 { 1675 range.popFront(); 1676 } 1677 } 1678 return false; 1679 } 1680 return true; 1681 } 1682 1683 void lexCharacterLiteral(ref Token token) 1684 { 1685 mixin (tokenStart); 1686 range.popFront(); 1687 if (range.empty) 1688 goto err; 1689 if (range.bytes[range.index] == '\\') 1690 lexEscapeSequence(); 1691 else if (range.bytes[range.index] == '\'') 1692 { 1693 range.popFront(); 1694 token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)), 1695 line, column, index); 1696 } 1697 else if (range.bytes[range.index] & 0x80) 1698 { 1699 while (range.bytes[range.index] & 0x80) 1700 range.popFront(); 1701 } 1702 else 1703 popFrontWhitespaceAware(); 1704 1705 if (range.index < range.bytes.length && range.bytes[range.index] == '\'') 1706 { 1707 range.popFront(); 1708 token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)), 1709 line, column, index); 1710 } 1711 else 1712 { 1713 err: 1714 error("Error: Expected `'` to end character literal"); 1715 token = Token(tok!""); 1716 } 1717 } 1718 1719 void lexIdentifier(ref Token token, const bool silent = false) @trusted 1720 { 1721 mixin (tokenStart); 1722 1723 if (isSeparating(0)) 1724 { 1725 if (silent) return; 1726 1727 error("Invalid identifier"); 1728 range.popFront(); 1729 } 1730 while (true) 1731 { 1732 version (iasm64NotWindows) 1733 { 1734 if (haveSSE42 && range.index + 16 < range.bytes.length) 1735 { 1736 immutable ulong i = rangeMatch!(false, 'a', 'z', 'A', 'Z', '_', '_') 1737 (range.bytes.ptr + range.index); 1738 range.column 
+= i; 1739 range.index += i; 1740 } 1741 } 1742 if (isSeparating(0)) 1743 break; 1744 else 1745 range.popFront(); 1746 } 1747 token = Token(tok!"identifier", cache.intern(range.slice(mark)), line, 1748 column, index); 1749 } 1750 1751 void lexDot(ref Token token) 1752 { 1753 mixin (tokenStart); 1754 if (!(range.index + 1 < range.bytes.length)) 1755 { 1756 range.popFront(); 1757 token = Token(tok!".", null, line, column, index); 1758 return; 1759 } 1760 switch (range.peekAt(1)) 1761 { 1762 case '0': .. case '9': 1763 lexNumber(token); 1764 return; 1765 case '.': 1766 range.popFront(); 1767 range.popFront(); 1768 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '.') 1769 { 1770 range.popFront(); 1771 token = Token(tok!"...", null, line, column, index); 1772 } 1773 else 1774 token = Token(tok!"..", null, line, column, index); 1775 return; 1776 default: 1777 range.popFront(); 1778 token = Token(tok!".", null, line, column, index); 1779 return; 1780 } 1781 } 1782 1783 void lexLongNewline(ref Token token) @nogc 1784 { 1785 mixin (tokenStart); 1786 range.popFront(); 1787 range.popFront(); 1788 range.popFront(); 1789 range.incrementLine(); 1790 string text = config.whitespaceBehavior == WhitespaceBehavior.include 1791 ? 
cache.intern(range.slice(mark)) : ""; 1792 token = Token(tok!"whitespace", text, line, 1793 column, index); 1794 } 1795 1796 bool isNewline() @nogc 1797 { 1798 if (range.bytes[range.index] == '\n') return true; 1799 if (range.bytes[range.index] == '\r') return true; 1800 return (range.bytes[range.index] & 0x80) && (range.index + 2 < range.bytes.length) 1801 && (range.peek(2) == "\u2028" || range.peek(2) == "\u2029"); 1802 } 1803 1804 bool isSeparating(size_t offset) @nogc 1805 { 1806 enum : ubyte 1807 { 1808 n, y, m // no, yes, maybe 1809 } 1810 1811 if (range.index + offset >= range.bytes.length) 1812 return true; 1813 auto c = range.bytes[range.index + offset]; 1814 static immutable ubyte[256] LOOKUP_TABLE = [ 1815 y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, 1816 y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, 1817 y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, 1818 n, n, n, n, n, n, n, n, n, n, y, y, y, y, y, y, 1819 y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, 1820 n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, n, 1821 y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, 1822 n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, y, 1823 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1824 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1825 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1826 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1827 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1828 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1829 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1830 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m 1831 ]; 1832 immutable ubyte result = LOOKUP_TABLE[c]; 1833 if (result == n) 1834 return false; 1835 if (result == y) 1836 return true; 1837 if (result == m) 1838 { 1839 auto r = range; 1840 range.popFrontN(offset); 1841 return (r.canPeek(2) && (r.peek(2) == "\u2028" 1842 || r.peek(2) == "\u2029")); 1843 } 1844 assert (false); 1845 } 1846 1847 1848 1849 enum tokenStart = q{ 1850 size_t index = range.index; 1851 size_t column = 
range.column; 1852 size_t line = range.line; 1853 auto mark = range.mark(); 1854 }; 1855 1856 void error(string message) 1857 { 1858 _messages ~= Message(range.line, range.column, message, true); 1859 } 1860 1861 void warning(string message) 1862 { 1863 _messages ~= Message(range.line, range.column, message, false); 1864 assert (_messages.length > 0); 1865 } 1866 1867 Message[] _messages; 1868 StringCache* cache; 1869 LexerConfig config; 1870 bool haveSSE42; 1871 } 1872 1873 /** 1874 * Creates a token range from the given source code. Creates a default lexer 1875 * configuration and a GC-managed string cache. 1876 */ 1877 public auto byToken(R)(R range) 1878 if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R) 1879 { 1880 LexerConfig config; 1881 StringCache* cache = new StringCache(range.length.optimalBucketCount); 1882 return DLexer(range, config, cache); 1883 } 1884 1885 /** 1886 * Creates a token range from the given source code. Uses the given string 1887 * cache. 1888 */ 1889 public auto byToken(R)(R range, StringCache* cache) 1890 if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R) 1891 { 1892 LexerConfig config; 1893 return DLexer(range, config, cache); 1894 } 1895 1896 /** 1897 * Creates a token range from the given source code. Uses the provided lexer 1898 * configuration and string cache. 1899 */ 1900 public auto byToken(R)(R range, const LexerConfig config, StringCache* cache) 1901 if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R) 1902 { 1903 return DLexer(range, config, cache); 1904 } 1905 1906 /** 1907 * Helper function used to avoid too much allocations while lexing. 1908 * 1909 * Params: 1910 * size = The length in bytes of the source file. 1911 * 1912 * Returns: 1913 * The optimal initial bucket count a `StringCache` should have. 
1914 */ 1915 size_t optimalBucketCount(size_t size) 1916 { 1917 import std.math : nextPow2; 1918 return nextPow2((size + 31U) / 32U).min(1U << 30U); 1919 } 1920 /// 1921 unittest 1922 { 1923 assert(optimalBucketCount(1) == 2); 1924 assert(optimalBucketCount(9000 * 32) == 16384); 1925 static if (size_t.sizeof == ulong.sizeof) 1926 assert(optimalBucketCount(100_000_000_000UL) == 1 << 30); 1927 } 1928 1929 /** 1930 * The string cache is used for string interning. 1931 * 1932 * It will only store a single copy of any string that it is asked to hold. 1933 * Interned strings can be compared for equality by comparing their $(B .ptr) 1934 * field. 1935 * 1936 * Default and postbilt constructors are disabled. When a StringCache goes out 1937 * of scope, the memory held by it is freed. 1938 * 1939 * See_also: $(LINK http://en.wikipedia.org/wiki/String_interning) 1940 */ 1941 struct StringCache 1942 { 1943 public pure nothrow @nogc: 1944 1945 @disable this(); 1946 @disable this(this); 1947 1948 /** 1949 * Params: bucketCount = the initial number of buckets. Must be a 1950 * power of two 1951 */ 1952 this(size_t bucketCount) nothrow @trusted @nogc 1953 in 1954 { 1955 import core.bitop : popcnt; 1956 static if (size_t.sizeof == 8) 1957 { 1958 immutable low = popcnt(cast(uint) bucketCount); 1959 immutable high = popcnt(cast(uint) (bucketCount >> 32)); 1960 assert ((low == 0 && high == 1) || (low == 1 && high == 0)); 1961 } 1962 else 1963 { 1964 static assert (size_t.sizeof == 4); 1965 assert (popcnt(cast(uint) bucketCount) == 1); 1966 } 1967 } 1968 do 1969 { 1970 buckets = (cast(Node**) calloc((Node*).sizeof, bucketCount))[0 .. 
bucketCount]; 1971 } 1972 1973 ~this() 1974 { 1975 Block* current = rootBlock; 1976 while (current !is null) 1977 { 1978 Block* prev = current; 1979 current = current.next; 1980 free(cast(void*) prev); 1981 } 1982 foreach (nodePointer; buckets) 1983 { 1984 Node* currentNode = nodePointer; 1985 while (currentNode !is null) 1986 { 1987 if (currentNode.mallocated) 1988 free(currentNode.str.ptr); 1989 Node* prev = currentNode; 1990 currentNode = currentNode.next; 1991 free(prev); 1992 } 1993 } 1994 rootBlock = null; 1995 free(buckets.ptr); 1996 buckets = null; 1997 } 1998 1999 /** 2000 * Caches a string. 2001 */ 2002 string intern(const(ubyte)[] str) @safe 2003 { 2004 if (str is null || str.length == 0) 2005 return ""; 2006 return _intern(str); 2007 } 2008 2009 /** 2010 * ditto 2011 */ 2012 string intern(string str) @trusted 2013 { 2014 return intern(cast(ubyte[]) str); 2015 } 2016 2017 /** 2018 * The default bucket count for the string cache. 2019 */ 2020 static enum defaultBucketCount = 4096; 2021 2022 private: 2023 2024 string _intern(const(ubyte)[] bytes) @trusted 2025 { 2026 immutable uint hash = hashBytes(bytes); 2027 immutable size_t index = hash & (buckets.length - 1); 2028 Node* s = find(bytes, hash); 2029 if (s !is null) 2030 return cast(string) s.str; 2031 ubyte[] mem = void; 2032 bool mallocated = bytes.length > BIG_STRING; 2033 if (mallocated) 2034 mem = (cast(ubyte*) malloc(bytes.length))[0 .. 
bytes.length]; 2035 else 2036 mem = allocate(bytes.length); 2037 mem[] = bytes[]; 2038 Node* node = cast(Node*) malloc(Node.sizeof); 2039 node.str = mem; 2040 node.hash = hash; 2041 node.next = buckets[index]; 2042 node.mallocated = mallocated; 2043 buckets[index] = node; 2044 return cast(string) mem; 2045 } 2046 2047 Node* find(const(ubyte)[] bytes, uint hash) @trusted 2048 { 2049 import std.algorithm : equal; 2050 immutable size_t index = hash & (buckets.length - 1); 2051 Node* node = buckets[index]; 2052 while (node !is null) 2053 { 2054 if (node.hash == hash && bytes == cast(ubyte[]) node.str) 2055 return node; 2056 node = node.next; 2057 } 2058 return node; 2059 } 2060 2061 static uint hashBytes(const(ubyte)[] data) pure nothrow @trusted @nogc 2062 in 2063 { 2064 assert (data !is null); 2065 assert (data.length > 0); 2066 } 2067 do 2068 { 2069 immutable uint m = 0x5bd1e995; 2070 immutable int r = 24; 2071 uint h = cast(uint) data.length; 2072 while (data.length >= 4) 2073 { 2074 uint k = (cast(ubyte) data[3]) << 24 2075 | (cast(ubyte) data[2]) << 16 2076 | (cast(ubyte) data[1]) << 8 2077 | (cast(ubyte) data[0]); 2078 k *= m; 2079 k ^= k >> r; 2080 k *= m; 2081 h *= m; 2082 h ^= k; 2083 data = data[4 .. 
$]; 2084 } 2085 switch (data.length & 3) 2086 { 2087 case 3: 2088 h ^= data[2] << 16; 2089 goto case; 2090 case 2: 2091 h ^= data[1] << 8; 2092 goto case; 2093 case 1: 2094 h ^= data[0]; 2095 h *= m; 2096 break; 2097 default: 2098 break; 2099 } 2100 h ^= h >> 13; 2101 h *= m; 2102 h ^= h >> 15; 2103 return h; 2104 } 2105 2106 ubyte[] allocate(size_t numBytes) pure nothrow @trusted @nogc 2107 in 2108 { 2109 assert (numBytes != 0); 2110 } 2111 out (result) 2112 { 2113 assert (result.length == numBytes); 2114 } 2115 do 2116 { 2117 Block* r = rootBlock; 2118 size_t i = 0; 2119 while (i <= 3 && r !is null) 2120 { 2121 immutable size_t available = r.bytes.length; 2122 immutable size_t oldUsed = r.used; 2123 immutable size_t newUsed = oldUsed + numBytes; 2124 if (newUsed <= available) 2125 { 2126 r.used = newUsed; 2127 return r.bytes[oldUsed .. newUsed]; 2128 } 2129 i++; 2130 r = r.next; 2131 } 2132 Block* b = cast(Block*) calloc(Block.sizeof, 1); 2133 b.used = numBytes; 2134 b.next = rootBlock; 2135 rootBlock = b; 2136 return b.bytes[0 .. numBytes]; 2137 } 2138 2139 static struct Node 2140 { 2141 ubyte[] str = void; 2142 Node* next = void; 2143 uint hash = void; 2144 bool mallocated = void; 2145 } 2146 2147 static struct Block 2148 { 2149 Block* next; 2150 size_t used; 2151 enum BLOCK_CAPACITY = BLOCK_SIZE - size_t.sizeof - (void*).sizeof; 2152 ubyte[BLOCK_CAPACITY] bytes; 2153 } 2154 2155 static assert (BLOCK_SIZE == Block.sizeof); 2156 2157 enum BLOCK_SIZE = 1024 * 16; 2158 2159 // If a string would take up more than 1/4 of a block, allocate it outside 2160 // of the block. 
2161 enum BIG_STRING = BLOCK_SIZE / 4; 2162 2163 Node*[] buckets; 2164 Block* rootBlock; 2165 } 2166 2167 private extern(C) void* calloc(size_t, size_t) nothrow pure @nogc @trusted; 2168 private extern(C) void* malloc(size_t) nothrow pure @nogc @trusted; 2169 private extern(C) void free(void*) nothrow pure @nogc @trusted; 2170 2171 unittest 2172 { 2173 auto source = cast(ubyte[]) q{ import std.stdio;}}; 2174 auto tokens = getTokensForParser(source, LexerConfig(), 2175 new StringCache(StringCache.defaultBucketCount)); 2176 assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".", 2177 tok!"identifier", tok!";"])); 2178 } 2179 2180 /// Test \x char sequence 2181 unittest 2182 { 2183 auto toks = (string s) => byToken(cast(ubyte[])s); 2184 2185 // valid 2186 immutable hex = ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f','A','B','C','D','E','F']; 2187 auto source = ""; 2188 foreach (h1; hex) 2189 foreach (h2; hex) 2190 source ~= "'\\x" ~ h1 ~ h2 ~ "'"; 2191 assert (toks(source).filter!(t => t.type != tok!"characterLiteral").empty); 2192 2193 // invalid 2194 assert (toks(`'\x'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true)); 2195 assert (toks(`'\x_'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true)); 2196 assert (toks(`'\xA'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true)); 2197 assert (toks(`'\xAY'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true)); 2198 assert (toks(`'\xXX'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true)); 2199 } 2200 2201 version (iasm64NotWindows) 2202 { 2203 /** 2204 * Skips between 0 and 16 bytes that match (or do not match) one of the 2205 * given $(B chars). 
2206 */ 2207 void skip(bool matching, chars...)(const ubyte*, ulong*, ulong*) pure nothrow 2208 @trusted @nogc if (chars.length <= 8) 2209 { 2210 enum constant = ByteCombine!chars; 2211 enum charsLength = chars.length; 2212 static if (matching) 2213 enum flags = 0b0001_0000; 2214 else 2215 enum flags = 0b0000_0000; 2216 asm pure nothrow @nogc 2217 { 2218 naked; 2219 movdqu XMM1, [RDX]; 2220 mov R10, constant; 2221 movq XMM2, R10; 2222 mov RAX, charsLength; 2223 mov RDX, 16; 2224 pcmpestri XMM2, XMM1, flags; 2225 add [RSI], RCX; 2226 add [RDI], RCX; 2227 ret; 2228 } 2229 } 2230 2231 /** 2232 * Returns: the number of bytes starting at the given location that match 2233 * (or do not match if $(B invert) is true) the byte ranges in $(B chars). 2234 */ 2235 ulong rangeMatch(bool invert, chars...)(const ubyte*) pure nothrow @trusted @nogc 2236 { 2237 static assert (chars.length % 2 == 0); 2238 enum constant = ByteCombine!chars; 2239 static if (invert) 2240 enum rangeMatchFlags = 0b0000_0100; 2241 else 2242 enum rangeMatchFlags = 0b0001_0100; 2243 enum charsLength = chars.length; 2244 asm pure nothrow @nogc 2245 { 2246 naked; 2247 movdqu XMM1, [RDI]; 2248 mov R10, constant; 2249 movq XMM2, R10; 2250 mov RAX, charsLength; 2251 mov RDX, 16; 2252 pcmpestri XMM2, XMM1, rangeMatchFlags; 2253 mov RAX, RCX; 2254 ret; 2255 } 2256 } 2257 2258 template ByteCombine(c...) 
2259 { 2260 static assert (c.length <= 8); 2261 static if (c.length > 1) 2262 enum ulong ByteCombine = c[0] | (ByteCombine!(c[1..$]) << 8); 2263 else 2264 enum ulong ByteCombine = c[0]; 2265 } 2266 } 2267 2268 unittest 2269 { 2270 import core.exception : RangeError; 2271 import std.exception : assertNotThrown; 2272 2273 static immutable src1 = "/++"; 2274 static immutable src2 = "/**"; 2275 2276 LexerConfig cf; 2277 StringCache ca = StringCache(16); 2278 2279 assertNotThrown!RangeError(getTokensForParser(src1, cf, &ca)); 2280 assertNotThrown!RangeError(getTokensForParser(src2, cf, &ca)); 2281 } 2282 2283 unittest 2284 { 2285 static immutable src = `"\eeee"`; 2286 2287 LexerConfig cf; 2288 StringCache ca = StringCache(16); 2289 2290 auto l = DLexer(src, cf, &ca); 2291 assert(l.front().type == tok!""); 2292 assert(!l.messages.empty); 2293 } 2294 2295 unittest 2296 { 2297 alias Msg = DLexer.Message; 2298 LexerConfig cf; 2299 StringCache ca = StringCache(16); 2300 2301 { 2302 auto l = DLexer(`"\©"`, cf, &ca); 2303 assert(l.front().type == tok!"stringLiteral"); 2304 assert(l.messages == []); 2305 } 2306 { 2307 auto l = DLexer(`"\™\⌝"`, cf, &ca); 2308 assert(l.front().type == tok!"stringLiteral"); 2309 assert(l.messages == []); 2310 } 2311 { 2312 auto l = DLexer(`"\&trade"`, cf, &ca); 2313 assert(l.front().type == tok!""); 2314 assert(l.messages == [ Msg(1, 9, "Error: invalid named character entity", true) ]); 2315 } 2316 { 2317 auto l = DLexer(`"\™\&urcorn"`, cf, &ca); 2318 assert(l.front().type == tok!""); 2319 assert(l.messages == [ Msg(1, 18, "Error: invalid named character entity", true) ]); 2320 } 2321 { 2322 auto l = DLexer(`"\&"`, cf, &ca); 2323 assert(l.front().type == tok!""); 2324 assert(l.messages == [ Msg(1, 4, "Error: invalid named character entity", true) ]); 2325 } 2326 { 2327 auto l = DLexer(`"\&0"`, cf, &ca); 2328 assert(l.front().type == tok!""); 2329 assert(l.messages == [ Msg(1, 5, "Error: invalid named character entity", true) ]); 2330 } 2331 { 2332 
auto l = DLexer(`"\©`, cf, &ca); 2333 assert(l.front().type == tok!""); 2334 assert(l.messages == [ Msg(1, 8, "Error: invalid named character entity", true) ]); 2335 } 2336 { 2337 auto l = DLexer(`"\©`, cf, &ca); 2338 assert(l.front().type == tok!""); 2339 assert(l.messages == [ Msg(1, 9, "Error: unterminated string literal", true) ]); 2340 } 2341 } 2342 2343 // legacy code using compatibility comment and trailingComment 2344 unittest 2345 { 2346 import std.conv : to; 2347 import std.exception : enforce; 2348 2349 static immutable src = `/// this is a module. 2350 // mixed 2351 /// it can do stuff 2352 module foo.bar; 2353 2354 // hello 2355 2356 /** 2357 * some doc 2358 * hello 2359 */ 2360 int x; /// very nice 2361 2362 // TODO: do stuff 2363 void main() { 2364 #line 40 2365 /// could be better 2366 writeln(":)"); 2367 } 2368 2369 /// end of file`; 2370 2371 LexerConfig cf; 2372 StringCache ca = StringCache(16); 2373 2374 const tokens = getTokensForParser(src, cf, &ca); 2375 2376 void assertEquals(T)(T a, T b, string what, string file = __FILE__, size_t line = __LINE__) 2377 { 2378 enforce(a == b, "Failed " ~ what ~ " '" ~ a.to!string ~ "' == '" ~ b.to!string ~ "'", file, line); 2379 } 2380 2381 void test(size_t index, IdType type, string comment, string trailingComment, 2382 string file = __FILE__, size_t line = __LINE__) 2383 { 2384 assertEquals(tokens[index].type, type, "type", file, line); 2385 assertEquals(tokens[index].comment, comment, "comment", file, line); 2386 assertEquals(tokens[index].trailingComment, trailingComment, "trailingComment", file, line); 2387 } 2388 2389 test(0, tok!"module", "this is a module.\nit can do stuff", ""); 2390 test(1, tok!"identifier", "", ""); 2391 test(2, tok!".", "", ""); 2392 test(3, tok!"identifier", "", ""); 2393 test(4, tok!";", "", ""); 2394 test(5, tok!"int", "some doc\nhello", ""); 2395 test(6, tok!"identifier", "", ""); 2396 test(7, tok!";", "", "very nice"); 2397 test(8, tok!"void", "", ""); 2398 test(9, 
tok!"identifier", "", ""); 2399 test(10, tok!"(", "", ""); 2400 test(11, tok!")", "", ""); 2401 test(12, tok!"{", "", ""); 2402 test(13, tok!"identifier", "could be better", ""); 2403 test(14, tok!"(", "", ""); 2404 test(15, tok!"stringLiteral", "", ""); 2405 test(16, tok!")", "", ""); 2406 test(17, tok!";", "", ""); 2407 test(18, tok!"}", "", ""); 2408 } 2409 2410 // dlang-community/D-Scanner#805 2411 unittest 2412 { 2413 final class SomeExpr 2414 { 2415 Token tok; 2416 } 2417 2418 auto e1 = new SomeExpr(); 2419 const e2 = new SomeExpr(); 2420 immutable e3 = new immutable SomeExpr(); 2421 2422 immutable t1 = e1.tok; 2423 immutable t2 = e2.tok; 2424 immutable t3 = e3.tok; 2425 }