module dparse.lexer;

import std.typecons;
import std.typetuple;
import std.array;
import std.algorithm;
import std.range;
import std.experimental.lexer;
import std.traits;
import core.cpuid : sse42;

public import dparse.trivia;

/// Operators
private enum operators = [
    ",", ".", "..", "...", "/", "/=", "!", "!<", "!<=", "!<>", "!<>=", "!=",
    "!>", "!>=", "$", "%", "%=", "&", "&&", "&=", "(", ")", "*", "*=", "+", "++",
    "+=", "-", "--", "-=", ":", ";", "<", "<<", "<<=", "<=", "<>", "<>=", "=",
    "==", "=>", ">", ">=", ">>", ">>=", ">>>", ">>>=", "?", "@", "[", "]", "^",
    "^=", "^^", "^^=", "{", "|", "|=", "||", "}", "~", "~="
];

/// Keywords
private enum keywords = [
    "abstract", "alias", "align", "asm", "assert", "auto", "bool",
    "break", "byte", "case", "cast", "catch", "cdouble", "cent", "cfloat",
    "char", "class", "const", "continue", "creal", "dchar", "debug", "default",
    "delegate", "delete", "deprecated", "do", "double", "else", "enum",
    "export", "extern", "false", "final", "finally", "float", "for", "foreach",
    "foreach_reverse", "function", "goto", "idouble", "if", "ifloat",
    "immutable", "import", "in", "inout", "int", "interface", "invariant",
    "ireal", "is", "lazy", "long", "macro", "mixin", "module", "new", "nothrow",
    "null", "out", "override", "package", "pragma", "private", "protected",
    "public", "pure", "real", "ref", "return", "scope", "shared", "short",
    "static", "struct", "super", "switch", "synchronized", "template", "this",
    "throw", "true", "try", "typedef", "typeid", "typeof", "ubyte", "ucent",
    "uint", "ulong", "union", "unittest", "ushort", "version", "void",
    "wchar", "while", "with", "__DATE__", "__EOF__", "__FILE__",
    "__FILE_FULL_PATH__", "__FUNCTION__", "__gshared", "__LINE__", "__MODULE__",
    "__parameters", "__PRETTY_FUNCTION__", "__TIME__", "__TIMESTAMP__", "__traits",
    "__vector", "__VENDOR__", "__VERSION__"
];

/// Other tokens
private enum dynamicTokens = [
    "specialTokenSequence", "comment", "identifier", "scriptLine",
    "whitespace", "doubleLiteral", "floatLiteral", "idoubleLiteral",
    "ifloatLiteral", "intLiteral", "longLiteral", "realLiteral",
    "irealLiteral", "uintLiteral", "ulongLiteral", "characterLiteral",
    "dstringLiteral", "stringLiteral", "wstringLiteral"
];

// Pairs of (source prefix, handler function name). When the lexer sees a
// prefix it dispatches to the named member function of DLexer below.
private enum pseudoTokenHandlers = [
    "\"", "lexStringLiteral",
    "`", "lexWysiwygString",
    "//", "lexSlashSlashComment",
    "/*", "lexSlashStarComment",
    "/+", "lexSlashPlusComment",
    ".", "lexDot",
    "'", "lexCharacterLiteral",
    "0", "lexNumber",
    "1", "lexDecimal",
    "2", "lexDecimal",
    "3", "lexDecimal",
    "4", "lexDecimal",
    "5", "lexDecimal",
    "6", "lexDecimal",
    "7", "lexDecimal",
    "8", "lexDecimal",
    "9", "lexDecimal",
    "q\"", "lexDelimitedString",
    "q{", "lexTokenString",
    "r\"", "lexWysiwygString",
    "x\"", "lexHexString",
    " ", "lexWhitespace",
    "\t", "lexWhitespace",
    "\r", "lexWhitespace",
    "\n", "lexWhitespace",
    "\v", "lexWhitespace",
    "\f", "lexWhitespace",
    "\u2028", "lexLongNewline",
    "\u2029", "lexLongNewline",
    "#!", "lexScriptLine",
    "#line", "lexSpecialTokenSequence"
];

/// Token ID type for the D lexer.
public alias IdType = TokenIdType!(operators, dynamicTokens, keywords);

/**
 * Function used for converting an IdType to a string.
 *
 * Examples:
 * ---
 * IdType c = tok!"case";
 * assert (str(c) == "case");
 * ---
 */
public alias str = tokenStringRepresentation!(IdType, operators, dynamicTokens, keywords);

/**
 * Template used to refer to D token types.
 *
 * See the $(B operators), $(B keywords), and $(B dynamicTokens) enums for
 * values that can be passed to this template.
 * Example:
 * ---
 * import dparse.lexer;
 * IdType t = tok!"floatLiteral";
 * ---
 */
public template tok(string token)
{
    alias tok = TokenId!(IdType, operators, dynamicTokens, keywords, token);
}

// Extra fields mixed into every Token (see `extraFields` below) carrying
// attached trivia and comment helpers.
mixin template TokenTriviaFields()
{
    /**
     * Whitespace and comment tokens attached to this token.
     *
     * All trivia tokens must have the text property set to the text with
     * which they identify with. This means you can map all trivia tokens to
     * their .text property and join them together to get the source code back
     * without any loss of information.
     *
     * Trivia is only included when calling getTokensForParser. When iterating
     * over DLexer all tokens will be in their raw form and none will be
     * converted to trivia.
     *
     * Note: in the future you might need to explicitly pass
     * WhitespaceBehavior.include (or keep the default) as getTokensForParser
     * currently overrides it to include.
     *
     * Contains: `comment`, `whitespace`, `specialTokenSequence`
     */
    immutable(typeof(this))[] leadingTrivia;
    /// ditto
    immutable(typeof(this))[] trailingTrivia;

    // Lazily-filled caches for the comment/trailingComment properties below.
    string memoizedLeadingComment = null;
    string memoizedTrailingComment = null;

    /// Legacy property to get documentation comments, with comment border
    /// stripped off, which is attached to this token.
    string comment() const pure nothrow @safe @property {
        import dparse.trivia : extractLeadingDdoc;
        if (memoizedLeadingComment !is null)
            return memoizedLeadingComment;
        // Cache the extracted ddoc; the cast drops const so the memo field
        // can be written from this const property.
        return (cast()memoizedLeadingComment) = this.extractLeadingDdoc;
    }

    /// ditto
    string trailingComment() const pure nothrow @safe @property {
        import dparse.trivia : extractTrailingDdoc;
        if (memoizedTrailingComment !is null)
            return memoizedTrailingComment;
        return (cast()memoizedTrailingComment) = this.extractTrailingDdoc;
    }

    /// Orders tokens by their byte index in the source.
    int opCmp(size_t i) const pure nothrow @safe @nogc {
        if (index < i) return -1;
        if (index > i) return 1;
        return 0;
    }

    /// ditto
    int opCmp(ref const typeof(this) other) const pure nothrow @safe @nogc {
        return opCmp(other.index);
    }
}

// mixin in from dparse.lexer to make error messages more managable size as the
// entire string is dumped when there is a type mismatch.
private enum extraFields = "import dparse.lexer:TokenTriviaFields; mixin TokenTriviaFields;";

/// The token type in the D lexer
public alias Token = std.experimental.lexer.TokenStructure!(IdType, extraFields);

/**
 * Configure whitespace handling
 */
public enum WhitespaceBehavior : ubyte
{
    include = 0b0000_0000,
    skip = 0b0000_0001,
}

private enum stringBehaviorNotWorking = "Automatic string parsing is not "
    ~ "supported and was previously not working. To unescape strings use the "
    ~ "`dparse.strings : unescapeString` function on the token texts instead.";

/**
 * Configure string lexing behavior
 */
// was enum, but struct now for deprecations and support with old compilers
public struct StringBehavior
{
    /// Do not include quote characters, process escape sequences
    deprecated(stringBehaviorNotWorking) static immutable StringBehavior compiler = StringBehavior(0b0000_0000);
    /// Opening quotes, closing quotes, and string suffixes are included in
    /// the string token
    deprecated(stringBehaviorNotWorking) static immutable StringBehavior includeQuoteChars = StringBehavior(0b0000_0001);
    /// String escape sequences are not replaced
    deprecated(stringBehaviorNotWorking) static immutable StringBehavior notEscaped = StringBehavior(0b0000_0010);
    /// Not modified at all. Useful for formatters or highlighters
    static immutable StringBehavior source = StringBehavior(0b0000_0011);

    ubyte behavior;
    alias behavior this;
}

/// Whether comment token text is interned in the StringCache.
public enum CommentBehavior : bool
{
    intern = true,
    noIntern = false
}

/**
 * Lexer configuration struct
 */
public struct LexerConfig
{
    string fileName;
    StringBehavior stringBehavior;
    WhitespaceBehavior whitespaceBehavior;
    CommentBehavior commentBehavior = CommentBehavior.intern;
}

/**
 * Basic type token types.
 */
public alias BasicTypes = AliasSeq!(tok!"int", tok!"bool", tok!"byte",
    tok!"cdouble", tok!"cent", tok!"cfloat", tok!"char", tok!"creal",
    tok!"dchar", tok!"double", tok!"float", tok!"idouble",
    tok!"ifloat", tok!"ireal", tok!"long", tok!"real", tok!"short",
    tok!"ubyte", tok!"ucent", tok!"uint", tok!"ulong", tok!"ushort",
    tok!"void", tok!"wchar");

/**
 * Returns: true if the given ID is for a basic type.
240 */ 241 public bool isBasicType(IdType type) nothrow pure @safe @nogc 242 { 243 switch (type) 244 { 245 foreach (T; BasicTypes) 246 { 247 case T: 248 return true; 249 } 250 default: 251 return false; 252 } 253 } 254 255 /** 256 * Number literal token types. 257 */ 258 public alias NumberLiterals = AliasSeq!(tok!"doubleLiteral", 259 tok!"floatLiteral", tok!"idoubleLiteral", tok!"ifloatLiteral", 260 tok!"intLiteral", tok!"longLiteral", tok!"realLiteral", 261 tok!"irealLiteral", tok!"uintLiteral", tok!"ulongLiteral"); 262 263 /** 264 * Returns: true if the given ID type is for a number literal. 265 */ 266 public bool isNumberLiteral(IdType type) nothrow pure @safe @nogc 267 { 268 switch (type) 269 { 270 foreach (T; NumberLiterals) 271 { 272 case T: 273 return true; 274 } 275 default: 276 return false; 277 } 278 } 279 280 /** 281 * Number literal token types. 282 */ 283 public alias IntegerLiterals = AliasSeq!(tok!"intLiteral", tok!"longLiteral", 284 tok!"uintLiteral", tok!"ulongLiteral"); 285 286 /** 287 * Returns: true if the given ID type is for a integer literal. 288 */ 289 public bool isIntegerLiteral(IdType type) nothrow pure @safe @nogc 290 { 291 switch (type) 292 { 293 foreach (T; IntegerLiterals) 294 { 295 case T: 296 return true; 297 } 298 default: 299 return false; 300 } 301 } 302 303 /** 304 * Operator token types. 
305 */ 306 public alias Operators = AliasSeq!(tok!",", tok!".", tok!"..", tok!"...", 307 tok!"/", tok!"/=", tok!"!", tok!"!<", tok!"!<=", tok!"!<>", 308 tok!"!<>=", tok!"!=", tok!"!>", tok!"!>=", tok!"$", tok!"%", 309 tok!"%=", tok!"&", tok!"&&", tok!"&=", tok!"(", tok!")", 310 tok!"*", tok!"*=", tok!"+", tok!"++", tok!"+=", tok!"-", 311 tok!"--", tok!"-=", tok!":", tok!";", tok!"<", tok!"<<", 312 tok!"<<=", tok!"<=", tok!"<>", tok!"<>=", tok!"=", tok!"==", 313 tok!"=>", tok!">", tok!">=", tok!">>", tok!">>=", tok!">>>", 314 tok!">>>=", tok!"?", tok!"@", tok!"[", tok!"]", tok!"^", 315 tok!"^=", tok!"^^", tok!"^^=", tok!"{", tok!"|", tok!"|=", 316 tok!"||", tok!"}", tok!"~", tok!"~="); 317 318 /** 319 * Returns: true if the given ID type is for an operator. 320 */ 321 public bool isOperator(IdType type) nothrow pure @safe @nogc 322 { 323 switch (type) 324 { 325 foreach (T; Operators) 326 { 327 case T: 328 return true; 329 } 330 default: 331 return false; 332 } 333 } 334 335 /** 336 * Keyword token types. 
337 */ 338 public alias Keywords = AliasSeq!(tok!"abstract", tok!"alias", tok!"align", 339 tok!"asm", tok!"assert", tok!"auto", tok!"break", 340 tok!"case", tok!"cast", tok!"catch", tok!"class", tok!"const", 341 tok!"continue", tok!"debug", tok!"default", tok!"delegate", 342 tok!"delete", tok!"deprecated", tok!"do", tok!"else", tok!"enum", 343 tok!"export", tok!"extern", tok!"false", tok!"final", tok!"finally", 344 tok!"for", tok!"foreach", tok!"foreach_reverse", tok!"function", 345 tok!"goto", tok!"if", tok!"immutable", tok!"import", tok!"in", 346 tok!"inout", tok!"interface", tok!"invariant", tok!"is", 347 tok!"lazy", tok!"macro", tok!"mixin", tok!"module", tok!"new", 348 tok!"nothrow", tok!"null", tok!"out", tok!"override", tok!"package", 349 tok!"pragma", tok!"private", tok!"protected", tok!"public", 350 tok!"pure", tok!"ref", tok!"return", tok!"scope", tok!"shared", 351 tok!"static", tok!"struct", tok!"super", tok!"switch", tok!"synchronized", 352 tok!"template", tok!"this", tok!"throw", tok!"true", tok!"try", 353 tok!"typedef", tok!"typeid", tok!"typeof", tok!"union", tok!"unittest", 354 tok!"version", tok!"while", tok!"with", tok!"__DATE__", 355 tok!"__EOF__", tok!"__FILE__", tok!"__FILE_FULL_PATH__", tok!"__FUNCTION__", 356 tok!"__gshared", tok!"__LINE__", tok!"__MODULE__", tok!"__parameters", 357 tok!"__PRETTY_FUNCTION__", tok!"__TIME__", tok!"__TIMESTAMP__", 358 tok!"__traits", tok!"__vector", tok!"__VENDOR__", tok!"__VERSION__"); 359 360 /** 361 * Returns: true if the given ID type is for a keyword. 362 */ 363 public bool isKeyword(IdType type) pure nothrow @safe @nogc 364 { 365 switch (type) 366 { 367 foreach (T; Keywords) 368 { 369 case T: 370 return true; 371 } 372 default: 373 return false; 374 } 375 } 376 377 /** 378 * String literal token types 379 */ 380 public alias StringLiterals = AliasSeq!(tok!"dstringLiteral", 381 tok!"stringLiteral", tok!"wstringLiteral"); 382 383 /** 384 * Returns: true if the given ID type is for a string literal. 
385 */ 386 public bool isStringLiteral(IdType type) pure nothrow @safe @nogc 387 { 388 switch (type) 389 { 390 foreach (T; StringLiterals) 391 { 392 case T: 393 return true; 394 } 395 default: 396 return false; 397 } 398 } 399 400 /** 401 * Protection token types. 402 */ 403 public alias Protections = AliasSeq!(tok!"export", tok!"package", 404 tok!"private", tok!"public", tok!"protected"); 405 406 /** 407 * Returns: true if the given ID type is for a protection attribute. 408 */ 409 public bool isProtection(IdType type) pure nothrow @safe @nogc 410 { 411 switch (type) 412 { 413 foreach (T; Protections) 414 { 415 case T: 416 return true; 417 } 418 default: 419 return false; 420 } 421 } 422 423 public alias SpecialTokens = AliasSeq!(tok!"__DATE__", tok!"__TIME__", 424 tok!"__TIMESTAMP__", tok!"__VENDOR__", tok!"__VERSION__", tok!"__FILE__", 425 tok!"__FILE_FULL_PATH__", tok!"__LINE__", tok!"__MODULE__", 426 tok!"__FUNCTION__", tok!"__PRETTY_FUNCTION__"); 427 428 public bool isSpecialToken(IdType type) pure nothrow @safe @nogc 429 { 430 switch (type) 431 { 432 foreach (T; SpecialTokens) 433 { 434 case T: 435 return true; 436 } 437 default: 438 return false; 439 } 440 } 441 442 public alias Literals = AliasSeq!(StringLiterals, NumberLiterals, tok!"characterLiteral", 443 SpecialTokens, tok!"true", tok!"false", tok!"null", tok!"$"); 444 445 public bool isLiteral(IdType type) pure nothrow @safe @nogc 446 { 447 switch (type) 448 { 449 foreach (T; Literals) 450 { 451 case T: 452 return true; 453 } 454 default: 455 return false; 456 } 457 } 458 459 /** 460 * Returns: an array of tokens lexed from the given source code to the output 461 * range. All whitespace, comment and specialTokenSequence tokens (trivia) are 462 * attached to the token nearest to them. 
 *
 * Trivia is put on the last token as `trailingTrivia` if it is on the same
 * line as the trivia, otherwise it will be attached to the next token in the
 * `leadingTrivia` until there is the EOF, where it will be attached as
 * `trailingTrivia` again.
 */
const(Token)[] getTokensForParser(R)(R sourceCode, LexerConfig config, StringCache* cache)
if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
{
    // Trivia collection requires seeing every token, so the caller's
    // whitespace/comment settings are overridden here.
    config.whitespaceBehavior = WhitespaceBehavior.include;
    config.commentBehavior = CommentBehavior.noIntern;

    auto leadingTriviaAppender = appender!(Token[])();
    leadingTriviaAppender.reserve(128);
    auto trailingTriviaAppender = appender!(Token[])();
    trailingTriviaAppender.reserve(128);

    auto output = appender!(typeof(return))();
    auto lexer = DLexer(sourceCode, config, cache);
    loop: while (!lexer.empty) switch (lexer.front.type)
    {
    case tok!"specialTokenSequence":
    case tok!"whitespace":
    case tok!"comment":
        // Trivia on the same line as the previous token trails it;
        // otherwise it leads the next real token.
        if (!output.data.empty && lexer.front.line == output.data[$ - 1].line)
            trailingTriviaAppender.put(lexer.front);
        else
            leadingTriviaAppender.put(lexer.front);
        lexer.popFront();
        break;
    case tok!"__EOF__":
        break loop;
    default:
        Token t = lexer.front;
        lexer.popFront();

        // Flush accumulated trivia: trailing goes onto the previous token
        // (the cast drops the field's immutability so the already-stored
        // token can be patched in place), leading goes onto this token.
        if (!output.data.empty && !trailingTriviaAppender.data.empty)
            (cast() output.data[$ - 1].trailingTrivia) = trailingTriviaAppender.data.idup;
        t.leadingTrivia = leadingTriviaAppender.data.idup;
        leadingTriviaAppender.clear();
        trailingTriviaAppender.clear();

        output.put(t);
        break;
    }

    // Any trivia left at EOF becomes trailing trivia of the last token.
    if (!output.data.empty)
    {
        trailingTriviaAppender.put(leadingTriviaAppender.data);
        (cast() output.data[$ - 1].trailingTrivia) = trailingTriviaAppender.data.idup;
    }

    return output.data;
}

/**
 * The D lexer struct.
 */
public struct DLexer
{
    // Generates the core lexer machinery (token tables, dispatch to the
    // pseudo-token handlers declared above) from std.experimental.lexer.
    mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens,
        keywords, pseudoTokenHandlers);

    ///
    @disable this();

    /**
     * Params:
     *     range = the bytes that compose the source code that will be lexed.
     *     config = the lexer configuration to use.
     *     cache = the string interning cache for de-duplicating identifiers and
     *         other token text.
     *     haveSSE42 = Parse streaming SIMD Extensions 4.2 in inline assembly
     */
    this(R)(R range, const LexerConfig config, StringCache* cache,
        bool haveSSE42 = sse42()) pure nothrow @safe
    if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
    {
        this.haveSSE42 = haveSSE42;
        // Skip a UTF-8 byte order mark (EF BB BF) if present.
        auto r = (range.length >= 3 && range[0] == 0xef && range[1] == 0xbb && range[2] == 0xbf)
            ? range[3 .. $] : range;
        static if (is(ElementEncodingType!R == immutable))
            this.range = LexerRange(cast(const(ubyte)[]) r);
        else
            this.range = LexerRange(cast(const(ubyte)[]) r.idup); // defensive copy of mutable input
        this.config = config;
        this.cache = cache;
        popFront();
    }

    ///
    public void popFront()() pure nothrow @safe
    {
        // When configured to skip whitespace, keep advancing until a
        // non-whitespace token is at the front.
        do
            _popFront();
        while (config.whitespaceBehavior == WhitespaceBehavior.skip
            && _front.type == tok!"whitespace");
    }

    /**
     * Lexer error/warning message.
     */
    static struct Message
    {
        /// 1-based line number
        size_t line;
        /// 1-based byte offset
        size_t column;
        /// Text of the message
        string message;
        /// `true` for an error, `false` for a warning
        bool isError;
    }

    /**
     * Returns: An array of all of the warnings and errors generated so far
     * during lexing. It may make sense to only check this when `empty`
     * returns `true`.
     */
    const(Message[]) messages() const @property
    {
        return _messages;
    }

private pure nothrow @safe:

    /// Returns: true if the current byte starts whitespace — ASCII whitespace
    /// or the UTF-8 encoding of U+2028/U+2029 (0xe2 0x80 0xa8/0xa9).
    bool isWhitespace()
    {
        switch (range.bytes[range.index])
        {
        case ' ':
        case '\r':
        case '\n':
        case '\t':
        case '\v':
        case '\f':
            return true;
        case 0xe2:
            auto peek = range.peek(2);
            return peek.length == 2
                && peek[0] == 0x80
                && (peek[1] == 0xa8 || peek[1] == 0xa9);
        default:
            return false;
        }
    }

    /// Advances one character, bumping the line counter for any newline form
    /// (\r, \r\n, \n, or the three-byte U+2028/U+2029 sequences).
    void popFrontWhitespaceAware()
    {
        switch (range.bytes[range.index])
        {
        case '\r':
            range.popFront();
            if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n')
            {
                // \r\n counts as a single line ending.
                range.popFront();
                range.incrementLine();
            }
            else
                range.incrementLine();
            return;
        case '\n':
            range.popFront();
            range.incrementLine();
            return;
        case 0xe2:
            // NOTE(review): peek(3) is indexed from [1] here while peek(2) in
            // isWhitespace is indexed from [0] — presumably peek(n) includes
            // the current byte; verify against LexerRange.peek.
            auto lookahead = range.peek(3);
            if (lookahead.length == 3 && lookahead[1] == 0x80
                && (lookahead[2] == 0xa8 || lookahead[2] == 0xa9))
            {
                range.index+=3;
                range.column+=3;
                range.incrementLine();
                return;
            }
            else
            {
                range.popFront();
                return;
            }
        default:
            range.popFront();
            return;
        }
    }

    /// Lexes a run of whitespace into a single `whitespace` token. The token
    /// text is interned only when whitespace is configured to be included.
    void lexWhitespace(ref Token token) @trusted
    {
        mixin (tokenStart);
        loop: do
        {
            version (X86_64)
            {
                // Fast path: skip over plain blanks 16 bytes at a time.
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    skip!(true, '\t', ' ', '\v', '\f')(range.bytes.ptr + range.index,
                        &range.index, &range.column);
                }
            }
            switch (range.bytes[range.index])
            {
            case '\r':
                range.popFront();
                if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n')
                {
                    range.popFront();
                }
                range.column = 1;
                range.line += 1;
                break;
            case '\n':
                range.popFront();
                range.column = 1;
                range.line += 1;
                break;
            case ' ':
            case '\t':
            case '\v':
            case '\f':
                range.popFront();
                break;
            case 0xe2:
                // Possible U+2028/U+2029; anything else starting with 0xe2
                // ends the whitespace run.
                if (range.index + 2 >= range.bytes.length)
                    break loop;
                if (range.bytes[range.index + 1] != 0x80)
                    break loop;
                if (range.bytes[range.index + 2] == 0xa8 || range.bytes[range.index + 2] == 0xa9)
                {
                    range.index += 3;
                    range.column += 3;
                    range.column = 1;
                    range.line += 1;
                    break;
                }
                break loop;
            default:
                break loop;
            }
        } while (!(range.index >= range.bytes.length));
        string text = config.whitespaceBehavior == WhitespaceBehavior.include
            ? cache.intern(range.slice(mark)) : "";
        token = Token(tok!"whitespace", text, line, column, index);
    }

    /// Entry point for literals beginning with '0': dispatches to the hex,
    /// binary or decimal sub-lexer based on the following character.
    void lexNumber(ref Token token)
    {
        mixin (tokenStart);
        if (range.bytes[range.index] == '0' && range.index + 1 < range.bytes.length)
        {
            immutable ahead = range.bytes[range.index + 1];
            switch (ahead)
            {
            case 'x':
            case 'X':
                range.index += 2;
                range.column += 2;
                lexHex(token, mark, line, column, index);
                return;
            case 'b':
            case 'B':
                range.index += 2;
                range.column += 2;
                lexBinary(token, mark, line, column, index);
                return;
            default:
                lexDecimal(token, mark, line, column, index);
                return;
            }
        }
        else
            lexDecimal(token, mark, line, column, index);
    }

    /// Convenience overload starting a fresh token at the current position.
    void lexHex(ref Token token)
    {
        mixin (tokenStart);
        lexHex(token, mark, line, column, index);
    }

    /// Lexes a hexadecimal integer or float literal (digits already past the
    /// "0x" prefix), including any int/float/exponent suffix.
    void lexHex(ref Token token, size_t mark, size_t line, size_t column,
        size_t index) @trusted
    {
        IdType type = tok!"intLiteral";
        bool foundDot;
        hexLoop: while (!(range.index >= range.bytes.length))
        {
            switch (range.bytes[range.index])
            {
            case 'a': .. case 'f':
            case 'A': .. case 'F':
            case '0': .. case '9':
            case '_':
                version (X86_64)
                {
                    // Fast path: consume a run of hex digits/underscores.
                    if (haveSSE42 && range.index + 16 < range.bytes.length)
                    {
                        immutable ulong i = rangeMatch!(false, '0', '9', 'a', 'f', 'A', 'F', '_', '_')
                            (range.bytes.ptr + range.index);
                        range.column += i;
                        range.index += i;
                    }
                    else
                        range.popFront();
                }
                else
                    range.popFront();
                break;
            case 'u':
            case 'U':
                lexIntSuffix(type);
                break hexLoop;
            case 'i':
                if (foundDot)
                    lexFloatSuffix(type);
                break hexLoop;
            case 'L':
                if (foundDot)
                    lexFloatSuffix(type);
                else
                    lexIntSuffix(type);
                break hexLoop;
            case 'p':
            case 'P':
                lexExponent(type);
                break hexLoop;
            case '.':
                if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.')
                    break hexLoop;
                else
                {
                    // The following bit of silliness tries to tell the
                    // difference between "int dot identifier" and
                    // "double identifier".
                    if (range.index + 1 < range.bytes.length)
                    {
                        switch (range.peekAt(1))
                        {
                        case '0': .. case '9':
                        case 'A': .. case 'F':
                        case 'a': .. case 'f':
                            goto doubleLiteral;
                        default:
                            break hexLoop;
                        }
                    }
                    else
                    {
                    doubleLiteral:
                        range.popFront();
                        foundDot = true;
                        type = tok!"doubleLiteral";
                    }
                }
                break;
            default:
                break hexLoop;
            }
        }
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /// Convenience overload starting a fresh token at the current position.
    void lexBinary(ref Token token)
    {
        mixin (tokenStart);
        return lexBinary(token, mark, line, column, index);
    }

    /// Lexes a binary integer literal (digits already past the "0b" prefix),
    /// including any integer suffix.
    void lexBinary(ref Token token, size_t mark, size_t line, size_t column,
        size_t index) @trusted
    {
        IdType type = tok!"intLiteral";
        binaryLoop: while (!(range.index >= range.bytes.length))
        {
            switch (range.bytes[range.index])
            {
            case '0':
            case '1':
            case '_':
                version (X86_64)
                {
                    if (haveSSE42 && range.index + 16 < range.bytes.length)
                    {
                        immutable ulong i = rangeMatch!(false, '0', '1', '_', '_')(
                            range.bytes.ptr + range.index);
                        range.column += i;
                        range.index += i;
                    }
                    else
                        range.popFront();
                }
                else
                    range.popFront();
                break;
            case 'u':
            case 'U':
            case 'L':
                lexIntSuffix(type);
                break binaryLoop;
            default:
                break binaryLoop;
            }
        }
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /// Convenience overload starting a fresh token at the current position.
    void lexDecimal(ref Token token)
    {
        mixin (tokenStart);
        lexDecimal(token, mark, line, column, index);
    }

    /// Lexes a decimal integer or floating point literal, including dot,
    /// exponent and suffix handling.
    void lexDecimal(ref Token token, size_t mark, size_t line, size_t column,
        size_t index) @trusted
    {
        bool foundDot = range.bytes[range.index] == '.';
        IdType type = tok!"intLiteral";
        if (foundDot)
        {
            // Literal began with '.', e.g. ".5" — it is a float.
            range.popFront();
            type = tok!"doubleLiteral";
        }

        decimalLoop: while (!(range.index >= range.bytes.length))
        {
            switch (range.bytes[range.index])
            {
            case '0': ..
case '9':
            case '_':
                version (X86_64)
                {
                    // Fast path: consume a run of decimal digits/underscores.
                    if (haveSSE42 && range.index + 16 < range.bytes.length)
                    {
                        immutable ulong i = rangeMatch!(false, '0', '9', '_', '_')(range.bytes.ptr + range.index);
                        range.column += i;
                        range.index += i;
                    }
                    else
                        range.popFront();
                }
                else
                    range.popFront();
                break;
            case 'u':
            case 'U':
                if (!foundDot)
                    lexIntSuffix(type);
                break decimalLoop;
            case 'i':
                lexFloatSuffix(type);
                break decimalLoop;
            case 'L':
                if (foundDot)
                    lexFloatSuffix(type);
                else
                    lexIntSuffix(type);
                break decimalLoop;
            case 'f':
            case 'F':
                lexFloatSuffix(type);
                break decimalLoop;
            case 'e':
            case 'E':
                lexExponent(type);
                break decimalLoop;
            case '.':
                if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.')
                    break decimalLoop;
                else
                {
                    // The following bit of silliness tries to tell the
                    // difference between "int dot identifier" and
                    // "double identifier".
                    if (range.index + 1 < range.bytes.length)
                    {
                        immutable ch = range.peekAt(1);
                        // Anything that cannot start an identifier means the
                        // dot belongs to the number.
                        if (ch <= 0x2f
                            || (ch >= '0' && ch <= '9')
                            || (ch >= ':' && ch <= '@')
                            || (ch >= '[' && ch <= '^')
                            || (ch >= '{' && ch <= '~')
                            || ch == '`' || ch == '_')
                        {
                            goto doubleLiteral;
                        }
                        else
                            break decimalLoop;
                    }
                    else
                    {
                    doubleLiteral:
                        range.popFront();
                        foundDot = true;
                        type = tok!"doubleLiteral";
                    }
                }
                break;
            default:
                break decimalLoop;
            }
        }
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /// Consumes an integer suffix (u/U, L/l, in either order, optionally
    /// followed by a deprecated 'i') and upgrades `type` accordingly.
    void lexIntSuffix(ref IdType type) pure nothrow @safe
    {
        bool secondPass;
        if (range.bytes[range.index] == 'u' || range.bytes[range.index] == 'U')
        {
        U:
            if (type == tok!"intLiteral")
                type = tok!"uintLiteral";
            else
                type = tok!"ulongLiteral";
            range.popFront();
            if (secondPass)
                return;
            if (range.index < range.bytes.length
                && (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l'))
                goto L;
            goto I;
        }
        if (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l')
        {
        L:
            if (type == tok!"uintLiteral")
                type = tok!"ulongLiteral";
            else
                type = tok!"longLiteral";
            range.popFront();
            if (range.index < range.bytes.length
                && (range.bytes[range.index] == 'U' || range.bytes[range.index] == 'u'))
            {
                secondPass = true;
                goto U;
            }
            goto I;
        }
    I:
        if (range.index < range.bytes.length && range.bytes[range.index] == 'i')
        {
            warning("Complex number literals are deprecated");
            range.popFront();
            if (type == tok!"longLiteral" || type == tok!"ulongLiteral")
                type = tok!"idoubleLiteral";
            else
                type = tok!"ifloatLiteral";
        }
    }

    /// Consumes a float suffix (L, f/F, optional deprecated 'i') and sets
    /// `type` to the corresponding float literal token type.
    void lexFloatSuffix(ref IdType type) pure nothrow @safe
    {
        switch (range.bytes[range.index])
        {
        case 'L':
            range.popFront();
            type =
tok!"doubleLiteral"; 1022 break; 1023 case 'f': 1024 case 'F': 1025 range.popFront(); 1026 type = tok!"floatLiteral"; 1027 break; 1028 default: 1029 break; 1030 } 1031 if (range.index < range.bytes.length && range.bytes[range.index] == 'i') 1032 { 1033 warning("Complex number literals are deprecated"); 1034 range.popFront(); 1035 if (type == tok!"floatLiteral") 1036 type = tok!"ifloatLiteral"; 1037 else 1038 type = tok!"idoubleLiteral"; 1039 } 1040 } 1041 1042 void lexExponent(ref IdType type) pure nothrow @safe 1043 { 1044 range.popFront(); 1045 bool foundSign = false; 1046 bool foundDigit = false; 1047 while (range.index < range.bytes.length) 1048 { 1049 switch (range.bytes[range.index]) 1050 { 1051 case '-': 1052 case '+': 1053 if (foundSign) 1054 { 1055 if (!foundDigit) 1056 error("Expected an exponent"); 1057 return; 1058 } 1059 foundSign = true; 1060 range.popFront(); 1061 break; 1062 case '0': .. case '9': 1063 case '_': 1064 foundDigit = true; 1065 range.popFront(); 1066 break; 1067 case 'L': 1068 case 'f': 1069 case 'F': 1070 case 'i': 1071 lexFloatSuffix(type); 1072 return; 1073 default: 1074 if (!foundDigit) 1075 error("Expected an exponent"); 1076 return; 1077 } 1078 } 1079 } 1080 1081 void lexScriptLine(ref Token token) 1082 { 1083 mixin (tokenStart); 1084 while (!(range.index >= range.bytes.length) && !isNewline) 1085 { 1086 range.popFront(); 1087 } 1088 token = Token(tok!"scriptLine", cache.intern(range.slice(mark)), 1089 line, column, index); 1090 } 1091 1092 void lexSpecialTokenSequence(ref Token token) 1093 { 1094 mixin (tokenStart); 1095 while (!(range.index >= range.bytes.length) && !isNewline) 1096 { 1097 range.popFront(); 1098 } 1099 token = Token(tok!"specialTokenSequence", cache.intern(range.slice(mark)), 1100 line, column, index); 1101 } 1102 1103 void lexSlashStarComment(ref Token token) @trusted 1104 { 1105 mixin (tokenStart); 1106 IdType type = tok!"comment"; 1107 range.popFrontN(2); 1108 while (range.index < range.bytes.length) 1109 { 
1110 version (X86_64) 1111 { 1112 if (haveSSE42 && range.index + 16 < range.bytes.length) 1113 skip!(false, '\r', '\n', '/', '*', 0xe2)(range.bytes.ptr + range.index, 1114 &range.index, &range.column); 1115 } 1116 if (range.bytes[range.index] == '*') 1117 { 1118 range.popFront(); 1119 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/') 1120 { 1121 range.popFront(); 1122 break; 1123 } 1124 } 1125 else 1126 popFrontWhitespaceAware(); 1127 } 1128 if (config.commentBehavior == CommentBehavior.intern) 1129 token = Token(type, cache.intern(range.slice(mark)), line, column, index); 1130 else 1131 token = Token(type, cast(string) range.slice(mark), line, column, index); 1132 } 1133 1134 void lexSlashSlashComment(ref Token token) @trusted 1135 { 1136 mixin (tokenStart); 1137 IdType type = tok!"comment"; 1138 range.popFrontN(2); 1139 while (range.index < range.bytes.length) 1140 { 1141 version (X86_64) 1142 { 1143 if (haveSSE42 && range.index + 16 < range.bytes.length) 1144 { 1145 skip!(false, '\r', '\n', 0xe2)(range.bytes.ptr + range.index, 1146 &range.index, &range.column); 1147 } 1148 } 1149 if (range.bytes[range.index] == '\r' || range.bytes[range.index] == '\n') 1150 break; 1151 range.popFront(); 1152 } 1153 if (config.commentBehavior == CommentBehavior.intern) 1154 token = Token(type, cache.intern(range.slice(mark)), line, column, index); 1155 else 1156 token = Token(type, cast(string) range.slice(mark), line, column, index); 1157 } 1158 1159 void lexSlashPlusComment(ref Token token) @trusted 1160 { 1161 mixin (tokenStart); 1162 IdType type = tok!"comment"; 1163 range.index += 2; 1164 range.column += 2; 1165 int depth = 1; 1166 while (depth > 0 && !(range.index >= range.bytes.length)) 1167 { 1168 version (X86_64) 1169 { 1170 if (haveSSE42 && range.index + 16 < range.bytes.length) 1171 { 1172 skip!(false, '+', '/', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index, 1173 &range.index, &range.column); 1174 } 1175 } 1176 if 
(range.bytes[range.index] == '+')
{
    range.popFront();
    // "+/" closes one nesting level.
    if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/')
    {
        range.popFront();
        depth--;
    }
}
else if (range.bytes[range.index] == '/')
{
    range.popFront();
    // "/+" opens a new nesting level.
    if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '+')
    {
        range.popFront();
        depth++;
    }
}
else
    popFrontWhitespaceAware();
}
if (config.commentBehavior == CommentBehavior.intern)
    token = Token(type, cache.intern(range.slice(mark)), line, column, index);
else
    token = Token(type, cast(string) range.slice(mark), line, column, index);
}

// Lexes a double-quoted string literal with escape sequences, e.g. "a\n".
void lexStringLiteral(ref Token token) @trusted
{
    mixin (tokenStart);
    range.popFront();
    while (true)
    {
        if (range.index >= range.bytes.length)
        {
            error(token, "Error: unterminated string literal");
            return;
        }
        version (X86_64)
        {
            // SSE4.2 fast skip of bytes that are neither the closing quote,
            // an escape, nor a line break.
            if (haveSSE42 && range.index + 16 < range.bytes.length)
            {
                skip!(false, '"', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
                    &range.index, &range.column);
            }
        }
        if (range.bytes[range.index] == '"')
        {
            range.popFront();
            break;
        }
        else if (range.bytes[range.index] == '\\')
        {
            if (!lexEscapeSequence())
            {
                token = Token.init;
                return;
            }
        }
        else
            popFrontWhitespaceAware();
    }
    IdType type = tok!"stringLiteral";
    // The optional c/w/d suffix is consumed before slicing, so it is part of
    // the token text.
    lexStringSuffix(type);
    token = Token(type, cache.intern(range.slice(mark)), line, column,
        index);
}

// Lexes a wysiwyg string: `backtick` or r"..." — no escape processing.
void lexWysiwygString(ref Token token) @trusted
{
    mixin (tokenStart);
    IdType type = tok!"stringLiteral";
    immutable bool backtick = range.bytes[range.index] == '`';
    if (backtick)
    {
        range.popFront();
        while (true)
        {
            if (range.index >= range.bytes.length)
            {
                error(token, "Error: unterminated string literal");
                return;
            }
            version (X86_64)
            {
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    skip!(false, '\r', '\n', 0xe2, '`')(range.bytes.ptr + range.index,
                        &range.index, &range.column);
                }
            }
            if (range.bytes[range.index] == '`')
            {
                range.popFront();
                break;
            }
            else
                popFrontWhitespaceAware();
        }
    }
    else
    {
        // r"..." form: skip the 'r', then the opening quote.
        range.popFront();
        if (range.index >= range.bytes.length)
        {
            error(token, "Error: unterminated string literal");
            return;
        }
        range.popFront();
        while (true)
        {
            if (range.index >= range.bytes.length)
            {
                error(token, "Error: unterminated string literal");
                return;
            }
            else if (range.bytes[range.index] == '"')
            {
                range.popFront();
                break;
            }
            else
                popFrontWhitespaceAware();
        }
    }
    lexStringSuffix(type);
    token = Token(type, cache.intern(range.slice(mark)), line, column,
        index);
}

// Consumes an optional string suffix (c/w/d), sets the token type
// accordingly, and returns the suffix character (0 if none).
private ubyte lexStringSuffix(ref IdType type) pure nothrow @safe
{
    if (range.index >= range.bytes.length)
    {
        type = tok!"stringLiteral";
        return 0;
    }
    else
    {
        switch (range.bytes[range.index])
        {
        case 'w': range.popFront(); type = tok!"wstringLiteral"; return 'w';
        case 'd': range.popFront(); type = tok!"dstringLiteral"; return 'd';
        case 'c': range.popFront(); type = tok!"stringLiteral"; return 'c';
        default: type = tok!"stringLiteral"; return 0;
        }
    }
}

// Lexes a delimited string q"..." — bracket-delimited forms are handled by
// lexNormalDelimitedString; any other delimiter is treated as a heredoc
// identifier.
void lexDelimitedString(ref Token token)
{
    mixin (tokenStart);
    range.index += 2;
    range.column += 2;
    ubyte open;
    ubyte close;
    switch (range.bytes[range.index])
    {
    case '<':
        open = '<';
        close = '>';
        range.popFront();
        lexNormalDelimitedString(token, mark, line, column, index, open, close);
        break;
    case '{':
        open = '{';
        close = '}';
range.popFront(); 1344 lexNormalDelimitedString(token, mark, line, column, index, open, close); 1345 break; 1346 case '[': 1347 open = '['; 1348 close = ']'; 1349 range.popFront(); 1350 lexNormalDelimitedString(token, mark, line, column, index, open, close); 1351 break; 1352 case '(': 1353 open = '('; 1354 close = ')'; 1355 range.popFront(); 1356 lexNormalDelimitedString(token, mark, line, column, index, open, close); 1357 break; 1358 default: 1359 lexHeredocString(token, mark, line, column, index); 1360 break; 1361 } 1362 } 1363 1364 void lexNormalDelimitedString(ref Token token, size_t mark, size_t line, size_t column, 1365 size_t index, ubyte open, ubyte close) 1366 { 1367 int depth = 1; 1368 while (!(range.index >= range.bytes.length) && depth > 0) 1369 { 1370 if (range.bytes[range.index] == open) 1371 { 1372 depth++; 1373 range.popFront(); 1374 } 1375 else if (range.bytes[range.index] == close) 1376 { 1377 depth--; 1378 range.popFront(); 1379 if (depth <= 0) 1380 { 1381 if (range.bytes[range.index] == '"') 1382 { 1383 range.popFront(); 1384 } 1385 else 1386 { 1387 error(token, "Error: `\"` expected to end delimited string literal"); 1388 return; 1389 } 1390 } 1391 } 1392 else 1393 popFrontWhitespaceAware(); 1394 } 1395 IdType type = tok!"stringLiteral"; 1396 lexStringSuffix(type); 1397 token = Token(type, cache.intern(range.slice(mark)), line, column, index); 1398 } 1399 1400 void lexHeredocString(ref Token token, size_t mark, size_t line, size_t column, size_t index) 1401 { 1402 Token ident; 1403 lexIdentifier(ident); 1404 if (!(range.index >= range.bytes.length) && isNewline()) 1405 popFrontWhitespaceAware(); 1406 else 1407 error("Newline expected"); 1408 while (!(range.index >= range.bytes.length)) 1409 { 1410 if (isNewline()) 1411 { 1412 popFrontWhitespaceAware(); 1413 if (!range.canPeek(ident.text.length)) 1414 { 1415 error(ident.text ~ " expected"); 1416 break; 1417 } 1418 if (range.peek(ident.text.length - 1) == ident.text) 1419 { 1420 
range.popFrontN(ident.text.length); 1421 break; 1422 } 1423 } 1424 else 1425 { 1426 range.popFront(); 1427 } 1428 } 1429 IdType type; 1430 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '"') 1431 { 1432 type = tok!"stringLiteral"; 1433 lexStringSuffix(type); 1434 range.popFront(); 1435 } 1436 else 1437 error("`\"` expected"); 1438 token = Token(type, cache.intern(range.slice(mark)), line, column, index); 1439 } 1440 1441 void lexTokenString(ref Token token) 1442 { 1443 mixin (tokenStart); 1444 assert (range.bytes[range.index] == 'q'); 1445 range.popFront(); 1446 assert (range.bytes[range.index] == '{'); 1447 range.popFront(); 1448 auto app = appender!string(); 1449 app.put("q{"); 1450 int depth = 1; 1451 1452 immutable WhitespaceBehavior oldWhitespace = config.whitespaceBehavior; 1453 immutable StringBehavior oldString = config.stringBehavior; 1454 config.whitespaceBehavior = WhitespaceBehavior.include; 1455 config.stringBehavior = StringBehavior.source; 1456 scope (exit) 1457 { 1458 config.whitespaceBehavior = oldWhitespace; 1459 config.stringBehavior = oldString; 1460 } 1461 1462 advance(_front); 1463 1464 if (range.index >= range.bytes.length) 1465 { 1466 error(token, "Error: unterminated token string literal"); 1467 return; 1468 } 1469 1470 while (depth > 0 && !empty) 1471 { 1472 auto t = front(); 1473 if (t.text is null) 1474 app.put(str(t.type)); 1475 else 1476 app.put(t.text); 1477 if (t.type == tok!"}") 1478 { 1479 depth--; 1480 if (depth > 0) 1481 popFront(); 1482 } 1483 else if (t.type == tok!"{") 1484 { 1485 depth++; 1486 popFront(); 1487 } 1488 else 1489 popFront(); 1490 } 1491 IdType type = tok!"stringLiteral"; 1492 auto b = lexStringSuffix(type); 1493 if (b != 0) 1494 app.put(b); 1495 token = Token(type, cache.intern(cast(const(ubyte)[]) app.data), line, 1496 column, index); 1497 } 1498 1499 void lexHexString(ref Token token) 1500 { 1501 mixin (tokenStart); 1502 range.index += 2; 1503 range.column += 2; 1504 1505 loop: while 
(true)
{
    if (range.index >= range.bytes.length)
    {
        error(token, "Error: unterminated hex string literal");
        return;
    }
    else if (isWhitespace())
        popFrontWhitespaceAware();
    else switch (range.bytes[range.index])
    {
    case '0': .. case '9':
    case 'A': .. case 'F':
    case 'a': .. case 'f':
        range.popFront();
        break;
    case '"':
        range.popFront();
        break loop;
    default:
        error(token, "Error: invalid character in hex string");
        return;
    }
}

IdType type = tok!"stringLiteral";
lexStringSuffix(type);
token = Token(type, cache.intern(range.slice(mark)), line, column,
    index);
}

// Lexes a named character entity escape, e.g. \&copy; — an identifier
// between '&' and ';'. Returns false (and reports an error) on failure.
bool lexNamedEntity()
in { assert (range.bytes[range.index] == '&'); }
do
{
    Token t;
    range.popFront();
    // silent=true: an invalid identifier is reported below instead.
    lexIdentifier(t, true);
    if (t.type != tok!"identifier" || range.empty || range.bytes[range.index] != ';')
    {
        error("Error: invalid named character entity");
        return false;
    }
    range.popFront();
    return true;
}

// Lexes one escape sequence after a backslash: simple escapes, \x??, octal,
// \u????, \U????????, or a named entity. Returns false on error.
bool lexEscapeSequence()
{
    range.popFront();
    if (range.index >= range.bytes.length)
    {
        error("Error: non-terminated character escape sequence.");
        return false;
    }
    switch (range.bytes[range.index])
    {
    case '&': return lexNamedEntity();
    case '\'':
    case '"':
    case '?':
    case '\\':
    case 'a':
    case 'b':
    case 'f':
    case 'n':
    case 'r':
    case 't':
    case 'v':
        range.popFront();
        break;
    case 'x':
        // \x requires exactly two hex digits.
        range.popFront();
        foreach (i; 0 .. 2)
        {
            if (range.index >= range.bytes.length)
            {
                error("Error: 2 hex digits expected.");
                return false;
            }
            switch (range.bytes[range.index])
            {
            case '0': .. case '9':
            case 'a': .. case 'f':
            case 'A': .. case 'F':
                range.popFront();
                break;
            default:
                error("Error: 2 hex digits expected.");
                return false;
            }
        }
        break;
    case '0':
        // \0 on its own (at EOF or followed by ') is a complete escape;
        // otherwise fall through to the octal case below.
        // NOTE(review): the second operand's bounds re-check is redundant —
        // it is only reached when range.index + 1 < range.bytes.length.
        if (!(range.index + 1 < range.bytes.length)
            || ((range.index + 1 < range.bytes.length) && range.peekAt(1) == '\''))
        {
            range.popFront();
            break;
        }
        goto case;
    case '1': .. case '7':
        // Up to three octal digits.
        for (size_t i = 0; i < 3 && !(range.index >= range.bytes.length)
            && range.bytes[range.index] >= '0' && range.bytes[range.index] <= '7'; i++)
            range.popFront();
        break;
    case 'u':
        // \u requires four hex digits.
        range.popFront();
        foreach (i; 0 .. 4)
        {
            if (range.index >= range.bytes.length)
            {
                error("Error: at least 4 hex digits expected.");
                return false;
            }
            switch (range.bytes[range.index])
            {
            case '0': .. case '9':
            case 'a': .. case 'f':
            case 'A': .. case 'F':
                range.popFront();
                break;
            default:
                error("Error: at least 4 hex digits expected.");
                return false;
            }
        }
        break;
    case 'U':
        // \U requires eight hex digits.
        range.popFront();
        foreach (i; 0 .. 8)
        {
            if (range.index >= range.bytes.length)
            {
                error("Error: at least 8 hex digits expected.");
                return false;
            }
            switch (range.bytes[range.index])
            {
            case '0': .. case '9':
            case 'a': .. case 'f':
            case 'A': ..
case 'F': 1647 range.popFront(); 1648 break; 1649 default: 1650 error("Error: at least 8 hex digits expected."); 1651 return false; 1652 } 1653 } 1654 break; 1655 default: 1656 error("Invalid escape sequence"); 1657 while (true) 1658 { 1659 if (range.index >= range.bytes.length) 1660 { 1661 error("Error: non-terminated character escape sequence."); 1662 break; 1663 } 1664 if (range.bytes[range.index] == ';') 1665 { 1666 range.popFront(); 1667 break; 1668 } 1669 else 1670 { 1671 range.popFront(); 1672 } 1673 } 1674 return false; 1675 } 1676 return true; 1677 } 1678 1679 void lexCharacterLiteral(ref Token token) 1680 { 1681 mixin (tokenStart); 1682 range.popFront(); 1683 if (range.empty) 1684 goto err; 1685 if (range.bytes[range.index] == '\\') 1686 lexEscapeSequence(); 1687 else if (range.bytes[range.index] == '\'') 1688 { 1689 range.popFront(); 1690 token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)), 1691 line, column, index); 1692 } 1693 else if (range.bytes[range.index] & 0x80) 1694 { 1695 while (range.bytes[range.index] & 0x80) 1696 range.popFront(); 1697 } 1698 else 1699 popFrontWhitespaceAware(); 1700 1701 if (range.index < range.bytes.length && range.bytes[range.index] == '\'') 1702 { 1703 range.popFront(); 1704 token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)), 1705 line, column, index); 1706 } 1707 else 1708 { 1709 err: 1710 error(token, "Error: Expected `'` to end character literal"); 1711 } 1712 } 1713 1714 void lexIdentifier(ref Token token, const bool silent = false) @trusted 1715 { 1716 mixin (tokenStart); 1717 1718 if (isSeparating(0)) 1719 { 1720 if (silent) return; 1721 1722 error("Invalid identifier"); 1723 range.popFront(); 1724 } 1725 while (true) 1726 { 1727 version (X86_64) 1728 { 1729 if (haveSSE42 && range.index + 16 < range.bytes.length) 1730 { 1731 immutable ulong i = rangeMatch!(false, 'a', 'z', 'A', 'Z', '_', '_') 1732 (range.bytes.ptr + range.index); 1733 range.column += i; 1734 range.index += i; 
}
}
if (isSeparating(0))
    break;
else
    range.popFront();
}
token = Token(tok!"identifier", cache.intern(range.slice(mark)), line,
    column, index);
}

// Lexes tokens beginning with '.': the operators ".", "..", "...", or a
// floating point literal such as ".5" (digit lookahead dispatches to
// lexNumber).
void lexDot(ref Token token)
{
    mixin (tokenStart);
    if (!(range.index + 1 < range.bytes.length))
    {
        // '.' is the last byte of input.
        range.popFront();
        token = Token(tok!".", null, line, column, index);
        return;
    }
    switch (range.peekAt(1))
    {
    case '0': .. case '9':
        lexNumber(token);
        return;
    case '.':
        range.popFront();
        range.popFront();
        if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '.')
        {
            range.popFront();
            token = Token(tok!"...", null, line, column, index);
        }
        else
            token = Token(tok!"..", null, line, column, index);
        return;
    default:
        range.popFront();
        token = Token(tok!".", null, line, column, index);
        return;
    }
}

// Lexes a U+2028/U+2029 line/paragraph separator (3 UTF-8 bytes) as a
// whitespace token and bumps the line counter.
void lexLongNewline(ref Token token) @nogc
{
    mixin (tokenStart);
    range.popFront();
    range.popFront();
    range.popFront();
    range.incrementLine();
    string text = config.whitespaceBehavior == WhitespaceBehavior.include
        ?
cache.intern(range.slice(mark)) : "";
token = Token(tok!"whitespace", text, line,
    column, index);
}

// Returns true if the current byte begins a newline: '\n', '\r', or the
// Unicode line/paragraph separators U+2028/U+2029 (3-byte UTF-8 sequences
// starting with a high-bit byte). Assumes range.index is in bounds; callers
// check before calling.
bool isNewline() @nogc
{
    if (range.bytes[range.index] == '\n') return true;
    if (range.bytes[range.index] == '\r') return true;
    return (range.bytes[range.index] & 0x80) && (range.index + 2 < range.bytes.length)
        && (range.peek(2) == "\u2028" || range.peek(2) == "\u2029");
}

// Returns true if the byte at the given lookahead offset ends an identifier:
// anything that is not [A-Za-z0-9_], where a high-bit (multi-byte UTF-8)
// byte only separates if it begins U+2028/U+2029.
bool isSeparating(size_t offset) @nogc
{
    enum : ubyte
    {
        n, y, m // no, yes, maybe
    }

    if (range.index + offset >= range.bytes.length)
        return true;
    auto c = range.bytes[range.index + offset];
    static immutable ubyte[256] LOOKUP_TABLE = [
        y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
        y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
        y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
        n, n, n, n, n, n, n, n, n, n, y, y, y, y, y, y,
        y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n,
        n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, n,
        y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n,
        n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, y,
        m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
        m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
        m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
        m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
        m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
        m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
        m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
        m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m
    ];
    immutable ubyte result = LOOKUP_TABLE[c];
    if (result == n)
        return false;
    if (result == y)
        return true;
    if (result == m)
    {
        // "Maybe" case: peek at the offset position on a COPY of the range.
        // Fix: the original advanced `range` itself (range.popFrontN), which
        // both consumed input from the live lexer as a side effect and then
        // peeked the unadvanced copy at the wrong position. This is a pure
        // lookahead predicate; it must not mutate the lexer state.
        auto r = range;
        r.popFrontN(offset);
        return (r.canPeek(2) && (r.peek(2) == "\u2028"
            || r.peek(2) == "\u2029"));
    }
    assert (false);
}



// Mixin snippet that captures the start position (index/column/line/mark)
// of the token currently being lexed.
enum tokenStart = q{
    size_t index = range.index;
    size_t column = range.column;
    size_t line = range.line;
    auto mark = range.mark();
};

// Reports an error and invalidates the token (type tok!"").
void error(ref Token token, string message)
{
    token.type = tok!"";
    error(message);
}

// Records an error message at the current lexer position.
void error(string message)
{
    _messages ~= Message(range.line, range.column, message, true);
}

// Records a warning message at the current lexer position.
void warning(string message)
{
    _messages ~= Message(range.line, range.column, message, false);
    assert (_messages.length > 0);
}

Message[] _messages;
StringCache* cache;
LexerConfig config;
bool haveSSE42;
}

/**
 * Creates a token range from the given source code. Creates a default lexer
 * configuration and a GC-managed string cache.
 */
public auto byToken(R)(R range)
if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
{
    LexerConfig config;
    StringCache* cache = new StringCache(range.length.optimalBucketCount);
    return DLexer(range, config, cache);
}

/**
 * Creates a token range from the given source code. Uses the given string
 * cache.
 */
public auto byToken(R)(R range, StringCache* cache)
if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
{
    LexerConfig config;
    return DLexer(range, config, cache);
}

/**
 * Creates a token range from the given source code. Uses the provided lexer
 * configuration and string cache.
 */
public auto byToken(R)(R range, const LexerConfig config, StringCache* cache)
if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
{
    return DLexer(range, config, cache);
}

/**
 * Helper function used to avoid too many allocations while lexing.
 *
 * Params:
 *     size = The length in bytes of the source file.
 *
 * Returns:
 *     The optimal initial bucket count a `StringCache` should have.
 */
size_t optimalBucketCount(size_t size)
{
    import std.math : nextPow2;
    // One bucket per ~32 source bytes, rounded up to a power of two and
    // capped at 2^30 to bound memory on huge inputs.
    return nextPow2((size + 31U) / 32U).min(1U << 30U);
}
///
unittest
{
    assert(optimalBucketCount(1) == 2);
    assert(optimalBucketCount(9000 * 32) == 16384);
    static if (size_t.sizeof == ulong.sizeof)
        assert(optimalBucketCount(100_000_000_000UL) == 1 << 30);
}

/**
 * The string cache is used for string interning.
 *
 * It will only store a single copy of any string that it is asked to hold.
 * Interned strings can be compared for equality by comparing their $(B .ptr)
 * field.
 *
 * Default and postblit constructors are disabled. When a StringCache goes out
 * of scope, the memory held by it is freed.
 *
 * See_also: $(LINK http://en.wikipedia.org/wiki/String_interning)
 */
struct StringCache
{
public pure nothrow @nogc:

    @disable this();
    @disable this(this);

    /**
     * Params: bucketCount = the initial number of buckets. Must be a
     * power of two
     */
    this(size_t bucketCount) nothrow @trusted @nogc
    in
    {
        // Verify the power-of-two requirement: exactly one bit set.
        import core.bitop : popcnt;
        static if (size_t.sizeof == 8)
        {
            immutable low = popcnt(cast(uint) bucketCount);
            immutable high = popcnt(cast(uint) (bucketCount >> 32));
            assert ((low == 0 && high == 1) || (low == 1 && high == 0));
        }
        else
        {
            static assert (size_t.sizeof == 4);
            assert (popcnt(cast(uint) bucketCount) == 1);
        }
    }
    do
    {
        // Zero-initialized array of bucket head pointers.
        buckets = (cast(Node**) calloc((Node*).sizeof, bucketCount))[0 ..
bucketCount];
    }

    ~this()
    {
        // Free the bump-allocator blocks, then every node (including
        // separately malloc'ed "big" strings), then the bucket array.
        Block* current = rootBlock;
        while (current !is null)
        {
            Block* prev = current;
            current = current.next;
            free(cast(void*) prev);
        }
        foreach (nodePointer; buckets)
        {
            Node* currentNode = nodePointer;
            while (currentNode !is null)
            {
                if (currentNode.mallocated)
                    free(currentNode.str.ptr);
                Node* prev = currentNode;
                currentNode = currentNode.next;
                free(prev);
            }
        }
        rootBlock = null;
        free(buckets.ptr);
        buckets = null;
    }

    /**
     * Caches a string.
     */
    string intern(const(ubyte)[] str) @safe
    {
        if (str is null || str.length == 0)
            return "";
        return _intern(str);
    }

    /**
     * ditto
     */
    string intern(string str) @trusted
    {
        return intern(cast(ubyte[]) str);
    }

    /**
     * The default bucket count for the string cache.
     */
    static enum defaultBucketCount = 4096;

private:

    // Looks up the string; on a miss, copies it into cache-owned memory
    // (block allocator, or malloc for strings larger than BIG_STRING) and
    // inserts a node at the head of its hash bucket.
    string _intern(const(ubyte)[] bytes) @trusted
    {
        immutable uint hash = hashBytes(bytes);
        immutable size_t index = hash & (buckets.length - 1);
        Node* s = find(bytes, hash);
        if (s !is null)
            return cast(string) s.str;
        ubyte[] mem = void;
        bool mallocated = bytes.length > BIG_STRING;
        if (mallocated)
            mem = (cast(ubyte*) malloc(bytes.length))[0 ..
bytes.length];
        else
            mem = allocate(bytes.length);
        mem[] = bytes[];
        Node* node = cast(Node*) malloc(Node.sizeof);
        node.str = mem;
        node.hash = hash;
        node.next = buckets[index];
        node.mallocated = mallocated;
        buckets[index] = node;
        return cast(string) mem;
    }

    // Walks the bucket chain for the hash, comparing the full hash first and
    // the bytes only on a hash match. Returns null if not present.
    Node* find(const(ubyte)[] bytes, uint hash) @trusted
    {
        import std.algorithm : equal; // NOTE(review): unused; comparison below uses ==
        immutable size_t index = hash & (buckets.length - 1);
        Node* node = buckets[index];
        while (node !is null)
        {
            if (node.hash == hash && bytes == cast(ubyte[]) node.str)
                return node;
            node = node.next;
        }
        return node;
    }

    // 32-bit hash over the byte string; the constants (m = 0x5bd1e995,
    // r = 24) and mixing steps appear to follow MurmurHash2.
    static uint hashBytes(const(ubyte)[] data) pure nothrow @trusted @nogc
    in
    {
        assert (data !is null);
        assert (data.length > 0);
    }
    do
    {
        immutable uint m = 0x5bd1e995;
        immutable int r = 24;
        uint h = cast(uint) data.length;
        // Mix 4 bytes (little-endian) at a time into the hash.
        while (data.length >= 4)
        {
            uint k = (cast(ubyte) data[3]) << 24
                | (cast(ubyte) data[2]) << 16
                | (cast(ubyte) data[1]) << 8
                | (cast(ubyte) data[0]);
            k *= m;
            k ^= k >> r;
            k *= m;
            h *= m;
            h ^= k;
            data = data[4 ..
$];
        }
        // Fold in the trailing 1-3 bytes.
        switch (data.length & 3)
        {
        case 3:
            h ^= data[2] << 16;
            goto case;
        case 2:
            h ^= data[1] << 8;
            goto case;
        case 1:
            h ^= data[0];
            h *= m;
            break;
        default:
            break;
        }
        // Final avalanche.
        h ^= h >> 13;
        h *= m;
        h ^= h >> 15;
        return h;
    }

    // Bump-allocates numBytes from one of the first few blocks with room
    // (at most 4 are examined); otherwise prepends a fresh block.
    ubyte[] allocate(size_t numBytes) pure nothrow @trusted @nogc
    in
    {
        assert (numBytes != 0);
    }
    out (result)
    {
        assert (result.length == numBytes);
    }
    do
    {
        Block* r = rootBlock;
        size_t i = 0;
        while (i <= 3 && r !is null)
        {
            immutable size_t available = r.bytes.length;
            immutable size_t oldUsed = r.used;
            immutable size_t newUsed = oldUsed + numBytes;
            if (newUsed <= available)
            {
                r.used = newUsed;
                return r.bytes[oldUsed .. newUsed];
            }
            i++;
            r = r.next;
        }
        Block* b = cast(Block*) calloc(Block.sizeof, 1);
        b.used = numBytes;
        b.next = rootBlock;
        rootBlock = b;
        return b.bytes[0 .. numBytes];
    }

    // Hash bucket chain entry; fields are = void because nodes are always
    // fully initialized by _intern right after malloc.
    static struct Node
    {
        ubyte[] str = void;
        Node* next = void;
        uint hash = void;
        bool mallocated = void;
    }

    // A 16 KiB arena block; bytes[] fills the remainder after the header.
    static struct Block
    {
        Block* next;
        size_t used;
        enum BLOCK_CAPACITY = BLOCK_SIZE - size_t.sizeof - (void*).sizeof;
        ubyte[BLOCK_CAPACITY] bytes;
    }

    static assert (BLOCK_SIZE == Block.sizeof);

    enum BLOCK_SIZE = 1024 * 16;

    // If a string would take up more than 1/4 of a block, allocate it outside
    // of the block.
2162 enum BIG_STRING = BLOCK_SIZE / 4; 2163 2164 Node*[] buckets; 2165 Block* rootBlock; 2166 } 2167 2168 private extern(C) void* calloc(size_t, size_t) nothrow pure @nogc @trusted; 2169 private extern(C) void* malloc(size_t) nothrow pure @nogc @trusted; 2170 private extern(C) void free(void*) nothrow pure @nogc @trusted; 2171 2172 unittest 2173 { 2174 auto source = cast(ubyte[]) q{ import std.stdio;}}; 2175 auto tokens = getTokensForParser(source, LexerConfig(), 2176 new StringCache(StringCache.defaultBucketCount)); 2177 assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".", 2178 tok!"identifier", tok!";"])); 2179 } 2180 2181 /// Test \x char sequence 2182 unittest 2183 { 2184 auto toks = (string s) => byToken(cast(ubyte[])s); 2185 2186 // valid 2187 immutable hex = ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f','A','B','C','D','E','F']; 2188 auto source = ""; 2189 foreach (h1; hex) 2190 foreach (h2; hex) 2191 source ~= "'\\x" ~ h1 ~ h2 ~ "'"; 2192 assert (toks(source).filter!(t => t.type != tok!"characterLiteral").empty); 2193 2194 // invalid 2195 assert (toks(`'\x'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true)); 2196 assert (toks(`'\x_'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true)); 2197 assert (toks(`'\xA'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true)); 2198 assert (toks(`'\xAY'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true)); 2199 assert (toks(`'\xXX'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true)); 2200 } 2201 2202 version (X86_64) 2203 { 2204 version (DigitalMars) 2205 private enum useDMDStyle = true; 2206 else version (LDC) 2207 private enum useDMDStyle = (__VERSION__ < 2092); // GDC-style supported since v1.22 2208 else 2209 private enum useDMDStyle = false; // not supported by GDC 2210 2211 private ulong pcmpestri(ubyte flags, chars...)(const ubyte* bytes) pure nothrow 2212 
@trusted @nogc if (chars.length <= 8)
{
    // Pack the (up to 8) comparison chars into one 64-bit value for XMM2.
    enum constant = ByteCombine!chars;
    enum charsLength = chars.length;

    static if (useDMDStyle)
    {
        // naked: no prologue, so `bytes` is still in its ABI register.
        asm pure nothrow @nogc
        {
            naked;
        }
        version (Windows) // `bytes` in RCX
            asm pure nothrow @nogc { movdqu XMM1, [RCX]; }
        else // `bytes` in RDI
            asm pure nothrow @nogc { movdqu XMM1, [RDI]; }
        // EAX/EDX are the explicit-length inputs of PCMPESTRI; the index
        // result comes back in ECX and is returned via RAX.
        asm pure nothrow @nogc
        {
            mov R10, constant;
            movq XMM2, R10;
            mov RAX, charsLength;
            mov RDX, 16;
            pcmpestri XMM2, XMM1, flags;
            mov RAX, RCX;
            ret;
        }
    }
    else // GDC-style inline asm (GCC basically)
    {
        ulong result;
        asm pure nothrow @nogc
        {
            `movdqu %1, %%xmm1
            movq %3, %%xmm2
            pcmpestri %5, %%xmm1, %%xmm2`
            : "=c" (result) // %0: pcmpestri result in RCX, to be stored into `result`
            : "m" (*bytes), // %1: address of `bytes` string
            "d" (16), // %2: length of `bytes` head in XMM1, as pcmpestri input in EDX
            "r" (constant), // %3: max 8 `chars` to load into GP register, then XMM2
            "a" (charsLength), // %4: length in XMM2, as pcmpestri input in EAX
            "i" (flags) // %5: `flags` immediate
            : "xmm1", "xmm2"; // clobbered registers
        }
        return result;
    }
}

/**
 * Skips between 0 and 16 bytes that match (or do not match) one of the
 * given $(B chars).
 *
 * Advances both the running byte index and the column counter by the
 * number of leading bytes PCMPESTRI reports.
 */
void skip(bool matching, chars...)(const ubyte* bytes, ulong* pindex, ulong* pcolumn) pure nothrow
@trusted @nogc if (chars.length <= 8)
{
    // Bit 4 of the PCMPESTRI control byte is the polarity flag; setting it
    // makes the returned index count the leading bytes that DO match.
    static if (matching)
        enum flags = 0b0001_0000;
    else
        enum flags = 0b0000_0000;

    const r = pcmpestri!(flags, chars)(bytes);
    *pindex += r;
    *pcolumn += r;
}

/**
 * Returns: the number of bytes starting at the given location that match
 * (or do not match if $(B invert) is true) the byte ranges in $(B chars).
2278 */ 2279 ulong rangeMatch(bool invert, chars...)(const ubyte* bytes) pure nothrow @trusted @nogc 2280 { 2281 static assert(chars.length % 2 == 0); 2282 static if (invert) 2283 enum rangeMatchFlags = 0b0000_0100; 2284 else 2285 enum rangeMatchFlags = 0b0001_0100; 2286 2287 return pcmpestri!(rangeMatchFlags, chars)(bytes); 2288 } 2289 2290 template ByteCombine(c...) 2291 { 2292 static assert (c.length <= 8); 2293 static if (c.length > 1) 2294 enum ulong ByteCombine = c[0] | (ByteCombine!(c[1..$]) << 8); 2295 else 2296 enum ulong ByteCombine = c[0]; 2297 } 2298 } 2299 2300 unittest 2301 { 2302 import core.exception : RangeError; 2303 import std.exception : assertNotThrown; 2304 2305 static immutable src1 = "/++"; 2306 static immutable src2 = "/**"; 2307 2308 LexerConfig cf; 2309 StringCache ca = StringCache(16); 2310 2311 assertNotThrown!RangeError(getTokensForParser(src1, cf, &ca)); 2312 assertNotThrown!RangeError(getTokensForParser(src2, cf, &ca)); 2313 } 2314 2315 unittest 2316 { 2317 static immutable src = `"\eeee"`; 2318 2319 LexerConfig cf; 2320 StringCache ca = StringCache(16); 2321 2322 auto l = DLexer(src, cf, &ca); 2323 assert(l.front().type == tok!""); 2324 assert(!l.messages.empty); 2325 } 2326 2327 unittest 2328 { 2329 alias Msg = DLexer.Message; 2330 LexerConfig cf; 2331 StringCache ca = StringCache(16); 2332 2333 { 2334 auto l = DLexer(`"\©"`, cf, &ca); 2335 assert(l.front().type == tok!"stringLiteral"); 2336 assert(l.messages == []); 2337 } 2338 { 2339 auto l = DLexer(`"\™\⌝"`, cf, &ca); 2340 assert(l.front().type == tok!"stringLiteral"); 2341 assert(l.messages == []); 2342 } 2343 { 2344 auto l = DLexer(`"\&trade"`, cf, &ca); 2345 assert(l.front().type == tok!""); 2346 assert(l.messages == [ Msg(1, 9, "Error: invalid named character entity", true) ]); 2347 } 2348 { 2349 auto l = DLexer(`"\™\&urcorn"`, cf, &ca); 2350 assert(l.front().type == tok!""); 2351 assert(l.messages == [ Msg(1, 18, "Error: invalid named character entity", true) ]); 2352 } 
2353 { 2354 auto l = DLexer(`"\&"`, cf, &ca); 2355 assert(l.front().type == tok!""); 2356 assert(l.messages == [ Msg(1, 4, "Error: invalid named character entity", true) ]); 2357 } 2358 { 2359 auto l = DLexer(`"\&0"`, cf, &ca); 2360 assert(l.front().type == tok!""); 2361 assert(l.messages == [ Msg(1, 5, "Error: invalid named character entity", true) ]); 2362 } 2363 { 2364 auto l = DLexer(`"\©`, cf, &ca); 2365 assert(l.front().type == tok!""); 2366 assert(l.messages == [ Msg(1, 8, "Error: invalid named character entity", true) ]); 2367 } 2368 { 2369 auto l = DLexer(`"\©`, cf, &ca); 2370 assert(l.front().type == tok!""); 2371 assert(l.messages == [ Msg(1, 9, "Error: unterminated string literal", true) ]); 2372 } 2373 } 2374 2375 // legacy code using compatibility comment and trailingComment 2376 unittest 2377 { 2378 import std.conv : to; 2379 import std.exception : enforce; 2380 2381 static immutable src = `/// this is a module. 2382 // mixed 2383 /// it can do stuff 2384 module foo.bar; 2385 2386 // hello 2387 2388 /** 2389 * some doc 2390 * hello 2391 */ 2392 int x; /// very nice 2393 2394 // TODO: do stuff 2395 void main() { 2396 #line 40 2397 /// could be better 2398 writeln(":)"); 2399 } 2400 2401 /// end of file`; 2402 2403 LexerConfig cf; 2404 StringCache ca = StringCache(16); 2405 2406 const tokens = getTokensForParser(src, cf, &ca); 2407 2408 void assertEquals(T)(T a, T b, string what, string file = __FILE__, size_t line = __LINE__) 2409 { 2410 enforce(a == b, "Failed " ~ what ~ " '" ~ a.to!string ~ "' == '" ~ b.to!string ~ "'", file, line); 2411 } 2412 2413 void test(size_t index, IdType type, string comment, string trailingComment, 2414 string file = __FILE__, size_t line = __LINE__) 2415 { 2416 assertEquals(tokens[index].type, type, "type", file, line); 2417 assertEquals(tokens[index].comment, comment, "comment", file, line); 2418 assertEquals(tokens[index].trailingComment, trailingComment, "trailingComment", file, line); 2419 } 2420 2421 test(0, 
tok!"module", "this is a module.\nit can do stuff", "");
// plain `//` comments are not doc comments and must not be attached
test(1, tok!"identifier", "", "");
test(2, tok!".", "", "");
test(3, tok!"identifier", "", "");
test(4, tok!";", "", "");
test(5, tok!"int", "some doc\nhello", "");
test(6, tok!"identifier", "", "");
test(7, tok!";", "", "very nice"); // `/// very nice` trails on the same line
test(8, tok!"void", "", "");
test(9, tok!"identifier", "", "");
test(10, tok!"(", "", "");
test(11, tok!")", "", "");
test(12, tok!"{", "", "");
test(13, tok!"identifier", "could be better", "");
test(14, tok!"(", "", "");
test(15, tok!"stringLiteral", "", "");
test(16, tok!")", "", "");
test(17, tok!";", "", "");
test(18, tok!"}", "", "");
}

// dlang-community/D-Scanner#805
// Token values must be copyable out of const/immutable aggregates.
unittest
{
    final class SomeExpr
    {
        Token tok;
    }

    auto e1 = new SomeExpr();
    const e2 = new SomeExpr();
    immutable e3 = new immutable SomeExpr();

    immutable t1 = e1.tok;
    immutable t2 = e2.tok;
    immutable t3 = e3.tok;
}

/// empty '' is invalid syntax, but should still get parsed properly, with an
/// error token and proper location info
unittest
{
    import std.conv : to;
    import std.exception : enforce;

    static immutable src = `module foo.bar;

void main() {
    x = '';
}
`;

    LexerConfig cf;
    StringCache ca = StringCache(16);

    const tokens = getTokensForParser(src, cf, &ca);

    int i;
    assert(tokens[i++].type == tok!"module");
    assert(tokens[i++].type == tok!"identifier");
    assert(tokens[i++].type == tok!".");
    assert(tokens[i++].type == tok!"identifier");
    assert(tokens[i++].type == tok!";");
    assert(tokens[i++].type == tok!"void");
    assert(tokens[i++].type == tok!"identifier");
    assert(tokens[i++].type == tok!"(");
    assert(tokens[i++].type == tok!")");
    assert(tokens[i++].type == tok!"{");
assert(tokens[i++].type == tok!"identifier");
assert(tokens[i++].type == tok!"=");
// The empty '' yields an error token located right after the `=`.
assert(tokens[i].type == tok!"");
assert(tokens[i].line == tokens[i - 1].line);
assert(tokens[i].column == tokens[i - 1].column + 2);
i++;
assert(tokens[i++].type == tok!";");
assert(tokens[i++].type == tok!"}");

// An unterminated literal must still produce an error token carrying a
// non-zero index and a column inside the literal.
void checkInvalidTrailingString(const Token[] tokens)
{
    assert(tokens.length == 3);
    assert(tokens[2].index != 0);
    assert(tokens[2].column >= 4);
    assert(tokens[2].type == tok!"");
}

// Exercise every string-literal flavor: double-quoted, wysiwyg, hex,
// backtick, token string, delimited string, and character literal.
checkInvalidTrailingString(getTokensForParser(`x = "foo`, cf, &ca));
checkInvalidTrailingString(getTokensForParser(`x = r"foo`, cf, &ca));
checkInvalidTrailingString(getTokensForParser(`x = x"00`, cf, &ca));
checkInvalidTrailingString(getTokensForParser("x = `foo", cf, &ca));
checkInvalidTrailingString(getTokensForParser("x = q{foo", cf, &ca));
checkInvalidTrailingString(getTokensForParser(`x = q"foo`, cf, &ca));
checkInvalidTrailingString(getTokensForParser("x = '", cf, &ca));
}