module dparse.lexer;

import std.typecons;
import std.typetuple;
import std.array;
import std.algorithm;
import std.range;
import std.experimental.lexer;
import std.traits;
import core.cpuid : sse42;

public import dparse.trivia;

/// Operators. Each string literal here becomes a fixed token type usable as
/// `tok!"..."`; the list is consumed by the std.experimental.lexer mixins below.
private enum operators = [
    ",", ".", "..", "...", "/", "/=", "!", "!<", "!<=", "!<>", "!<>=", "!=",
    "!>", "!>=", "$", "%", "%=", "&", "&&", "&=", "(", ")", "*", "*=", "+", "++",
    "+=", "-", "--", "-=", ":", ";", "<", "<<", "<<=", "<=", "<>", "<>=", "=",
    "==", "=>", ">", ">=", ">>", ">>=", ">>>", ">>>=", "?", "@", "[", "]", "^",
    "^=", "^^", "^^=", "{", "|", "|=", "||", "}", "~", "~="
]; // NOTE(review): was "Kewords" below — typo fixed in comment only.

/// Keywords. D reserved words plus the special `__XXX__` tokens; like
/// `operators`, each entry becomes a fixed token type (`tok!"if"` etc.).
private enum keywords = [
    "abstract", "alias", "align", "asm", "assert", "auto", "bool",
    "break", "byte", "case", "cast", "catch", "cdouble", "cent", "cfloat",
    "char", "class", "const", "continue", "creal", "dchar", "debug", "default",
    "delegate", "delete", "deprecated", "do", "double", "else", "enum",
    "export", "extern", "false", "final", "finally", "float", "for", "foreach",
    "foreach_reverse", "function", "goto", "idouble", "if", "ifloat",
    "immutable", "import", "in", "inout", "int", "interface", "invariant",
    "ireal", "is", "lazy", "long", "macro", "mixin", "module", "new", "nothrow",
    "null", "out", "override", "package", "pragma", "private", "protected",
    "public", "pure", "real", "ref", "return", "scope", "shared", "short",
    "static", "struct", "super", "switch", "synchronized", "template", "this",
    "throw", "true", "try", "typedef", "typeid", "typeof", "ubyte", "ucent",
    "uint", "ulong", "union", "unittest", "ushort", "version", "void",
    "wchar", "while", "with", "__DATE__", "__EOF__", "__FILE__",
    "__FILE_FULL_PATH__", "__FUNCTION__", "__gshared", "__LINE__", "__MODULE__",
    "__parameters", "__PRETTY_FUNCTION__", "__TIME__", "__TIMESTAMP__", "__traits",
    "__vector", "__VENDOR__", "__VERSION__"
];

/// Other tokens
/// Token kinds whose text varies per occurrence (identifiers, literals,
/// comments, whitespace); the lexer stores the matched text on the token.
private enum dynamicTokens = [
    "specialTokenSequence", "comment", "identifier", "scriptLine",
    "whitespace", "doubleLiteral", "floatLiteral", "idoubleLiteral",
    "ifloatLiteral", "intLiteral", "longLiteral", "realLiteral",
    "irealLiteral", "uintLiteral", "ulongLiteral", "characterLiteral",
    "dstringLiteral", "stringLiteral", "wstringLiteral"
];

/// Flat (prefix, handler-method-name) pairs: when the input starts with the
/// given prefix, the std.experimental.lexer mixin dispatches to the named
/// member function of DLexer to produce the token.
private enum pseudoTokenHandlers = [
    "\"", "lexStringLiteral",
    "`", "lexWysiwygString",
    "//", "lexSlashSlashComment",
    "/*", "lexSlashStarComment",
    "/+", "lexSlashPlusComment",
    ".", "lexDot",
    "'", "lexCharacterLiteral",
    "0", "lexNumber",
    "1", "lexDecimal",
    "2", "lexDecimal",
    "3", "lexDecimal",
    "4", "lexDecimal",
    "5", "lexDecimal",
    "6", "lexDecimal",
    "7", "lexDecimal",
    "8", "lexDecimal",
    "9", "lexDecimal",
    "q\"", "lexDelimitedString",
    "q{", "lexTokenString",
    "r\"", "lexWysiwygString",
    "x\"", "lexHexString",
    " ", "lexWhitespace",
    "\t", "lexWhitespace",
    "\r", "lexWhitespace",
    "\n", "lexWhitespace",
    "\v", "lexWhitespace",
    "\f", "lexWhitespace",
    "\u2028", "lexLongNewline",
    "\u2029", "lexLongNewline",
    "#!", "lexScriptLine",
    "#line", "lexSpecialTokenSequence"
];

/// Token ID type for the D lexer.
public alias IdType = TokenIdType!(operators, dynamicTokens, keywords);

/**
 * Function used for converting an IdType to a string.
 *
 * Examples:
 * ---
 * IdType c = tok!"case";
 * assert (str(c) == "case");
 * ---
 */
public alias str = tokenStringRepresentation!(IdType, operators, dynamicTokens, keywords);

/**
 * Template used to refer to D token types.
 *
 * See the $(B operators), $(B keywords), and $(B dynamicTokens) enums for
 * values that can be passed to this template.
 * Example:
 * ---
 * import dparse.lexer;
 * IdType t = tok!"floatLiteral";
 * ---
 */
public template tok(string token)
{
    alias tok = TokenId!(IdType, operators, dynamicTokens, keywords, token);
}

/// Extra fields mixed into every Token (see `extraFields` below) carrying
/// attached trivia and memoized documentation-comment text.
mixin template TokenTriviaFields()
{
    /**
     * Whitespace and comment tokens attached to this token.
     *
     * All trivia tokens must have the text property set to the text with
     * which they identify with. This means you can map all trivia tokens to
     * their .text property and join them together to get the source code back
     * without any loss of information.
     *
     * Trivia is only included when calling getTokensForParser. When iterating
     * over DLexer all tokens will be in their raw form and none will be
     * converted to trivia.
     *
     * Note: in the future you might need to explicitly pass
     * WhitespaceBehavior.include (or keep the default) as getTokensForParser
     * currently overrides it to include.
     *
     * Contains: `comment`, `whitespace`, `specialTokenSequence`
     */
    immutable(typeof(this))[] leadingTrivia;
    /// ditto
    immutable(typeof(this))[] trailingTrivia;

    // Lazily-computed caches for comment()/trailingComment(); null means
    // "not computed yet" (an extracted-but-empty ddoc is "", not null).
    string memoizedLeadingComment = null;
    string memoizedTrailingComment = null;

    /// Legacy property to get documentation comments, with comment border
    /// stripped off, which is attached to this token.
    string comment() const pure nothrow @safe @property {
        import dparse.trivia : extractLeadingDdoc;
        if (memoizedLeadingComment !is null)
            return memoizedLeadingComment;
        // cast() strips const so the memoization cache can be written from a
        // const property; logically the token is unchanged.
        return (cast()memoizedLeadingComment) = this.extractLeadingDdoc;
    }

    /// ditto
    string trailingComment() const pure nothrow @safe @property {
        import dparse.trivia : extractTrailingDdoc;
        if (memoizedTrailingComment !is null)
            return memoizedTrailingComment;
        return (cast()memoizedTrailingComment) = this.extractTrailingDdoc;
    }

    /// Orders tokens by byte index; allows comparing a token against a raw offset.
    int opCmp(size_t i) const pure nothrow @safe @nogc {
        if (index < i) return -1;
        if (index > i) return 1;
        return 0;
    }

    /// ditto
    int opCmp(ref const typeof(this) other) const pure nothrow @safe @nogc {
        return opCmp(other.index);
    }
}

// mixin in from dparse.lexer to make error messages more managable size as the
// entire string is dumped when there is a type mismatch.
private enum extraFields = "import dparse.lexer:TokenTriviaFields; mixin TokenTriviaFields;";

/// The token type in the D lexer
public alias Token = std.experimental.lexer.TokenStructure!(IdType, extraFields);

/**
 * Configure whitespace handling
 */
public enum WhitespaceBehavior : ubyte
{
    include = 0b0000_0000,
    skip = 0b0000_0001,
}

private enum stringBehaviorNotWorking = "Automatic string parsing is not "
    ~ "supported and was previously not working. To unescape strings use the "
    ~ "`dparse.strings : unescapeString` function on the token texts instead.";

/**
 * Configure string lexing behavior
 */
// was enum, but struct now for deprecations and support with old compilers
public struct StringBehavior
{
    /// Do not include quote characters, process escape sequences
    deprecated(stringBehaviorNotWorking) static immutable StringBehavior compiler = StringBehavior(0b0000_0000);
    /// Opening quotes, closing quotes, and string suffixes are included in
    /// the string token
    deprecated(stringBehaviorNotWorking) static immutable StringBehavior includeQuoteChars = StringBehavior(0b0000_0001);
    /// String escape sequences are not replaced
    deprecated(stringBehaviorNotWorking) static immutable StringBehavior notEscaped = StringBehavior(0b0000_0010);
    /// Not modified at all. Useful for formatters or highlighters
    static immutable StringBehavior source = StringBehavior(0b0000_0011);

    ubyte behavior;
    alias behavior this;
}

/// Whether comment token text is interned into the StringCache or sliced
/// directly out of the source buffer.
public enum CommentBehavior : bool
{
    intern = true,
    noIntern = false
}
/**
 * Lexer configuration struct
 */
public struct LexerConfig
{
    string fileName;
    StringBehavior stringBehavior;
    WhitespaceBehavior whitespaceBehavior;
    CommentBehavior commentBehavior = CommentBehavior.intern;
}

/**
 * Basic type token types.
 */
public alias BasicTypes = AliasSeq!(tok!"int", tok!"bool", tok!"byte",
    tok!"cdouble", tok!"cent", tok!"cfloat", tok!"char", tok!"creal",
    tok!"dchar", tok!"double", tok!"float", tok!"idouble",
    tok!"ifloat", tok!"ireal", tok!"long", tok!"real", tok!"short",
    tok!"ubyte", tok!"ucent", tok!"uint", tok!"ulong", tok!"ushort",
    tok!"void", tok!"wchar");

/**
 * Returns: true if the given ID is for a basic type.
240 */ 241 public bool isBasicType(IdType type) nothrow pure @safe @nogc 242 { 243 switch (type) 244 { 245 foreach (T; BasicTypes) 246 { 247 case T: 248 return true; 249 } 250 default: 251 return false; 252 } 253 } 254 255 /** 256 * Number literal token types. 257 */ 258 public alias NumberLiterals = AliasSeq!(tok!"doubleLiteral", 259 tok!"floatLiteral", tok!"idoubleLiteral", tok!"ifloatLiteral", 260 tok!"intLiteral", tok!"longLiteral", tok!"realLiteral", 261 tok!"irealLiteral", tok!"uintLiteral", tok!"ulongLiteral"); 262 263 /** 264 * Returns: true if the given ID type is for a number literal. 265 */ 266 public bool isNumberLiteral(IdType type) nothrow pure @safe @nogc 267 { 268 switch (type) 269 { 270 foreach (T; NumberLiterals) 271 { 272 case T: 273 return true; 274 } 275 default: 276 return false; 277 } 278 } 279 280 /** 281 * Number literal token types. 282 */ 283 public alias IntegerLiterals = AliasSeq!(tok!"intLiteral", tok!"longLiteral", 284 tok!"uintLiteral", tok!"ulongLiteral"); 285 286 /** 287 * Returns: true if the given ID type is for a integer literal. 288 */ 289 public bool isIntegerLiteral(IdType type) nothrow pure @safe @nogc 290 { 291 switch (type) 292 { 293 foreach (T; IntegerLiterals) 294 { 295 case T: 296 return true; 297 } 298 default: 299 return false; 300 } 301 } 302 303 /** 304 * Operator token types. 
305 */ 306 public alias Operators = AliasSeq!(tok!",", tok!".", tok!"..", tok!"...", 307 tok!"/", tok!"/=", tok!"!", tok!"!<", tok!"!<=", tok!"!<>", 308 tok!"!<>=", tok!"!=", tok!"!>", tok!"!>=", tok!"$", tok!"%", 309 tok!"%=", tok!"&", tok!"&&", tok!"&=", tok!"(", tok!")", 310 tok!"*", tok!"*=", tok!"+", tok!"++", tok!"+=", tok!"-", 311 tok!"--", tok!"-=", tok!":", tok!";", tok!"<", tok!"<<", 312 tok!"<<=", tok!"<=", tok!"<>", tok!"<>=", tok!"=", tok!"==", 313 tok!"=>", tok!">", tok!">=", tok!">>", tok!">>=", tok!">>>", 314 tok!">>>=", tok!"?", tok!"@", tok!"[", tok!"]", tok!"^", 315 tok!"^=", tok!"^^", tok!"^^=", tok!"{", tok!"|", tok!"|=", 316 tok!"||", tok!"}", tok!"~", tok!"~="); 317 318 /** 319 * Returns: true if the given ID type is for an operator. 320 */ 321 public bool isOperator(IdType type) nothrow pure @safe @nogc 322 { 323 switch (type) 324 { 325 foreach (T; Operators) 326 { 327 case T: 328 return true; 329 } 330 default: 331 return false; 332 } 333 } 334 335 /** 336 * Keyword token types. 
337 */ 338 public alias Keywords = AliasSeq!(tok!"abstract", tok!"alias", tok!"align", 339 tok!"asm", tok!"assert", tok!"auto", tok!"break", 340 tok!"case", tok!"cast", tok!"catch", tok!"class", tok!"const", 341 tok!"continue", tok!"debug", tok!"default", tok!"delegate", 342 tok!"delete", tok!"deprecated", tok!"do", tok!"else", tok!"enum", 343 tok!"export", tok!"extern", tok!"false", tok!"final", tok!"finally", 344 tok!"for", tok!"foreach", tok!"foreach_reverse", tok!"function", 345 tok!"goto", tok!"if", tok!"immutable", tok!"import", tok!"in", 346 tok!"inout", tok!"interface", tok!"invariant", tok!"is", 347 tok!"lazy", tok!"macro", tok!"mixin", tok!"module", tok!"new", 348 tok!"nothrow", tok!"null", tok!"out", tok!"override", tok!"package", 349 tok!"pragma", tok!"private", tok!"protected", tok!"public", 350 tok!"pure", tok!"ref", tok!"return", tok!"scope", tok!"shared", 351 tok!"static", tok!"struct", tok!"super", tok!"switch", tok!"synchronized", 352 tok!"template", tok!"this", tok!"throw", tok!"true", tok!"try", 353 tok!"typedef", tok!"typeid", tok!"typeof", tok!"union", tok!"unittest", 354 tok!"version", tok!"while", tok!"with", tok!"__DATE__", 355 tok!"__EOF__", tok!"__FILE__", tok!"__FILE_FULL_PATH__", tok!"__FUNCTION__", 356 tok!"__gshared", tok!"__LINE__", tok!"__MODULE__", tok!"__parameters", 357 tok!"__PRETTY_FUNCTION__", tok!"__TIME__", tok!"__TIMESTAMP__", 358 tok!"__traits", tok!"__vector", tok!"__VENDOR__", tok!"__VERSION__"); 359 360 /** 361 * Returns: true if the given ID type is for a keyword. 362 */ 363 public bool isKeyword(IdType type) pure nothrow @safe @nogc 364 { 365 switch (type) 366 { 367 foreach (T; Keywords) 368 { 369 case T: 370 return true; 371 } 372 default: 373 return false; 374 } 375 } 376 377 /** 378 * String literal token types 379 */ 380 public alias StringLiterals = AliasSeq!(tok!"dstringLiteral", 381 tok!"stringLiteral", tok!"wstringLiteral"); 382 383 /** 384 * Returns: true if the given ID type is for a string literal. 
385 */ 386 public bool isStringLiteral(IdType type) pure nothrow @safe @nogc 387 { 388 switch (type) 389 { 390 foreach (T; StringLiterals) 391 { 392 case T: 393 return true; 394 } 395 default: 396 return false; 397 } 398 } 399 400 /** 401 * Protection token types. 402 */ 403 public alias Protections = AliasSeq!(tok!"export", tok!"package", 404 tok!"private", tok!"public", tok!"protected"); 405 406 /** 407 * Returns: true if the given ID type is for a protection attribute. 408 */ 409 public bool isProtection(IdType type) pure nothrow @safe @nogc 410 { 411 switch (type) 412 { 413 foreach (T; Protections) 414 { 415 case T: 416 return true; 417 } 418 default: 419 return false; 420 } 421 } 422 423 public alias SpecialTokens = AliasSeq!(tok!"__DATE__", tok!"__TIME__", 424 tok!"__TIMESTAMP__", tok!"__VENDOR__", tok!"__VERSION__", tok!"__FILE__", 425 tok!"__FILE_FULL_PATH__", tok!"__LINE__", tok!"__MODULE__", 426 tok!"__FUNCTION__", tok!"__PRETTY_FUNCTION__"); 427 428 public bool isSpecialToken(IdType type) pure nothrow @safe @nogc 429 { 430 switch (type) 431 { 432 foreach (T; SpecialTokens) 433 { 434 case T: 435 return true; 436 } 437 default: 438 return false; 439 } 440 } 441 442 public alias Literals = AliasSeq!(StringLiterals, NumberLiterals, tok!"characterLiteral", 443 SpecialTokens, tok!"true", tok!"false", tok!"null", tok!"$"); 444 445 public bool isLiteral(IdType type) pure nothrow @safe @nogc 446 { 447 switch (type) 448 { 449 foreach (T; Literals) 450 { 451 case T: 452 return true; 453 } 454 default: 455 return false; 456 } 457 } 458 459 /** 460 * Returns: an array of tokens lexed from the given source code to the output 461 * range. All whitespace, comment and specialTokenSequence tokens (trivia) are 462 * attached to the token nearest to them. 
 *
 * Trivia is put on the last token as `trailingTrivia` if it is on the same
 * line as the trivia, otherwise it will be attached to the next token in the
 * `leadingTrivia` until there is the EOF, where it will be attached as
 * `trailingTrivia` again.
 */
const(Token)[] getTokensForParser(R)(R sourceCode, LexerConfig config, StringCache* cache)
if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
{
    // Trivia attachment requires seeing whitespace tokens, and comments are
    // kept as raw slices; both settings are forced regardless of the caller's
    // config (see the note on TokenTriviaFields).
    config.whitespaceBehavior = WhitespaceBehavior.include;
    config.commentBehavior = CommentBehavior.noIntern;

    auto leadingTriviaAppender = appender!(Token[])();
    leadingTriviaAppender.reserve(128);
    auto trailingTriviaAppender = appender!(Token[])();
    trailingTriviaAppender.reserve(128);

    auto output = appender!(typeof(return))();
    auto lexer = DLexer(sourceCode, config, cache);
    loop: while (!lexer.empty) switch (lexer.front.type)
    {
    case tok!"specialTokenSequence":
    case tok!"whitespace":
    case tok!"comment":
        // Same line as the previous real token -> trailing trivia of that
        // token; otherwise it leads the next real token.
        if (!output.data.empty && lexer.front.line == output.data[$ - 1].line)
            trailingTriviaAppender.put(lexer.front);
        else
            leadingTriviaAppender.put(lexer.front);
        lexer.popFront();
        break;
    case tok!"__EOF__":
        break loop;
    default:
        Token t = lexer.front;
        lexer.popFront();

        // cast() drops immutability of the already-stored token's trivia
        // field so pending trailing trivia can be attached retroactively.
        if (!output.data.empty && !trailingTriviaAppender.data.empty)
            (cast() output.data[$ - 1].trailingTrivia) = trailingTriviaAppender.data.idup;
        t.leadingTrivia = leadingTriviaAppender.data.idup;
        leadingTriviaAppender.clear();
        trailingTriviaAppender.clear();

        output.put(t);
        break;
    }

    // Any trivia left over at EOF becomes trailing trivia of the final token.
    if (!output.data.empty)
    {
        trailingTriviaAppender.put(leadingTriviaAppender.data);
        (cast() output.data[$ - 1].trailingTrivia) = trailingTriviaAppender.data.idup;
    }

    return output.data;
}

/**
 * The D lexer struct.
520 */ 521 public struct DLexer 522 { 523 mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens, 524 keywords, pseudoTokenHandlers); 525 526 /// 527 @disable this(); 528 529 /** 530 * Params: 531 * range = the bytes that compose the source code that will be lexed. 532 * config = the lexer configuration to use. 533 * cache = the string interning cache for de-duplicating identifiers and 534 * other token text. 535 * haveSSE42 = Parse streaming SIMD Extensions 4.2 in inline assembly 536 */ 537 this(R)(R range, const LexerConfig config, StringCache* cache, 538 bool haveSSE42 = sse42()) pure nothrow @safe 539 if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R) 540 { 541 this.haveSSE42 = haveSSE42; 542 auto r = (range.length >= 3 && range[0] == 0xef && range[1] == 0xbb && range[2] == 0xbf) 543 ? range[3 .. $] : range; 544 static if (is(ElementEncodingType!R == immutable)) 545 this.range = LexerRange(cast(const(ubyte)[]) r); 546 else 547 this.range = LexerRange(cast(const(ubyte)[]) r.idup); 548 this.config = config; 549 this.cache = cache; 550 popFront(); 551 } 552 553 /// 554 public void popFront()() pure nothrow @safe 555 { 556 do 557 _popFront(); 558 while (config.whitespaceBehavior == WhitespaceBehavior.skip 559 && _front.type == tok!"whitespace"); 560 } 561 562 /** 563 * Lexer error/warning message. 564 */ 565 static struct Message 566 { 567 /// 1-based line number 568 size_t line; 569 /// 1-based byte offset 570 size_t column; 571 /// Text of the message 572 string message; 573 /// `true` for an error, `false` for a warning 574 bool isError; 575 } 576 577 /** 578 * Returns: An array of all of the warnings and errors generated so far 579 * during lexing. It may make sense to only check this when `empty` 580 * returns `true`. 
581 */ 582 const(Message[]) messages() const @property 583 { 584 return _messages; 585 } 586 587 private pure nothrow @safe: 588 589 bool isWhitespace() 590 { 591 switch (range.bytes[range.index]) 592 { 593 case ' ': 594 case '\r': 595 case '\n': 596 case '\t': 597 case '\v': 598 case '\f': 599 return true; 600 case 0xe2: 601 auto peek = range.peek(2); 602 return peek.length == 2 603 && peek[0] == 0x80 604 && (peek[1] == 0xa8 || peek[1] == 0xa9); 605 default: 606 return false; 607 } 608 } 609 610 void popFrontWhitespaceAware() 611 { 612 switch (range.bytes[range.index]) 613 { 614 case '\r': 615 range.popFront(); 616 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n') 617 { 618 range.popFront(); 619 range.incrementLine(); 620 } 621 else 622 range.incrementLine(); 623 return; 624 case '\n': 625 range.popFront(); 626 range.incrementLine(); 627 return; 628 case 0xe2: 629 auto lookahead = range.peek(3); 630 if (lookahead.length == 3 && lookahead[1] == 0x80 631 && (lookahead[2] == 0xa8 || lookahead[2] == 0xa9)) 632 { 633 range.index+=3; 634 range.column+=3; 635 range.incrementLine(); 636 return; 637 } 638 else 639 { 640 range.popFront(); 641 return; 642 } 643 default: 644 range.popFront(); 645 return; 646 } 647 } 648 649 void lexWhitespace(ref Token token) @trusted 650 { 651 mixin (tokenStart); 652 loop: do 653 { 654 version (X86_64) 655 { 656 if (haveSSE42 && range.index + 16 < range.bytes.length) 657 { 658 skip!(true, '\t', ' ', '\v', '\f')(range.bytes.ptr + range.index, 659 &range.index, &range.column); 660 } 661 } 662 switch (range.bytes[range.index]) 663 { 664 case '\r': 665 range.popFront(); 666 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n') 667 { 668 range.popFront(); 669 } 670 range.column = 1; 671 range.line += 1; 672 break; 673 case '\n': 674 range.popFront(); 675 range.column = 1; 676 range.line += 1; 677 break; 678 case ' ': 679 case '\t': 680 case '\v': 681 case '\f': 682 range.popFront(); 683 break; 
684 case 0xe2: 685 if (range.index + 2 >= range.bytes.length) 686 break loop; 687 if (range.bytes[range.index + 1] != 0x80) 688 break loop; 689 if (range.bytes[range.index + 2] == 0xa8 || range.bytes[range.index + 2] == 0xa9) 690 { 691 range.index += 3; 692 range.column += 3; 693 range.column = 1; 694 range.line += 1; 695 break; 696 } 697 break loop; 698 default: 699 break loop; 700 } 701 } while (!(range.index >= range.bytes.length)); 702 string text = config.whitespaceBehavior == WhitespaceBehavior.include 703 ? cache.intern(range.slice(mark)) : ""; 704 token = Token(tok!"whitespace", text, line, column, index); 705 } 706 707 void lexNumber(ref Token token) 708 { 709 mixin (tokenStart); 710 if (range.bytes[range.index] == '0' && range.index + 1 < range.bytes.length) 711 { 712 immutable ahead = range.bytes[range.index + 1]; 713 switch (ahead) 714 { 715 case 'x': 716 case 'X': 717 range.index += 2; 718 range.column += 2; 719 lexHex(token, mark, line, column, index); 720 return; 721 case 'b': 722 case 'B': 723 range.index += 2; 724 range.column += 2; 725 lexBinary(token, mark, line, column, index); 726 return; 727 default: 728 lexDecimal(token, mark, line, column, index); 729 return; 730 } 731 } 732 else 733 lexDecimal(token, mark, line, column, index); 734 } 735 736 void lexHex(ref Token token) 737 { 738 mixin (tokenStart); 739 lexHex(token, mark, line, column, index); 740 } 741 742 void lexHex(ref Token token, size_t mark, size_t line, size_t column, 743 size_t index) @trusted 744 { 745 IdType type = tok!"intLiteral"; 746 bool foundDot; 747 hexLoop: while (!(range.index >= range.bytes.length)) 748 { 749 switch (range.bytes[range.index]) 750 { 751 case 'a': .. case 'f': 752 case 'A': .. case 'F': 753 case '0': .. 
case '9': 754 case '_': 755 version (X86_64) 756 { 757 if (haveSSE42 && range.index + 16 < range.bytes.length) 758 { 759 immutable ulong i = rangeMatch!(false, '0', '9', 'a', 'f', 'A', 'F', '_', '_') 760 (range.bytes.ptr + range.index); 761 range.column += i; 762 range.index += i; 763 } 764 else 765 range.popFront(); 766 } 767 else 768 range.popFront(); 769 break; 770 case 'u': 771 case 'U': 772 lexIntSuffix(type); 773 break hexLoop; 774 case 'i': 775 if (foundDot) 776 lexFloatSuffix(type); 777 break hexLoop; 778 case 'L': 779 if (foundDot) 780 lexFloatSuffix(type); 781 else 782 lexIntSuffix(type); 783 break hexLoop; 784 case 'p': 785 case 'P': 786 lexExponent(type); 787 break hexLoop; 788 case '.': 789 if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.') 790 break hexLoop; 791 else 792 { 793 // The following bit of silliness tries to tell the 794 // difference between "int dot identifier" and 795 // "double identifier". 796 if (range.index + 1 < range.bytes.length) 797 { 798 switch (range.peekAt(1)) 799 { 800 case '0': .. case '9': 801 case 'A': .. case 'F': 802 case 'a': .. 
case 'f': 803 goto doubleLiteral; 804 default: 805 break hexLoop; 806 } 807 } 808 else 809 { 810 doubleLiteral: 811 range.popFront(); 812 foundDot = true; 813 type = tok!"doubleLiteral"; 814 } 815 } 816 break; 817 default: 818 break hexLoop; 819 } 820 } 821 token = Token(type, cache.intern(range.slice(mark)), line, column, 822 index); 823 } 824 825 void lexBinary(ref Token token) 826 { 827 mixin (tokenStart); 828 return lexBinary(token, mark, line, column, index); 829 } 830 831 void lexBinary(ref Token token, size_t mark, size_t line, size_t column, 832 size_t index) @trusted 833 { 834 IdType type = tok!"intLiteral"; 835 binaryLoop: while (!(range.index >= range.bytes.length)) 836 { 837 switch (range.bytes[range.index]) 838 { 839 case '0': 840 case '1': 841 case '_': 842 version (X86_64) 843 { 844 if (haveSSE42 && range.index + 16 < range.bytes.length) 845 { 846 immutable ulong i = rangeMatch!(false, '0', '1', '_', '_')( 847 range.bytes.ptr + range.index); 848 range.column += i; 849 range.index += i; 850 } 851 else 852 range.popFront(); 853 } 854 else 855 range.popFront(); 856 break; 857 case 'u': 858 case 'U': 859 case 'L': 860 lexIntSuffix(type); 861 break binaryLoop; 862 default: 863 break binaryLoop; 864 } 865 } 866 token = Token(type, cache.intern(range.slice(mark)), line, column, 867 index); 868 } 869 870 void lexDecimal(ref Token token) 871 { 872 mixin (tokenStart); 873 lexDecimal(token, mark, line, column, index); 874 } 875 876 void lexDecimal(ref Token token, size_t mark, size_t line, size_t column, 877 size_t index) @trusted 878 { 879 bool foundDot = range.bytes[range.index] == '.'; 880 IdType type = tok!"intLiteral"; 881 if (foundDot) 882 { 883 range.popFront(); 884 type = tok!"doubleLiteral"; 885 } 886 887 decimalLoop: while (!(range.index >= range.bytes.length)) 888 { 889 switch (range.bytes[range.index]) 890 { 891 case '0': .. 
case '9': 892 case '_': 893 version (X86_64) 894 { 895 if (haveSSE42 && range.index + 16 < range.bytes.length) 896 { 897 immutable ulong i = rangeMatch!(false, '0', '9', '_', '_')(range.bytes.ptr + range.index); 898 range.column += i; 899 range.index += i; 900 } 901 else 902 range.popFront(); 903 } 904 else 905 range.popFront(); 906 break; 907 case 'u': 908 case 'U': 909 if (!foundDot) 910 lexIntSuffix(type); 911 break decimalLoop; 912 case 'i': 913 lexFloatSuffix(type); 914 break decimalLoop; 915 case 'L': 916 if (foundDot) 917 lexFloatSuffix(type); 918 else 919 lexIntSuffix(type); 920 break decimalLoop; 921 case 'f': 922 case 'F': 923 lexFloatSuffix(type); 924 break decimalLoop; 925 case 'e': 926 case 'E': 927 lexExponent(type); 928 break decimalLoop; 929 case '.': 930 if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.') 931 break decimalLoop; 932 else 933 { 934 // The following bit of silliness tries to tell the 935 // difference between "int dot identifier" and 936 // "double identifier". 
937 if (range.index + 1 < range.bytes.length) 938 { 939 immutable ch = range.peekAt(1); 940 if (ch <= 0x2f 941 || (ch >= '0' && ch <= '9') 942 || (ch >= ':' && ch <= '@') 943 || (ch >= '[' && ch <= '^') 944 || (ch >= '{' && ch <= '~') 945 || ch == '`' || ch == '_') 946 { 947 goto doubleLiteral; 948 } 949 else 950 break decimalLoop; 951 } 952 else 953 { 954 doubleLiteral: 955 range.popFront(); 956 foundDot = true; 957 type = tok!"doubleLiteral"; 958 } 959 } 960 break; 961 default: 962 break decimalLoop; 963 } 964 } 965 token = Token(type, cache.intern(range.slice(mark)), line, column, 966 index); 967 } 968 969 void lexIntSuffix(ref IdType type) pure nothrow @safe 970 { 971 bool secondPass; 972 if (range.bytes[range.index] == 'u' || range.bytes[range.index] == 'U') 973 { 974 U: 975 if (type == tok!"intLiteral") 976 type = tok!"uintLiteral"; 977 else 978 type = tok!"ulongLiteral"; 979 range.popFront(); 980 if (secondPass) 981 return; 982 if (range.index < range.bytes.length 983 && (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l')) 984 goto L; 985 goto I; 986 } 987 if (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l') 988 { 989 L: 990 if (type == tok!"uintLiteral") 991 type = tok!"ulongLiteral"; 992 else 993 type = tok!"longLiteral"; 994 range.popFront(); 995 if (range.index < range.bytes.length 996 && (range.bytes[range.index] == 'U' || range.bytes[range.index] == 'u')) 997 { 998 secondPass = true; 999 goto U; 1000 } 1001 goto I; 1002 } 1003 I: 1004 if (range.index < range.bytes.length && range.bytes[range.index] == 'i') 1005 { 1006 warning("Complex number literals are deprecated"); 1007 range.popFront(); 1008 if (type == tok!"longLiteral" || type == tok!"ulongLiteral") 1009 type = tok!"idoubleLiteral"; 1010 else 1011 type = tok!"ifloatLiteral"; 1012 } 1013 } 1014 1015 void lexFloatSuffix(ref IdType type) pure nothrow @safe 1016 { 1017 switch (range.bytes[range.index]) 1018 { 1019 case 'L': 1020 range.popFront(); 1021 type = 
tok!"doubleLiteral"; 1022 break; 1023 case 'f': 1024 case 'F': 1025 range.popFront(); 1026 type = tok!"floatLiteral"; 1027 break; 1028 default: 1029 break; 1030 } 1031 if (range.index < range.bytes.length && range.bytes[range.index] == 'i') 1032 { 1033 warning("Complex number literals are deprecated"); 1034 range.popFront(); 1035 if (type == tok!"floatLiteral") 1036 type = tok!"ifloatLiteral"; 1037 else 1038 type = tok!"idoubleLiteral"; 1039 } 1040 } 1041 1042 void lexExponent(ref IdType type) pure nothrow @safe 1043 { 1044 range.popFront(); 1045 bool foundSign = false; 1046 bool foundDigit = false; 1047 while (range.index < range.bytes.length) 1048 { 1049 switch (range.bytes[range.index]) 1050 { 1051 case '-': 1052 case '+': 1053 if (foundSign) 1054 { 1055 if (!foundDigit) 1056 error("Expected an exponent"); 1057 return; 1058 } 1059 foundSign = true; 1060 range.popFront(); 1061 break; 1062 case '0': .. case '9': 1063 case '_': 1064 foundDigit = true; 1065 range.popFront(); 1066 break; 1067 case 'L': 1068 case 'f': 1069 case 'F': 1070 case 'i': 1071 lexFloatSuffix(type); 1072 return; 1073 default: 1074 if (!foundDigit) 1075 error("Expected an exponent"); 1076 return; 1077 } 1078 } 1079 } 1080 1081 void lexScriptLine(ref Token token) 1082 { 1083 mixin (tokenStart); 1084 while (!(range.index >= range.bytes.length) && !isNewline) 1085 { 1086 range.popFront(); 1087 } 1088 token = Token(tok!"scriptLine", cache.intern(range.slice(mark)), 1089 line, column, index); 1090 } 1091 1092 void lexSpecialTokenSequence(ref Token token) 1093 { 1094 mixin (tokenStart); 1095 while (!(range.index >= range.bytes.length) && !isNewline) 1096 { 1097 range.popFront(); 1098 } 1099 token = Token(tok!"specialTokenSequence", cache.intern(range.slice(mark)), 1100 line, column, index); 1101 } 1102 1103 void lexSlashStarComment(ref Token token) @trusted 1104 { 1105 mixin (tokenStart); 1106 IdType type = tok!"comment"; 1107 range.popFrontN(2); 1108 while (range.index < range.bytes.length) 1109 { 
            version (X86_64)
            {
                // SSE4.2 fast path: bulk-skip bytes that cannot terminate the
                // comment or affect line tracking (0xe2 is the first byte of
                // the UTF-8 encoding of U+2028/U+2029).
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                    skip!(false, '\r', '\n', '/', '*', 0xe2)(range.bytes.ptr + range.index,
                        &range.index, &range.column);
            }
            if (range.bytes[range.index] == '*')
            {
                range.popFront();
                if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/')
                {
                    range.popFront();
                    break;
                }
            }
            else
                popFrontWhitespaceAware();
        }
        if (config.commentBehavior == CommentBehavior.intern)
            token = Token(type, cache.intern(range.slice(mark)), line, column, index);
        else
            token = Token(type, cast(string) range.slice(mark), line, column, index);
    }

    /// Lexes a line comment ("//..."); stops before the terminating newline.
    void lexSlashSlashComment(ref Token token) @trusted
    {
        mixin (tokenStart);
        IdType type = tok!"comment";
        range.popFrontN(2);
        while (range.index < range.bytes.length)
        {
            version (X86_64)
            {
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    skip!(false, '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
                        &range.index, &range.column);
                }
            }
            if (range.bytes[range.index] == '\r' || range.bytes[range.index] == '\n')
                break;
            range.popFront();
        }
        if (config.commentBehavior == CommentBehavior.intern)
            token = Token(type, cache.intern(range.slice(mark)), line, column, index);
        else
            token = Token(type, cast(string) range.slice(mark), line, column, index);
    }

    /// Lexes a nesting comment ("/+ ... +/"), tracking nesting depth.
    void lexSlashPlusComment(ref Token token) @trusted
    {
        mixin (tokenStart);
        IdType type = tok!"comment";
        range.index += 2;
        range.column += 2;
        int depth = 1;
        while (depth > 0 && !(range.index >= range.bytes.length))
        {
            version (X86_64)
            {
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    skip!(false, '+', '/', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
                        &range.index, &range.column);
                }
            }
            if (range.bytes[range.index] == '+')
            {
                range.popFront();
                if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/')
                {
                    range.popFront();
                    depth--;
                }
            }
            else if (range.bytes[range.index] == '/')
            {
                range.popFront();
                if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '+')
                {
                    range.popFront();
                    depth++;
                }
            }
            else
                popFrontWhitespaceAware();
        }
        if (config.commentBehavior == CommentBehavior.intern)
            token = Token(type, cache.intern(range.slice(mark)), line, column, index);
        else
            token = Token(type, cast(string) range.slice(mark), line, column, index);
    }

    /// Lexes a double-quoted string literal, processing escape sequences.
    void lexStringLiteral(ref Token token) @trusted
    {
        mixin (tokenStart);
        range.popFront();
        while (true)
        {
            if (range.index >= range.bytes.length)
            {
                error("Error: unterminated string literal");
                token = Token(tok!"");
                return;
            }
            version (X86_64)
            {
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    skip!(false, '"', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
                        &range.index, &range.column);
                }
            }
            if (range.bytes[range.index] == '"')
            {
                range.popFront();
                break;
            }
            else if (range.bytes[range.index] == '\\')
            {
                if (!lexEscapeSequence())
                {
                    token = Token.init;
                    return;
                }
            }
            else
                popFrontWhitespaceAware();
        }
        IdType type = tok!"stringLiteral";
        lexStringSuffix(type);
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /// Lexes a wysiwyg string literal: either `...` or r"...". No escape
    /// sequences are processed inside these strings.
    void lexWysiwygString(ref Token token) @trusted
    {
        mixin (tokenStart);
        IdType type = tok!"stringLiteral";
        immutable bool backtick = range.bytes[range.index] == '`';
        if (backtick)
        {
            range.popFront();
            while (true)
            {
                if (range.index >= range.bytes.length)
                {
                    error("Error: unterminated string literal");
                    token = Token(tok!"");
                    return;
                }
                version (X86_64)
                {
                    if (haveSSE42 && range.index + 16 < range.bytes.length)
                    {
                        skip!(false, '\r', '\n', 0xe2, '`')(range.bytes.ptr + range.index,
                            &range.index, &range.column);
                    }
                }
                if (range.bytes[range.index] == '`')
                {
                    range.popFront();
                    break;
                }
                else
                    popFrontWhitespaceAware();
            }
        }
        else
        {
            // r"..." form: skip the 'r' prefix and the opening quote.
            range.popFront();
            if (range.index >= range.bytes.length)
            {
                error("Error: unterminated string literal");
                token = Token(tok!"");
                return;
            }
            range.popFront();
            while (true)
            {
                if (range.index >= range.bytes.length)
                {
                    error("Error: unterminated string literal");
                    token = Token(tok!"");
                    return;
                }
                else if (range.bytes[range.index] == '"')
                {
                    range.popFront();
                    break;
                }
                else
                    popFrontWhitespaceAware();
            }
        }
        lexStringSuffix(type);
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /// Consumes an optional string suffix ('c', 'w' or 'd'), adjusts the
    /// token type accordingly, and returns the suffix character (0 if none).
    private ubyte lexStringSuffix(ref IdType type) pure nothrow @safe
    {
        if (range.index >= range.bytes.length)
        {
            type = tok!"stringLiteral";
            return 0;
        }
        else
        {
            switch (range.bytes[range.index])
            {
            case 'w': range.popFront(); type = tok!"wstringLiteral"; return 'w';
            case 'd': range.popFront(); type = tok!"dstringLiteral"; return 'd';
            case 'c': range.popFront(); type = tok!"stringLiteral"; return 'c';
            default: type = tok!"stringLiteral"; return 0;
            }
        }
    }

    /// Lexes a delimited string literal (q"..."): bracket-style delimiters go
    /// through lexNormalDelimitedString, anything else is treated as a
    /// heredoc identifier delimiter.
    void lexDelimitedString(ref Token token)
    {
        mixin (tokenStart);
        range.index += 2;
        range.column += 2;
        ubyte open;
        ubyte close;
        switch (range.bytes[range.index])
        {
        case '<':
            open = '<';
            close = '>';
            range.popFront();
            lexNormalDelimitedString(token, mark, line, column, index,
open, close); 1343 break; 1344 case '{': 1345 open = '{'; 1346 close = '}'; 1347 range.popFront(); 1348 lexNormalDelimitedString(token, mark, line, column, index, open, close); 1349 break; 1350 case '[': 1351 open = '['; 1352 close = ']'; 1353 range.popFront(); 1354 lexNormalDelimitedString(token, mark, line, column, index, open, close); 1355 break; 1356 case '(': 1357 open = '('; 1358 close = ')'; 1359 range.popFront(); 1360 lexNormalDelimitedString(token, mark, line, column, index, open, close); 1361 break; 1362 default: 1363 lexHeredocString(token, mark, line, column, index); 1364 break; 1365 } 1366 } 1367 1368 void lexNormalDelimitedString(ref Token token, size_t mark, size_t line, size_t column, 1369 size_t index, ubyte open, ubyte close) 1370 { 1371 int depth = 1; 1372 while (!(range.index >= range.bytes.length) && depth > 0) 1373 { 1374 if (range.bytes[range.index] == open) 1375 { 1376 depth++; 1377 range.popFront(); 1378 } 1379 else if (range.bytes[range.index] == close) 1380 { 1381 depth--; 1382 range.popFront(); 1383 if (depth <= 0) 1384 { 1385 if (range.bytes[range.index] == '"') 1386 { 1387 range.popFront(); 1388 } 1389 else 1390 { 1391 error("Error: `\"` expected to end delimited string literal"); 1392 token = Token(tok!""); 1393 return; 1394 } 1395 } 1396 } 1397 else 1398 popFrontWhitespaceAware(); 1399 } 1400 IdType type = tok!"stringLiteral"; 1401 lexStringSuffix(type); 1402 token = Token(type, cache.intern(range.slice(mark)), line, column, index); 1403 } 1404 1405 void lexHeredocString(ref Token token, size_t mark, size_t line, size_t column, size_t index) 1406 { 1407 Token ident; 1408 lexIdentifier(ident); 1409 if (isNewline()) 1410 popFrontWhitespaceAware(); 1411 else 1412 error("Newline expected"); 1413 while (!(range.index >= range.bytes.length)) 1414 { 1415 if (isNewline()) 1416 { 1417 popFrontWhitespaceAware(); 1418 if (!range.canPeek(ident.text.length)) 1419 { 1420 error(ident.text ~ " expected"); 1421 break; 1422 } 1423 if 
                (range.peek(ident.text.length - 1) == ident.text)
                {
                    range.popFrontN(ident.text.length);
                    break;
                }
            }
            else
            {
                range.popFront();
            }
        }
        if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '"')
        {
            range.popFront();
        }
        else
            error("`\"` expected");
        IdType type = tok!"stringLiteral";
        lexStringSuffix(type);
        token = Token(type, cache.intern(range.slice(mark)), line, column, index);
    }

    /// Lexes a token string (q{ ... }) by re-lexing its contents with
    /// whitespace included and strings kept as written, tracking brace depth
    /// until the matching closing brace.
    void lexTokenString(ref Token token)
    {
        mixin (tokenStart);
        assert (range.bytes[range.index] == 'q');
        range.popFront();
        assert (range.bytes[range.index] == '{');
        range.popFront();
        auto app = appender!string();
        app.put("q{");
        int depth = 1;

        // Temporarily reconfigure the lexer so the nested tokens reproduce
        // the source text exactly; restored on scope exit.
        immutable WhitespaceBehavior oldWhitespace = config.whitespaceBehavior;
        immutable StringBehavior oldString = config.stringBehavior;
        config.whitespaceBehavior = WhitespaceBehavior.include;
        config.stringBehavior = StringBehavior.source;
        scope (exit)
        {
            config.whitespaceBehavior = oldWhitespace;
            config.stringBehavior = oldString;
        }

        advance(_front);
        while (depth > 0 && !empty)
        {
            auto t = front();
            if (t.text is null)
                app.put(str(t.type));
            else
                app.put(t.text);
            if (t.type == tok!"}")
            {
                depth--;
                if (depth > 0)
                    popFront();
            }
            else if (t.type == tok!"{")
            {
                depth++;
                popFront();
            }
            else
                popFront();
        }
        IdType type = tok!"stringLiteral";
        auto b = lexStringSuffix(type);
        if (b != 0)
            app.put(b);
        token = Token(type, cache.intern(cast(const(ubyte)[]) app.data), line,
            column, index);
    }

    /// Lexes a hex string literal (x"0A FF ..."): hex digits with optional
    /// interleaved whitespace, terminated by '"'.
    void lexHexString(ref Token token)
    {
        mixin (tokenStart);
        range.index += 2;
        range.column += 2;

        loop: while (true)
        {
            if (range.index >= range.bytes.length)
            {
                error("Error: unterminated hex string literal");
                token = Token(tok!"");
                return;
            }
            else if (isWhitespace())
                popFrontWhitespaceAware();
            else switch (range.bytes[range.index])
            {
            case '0': .. case '9':
            case 'A': .. case 'F':
            case 'a': .. case 'f':
                range.popFront();
                break;
            case '"':
                range.popFront();
                break loop;
            default:
                error("Error: invalid character in hex string");
                token = Token(tok!"");
                return;
            }
        }

        IdType type = tok!"stringLiteral";
        lexStringSuffix(type);
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /// Lexes a named character entity escape: '&' identifier ';'.
    /// Returns: false (after reporting an error) on malformed input.
    bool lexNamedEntity()
    in { assert (range.bytes[range.index] == '&'); }
    do
    {
        Token t;
        range.popFront();
        lexIdentifier(t, true);
        if (t.type != tok!"identifier" || range.empty || range.bytes[range.index] != ';')
        {
            error("Error: invalid named character entity");
            return false;
        }
        range.popFront();
        return true;
    }

    /// Lexes one escape sequence following a backslash.
    /// Returns: false (after reporting an error) on invalid input.
    bool lexEscapeSequence()
    {
        range.popFront();
        if (range.index >= range.bytes.length)
        {
            error("Error: non-terminated character escape sequence.");
            return false;
        }
        switch (range.bytes[range.index])
        {
        case '&': return lexNamedEntity();
        case '\'':
        case '"':
        case '?':
        case '\\':
        case 'a':
        case 'b':
        case 'f':
        case 'n':
        case 'r':
        case 't':
        case 'v':
            range.popFront();
            break;
        case 'x':
            // \xXX: exactly two hex digits.
            range.popFront();
            foreach (i; 0 .. 2)
            {
                if (range.index >= range.bytes.length)
                {
                    error("Error: 2 hex digits expected.");
                    return false;
                }
                switch (range.bytes[range.index])
                {
                case '0': .. case '9':
                case 'a': .. case 'f':
                case 'A': ..
                case 'F':
                    range.popFront();
                    break;
                default:
                    error("Error: 2 hex digits expected.");
                    return false;
                }
            }
            break;
        case '0':
            // A lone \0 (end of input or followed by the closing quote) is a
            // NUL escape; otherwise fall through to the octal escape case.
            if (!(range.index + 1 < range.bytes.length)
                || ((range.index + 1 < range.bytes.length) && range.peekAt(1) == '\''))
            {
                range.popFront();
                break;
            }
            goto case;
        case '1': .. case '7':
            // Octal escape: up to three octal digits.
            for (size_t i = 0; i < 3 && !(range.index >= range.bytes.length)
                && range.bytes[range.index] >= '0' && range.bytes[range.index] <= '7'; i++)
                range.popFront();
            break;
        case 'u':
            // \uXXXX: exactly four hex digits.
            range.popFront();
            foreach (i; 0 .. 4)
            {
                if (range.index >= range.bytes.length)
                {
                    error("Error: at least 4 hex digits expected.");
                    return false;
                }
                switch (range.bytes[range.index])
                {
                case '0': .. case '9':
                case 'a': .. case 'f':
                case 'A': .. case 'F':
                    range.popFront();
                    break;
                default:
                    error("Error: at least 4 hex digits expected.");
                    return false;
                }
            }
            break;
        case 'U':
            // \UXXXXXXXX: exactly eight hex digits.
            range.popFront();
            foreach (i; 0 .. 8)
            {
                if (range.index >= range.bytes.length)
                {
                    error("Error: at least 8 hex digits expected.");
                    return false;
                }
                switch (range.bytes[range.index])
                {
                case '0': .. case '9':
                case 'a': .. case 'f':
                case 'A': ..
                case 'F':
                    range.popFront();
                    break;
                default:
                    error("Error: at least 8 hex digits expected.");
                    return false;
                }
            }
            break;
        default:
            error("Invalid escape sequence");
            // Error recovery: skip ahead to a ';' terminator (or end of
            // input) so lexing can continue past the bad escape.
            while (true)
            {
                if (range.index >= range.bytes.length)
                {
                    error("Error: non-terminated character escape sequence.");
                    break;
                }
                if (range.bytes[range.index] == ';')
                {
                    range.popFront();
                    break;
                }
                else
                {
                    range.popFront();
                }
            }
            return false;
        }
        return true;
    }

    /// Lexes a character literal: an ASCII character, an escape sequence, or
    /// a multi-byte UTF-8 character between single quotes.
    void lexCharacterLiteral(ref Token token)
    {
        mixin (tokenStart);
        range.popFront();
        if (range.empty)
            goto err;
        if (range.bytes[range.index] == '\\')
            lexEscapeSequence();
        else if (range.bytes[range.index] == '\'')
        {
            // Empty literal '' — emitted as-is; the parser reports it.
            range.popFront();
            token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
                line, column, index);
        }
        else if (range.bytes[range.index] & 0x80)
        {
            // Multi-byte UTF-8 character.
            // NOTE(review): this loop has no bounds check; a truncated
            // multi-byte character at end of input could read past the
            // buffer — confirm against upstream before relying on it.
            while (range.bytes[range.index] & 0x80)
                range.popFront();
        }
        else
            popFrontWhitespaceAware();

        if (range.index < range.bytes.length && range.bytes[range.index] == '\'')
        {
            range.popFront();
            token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
                line, column, index);
        }
        else
        {
    err:
            error("Error: Expected `'` to end character literal");
            token = Token(tok!"");
        }
    }

    /// Lexes an identifier. When silent is true, no error is reported for an
    /// invalid identifier start (used when lexing named character entities).
    void lexIdentifier(ref Token token, const bool silent = false) @trusted
    {
        mixin (tokenStart);

        if (isSeparating(0))
        {
            if (silent) return;

            error("Invalid identifier");
            range.popFront();
        }
        while (true)
        {
            version (X86_64)
            {
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    // SSE4.2 fast path: count ASCII identifier characters in
                    // bulk before falling back to byte-wise checks.
                    immutable ulong i = rangeMatch!(false, 'a', 'z', 'A', 'Z', '_', '_')
                        (range.bytes.ptr + range.index);
                    range.column += i;
range.index += i; 1735 } 1736 } 1737 if (isSeparating(0)) 1738 break; 1739 else 1740 range.popFront(); 1741 } 1742 token = Token(tok!"identifier", cache.intern(range.slice(mark)), line, 1743 column, index); 1744 } 1745 1746 void lexDot(ref Token token) 1747 { 1748 mixin (tokenStart); 1749 if (!(range.index + 1 < range.bytes.length)) 1750 { 1751 range.popFront(); 1752 token = Token(tok!".", null, line, column, index); 1753 return; 1754 } 1755 switch (range.peekAt(1)) 1756 { 1757 case '0': .. case '9': 1758 lexNumber(token); 1759 return; 1760 case '.': 1761 range.popFront(); 1762 range.popFront(); 1763 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '.') 1764 { 1765 range.popFront(); 1766 token = Token(tok!"...", null, line, column, index); 1767 } 1768 else 1769 token = Token(tok!"..", null, line, column, index); 1770 return; 1771 default: 1772 range.popFront(); 1773 token = Token(tok!".", null, line, column, index); 1774 return; 1775 } 1776 } 1777 1778 void lexLongNewline(ref Token token) @nogc 1779 { 1780 mixin (tokenStart); 1781 range.popFront(); 1782 range.popFront(); 1783 range.popFront(); 1784 range.incrementLine(); 1785 string text = config.whitespaceBehavior == WhitespaceBehavior.include 1786 ? 
cache.intern(range.slice(mark)) : ""; 1787 token = Token(tok!"whitespace", text, line, 1788 column, index); 1789 } 1790 1791 bool isNewline() @nogc 1792 { 1793 if (range.bytes[range.index] == '\n') return true; 1794 if (range.bytes[range.index] == '\r') return true; 1795 return (range.bytes[range.index] & 0x80) && (range.index + 2 < range.bytes.length) 1796 && (range.peek(2) == "\u2028" || range.peek(2) == "\u2029"); 1797 } 1798 1799 bool isSeparating(size_t offset) @nogc 1800 { 1801 enum : ubyte 1802 { 1803 n, y, m // no, yes, maybe 1804 } 1805 1806 if (range.index + offset >= range.bytes.length) 1807 return true; 1808 auto c = range.bytes[range.index + offset]; 1809 static immutable ubyte[256] LOOKUP_TABLE = [ 1810 y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, 1811 y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, 1812 y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, 1813 n, n, n, n, n, n, n, n, n, n, y, y, y, y, y, y, 1814 y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, 1815 n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, n, 1816 y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, 1817 n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, y, 1818 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1819 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1820 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1821 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1822 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1823 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1824 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1825 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m 1826 ]; 1827 immutable ubyte result = LOOKUP_TABLE[c]; 1828 if (result == n) 1829 return false; 1830 if (result == y) 1831 return true; 1832 if (result == m) 1833 { 1834 auto r = range; 1835 range.popFrontN(offset); 1836 return (r.canPeek(2) && (r.peek(2) == "\u2028" 1837 || r.peek(2) == "\u2029")); 1838 } 1839 assert (false); 1840 } 1841 1842 1843 1844 enum tokenStart = q{ 1845 size_t index = range.index; 1846 size_t column = 
            range.column;
        size_t line = range.line;
        auto mark = range.mark();
    };

    /// Appends an error message at the current range position to _messages.
    void error(string message)
    {
        _messages ~= Message(range.line, range.column, message, true);
    }

    /// Appends a warning message at the current range position to _messages.
    void warning(string message)
    {
        _messages ~= Message(range.line, range.column, message, false);
        assert (_messages.length > 0);
    }

    // Diagnostics collected while lexing.
    Message[] _messages;
    // String interning cache used for token text.
    StringCache* cache;
    LexerConfig config;
    // Set when the host CPU supports SSE4.2 (enables the fast-skip paths).
    bool haveSSE42;
}

/**
 * Creates a token range from the given source code. Creates a default lexer
 * configuration and a GC-managed string cache.
 */
public auto byToken(R)(R range)
if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
{
    LexerConfig config;
    StringCache* cache = new StringCache(range.length.optimalBucketCount);
    return DLexer(range, config, cache);
}

/**
 * Creates a token range from the given source code. Uses the given string
 * cache.
 */
public auto byToken(R)(R range, StringCache* cache)
if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
{
    LexerConfig config;
    return DLexer(range, config, cache);
}

/**
 * Creates a token range from the given source code. Uses the provided lexer
 * configuration and string cache.
 */
public auto byToken(R)(R range, const LexerConfig config, StringCache* cache)
if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
{
    return DLexer(range, config, cache);
}

/**
 * Helper function used to avoid too much allocations while lexing.
 *
 * Params:
 *     size = The length in bytes of the source file.
 *
 * Returns:
 *     The optimal initial bucket count a `StringCache` should have.
 */
size_t optimalBucketCount(size_t size)
{
    import std.math : nextPow2;
    // Roughly one bucket per 32 bytes of source, rounded up to a power of
    // two and capped at 2^30.
    return nextPow2((size + 31U) / 32U).min(1U << 30U);
}
///
unittest
{
    assert(optimalBucketCount(1) == 2);
    assert(optimalBucketCount(9000 * 32) == 16384);
    static if (size_t.sizeof == ulong.sizeof)
        assert(optimalBucketCount(100_000_000_000UL) == 1 << 30);
}

/**
 * The string cache is used for string interning.
 *
 * It will only store a single copy of any string that it is asked to hold.
 * Interned strings can be compared for equality by comparing their $(B .ptr)
 * field.
 *
 * Default and postblit constructors are disabled. When a StringCache goes out
 * of scope, the memory held by it is freed.
 *
 * See_also: $(LINK http://en.wikipedia.org/wiki/String_interning)
 */
struct StringCache
{
public pure nothrow @nogc:

    @disable this();
    @disable this(this);

    /**
     * Params: bucketCount = the initial number of buckets. Must be a
     * power of two
     */
    this(size_t bucketCount) nothrow @trusted @nogc
    in
    {
        // A power of two has exactly one bit set.
        import core.bitop : popcnt;
        static if (size_t.sizeof == 8)
        {
            immutable low = popcnt(cast(uint) bucketCount);
            immutable high = popcnt(cast(uint) (bucketCount >> 32));
            assert ((low == 0 && high == 1) || (low == 1 && high == 0));
        }
        else
        {
            static assert (size_t.sizeof == 4);
            assert (popcnt(cast(uint) bucketCount) == 1);
        }
    }
    do
    {
        buckets = (cast(Node**) calloc((Node*).sizeof, bucketCount))[0 .. bucketCount];
    }

    ~this()
    {
        // Free the slab allocator chain first...
        Block* current = rootBlock;
        while (current !is null)
        {
            Block* prev = current;
            current = current.next;
            free(cast(void*) prev);
        }
        // ...then every bucket chain node, including the separately
        // malloc'ed buffers of big strings.
        foreach (nodePointer; buckets)
        {
            Node* currentNode = nodePointer;
            while (currentNode !is null)
            {
                if (currentNode.mallocated)
                    free(currentNode.str.ptr);
                Node* prev = currentNode;
                currentNode = currentNode.next;
                free(prev);
            }
        }
        rootBlock = null;
        free(buckets.ptr);
        buckets = null;
    }

    /**
     * Caches a string.
     */
    string intern(const(ubyte)[] str) @safe
    {
        if (str is null || str.length == 0)
            return "";
        return _intern(str);
    }

    /**
     * ditto
     */
    string intern(string str) @trusted
    {
        return intern(cast(ubyte[]) str);
    }

    /**
     * The default bucket count for the string cache.
     */
    static enum defaultBucketCount = 4096;

private:

    string _intern(const(ubyte)[] bytes) @trusted
    {
        immutable uint hash = hashBytes(bytes);
        immutable size_t index = hash & (buckets.length - 1);
        Node* s = find(bytes, hash);
        if (s !is null)
            return cast(string) s.str;
        // Big strings get their own malloc'ed buffer; small ones come from
        // the slab allocator.
        ubyte[] mem = void;
        bool mallocated = bytes.length > BIG_STRING;
        if (mallocated)
            mem = (cast(ubyte*) malloc(bytes.length))[0 ..
                bytes.length];
        else
            mem = allocate(bytes.length);
        mem[] = bytes[];
        Node* node = cast(Node*) malloc(Node.sizeof);
        node.str = mem;
        node.hash = hash;
        node.next = buckets[index];
        node.mallocated = mallocated;
        buckets[index] = node;
        return cast(string) mem;
    }

    /// Finds the bucket chain node holding the given string, or null.
    Node* find(const(ubyte)[] bytes, uint hash) @trusted
    {
        import std.algorithm : equal;
        immutable size_t index = hash & (buckets.length - 1);
        Node* node = buckets[index];
        while (node !is null)
        {
            if (node.hash == hash && bytes == cast(ubyte[]) node.str)
                return node;
            node = node.next;
        }
        return node;
    }

    /// 32-bit MurmurHash2 over the given bytes (constants 0x5bd1e995 / 24).
    static uint hashBytes(const(ubyte)[] data) pure nothrow @trusted @nogc
    in
    {
        assert (data !is null);
        assert (data.length > 0);
    }
    do
    {
        immutable uint m = 0x5bd1e995;
        immutable int r = 24;
        uint h = cast(uint) data.length;
        while (data.length >= 4)
        {
            uint k = (cast(ubyte) data[3]) << 24
                | (cast(ubyte) data[2]) << 16
                | (cast(ubyte) data[1]) << 8
                | (cast(ubyte) data[0]);
            k *= m;
            k ^= k >> r;
            k *= m;
            h *= m;
            h ^= k;
            data = data[4 .. $];
        }
        // Mix in the remaining 1-3 bytes.
        switch (data.length & 3)
        {
        case 3:
            h ^= data[2] << 16;
            goto case;
        case 2:
            h ^= data[1] << 8;
            goto case;
        case 1:
            h ^= data[0];
            h *= m;
            break;
        default:
            break;
        }
        h ^= h >> 13;
        h *= m;
        h ^= h >> 15;
        return h;
    }

    /// Allocates numBytes from the slab allocator. Only the first few blocks
    /// are checked for free space before a fresh block is prepended.
    ubyte[] allocate(size_t numBytes) pure nothrow @trusted @nogc
    in
    {
        assert (numBytes != 0);
    }
    out (result)
    {
        assert (result.length == numBytes);
    }
    do
    {
        Block* r = rootBlock;
        size_t i = 0;
        while (i <= 3 && r !is null)
        {
            immutable size_t available = r.bytes.length;
            immutable size_t oldUsed = r.used;
            immutable size_t newUsed = oldUsed + numBytes;
            if (newUsed <= available)
            {
                r.used = newUsed;
                return r.bytes[oldUsed .. newUsed];
            }
            i++;
            r = r.next;
        }
        Block* b = cast(Block*) calloc(Block.sizeof, 1);
        b.used = numBytes;
        b.next = rootBlock;
        rootBlock = b;
        return b.bytes[0 .. numBytes];
    }

    /// One bucket-chain entry holding a single interned string.
    static struct Node
    {
        ubyte[] str = void;
        Node* next = void;
        uint hash = void;
        bool mallocated = void;
    }

    /// A fixed-size slab providing storage for small interned strings.
    static struct Block
    {
        Block* next;
        size_t used;
        enum BLOCK_CAPACITY = BLOCK_SIZE - size_t.sizeof - (void*).sizeof;
        ubyte[BLOCK_CAPACITY] bytes;
    }

    static assert (BLOCK_SIZE == Block.sizeof);

    enum BLOCK_SIZE = 1024 * 16;

    // If a string would take up more than 1/4 of a block, allocate it outside
    // of the block.
2156 enum BIG_STRING = BLOCK_SIZE / 4; 2157 2158 Node*[] buckets; 2159 Block* rootBlock; 2160 } 2161 2162 private extern(C) void* calloc(size_t, size_t) nothrow pure @nogc @trusted; 2163 private extern(C) void* malloc(size_t) nothrow pure @nogc @trusted; 2164 private extern(C) void free(void*) nothrow pure @nogc @trusted; 2165 2166 unittest 2167 { 2168 auto source = cast(ubyte[]) q{ import std.stdio;}}; 2169 auto tokens = getTokensForParser(source, LexerConfig(), 2170 new StringCache(StringCache.defaultBucketCount)); 2171 assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".", 2172 tok!"identifier", tok!";"])); 2173 } 2174 2175 /// Test \x char sequence 2176 unittest 2177 { 2178 auto toks = (string s) => byToken(cast(ubyte[])s); 2179 2180 // valid 2181 immutable hex = ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f','A','B','C','D','E','F']; 2182 auto source = ""; 2183 foreach (h1; hex) 2184 foreach (h2; hex) 2185 source ~= "'\\x" ~ h1 ~ h2 ~ "'"; 2186 assert (toks(source).filter!(t => t.type != tok!"characterLiteral").empty); 2187 2188 // invalid 2189 assert (toks(`'\x'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true)); 2190 assert (toks(`'\x_'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true)); 2191 assert (toks(`'\xA'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true)); 2192 assert (toks(`'\xAY'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true)); 2193 assert (toks(`'\xXX'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true)); 2194 } 2195 2196 version (X86_64) 2197 { 2198 version (DigitalMars) 2199 private enum useDMDStyle = true; 2200 else version (LDC) 2201 private enum useDMDStyle = (__VERSION__ < 2092); // GDC-style supported since v1.22 2202 else 2203 private enum useDMDStyle = false; // not supported by GDC 2204 2205 private ulong pcmpestri(ubyte flags, chars...)(const ubyte* bytes) pure nothrow 2206 
@trusted @nogc if (chars.length <= 8)
{
	// Pack the template characters into one 64-bit value; it becomes the
	// PCMPESTRI "needle" loaded into XMM2.
	enum constant = ByteCombine!chars;
	enum charsLength = chars.length;

	static if (useDMDStyle)
	{
		// `naked`: no prologue/epilogue, so the argument stays in its
		// ABI-defined register for the loads below.
		asm pure nothrow @nogc
		{
			naked;
		}
		version (Windows) // `bytes` in RCX
			asm pure nothrow @nogc { movdqu XMM1, [RCX]; }
		else // `bytes` in RDI
			asm pure nothrow @nogc { movdqu XMM1, [RDI]; }
		asm pure nothrow @nogc
		{
			mov R10, constant;    // needle characters -> XMM2
			movq XMM2, R10;
			mov RAX, charsLength; // needle length (PCMPESTRI reads EAX)
			mov RDX, 16;          // haystack length (PCMPESTRI reads EDX)
			pcmpestri XMM2, XMM1, flags;
			mov RAX, RCX;         // PCMPESTRI result index comes back in ECX
			ret;
		}
	}
	else // GDC-style inline asm (GCC basically)
	{
		ulong result;
		asm pure nothrow @nogc
		{
			`movdqu %1, %%xmm1
			movq %3, %%xmm2
			pcmpestri %5, %%xmm1, %%xmm2`
			: "=c" (result) // %0: pcmpestri result in RCX, to be stored into `result`
			: "m" (*bytes), // %1: address of `bytes` string
			"d" (16), // %2: length of `bytes` head in XMM1, as pcmpestri input in EDX
			"r" (constant), // %3: max 8 `chars` to load into GP register, then XMM2
			"a" (charsLength), // %4: length in XMM2, as pcmpestri input in EAX
			"i" (flags) // %5: `flags` immediate
			: "xmm1", "xmm2"; // clobbered registers
		}
		return result;
	}
}

/**
 * Skips between 0 and 16 bytes that match (or do not match) one of the
 * given $(B chars).
 *
 * Advances $(B *pindex) and $(B *pcolumn) by the number of leading bytes
 * at $(B bytes) that do (or do not) match.
 */
void skip(bool matching, chars...)(const ubyte* bytes, ulong* pindex, ulong* pcolumn) pure nothrow
	@trusted @nogc if (chars.length <= 8)
{
	// PCMPESTRI immediate: bit 4 selects negative polarity, i.e. report
	// the first NON-matching position — used when skipping matching bytes.
	static if (matching)
		enum flags = 0b0001_0000;
	else
		enum flags = 0b0000_0000;

	const r = pcmpestri!(flags, chars)(bytes);
	*pindex += r;
	*pcolumn += r;
}

/**
 * Returns: the number of bytes starting at the given location that match
 * (or do not match if $(B invert) is true) the byte ranges in $(B chars).
2272 */ 2273 ulong rangeMatch(bool invert, chars...)(const ubyte* bytes) pure nothrow @trusted @nogc 2274 { 2275 static assert(chars.length % 2 == 0); 2276 static if (invert) 2277 enum rangeMatchFlags = 0b0000_0100; 2278 else 2279 enum rangeMatchFlags = 0b0001_0100; 2280 2281 return pcmpestri!(rangeMatchFlags, chars)(bytes); 2282 } 2283 2284 template ByteCombine(c...) 2285 { 2286 static assert (c.length <= 8); 2287 static if (c.length > 1) 2288 enum ulong ByteCombine = c[0] | (ByteCombine!(c[1..$]) << 8); 2289 else 2290 enum ulong ByteCombine = c[0]; 2291 } 2292 } 2293 2294 unittest 2295 { 2296 import core.exception : RangeError; 2297 import std.exception : assertNotThrown; 2298 2299 static immutable src1 = "/++"; 2300 static immutable src2 = "/**"; 2301 2302 LexerConfig cf; 2303 StringCache ca = StringCache(16); 2304 2305 assertNotThrown!RangeError(getTokensForParser(src1, cf, &ca)); 2306 assertNotThrown!RangeError(getTokensForParser(src2, cf, &ca)); 2307 } 2308 2309 unittest 2310 { 2311 static immutable src = `"\eeee"`; 2312 2313 LexerConfig cf; 2314 StringCache ca = StringCache(16); 2315 2316 auto l = DLexer(src, cf, &ca); 2317 assert(l.front().type == tok!""); 2318 assert(!l.messages.empty); 2319 } 2320 2321 unittest 2322 { 2323 alias Msg = DLexer.Message; 2324 LexerConfig cf; 2325 StringCache ca = StringCache(16); 2326 2327 { 2328 auto l = DLexer(`"\©"`, cf, &ca); 2329 assert(l.front().type == tok!"stringLiteral"); 2330 assert(l.messages == []); 2331 } 2332 { 2333 auto l = DLexer(`"\™\⌝"`, cf, &ca); 2334 assert(l.front().type == tok!"stringLiteral"); 2335 assert(l.messages == []); 2336 } 2337 { 2338 auto l = DLexer(`"\&trade"`, cf, &ca); 2339 assert(l.front().type == tok!""); 2340 assert(l.messages == [ Msg(1, 9, "Error: invalid named character entity", true) ]); 2341 } 2342 { 2343 auto l = DLexer(`"\™\&urcorn"`, cf, &ca); 2344 assert(l.front().type == tok!""); 2345 assert(l.messages == [ Msg(1, 18, "Error: invalid named character entity", true) ]); 2346 } 
{
	// '\&' immediately followed by the closing quote: no entity name
	auto l = DLexer(`"\&"`, cf, &ca);
	assert(l.front().type == tok!"");
	assert(l.messages == [ Msg(1, 4, "Error: invalid named character entity", true) ]);
}
{
	// entity names may not start with a digit
	auto l = DLexer(`"\&0"`, cf, &ca);
	assert(l.front().type == tok!"");
	assert(l.messages == [ Msg(1, 5, "Error: invalid named character entity", true) ]);
}
{
	// Restored from HTML-decoded glyph: entity cut off before ';' at EOF.
	auto l = DLexer(`"\&copy`, cf, &ca);
	assert(l.front().type == tok!"");
	assert(l.messages == [ Msg(1, 8, "Error: invalid named character entity", true) ]);
}
{
	// Restored from HTML-decoded glyph: the entity itself is complete,
	// but the string literal is never closed.
	auto l = DLexer(`"\&copy;`, cf, &ca);
	assert(l.front().type == tok!"");
	assert(l.messages == [ Msg(1, 9, "Error: unterminated string literal", true) ]);
}
}

// legacy code using compatibility comment and trailingComment
unittest
{
	import std.conv : to;
	import std.exception : enforce;

	static immutable src = `/// this is a module.
// mixed
/// it can do stuff
module foo.bar;

// hello

/**
 * some doc
 * hello
 */
int x; /// very nice

// TODO: do stuff
void main() {
#line 40
/// could be better
writeln(":)");
}

/// end of file`;

	LexerConfig cf;
	StringCache ca = StringCache(16);

	const tokens = getTokensForParser(src, cf, &ca);

	// Local helpers keep the expected-value table below readable.
	void assertEquals(T)(T a, T b, string what, string file = __FILE__, size_t line = __LINE__)
	{
		enforce(a == b, "Failed " ~ what ~ " '" ~ a.to!string ~ "' == '" ~ b.to!string ~ "'", file, line);
	}

	void test(size_t index, IdType type, string comment, string trailingComment,
		string file = __FILE__, size_t line = __LINE__)
	{
		assertEquals(tokens[index].type, type, "type", file, line);
		assertEquals(tokens[index].comment, comment, "comment", file, line);
		assertEquals(tokens[index].trailingComment, trailingComment, "trailingComment", file, line);
	}

	test(0,
tok!"module", "this is a module.\nit can do stuff", "");
	// leading doc comments attach to the declaration, '//' lines are skipped
	test(1, tok!"identifier", "", "");
	test(2, tok!".", "", "");
	test(3, tok!"identifier", "", "");
	test(4, tok!";", "", "");
	test(5, tok!"int", "some doc\nhello", "");
	test(6, tok!"identifier", "", "");
	// a '///' on the same line becomes the trailing comment of ';'
	test(7, tok!";", "", "very nice");
	test(8, tok!"void", "", "");
	test(9, tok!"identifier", "", "");
	test(10, tok!"(", "", "");
	test(11, tok!")", "", "");
	test(12, tok!"{", "", "");
	test(13, tok!"identifier", "could be better", "");
	test(14, tok!"(", "", "");
	test(15, tok!"stringLiteral", "", "");
	test(16, tok!")", "", "");
	test(17, tok!";", "", "");
	test(18, tok!"}", "", "");
}

// dlang-community/D-Scanner#805
// Token must be copyable into const/immutable aggregates and out of them
// into immutable locals.
unittest
{
	final class SomeExpr
	{
		Token tok;
	}

	auto e1 = new SomeExpr();
	const e2 = new SomeExpr();
	immutable e3 = new immutable SomeExpr();

	immutable t1 = e1.tok;
	immutable t2 = e2.tok;
	immutable t3 = e3.tok;
}