module dparse.lexer;

import std.typecons;
import std.typetuple;
import std.array;
import std.algorithm;
import std.range;
import std.experimental.lexer;
import std.traits;
import core.cpuid : sse42;

// `iasm64NotWindows` is defined only when 64-bit x86 inline assembly is
// available and the target is not Windows. The lexer uses it to gate the
// SSE 4.2 scanning helpers.
version (D_InlineAsm_X86_64)
{
    version (Windows)
    {
    }
    else
    {
        version = iasm64NotWindows;
    }
}

/// Operators
private enum operators = [
    ",", ".", "..", "...", "/", "/=", "!", "!<", "!<=", "!<>", "!<>=", "!=",
    "!>", "!>=", "$", "%", "%=", "&", "&&", "&=", "(", ")", "*", "*=", "+", "++",
    "+=", "-", "--", "-=", ":", ";", "<", "<<", "<<=", "<=", "<>", "<>=", "=",
    "==", "=>", ">", ">=", ">>", ">>=", ">>>", ">>>=", "?", "@", "[", "]", "^",
    "^=", "^^", "^^=", "{", "|", "|=", "||", "}", "~", "~="
];

/// Keywords
private enum keywords = [
    "abstract", "alias", "align", "asm", "assert", "auto", "bool",
    "break", "byte", "case", "cast", "catch", "cdouble", "cent", "cfloat",
    "char", "class", "const", "continue", "creal", "dchar", "debug", "default",
    "delegate", "delete", "deprecated", "do", "double", "else", "enum",
    "export", "extern", "false", "final", "finally", "float", "for", "foreach",
    "foreach_reverse", "function", "goto", "idouble", "if", "ifloat",
    "immutable", "import", "in", "inout", "int", "interface", "invariant",
    "ireal", "is", "lazy", "long", "macro", "mixin", "module", "new", "nothrow",
    "null", "out", "override", "package", "pragma", "private", "protected",
    "public", "pure", "real", "ref", "return", "scope", "shared", "short",
    "static", "struct", "super", "switch", "synchronized", "template", "this",
    "throw", "true", "try", "typedef", "typeid", "typeof", "ubyte", "ucent",
    "uint", "ulong", "union", "unittest", "ushort", "version", "void",
    "volatile", "wchar", "while", "with", "__DATE__", "__EOF__", "__FILE__",
    "__FILE_FULL_PATH__", "__FUNCTION__", "__gshared", "__LINE__", "__MODULE__",
    "__parameters", "__PRETTY_FUNCTION__", "__TIME__", "__TIMESTAMP__",
    "__traits",
    "__vector", "__VENDOR__", "__VERSION__"
];

/// Other tokens
private enum dynamicTokens = [
    "specialTokenSequence", "comment", "identifier", "scriptLine",
    "whitespace", "doubleLiteral", "floatLiteral", "idoubleLiteral",
    "ifloatLiteral", "intLiteral", "longLiteral", "realLiteral",
    "irealLiteral", "uintLiteral", "ulongLiteral", "characterLiteral",
    "dstringLiteral", "stringLiteral", "wstringLiteral"
];

// Flat list of (leading text, handler name) pairs. It is handed to the
// `Lexer` mixin together with the token tables above; each handler name
// refers to a lexing function of the lexer struct.
private enum pseudoTokenHandlers = [
    "\"", "lexStringLiteral",
    "`", "lexWysiwygString",
    "//", "lexSlashSlashComment",
    "/*", "lexSlashStarComment",
    "/+", "lexSlashPlusComment",
    ".", "lexDot",
    "'", "lexCharacterLiteral",
    "0", "lexNumber",
    "1", "lexDecimal",
    "2", "lexDecimal",
    "3", "lexDecimal",
    "4", "lexDecimal",
    "5", "lexDecimal",
    "6", "lexDecimal",
    "7", "lexDecimal",
    "8", "lexDecimal",
    "9", "lexDecimal",
    "q\"", "lexDelimitedString",
    "q{", "lexTokenString",
    "r\"", "lexWysiwygString",
    "x\"", "lexHexString",
    " ", "lexWhitespace",
    "\t", "lexWhitespace",
    "\r", "lexWhitespace",
    "\n", "lexWhitespace",
    "\v", "lexWhitespace",
    "\f", "lexWhitespace",
    "\u2028", "lexLongNewline",
    "\u2029", "lexLongNewline",
    "#!", "lexScriptLine",
    "#line", "lexSpecialTokenSequence"
];

/// Token ID type for the D lexer.
public alias IdType = TokenIdType!(operators, dynamicTokens, keywords);

/**
 * Function used for converting an IdType to a string.
 *
 * Examples:
 * ---
 * IdType c = tok!"case";
 * assert (str(c) == "case");
 * ---
 */
public alias str = tokenStringRepresentation!(IdType, operators, dynamicTokens, keywords);

/**
 * Template used to refer to D token types.
 *
 * See the $(B operators), $(B keywords), and $(B dynamicTokens) enums for
 * values that can be passed to this template.
 * Example:
 * ---
 * import dparse.lexer;
 * IdType t = tok!"floatLiteral";
 * ---
 */
public template tok(string token)
{
    alias tok = TokenId!(IdType, operators, dynamicTokens, keywords, token);
}

// Extra members mixed into the token struct: the documentation comments
// attached to a token, and ordering of tokens by their `index` field.
private enum extraFields = q{
    string comment;
    string trailingComment;

    int opCmp(size_t i) const pure nothrow @safe {
        if (index < i) return -1;
        if (index > i) return 1;
        return 0;
    }

    int opCmp(ref const typeof(this) other) const pure nothrow @safe {
        return opCmp(other.index);
    }
};

/// The token type in the D lexer
public alias Token = std.experimental.lexer.TokenStructure!(IdType, extraFields);

/**
 * Configure whitespace handling
 */
public enum WhitespaceBehavior : ubyte
{
    /// Whitespace tokens are produced by the lexer
    include = 0b0000_0000,
    /// Whitespace tokens are skipped by the lexer
    skip = 0b0000_0001,
}

/**
 * Configure string lexing behavior
 */
public enum StringBehavior : ubyte
{
    /// Do not include quote characters, process escape sequences
    compiler = 0b0000_0000,
    /// Opening quotes, closing quotes, and string suffixes are included in the
    /// string token
    includeQuoteChars = 0b0000_0001,
    /// String escape sequences are not replaced
    notEscaped = 0b0000_0010,
    /// Not modified at all. Useful for formatters or highlighters
    source = includeQuoteChars | notEscaped
}

/**
 * Configure whether the text of comment tokens is interned in the string
 * cache.
 */
public enum CommentBehavior : bool
{
    /// Comment text is passed through the string cache
    intern = true,
    /// Comment text is a slice of the source buffer
    noIntern = false
}

/**
 * Lexer configuration struct
 */
public struct LexerConfig
{
    string fileName;
    StringBehavior stringBehavior;
    WhitespaceBehavior whitespaceBehavior;
    CommentBehavior commentBehavior = CommentBehavior.intern;
}

/**
 * Basic type token types.
*/
public alias BasicTypes = AliasSeq!(tok!"int", tok!"bool", tok!"byte",
    tok!"cdouble", tok!"cent", tok!"cfloat", tok!"char", tok!"creal",
    tok!"dchar", tok!"double", tok!"float", tok!"idouble",
    tok!"ifloat", tok!"ireal", tok!"long", tok!"real", tok!"short",
    tok!"ubyte", tok!"ucent", tok!"uint", tok!"ulong", tok!"ushort",
    tok!"void", tok!"wchar");

/**
 * Returns: true if the given ID is for a basic type.
 */
public bool isBasicType(IdType type) nothrow pure @safe @nogc
{
    switch (type)
    {
    // Unrolled at compile time: one `case` label per entry of BasicTypes.
    foreach (T; BasicTypes)
    {
    case T:
        return true;
    }
    default:
        return false;
    }
}

/**
 * Number literal token types.
 */
public alias NumberLiterals = AliasSeq!(tok!"doubleLiteral",
    tok!"floatLiteral", tok!"idoubleLiteral", tok!"ifloatLiteral",
    tok!"intLiteral", tok!"longLiteral", tok!"realLiteral",
    tok!"irealLiteral", tok!"uintLiteral", tok!"ulongLiteral");

/**
 * Returns: true if the given ID type is for a number literal.
 */
public bool isNumberLiteral(IdType type) nothrow pure @safe @nogc
{
    switch (type)
    {
    // Unrolled at compile time: one `case` label per entry of NumberLiterals.
    foreach (T; NumberLiterals)
    {
    case T:
        return true;
    }
    default:
        return false;
    }
}

/**
 * Integer literal token types.
 */
public alias IntegerLiterals = AliasSeq!(tok!"intLiteral", tok!"longLiteral",
    tok!"uintLiteral", tok!"ulongLiteral");

/**
 * Returns: true if the given ID type is for an integer literal.
 */
public bool isIntegerLiteral(IdType type) nothrow pure @safe @nogc
{
    switch (type)
    {
    // Unrolled at compile time: one `case` label per entry of IntegerLiterals.
    foreach (T; IntegerLiterals)
    {
    case T:
        return true;
    }
    default:
        return false;
    }
}

/**
 * Operator token types.
*/
public alias Operators = AliasSeq!(tok!",", tok!".", tok!"..", tok!"...",
    tok!"/", tok!"/=", tok!"!", tok!"!<", tok!"!<=", tok!"!<>",
    tok!"!<>=", tok!"!=", tok!"!>", tok!"!>=", tok!"$", tok!"%",
    tok!"%=", tok!"&", tok!"&&", tok!"&=", tok!"(", tok!")",
    tok!"*", tok!"*=", tok!"+", tok!"++", tok!"+=", tok!"-",
    tok!"--", tok!"-=", tok!":", tok!";", tok!"<", tok!"<<",
    tok!"<<=", tok!"<=", tok!"<>", tok!"<>=", tok!"=", tok!"==",
    tok!"=>", tok!">", tok!">=", tok!">>", tok!">>=", tok!">>>",
    tok!">>>=", tok!"?", tok!"@", tok!"[", tok!"]", tok!"^",
    tok!"^=", tok!"^^", tok!"^^=", tok!"{", tok!"|", tok!"|=",
    tok!"||", tok!"}", tok!"~", tok!"~=");

/**
 * Returns: true if the given ID type is for an operator.
 */
public bool isOperator(IdType type) nothrow pure @safe @nogc
{
    switch (type)
    {
    // Unrolled at compile time: one `case` label per entry of Operators.
    foreach (T; Operators)
    {
    case T:
        return true;
    }
    default:
        return false;
    }
}

/**
 * Keyword token types.
*/
public alias Keywords = AliasSeq!(tok!"abstract", tok!"alias", tok!"align",
    tok!"asm", tok!"assert", tok!"auto", tok!"break",
    tok!"case", tok!"cast", tok!"catch", tok!"class", tok!"const",
    tok!"continue", tok!"debug", tok!"default", tok!"delegate",
    tok!"delete", tok!"deprecated", tok!"do", tok!"else", tok!"enum",
    tok!"export", tok!"extern", tok!"false", tok!"final", tok!"finally",
    tok!"for", tok!"foreach", tok!"foreach_reverse", tok!"function",
    tok!"goto", tok!"if", tok!"immutable", tok!"import", tok!"in",
    tok!"inout", tok!"interface", tok!"invariant", tok!"is",
    tok!"lazy", tok!"macro", tok!"mixin", tok!"module", tok!"new",
    tok!"nothrow", tok!"null", tok!"out", tok!"override", tok!"package",
    tok!"pragma", tok!"private", tok!"protected", tok!"public",
    tok!"pure", tok!"ref", tok!"return", tok!"scope", tok!"shared",
    tok!"static", tok!"struct", tok!"super", tok!"switch", tok!"synchronized",
    tok!"template", tok!"this", tok!"throw", tok!"true", tok!"try",
    tok!"typedef", tok!"typeid", tok!"typeof", tok!"union", tok!"unittest",
    tok!"version", tok!"volatile", tok!"while", tok!"with", tok!"__DATE__",
    tok!"__EOF__", tok!"__FILE__", tok!"__FILE_FULL_PATH__", tok!"__FUNCTION__",
    tok!"__gshared", tok!"__LINE__", tok!"__MODULE__", tok!"__parameters",
    tok!"__PRETTY_FUNCTION__", tok!"__TIME__", tok!"__TIMESTAMP__",
    tok!"__traits", tok!"__vector", tok!"__VENDOR__", tok!"__VERSION__");

/**
 * Returns: true if the given ID type is for a keyword.
*/
public bool isKeyword(IdType type) pure nothrow @safe @nogc
{
    switch (type)
    {
    foreach (T; Keywords)
    {
    case T:
        return true;
    }
    default:
        return false;
    }
}

/**
 * String literal token types
 */
public alias StringLiterals = AliasSeq!(tok!"dstringLiteral",
    tok!"stringLiteral", tok!"wstringLiteral");

/**
 * Returns: true if the given ID type is for a string literal.
 */
public bool isStringLiteral(IdType type) pure nothrow @safe @nogc
{
    switch (type)
    {
    foreach (T; StringLiterals)
    {
    case T:
        return true;
    }
    default:
        return false;
    }
}

/**
 * Protection token types.
 */
public alias Protections = AliasSeq!(tok!"export", tok!"package",
    tok!"private", tok!"public", tok!"protected");

/**
 * Returns: true if the given ID type is for a protection attribute.
 */
public bool isProtection(IdType type) pure nothrow @safe @nogc
{
    switch (type)
    {
    foreach (T; Protections)
    {
    case T:
        return true;
    }
    default:
        return false;
    }
}

/**
 * Special token types (`__DATE__`, `__FILE__`, `__LINE__`, ...).
 */
public alias SpecialTokens = AliasSeq!(tok!"__DATE__", tok!"__TIME__",
    tok!"__TIMESTAMP__", tok!"__VENDOR__", tok!"__VERSION__", tok!"__FILE__",
    tok!"__FILE_FULL_PATH__", tok!"__LINE__", tok!"__MODULE__",
    tok!"__FUNCTION__", tok!"__PRETTY_FUNCTION__");

/**
 * Returns: true if the given ID type is one of $(B SpecialTokens).
 */
public bool isSpecialToken(IdType type) pure nothrow @safe @nogc
{
    switch (type)
    {
    foreach (T; SpecialTokens)
    {
    case T:
        return true;
    }
    default:
        return false;
    }
}

/**
 * Literal token types: string, number and character literals, the special
 * tokens, `true`, `false`, `null` and `$`.
 */
public alias Literals = AliasSeq!(StringLiterals, NumberLiterals, tok!"characterLiteral",
    SpecialTokens, tok!"true", tok!"false", tok!"null", tok!"$");

/**
 * Returns: true if the given ID type is one of $(B Literals).
 */
public bool isLiteral(IdType type) pure nothrow @safe @nogc
{
    switch (type)
    {
    foreach (T; Literals)
    {
    case T:
        return true;
    }
    default:
        return false;
    }
}

/**
 * Lexes the given source code into an array of tokens suitable for a parser.
 *
 * Whitespace, special token sequences and non-documentation comments are
 * dropped. Documentation comments are attached to tokens: a doc comment that
 * starts on the same line as the previously emitted token is stored in that
 * token's `trailingComment`; any other doc comment is stored in the `comment`
 * field of the next token. Lexing stops at an `__EOF__` token or at the end
 * of the input.
 *
 * Params:
 *     sourceCode = the bytes of the source code
 *     config = the lexer configuration; its whitespace and comment behavior
 *         are overridden by this function
 *     cache = the string cache used for interning token and comment text
 * Returns: the lexed tokens.
 */
const(Token)[] getTokensForParser(R)(R sourceCode, LexerConfig config, StringCache* cache)
if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
{
    enum CommentType : ubyte
    {
        notDoc,
        line,
        block
    }

    // Classifies a comment by its first three characters.
    static CommentType commentType(string comment) pure nothrow @safe
    {
        if (comment.length < 3)
            return CommentType.notDoc;
        if (comment[0 ..3] == "///")
            return CommentType.line;
        if (comment[0 ..3] == "/++" || comment[0 ..3] == "/**")
            return CommentType.block;
        return CommentType.notDoc;
    }

    config.whitespaceBehavior = WhitespaceBehavior.skip;
    config.commentBehavior = CommentBehavior.noIntern;

    auto leadingCommentAppender = appender!(char[])();
    leadingCommentAppender.reserve(1024);
    auto trailingCommentAppender = appender!(char[])();
    trailingCommentAppender.reserve(1024);
    bool hadDdoc;
    string empty = cache.intern("");
    auto output = appender!(typeof(return))();
    auto lexer = DLexer(sourceCode, config, cache);
    size_t tokenCount;
    loop: while (!lexer.empty) switch (lexer.front.type)
    {
    case tok!"specialTokenSequence":
    case tok!"whitespace":
        lexer.popFront();
        break;
    case tok!"comment":
        final switch (commentType(lexer.front.text))
        {
        case CommentType.block:
        case CommentType.line:
            if (tokenCount > 0 && lexer.front.line == output.data[tokenCount - 1].line)
            {
                // Same line as the previous token: trailing documentation.
                if (!trailingCommentAppender.data.empty)
                    trailingCommentAppender.put('\n');
                unDecorateComment(lexer.front.text, trailingCommentAppender);
                hadDdoc = true;
            }
            else
            {
                // Otherwise it documents the next token.
                if (!leadingCommentAppender.data.empty)
                    leadingCommentAppender.put('\n');
                unDecorateComment(lexer.front.text, leadingCommentAppender);
                hadDdoc = true;
            }
            lexer.popFront();
            break;
        case CommentType.notDoc:
            lexer.popFront();
            break;
        }
        break;
    case tok!"__EOF__":
        // Pending trailing documentation is attached after the loop.
        break loop;
    default:
        Token t = lexer.front;
        lexer.popFront();
        tokenCount++;
        if (!output.data.empty && !trailingCommentAppender.data.empty)
        {
            (cast() output.data[$ - 1].trailingComment) =
                cache.intern(cast(string) trailingCommentAppender.data);
            hadDdoc = false;
        }
        // An empty (but non-null) comment marks "had an empty doc comment".
        t.comment = leadingCommentAppender.data.length > 0
            ? cache.intern(cast(string) leadingCommentAppender.data) : (hadDdoc ? empty : null);
        leadingCommentAppender.clear();
        trailingCommentAppender.clear();
        hadDdoc = false;
        output.put(t);
        break;
    }
    // Attach trailing documentation of the last token. This covers both the
    // `__EOF__` exit and the case where the input simply ends, which
    // previously dropped the comment.
    if (!output.data.empty && !trailingCommentAppender.data.empty)
    {
        (cast() output.data[$ - 1].trailingComment) =
            cache.intern(cast(string) trailingCommentAppender.data);
    }
    return output.data;
}

/**
 * The D lexer struct.
 */
public struct DLexer
{
    mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens,
        keywords, pseudoTokenHandlers);

    ///
    @disable this();

    /**
     * Params:
     *     range = the bytes that compose the source code that will be lexed.
     *         A leading UTF-8 byte order mark (EF BB BF) is skipped.
     *     config = the lexer configuration to use.
     *     cache = the string interning cache for de-duplicating identifiers and
     *         other token text.
     *     haveSSE42 = whether the SSE 4.2 scanning helpers may be used. They
     *         are only compiled in when `iasm64NotWindows` is defined.
     */
    this(R)(R range, const LexerConfig config, StringCache* cache,
        bool haveSSE42 = sse42()) pure nothrow @safe
    if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
    {
        this.haveSSE42 = haveSSE42;
        auto r = (range.length >= 3 && range[0] == 0xef && range[1] == 0xbb && range[2] == 0xbf)
            ? range[3 .. $] : range;
        this.range = LexerRange(cast(const(ubyte)[]) r);
        this.config = config;
        this.cache = cache;
        popFront();
    }

    /// Advances to the next token, skipping whitespace tokens when the
    /// configuration asks for it.
    public void popFront()() pure nothrow @safe
    {
        do
            _popFront();
        while (config.whitespaceBehavior == WhitespaceBehavior.skip
            && _front.type == tok!"whitespace");
    }

private pure nothrow @safe:

    /**
     * Returns: true if the current byte starts a whitespace character,
     * including the three-byte encodings of U+2028 and U+2029 (E2 80 A8/A9).
     */
    bool isWhitespace()
    {
        switch (range.bytes[range.index])
        {
        case ' ':
        case '\r':
        case '\n':
        case '\t':
        case '\v':
        case '\f':
            return true;
        case 0xe2:
            // Index the buffer directly, as lexWhitespace does, so the check
            // does not depend on how many bytes `peek` returns.
            return range.index + 2 < range.bytes.length
                && range.bytes[range.index + 1] == 0x80
                && (range.bytes[range.index + 2] == 0xa8
                    || range.bytes[range.index + 2] == 0xa9);
        default:
            return false;
        }
    }

    /**
     * Advances past one character, updating the line counter when that
     * character is a line break ("\r", "\n", "\r\n", U+2028 or U+2029).
     */
    void popFrontWhitespaceAware()
    {
        switch (range.bytes[range.index])
        {
        case '\r':
            range.popFront();
            if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n')
            {
                range.popFront();
                range.incrementLine();
            }
            else
                range.incrementLine();
            return;
        case '\n':
            range.popFront();
            range.incrementLine();
            return;
        case 0xe2:
            // E2 80 A8 / E2 80 A9 are the UTF-8 encodings of U+2028 / U+2029.
            if (range.index + 2 < range.bytes.length
                && range.bytes[range.index + 1] == 0x80
                && (range.bytes[range.index + 2] == 0xa8
                    || range.bytes[range.index + 2] == 0xa9))
            {
                range.index += 3;
                range.column += 3;
                range.incrementLine();
                return;
            }
            else
            {
                range.popFront();
                return;
            }
        default:
            range.popFront();
            return;
        }
    }

    /**
     * Lexes a run of whitespace into a single `whitespace` token. The token
     * text is only stored when whitespace tokens are included in the output.
     */
    void lexWhitespace(ref Token token) @trusted
    {
        mixin (tokenStart);
        loop: do
        {
            version (iasm64NotWindows)
            {
                // Fast path; only taken when at least 16 more bytes follow.
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    skip!(true, '\t', ' ', '\v', '\f')(range.bytes.ptr + range.index,
                        &range.index, &range.column);
                }
            }
            switch (range.bytes[range.index])
            {
            case '\r':
                range.popFront();
                if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n')
                {
                    range.popFront();
                }
                range.column = 1;
                range.line += 1;
                break;
            case '\n':
                range.popFront();
                range.column = 1;
                range.line += 1;
                break;
            case ' ':
            case '\t':
            case '\v':
            case '\f':
                range.popFront();
                break;
            case 0xe2:
                if (range.index + 2 >= range.bytes.length)
                    break loop;
                if (range.bytes[range.index + 1] != 0x80)
                    break loop;
                if (range.bytes[range.index + 2] == 0xa8 || range.bytes[range.index + 2] == 0xa9)
                {
                    // U+2028 / U+2029: a line break, so the column restarts.
                    range.index += 3;
                    range.column = 1;
                    range.line += 1;
                    break;
                }
                break loop;
            default:
                break loop;
            }
        } while (!(range.index >= range.bytes.length));
        string text = config.whitespaceBehavior == WhitespaceBehavior.include
            ? cache.intern(range.slice(mark)) : "";
        token = Token(tok!"whitespace", text, line, column, index);
    }

    /**
     * Lexes a number that starts with `0`, dispatching on the prefix to the
     * hexadecimal (`0x`), binary (`0b`) or decimal lexer.
     */
    void lexNumber(ref Token token)
    {
        mixin (tokenStart);
        if (range.bytes[range.index] == '0' && range.index + 1 < range.bytes.length)
        {
            immutable ahead = range.bytes[range.index + 1];
            switch (ahead)
            {
            case 'x':
            case 'X':
                range.index += 2;
                range.column += 2;
                lexHex(token, mark, line, column, index);
                return;
            case 'b':
            case 'B':
                range.index += 2;
                range.column += 2;
                lexBinary(token, mark, line, column, index);
                return;
            default:
                lexDecimal(token, mark, line, column, index);
                return;
            }
        }
        else
            lexDecimal(token, mark, line, column, index);
    }

    /// Lexes hexadecimal digits starting at the current position.
    void lexHex(ref Token token)
    {
        mixin (tokenStart);
        lexHex(token, mark, line, column, index);
    }

    /**
     * Lexes the digits and suffix of a hexadecimal literal whose token
     * started at `mark` (line/column/index give the token's position).
     */
    void lexHex(ref Token token, size_t mark, size_t line, size_t column,
        size_t index) @trusted
    {
        IdType type = tok!"intLiteral";
        bool foundDot;
        hexLoop: while (!(range.index >= range.bytes.length))
        {
            switch (range.bytes[range.index])
            {
            case 'a': .. case 'f':
            case 'A': .. case 'F':
            case '0': .. case '9':
            case '_':
                version (iasm64NotWindows)
                {
                    if (haveSSE42 && range.index + 16 < range.bytes.length)
                    {
                        immutable ulong i = rangeMatch!(false, '0', '9', 'a', 'f', 'A', 'F', '_', '_')
                            (range.bytes.ptr + range.index);
                        range.column += i;
                        range.index += i;
                    }
                    else
                        range.popFront();
                }
                else
                    range.popFront();
                break;
            case 'u':
            case 'U':
                lexIntSuffix(type);
                break hexLoop;
            case 'i':
                if (foundDot)
                    lexFloatSuffix(type);
                break hexLoop;
            case 'L':
                if (foundDot)
                    lexFloatSuffix(type);
                else
                    lexIntSuffix(type);
                break hexLoop;
            case 'p':
            case 'P':
                lexExponent(type);
                break hexLoop;
            case '.':
                if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.')
                    break hexLoop;
                else
                {
                    // The following bit of silliness tries to tell the
                    // difference between "int dot identifier" and
                    // "double identifier".
                    if (range.index + 1 < range.bytes.length)
                    {
                        switch (range.peekAt(1))
                        {
                        case '0': .. case '9':
                        case 'A': .. case 'F':
                        case 'a': .. case 'f':
                            goto doubleLiteral;
                        default:
                            break hexLoop;
                        }
                    }
                    else
                    {
                    doubleLiteral:
                        range.popFront();
                        foundDot = true;
                        type = tok!"doubleLiteral";
                    }
                }
                break;
            default:
                break hexLoop;
            }
        }
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /// Lexes binary digits starting at the current position.
    void lexBinary(ref Token token)
    {
        mixin (tokenStart);
        return lexBinary(token, mark, line, column, index);
    }

    /**
     * Lexes the digits and integer suffix of a binary literal whose token
     * started at `mark`.
     */
    void lexBinary(ref Token token, size_t mark, size_t line, size_t column,
        size_t index) @trusted
    {
        IdType type = tok!"intLiteral";
        binaryLoop: while (!(range.index >= range.bytes.length))
        {
            switch (range.bytes[range.index])
            {
            case '0':
            case '1':
            case '_':
                version (iasm64NotWindows)
                {
                    if (haveSSE42 && range.index + 16 < range.bytes.length)
                    {
                        immutable ulong i = rangeMatch!(false, '0', '1', '_', '_')(
                            range.bytes.ptr + range.index);
                        range.column += i;
                        range.index += i;
                    }
                    else
                        range.popFront();
                }
                else
                    range.popFront();
                break;
            case 'u':
            case 'U':
            case 'L':
                lexIntSuffix(type);
                break binaryLoop;
            default:
                break binaryLoop;
            }
        }
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /// Lexes a decimal literal starting at the current position.
    void lexDecimal(ref Token token)
    {
        mixin (tokenStart);
        lexDecimal(token, mark, line, column, index);
    }

    /**
     * Lexes the digits, optional fraction, exponent and suffix of a decimal
     * literal whose token started at `mark`. The current byte may be `.`,
     * in which case the literal is a floating point number.
     */
    void lexDecimal(ref Token token, size_t mark, size_t line, size_t column,
        size_t index) @trusted
    {
        bool foundDot = range.bytes[range.index] == '.';
        IdType type = tok!"intLiteral";
        if (foundDot)
        {
            range.popFront();
            type = tok!"doubleLiteral";
        }

        decimalLoop: while (!(range.index >= range.bytes.length))
        {
            switch (range.bytes[range.index])
            {
            case '0': .. case '9':
            case '_':
                version (iasm64NotWindows)
                {
                    if (haveSSE42 && range.index + 16 < range.bytes.length)
                    {
                        immutable ulong i = rangeMatch!(false, '0', '9', '_', '_')(range.bytes.ptr + range.index);
                        range.column += i;
                        range.index += i;
                    }
                    else
                        range.popFront();
                }
                else
                    range.popFront();
                break;
            case 'u':
            case 'U':
                if (!foundDot)
                    lexIntSuffix(type);
                break decimalLoop;
            case 'i':
                lexFloatSuffix(type);
                break decimalLoop;
            case 'L':
                if (foundDot)
                    lexFloatSuffix(type);
                else
                    lexIntSuffix(type);
                break decimalLoop;
            case 'f':
            case 'F':
                lexFloatSuffix(type);
                break decimalLoop;
            case 'e':
            case 'E':
                lexExponent(type);
                break decimalLoop;
            case '.':
                if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.')
                    break decimalLoop;
                else
                {
                    // The following bit of silliness tries to tell the
                    // difference between "int dot identifier" and
                    // "double identifier".
                    if (range.index + 1 < range.bytes.length)
                    {
                        immutable ch = range.peekAt(1);
                        if (ch <= 0x2f
                            || (ch >= '0' && ch <= '9')
                            || (ch >= ':' && ch <= '@')
                            || (ch >= '[' && ch <= '^')
                            || (ch >= '{' && ch <= '~')
                            || ch == '`' || ch == '_')
                        {
                            goto doubleLiteral;
                        }
                        else
                            break decimalLoop;
                    }
                    else
                    {
                    doubleLiteral:
                        range.popFront();
                        foundDot = true;
                        type = tok!"doubleLiteral";
                    }
                }
                break;
            default:
                break decimalLoop;
            }
        }
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /**
     * Consumes an integer suffix (`u`/`U`, `L`/`l`, optionally followed by
     * the deprecated imaginary `i`) and updates `type` accordingly.
     * The caller must ensure the range is not empty.
     */
    void lexIntSuffix(ref IdType type) pure nothrow @safe
    {
        bool secondPass;
        if (range.bytes[range.index] == 'u' || range.bytes[range.index] == 'U')
        {
    U:
            if (type == tok!"intLiteral")
                type = tok!"uintLiteral";
            else
                type = tok!"ulongLiteral";
            range.popFront();
            if (secondPass)
                return;
            if (range.index < range.bytes.length
                    && (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l'))
                goto L;
            goto I;
        }
        if (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l')
        {
    L:
            if (type == tok!"uintLiteral")
                type = tok!"ulongLiteral";
            else
                type = tok!"longLiteral";
            range.popFront();
            if (range.index < range.bytes.length
                    && (range.bytes[range.index] == 'U' || range.bytes[range.index] == 'u'))
            {
                secondPass = true;
                goto U;
            }
            goto I;
        }
    I:
        if (range.index < range.bytes.length && range.bytes[range.index] == 'i')
        {
            warning("Complex number literals are deprecated");
            range.popFront();
            if (type == tok!"longLiteral" || type == tok!"ulongLiteral")
                type = tok!"idoubleLiteral";
            else
                type = tok!"ifloatLiteral";
        }
    }

    /**
     * Consumes a floating point suffix and updates `type`: `f`/`F` gives a
     * float literal, `L` a real literal, and a following `i` the matching
     * imaginary literal type. The caller must ensure the range is not empty.
     */
    void lexFloatSuffix(ref IdType type) pure nothrow @safe
    {
        switch (range.bytes[range.index])
        {
        case 'L':
            range.popFront();
            // `L` on a floating point literal denotes `real`, not `double`.
            type = tok!"realLiteral";
            break;
        case 'f':
        case 'F':
            range.popFront();
            type = tok!"floatLiteral";
            break;
        default:
            break;
        }
        if (range.index < range.bytes.length && range.bytes[range.index] == 'i')
        {
            warning("Complex number literals are deprecated");
            range.popFront();
            if (type == tok!"floatLiteral")
                type = tok!"ifloatLiteral";
            else if (type == tok!"realLiteral")
                type = tok!"irealLiteral";
            else
                type = tok!"idoubleLiteral";
        }
    }

    /**
     * Consumes an exponent: the exponent character at the current position,
     * an optional sign, digits, and an optional floating point suffix.
     * Reports an error when no exponent digits are found before some other
     * character.
     */
    void lexExponent(ref IdType type) pure nothrow @safe
    {
        // A literal with an exponent is a floating point literal even when
        // it has no decimal point; a suffix below may refine the type.
        type = tok!"doubleLiteral";
        range.popFront();
        bool foundSign = false;
        bool foundDigit = false;
        while (range.index < range.bytes.length)
        {
            switch (range.bytes[range.index])
            {
            case '-':
            case '+':
                if (foundSign)
                {
                    if (!foundDigit)
                        error("Expected an exponent");
                    return;
                }
                foundSign = true;
                range.popFront();
                break;
            case '0': .. case '9':
            case '_':
                foundDigit = true;
                range.popFront();
                break;
            case 'L':
            case 'f':
            case 'F':
            case 'i':
                lexFloatSuffix(type);
                return;
            default:
                if (!foundDigit)
                    error("Expected an exponent");
                return;
            }
        }
    }

    /// Lexes a script line (`#!...`) up to, but not including, the newline.
    void lexScriptLine(ref Token token)
    {
        mixin (tokenStart);
        while (!(range.index >= range.bytes.length) && !isNewline)
        {
            range.popFront();
        }
        token = Token(tok!"scriptLine", cache.intern(range.slice(mark)),
            line, column, index);
    }

    /// Lexes a special token sequence (`#line ...`) up to, but not including,
    /// the newline.
    void lexSpecialTokenSequence(ref Token token)
    {
        mixin (tokenStart);
        while (!(range.index >= range.bytes.length) && !isNewline)
        {
            range.popFront();
        }
        token = Token(tok!"specialTokenSequence", cache.intern(range.slice(mark)),
            line, column, index);
    }

    /**
     * Lexes a slash-star block comment. An unterminated comment extends to
     * the end of the input. The text is interned only when the configured
     * comment behavior asks for it.
     */
    void lexSlashStarComment(ref Token token) @trusted
    {
        mixin (tokenStart);
        IdType type = tok!"comment";
        range.popFrontN(2);
        while (range.index < range.bytes.length)
        {
            version (iasm64NotWindows)
            {
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                    skip!(false, '\r', '\n', '/', '*', 0xe2)(range.bytes.ptr + range.index,
                        &range.index, &range.column);
            }
            if (range.bytes[range.index] == '*')
            {
                range.popFront();
                if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/')
                {
                    range.popFront();
                    break;
                }
            }
            else
                popFrontWhitespaceAware();
        }
        if (config.commentBehavior == CommentBehavior.intern)
            token = Token(type, cache.intern(range.slice(mark)), line, column, index);
        else
            token = Token(type, cast(string) range.slice(mark), line, column, index);
    }

    /**
     * Lexes a `//` line comment up to, but not including, the next "\r" or
     * "\n".
     */
    void lexSlashSlashComment(ref Token token) @trusted
    {
        mixin (tokenStart);
        IdType type = tok!"comment";
        range.popFrontN(2);
        while (range.index < range.bytes.length)
        {
            version (iasm64NotWindows)
            {
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    skip!(false, '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
                        &range.index, &range.column);
                }
            }
            if (range.bytes[range.index] == '\r' || range.bytes[range.index] == '\n')
                break;
            range.popFront();
        }
        if (config.commentBehavior == CommentBehavior.intern)
            token = Token(type, cache.intern(range.slice(mark)), line, column, index);
        else
            token = Token(type, cast(string) range.slice(mark), line, column, index);
    }

    /**
     * Lexes a nesting slash-plus comment, tracking the nesting depth. An
     * unterminated comment extends to the end of the input.
     */
    void lexSlashPlusComment(ref Token token) @trusted
    {
        mixin (tokenStart);
        IdType type = tok!"comment";
        range.index += 2;
        range.column += 2;
        int depth = 1;
        while (depth > 0 && !(range.index >= range.bytes.length))
        {
            version (iasm64NotWindows)
            {
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    skip!(false, '+', '/', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
                        &range.index, &range.column);
                }
            }
            if (range.bytes[range.index] == '+')
            {
                range.popFront();
                if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/')
                {
                    range.popFront();
                    depth--;
                }
            }
            else if (range.bytes[range.index] == '/')
            {
                range.popFront();
                if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '+')
                {
                    range.popFront();
                    depth++;
                }
            }
            else
                popFrontWhitespaceAware();
        }
        if (config.commentBehavior == CommentBehavior.intern)
            token = Token(type, cache.intern(range.slice(mark)), line, column, index);
        else
            token = Token(type, cast(string) range.slice(mark), line, column, index);
    }

    /**
     * Lexes a double-quoted string literal including its optional suffix.
     * Reports an error and yields an empty token if the input ends before
     * the closing quote.
     */
    void lexStringLiteral(ref Token token) @trusted
    {
        mixin (tokenStart);
        range.popFront();
        while (true)
        {
            if (range.index >= range.bytes.length)
            {
                error("Error: unterminated string literal");
                token = Token(tok!"");
                return;
            }
            version (iasm64NotWindows)
            {
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    skip!(false, '"', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
                        &range.index, &range.column);
                }
            }
            if (range.bytes[range.index] == '"')
            {
                range.popFront();
                break;
            }
            else if (range.bytes[range.index] == '\\')
            {
                lexEscapeSequence();
            }
            else
                popFrontWhitespaceAware();
        }
        IdType type = tok!"stringLiteral";
        lexStringSuffix(type);
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /**
     * Lexes a WYSIWYG string literal: either a backtick-quoted string or an
     * `r"..."` string. Reports an error and yields an empty token if the
     * input ends before the closing quote.
     */
    void lexWysiwygString(ref Token token) @trusted
    {
        mixin (tokenStart);
        IdType type = tok!"stringLiteral";
        immutable bool backtick = range.bytes[range.index] == '`';
        if (backtick)
        {
            range.popFront();
            while (true)
            {
                if (range.index >= range.bytes.length)
                {
                    error("Error: unterminated string literal");
                    token = Token(tok!"");
                    return;
                }
                version (iasm64NotWindows)
                {
                    if (haveSSE42 && range.index + 16 < range.bytes.length)
                    {
                        skip!(false, '\r', '\n', 0xe2, '`')(range.bytes.ptr + range.index,
                            &range.index, &range.column);
                    }
                }
                if (range.bytes[range.index] == '`')
                {
                    range.popFront();
                    break;
                }
                else
                    popFrontWhitespaceAware();
            }
        }
        else
        {
            // Skip the `r` prefix, then the opening quote.
            range.popFront();
            if (range.index >= range.bytes.length)
            {
                error("Error: unterminated string literal");
                token = Token(tok!"");
                return;
            }
            range.popFront();
            while (true)
            {
                if (range.index >= range.bytes.length)
                {
                    error("Error: unterminated string literal");
                    token = Token(tok!"");
                    return;
                }
                else if (range.bytes[range.index] == '"')
                {
                    range.popFront();
                    break;
                }
                else
                    popFrontWhitespaceAware();
            }
        }
        lexStringSuffix(type);
        token = Token(type, cache.intern(range.slice(mark)), line, column,
            index);
    }

    /**
     * Consumes an optional string suffix (`c`, `w` or `d`) and sets `type`
     * to the matching string literal type.
     *
     * Returns: the suffix character that was consumed, or 0 if there was none.
     */
    private ubyte lexStringSuffix(ref IdType type) pure nothrow @safe
    {
        if (range.index >= range.bytes.length)
        {
            type = tok!"stringLiteral";
            return 0;
        }
        else
        {
            switch (range.bytes[range.index])
            {
            case 'w': range.popFront(); type = tok!"wstringLiteral"; return 'w';
            case 'd': range.popFront(); type = tok!"dstringLiteral"; return 'd';
            case 'c': range.popFront(); type = tok!"stringLiteral"; return 'c';
            default: type = tok!"stringLiteral"; return 0;
            }
        }
    }

    /**
     * Lexes a delimited string (`q"..."`). Bracket-like delimiters are
     * handled by lexNormalDelimitedString, anything else is treated as a
     * heredoc identifier.
     */
    void lexDelimitedString(ref Token token)
    {
        mixin (tokenStart);
        range.index += 2;
        range.column += 2;
        // `q"` at the very end of the input: there is no delimiter to read.
        if (range.index >= range.bytes.length)
        {
            error("Error: unterminated string literal");
            token = Token(tok!"");
            return;
        }
        ubyte open;
        ubyte close;
        switch (range.bytes[range.index])
        {
        case '<':
            open = '<';
            close = '>';
            range.popFront();
            lexNormalDelimitedString(token, mark, line, column, index, open, close);
            break;
        case '{':
            open = '{';
            close = '}';
            range.popFront();
            lexNormalDelimitedString(token, mark, line, column, index, open, close);
            break;
        case '[':
            open = '[';
            close = ']';
            range.popFront();
            lexNormalDelimitedString(token, mark, line, column, index, open, close);
            break;
        case '(':
            open = '(';
            close = ')';
            range.popFront();
            lexNormalDelimitedString(token, mark, line, column, index, open, close);
            break;
        default:
            lexHeredocString(token, mark, line, column, index);
            break;
        }
    }

    /**
     * Lexes the body of a delimited string with nesting delimiters
     * `open`/`close`; the opening delimiter has already been consumed.
     * The outermost closing delimiter must be followed by `"`.
     */
    void lexNormalDelimitedString(ref Token token, size_t mark, size_t line, size_t column,
        size_t index, ubyte open, ubyte close)
    {
        int depth = 1;
        while (!(range.index >= range.bytes.length) && depth > 0)
        {
            if (range.bytes[range.index] == open)
            {
                depth++;
                range.popFront();
            }
            else if (range.bytes[range.index] == close)
            {
                depth--;
                range.popFront();
                if (depth <= 0)
                {
                    // The input may end right after the closing delimiter,
                    // so check the bounds before looking for the quote.
                    if (range.index < range.bytes.length
                        && range.bytes[range.index] == '"')
                    {
                        range.popFront();
                    }
                    else
                    {
                        error("Error: `\"` expected to end delimited string literal");
                        token = Token(tok!"");
                        return;
                    }
                }
            }
            else
                popFrontWhitespaceAware();
        }
        IdType type = tok!"stringLiteral";
        lexStringSuffix(type);
        token = Token(type, cache.intern(range.slice(mark)), line, column, index);
    }

    /**
     * Lexes a heredoc-style delimited string: an identifier, a newline, the
     * string body, then the identifier again at the start of a line followed
     * by `"`.
     */
    void lexHeredocString(ref Token token, size_t mark, size_t line, size_t column, size_t index)
    {
        Token ident;
        lexIdentifier(ident);
        if (isNewline())
            popFrontWhitespaceAware();
        else
            error("Newline expected");
        while (!(range.index >= range.bytes.length))
        {
            if (isNewline())
            {
                popFrontWhitespaceAware();
                if (!range.canPeek(ident.text.length))
                {
                    error(ident.text ~ " expected");
                    break;
                }
                // Compares the next ident.text.length bytes with the identifier.
                if (range.peek(ident.text.length - 1) == ident.text)
                {
                    range.popFrontN(ident.text.length);
                    break;
                }
            }
            else
            {
                range.popFront();
            }
        }
        if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '"')
        {
            range.popFront();
        }
        else
            error("`\"` expected");
        IdType type = tok!"stringLiteral";
        lexStringSuffix(type);
        token = Token(type, cache.intern(range.slice(mark)), line, column, index);
    }

    /**
     * Lexes a token string (`q{...}`) by lexing the tokens it contains and
     * concatenating their text until the matching `}`. Whitespace and string
     * behavior are temporarily switched so that the text is reproduced as
     * written, and restored on exit.
     */
    void lexTokenString(ref Token token)
    {
        mixin (tokenStart);
        assert (range.bytes[range.index] == 'q');
        range.popFront();
        assert (range.bytes[range.index] == '{');
        range.popFront();
        auto app = appender!string();
        app.put("q{");
        int depth = 1;

        immutable WhitespaceBehavior oldWhitespace = config.whitespaceBehavior;
        immutable StringBehavior oldString = config.stringBehavior;
        config.whitespaceBehavior = WhitespaceBehavior.include;
        config.stringBehavior = StringBehavior.source;
        scope (exit)
        {
            config.whitespaceBehavior = oldWhitespace;
            config.stringBehavior = oldString;
        }

        advance(_front);
        while (depth > 0 && !empty)
        {
            auto t = front();
            // Tokens without text (operators, keywords) are spelled via str().
            if (t.text is null)
                app.put(str(t.type));
            else
                app.put(t.text);
            if (t.type == tok!"}")
            {
                depth--;
                // The final `}` is left as the current token.
                if (depth > 0)
                    popFront();
            }
            else if (t.type == tok!"{")
            {
                depth++;
                popFront();
            }
            else
                popFront();
        }
        IdType type = tok!"stringLiteral";
        auto b = lexStringSuffix(type);
        if (b != 0)
            app.put(b);
        token = Token(type, cache.intern(cast(const(ubyte)[]) app.data), line,
            column, index);
    }

    void lexHexString(ref Token token)
    {
        mixin (tokenStart);
        range.index += 2;
        range.column += 2;

        loop: while (true)
        {
            if (range.index >= range.bytes.length)
            {
                error("Error: unterminated hex string literal");
                token = Token(tok!"");
return; 1464 } 1465 else if (isWhitespace()) 1466 popFrontWhitespaceAware(); 1467 else switch (range.bytes[range.index]) 1468 { 1469 case '0': .. case '9': 1470 case 'A': .. case 'F': 1471 case 'a': .. case 'f': 1472 range.popFront(); 1473 break; 1474 case '"': 1475 range.popFront(); 1476 break loop; 1477 default: 1478 error("Error: invalid character in hex string"); 1479 token = Token(tok!""); 1480 return; 1481 } 1482 } 1483 1484 IdType type = tok!"stringLiteral"; 1485 lexStringSuffix(type); 1486 token = Token(type, cache.intern(range.slice(mark)), line, column, 1487 index); 1488 } 1489 1490 bool lexEscapeSequence() 1491 { 1492 range.popFront(); 1493 if (range.index >= range.bytes.length) 1494 { 1495 error("Error: non-terminated character escape sequence."); 1496 return false; 1497 } 1498 switch (range.bytes[range.index]) 1499 { 1500 case '\'': 1501 case '"': 1502 case '?': 1503 case '\\': 1504 case 'a': 1505 case 'b': 1506 case 'f': 1507 case 'n': 1508 case 'r': 1509 case 't': 1510 case 'v': 1511 range.popFront(); 1512 break; 1513 case 'x': 1514 range.popFront(); 1515 foreach (i; 0 .. 2) 1516 { 1517 if (range.index >= range.bytes.length) 1518 { 1519 error("Error: 2 hex digits expected."); 1520 return false; 1521 } 1522 switch (range.bytes[range.index]) 1523 { 1524 case '0': .. case '9': 1525 case 'a': .. case 'f': 1526 case 'A': .. case 'F': 1527 range.popFront(); 1528 break; 1529 default: 1530 error("Error: 2 hex digits expected."); 1531 return false; 1532 } 1533 } 1534 break; 1535 case '0': 1536 if (!(range.index + 1 < range.bytes.length) 1537 || ((range.index + 1 < range.bytes.length) && range.peekAt(1) == '\'')) 1538 { 1539 range.popFront(); 1540 break; 1541 } 1542 goto case; 1543 case '1': .. case '7': 1544 for (size_t i = 0; i < 3 && !(range.index >= range.bytes.length) 1545 && range.bytes[range.index] >= '0' && range.bytes[range.index] <= '7'; i++) 1546 range.popFront(); 1547 break; 1548 case 'u': 1549 range.popFront(); 1550 foreach (i; 0 .. 
4) 1551 { 1552 if (range.index >= range.bytes.length) 1553 { 1554 error("Error: at least 4 hex digits expected."); 1555 return false; 1556 } 1557 switch (range.bytes[range.index]) 1558 { 1559 case '0': .. case '9': 1560 case 'a': .. case 'f': 1561 case 'A': .. case 'F': 1562 range.popFront(); 1563 break; 1564 default: 1565 error("Error: at least 4 hex digits expected."); 1566 return false; 1567 } 1568 } 1569 break; 1570 case 'U': 1571 range.popFront(); 1572 foreach (i; 0 .. 8) 1573 { 1574 if (range.index >= range.bytes.length) 1575 { 1576 error("Error: at least 8 hex digits expected."); 1577 return false; 1578 } 1579 switch (range.bytes[range.index]) 1580 { 1581 case '0': .. case '9': 1582 case 'a': .. case 'f': 1583 case 'A': .. case 'F': 1584 range.popFront(); 1585 break; 1586 default: 1587 error("Error: at least 8 hex digits expected."); 1588 return false; 1589 } 1590 } 1591 break; 1592 default: 1593 while (true) 1594 { 1595 if (range.index >= range.bytes.length) 1596 { 1597 error("Error: non-terminated character escape sequence."); 1598 return false; 1599 } 1600 if (range.bytes[range.index] == ';') 1601 { 1602 range.popFront(); 1603 break; 1604 } 1605 else 1606 { 1607 range.popFront(); 1608 } 1609 } 1610 } 1611 return true; 1612 } 1613 1614 void lexCharacterLiteral(ref Token token) 1615 { 1616 mixin (tokenStart); 1617 range.popFront(); 1618 if (range.empty) 1619 goto err; 1620 if (range.bytes[range.index] == '\\') 1621 lexEscapeSequence(); 1622 else if (range.bytes[range.index] == '\'') 1623 { 1624 range.popFront(); 1625 token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)), 1626 line, column, index); 1627 } 1628 else if (range.bytes[range.index] & 0x80) 1629 { 1630 while (range.bytes[range.index] & 0x80) 1631 range.popFront(); 1632 } 1633 else 1634 popFrontWhitespaceAware(); 1635 1636 if (range.index < range.bytes.length && range.bytes[range.index] == '\'') 1637 { 1638 range.popFront(); 1639 token = Token(tok!"characterLiteral", 
cache.intern(range.slice(mark)), 1640 line, column, index); 1641 } 1642 else 1643 { 1644 err: 1645 error("Error: Expected `'` to end character literal"); 1646 token = Token(tok!""); 1647 } 1648 } 1649 1650 void lexIdentifier(ref Token token) @trusted 1651 { 1652 mixin (tokenStart); 1653 if (isSeparating(0)) 1654 { 1655 error("Invalid identifier"); 1656 range.popFront(); 1657 } 1658 while (true) 1659 { 1660 version (iasm64NotWindows) 1661 { 1662 if (haveSSE42 && range.index + 16 < range.bytes.length) 1663 { 1664 immutable ulong i = rangeMatch!(false, 'a', 'z', 'A', 'Z', '_', '_') 1665 (range.bytes.ptr + range.index); 1666 range.column += i; 1667 range.index += i; 1668 } 1669 } 1670 if (isSeparating(0)) 1671 break; 1672 else 1673 range.popFront(); 1674 } 1675 token = Token(tok!"identifier", cache.intern(range.slice(mark)), line, 1676 column, index); 1677 } 1678 1679 void lexDot(ref Token token) 1680 { 1681 mixin (tokenStart); 1682 if (!(range.index + 1 < range.bytes.length)) 1683 { 1684 range.popFront(); 1685 token = Token(tok!".", null, line, column, index); 1686 return; 1687 } 1688 switch (range.peekAt(1)) 1689 { 1690 case '0': .. case '9': 1691 lexNumber(token); 1692 return; 1693 case '.': 1694 range.popFront(); 1695 range.popFront(); 1696 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '.') 1697 { 1698 range.popFront(); 1699 token = Token(tok!"...", null, line, column, index); 1700 } 1701 else 1702 token = Token(tok!"..", null, line, column, index); 1703 return; 1704 default: 1705 range.popFront(); 1706 token = Token(tok!".", null, line, column, index); 1707 return; 1708 } 1709 } 1710 1711 void lexLongNewline(ref Token token) @nogc 1712 { 1713 mixin (tokenStart); 1714 range.popFront(); 1715 range.popFront(); 1716 range.popFront(); 1717 range.incrementLine(); 1718 string text = config.whitespaceBehavior == WhitespaceBehavior.include 1719 ? 
cache.intern(range.slice(mark)) : ""; 1720 token = Token(tok!"whitespace", text, line, 1721 column, index); 1722 } 1723 1724 bool isNewline() @nogc 1725 { 1726 if (range.bytes[range.index] == '\n') return true; 1727 if (range.bytes[range.index] == '\r') return true; 1728 return (range.bytes[range.index] & 0x80) && (range.index + 2 < range.bytes.length) 1729 && (range.peek(2) == "\u2028" || range.peek(2) == "\u2029"); 1730 } 1731 1732 bool isSeparating(size_t offset) @nogc 1733 { 1734 enum : ubyte 1735 { 1736 n, y, m // no, yes, maybe 1737 } 1738 1739 if (range.index + offset >= range.bytes.length) 1740 return true; 1741 auto c = range.bytes[range.index + offset]; 1742 static immutable ubyte[256] LOOKUP_TABLE = [ 1743 y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, 1744 y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, 1745 y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, 1746 n, n, n, n, n, n, n, n, n, n, y, y, y, y, y, y, 1747 y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, 1748 n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, n, 1749 y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, 1750 n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, y, 1751 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1752 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1753 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1754 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1755 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1756 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1757 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, 1758 m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m 1759 ]; 1760 immutable ubyte result = LOOKUP_TABLE[c]; 1761 if (result == n) 1762 return false; 1763 if (result == y) 1764 return true; 1765 if (result == m) 1766 { 1767 auto r = range; 1768 range.popFrontN(offset); 1769 return (r.canPeek(2) && (r.peek(2) == "\u2028" 1770 || r.peek(2) == "\u2029")); 1771 } 1772 assert (false); 1773 } 1774 1775 1776 1777 enum tokenStart = q{ 1778 size_t index = range.index; 1779 size_t column = 
range.column; 1780 size_t line = range.line; 1781 auto mark = range.mark(); 1782 }; 1783 1784 void error(string message) 1785 { 1786 messages ~= Message(range.line, range.column, message, true); 1787 } 1788 1789 void warning(string message) 1790 { 1791 messages ~= Message(range.line, range.column, message, false); 1792 assert (messages.length > 0); 1793 } 1794 1795 static struct Message 1796 { 1797 size_t line; 1798 size_t column; 1799 string message; 1800 bool isError; 1801 } 1802 1803 Message[] messages; 1804 StringCache* cache; 1805 LexerConfig config; 1806 bool haveSSE42; 1807 } 1808 1809 /// copy from phobos b/c we need to build on older versions of dmd 1810 /// Returns : the next power of two from a given value 1811 private static size_t nextPow2(size_t value) 1812 { 1813 import core.bitop : bsr; 1814 return 1 << bsr(value) + 1; 1815 } 1816 1817 /** 1818 * Creates a token range from the given source code. Creates a default lexer 1819 * configuration and a GC-managed string cache. 1820 */ 1821 public auto byToken(R)(R range) 1822 if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R) 1823 { 1824 uint bc = cast(uint)((range.length > 2^^31UL) ? 2^^31 1825 : nextPow2(1 + range.length / 32)); 1826 LexerConfig config; 1827 StringCache* cache = new StringCache(bc); 1828 return DLexer(range, config, cache); 1829 } 1830 1831 /** 1832 * Creates a token range from the given source code. Uses the given string 1833 * cache. 1834 */ 1835 public auto byToken(R)(R range, StringCache* cache) 1836 if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R) 1837 { 1838 LexerConfig config; 1839 return DLexer(range, config, cache); 1840 } 1841 1842 /** 1843 * Creates a token range from the given source code. Uses the provided lexer 1844 * configuration and string cache. 
*/
public auto byToken(R)(R range, const LexerConfig config, StringCache* cache)
if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
{
    return DLexer(range, config, cache);
}

/**
 * Removes "decoration" such as leading whitespace, leading + and * characters,
 * and places the result into the given output range
 *
 * Params:
 *     comment = the complete comment text including its opening `///`, `/**`
 *         or `/++`; must be at least 3 characters long. Text that starts
 *         with anything else is written to the output range unchanged.
 *     outputRange = receives the undecorated text
 */
public void unDecorateComment(T)(string comment, auto ref T outputRange)
if (isOutputRange!(T, string))
in
{
    assert (comment.length >= 3);
}
body
{
    import std.string : lineSplitter, stripRight;

    // Moves `a` forward and `b` backward over repetitions of the comment's
    // second character (`*` or `+`), then turns `b` into an exclusive bound.
    static void adjustBeginningAndEnd(string s, ref size_t a, ref size_t b) pure nothrow @nogc @safe
    {
        immutable char c = s[1];
        while (a < b && s[a] == c) a++;
        while (b > a && s[b] == c) b--;
        b++;
    }

    string leadingChars;
    size_t i = 3;
    size_t j;
    bool hasOutput = false;
    bool lastWasBlank = false;
    switch (comment[0 .. 3])
    {
    case "///":
        foreach (line; lineSplitter(comment))
        {
            // Drop the three-character opener; a line too short to carry one
            // is treated as empty instead of slicing out of bounds.
            auto l = line.length >= 3 ? line[3 .. $] : "";
            if (leadingChars.empty)
            {
                size_t k = 0;
                while (k < l.length && (l[k] == ' ' || l[k] == '\t')) k++;
                leadingChars = l[0 .. k];
            }
            immutable string stripped = l.stripRight();
            if (hasOutput)
                outputRange.put('\n');
            else
                hasOutput = true;
            if (stripped.length >= leadingChars.length && stripped.startsWith(leadingChars))
                outputRange.put(stripped[leadingChars.length .. $]);
            else
                outputRange.put(stripped);
        }
        break;
    case "/++":
    case "/**":
        if (comment.length == 3)
        {
            comment = "";
            goto default;
        }
        j = comment.length - 2;
        // Skip beginning and ending stars and plusses
        adjustBeginningAndEnd(comment, i, j);
        foreach (line; lineSplitter(comment[i .. j]))
        {
            immutable string stripped = line.stripRight();
            if (leadingChars.empty)
            {
                // The first line with real content defines the prefix
                // (whitespace, optional `*`/`+`, whitespace) that is removed
                // from every line.
                size_t k = 0;
                while (k < line.length && (line[k] == ' ' || line[k] == '\t')) k++;
                if (k < line.length && line[k] == comment[1])
                {
                    k++;
                    while (k < line.length && (line[k] == ' ' || line[k] == '\t')) k++;
                }
                if (k == stripped.length)
                    continue;
                leadingChars = line[0 .. k];
            }

            if (stripped.startsWith(leadingChars))
            {
                if (stripped.length > leadingChars.length)
                {
                    if (hasOutput)
                        outputRange.put('\n');
                    hasOutput = true;
                    if (lastWasBlank)
                        outputRange.put('\n');
                    lastWasBlank = false;
                    outputRange.put(stripped[leadingChars.length .. $]);
                }
            }
            else if (hasOutput && stripped.length == leadingChars.stripRight().length)
                lastWasBlank = true;
            else if (!stripped.empty && !leadingChars.startsWith(stripped))
            {
                if (hasOutput)
                    outputRange.put('\n');
                hasOutput = true;
                if (lastWasBlank)
                    outputRange.put('\n');
                lastWasBlank = false;
                outputRange.put(stripped);
            }
            else
                lastWasBlank = false;
        }
        break;
    default:
        outputRange.put(comment);
        break;
    }
}

///
unittest
{
    import std.array:array, appender;
    import std.stdio:stderr;
    stderr.writeln("Running unittest for unDecorateComment...");


    string[] inputs = [
        "/***************\n*******************/",
        "/***************\n *\n ******************/",
        "/**\n*/",
        "/** */",
        "/***/",
        "/** abcde */",
        "/// abcde\n/// abcde",
        "/**\n * stuff\n */",
        "/**\n *\n * stuff\n */",
        "/**\n *\n * stuff\n *\n */",
        "/**\n *\n * stuff\n *\n*/",
        // The second line carries one extra leading space, which must
        // survive in the output.
        "/**\n * abcde\n *  abcde \n */",
        "/**\n * abcde\n *\n * abcde\n */",
    ];
    string[] outputs = [
        "",
        "",
        "",
        "",
        "",
        "abcde",
        "abcde\nabcde",
        "stuff",
        "stuff",
        "stuff",
        "stuff",
        "abcde\n abcde",
        "abcde\n\nabcde"
    ];
    assert(inputs.length == outputs.length);
    foreach (pair; zip(inputs, outputs))
    {
        foreach (b; [true, false])
        {
            auto app = appender!string();
            unDecorateComment(b ? pair[0] : pair[0].replace("*", "+"), app);
            assert(pair[1] == app.data, "[[" ~ pair[0] ~ "]] => [[" ~ app.data ~ "]]");
        }
    }
    stderr.writeln("Unittest for unDecorateComment passed.");
}


/**
 * The string cache is used for string interning.
 *
 * It will only store a single copy of any string that it is asked to hold.
 * Interned strings can be compared for equality by comparing their $(B .ptr)
 * field.
 *
 * Default and postblit constructors are disabled. When a StringCache goes out
 * of scope, the memory held by it is freed.
 *
 * See_also: $(LINK http://en.wikipedia.org/wiki/String_interning)
 */
struct StringCache
{
public pure nothrow @nogc:

    @disable this();
    @disable this(this);

    /**
     * Params: bucketCount = the initial number of buckets. Must be a
     * power of two
     */
    this(size_t bucketCount) nothrow @trusted @nogc
    in
    {
        import core.bitop : popcnt;
        static if (size_t.sizeof == 8)
        {
            immutable low = popcnt(cast(uint) bucketCount);
            immutable high = popcnt(cast(uint) (bucketCount >> 32));
            assert ((low == 0 && high == 1) || (low == 1 && high == 0));
        }
        else
        {
            static assert (size_t.sizeof == 4);
            assert (popcnt(cast(uint) bucketCount) == 1);
        }
    }
    body
    {
        buckets = (cast(Node**) calloc((Node*).sizeof, bucketCount))[0 .. bucketCount];
    }

    /// Frees all blocks, all nodes (and the separately allocated strings
    /// they own) and the bucket array.
    ~this()
    {
        Block* current = rootBlock;
        while (current !is null)
        {
            Block* prev = current;
            current = current.next;
            free(cast(void*) prev);
        }
        foreach (nodePointer; buckets)
        {
            Node* currentNode = nodePointer;
            while (currentNode !is null)
            {
                if (currentNode.mallocated)
                    free(currentNode.str.ptr);
                Node* prev = currentNode;
                currentNode = currentNode.next;
                free(prev);
            }
        }
        rootBlock = null;
        free(buckets.ptr);
        buckets = null;
    }

    /**
     * Caches a string.
     *
     * Returns: the interned copy of `str`, or `""` if `str` is null or empty.
     */
    string intern(const(ubyte)[] str) @safe
    {
        if (str is null || str.length == 0)
            return "";
        return _intern(str);
    }

    /**
     * ditto
     */
    string intern(string str) @trusted
    {
        // Reinterpret the characters as bytes without dropping constness.
        return intern(cast(const(ubyte)[]) str);
    }

    /**
     * The default bucket count for the string cache.
     */
    static enum defaultBucketCount = 4096;

private:

    /// Returns the stored copy of `bytes`, creating it first if necessary.
    /// Strings longer than BIG_STRING get their own malloc'ed buffer, all
    /// others are placed in a block.
    string _intern(const(ubyte)[] bytes) @trusted
    {
        immutable uint hash = hashBytes(bytes);
        immutable size_t index = hash & (buckets.length - 1);
        Node* s = find(bytes, hash);
        if (s !is null)
            return cast(string) s.str;
        ubyte[] mem = void;
        bool mallocated = bytes.length > BIG_STRING;
        if (mallocated)
            mem = (cast(ubyte*) malloc(bytes.length))[0 .. bytes.length];
        else
            mem = allocate(bytes.length);
        mem[] = bytes[];
        // New nodes are pushed onto the front of their bucket's chain.
        Node* node = cast(Node*) malloc(Node.sizeof);
        node.str = mem;
        node.hash = hash;
        node.next = buckets[index];
        node.mallocated = mallocated;
        buckets[index] = node;
        return cast(string) mem;
    }

    /// Returns the node holding `bytes`, or null if there is none.
    Node* find(const(ubyte)[] bytes, uint hash) @trusted
    {
        immutable size_t index = hash & (buckets.length - 1);
        Node* node = buckets[index];
        while (node !is null)
        {
            if (node.hash == hash && bytes == node.str)
                return node;
            node = node.next;
        }
        return node;
    }

    /// Computes a 32-bit hash of `data`, which must be non-null and
    /// non-empty.
    static uint hashBytes(const(ubyte)[] data) pure nothrow @trusted @nogc
    in
    {
        assert (data !is null);
        assert (data.length > 0);
    }
    body
    {
        immutable uint m = 0x5bd1e995;
        immutable int r = 24;
        uint h = cast(uint) data.length;
        // Mix four bytes at a time, assembled least significant byte first.
        while (data.length >= 4)
        {
            uint k = (cast(ubyte) data[3]) << 24
                | (cast(ubyte) data[2]) << 16
                | (cast(ubyte) data[1]) << 8
                | (cast(ubyte) data[0]);
            k *= m;
            k ^= k >> r;
            k *= m;
            h *= m;
            h ^= k;
            data = data[4 .. $];
        }
        // Mix in the remaining 0 to 3 bytes.
        switch (data.length & 3)
        {
        case 3:
            h ^= data[2] << 16;
            goto case;
        case 2:
            h ^= data[1] << 8;
            goto case;
        case 1:
            h ^= data[0];
            h *= m;
            break;
        default:
            break;
        }
        h ^= h >> 13;
        h *= m;
        h ^= h >> 15;
        return h;
    }

    /// Returns `numBytes` bytes of block storage. Only the first four blocks
    /// are searched for free space before a new block is created.
    ubyte[] allocate(size_t numBytes) pure nothrow @trusted @nogc
    in
    {
        assert (numBytes != 0);
    }
    out (result)
    {
        assert (result.length == numBytes);
    }
    body
    {
        Block* r = rootBlock;
        size_t i = 0;
        while (i <= 3 && r !is null)
        {
            immutable size_t available = r.bytes.length;
            immutable size_t oldUsed = r.used;
            immutable size_t newUsed = oldUsed + numBytes;
            if (newUsed <= available)
            {
                r.used = newUsed;
                return r.bytes[oldUsed .. newUsed];
            }
            i++;
            r = r.next;
        }
        Block* b = cast(Block*) calloc(Block.sizeof, 1);
        b.used = numBytes;
        b.next = rootBlock;
        rootBlock = b;
        return b.bytes[0 .. numBytes];
    }

    /// Entry of a bucket's singly linked chain.
    static struct Node
    {
        ubyte[] str = void;
        Node* next = void;
        uint hash = void;
        /// true if `str` was malloc'ed separately instead of living in a Block
        bool mallocated = void;
    }

    /// Fixed-size chunk of storage for small strings.
    static struct Block
    {
        Block* next;
        size_t used;
        enum BLOCK_CAPACITY = BLOCK_SIZE - size_t.sizeof - (void*).sizeof;
        ubyte[BLOCK_CAPACITY] bytes;
    }

    static assert (BLOCK_SIZE == Block.sizeof);

    enum BLOCK_SIZE = 1024 * 16;

    // If a string would take up more than 1/4 of a block, allocate it outside
    // of the block.
    enum BIG_STRING = BLOCK_SIZE / 4;

    Node*[] buckets;
    Block* rootBlock;
}

private extern(C) void* calloc(size_t, size_t) nothrow pure @nogc @trusted;
private extern(C) void* malloc(size_t) nothrow pure @nogc @trusted;
private extern(C) void free(void*) nothrow pure @nogc @trusted;

unittest
{
    auto source = cast(ubyte[]) q{ import std.stdio;}c;
    auto tokens = getTokensForParser(source, LexerConfig(),
        new StringCache(StringCache.defaultBucketCount));
    assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".",
        tok!"identifier", tok!";"]));
}

/// Test \x char sequence
unittest
{
    auto toks = (string s) => byToken(cast(ubyte[])s);

    // valid
    immutable hex = ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f','A','B','C','D','E','F'];
    auto source = "";
    foreach (h1; hex)
        foreach (h2; hex)
            source ~= "'\\x" ~ h1 ~ h2 ~ "'";
    assert (toks(source).filter!(t => t.type != tok!"characterLiteral").empty);

    // invalid
    assert (toks(`'\x'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true));
    assert (toks(`'\x_'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true));
    assert (toks(`'\xA'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true));
    assert (toks(`'\xAY'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true));
    assert (toks(`'\xXX'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true));
}

version (iasm64NotWindows)
{
    /**
     * Returns: a bit mask computed from the 16 bytes at the given address by
     * comparing them against `\r`, `\n` and the constants `0x0a0d`,
     * `0xe280a8` and `0xe280a9`; only the low 14 bits are kept.
     * NOTE(review): the exact meaning of the individual bits is not evident
     * from this function alone -- confirm before relying on it.
     */
    ushort newlineMask(const ubyte*) pure nothrow @trusted @nogc
    {
        asm pure nothrow @nogc
        {
            naked;
            movdqu XMM1, [RDI];
            mov RAX, 3;
            mov RDX, 16;
            mov R8, 0x0d0d0d0d0d0d0d0dL;
            movq XMM2, R8;
            shufpd XMM2, XMM2, 0;
            pcmpeqb XMM2, XMM1;
            mov R9, 0x0a0a0a0a0a0a0a0aL;
            movq XMM3, R9;
            shufpd XMM3, XMM3, 0;
            pcmpeqb XMM3, XMM1;
            mov R10, 0xe280a8L;
            movq XMM4, R10;
            pcmpestrm XMM4, XMM1, 0b01001100;
            movdqa XMM4, XMM0;
            mov R11, 0xe280a9L;
            movq XMM5, R11;
            pcmpestrm XMM5, XMM1, 0b01001100;
            movdqa XMM5, XMM0;
            mov RCX, 0x0a0d;
            dec RAX;
            movq XMM6, RCX;
            pcmpestrm XMM6, XMM1, 0b01001100;
            movdqa XMM6, XMM0;
            movdqa XMM7, XMM6;
            pslldq XMM7, 1;
            movdqa XMM0, XMM4;
            por XMM0, XMM5;
            por XMM7, XMM6;
            movdqa XMM1, XMM2;
            por XMM1, XMM3;
            pxor XMM7, XMM1;
            por XMM7, XMM0;
            por XMM7, XMM6;
            pmovmskb RAX, XMM7;
            and RAX, 0b0011_1111_1111_1111;
            ret;
        }
    }

    /**
     * Skips between 0 and 16 bytes that match (or do not match) one of the
     * given $(B chars).
     *
     * The first argument is the address of the bytes to examine (16 bytes are
     * loaded from it); the number of skipped bytes is added to the two
     * counters the other arguments point to.
     */
    void skip(bool matching, chars...)(const ubyte*, ulong*, ulong*) pure nothrow
        @trusted @nogc if (chars.length <= 8)
    {
        enum constant = ByteCombine!chars;
        enum charsLength = chars.length;
        static if (matching)
            enum flags = 0b0001_0000;
        else
            enum flags = 0b0000_0000;
        asm pure nothrow @nogc
        {
            naked;
            movdqu XMM1, [RDX];
            mov R10, constant;
            movq XMM2, R10;
            mov RAX, charsLength;
            mov RDX, 16;
            pcmpestri XMM2, XMM1, flags;
            add [RSI], RCX;
            add [RDI], RCX;
            ret;
        }
    }

    /**
     * Returns: the number of bytes starting at the given location that match
     * (or do not match if $(B invert) is true) the byte ranges in $(B chars).
     */
    ulong rangeMatch(bool invert, chars...)(const ubyte*) pure nothrow @trusted @nogc
    {
        // chars is a list of (low, high) pairs.
        static assert (chars.length % 2 == 0);
        enum constant = ByteCombine!chars;
        static if (invert)
            enum rangeMatchFlags = 0b0000_0100;
        else
            enum rangeMatchFlags = 0b0001_0100;
        enum charsLength = chars.length;
        asm pure nothrow @nogc
        {
            naked;
            movdqu XMM1, [RDI];
            mov R10, constant;
            movq XMM2, R10;
            mov RAX, charsLength;
            mov RDX, 16;
            pcmpestri XMM2, XMM1, rangeMatchFlags;
            mov RAX, RCX;
            ret;
        }
    }

    /// Packs up to 8 byte values into one ulong, the first value in the
    /// least significant byte.
    template ByteCombine(c...)
    {
        static assert (c.length <= 8);
        static if (c.length > 1)
            enum ulong ByteCombine = c[0] | (ByteCombine!(c[1..$]) << 8);
        else
            enum ulong ByteCombine = c[0];
    }
}

// Lexing an unterminated block comment opener must not throw a RangeError.
unittest
{
    import core.exception : RangeError;
    import std.exception : assertNotThrown;

    static immutable src1 = "/++";
    static immutable src2 = "/**";

    LexerConfig cf;
    StringCache ca = StringCache(16);

    assertNotThrown!RangeError(getTokensForParser(src1, cf, &ca));
    assertNotThrown!RangeError(getTokensForParser(src2, cf, &ca));
}