dparse.lexer source code

1 module dparse.lexer;
2 
3 import std.typecons;
4 import std.typetuple;
5 import std.array;
6 import std.algorithm;
7 import std.range;
8 import std.experimental.lexer;
9 import std.traits;
10 import core.cpuid : sse42;
11 version (D_InlineAsm_X86_64)
12 {
13     version (Windows) {}
14     else version = iasm64NotWindows;
15 }
16 
17 public import dparse.trivia;
18 
/// Operator and punctuation tokens. This list is handed to `TokenIdType`,
/// `tokenStringRepresentation`, `TokenId` and the `Lexer` mixin, so the
/// order of the entries is kept exactly as it was.
private enum operators = [
    ",", ".", "..", "...",
    "/", "/=",
    "!", "!<", "!<=", "!<>", "!<>=", "!=", "!>", "!>=",
    "$", "%", "%=",
    "&", "&&", "&=",
    "(", ")",
    "*", "*=",
    "+", "++", "+=",
    "-", "--", "-=",
    ":", ";",
    "<", "<<", "<<=", "<=", "<>", "<>=",
    "=", "==", "=>",
    ">", ">=", ">>", ">>=", ">>>", ">>>=",
    "?", "@", "[", "]",
    "^", "^=", "^^", "^^=",
    "{", "|", "|=", "||", "}",
    "~", "~="
];
27 
/// Keywords (including the basic type names and the `__XXX__` special
/// keywords). The order of the entries is kept exactly as it was.
private enum keywords = [
    "abstract", "alias", "align", "asm", "assert", "auto",
    "bool", "break", "byte",
    "case", "cast", "catch", "cdouble", "cent", "cfloat", "char", "class",
    "const", "continue", "creal",
    "dchar", "debug", "default", "delegate", "delete", "deprecated", "do",
    "double",
    "else", "enum", "export", "extern",
    "false", "final", "finally", "float", "for", "foreach", "foreach_reverse",
    "function",
    "goto",
    "idouble", "if", "ifloat", "immutable", "import", "in", "inout", "int",
    "interface", "invariant", "ireal", "is",
    "lazy", "long",
    "macro", "mixin", "module",
    "new", "nothrow", "null",
    "out", "override",
    "package", "pragma", "private", "protected", "public", "pure",
    "real", "ref", "return",
    "scope", "shared", "short", "static", "struct", "super", "switch",
    "synchronized",
    "template", "this", "throw", "true", "try", "typedef", "typeid", "typeof",
    "ubyte", "ucent", "uint", "ulong", "union", "unittest", "ushort",
    "version", "void",
    "wchar", "while", "with",
    "__DATE__", "__EOF__", "__FILE__", "__FILE_FULL_PATH__", "__FUNCTION__",
    "__gshared", "__LINE__", "__MODULE__", "__parameters",
    "__PRETTY_FUNCTION__", "__TIME__", "__TIMESTAMP__", "__traits",
    "__vector", "__VENDOR__", "__VERSION__"
];
48 
/// Names of the token kinds that have no fixed spelling (literals,
/// identifiers, comments, whitespace, ...).
private enum dynamicTokens = [
    "specialTokenSequence", "comment", "identifier", "scriptLine", "whitespace",
    "doubleLiteral", "floatLiteral", "idoubleLiteral", "ifloatLiteral",
    "intLiteral", "longLiteral", "realLiteral", "irealLiteral",
    "uintLiteral", "ulongLiteral",
    "characterLiteral",
    "dstringLiteral", "stringLiteral", "wstringLiteral"
];
57 
/// Flat list of pairs: a source prefix followed by the name of the lexing
/// function responsible for tokens starting with that prefix. The list is
/// passed to the `Lexer` mixin inside `DLexer`.
private enum pseudoTokenHandlers = [
    "\"", "lexStringLiteral",
    "`", "lexWysiwygString",
    "//", "lexSlashSlashComment",
    "/*", "lexSlashStarComment",
    "/+", "lexSlashPlusComment",
    ".", "lexDot",
    "'", "lexCharacterLiteral",
    "0", "lexNumber",
    "1", "lexDecimal", "2", "lexDecimal", "3", "lexDecimal",
    "4", "lexDecimal", "5", "lexDecimal", "6", "lexDecimal",
    "7", "lexDecimal", "8", "lexDecimal", "9", "lexDecimal",
    "q\"", "lexDelimitedString",
    "q{", "lexTokenString",
    "r\"", "lexWysiwygString",
    "x\"", "lexHexString",
    " ", "lexWhitespace", "\t", "lexWhitespace",
    "\r", "lexWhitespace", "\n", "lexWhitespace",
    "\v", "lexWhitespace", "\f", "lexWhitespace",
    "\u2028", "lexLongNewline",
    "\u2029", "lexLongNewline",
    "#!", "lexScriptLine",
    "#line", "lexSpecialTokenSequence"
];
91 
/// Token ID type for the D lexer.
public alias IdType = TokenIdType!(operators, dynamicTokens, keywords);

/**
 * Converts an `IdType` to its string representation.
 *
 * Examples:
 * ---
 * IdType c = tok!"case";
 * assert (str(c) == "case");
 * ---
 */
public alias str = tokenStringRepresentation!(IdType, operators, dynamicTokens, keywords);

/**
 * Yields the token ID that belongs to the given token spelling or name.
 *
 * Accepted arguments are the entries of the $(B operators), $(B keywords)
 * and $(B dynamicTokens) lists.
 * Example:
 * ---
 * import dparse.lexer;
 * IdType t = tok!"floatLiteral";
 * ---
 */
public alias tok(string token) = TokenId!(IdType, operators, dynamicTokens, keywords, token);
121 
mixin template TokenTriviaFields()
{
    /**
     * Whitespace and comment tokens attached to this token.
     *
     * Every trivia token must carry its exact source text in its `.text`
     * property, so that mapping all trivia tokens to `.text` and joining the
     * results reproduces the source code without any loss of information.
     *
     * Trivia is only filled in by getTokensForParser. Iterating over DLexer
     * directly yields all tokens in their raw form; none of them is turned
     * into trivia.
     *
     * Note: in the future you might need to explicitly pass
     * WhitespaceBehavior.include (or keep the default) as getTokensForParser
     * currently overrides it to include.
     *
     * Contains: `comment`, `whitespace`, `specialTokenSequence`
     */
    immutable(typeof(this))[] leadingTrivia;
    /// ditto
    immutable(typeof(this))[] trailingTrivia;

    /// Caches for `comment` and `trailingComment`; `null` means "not
    /// computed yet".
    string memoizedLeadingComment = null;
    /// ditto
    string memoizedTrailingComment = null;

    /// Legacy property to get documentation comments, with comment border
    /// stripped off, which is attached to this token.
    string comment() const pure nothrow @safe @property {
        import dparse.trivia : extractLeadingDdoc;
        // Compute on first use; the cast strips `const` so the result can be
        // stored in the cache field from this const method.
        if (memoizedLeadingComment is null)
            return (cast()memoizedLeadingComment) = this.extractLeadingDdoc;
        return memoizedLeadingComment;
    }

    /// ditto
    string trailingComment() const pure nothrow @safe @property {
        import dparse.trivia : extractTrailingDdoc;
        if (memoizedTrailingComment is null)
            return (cast()memoizedTrailingComment) = this.extractTrailingDdoc;
        return memoizedTrailingComment;
    }

    /// Orders this token relative to the position `i` by comparing it with
    /// the token's `index` member.
    int opCmp(size_t i) const pure nothrow @safe @nogc {
        return index < i ? -1 : (index > i ? 1 : 0);
    }

    /// Orders two tokens by their `index` members.
    int opCmp(ref const typeof(this) other) const pure nothrow @safe @nogc {
        return opCmp(other.index);
    }
}
176 
// The trivia fields are mixed in from dparse.lexer through this short string
// to keep error messages at a manageable size: the entire string is dumped
// whenever there is a type mismatch.
private enum extraFields = "import dparse.lexer:TokenTriviaFields; "
    ~ "mixin TokenTriviaFields;";

/// The token type in the D lexer
public alias Token = std.experimental.lexer.TokenStructure!(IdType, extraFields);
183 
/**
 * Configure whitespace handling
 */
public enum WhitespaceBehavior : ubyte
{
    /// Whitespace tokens are produced by the lexer (the default value).
    include = 0,
    /// Whitespace tokens are dropped while iterating.
    skip = 1,
}
192 
/// Deprecation message shared by the legacy `StringBehavior` values.
private enum stringBehaviorNotWorking =
    "Automatic string parsing is not supported and was previously not working. "
    ~ "To unescape strings use the `dparse.strings : unescapeString` function "
    ~ "on the token texts instead.";

/**
 * Configure string lexing behavior
 */
// was enum, but struct now for deprecations and support with old compilers
public struct StringBehavior
{
    /// Raw flag value; the struct implicitly converts to it.
    ubyte behavior;
    alias behavior this;

    /// Do not include quote characters, process escape sequences
    deprecated(stringBehaviorNotWorking)
    static immutable StringBehavior compiler = StringBehavior(0b00);

    /// Opening quotes, closing quotes, and string suffixes are included in
    /// the string token
    deprecated(stringBehaviorNotWorking)
    static immutable StringBehavior includeQuoteChars = StringBehavior(0b01);

    /// String escape sequences are not replaced
    deprecated(stringBehaviorNotWorking)
    static immutable StringBehavior notEscaped = StringBehavior(0b10);

    /// Not modified at all. Useful for formatters or highlighters
    static immutable StringBehavior source = StringBehavior(0b11);
}
216 
/**
 * Configure comment handling.
 *
 * NOTE(review): judging by the names this selects whether comment text goes
 * through the string cache; the code that reads the flag is not in this part
 * of the file - confirm there.
 */
public enum CommentBehavior : bool
{
    intern = true,
    noIntern = false,
}
/**
 * Lexer configuration struct
 */
public struct LexerConfig
{
    /// Name of the file being lexed.
    string fileName;
    /// String lexing behavior, see `StringBehavior`.
    StringBehavior stringBehavior;
    /// Whitespace handling; defaults to `WhitespaceBehavior.include`.
    WhitespaceBehavior whitespaceBehavior;
    /// Comment handling; defaults to `CommentBehavior.intern`.
    CommentBehavior commentBehavior = CommentBehavior.intern;
}
232 
/**
 * Basic type token types.
 */
public alias BasicTypes = AliasSeq!(
    tok!"int", tok!"bool", tok!"byte", tok!"cdouble", tok!"cent", tok!"cfloat",
    tok!"char", tok!"creal", tok!"dchar", tok!"double", tok!"float",
    tok!"idouble", tok!"ifloat", tok!"ireal", tok!"long", tok!"real",
    tok!"short", tok!"ubyte", tok!"ucent", tok!"uint", tok!"ulong",
    tok!"ushort", tok!"void", tok!"wchar");

/**
 * Returns: true if the given ID is for a basic type.
 */
public bool isBasicType(IdType type) nothrow pure @safe @nogc
{
    // `among` gives the 1-based position of `type` within the list, or 0
    // when it is not part of it.
    return type.among!(BasicTypes) != 0;
}
259 
/**
 * Number literal token types.
 */
public alias NumberLiterals = AliasSeq!(
    tok!"doubleLiteral", tok!"floatLiteral",
    tok!"idoubleLiteral", tok!"ifloatLiteral",
    tok!"intLiteral", tok!"longLiteral",
    tok!"realLiteral", tok!"irealLiteral",
    tok!"uintLiteral", tok!"ulongLiteral");

/**
 * Returns: true if the given ID type is for a number literal.
 */
public bool isNumberLiteral(IdType type) nothrow pure @safe @nogc
{
    // `among` gives the 1-based position of `type` within the list, or 0
    // when it is not part of it.
    return type.among!(NumberLiterals) != 0;
}
284 
/**
 * Integer literal token types.
 */
public alias IntegerLiterals = AliasSeq!(
    tok!"intLiteral", tok!"longLiteral", tok!"uintLiteral", tok!"ulongLiteral");

/**
 * Returns: true if the given ID type is for an integer literal.
 */
public bool isIntegerLiteral(IdType type) nothrow pure @safe @nogc
{
    // `among` gives the 1-based position of `type` within the list, or 0
    // when it is not part of it.
    return type.among!(IntegerLiterals) != 0;
}
307 
/**
 * Operator token types.
 */
public alias Operators = AliasSeq!(
    tok!",", tok!".", tok!"..", tok!"...",
    tok!"/", tok!"/=",
    tok!"!", tok!"!<", tok!"!<=", tok!"!<>", tok!"!<>=", tok!"!=", tok!"!>",
    tok!"!>=",
    tok!"$", tok!"%", tok!"%=",
    tok!"&", tok!"&&", tok!"&=",
    tok!"(", tok!")",
    tok!"*", tok!"*=",
    tok!"+", tok!"++", tok!"+=",
    tok!"-", tok!"--", tok!"-=",
    tok!":", tok!";",
    tok!"<", tok!"<<", tok!"<<=", tok!"<=", tok!"<>", tok!"<>=",
    tok!"=", tok!"==", tok!"=>",
    tok!">", tok!">=", tok!">>", tok!">>=", tok!">>>", tok!">>>=",
    tok!"?", tok!"@", tok!"[", tok!"]",
    tok!"^", tok!"^=", tok!"^^", tok!"^^=",
    tok!"{", tok!"|", tok!"|=", tok!"||", tok!"}",
    tok!"~", tok!"~=");

/**
 * Returns: true if the given ID type is for an operator.
 */
public bool isOperator(IdType type) nothrow pure @safe @nogc
{
    // `among` gives the 1-based position of `type` within the list, or 0
    // when it is not part of it.
    return type.among!(Operators) != 0;
}
339 
/**
 * Keyword token types. The basic type names are not part of this list; they
 * are collected in `BasicTypes`.
 */
public alias Keywords = AliasSeq!(
    tok!"abstract", tok!"alias", tok!"align", tok!"asm", tok!"assert",
    tok!"auto",
    tok!"break",
    tok!"case", tok!"cast", tok!"catch", tok!"class", tok!"const",
    tok!"continue",
    tok!"debug", tok!"default", tok!"delegate", tok!"delete",
    tok!"deprecated", tok!"do",
    tok!"else", tok!"enum", tok!"export", tok!"extern",
    tok!"false", tok!"final", tok!"finally", tok!"for", tok!"foreach",
    tok!"foreach_reverse", tok!"function",
    tok!"goto",
    tok!"if", tok!"immutable", tok!"import", tok!"in", tok!"inout",
    tok!"interface", tok!"invariant", tok!"is",
    tok!"lazy",
    tok!"macro", tok!"mixin", tok!"module",
    tok!"new", tok!"nothrow", tok!"null",
    tok!"out", tok!"override",
    tok!"package", tok!"pragma", tok!"private", tok!"protected",
    tok!"public", tok!"pure",
    tok!"ref", tok!"return",
    tok!"scope", tok!"shared", tok!"static", tok!"struct", tok!"super",
    tok!"switch", tok!"synchronized",
    tok!"template", tok!"this", tok!"throw", tok!"true", tok!"try",
    tok!"typedef", tok!"typeid", tok!"typeof",
    tok!"union", tok!"unittest",
    tok!"version",
    tok!"while", tok!"with",
    tok!"__DATE__", tok!"__EOF__", tok!"__FILE__", tok!"__FILE_FULL_PATH__",
    tok!"__FUNCTION__", tok!"__gshared", tok!"__LINE__", tok!"__MODULE__",
    tok!"__parameters", tok!"__PRETTY_FUNCTION__", tok!"__TIME__",
    tok!"__TIMESTAMP__", tok!"__traits", tok!"__vector", tok!"__VENDOR__",
    tok!"__VERSION__");

/**
 * Returns: true if the given ID type is for a keyword.
 */
public bool isKeyword(IdType type) pure nothrow @safe @nogc
{
    // `among` gives the 1-based position of `type` within the list, or 0
    // when it is not part of it.
    return type.among!(Keywords) != 0;
}
381 
/**
 * String literal token types
 */
public alias StringLiterals = AliasSeq!(
    tok!"dstringLiteral", tok!"stringLiteral", tok!"wstringLiteral");

/**
 * Returns: true if the given ID type is for a string literal.
 */
public bool isStringLiteral(IdType type) pure nothrow @safe @nogc
{
    // `among` gives the 1-based position of `type` within the list, or 0
    // when it is not part of it.
    return type.among!(StringLiterals) != 0;
}
404 
/**
 * Protection token types.
 */
public alias Protections = AliasSeq!(
    tok!"export", tok!"package", tok!"private", tok!"public", tok!"protected");

/**
 * Returns: true if the given ID type is for a protection attribute.
 */
public bool isProtection(IdType type) pure nothrow @safe @nogc
{
    // `among` gives the 1-based position of `type` within the list, or 0
    // when it is not part of it.
    return type.among!(Protections) != 0;
}
427 
/**
 * Special (`__XXX__`) keyword token types.
 */
public alias SpecialTokens = AliasSeq!(
    tok!"__DATE__", tok!"__TIME__", tok!"__TIMESTAMP__",
    tok!"__VENDOR__", tok!"__VERSION__",
    tok!"__FILE__", tok!"__FILE_FULL_PATH__", tok!"__LINE__",
    tok!"__MODULE__", tok!"__FUNCTION__", tok!"__PRETTY_FUNCTION__");

/**
 * Returns: true if the given ID type is one of the `SpecialTokens`.
 */
public bool isSpecialToken(IdType type) pure nothrow @safe @nogc
{
    // `among` gives the 1-based position of `type` within the list, or 0
    // when it is not part of it.
    return type.among!(SpecialTokens) != 0;
}
446 
/**
 * Literal token types: all string and number literals, character literals,
 * the special tokens, `true`, `false`, `null` and `$`.
 */
public alias Literals = AliasSeq!(
    StringLiterals, NumberLiterals, tok!"characterLiteral", SpecialTokens,
    tok!"true", tok!"false", tok!"null", tok!"$");

/**
 * Returns: true if the given ID type is one of the `Literals`.
 */
public bool isLiteral(IdType type) pure nothrow @safe @nogc
{
    // `among` gives the 1-based position of `type` within the list, or 0
    // when it is not part of it.
    return type.among!(Literals) != 0;
}
463 
/**
 * Lexes the given source code and attaches all whitespace, comment and
 * specialTokenSequence tokens (trivia) to the token nearest to them.
 *
 * Trivia that starts on the same line as the previously emitted token is
 * stored in that token's `trailingTrivia`. All other trivia is stored in the
 * `leadingTrivia` of the next token. Trivia still pending when lexing stops
 * is appended to the `trailingTrivia` of the last token.
 *
 * The `whitespaceBehavior` and `commentBehavior` of `config` are overridden
 * with `WhitespaceBehavior.include` and `CommentBehavior.noIntern`. Lexing
 * stops at the first `__EOF__` token, which is not part of the result.
 *
 * Returns: an array of the non-trivia tokens lexed from the source code.
 */
const(Token)[] getTokensForParser(R)(R sourceCode, LexerConfig config, StringCache* cache)
if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
{
    config.whitespaceBehavior = WhitespaceBehavior.include;
    config.commentBehavior = CommentBehavior.noIntern;

    auto pendingLeading = appender!(Token[])();
    pendingLeading.reserve(128);
    auto pendingTrailing = appender!(Token[])();
    pendingTrailing.reserve(128);

    auto result = appender!(typeof(return))();
    auto lexer = DLexer(sourceCode, config, cache);
    while (!lexer.empty)
    {
        immutable kind = lexer.front.type;
        if (kind == tok!"__EOF__")
            break;

        if (kind == tok!"specialTokenSequence"
            || kind == tok!"whitespace"
            || kind == tok!"comment")
        {
            // Trivia starting on the line of the last emitted token trails
            // that token; everything else leads the upcoming token.
            if (result.data.empty || lexer.front.line != result.data[$ - 1].line)
                pendingLeading.put(lexer.front);
            else
                pendingTrailing.put(lexer.front);
            lexer.popFront();
            continue;
        }

        Token current = lexer.front;
        lexer.popFront();

        // The previous token is already stored as const in the result, hence
        // the cast when filling in its trailing trivia.
        if (!result.data.empty && !pendingTrailing.data.empty)
            (cast() result.data[$ - 1].trailingTrivia) = pendingTrailing.data.idup;
        current.leadingTrivia = pendingLeading.data.idup;
        pendingLeading.clear();
        pendingTrailing.clear();

        result.put(current);
    }

    // No token follows the remaining trivia, so all of it trails the last
    // token.
    if (!result.data.empty)
    {
        pendingTrailing.put(pendingLeading.data);
        (cast() result.data[$ - 1].trailingTrivia) = pendingTrailing.data.idup;
    }

    return result.data;
}
522 
523 /**
524  * The D lexer struct.
525  */
526 public struct DLexer
527 {
528     mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens,
529         keywords, pseudoTokenHandlers);
530 
531     ///
532     @disable this();
533 
534     /**
535      * Params:
536      *     range = the bytes that compose the source code that will be lexed.
537      *     config = the lexer configuration to use.
538      *     cache = the string interning cache for de-duplicating identifiers and
539      *         other token text.
540      *     haveSSE42 = Parse streaming SIMD Extensions 4.2 in inline assembly
541      */
542     this(R)(R range, const LexerConfig config, StringCache* cache,
543         bool haveSSE42 = sse42()) pure nothrow @safe
544     if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
545     {
546         this.haveSSE42 = haveSSE42;
547         auto r = (range.length >= 3 && range[0] == 0xef && range[1] == 0xbb && range[2] == 0xbf)
548             ? range[3 .. $] : range;
549         static if (is(ElementEncodingType!R == immutable))
550             this.range = LexerRange(cast(const(ubyte)[]) r);
551         else
552             this.range = LexerRange(cast(const(ubyte)[]) r.idup);
553         this.config = config;
554         this.cache = cache;
555         popFront();
556     }
557 
558     ///
559     public void popFront()() pure nothrow @safe
560     {
561         do
562             _popFront();
563         while (config.whitespaceBehavior == WhitespaceBehavior.skip
564             && _front.type == tok!"whitespace");
565     }
566 
567     /**
568      * Lexer error/warning message.
569      */
570     static struct Message
571     {
572         /// 1-based line number
573         size_t line;
574         /// 1-based byte offset
575         size_t column;
576         /// Text of the message
577         string message;
578         /// `true` for an error, `false` for a warning
579         bool isError;
580     }
581 
582     /**
583      * Returns: An array of all of the warnings and errors generated so far
584      *     during lexing. It may make sense to only check this when `empty`
585      *     returns `true`.
586      */
587     const(Message[]) messages() const @property
588     {
589         return _messages;
590     }
591 
592 private pure nothrow @safe:
593 
594     bool isWhitespace()
595     {
596         switch (range.bytes[range.index])
597         {
598         case ' ':
599         case '\r':
600         case '\n':
601         case '\t':
602         case '\v':
603         case '\f':
604             return true;
605         case 0xe2:
606             auto peek = range.peek(2);
607             return peek.length == 2
608                 && peek[0] == 0x80
609                 && (peek[1] == 0xa8 || peek[1] == 0xa9);
610         default:
611             return false;
612         }
613     }
614 
615     void popFrontWhitespaceAware()
616     {
617         switch (range.bytes[range.index])
618         {
619         case '\r':
620             range.popFront();
621             if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n')
622             {
623                 range.popFront();
624                 range.incrementLine();
625             }
626             else
627                 range.incrementLine();
628             return;
629         case '\n':
630             range.popFront();
631             range.incrementLine();
632             return;
633         case 0xe2:
634             auto lookahead = range.peek(3);
635             if (lookahead.length == 3 && lookahead[1] == 0x80
636                 && (lookahead[2] == 0xa8 || lookahead[2] == 0xa9))
637             {
638                 range.index+=3;
639                 range.column+=3;
640                 range.incrementLine();
641                 return;
642             }
643             else
644             {
645                 range.popFront();
646                 return;
647             }
648         default:
649             range.popFront();
650             return;
651         }
652     }
653 
654     void lexWhitespace(ref Token token) @trusted
655     {
656         mixin (tokenStart);
657         loop: do
658         {
659             version (iasm64NotWindows)
660             {
661                 if (haveSSE42 && range.index + 16 < range.bytes.length)
662                 {
663                     skip!(true, '\t', ' ', '\v', '\f')(range.bytes.ptr + range.index,
664                         &range.index, &range.column);
665                 }
666             }
667             switch (range.bytes[range.index])
668             {
669             case '\r':
670                 range.popFront();
671                 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n')
672                 {
673                     range.popFront();
674                 }
675                 range.column = 1;
676                 range.line += 1;
677                 break;
678             case '\n':
679                 range.popFront();
680                 range.column = 1;
681                 range.line += 1;
682                 break;
683             case ' ':
684             case '\t':
685             case '\v':
686             case '\f':
687                 range.popFront();
688                 break;
689             case 0xe2:
690                 if (range.index + 2 >= range.bytes.length)
691                     break loop;
692                 if (range.bytes[range.index + 1] != 0x80)
693                     break loop;
694                 if (range.bytes[range.index + 2] == 0xa8 || range.bytes[range.index + 2] == 0xa9)
695                 {
696                     range.index += 3;
697                     range.column += 3;
698                     range.column = 1;
699                     range.line += 1;
700                     break;
701                 }
702                 break loop;
703             default:
704                 break loop;
705             }
706         } while (!(range.index >= range.bytes.length));
707         string text = config.whitespaceBehavior == WhitespaceBehavior.include
708             ? cache.intern(range.slice(mark)) : "";
709         token = Token(tok!"whitespace", text, line, column, index);
710     }
711 
712     void lexNumber(ref Token token)
713     {
714         mixin (tokenStart);
715         if (range.bytes[range.index] == '0' && range.index + 1 < range.bytes.length)
716         {
717             immutable ahead = range.bytes[range.index + 1];
718             switch (ahead)
719             {
720             case 'x':
721             case 'X':
722                 range.index += 2;
723                 range.column += 2;
724                 lexHex(token, mark, line, column, index);
725                 return;
726             case 'b':
727             case 'B':
728                 range.index += 2;
729                 range.column += 2;
730                 lexBinary(token, mark, line, column, index);
731                 return;
732             default:
733                 lexDecimal(token, mark, line, column, index);
734                 return;
735             }
736         }
737         else
738             lexDecimal(token, mark, line, column, index);
739     }
740 
741     void lexHex(ref Token token)
742     {
743         mixin (tokenStart);
744         lexHex(token, mark, line, column, index);
745     }
746 
747     void lexHex(ref Token token, size_t mark, size_t line, size_t column,
748         size_t index) @trusted
749     {
750         IdType type = tok!"intLiteral";
751         bool foundDot;
752         hexLoop: while (!(range.index >= range.bytes.length))
753         {
754             switch (range.bytes[range.index])
755             {
756             case 'a': .. case 'f':
757             case 'A': .. case 'F':
758             case '0': .. case '9':
759             case '_':
760                 version (iasm64NotWindows)
761                 {
762                     if (haveSSE42 && range.index + 16 < range.bytes.length)
763                     {
764                         immutable ulong i = rangeMatch!(false, '0', '9', 'a', 'f', 'A', 'F', '_', '_')
765                             (range.bytes.ptr + range.index);
766                         range.column += i;
767                         range.index += i;
768                     }
769                     else
770                         range.popFront();
771                 }
772                 else
773                     range.popFront();
774                 break;
775             case 'u':
776             case 'U':
777                 lexIntSuffix(type);
778                 break hexLoop;
779             case 'i':
780                 if (foundDot)
781                     lexFloatSuffix(type);
782                 break hexLoop;
783             case 'L':
784                 if (foundDot)
785                     lexFloatSuffix(type);
786                 else
787                     lexIntSuffix(type);
788                 break hexLoop;
789             case 'p':
790             case 'P':
791                 lexExponent(type);
792                 break hexLoop;
793             case '.':
794                 if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.')
795                     break hexLoop;
796                 else
797                 {
798                     // The following bit of silliness tries to tell the
799                     // difference between "int dot identifier" and
800                     // "double identifier".
801                     if (range.index + 1 < range.bytes.length)
802                     {
803                         switch (range.peekAt(1))
804                         {
805                         case '0': .. case '9':
806                         case 'A': .. case 'F':
807                         case 'a': .. case 'f':
808                             goto doubleLiteral;
809                         default:
810                             break hexLoop;
811                         }
812                     }
813                     else
814                     {
815                     doubleLiteral:
816                         range.popFront();
817                         foundDot = true;
818                         type = tok!"doubleLiteral";
819                     }
820                 }
821                 break;
822             default:
823                 break hexLoop;
824             }
825         }
826         token = Token(type, cache.intern(range.slice(mark)), line, column,
827             index);
828     }
829 
830     void lexBinary(ref Token token)
831     {
832         mixin (tokenStart);
833         return lexBinary(token, mark, line, column, index);
834     }
835 
836     void lexBinary(ref Token token, size_t mark, size_t line, size_t column,
837         size_t index) @trusted
838     {
839         IdType type = tok!"intLiteral";
840         binaryLoop: while (!(range.index >= range.bytes.length))
841         {
842             switch (range.bytes[range.index])
843             {
844             case '0':
845             case '1':
846             case '_':
847                 version (iasm64NotWindows)
848                 {
849                     if (haveSSE42 && range.index + 16 < range.bytes.length)
850                     {
851                         immutable ulong i = rangeMatch!(false, '0', '1', '_', '_')(
852                             range.bytes.ptr + range.index);
853                         range.column += i;
854                         range.index += i;
855                     }
856                     else
857                         range.popFront();
858                 }
859                 else
860                     range.popFront();
861                 break;
862             case 'u':
863             case 'U':
864             case 'L':
865                 lexIntSuffix(type);
866                 break binaryLoop;
867             default:
868                 break binaryLoop;
869             }
870         }
871         token = Token(type, cache.intern(range.slice(mark)), line, column,
872             index);
873     }
874 
875     void lexDecimal(ref Token token)
876     {
877         mixin (tokenStart);
878         lexDecimal(token, mark, line, column, index);
879     }
880 
/**
 * Lexes a decimal integer or floating-point literal whose start position has
 * already been captured by the caller.
 *
 * Scanning stops at the first byte that cannot continue the literal. Suffixes
 * and exponents are delegated to `lexIntSuffix`, `lexFloatSuffix` and
 * `lexExponent`, which may change the literal's type.
 *
 * Params:
 *     token  = receives the lexed literal
 *     mark   = range mark of the literal's first byte
 *     line   = line recorded in the token
 *     column = column recorded in the token
 *     index  = byte index recorded in the token
 */
void lexDecimal(ref Token token, size_t mark, size_t line, size_t column,
    size_t index) @trusted
{
    IdType type = tok!"intLiteral";
    bool foundDot = false;
    if (range.bytes[range.index] == '.')
    {
        // Literal written with a leading dot, e.g. ".5"
        foundDot = true;
        range.popFront();
        type = tok!"doubleLiteral";
    }

    decimalLoop: while (range.index < range.bytes.length)
    {
        switch (range.bytes[range.index])
        {
        case '0': .. case '9':
        case '_':
            version (iasm64NotWindows)
            {
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    // Bulk path: advance by the count reported by rangeMatch
                    immutable ulong matched = rangeMatch!(false, '0', '9', '_', '_')(
                        range.bytes.ptr + range.index);
                    range.column += matched;
                    range.index += matched;
                }
                else
                    range.popFront();
            }
            else
                range.popFront();
            break;
        case 'u':
        case 'U':
            // Unsigned suffixes only apply to integers; after a dot the
            // literal simply ends here.
            if (!foundDot)
                lexIntSuffix(type);
            break decimalLoop;
        case 'L':
            if (foundDot)
                lexFloatSuffix(type);
            else
                lexIntSuffix(type);
            break decimalLoop;
        case 'i':
        case 'f':
        case 'F':
            lexFloatSuffix(type);
            break decimalLoop;
        case 'e':
        case 'E':
            lexExponent(type);
            break decimalLoop;
        case '.':
            // A second dot, a dot at the very end of the input, or the start
            // of ".." all terminate the literal before the dot.
            if (foundDot || range.index + 1 >= range.bytes.length)
                break decimalLoop;
            {
                // Tell "int dot identifier" apart from "double": the dot is
                // part of the number only when the byte after it is an ASCII
                // character other than a letter, DEL or another dot.
                immutable next = range.peekAt(1);
                immutable bool dotBelongsToNumber = next != '.'
                    && (next <= '@'
                        || (next >= '[' && next <= '`')
                        || (next >= '{' && next <= '~'));
                if (!dotBelongsToNumber)
                    break decimalLoop;
            }
            range.popFront();
            foundDot = true;
            type = tok!"doubleLiteral";
            break;
        default:
            break decimalLoop;
        }
    }
    token = Token(type, cache.intern(range.slice(mark)), line, column, index);
}
973 
/**
 * Consumes an integer suffix (`u`/`U`, `L`/`l`, and a trailing `i`) and
 * adjusts `type` accordingly.
 *
 * The caller must guarantee that the current position is inside the input;
 * the first byte is read without a bounds check.
 *
 * NOTE(review): the sequence `uLu` (unsigned, long, unsigned again) is
 * consumed as a single suffix, and a trailing `i` is not examined when the
 * suffix ends with an unsigned marker that followed `L` — confirm whether
 * this leniency is intended.
 *
 * Params:
 *     type = literal type, updated in place
 */
void lexIntSuffix(ref IdType type) pure nothrow @safe
{
    immutable ubyte first = range.bytes[range.index];
    if (first == 'u' || first == 'U')
    {
        type = (type == tok!"intLiteral") ? tok!"uintLiteral" : tok!"ulongLiteral";
        range.popFront();
        if (range.index < range.bytes.length
                && (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l'))
        {
            type = (type == tok!"uintLiteral") ? tok!"ulongLiteral" : tok!"longLiteral";
            range.popFront();
            if (range.index < range.bytes.length
                    && (range.bytes[range.index] == 'U' || range.bytes[range.index] == 'u'))
            {
                // Second unsigned marker: consume it and stop immediately
                type = (type == tok!"intLiteral") ? tok!"uintLiteral" : tok!"ulongLiteral";
                range.popFront();
                return;
            }
        }
    }
    else if (first == 'L' || first == 'l')
    {
        type = (type == tok!"uintLiteral") ? tok!"ulongLiteral" : tok!"longLiteral";
        range.popFront();
        if (range.index < range.bytes.length
                && (range.bytes[range.index] == 'U' || range.bytes[range.index] == 'u'))
        {
            type = (type == tok!"intLiteral") ? tok!"uintLiteral" : tok!"ulongLiteral";
            range.popFront();
            return;
        }
    }

    // Optional imaginary marker
    if (range.index >= range.bytes.length || range.bytes[range.index] != 'i')
        return;
    warning("Complex number literals are deprecated");
    range.popFront();
    immutable bool wide = type == tok!"longLiteral" || type == tok!"ulongLiteral";
    type = wide ? tok!"idoubleLiteral" : tok!"ifloatLiteral";
}
1019 
/**
 * Consumes a floating-point suffix (`f`, `F` or `L`, optionally followed by
 * `i`) and adjusts `type` accordingly.
 *
 * The caller must guarantee that the current position is inside the input;
 * the first byte is read without a bounds check.
 *
 * NOTE(review): an `L` suffix is recorded as `doubleLiteral` here — confirm
 * whether a distinct real-literal type was intended.
 *
 * Params:
 *     type = literal type, updated in place
 */
void lexFloatSuffix(ref IdType type) pure nothrow @safe
{
    immutable ubyte suffix = range.bytes[range.index];
    if (suffix == 'f' || suffix == 'F')
    {
        range.popFront();
        type = tok!"floatLiteral";
    }
    else if (suffix == 'L')
    {
        range.popFront();
        type = tok!"doubleLiteral";
    }

    // Optional imaginary marker
    if (range.index >= range.bytes.length || range.bytes[range.index] != 'i')
        return;
    warning("Complex number literals are deprecated");
    range.popFront();
    type = (type == tok!"floatLiteral") ? tok!"ifloatLiteral" : tok!"idoubleLiteral";
}
1046 
/**
 * Consumes the exponent part of a numeric literal: the exponent marker the
 * caller stopped on, an optional sign, the exponent digits and an optional
 * floating-point suffix.
 *
 * A sign is only accepted directly after the marker. Once a digit has been
 * seen, `+` or `-` ends the literal so that input such as `1e5+3` is split
 * into three tokens instead of being swallowed as one. A missing exponent is
 * reported through `error`, including when the input ends right after the
 * marker or sign.
 *
 * NOTE(review): `type` is only changed through `lexFloatSuffix`; a literal
 * such as `1e5` keeps whatever type the caller passed in — confirm that this
 * is intended.
 *
 * Params:
 *     type = literal type, updated in place when a suffix is present
 */
void lexExponent(ref IdType type) pure nothrow @safe
{
    range.popFront(); // the exponent marker itself
    bool foundSign = false;
    bool foundDigit = false;
    while (range.index < range.bytes.length)
    {
        switch (range.bytes[range.index])
        {
        case '-':
        case '+':
            // A sign after a digit (or a second sign) is not part of the
            // exponent; leave it for the next token.
            if (foundSign || foundDigit)
            {
                if (!foundDigit)
                    error("Expected an exponent");
                return;
            }
            foundSign = true;
            range.popFront();
            break;
        case '0': .. case '9':
        case '_':
            foundDigit = true;
            range.popFront();
            break;
        case 'L':
        case 'f':
        case 'F':
        case 'i':
            lexFloatSuffix(type);
            return;
        default:
            if (!foundDigit)
                error("Expected an exponent");
            return;
        }
    }
    // Input ended inside the exponent
    if (!foundDigit)
        error("Expected an exponent");
}
1085 
/**
 * Lexes a script line: everything from the current position up to, but not
 * including, the next newline (or the end of the input).
 *
 * Params:
 *     token = receives a `scriptLine` token holding the consumed text
 */
void lexScriptLine(ref Token token)
{
    mixin (tokenStart);
    for (; range.index < range.bytes.length && !isNewline(); range.popFront())
    {
    }
    token = Token(tok!"scriptLine", cache.intern(range.slice(mark)),
        line, column, index);
}
1096 
/**
 * Lexes a special token sequence: everything from the current position up
 * to, but not including, the next newline (or the end of the input).
 *
 * Params:
 *     token = receives a `specialTokenSequence` token holding the consumed text
 */
void lexSpecialTokenSequence(ref Token token)
{
    mixin (tokenStart);
    for (; range.index < range.bytes.length && !isNewline(); range.popFront())
    {
    }
    token = Token(tok!"specialTokenSequence", cache.intern(range.slice(mark)),
        line, column, index);
}
1107 
/**
 * Lexes a block comment: the two opening bytes are skipped, then input is
 * consumed up to and including the closing star-slash pair. If the input
 * ends first, everything consumed so far becomes the comment.
 *
 * Params:
 *     token = receives a `comment` token; its text is interned only when
 *             `config.commentBehavior` is `CommentBehavior.intern`
 */
void lexSlashStarComment(ref Token token) @trusted
{
    mixin (tokenStart);
    range.popFrontN(2);
    while (range.index < range.bytes.length)
    {
        version (iasm64NotWindows)
        {
            // Bulk path; presumably advances to the next byte listed in the
            // template arguments — only used with at least 16 bytes left.
            if (haveSSE42 && range.index + 16 < range.bytes.length)
                skip!(false, '\r', '\n', '/', '*', 0xe2)(range.bytes.ptr + range.index,
                    &range.index, &range.column);
        }
        if (range.bytes[range.index] != '*')
        {
            popFrontWhitespaceAware();
            continue;
        }
        range.popFront();
        if (range.index < range.bytes.length && range.bytes[range.index] == '/')
        {
            range.popFront();
            break;
        }
    }
    immutable bool internText = config.commentBehavior == CommentBehavior.intern;
    token = Token(tok!"comment",
        internText ? cache.intern(range.slice(mark)) : cast(string) range.slice(mark),
        line, column, index);
}
1138 
/**
 * Lexes a line comment: the two opening bytes are skipped, then input is
 * consumed up to, but not including, the next carriage return or line feed
 * (or the end of the input).
 *
 * Params:
 *     token = receives a `comment` token; its text is interned only when
 *             `config.commentBehavior` is `CommentBehavior.intern`
 */
void lexSlashSlashComment(ref Token token) @trusted
{
    mixin (tokenStart);
    range.popFrontN(2);
    for (; range.index < range.bytes.length; range.popFront())
    {
        version (iasm64NotWindows)
        {
            // Bulk path; only used with at least 16 bytes left
            if (haveSSE42 && range.index + 16 < range.bytes.length)
            {
                skip!(false, '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
                    &range.index, &range.column);
            }
        }
        immutable ubyte current = range.bytes[range.index];
        if (current == '\r' || current == '\n')
            break;
    }
    immutable bool internText = config.commentBehavior == CommentBehavior.intern;
    token = Token(tok!"comment",
        internText ? cache.intern(range.slice(mark)) : cast(string) range.slice(mark),
        line, column, index);
}
1163 
/**
 * Lexes a nesting comment. The two opening bytes are skipped; afterwards
 * every slash-plus pair increases the nesting depth and every plus-slash
 * pair decreases it. Lexing stops when the depth reaches zero or the input
 * ends.
 *
 * Params:
 *     token = receives a `comment` token; its text is interned only when
 *             `config.commentBehavior` is `CommentBehavior.intern`
 */
void lexSlashPlusComment(ref Token token) @trusted
{
    mixin (tokenStart);
    range.index += 2;
    range.column += 2;
    int depth = 1;
    while (depth > 0 && range.index < range.bytes.length)
    {
        version (iasm64NotWindows)
        {
            // Bulk path; only used with at least 16 bytes left
            if (haveSSE42 && range.index + 16 < range.bytes.length)
            {
                skip!(false, '+', '/', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
                    &range.index, &range.column);
            }
        }
        immutable ubyte current = range.bytes[range.index];
        if (current != '+' && current != '/')
        {
            popFrontWhitespaceAware();
            continue;
        }
        // '+' may begin a closer ("+/"), '/' may begin an opener ("/+")
        immutable bool closing = current == '+';
        range.popFront();
        if (range.index < range.bytes.length
                && range.bytes[range.index] == (closing ? '/' : '+'))
        {
            range.popFront();
            if (closing)
                depth--;
            else
                depth++;
        }
    }
    immutable bool internText = config.commentBehavior == CommentBehavior.intern;
    token = Token(tok!"comment",
        internText ? cache.intern(range.slice(mark)) : cast(string) range.slice(mark),
        line, column, index);
}
1207 
/**
 * Lexes a double-quoted string literal including escape sequences and an
 * optional `c`/`w`/`d` suffix.
 *
 * Params:
 *     token = receives the literal. On an unterminated string an error is
 *             reported and the token is set to `Token(tok!"")`; on an
 *             invalid escape sequence it is set to `Token.init`.
 */
void lexStringLiteral(ref Token token) @trusted
{
    mixin (tokenStart);
    range.popFront(); // opening quote
    for (;;)
    {
        if (range.index >= range.bytes.length)
        {
            error("Error: unterminated string literal");
            token = Token(tok!"");
            return;
        }
        version (iasm64NotWindows)
        {
            // Bulk path; only used with at least 16 bytes left
            if (haveSSE42 && range.index + 16 < range.bytes.length)
            {
                skip!(false, '"', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
                    &range.index, &range.column);
            }
        }
        immutable ubyte current = range.bytes[range.index];
        if (current == '"')
        {
            range.popFront();
            break;
        }
        if (current != '\\')
        {
            popFrontWhitespaceAware();
            continue;
        }
        if (!lexEscapeSequence())
        {
            token = Token.init;
            return;
        }
    }
    IdType type = tok!"stringLiteral";
    lexStringSuffix(type);
    token = Token(type, cache.intern(range.slice(mark)), line, column,
        index);
}
1249 
/**
 * Lexes a wysiwyg string, in which no escape sequences are processed.
 *
 * Two forms are handled: a literal that starts with a backtick and runs to
 * the next backtick, and a literal whose first byte is a one-byte prefix
 * followed by a one-byte opener, which runs to the next double quote. An
 * optional `c`/`w`/`d` suffix is consumed afterwards.
 *
 * Params:
 *     token = receives the literal; on an unterminated string an error is
 *             reported and the token is set to `Token(tok!"")`
 */
void lexWysiwygString(ref Token token) @trusted
{
    mixin (tokenStart);
    IdType type = tok!"stringLiteral";
    if (range.bytes[range.index] == '`')
    {
        range.popFront(); // opening backtick
        for (;;)
        {
            if (range.index >= range.bytes.length)
            {
                error("Error: unterminated string literal");
                token = Token(tok!"");
                return;
            }
            version (iasm64NotWindows)
            {
                // Bulk path; only used with at least 16 bytes left
                if (haveSSE42 && range.index + 16 < range.bytes.length)
                {
                    skip!(false, '\r', '\n', 0xe2, '`')(range.bytes.ptr + range.index,
                        &range.index, &range.column);
                }
            }
            if (range.bytes[range.index] != '`')
            {
                popFrontWhitespaceAware();
                continue;
            }
            range.popFront();
            break;
        }
    }
    else
    {
        range.popFront(); // prefix byte
        if (range.index >= range.bytes.length)
        {
            error("Error: unterminated string literal");
            token = Token(tok!"");
            return;
        }
        range.popFront(); // byte after the prefix (presumably the opening quote)
        for (;;)
        {
            if (range.index >= range.bytes.length)
            {
                error("Error: unterminated string literal");
                token = Token(tok!"");
                return;
            }
            if (range.bytes[range.index] == '"')
            {
                range.popFront();
                break;
            }
            popFrontWhitespaceAware();
        }
    }
    lexStringSuffix(type);
    token = Token(type, cache.intern(range.slice(mark)), line, column,
        index);
}
1314 
/**
 * Consumes an optional string suffix at the current position.
 *
 * Params:
 *     type = always overwritten: `wstringLiteral` for `w`, `dstringLiteral`
 *            for `d`, `stringLiteral` otherwise
 *
 * Returns: the consumed suffix byte (`'w'`, `'d'` or `'c'`), or 0 when no
 *          suffix is present or the input has ended.
 */
private ubyte lexStringSuffix(ref IdType type) pure nothrow @safe
{
    type = tok!"stringLiteral";
    if (range.index >= range.bytes.length)
        return 0;

    immutable ubyte suffix = range.bytes[range.index];
    if (suffix == 'w')
        type = tok!"wstringLiteral";
    else if (suffix == 'd')
        type = tok!"dstringLiteral";
    else if (suffix != 'c')
        return 0;

    range.popFront();
    return suffix;
}
1333 
/**
 * Lexes a delimited string literal. The two introducing bytes are skipped,
 * then the next byte selects the form: one of the four bracket pairs
 * dispatches to `lexNormalDelimitedString`, anything else is treated as a
 * heredoc and dispatched to `lexHeredocString`.
 *
 * If the input ends directly after the two introducing bytes, an error is
 * reported and the token is set to `Token(tok!"")` instead of reading past
 * the end of the buffer.
 *
 * Params:
 *     token = receives the lexed literal
 */
void lexDelimitedString(ref Token token)
{
    mixin (tokenStart);
    range.index += 2;
    range.column += 2;
    if (range.index >= range.bytes.length)
    {
        error("Error: unterminated string literal");
        token = Token(tok!"");
        return;
    }

    immutable ubyte open = range.bytes[range.index];
    ubyte close;
    switch (open)
    {
    case '<':
        close = '>';
        break;
    case '{':
        close = '}';
        break;
    case '[':
        close = ']';
        break;
    case '(':
        close = ')';
        break;
    default:
        // Not a bracket: identifier-delimited (heredoc) form
        lexHeredocString(token, mark, line, column, index);
        return;
    }
    range.popFront(); // opening bracket
    lexNormalDelimitedString(token, mark, line, column, index, open, close);
}
1372 
/**
 * Lexes the body of a bracket-delimited string. The opening bracket has
 * already been consumed by the caller; nested `open`/`close` pairs are
 * tracked, and the final `close` must be followed by a double quote.
 *
 * Errors are reported, and the token set to `Token(tok!"")`, when the double
 * quote after the final closing bracket is missing (including at end of
 * input) or when the input ends before the brackets are balanced.
 *
 * Params:
 *     token  = receives the lexed literal
 *     mark   = range mark of the literal's first byte
 *     line   = line recorded in the token
 *     column = column recorded in the token
 *     index  = byte index recorded in the token
 *     open   = opening bracket byte
 *     close  = matching closing bracket byte
 */
void lexNormalDelimitedString(ref Token token, size_t mark, size_t line, size_t column,
    size_t index, ubyte open, ubyte close)
{
    int depth = 1;
    while (range.index < range.bytes.length && depth > 0)
    {
        immutable ubyte current = range.bytes[range.index];
        if (current == open)
        {
            depth++;
            range.popFront();
        }
        else if (current == close)
        {
            depth--;
            range.popFront();
            if (depth <= 0)
            {
                // Bounds-checked: the closing bracket may be the last byte
                if (range.index < range.bytes.length && range.bytes[range.index] == '"')
                {
                    range.popFront();
                }
                else
                {
                    error("Error: `\"` expected to end delimited string literal");
                    token = Token(tok!"");
                    return;
                }
            }
        }
        else
            popFrontWhitespaceAware();
    }
    if (depth > 0)
    {
        // Ran out of input with brackets still open
        error("Error: unterminated string literal");
        token = Token(tok!"");
        return;
    }
    IdType type = tok!"stringLiteral";
    lexStringSuffix(type);
    token = Token(type, cache.intern(range.slice(mark)), line, column, index);
}
1409 
/**
 * Lexes an identifier-delimited (heredoc) string. The delimiter identifier
 * is lexed first and must be followed by a newline; the body then runs until
 * a newline is directly followed by the same identifier, after which a
 * double quote and an optional string suffix are expected.
 *
 * Problems are reported through `error`, but a string-literal token is
 * still produced from whatever was consumed.
 *
 * Params:
 *     token  = receives the lexed literal
 *     mark   = range mark of the literal's first byte
 *     line   = line recorded in the token
 *     column = column recorded in the token
 *     index  = byte index recorded in the token
 */
void lexHeredocString(ref Token token, size_t mark, size_t line, size_t column, size_t index)
{
    Token ident;
    lexIdentifier(ident);
    // isNewline() reads the current byte unchecked, so make sure the input
    // did not end with the delimiter identifier.
    if (range.index < range.bytes.length && isNewline())
        popFrontWhitespaceAware();
    else
        error("Newline expected");
    while (range.index < range.bytes.length)
    {
        if (!isNewline())
        {
            range.popFront();
            continue;
        }
        // The delimiter is only recognised directly after a newline
        popFrontWhitespaceAware();
        if (!range.canPeek(ident.text.length))
        {
            error(ident.text ~ " expected");
            break;
        }
        if (range.peek(ident.text.length - 1) == ident.text)
        {
            range.popFrontN(ident.text.length);
            break;
        }
    }
    if (range.index < range.bytes.length && range.bytes[range.index] == '"')
    {
        range.popFront();
    }
    else
        error("`\"` expected");
    IdType type = tok!"stringLiteral";
    lexStringSuffix(type);
    token = Token(type, cache.intern(range.slice(mark)), line, column, index);
}
1449 
/**
 * Lexes a token string (`q{ ... }`). The body is tokenised with this lexer
 * itself and the text of each token is appended to a buffer until the brace
 * that balances the opening one has been seen. An optional string suffix is
 * appended as well.
 *
 * While the body is lexed, whitespace is forced to be included and strings
 * are kept in source form; the previous configuration is restored on exit.
 *
 * Params:
 *     token = receives a string-literal token whose text is the rebuilt
 *             `q{...}` source
 */
void lexTokenString(ref Token token)
{
    mixin (tokenStart);
    assert (range.bytes[range.index] == 'q');
    range.popFront();
    assert (range.bytes[range.index] == '{');
    range.popFront();

    auto text = appender!string();
    text.put("q{");

    immutable WhitespaceBehavior savedWhitespace = config.whitespaceBehavior;
    immutable StringBehavior savedString = config.stringBehavior;
    config.whitespaceBehavior = WhitespaceBehavior.include;
    config.stringBehavior = StringBehavior.source;
    scope (exit)
    {
        config.whitespaceBehavior = savedWhitespace;
        config.stringBehavior = savedString;
    }

    advance(_front);
    for (int depth = 1; depth > 0 && !empty;)
    {
        auto current = front();
        // Tokens without stored text are rendered from their type
        if (current.text is null)
            text.put(str(current.type));
        else
            text.put(current.text);

        if (current.type == tok!"{")
            depth++;
        else if (current.type == tok!"}")
            depth--;

        // The balancing brace is left as the current front token
        if (depth > 0)
            popFront();
    }

    IdType type = tok!"stringLiteral";
    immutable suffix = lexStringSuffix(type);
    if (suffix != 0)
        text.put(suffix);
    token = Token(type, cache.intern(cast(const(ubyte)[]) text.data), line,
        column, index);
}
1500 
/**
 * Lexes a hex string. The two introducing bytes are skipped; the body may
 * contain only hexadecimal digits and whitespace and ends at the next double
 * quote. An optional string suffix is consumed afterwards.
 *
 * Params:
 *     token = receives the literal; on an unterminated literal or an invalid
 *             character an error is reported and the token is set to
 *             `Token(tok!"")`
 */
void lexHexString(ref Token token)
{
    mixin (tokenStart);
    range.index += 2;
    range.column += 2;

    for (bool closed = false; !closed;)
    {
        if (range.index >= range.bytes.length)
        {
            error("Error: unterminated hex string literal");
            token = Token(tok!"");
            return;
        }
        if (isWhitespace())
        {
            popFrontWhitespaceAware();
            continue;
        }
        switch (range.bytes[range.index])
        {
        case '0': .. case '9':
        case 'A': .. case 'F':
        case 'a': .. case 'f':
            range.popFront();
            break;
        case '"':
            range.popFront();
            closed = true;
            break;
        default:
            error("Error: invalid character in hex string");
            token = Token(tok!"");
            return;
        }
    }

    IdType type = tok!"stringLiteral";
    lexStringSuffix(type);
    token = Token(type, cache.intern(range.slice(mark)), line, column,
        index);
}
1539 
/**
 * Lexes a named character entity: an ampersand, an identifier and a
 * terminating semicolon. The current byte must be the ampersand.
 *
 * Returns: `true` when a well-formed entity was consumed; otherwise an
 *          error is reported and `false` is returned.
 */
bool lexNamedEntity()
in { assert (range.bytes[range.index] == '&'); }
do
{
    range.popFront(); // the ampersand
    Token name;
    lexIdentifier(name, true);
    immutable bool wellFormed = name.type == tok!"identifier"
        && !range.empty
        && range.bytes[range.index] == ';';
    if (!wellFormed)
    {
        error("Error: invalid named character entity");
        return false;
    }
    range.popFront(); // the semicolon
    return true;
}
1555 
/**
 * Lexes one escape sequence. The current byte (the backslash) is skipped
 * and the byte after it selects the form: a named entity, a single-character
 * escape, `x` with 2 hex digits, `u` with 4, `U` with 8, or up to three
 * octal digits.
 *
 * For an unrecognised escape an error is reported and input is skipped up
 * to and including the next semicolon (or to the end of the input).
 *
 * Returns: `true` when a valid escape sequence was consumed, `false`
 *          otherwise (an error has been reported in that case).
 */
bool lexEscapeSequence()
{
    // Consumes exactly `count` hexadecimal digits, reporting `message` and
    // returning false at the first byte that is not one (or at end of input).
    bool consumeHexDigits(size_t count, string message)
    {
        foreach (i; 0 .. count)
        {
            if (range.index >= range.bytes.length)
            {
                error(message);
                return false;
            }
            switch (range.bytes[range.index])
            {
            case '0': .. case '9':
            case 'a': .. case 'f':
            case 'A': .. case 'F':
                range.popFront();
                break;
            default:
                error(message);
                return false;
            }
        }
        return true;
    }

    range.popFront(); // the backslash
    if (range.index >= range.bytes.length)
    {
        error("Error: non-terminated character escape sequence.");
        return false;
    }
    switch (range.bytes[range.index])
    {
    case '&':
        return lexNamedEntity();
    case '\'':
    case '"':
    case '?':
    case '\\':
    case 'a':
    case 'b':
    case 'f':
    case 'n':
    case 'r':
    case 't':
    case 'v':
        range.popFront();
        break;
    case 'x':
        range.popFront();
        if (!consumeHexDigits(2, "Error: 2 hex digits expected."))
            return false;
        break;
    case 'u':
        range.popFront();
        if (!consumeHexDigits(4, "Error: at least 4 hex digits expected."))
            return false;
        break;
    case 'U':
        range.popFront();
        if (!consumeHexDigits(8, "Error: at least 8 hex digits expected."))
            return false;
        break;
    case '0':
        // A lone zero: last byte of the input or directly before a single quote
        if (range.index + 1 >= range.bytes.length || range.peekAt(1) == '\'')
        {
            range.popFront();
            break;
        }
        goto case;
    case '1': .. case '7':
        {
            // Up to three octal digits
            size_t digits = 0;
            while (digits < 3 && range.index < range.bytes.length
                    && range.bytes[range.index] >= '0' && range.bytes[range.index] <= '7')
            {
                range.popFront();
                digits++;
            }
        }
        break;
    default:
        error("Invalid escape sequence");
        while (range.index < range.bytes.length)
        {
            immutable bool terminator = range.bytes[range.index] == ';';
            range.popFront();
            if (terminator)
                return false;
        }
        error("Error: non-terminated character escape sequence.");
        return false;
    }
    return true;
}
1682 
/**
 * Lexes a character literal: an opening single quote, one character (an
 * escape sequence, a run of bytes with the high bit set, or any other single
 * character) and a closing single quote.
 *
 * The run of high-bit bytes is bounds-checked so that a literal cut off by
 * the end of the input is reported as an error instead of reading past the
 * buffer.
 *
 * Params:
 *     token = receives a `characterLiteral` token; when the closing quote is
 *             missing an error is reported and the token is set to
 *             `Token(tok!"")`
 */
void lexCharacterLiteral(ref Token token)
{
    mixin (tokenStart);
    range.popFront(); // opening quote

    bool terminated = false;
    if (!range.empty)
    {
        immutable ubyte first = range.bytes[range.index];
        if (first == '\\')
            lexEscapeSequence(); // failures are reported by the callee
        else if (first == '\'')
            range.popFront();
        else if (first & 0x80)
        {
            // Skip every byte with the high bit set, stopping at end of input
            while (range.index < range.bytes.length && (range.bytes[range.index] & 0x80))
                range.popFront();
        }
        else
            popFrontWhitespaceAware();

        if (range.index < range.bytes.length && range.bytes[range.index] == '\'')
        {
            range.popFront();
            terminated = true;
        }
    }

    if (terminated)
    {
        token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
            line, column, index);
    }
    else
    {
        error("Error: Expected `'` to end character literal");
        token = Token(tok!"");
    }
}
1718 
/**
 * Lexes an identifier: every byte up to the first one that `isSeparating`
 * classifies as a separator.
 *
 * Params:
 *     token  = receives an `identifier` token; left untouched when the first
 *              byte is already a separator and `silent` is set
 *     silent = when `true`, a separator at the start makes the function
 *              return without reporting anything; when `false`, an error is
 *              reported, that byte is skipped and lexing continues
 */
void lexIdentifier(ref Token token, const bool silent = false) @trusted
{
    mixin (tokenStart);

    if (isSeparating(0))
    {
        if (silent)
            return;

        error("Invalid identifier");
        range.popFront();
    }
    for (;;)
    {
        version (iasm64NotWindows)
        {
            // Bulk path over ASCII letters and underscores; only used with
            // at least 16 bytes left
            if (haveSSE42 && range.index + 16 < range.bytes.length)
            {
                immutable ulong matched = rangeMatch!(false, 'a', 'z', 'A', 'Z', '_', '_')
                    (range.bytes.ptr + range.index);
                range.column += matched;
                range.index += matched;
            }
        }
        if (isSeparating(0))
            break;
        range.popFront();
    }
    token = Token(tok!"identifier", cache.intern(range.slice(mark)), line,
        column, index);
}
1750 
/**
 * Lexes a token that starts with a dot: a number with a leading dot
 * (delegated to `lexNumber`), `...`, `..` or a single `.`.
 *
 * Params:
 *     token = receives the lexed token; the operator tokens carry no text
 */
void lexDot(ref Token token)
{
    mixin (tokenStart);
    if (range.index + 1 < range.bytes.length)
    {
        immutable next = range.peekAt(1);
        if (next >= '0' && next <= '9')
        {
            // Nothing consumed yet: the number lexer starts at the dot
            lexNumber(token);
            return;
        }
        if (next == '.')
        {
            range.popFront();
            range.popFront();
            if (range.index < range.bytes.length && range.bytes[range.index] == '.')
            {
                range.popFront();
                token = Token(tok!"...", null, line, column, index);
            }
            else
                token = Token(tok!"..", null, line, column, index);
            return;
        }
    }
    // Dot at the end of the input or followed by anything else
    range.popFront();
    token = Token(tok!".", null, line, column, index);
}
1782 
/**
 * Lexes a three-byte newline sequence: three bytes are consumed and the
 * range's line counter is advanced.
 *
 * Params:
 *     token = receives a `whitespace` token; its text is the consumed bytes
 *             when `config.whitespaceBehavior` is
 *             `WhitespaceBehavior.include`, otherwise the empty string
 */
void lexLongNewline(ref Token token) @nogc
{
    mixin (tokenStart);
    foreach (i; 0 .. 3)
        range.popFront();
    range.incrementLine();

    string text = "";
    if (config.whitespaceBehavior == WhitespaceBehavior.include)
        text = cache.intern(range.slice(mark));
    token = Token(tok!"whitespace", text, line,
        column, index);
}
1795 
/**
 * Tells whether a newline starts at the current position: a line feed, a
 * carriage return, or the encoded form of U+2028 / U+2029.
 *
 * The current byte is read without a bounds check, so the caller must make
 * sure the input has not ended.
 */
bool isNewline() @nogc
{
    immutable ubyte current = range.bytes[range.index];
    if (current == '\n' || current == '\r')
        return true;
    // The Unicode separators need a high-bit lead byte and more input after it
    if (!(current & 0x80) || range.index + 2 >= range.bytes.length)
        return false;
    auto sequence = range.peek(2);
    return sequence == "\u2028" || sequence == "\u2029";
}
1803 
1804     bool isSeparating(size_t offset) @nogc
1805     {
1806         enum : ubyte
1807         {
1808             n, y, m // no, yes, maybe
1809         }
1810 
1811         if (range.index + offset >= range.bytes.length)
1812             return true;
1813         auto c = range.bytes[range.index + offset];
1814         static immutable ubyte[256] LOOKUP_TABLE = [
1815             y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
1816             y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
1817             y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
1818             n, n, n, n, n, n, n, n, n, n, y, y, y, y, y, y,
1819             y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n,
1820             n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, n,
1821             y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n,
1822             n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, y,
1823             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1824             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1825             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1826             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1827             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1828             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1829             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1830             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m
1831         ];
1832         immutable ubyte result = LOOKUP_TABLE[c];
1833         if (result == n)
1834             return false;
1835         if (result == y)
1836             return true;
1837         if (result == m)
1838         {
1839             auto r = range;
1840             range.popFrontN(offset);
1841             return (r.canPeek(2) && (r.peek(2) == "\u2028"
1842                 || r.peek(2) == "\u2029"));
1843         }
1844         assert (false);
1845     }
1846 
1847 
1848 
1849     enum tokenStart = q{
1850         size_t index = range.index;
1851         size_t column = range.column;
1852         size_t line = range.line;
1853         auto mark = range.mark();
1854     };
1855 
1856     void error(string message)
1857     {
1858         _messages ~= Message(range.line, range.column, message, true);
1859     }
1860 
1861     void warning(string message)
1862     {
1863         _messages ~= Message(range.line, range.column, message, false);
1864         assert (_messages.length > 0);
1865     }
1866 
1867     Message[] _messages;
1868     StringCache* cache;
1869     LexerConfig config;
1870     bool haveSSE42;
1871 }
1872 
/**
 * Lexes the given source code into a token range, using a default-initialized
 * lexer configuration and a freshly GC-allocated string cache whose bucket
 * count is derived from the length of the input.
 */
public auto byToken(R)(R range)
if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
{
    LexerConfig defaultConfig;
    immutable size_t bucketCount = optimalBucketCount(range.length);
    StringCache* freshCache = new StringCache(bucketCount);
    return DLexer(range, defaultConfig, freshCache);
}
1884 
/**
 * Lexes the given source code into a token range, using a default-initialized
 * lexer configuration and the string cache supplied by the caller.
 */
public auto byToken(R)(R range, StringCache* cache)
if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
{
    LexerConfig defaultConfig;
    auto lexer = DLexer(range, defaultConfig, cache);
    return lexer;
}
1895 
/**
 * Lexes the given source code into a token range, using the lexer
 * configuration and the string cache supplied by the caller.
 */
public auto byToken(R)(R range, const LexerConfig config, StringCache* cache)
if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
{
    auto lexer = DLexer(range, config, cache);
    return lexer;
}
1905 
/**
 * Helper function used to avoid too many allocations while lexing.
 *
 * The result is the power of two that follows the number of 32-byte chunks
 * needed to hold `size` bytes, clamped to the range $(D [1, 1 << 30]). It is
 * therefore always a valid bucket count for `StringCache`, including for an
 * empty source file.
 *
 * Params:
 *      size = The length in bytes of the source file.
 *
 * Returns:
 *      The optimal initial bucket count a `StringCache` should have.
 */
size_t optimalBucketCount(size_t size)
{
    import std.algorithm.comparison : max, min;
    import std.math : nextPow2;

    enum size_t maxBuckets = 1U << 30U;
    // Ceiling division, written so that it cannot wrap around for huge sizes.
    immutable size_t chunks = size / 32U + (size % 32U != 0 ? 1 : 0);
    // nextPow2(0) is 0, which is not a power of two: never go below 1.
    immutable size_t wanted = max(nextPow2(chunks), cast(size_t) 1);
    return min(wanted, maxBuckets);
}
///
unittest
{
    assert(optimalBucketCount(0) == 1);
    assert(optimalBucketCount(1) == 2);
    assert(optimalBucketCount(9000 * 32) == 16384);
    static if (size_t.sizeof == ulong.sizeof)
        assert(optimalBucketCount(100_000_000_000UL) == 1 << 30);
}
1928 
/**
 * The string cache is used for string interning.
 *
 * It will only store a single copy of any string that it is asked to hold.
 * Interned strings can be compared for equality by comparing their $(B .ptr)
 * field.
 *
 * Default and postblit constructors are disabled. When a StringCache goes out
 * of scope, the memory held by it is freed.
 *
 * Memory comes from the C allocator; running out of memory is reported
 * through `core.exception.onOutOfMemoryError`.
 *
 * See_also: $(LINK http://en.wikipedia.org/wiki/String_interning)
 */
struct StringCache
{
public pure nothrow @nogc:

    @disable this();
    @disable this(this);

    /**
     * Params: bucketCount = the initial number of buckets. Must be a
     * power of two
     */
    this(size_t bucketCount) nothrow @trusted @nogc
    in
    {
        import core.bitop : popcnt;
        static if (size_t.sizeof == 8)
        {
            // popcnt is applied to each 32-bit half: exactly one bit may be
            // set in the whole value.
            immutable low = popcnt(cast(uint) bucketCount);
            immutable high = popcnt(cast(uint) (bucketCount >> 32));
            assert ((low == 0 && high == 1) || (low == 1 && high == 0));
        }
        else
        {
            static assert (size_t.sizeof == 4);
            assert (popcnt(cast(uint) bucketCount) == 1);
        }
    }
    do
    {
        buckets = checkedAlloc(cast(Node**) calloc((Node*).sizeof, bucketCount))[0 .. bucketCount];
    }

    /// Releases every block, every node and every separately allocated
    /// string, then the bucket array itself.
    ~this()
    {
        Block* current = rootBlock;
        while (current !is null)
        {
            Block* prev = current;
            current = current.next;
            free(cast(void*) prev);
        }
        foreach (nodePointer; buckets)
        {
            Node* currentNode = nodePointer;
            while (currentNode !is null)
            {
                // Only strings that did not fit the block allocator own
                // their memory.
                if (currentNode.mallocated)
                    free(currentNode.str.ptr);
                Node* prev = currentNode;
                currentNode = currentNode.next;
                free(prev);
            }
        }
        rootBlock = null;
        free(buckets.ptr);
        buckets = null;
    }

    /**
     * Caches a string.
     *
     * Params: str = the bytes to intern
     *
     * Returns: the interned copy of `str`, or "" if `str` is null or empty.
     */
    string intern(const(ubyte)[] str) @safe
    {
        if (str is null || str.length == 0)
            return "";
        return _intern(str);
    }

    /// ditto
    string intern(string str) @trusted
    {
        return intern(cast(ubyte[]) str);
    }

    /**
     * The default bucket count for the string cache.
     */
    static enum defaultBucketCount = 4096;

private:

    /// Returns the stored copy of `bytes`, creating it (and its node at the
    /// head of the bucket chain) when it is not cached yet.
    string _intern(const(ubyte)[] bytes) @trusted
    {
        immutable uint hash = hashBytes(bytes);
        immutable size_t index = hash & (buckets.length - 1);
        Node* s = find(bytes, hash);
        if (s !is null)
            return cast(string) s.str;
        ubyte[] mem = void;
        bool mallocated = bytes.length > BIG_STRING;
        if (mallocated)
            mem = checkedAlloc(cast(ubyte*) malloc(bytes.length))[0 .. bytes.length];
        else
            mem = allocate(bytes.length);
        mem[] = bytes[];
        Node* node = checkedAlloc(cast(Node*) malloc(Node.sizeof));
        node.str = mem;
        node.hash = hash;
        node.next = buckets[index];
        node.mallocated = mallocated;
        buckets[index] = node;
        return cast(string) mem;
    }

    /// Returns the node holding `bytes` in the bucket selected by `hash`, or
    /// null when there is none.
    Node* find(const(ubyte)[] bytes, uint hash) @trusted
    {
        immutable size_t index = hash & (buckets.length - 1);
        Node* node = buckets[index];
        while (node !is null)
        {
            if (node.hash == hash && bytes == cast(ubyte[]) node.str)
                return node;
            node = node.next;
        }
        return node;
    }

    /// Computes a 32-bit hash of `data`, which must be non-null and non-empty.
    static uint hashBytes(const(ubyte)[] data) pure nothrow @trusted @nogc
    in
    {
        assert (data !is null);
        assert (data.length > 0);
    }
    do
    {
        immutable uint m = 0x5bd1e995;
        immutable int r = 24;
        uint h = cast(uint) data.length;
        // Mix the input four bytes at a time, least significant byte first.
        while (data.length >= 4)
        {
            uint k = (cast(ubyte) data[3]) << 24
                | (cast(ubyte) data[2]) << 16
                | (cast(ubyte) data[1]) << 8
                | (cast(ubyte) data[0]);
            k *= m;
            k ^= k >> r;
            k *= m;
            h *= m;
            h ^= k;
            data = data[4 .. $];
        }
        // Fold in the remaining one to three bytes, if any.
        switch (data.length & 3)
        {
        case 3:
            h ^= data[2] << 16;
            goto case;
        case 2:
            h ^= data[1] << 8;
            goto case;
        case 1:
            h ^= data[0];
            h *= m;
            break;
        default:
            break;
        }
        h ^= h >> 13;
        h *= m;
        h ^= h >> 15;
        return h;
    }

    /// Hands out `numBytes` bytes from one of the first blocks of the chain,
    /// or from a new block that becomes the head of the chain.
    ubyte[] allocate(size_t numBytes) pure nothrow @trusted @nogc
    in
    {
        assert (numBytes != 0);
    }
    out (result)
    {
        assert (result.length == numBytes);
    }
    do
    {
        Block* r = rootBlock;
        size_t i = 0;
        // Only the first four blocks are searched for free room.
        while  (i <= 3 && r !is null)
        {
            immutable size_t available = r.bytes.length;
            immutable size_t oldUsed = r.used;
            immutable size_t newUsed = oldUsed + numBytes;
            if (newUsed <= available)
            {
                r.used = newUsed;
                return r.bytes[oldUsed .. newUsed];
            }
            i++;
            r = r.next;
        }
        Block* b = checkedAlloc(cast(Block*) calloc(Block.sizeof, 1));
        b.used = numBytes;
        b.next = rootBlock;
        rootBlock = b;
        return b.bytes[0 .. numBytes];
    }

    /// Returns `ptr` unchanged, or reports an out-of-memory error when the C
    /// allocator returned null.
    static T* checkedAlloc(T)(T* ptr) @trusted
    {
        import core.exception : onOutOfMemoryError;
        if (ptr is null)
            onOutOfMemoryError();
        return ptr;
    }

    static struct Node
    {
        ubyte[] str = void;
        Node* next = void;
        uint hash = void;
        bool mallocated = void;
    }

    static struct Block
    {
        Block* next;
        size_t used;
        enum BLOCK_CAPACITY = BLOCK_SIZE - size_t.sizeof - (void*).sizeof;
        ubyte[BLOCK_CAPACITY] bytes;
    }

    static assert (BLOCK_SIZE == Block.sizeof);

    enum BLOCK_SIZE = 1024 * 16;

    // If a string would take up more than 1/4 of a block, allocate it outside
    // of the block.
    enum BIG_STRING = BLOCK_SIZE / 4;

    Node*[] buckets;
    Block* rootBlock;
}

// C allocator entry points, declared pure so that the cache itself can be pure.
private extern(C) void* calloc(size_t, size_t) nothrow pure @nogc @trusted;
private extern(C) void* malloc(size_t) nothrow pure @nogc @trusted;
private extern(C) void free(void*) nothrow pure @nogc @trusted;
2170 
// Lexes a simple import declaration and checks the resulting token types.
unittest
{
    // The token string ends at its matching closing brace.
    auto source = cast(ubyte[]) q{ import std.stdio;};
    auto tokens = getTokensForParser(source, LexerConfig(),
        new StringCache(StringCache.defaultBucketCount));
    assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".",
        tok!"identifier", tok!";"]));
}
2179 
/// Test \x char sequence
unittest
{
    static auto lex(string text)
    {
        return byToken(cast(ubyte[]) text);
    }

    // valid: every pair of hex digits, in both letter cases
    immutable string hexDigits = "0123456789abcdefABCDEF";
    string source = "";
    foreach (first; hexDigits)
        foreach (second; hexDigits)
            source ~= "'\\x" ~ first ~ second ~ "'";
    assert (lex(source).all!(t => t.type == tok!"characterLiteral"));

    // invalid: the column of the reported error depends on where lexing stops
    static struct Invalid
    {
        string text;
        size_t column;
    }
    foreach (bad; [Invalid(`'\x'`, 4), Invalid(`'\x_'`, 4), Invalid(`'\xA'`, 5),
            Invalid(`'\xAY'`, 5), Invalid(`'\xXX'`, 4)])
    {
        assert (lex(bad.text).messages[0]
            == DLexer.Message(1, bad.column, "Error: 2 hex digits expected.", true));
    }
}
2200 
version (iasm64NotWindows)
{
    /**
     * Skips between 0 and 16 bytes that match (or do not match) one of the
     * given $(B chars).
     *
     * The function is naked: its three unnamed parameters are read straight
     * from RDX (the bytes to scan) and from RSI and RDI (two counters that
     * are both increased by the number of skipped bytes).
     * NOTE(review): this relies on the register assignment of the D calling
     * convention on non-Windows x86-64 — confirm before changing the signature.
     */
    void skip(bool matching, chars...)(const ubyte*, ulong*, ulong*) pure nothrow
        @trusted @nogc if (chars.length <= 8)
    {
        enum constant = ByteCombine!chars;
        enum charsLength = chars.length;
        // Control byte of pcmpestri, selected at compile time.
        enum flags = matching ? 0b0001_0000 : 0b0000_0000;
        asm pure nothrow @nogc
        {
            naked;
            movdqu XMM1, [RDX];
            mov R10, constant;
            movq XMM2, R10;
            mov RAX, charsLength;
            mov RDX, 16;
            pcmpestri XMM2, XMM1, flags;
            add [RSI], RCX;
            add [RDI], RCX;
            ret;
        }
    }

    /**
     * Returns: the number of bytes starting at the given location that match
     *     (or do not match if $(B invert) is true) the byte ranges in $(B chars).
     *
     * $(B chars) is a list of range bounds, hence its even length. At most 16
     * bytes are examined.
     */
    ulong rangeMatch(bool invert, chars...)(const ubyte*) pure nothrow @trusted @nogc
    {
        static assert (chars.length % 2 == 0);
        enum constant = ByteCombine!chars;
        enum charsLength = chars.length;
        // Control byte of pcmpestri, selected at compile time.
        enum rangeMatchFlags = invert ? 0b0000_0100 : 0b0001_0100;
        asm pure nothrow @nogc
        {
            naked;
            movdqu XMM1, [RDI];
            mov R10, constant;
            movq XMM2, R10;
            mov RAX, charsLength;
            mov RDX, 16;
            pcmpestri XMM2, XMM1, rangeMatchFlags;
            mov RAX, RCX;
            ret;
        }
    }

    /// Packs up to eight byte values into one ulong, the first value ending
    /// up in the least significant byte.
    template ByteCombine(c...)
    {
        static assert (c.length <= 8);
        static if (c.length == 1)
            enum ulong ByteCombine = c[0];
        else
            enum ulong ByteCombine = c[0] | (ByteCombine!(c[1..$]) << 8);
    }
}
2267 
// Comment openers that reach the end of the input must not raise a RangeError.
unittest
{
    import core.exception : RangeError;
    import std.exception : assertNotThrown;

    static immutable string[2] openers = ["/++", "/**"];

    LexerConfig config;
    StringCache cache = StringCache(16);

    foreach (opener; openers)
        assertNotThrown!RangeError(getTokensForParser(opener, config, &cache));
}
2282 
// A string literal with the escape sequence below yields an error token and
// at least one message.
unittest
{
    static immutable source = `"\eeee"`;

    LexerConfig config;
    StringCache cache = StringCache(16);

    auto lexer = DLexer(source, config, &cache);
    immutable firstType = lexer.front().type;
    assert(firstType == tok!"");
    assert(lexer.messages.length > 0);
}
2294 
// Named character entities inside string literals: well-formed ones lex to a
// string literal without messages, malformed ones to an error token with
// exactly one message.
unittest
{
    alias Msg = DLexer.Message;
    LexerConfig cf;
    StringCache ca = StringCache(16);

    // Lexes `source` and checks the type of the first token and all messages.
    void expect(string source, IdType firstType, Msg[] diagnostics)
    {
        auto lexer = DLexer(source, cf, &ca);
        assert(lexer.front().type == firstType);
        assert(lexer.messages == diagnostics);
    }

    expect(`"\&copy;"`, tok!"stringLiteral", []);
    expect(`"\&trade;\&urcorn;"`, tok!"stringLiteral", []);
    expect(`"\&trade"`, tok!"",
        [ Msg(1, 9, "Error: invalid named character entity", true) ]);
    expect(`"\&trade;\&urcorn"`, tok!"",
        [ Msg(1, 18, "Error: invalid named character entity", true) ]);
    expect(`"\&"`, tok!"",
        [ Msg(1, 4, "Error: invalid named character entity", true) ]);
    expect(`"\&0"`, tok!"",
        [ Msg(1, 5, "Error: invalid named character entity", true) ]);
    expect(`"\&copy`, tok!"",
        [ Msg(1, 8, "Error: invalid named character entity", true) ]);
    expect(`"\&copy;`, tok!"",
        [ Msg(1, 9, "Error: unterminated string literal", true) ]);
}
2342 
// legacy code using compatibility comment and trailingComment
unittest
{
    import std.conv : to;
    import std.exception : enforce;

    static immutable src = `/// this is a module.
// mixed
/// it can do stuff
module foo.bar;

// hello

/**
 * some doc
 * hello
 */
int x; /// very nice

// TODO: do stuff
void main() {
    #line 40
    /// could be better
    writeln(":)");
}

/// end of file`;

    LexerConfig config;
    StringCache cache = StringCache(16);

    const tokens = getTokensForParser(src, config, &cache);

    // Fails with a descriptive message, attributed to the caller's location.
    void requireEqual(T)(T actual, T expected, string what, string file = __FILE__,
            size_t line = __LINE__)
    {
        immutable description = "Failed " ~ what ~ " '" ~ actual.to!string
            ~ "' == '" ~ expected.to!string ~ "'";
        enforce(actual == expected, description, file, line);
    }

    // Checks type, comment and trailing comment of the token at `index`.
    void expectToken(size_t index, IdType type, string comment, string trailingComment,
            string file = __FILE__, size_t line = __LINE__)
    {
        const current = tokens[index];
        requireEqual(current.type, type, "type", file, line);
        requireEqual(current.comment, comment, "comment", file, line);
        requireEqual(current.trailingComment, trailingComment, "trailingComment", file, line);
    }

    expectToken(0, tok!"module", "this is a module.\nit can do stuff", "");
    expectToken(1, tok!"identifier", "", "");
    expectToken(2, tok!".", "", "");
    expectToken(3, tok!"identifier", "", "");
    expectToken(4, tok!";", "", "");
    expectToken(5, tok!"int", "some doc\nhello", "");
    expectToken(6, tok!"identifier", "", "");
    expectToken(7, tok!";", "", "very nice");
    expectToken(8, tok!"void", "", "");
    expectToken(9, tok!"identifier", "", "");
    expectToken(10, tok!"(", "", "");
    expectToken(11, tok!")", "", "");
    expectToken(12, tok!"{", "", "");
    expectToken(13, tok!"identifier", "could be better", "");
    expectToken(14, tok!"(", "", "");
    expectToken(15, tok!"stringLiteral", "", "");
    expectToken(16, tok!")", "", "");
    expectToken(17, tok!";", "", "");
    expectToken(18, tok!"}", "", "");
}
2409 
// dlang-community/D-Scanner#805
// A Token read from a mutable, const or immutable owner must be usable to
// initialize an immutable Token.
unittest
{
    final class Holder
    {
        Token tok;
    }

    auto mutableHolder = new Holder();
    const constHolder = new Holder();
    immutable immutableHolder = new immutable Holder();

    immutable fromMutable = mutableHolder.tok;
    immutable fromConst = constHolder.tok;
    immutable fromImmutable = immutableHolder.tok;
}