1 module dparse.lexer;
2 
3 import std.typecons;
4 import std.typetuple;
5 import std.array;
6 import std.algorithm;
7 import std.range;
8 import std.experimental.lexer;
9 import std.traits;
10 import core.cpuid : sse42;
11 
12 public import dparse.trivia;
13 
14 /// Operators
15 private enum operators = [
16     ",", ".", "..", "...", "/", "/=", "!", "!<", "!<=", "!<>", "!<>=", "!=",
17     "!>", "!>=", "$", "%", "%=", "&", "&&", "&=", "(", ")", "*", "*=", "+", "++",
18     "+=", "-", "--", "-=", ":", ";", "<", "<<", "<<=", "<=", "<>", "<>=", "=",
19     "==", "=>", ">", ">=", ">>", ">>=", ">>>", ">>>=", "?", "@", "[", "]", "^",
20     "^=", "^^", "^^=", "{", "|", "|=", "||", "}", "~", "~="
21 ];
22 
/// Keywords
24 private enum keywords = [
25     "abstract", "alias", "align", "asm", "assert", "auto", "bool",
26     "break", "byte", "case", "cast", "catch", "cdouble", "cent", "cfloat",
27     "char", "class", "const", "continue", "creal", "dchar", "debug", "default",
28     "delegate", "delete", "deprecated", "do", "double", "else", "enum",
29     "export", "extern", "false", "final", "finally", "float", "for", "foreach",
30     "foreach_reverse", "function", "goto", "idouble", "if", "ifloat",
31     "immutable", "import", "in", "inout", "int", "interface", "invariant",
32     "ireal", "is", "lazy", "long", "macro", "mixin", "module", "new", "nothrow",
33     "null", "out", "override", "package", "pragma", "private", "protected",
34     "public", "pure", "real", "ref", "return", "scope", "shared", "short",
35     "static", "struct", "super", "switch", "synchronized", "template", "this",
36     "throw", "true", "try", "typedef", "typeid", "typeof", "ubyte", "ucent",
37     "uint", "ulong", "union", "unittest", "ushort", "version", "void",
38     "wchar", "while", "with", "__DATE__", "__EOF__", "__FILE__",
39     "__FILE_FULL_PATH__", "__FUNCTION__", "__gshared", "__LINE__", "__MODULE__",
40     "__parameters", "__PRETTY_FUNCTION__", "__TIME__", "__TIMESTAMP__", "__traits",
41     "__vector", "__VENDOR__", "__VERSION__"
42 ];
43 
44 /// Other tokens
45 private enum dynamicTokens = [
46     "specialTokenSequence", "comment", "identifier", "scriptLine",
47     "whitespace", "doubleLiteral", "floatLiteral", "idoubleLiteral",
48     "ifloatLiteral", "intLiteral", "longLiteral", "realLiteral",
49     "irealLiteral", "uintLiteral", "ulongLiteral", "characterLiteral",
50     "dstringLiteral", "stringLiteral", "wstringLiteral"
51 ];
52 
/// Flattened list of (token prefix, handler name) pairs that is passed to the
/// `Lexer` mixin inside `DLexer`. Each handler name refers to a `DLexer`
/// method taking a `ref Token`.
private enum pseudoTokenHandlers = [
    // string literals, comments, dot and character literals
    "\"", "lexStringLiteral",
    "`", "lexWysiwygString",
    "//", "lexSlashSlashComment",
    "/*", "lexSlashStarComment",
    "/+", "lexSlashPlusComment",
    ".", "lexDot",
    "'", "lexCharacterLiteral",

    // numbers: a leading zero may introduce a hex or binary prefix
    "0", "lexNumber",
    "1", "lexDecimal", "2", "lexDecimal", "3", "lexDecimal",
    "4", "lexDecimal", "5", "lexDecimal", "6", "lexDecimal",
    "7", "lexDecimal", "8", "lexDecimal", "9", "lexDecimal",

    // prefixed string forms
    "q\"", "lexDelimitedString",
    "q{", "lexTokenString",
    "r\"", "lexWysiwygString",
    "x\"", "lexHexString",

    // whitespace, including the Unicode line and paragraph separators
    " ", "lexWhitespace",
    "\t", "lexWhitespace",
    "\r", "lexWhitespace",
    "\n", "lexWhitespace",
    "\v", "lexWhitespace",
    "\f", "lexWhitespace",
    "\u2028", "lexLongNewline",
    "\u2029", "lexLongNewline",

    // '#'-introduced sequences
    "#!", "lexScriptLine",
    "#line", "lexSpecialTokenSequence"
];
86 
87 /// Token ID type for the D lexer.
88 public alias IdType = TokenIdType!(operators, dynamicTokens, keywords);
89 
90 /**
91  * Function used for converting an IdType to a string.
92  *
93  * Examples:
94  * ---
95  * IdType c = tok!"case";
96  * assert (str(c) == "case");
97  * ---
98  */
99 public alias str = tokenStringRepresentation!(IdType, operators, dynamicTokens, keywords);
100 
/**
 * Compile-time lookup from the textual form of a D token to its `IdType`
 * value.
 *
 * Accepts any string listed in the $(B operators), $(B keywords) or
 * $(B dynamicTokens) enums.
 * Example:
 * ---
 * import dparse.lexer;
 * IdType t = tok!"floatLiteral";
 * ---
 */
public alias tok(string token) = TokenId!(IdType, operators, dynamicTokens, keywords, token);
116 
mixin template TokenTriviaFields()
{
    /**
     * Trivia (whitespace and comment tokens) attached to this token.
     *
     * Every trivia token must carry its exact source text in its `.text`
     * property, so that concatenating the `.text` of all trivia and regular
     * tokens reproduces the source code without loss.
     *
     * These arrays are only populated by getTokensForParser. Iterating a
     * DLexer directly yields raw tokens, none of which are turned into
     * trivia.
     *
     * Note: getTokensForParser currently forces WhitespaceBehavior.include;
     * in the future you might have to pass it explicitly (or keep the
     * default).
     *
     * Contains: `comment`, `whitespace`, `specialTokenSequence`
     */
    immutable(typeof(this))[] leadingTrivia;
    /// ditto
    immutable(typeof(this))[] trailingTrivia;

    // Caches for `comment` and `trailingComment`; null means "not computed".
    string memoizedLeadingComment = null;
    string memoizedTrailingComment = null;

    /// Legacy accessor for the documentation comment attached in front of
    /// this token, with the comment border stripped off. The result of
    /// `extractLeadingDdoc` is stored on first use.
    string comment() const pure nothrow @safe @property {
        import dparse.trivia : extractLeadingDdoc;
        if (memoizedLeadingComment is null)
            return (cast()memoizedLeadingComment) = this.extractLeadingDdoc;
        return memoizedLeadingComment;
    }

    /// Legacy accessor for the documentation comment trailing this token,
    /// with the comment border stripped off. The result of
    /// `extractTrailingDdoc` is stored on first use.
    string trailingComment() const pure nothrow @safe @property {
        import dparse.trivia : extractTrailingDdoc;
        if (memoizedTrailingComment is null)
            return (cast()memoizedTrailingComment) = this.extractTrailingDdoc;
        return memoizedTrailingComment;
    }

    /// Orders this token relative to the byte index `i`:
    /// negative if the token starts before `i`, positive if after, else 0.
    int opCmp(size_t i) const pure nothrow @safe @nogc {
        return index < i ? -1 : (index > i ? 1 : 0);
    }

    /// Orders two tokens by their `index`.
    int opCmp(ref const typeof(this) other) const pure nothrow @safe @nogc {
        return this.opCmp(other.index);
    }
}
171 
// Mixed in from dparse.lexer to keep error messages a manageable size, as the
// entire string is dumped when there is a type mismatch.
174 private enum extraFields = "import dparse.lexer:TokenTriviaFields; mixin TokenTriviaFields;";
175 
176 /// The token type in the D lexer
177 public alias Token = std.experimental.lexer.TokenStructure!(IdType, extraFields);
178 
/**
 * Selects how `DLexer` treats whitespace tokens.
 */
public enum WhitespaceBehavior : ubyte
{
    /// Whitespace tokens are produced and carry their source text.
    include = 0,
    /// `DLexer.popFront` steps over whitespace tokens.
    skip = 1,
}
187 
188 private enum stringBehaviorNotWorking = "Automatic string parsing is not "
189     ~ "supported and was previously not working. To unescape strings use the "
190     ~ "`dparse.strings : unescapeString` function on the token texts instead.";
191 
/**
 * Selects how string literals are lexed.
 *
 * This used to be an enum; it is a struct so that individual members can be
 * deprecated while still compiling with old compilers.
 */
public struct StringBehavior
{
    /// Do not include quote characters, process escape sequences
    deprecated(stringBehaviorNotWorking)
    static immutable StringBehavior compiler = StringBehavior(0b00);

    /// Opening quotes, closing quotes, and string suffixes are included in
    /// the string token
    deprecated(stringBehaviorNotWorking)
    static immutable StringBehavior includeQuoteChars = StringBehavior(0b01);

    /// String escape sequences are not replaced
    deprecated(stringBehaviorNotWorking)
    static immutable StringBehavior notEscaped = StringBehavior(0b10);

    /// Not modified at all. Useful for formatters or highlighters
    static immutable StringBehavior source = StringBehavior(0b11);

    /// Raw flag bits; the struct converts implicitly to this value.
    ubyte behavior;
    alias behavior this;
}
211 
/**
 * Comment handling switch stored in `LexerConfig.commentBehavior`.
 *
 * NOTE(review): judging by the member names this chooses whether comment text
 * goes through the string cache; the code that reads it is not in this part
 * of the file - confirm.
 */
public enum CommentBehavior : bool
{
    /// Default value of `LexerConfig.commentBehavior`.
    intern = true,
    /// Value that `getTokensForParser` forces.
    noIntern = false,
}
/**
 * Settings passed to `DLexer` and `getTokensForParser`.
 */
public struct LexerConfig
{
    /// File name associated with the source being lexed.
    string fileName = null;
    /// String literal handling; see `StringBehavior`.
    StringBehavior stringBehavior = StringBehavior.init;
    /// Whether whitespace tokens are produced or skipped.
    WhitespaceBehavior whitespaceBehavior = WhitespaceBehavior.include;
    /// Comment handling; see `CommentBehavior`.
    CommentBehavior commentBehavior = CommentBehavior.intern;
}
227 
228 /**
229  * Basic type token types.
230  */
231 public alias BasicTypes = AliasSeq!(tok!"int", tok!"bool", tok!"byte",
232         tok!"cdouble", tok!"cent", tok!"cfloat", tok!"char", tok!"creal",
233         tok!"dchar", tok!"double", tok!"float", tok!"idouble",
234         tok!"ifloat", tok!"ireal", tok!"long", tok!"real", tok!"short",
235         tok!"ubyte", tok!"ucent", tok!"uint", tok!"ulong", tok!"ushort",
236         tok!"void", tok!"wchar");
237 
/**
 * Tests whether a token ID denotes one of the built-in basic types listed in
 * `BasicTypes`.
 *
 * Returns: true if the given ID is for a basic type.
 */
public bool isBasicType(IdType type) nothrow pure @safe @nogc
{
    switch (type)
    {
        // expands to one `case` label per entry of the sequence
        foreach (basic; BasicTypes)
        {
            case basic:
                return true;
        }
        default:
            break;
    }
    return false;
}
254 
255 /**
256  * Number literal token types.
257  */
258 public alias NumberLiterals = AliasSeq!(tok!"doubleLiteral",
259         tok!"floatLiteral", tok!"idoubleLiteral", tok!"ifloatLiteral",
260         tok!"intLiteral", tok!"longLiteral", tok!"realLiteral",
261         tok!"irealLiteral", tok!"uintLiteral", tok!"ulongLiteral");
262 
/**
 * Tests whether a token ID denotes one of the numeric literal kinds listed in
 * `NumberLiterals`.
 *
 * Returns: true if the given ID type is for a number literal.
 */
public bool isNumberLiteral(IdType type) nothrow pure @safe @nogc
{
    switch (type)
    {
        // expands to one `case` label per entry of the sequence
        foreach (literal; NumberLiterals)
        {
            case literal:
                return true;
        }
        default:
            break;
    }
    return false;
}
279 
280 /**
 * Integer literal token types.
282  */
283 public alias IntegerLiterals = AliasSeq!(tok!"intLiteral", tok!"longLiteral",
284         tok!"uintLiteral", tok!"ulongLiteral");
285 
/**
 * Tests whether a token ID denotes one of the integer literal kinds listed in
 * `IntegerLiterals`.
 *
 * Returns: true if the given ID type is for an integer literal.
 */
public bool isIntegerLiteral(IdType type) nothrow pure @safe @nogc
{
    switch (type)
    {
        // expands to one `case` label per entry of the sequence
        foreach (literal; IntegerLiterals)
        {
            case literal:
                return true;
        }
        default:
            break;
    }
    return false;
}
302 
303 /**
304  * Operator token types.
305  */
306 public alias Operators = AliasSeq!(tok!",", tok!".", tok!"..", tok!"...",
307         tok!"/", tok!"/=", tok!"!", tok!"!<", tok!"!<=", tok!"!<>",
308         tok!"!<>=", tok!"!=", tok!"!>", tok!"!>=", tok!"$", tok!"%",
309         tok!"%=", tok!"&", tok!"&&", tok!"&=", tok!"(", tok!")",
310         tok!"*", tok!"*=", tok!"+", tok!"++", tok!"+=", tok!"-",
311         tok!"--", tok!"-=", tok!":", tok!";", tok!"<", tok!"<<",
312         tok!"<<=", tok!"<=", tok!"<>", tok!"<>=", tok!"=", tok!"==",
313         tok!"=>", tok!">", tok!">=", tok!">>", tok!">>=", tok!">>>",
314         tok!">>>=", tok!"?", tok!"@", tok!"[", tok!"]", tok!"^",
315         tok!"^=", tok!"^^", tok!"^^=", tok!"{", tok!"|", tok!"|=",
316         tok!"||", tok!"}", tok!"~", tok!"~=");
317 
/**
 * Tests whether a token ID denotes one of the operators listed in
 * `Operators`.
 *
 * Returns: true if the given ID type is for an operator.
 */
public bool isOperator(IdType type) nothrow pure @safe @nogc
{
    switch (type)
    {
        // expands to one `case` label per entry of the sequence
        foreach (op; Operators)
        {
            case op:
                return true;
        }
        default:
            break;
    }
    return false;
}
334 
335 /**
336  * Keyword token types.
337  */
338 public alias Keywords = AliasSeq!(tok!"abstract", tok!"alias", tok!"align",
339         tok!"asm", tok!"assert", tok!"auto", tok!"break",
340         tok!"case", tok!"cast", tok!"catch", tok!"class", tok!"const",
341         tok!"continue", tok!"debug", tok!"default", tok!"delegate",
342         tok!"delete", tok!"deprecated", tok!"do", tok!"else", tok!"enum",
343         tok!"export", tok!"extern", tok!"false", tok!"final", tok!"finally",
344         tok!"for", tok!"foreach", tok!"foreach_reverse", tok!"function",
345         tok!"goto", tok!"if", tok!"immutable", tok!"import", tok!"in",
346         tok!"inout", tok!"interface", tok!"invariant", tok!"is",
347         tok!"lazy", tok!"macro", tok!"mixin", tok!"module", tok!"new",
348         tok!"nothrow", tok!"null", tok!"out", tok!"override", tok!"package",
349         tok!"pragma", tok!"private", tok!"protected", tok!"public",
350         tok!"pure", tok!"ref", tok!"return", tok!"scope", tok!"shared",
351         tok!"static", tok!"struct", tok!"super", tok!"switch", tok!"synchronized",
352         tok!"template", tok!"this", tok!"throw", tok!"true", tok!"try",
353         tok!"typedef", tok!"typeid", tok!"typeof", tok!"union", tok!"unittest",
354         tok!"version", tok!"while", tok!"with", tok!"__DATE__",
355         tok!"__EOF__", tok!"__FILE__", tok!"__FILE_FULL_PATH__", tok!"__FUNCTION__",
356         tok!"__gshared", tok!"__LINE__", tok!"__MODULE__", tok!"__parameters",
357         tok!"__PRETTY_FUNCTION__", tok!"__TIME__", tok!"__TIMESTAMP__",
358         tok!"__traits", tok!"__vector", tok!"__VENDOR__", tok!"__VERSION__");
359 
/**
 * Tests whether a token ID denotes one of the keywords listed in `Keywords`.
 *
 * Returns: true if the given ID type is for a keyword.
 */
public bool isKeyword(IdType type) pure nothrow @safe @nogc
{
    switch (type)
    {
        // expands to one `case` label per entry of the sequence
        foreach (keyword; Keywords)
        {
            case keyword:
                return true;
        }
        default:
            break;
    }
    return false;
}
376 
377 /**
378  * String literal token types
379  */
380 public alias StringLiterals = AliasSeq!(tok!"dstringLiteral",
381         tok!"stringLiteral", tok!"wstringLiteral");
382 
/**
 * Tests whether a token ID denotes one of the string literal kinds listed in
 * `StringLiterals`.
 *
 * Returns: true if the given ID type is for a string literal.
 */
public bool isStringLiteral(IdType type) pure nothrow @safe @nogc
{
    switch (type)
    {
        // expands to one `case` label per entry of the sequence
        foreach (literal; StringLiterals)
        {
            case literal:
                return true;
        }
        default:
            break;
    }
    return false;
}
399 
400 /**
401  * Protection token types.
402  */
403 public alias Protections = AliasSeq!(tok!"export", tok!"package",
404         tok!"private", tok!"public", tok!"protected");
405 
/**
 * Tests whether a token ID denotes one of the protection attributes listed in
 * `Protections`.
 *
 * Returns: true if the given ID type is for a protection attribute.
 */
public bool isProtection(IdType type) pure nothrow @safe @nogc
{
    switch (type)
    {
        // expands to one `case` label per entry of the sequence
        foreach (protection; Protections)
        {
            case protection:
                return true;
        }
        default:
            break;
    }
    return false;
}
422 
423 public alias SpecialTokens = AliasSeq!(tok!"__DATE__", tok!"__TIME__",
424     tok!"__TIMESTAMP__", tok!"__VENDOR__", tok!"__VERSION__", tok!"__FILE__",
425     tok!"__FILE_FULL_PATH__", tok!"__LINE__", tok!"__MODULE__",
426     tok!"__FUNCTION__", tok!"__PRETTY_FUNCTION__");
427 
/**
 * Tests whether a token ID denotes one of the special tokens listed in
 * `SpecialTokens` (`__DATE__`, `__FILE__`, `__LINE__`, ...).
 *
 * Returns: true if the given ID type is for a special token.
 */
public bool isSpecialToken(IdType type) pure nothrow @safe @nogc
{
    switch (type)
    {
        // expands to one `case` label per entry of the sequence
        foreach (special; SpecialTokens)
        {
            case special:
                return true;
        }
        default:
            break;
    }
    return false;
}
441 
442 public alias Literals = AliasSeq!(StringLiterals, NumberLiterals, tok!"characterLiteral",
443         SpecialTokens, tok!"true", tok!"false", tok!"null", tok!"$");
444 
/**
 * Tests whether a token ID is one of the entries of `Literals`: string,
 * number and character literals, the special tokens, `true`, `false`, `null`
 * and `$`.
 *
 * Returns: true if the given ID type is listed in `Literals`.
 */
public bool isLiteral(IdType type) pure nothrow @safe @nogc
{
    switch (type)
    {
        // expands to one `case` label per entry of the sequence
        foreach (literal; Literals)
        {
            case literal:
                return true;
        }
        default:
            break;
    }
    return false;
}
458 
/**
 * Lexes `sourceCode` and collects every non-trivia token into an array.
 * Whitespace, comment and specialTokenSequence tokens (trivia) are not part
 * of the result; they are attached to the token nearest to them instead.
 *
 * Trivia that starts on the same line as the previously emitted token becomes
 * that token's `trailingTrivia`. All other trivia becomes `leadingTrivia` of
 * the next token. Trivia still pending when the input ends is appended to the
 * `trailingTrivia` of the last token.
 *
 * The `whitespaceBehavior` and `commentBehavior` members of `config` are
 * overridden with `WhitespaceBehavior.include` and
 * `CommentBehavior.noIntern`.
 *
 * Params:
 *     sourceCode = the bytes of the source code to lex
 *     config = the lexer configuration (taken by value)
 *     cache = the string cache handed to `DLexer`
 * Returns: the lexed tokens, up to but excluding the `__EOF__` token.
 */
const(Token)[] getTokensForParser(R)(R sourceCode, LexerConfig config, StringCache* cache)
if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
{
    config.whitespaceBehavior = WhitespaceBehavior.include;
    config.commentBehavior = CommentBehavior.noIntern;

    auto pendingLeading = appender!(Token[])();
    pendingLeading.reserve(128);
    auto pendingTrailing = appender!(Token[])();
    pendingTrailing.reserve(128);

    auto tokens = appender!(typeof(return))();
    auto lexer = DLexer(sourceCode, config, cache);

    while (!lexer.empty)
    {
        immutable kind = lexer.front.type;
        if (kind == tok!"__EOF__")
            break;

        if (kind == tok!"specialTokenSequence"
            || kind == tok!"whitespace"
            || kind == tok!"comment")
        {
            // Same line as the last real token: it trails that token.
            if (tokens.data.length != 0 && tokens.data[$ - 1].line == lexer.front.line)
                pendingTrailing.put(lexer.front);
            else
                pendingLeading.put(lexer.front);
            lexer.popFront();
            continue;
        }

        Token current = lexer.front;
        lexer.popFront();

        // The stored tokens are const; the cast lets the trailing trivia of
        // the previous token be filled in after the fact.
        if (tokens.data.length != 0 && pendingTrailing.data.length != 0)
            (cast() tokens.data[$ - 1].trailingTrivia) = pendingTrailing.data.idup;
        current.leadingTrivia = pendingLeading.data.idup;
        pendingLeading.clear();
        pendingTrailing.clear();

        tokens.put(current);
    }

    // Whatever trivia is left at the end of input trails the last token.
    if (tokens.data.length != 0)
    {
        pendingTrailing.put(pendingLeading.data);
        (cast() tokens.data[$ - 1].trailingTrivia) = pendingTrailing.data.idup;
    }

    return tokens.data;
}
517 
518 /**
519  * The D lexer struct.
520  */
521 public struct DLexer
522 {
523     mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens,
524         keywords, pseudoTokenHandlers);
525 
526     ///
527     @disable this();
528 
529     /**
530      * Params:
531      *     range = the bytes that compose the source code that will be lexed.
532      *     config = the lexer configuration to use.
533      *     cache = the string interning cache for de-duplicating identifiers and
534      *         other token text.
535      *     haveSSE42 = Parse streaming SIMD Extensions 4.2 in inline assembly
536      */
537     this(R)(R range, const LexerConfig config, StringCache* cache,
538         bool haveSSE42 = sse42()) pure nothrow @safe
539     if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
540     {
541         this.haveSSE42 = haveSSE42;
542         auto r = (range.length >= 3 && range[0] == 0xef && range[1] == 0xbb && range[2] == 0xbf)
543             ? range[3 .. $] : range;
544         static if (is(ElementEncodingType!R == immutable))
545             this.range = LexerRange(cast(const(ubyte)[]) r);
546         else
547             this.range = LexerRange(cast(const(ubyte)[]) r.idup);
548         this.config = config;
549         this.cache = cache;
550         popFront();
551     }
552 
553     ///
554     public void popFront()() pure nothrow @safe
555     {
556         do
557             _popFront();
558         while (config.whitespaceBehavior == WhitespaceBehavior.skip
559             && _front.type == tok!"whitespace");
560     }
561 
562     /**
563      * Lexer error/warning message.
564      */
565     static struct Message
566     {
567         /// 1-based line number
568         size_t line;
569         /// 1-based byte offset
570         size_t column;
571         /// Text of the message
572         string message;
573         /// `true` for an error, `false` for a warning
574         bool isError;
575     }
576 
577     /**
578      * Returns: An array of all of the warnings and errors generated so far
579      *     during lexing. It may make sense to only check this when `empty`
580      *     returns `true`.
581      */
582     const(Message[]) messages() const @property
583     {
584         return _messages;
585     }
586 
587 private pure nothrow @safe:
588 
589     bool isWhitespace()
590     {
591         switch (range.bytes[range.index])
592         {
593         case ' ':
594         case '\r':
595         case '\n':
596         case '\t':
597         case '\v':
598         case '\f':
599             return true;
600         case 0xe2:
601             auto peek = range.peek(2);
602             return peek.length == 2
603                 && peek[0] == 0x80
604                 && (peek[1] == 0xa8 || peek[1] == 0xa9);
605         default:
606             return false;
607         }
608     }
609 
610     void popFrontWhitespaceAware()
611     {
612         switch (range.bytes[range.index])
613         {
614         case '\r':
615             range.popFront();
616             if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n')
617             {
618                 range.popFront();
619                 range.incrementLine();
620             }
621             else
622                 range.incrementLine();
623             return;
624         case '\n':
625             range.popFront();
626             range.incrementLine();
627             return;
628         case 0xe2:
629             auto lookahead = range.peek(3);
630             if (lookahead.length == 3 && lookahead[1] == 0x80
631                 && (lookahead[2] == 0xa8 || lookahead[2] == 0xa9))
632             {
633                 range.index+=3;
634                 range.column+=3;
635                 range.incrementLine();
636                 return;
637             }
638             else
639             {
640                 range.popFront();
641                 return;
642             }
643         default:
644             range.popFront();
645             return;
646         }
647     }
648 
649     void lexWhitespace(ref Token token) @trusted
650     {
651         mixin (tokenStart);
652         loop: do
653         {
654             version (X86_64)
655             {
656                 if (haveSSE42 && range.index + 16 < range.bytes.length)
657                 {
658                     skip!(true, '\t', ' ', '\v', '\f')(range.bytes.ptr + range.index,
659                         &range.index, &range.column);
660                 }
661             }
662             switch (range.bytes[range.index])
663             {
664             case '\r':
665                 range.popFront();
666                 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n')
667                 {
668                     range.popFront();
669                 }
670                 range.column = 1;
671                 range.line += 1;
672                 break;
673             case '\n':
674                 range.popFront();
675                 range.column = 1;
676                 range.line += 1;
677                 break;
678             case ' ':
679             case '\t':
680             case '\v':
681             case '\f':
682                 range.popFront();
683                 break;
684             case 0xe2:
685                 if (range.index + 2 >= range.bytes.length)
686                     break loop;
687                 if (range.bytes[range.index + 1] != 0x80)
688                     break loop;
689                 if (range.bytes[range.index + 2] == 0xa8 || range.bytes[range.index + 2] == 0xa9)
690                 {
691                     range.index += 3;
692                     range.column += 3;
693                     range.column = 1;
694                     range.line += 1;
695                     break;
696                 }
697                 break loop;
698             default:
699                 break loop;
700             }
701         } while (!(range.index >= range.bytes.length));
702         string text = config.whitespaceBehavior == WhitespaceBehavior.include
703             ? cache.intern(range.slice(mark)) : "";
704         token = Token(tok!"whitespace", text, line, column, index);
705     }
706 
707     void lexNumber(ref Token token)
708     {
709         mixin (tokenStart);
710         if (range.bytes[range.index] == '0' && range.index + 1 < range.bytes.length)
711         {
712             immutable ahead = range.bytes[range.index + 1];
713             switch (ahead)
714             {
715             case 'x':
716             case 'X':
717                 range.index += 2;
718                 range.column += 2;
719                 lexHex(token, mark, line, column, index);
720                 return;
721             case 'b':
722             case 'B':
723                 range.index += 2;
724                 range.column += 2;
725                 lexBinary(token, mark, line, column, index);
726                 return;
727             default:
728                 lexDecimal(token, mark, line, column, index);
729                 return;
730             }
731         }
732         else
733             lexDecimal(token, mark, line, column, index);
734     }
735 
736     void lexHex(ref Token token)
737     {
738         mixin (tokenStart);
739         lexHex(token, mark, line, column, index);
740     }
741 
742     void lexHex(ref Token token, size_t mark, size_t line, size_t column,
743         size_t index) @trusted
744     {
745         IdType type = tok!"intLiteral";
746         bool foundDot;
747         hexLoop: while (!(range.index >= range.bytes.length))
748         {
749             switch (range.bytes[range.index])
750             {
751             case 'a': .. case 'f':
752             case 'A': .. case 'F':
753             case '0': .. case '9':
754             case '_':
755                 version (X86_64)
756                 {
757                     if (haveSSE42 && range.index + 16 < range.bytes.length)
758                     {
759                         immutable ulong i = rangeMatch!(false, '0', '9', 'a', 'f', 'A', 'F', '_', '_')
760                             (range.bytes.ptr + range.index);
761                         range.column += i;
762                         range.index += i;
763                     }
764                     else
765                         range.popFront();
766                 }
767                 else
768                     range.popFront();
769                 break;
770             case 'u':
771             case 'U':
772                 lexIntSuffix(type);
773                 break hexLoop;
774             case 'i':
775                 if (foundDot)
776                     lexFloatSuffix(type);
777                 break hexLoop;
778             case 'L':
779                 if (foundDot)
780                     lexFloatSuffix(type);
781                 else
782                     lexIntSuffix(type);
783                 break hexLoop;
784             case 'p':
785             case 'P':
786                 lexExponent(type);
787                 break hexLoop;
788             case '.':
789                 if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.')
790                     break hexLoop;
791                 else
792                 {
793                     // The following bit of silliness tries to tell the
794                     // difference between "int dot identifier" and
795                     // "double identifier".
796                     if (range.index + 1 < range.bytes.length)
797                     {
798                         switch (range.peekAt(1))
799                         {
800                         case '0': .. case '9':
801                         case 'A': .. case 'F':
802                         case 'a': .. case 'f':
803                             goto doubleLiteral;
804                         default:
805                             break hexLoop;
806                         }
807                     }
808                     else
809                     {
810                     doubleLiteral:
811                         range.popFront();
812                         foundDot = true;
813                         type = tok!"doubleLiteral";
814                     }
815                 }
816                 break;
817             default:
818                 break hexLoop;
819             }
820         }
821         token = Token(type, cache.intern(range.slice(mark)), line, column,
822             index);
823     }
824 
825     void lexBinary(ref Token token)
826     {
827         mixin (tokenStart);
828         return lexBinary(token, mark, line, column, index);
829     }
830 
831     void lexBinary(ref Token token, size_t mark, size_t line, size_t column,
832         size_t index) @trusted
833     {
834         IdType type = tok!"intLiteral";
835         binaryLoop: while (!(range.index >= range.bytes.length))
836         {
837             switch (range.bytes[range.index])
838             {
839             case '0':
840             case '1':
841             case '_':
842                 version (X86_64)
843                 {
844                     if (haveSSE42 && range.index + 16 < range.bytes.length)
845                     {
846                         immutable ulong i = rangeMatch!(false, '0', '1', '_', '_')(
847                             range.bytes.ptr + range.index);
848                         range.column += i;
849                         range.index += i;
850                     }
851                     else
852                         range.popFront();
853                 }
854                 else
855                     range.popFront();
856                 break;
857             case 'u':
858             case 'U':
859             case 'L':
860                 lexIntSuffix(type);
861                 break binaryLoop;
862             default:
863                 break binaryLoop;
864             }
865         }
866         token = Token(type, cache.intern(range.slice(mark)), line, column,
867             index);
868     }
869 
870     void lexDecimal(ref Token token)
871     {
872         mixin (tokenStart);
873         lexDecimal(token, mark, line, column, index);
874     }
875 
876     void lexDecimal(ref Token token, size_t mark, size_t line, size_t column,
877         size_t index) @trusted
878     {
879         bool foundDot = range.bytes[range.index] == '.';
880         IdType type = tok!"intLiteral";
881         if (foundDot)
882         {
883             range.popFront();
884             type = tok!"doubleLiteral";
885         }
886 
887         decimalLoop: while (!(range.index >= range.bytes.length))
888         {
889             switch (range.bytes[range.index])
890             {
891             case '0': .. case '9':
892             case '_':
893                 version (X86_64)
894                 {
895                     if (haveSSE42 && range.index + 16 < range.bytes.length)
896                     {
897                         immutable ulong i = rangeMatch!(false, '0', '9', '_', '_')(range.bytes.ptr + range.index);
898                         range.column += i;
899                         range.index += i;
900                     }
901                     else
902                         range.popFront();
903                 }
904                 else
905                     range.popFront();
906                 break;
907             case 'u':
908             case 'U':
909                 if (!foundDot)
910                     lexIntSuffix(type);
911                 break decimalLoop;
912             case 'i':
913                 lexFloatSuffix(type);
914                 break decimalLoop;
915             case 'L':
916                 if (foundDot)
917                     lexFloatSuffix(type);
918                 else
919                     lexIntSuffix(type);
920                 break decimalLoop;
921             case 'f':
922             case 'F':
923                 lexFloatSuffix(type);
924                 break decimalLoop;
925             case 'e':
926             case 'E':
927                 lexExponent(type);
928                 break decimalLoop;
929             case '.':
930                 if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.')
931                     break decimalLoop;
932                 else
933                 {
934                     // The following bit of silliness tries to tell the
935                     // difference between "int dot identifier" and
936                     // "double identifier".
937                     if (range.index + 1 < range.bytes.length)
938                     {
939                         immutable ch = range.peekAt(1);
940                         if (ch <= 0x2f
941                             || (ch >= '0' && ch <= '9')
942                             || (ch >= ':' && ch <= '@')
943                             || (ch >= '[' && ch <= '^')
944                             || (ch >= '{' && ch <= '~')
945                             || ch == '`' || ch == '_')
946                         {
947                             goto doubleLiteral;
948                         }
949                         else
950                             break decimalLoop;
951                     }
952                     else
953                     {
954                     doubleLiteral:
955                         range.popFront();
956                         foundDot = true;
957                         type = tok!"doubleLiteral";
958                     }
959                 }
960                 break;
961             default:
962                 break decimalLoop;
963             }
964         }
965         token = Token(type, cache.intern(range.slice(mark)), line, column,
966             index);
967     }
968 
    /**
     * Consumes an integer-literal suffix (`u`/`U`, `L`/`l`, in either order,
     * optionally followed by `i`) and updates `type` accordingly.
     *
     * The byte at the current position is read without a bounds check, so
     * the range must not be empty on entry.
     *
     * Params:
     *     type = literal type so far; rewritten to the unsigned / long /
     *            imaginary variant selected by the suffix
     */
    void lexIntSuffix(ref IdType type) pure nothrow @safe
    {
        // Set when the U block is entered from the L block, i.e. for "LU".
        bool secondPass;
        if (range.bytes[range.index] == 'u' || range.bytes[range.index] == 'U')
        {
    U:
            if (type == tok!"intLiteral")
                type = tok!"uintLiteral";
            else
                type = tok!"ulongLiteral";
            range.popFront();
            // After "LU" the function returns right away; a trailing 'i' is
            // not examined on this path.
            if (secondPass)
                return;
            if (range.index < range.bytes.length
                    && (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l'))
                goto L;
            goto I;
        }
        if (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l')
        {
    L:
            if (type == tok!"uintLiteral")
                type = tok!"ulongLiteral";
            else
                type = tok!"longLiteral";
            range.popFront();
            if (range.index < range.bytes.length
                    && (range.bytes[range.index] == 'U' || range.bytes[range.index] == 'u'))
            {
                secondPass = true;
                goto U;
            }
            goto I;
        }
    I:
        // Imaginary suffix: long/ulong literals become idouble, everything
        // else becomes ifloat.
        if (range.index < range.bytes.length && range.bytes[range.index] == 'i')
        {
            warning("Complex number literals are deprecated");
            range.popFront();
            if (type == tok!"longLiteral" || type == tok!"ulongLiteral")
                type = tok!"idoubleLiteral";
            else
                type = tok!"ifloatLiteral";
        }
    }
1014 
1015     void lexFloatSuffix(ref IdType type) pure nothrow @safe
1016     {
1017         switch (range.bytes[range.index])
1018         {
1019         case 'L':
1020             range.popFront();
1021             type = tok!"doubleLiteral";
1022             break;
1023         case 'f':
1024         case 'F':
1025             range.popFront();
1026             type = tok!"floatLiteral";
1027             break;
1028         default:
1029             break;
1030         }
1031         if (range.index < range.bytes.length && range.bytes[range.index] == 'i')
1032         {
1033             warning("Complex number literals are deprecated");
1034             range.popFront();
1035             if (type == tok!"floatLiteral")
1036                 type = tok!"ifloatLiteral";
1037             else
1038                 type = tok!"idoubleLiteral";
1039         }
1040     }
1041 
1042     void lexExponent(ref IdType type) pure nothrow @safe
1043     {
1044         range.popFront();
1045         bool foundSign = false;
1046         bool foundDigit = false;
1047         while (range.index < range.bytes.length)
1048         {
1049             switch (range.bytes[range.index])
1050             {
1051             case '-':
1052             case '+':
1053                 if (foundSign)
1054                 {
1055                     if (!foundDigit)
1056                     error("Expected an exponent");
1057                     return;
1058                 }
1059                 foundSign = true;
1060                 range.popFront();
1061                 break;
1062             case '0': .. case '9':
1063             case '_':
1064                 foundDigit = true;
1065                 range.popFront();
1066                 break;
1067             case 'L':
1068             case 'f':
1069             case 'F':
1070             case 'i':
1071                 lexFloatSuffix(type);
1072                 return;
1073             default:
1074                 if (!foundDigit)
1075                     error("Expected an exponent");
1076                 return;
1077             }
1078         }
1079     }
1080 
1081     void lexScriptLine(ref Token token)
1082     {
1083         mixin (tokenStart);
1084         while (!(range.index >= range.bytes.length) && !isNewline)
1085         {
1086             range.popFront();
1087         }
1088         token = Token(tok!"scriptLine", cache.intern(range.slice(mark)),
1089             line, column, index);
1090     }
1091 
1092     void lexSpecialTokenSequence(ref Token token)
1093     {
1094         mixin (tokenStart);
1095         while (!(range.index >= range.bytes.length) && !isNewline)
1096         {
1097             range.popFront();
1098         }
1099         token = Token(tok!"specialTokenSequence", cache.intern(range.slice(mark)),
1100             line, column, index);
1101     }
1102 
1103     void lexSlashStarComment(ref Token token) @trusted
1104     {
1105         mixin (tokenStart);
1106         IdType type = tok!"comment";
1107         range.popFrontN(2);
1108         while (range.index < range.bytes.length)
1109         {
1110             version (X86_64)
1111             {
1112                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1113                     skip!(false, '\r', '\n', '/', '*', 0xe2)(range.bytes.ptr + range.index,
1114                         &range.index, &range.column);
1115             }
1116             if (range.bytes[range.index] == '*')
1117             {
1118                 range.popFront();
1119                 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/')
1120                 {
1121                     range.popFront();
1122                     break;
1123                 }
1124             }
1125             else
1126                 popFrontWhitespaceAware();
1127         }
1128         if (config.commentBehavior == CommentBehavior.intern)
1129             token = Token(type, cache.intern(range.slice(mark)), line, column, index);
1130         else
1131             token = Token(type, cast(string) range.slice(mark), line, column, index);
1132     }
1133 
1134     void lexSlashSlashComment(ref Token token) @trusted
1135     {
1136         mixin (tokenStart);
1137         IdType type = tok!"comment";
1138         range.popFrontN(2);
1139         while (range.index < range.bytes.length)
1140         {
1141             version (X86_64)
1142             {
1143                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1144                 {
1145                     skip!(false, '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
1146                         &range.index, &range.column);
1147                 }
1148             }
1149             if (range.bytes[range.index] == '\r' || range.bytes[range.index] == '\n')
1150                 break;
1151             range.popFront();
1152         }
1153         if (config.commentBehavior == CommentBehavior.intern)
1154             token = Token(type, cache.intern(range.slice(mark)), line, column, index);
1155         else
1156             token = Token(type, cast(string) range.slice(mark), line, column, index);
1157     }
1158 
1159     void lexSlashPlusComment(ref Token token) @trusted
1160     {
1161         mixin (tokenStart);
1162         IdType type = tok!"comment";
1163         range.index += 2;
1164         range.column += 2;
1165         int depth = 1;
1166         while (depth > 0 && !(range.index >= range.bytes.length))
1167         {
1168             version (X86_64)
1169             {
1170                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1171                 {
1172                     skip!(false, '+', '/', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
1173                         &range.index, &range.column);
1174                 }
1175             }
1176             if (range.bytes[range.index] == '+')
1177             {
1178                 range.popFront();
1179                 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/')
1180                 {
1181                     range.popFront();
1182                     depth--;
1183                 }
1184             }
1185             else if (range.bytes[range.index] == '/')
1186             {
1187                 range.popFront();
1188                 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '+')
1189                 {
1190                     range.popFront();
1191                     depth++;
1192                 }
1193             }
1194             else
1195                 popFrontWhitespaceAware();
1196         }
1197         if (config.commentBehavior == CommentBehavior.intern)
1198             token = Token(type, cache.intern(range.slice(mark)), line, column, index);
1199         else
1200             token = Token(type, cast(string) range.slice(mark), line, column, index);
1201     }
1202 
1203     void lexStringLiteral(ref Token token) @trusted
1204     {
1205         mixin (tokenStart);
1206         range.popFront();
1207         while (true)
1208         {
1209             if (range.index >= range.bytes.length)
1210             {
1211                 error("Error: unterminated string literal");
1212                 token = Token(tok!"");
1213                 return;
1214             }
1215             version (X86_64)
1216             {
1217                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1218                 {
1219                     skip!(false, '"', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
1220                         &range.index, &range.column);
1221                 }
1222             }
1223             if (range.bytes[range.index] == '"')
1224             {
1225                 range.popFront();
1226                 break;
1227             }
1228             else if (range.bytes[range.index] == '\\')
1229             {
1230                 if (!lexEscapeSequence())
1231                 {
1232                     token = Token.init;
1233                     return;
1234                 }
1235             }
1236             else
1237                 popFrontWhitespaceAware();
1238         }
1239         IdType type = tok!"stringLiteral";
1240         lexStringSuffix(type);
1241         token = Token(type, cache.intern(range.slice(mark)), line, column,
1242             index);
1243     }
1244 
1245     void lexWysiwygString(ref Token token) @trusted
1246     {
1247         mixin (tokenStart);
1248         IdType type = tok!"stringLiteral";
1249         immutable bool backtick = range.bytes[range.index] == '`';
1250         if (backtick)
1251         {
1252             range.popFront();
1253             while (true)
1254             {
1255                 if (range.index >= range.bytes.length)
1256                 {
1257                     error("Error: unterminated string literal");
1258                     token = Token(tok!"");
1259                     return;
1260                 }
1261                 version (X86_64)
1262                 {
1263                     if (haveSSE42 && range.index + 16 < range.bytes.length)
1264                     {
1265                         skip!(false, '\r', '\n', 0xe2, '`')(range.bytes.ptr + range.index,
1266                             &range.index, &range.column);
1267                     }
1268                 }
1269                 if (range.bytes[range.index] == '`')
1270                 {
1271                     range.popFront();
1272                     break;
1273                 }
1274                 else
1275                     popFrontWhitespaceAware();
1276             }
1277         }
1278         else
1279         {
1280             range.popFront();
1281             if (range.index >= range.bytes.length)
1282             {
1283                 error("Error: unterminated string literal");
1284                 token = Token(tok!"");
1285                 return;
1286             }
1287             range.popFront();
1288             while (true)
1289             {
1290                 if (range.index >= range.bytes.length)
1291                 {
1292                     error("Error: unterminated string literal");
1293                     token = Token(tok!"");
1294                     return;
1295                 }
1296                 else if (range.bytes[range.index] == '"')
1297                 {
1298                     range.popFront();
1299                     break;
1300                 }
1301                 else
1302                     popFrontWhitespaceAware();
1303             }
1304         }
1305         lexStringSuffix(type);
1306         token = Token(type, cache.intern(range.slice(mark)), line, column,
1307             index);
1308     }
1309 
1310     private ubyte lexStringSuffix(ref IdType type) pure nothrow @safe
1311     {
1312         if (range.index >= range.bytes.length)
1313         {
1314             type = tok!"stringLiteral";
1315             return 0;
1316         }
1317         else
1318         {
1319             switch (range.bytes[range.index])
1320             {
1321             case 'w': range.popFront(); type = tok!"wstringLiteral"; return 'w';
1322             case 'd': range.popFront(); type = tok!"dstringLiteral"; return 'd';
1323             case 'c': range.popFront(); type = tok!"stringLiteral"; return 'c';
1324             default: type = tok!"stringLiteral"; return 0;
1325             }
1326         }
1327     }
1328 
1329     void lexDelimitedString(ref Token token)
1330     {
1331         mixin (tokenStart);
1332         range.index += 2;
1333         range.column += 2;
1334         ubyte open;
1335         ubyte close;
1336         switch (range.bytes[range.index])
1337         {
1338         case '<':
1339             open = '<';
1340             close = '>';
1341             range.popFront();
1342             lexNormalDelimitedString(token, mark, line, column, index, open, close);
1343             break;
1344         case '{':
1345             open = '{';
1346             close = '}';
1347             range.popFront();
1348             lexNormalDelimitedString(token, mark, line, column, index, open, close);
1349             break;
1350         case '[':
1351             open = '[';
1352             close = ']';
1353             range.popFront();
1354             lexNormalDelimitedString(token, mark, line, column, index, open, close);
1355             break;
1356         case '(':
1357             open = '(';
1358             close = ')';
1359             range.popFront();
1360             lexNormalDelimitedString(token, mark, line, column, index, open, close);
1361             break;
1362         default:
1363             lexHeredocString(token, mark, line, column, index);
1364             break;
1365         }
1366     }
1367 
1368     void lexNormalDelimitedString(ref Token token, size_t mark, size_t line, size_t column,
1369         size_t index, ubyte open, ubyte close)
1370     {
1371         int depth = 1;
1372         while (!(range.index >= range.bytes.length) && depth > 0)
1373         {
1374             if (range.bytes[range.index] == open)
1375             {
1376                 depth++;
1377                 range.popFront();
1378             }
1379             else if (range.bytes[range.index] == close)
1380             {
1381                 depth--;
1382                 range.popFront();
1383                 if (depth <= 0)
1384                 {
1385                     if (range.bytes[range.index] == '"')
1386                     {
1387                         range.popFront();
1388                     }
1389                     else
1390                     {
1391                         error("Error: `\"` expected to end delimited string literal");
1392                         token = Token(tok!"");
1393                         return;
1394                     }
1395                 }
1396             }
1397             else
1398                 popFrontWhitespaceAware();
1399         }
1400         IdType type = tok!"stringLiteral";
1401         lexStringSuffix(type);
1402         token = Token(type, cache.intern(range.slice(mark)), line, column, index);
1403     }
1404 
1405     void lexHeredocString(ref Token token, size_t mark, size_t line, size_t column, size_t index)
1406     {
1407         Token ident;
1408         lexIdentifier(ident);
1409         if (isNewline())
1410             popFrontWhitespaceAware();
1411         else
1412             error("Newline expected");
1413         while (!(range.index >= range.bytes.length))
1414         {
1415             if (isNewline())
1416             {
1417                 popFrontWhitespaceAware();
1418                 if (!range.canPeek(ident.text.length))
1419                 {
1420                     error(ident.text ~ " expected");
1421                     break;
1422                 }
1423                 if (range.peek(ident.text.length - 1) == ident.text)
1424                 {
1425                     range.popFrontN(ident.text.length);
1426                     break;
1427                 }
1428             }
1429             else
1430             {
1431                 range.popFront();
1432             }
1433         }
1434         if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '"')
1435         {
1436             range.popFront();
1437         }
1438         else
1439             error("`\"` expected");
1440         IdType type = tok!"stringLiteral";
1441         lexStringSuffix(type);
1442         token = Token(type, cache.intern(range.slice(mark)), line, column, index);
1443     }
1444 
1445     void lexTokenString(ref Token token)
1446     {
1447         mixin (tokenStart);
1448         assert (range.bytes[range.index] == 'q');
1449         range.popFront();
1450         assert (range.bytes[range.index] == '{');
1451         range.popFront();
1452         auto app = appender!string();
1453         app.put("q{");
1454         int depth = 1;
1455 
1456         immutable WhitespaceBehavior oldWhitespace = config.whitespaceBehavior;
1457         immutable StringBehavior oldString = config.stringBehavior;
1458         config.whitespaceBehavior = WhitespaceBehavior.include;
1459         config.stringBehavior = StringBehavior.source;
1460         scope (exit)
1461         {
1462             config.whitespaceBehavior = oldWhitespace;
1463             config.stringBehavior = oldString;
1464         }
1465 
1466         advance(_front);
1467         while (depth > 0 && !empty)
1468         {
1469             auto t = front();
1470             if (t.text is null)
1471                 app.put(str(t.type));
1472             else
1473                 app.put(t.text);
1474             if (t.type == tok!"}")
1475             {
1476                 depth--;
1477                 if (depth > 0)
1478                 popFront();
1479             }
1480             else if (t.type == tok!"{")
1481             {
1482                 depth++;
1483                 popFront();
1484             }
1485             else
1486                 popFront();
1487         }
1488         IdType type = tok!"stringLiteral";
1489         auto b = lexStringSuffix(type);
1490         if (b != 0)
1491             app.put(b);
1492         token = Token(type, cache.intern(cast(const(ubyte)[]) app.data), line,
1493             column, index);
1494     }
1495 
1496     void lexHexString(ref Token token)
1497     {
1498         mixin (tokenStart);
1499         range.index += 2;
1500         range.column += 2;
1501 
1502         loop: while (true)
1503         {
1504             if (range.index >= range.bytes.length)
1505             {
1506                 error("Error: unterminated hex string literal");
1507                 token = Token(tok!"");
1508                 return;
1509             }
1510             else if (isWhitespace())
1511                 popFrontWhitespaceAware();
1512             else switch (range.bytes[range.index])
1513             {
1514             case '0': .. case '9':
1515             case 'A': .. case 'F':
1516             case 'a': .. case 'f':
1517                 range.popFront();
1518                 break;
1519             case '"':
1520                 range.popFront();
1521                 break loop;
1522             default:
1523                 error("Error: invalid character in hex string");
1524                 token = Token(tok!"");
1525                 return;
1526             }
1527         }
1528 
1529         IdType type = tok!"stringLiteral";
1530         lexStringSuffix(type);
1531         token = Token(type, cache.intern(range.slice(mark)), line, column,
1532             index);
1533     }
1534 
1535     bool lexNamedEntity()
1536     in { assert (range.bytes[range.index] == '&'); }
1537     do
1538     {
1539         Token t;
1540         range.popFront();
1541         lexIdentifier(t, true);
1542         if (t.type != tok!"identifier" || range.empty || range.bytes[range.index] != ';')
1543         {
1544             error("Error: invalid named character entity");
1545             return false;
1546         }
1547         range.popFront();
1548         return true;
1549     }
1550 
1551     bool lexEscapeSequence()
1552     {
1553         range.popFront();
1554         if (range.index >= range.bytes.length)
1555         {
1556             error("Error: non-terminated character escape sequence.");
1557             return false;
1558         }
1559         switch (range.bytes[range.index])
1560         {
1561         case '&': return lexNamedEntity();
1562         case '\'':
1563         case '"':
1564         case '?':
1565         case '\\':
1566         case 'a':
1567         case 'b':
1568         case 'f':
1569         case 'n':
1570         case 'r':
1571         case 't':
1572         case 'v':
1573             range.popFront();
1574             break;
1575         case 'x':
1576             range.popFront();
1577             foreach (i; 0 .. 2)
1578             {
1579                 if (range.index >= range.bytes.length)
1580                 {
1581                     error("Error: 2 hex digits expected.");
1582                     return false;
1583                 }
1584                 switch (range.bytes[range.index])
1585                 {
1586                 case '0': .. case '9':
1587                 case 'a': .. case 'f':
1588                 case 'A': .. case 'F':
1589                     range.popFront();
1590                     break;
1591                 default:
1592                     error("Error: 2 hex digits expected.");
1593                     return false;
1594                 }
1595             }
1596             break;
1597         case '0':
1598             if (!(range.index + 1 < range.bytes.length)
1599                 || ((range.index + 1 < range.bytes.length) && range.peekAt(1) == '\''))
1600             {
1601                 range.popFront();
1602                 break;
1603             }
1604             goto case;
1605         case '1': .. case '7':
1606             for (size_t i = 0; i < 3 && !(range.index >= range.bytes.length)
1607                     && range.bytes[range.index] >= '0' && range.bytes[range.index] <= '7'; i++)
1608                 range.popFront();
1609             break;
1610         case 'u':
1611             range.popFront();
1612             foreach (i; 0 .. 4)
1613             {
1614                 if (range.index >= range.bytes.length)
1615                 {
1616                     error("Error: at least 4 hex digits expected.");
1617                     return false;
1618                 }
1619                 switch (range.bytes[range.index])
1620                 {
1621                 case '0': .. case '9':
1622                 case 'a': .. case 'f':
1623                 case 'A': .. case 'F':
1624                     range.popFront();
1625                     break;
1626                 default:
1627                     error("Error: at least 4 hex digits expected.");
1628                     return false;
1629                 }
1630             }
1631             break;
1632         case 'U':
1633             range.popFront();
1634             foreach (i; 0 .. 8)
1635             {
1636                 if (range.index >= range.bytes.length)
1637                 {
1638                     error("Error: at least 8 hex digits expected.");
1639                     return false;
1640                 }
1641                 switch (range.bytes[range.index])
1642                 {
1643                 case '0': .. case '9':
1644                 case 'a': .. case 'f':
1645                 case 'A': .. case 'F':
1646                     range.popFront();
1647                     break;
1648                 default:
1649                     error("Error: at least 8 hex digits expected.");
1650                     return false;
1651                 }
1652             }
1653             break;
1654         default:
1655             error("Invalid escape sequence");
1656             while (true)
1657             {
1658                 if (range.index >= range.bytes.length)
1659                 {
1660                     error("Error: non-terminated character escape sequence.");
1661                     break;
1662                 }
1663                 if (range.bytes[range.index] == ';')
1664                 {
1665                     range.popFront();
1666                     break;
1667                 }
1668                 else
1669                 {
1670                     range.popFront();
1671                 }
1672             }
1673             return false;
1674         }
1675         return true;
1676     }
1677 
1678     void lexCharacterLiteral(ref Token token)
1679     {
1680         mixin (tokenStart);
1681         range.popFront();
1682         if (range.empty)
1683             goto err;
1684         if (range.bytes[range.index] == '\\')
1685             lexEscapeSequence();
1686         else if (range.bytes[range.index] == '\'')
1687         {
1688             range.popFront();
1689             token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
1690                 line, column, index);
1691         }
1692         else if (range.bytes[range.index] & 0x80)
1693         {
1694             while (range.bytes[range.index] & 0x80)
1695                 range.popFront();
1696         }
1697         else
1698             popFrontWhitespaceAware();
1699 
1700         if (range.index < range.bytes.length && range.bytes[range.index] == '\'')
1701         {
1702             range.popFront();
1703             token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
1704                 line, column, index);
1705         }
1706         else
1707         {
1708     err:
1709             error("Error: Expected `'` to end character literal");
1710             token = Token(tok!"");
1711         }
1712     }
1713 
1714     void lexIdentifier(ref Token token, const bool silent = false) @trusted
1715     {
1716         mixin (tokenStart);
1717 
1718         if (isSeparating(0))
1719         {
1720             if (silent) return;
1721 
1722             error("Invalid identifier");
1723             range.popFront();
1724         }
1725         while (true)
1726         {
1727             version (X86_64)
1728             {
1729                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1730                 {
1731                     immutable ulong i = rangeMatch!(false, 'a', 'z', 'A', 'Z', '_', '_')
1732                         (range.bytes.ptr + range.index);
1733                     range.column += i;
1734                     range.index += i;
1735                 }
1736             }
1737             if (isSeparating(0))
1738                 break;
1739             else
1740                 range.popFront();
1741         }
1742         token = Token(tok!"identifier", cache.intern(range.slice(mark)), line,
1743             column, index);
1744     }
1745 
1746     void lexDot(ref Token token)
1747     {
1748         mixin (tokenStart);
1749         if (!(range.index + 1 < range.bytes.length))
1750         {
1751             range.popFront();
1752             token = Token(tok!".", null, line, column, index);
1753             return;
1754         }
1755         switch (range.peekAt(1))
1756         {
1757         case '0': .. case '9':
1758             lexNumber(token);
1759             return;
1760         case '.':
1761             range.popFront();
1762             range.popFront();
1763             if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '.')
1764             {
1765                 range.popFront();
1766                 token = Token(tok!"...", null, line, column, index);
1767             }
1768             else
1769                 token = Token(tok!"..", null, line, column, index);
1770             return;
1771         default:
1772             range.popFront();
1773             token = Token(tok!".", null, line, column, index);
1774             return;
1775         }
1776     }
1777 
1778     void lexLongNewline(ref Token token) @nogc
1779     {
1780         mixin (tokenStart);
1781         range.popFront();
1782         range.popFront();
1783         range.popFront();
1784         range.incrementLine();
1785         string text = config.whitespaceBehavior == WhitespaceBehavior.include
1786             ? cache.intern(range.slice(mark)) : "";
1787         token = Token(tok!"whitespace", text, line,
1788             column, index);
1789     }
1790 
1791     bool isNewline() @nogc
1792     {
1793         if (range.bytes[range.index] == '\n') return true;
1794         if (range.bytes[range.index] == '\r') return true;
1795         return (range.bytes[range.index] & 0x80) && (range.index + 2 < range.bytes.length)
1796             && (range.peek(2) == "\u2028" || range.peek(2) == "\u2029");
1797     }
1798 
1799     bool isSeparating(size_t offset) @nogc
1800     {
1801         enum : ubyte
1802         {
1803             n, y, m // no, yes, maybe
1804         }
1805 
1806         if (range.index + offset >= range.bytes.length)
1807             return true;
1808         auto c = range.bytes[range.index + offset];
1809         static immutable ubyte[256] LOOKUP_TABLE = [
1810             y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
1811             y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
1812             y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
1813             n, n, n, n, n, n, n, n, n, n, y, y, y, y, y, y,
1814             y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n,
1815             n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, n,
1816             y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n,
1817             n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, y,
1818             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1819             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1820             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1821             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1822             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1823             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1824             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1825             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m
1826         ];
1827         immutable ubyte result = LOOKUP_TABLE[c];
1828         if (result == n)
1829             return false;
1830         if (result == y)
1831             return true;
1832         if (result == m)
1833         {
1834             auto r = range;
1835             range.popFrontN(offset);
1836             return (r.canPeek(2) && (r.peek(2) == "\u2028"
1837                 || r.peek(2) == "\u2029"));
1838         }
1839         assert (false);
1840     }
1841 
1842 
1843 
    /**
     * Code fragment intended to be mixed in (`mixin (tokenStart);`) at the
     * beginning of a lexing function.
     *
     * It declares four locals in the enclosing scope: `index`, `column` and
     * `line`, which snapshot the corresponding properties of `range`, and
     * `mark`, which holds the result of `range.mark()`.
     */
    enum tokenStart = q{
        size_t index = range.index;
        size_t column = range.column;
        size_t line = range.line;
        auto mark = range.mark();
    };
1850 
1851     void error(string message)
1852     {
1853         _messages ~= Message(range.line, range.column, message, true);
1854     }
1855 
1856     void warning(string message)
1857     {
1858         _messages ~= Message(range.line, range.column, message, false);
1859         assert (_messages.length > 0);
1860     }
1861 
    // Diagnostics collected so far; `error` and `warning` append to it.
    Message[] _messages;
    // String cache used to intern token text (see `lexLongNewline`).
    StringCache* cache;
    // Lexer configuration; `whitespaceBehavior` is consulted when building
    // whitespace tokens.
    LexerConfig config;
    // NOTE(review): presumably records whether the SSE 4.2 code paths may be
    // used; it is not read in this part of the file, confirm at its users.
    bool haveSSE42;
1866 }
1867 
/**
 * Lexes `range` with a default-initialized $(D LexerConfig) and a freshly
 * GC-allocated $(D StringCache) whose bucket count is derived from the length
 * of the input.
 *
 * Params:
 *     range = the source code, as a dynamic array of byte-sized elements.
 *
 * Returns: a $(D DLexer) over `range`.
 */
public auto byToken(R)(R range)
if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
{
    auto cache = new StringCache(optimalBucketCount(range.length));
    LexerConfig config;
    return DLexer(range, config, cache);
}
1879 
/**
 * Lexes `range` with a default-initialized $(D LexerConfig), interning
 * strings in the caller-supplied cache.
 *
 * Params:
 *     range = the source code, as a dynamic array of byte-sized elements.
 *     cache = the string cache handed to the lexer.
 *
 * Returns: a $(D DLexer) over `range`.
 */
public auto byToken(R)(R range, StringCache* cache)
if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
{
    LexerConfig defaults;
    return DLexer(range, defaults, cache);
}
1890 
/**
 * Lexes `range` using both the caller-supplied configuration and the
 * caller-supplied string cache.
 *
 * Params:
 *     range = the source code, as a dynamic array of byte-sized elements.
 *     config = the lexer configuration.
 *     cache = the string cache handed to the lexer.
 *
 * Returns: a $(D DLexer) over `range`.
 */
public auto byToken(R)(R range, const LexerConfig config, StringCache* cache)
if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
{
    auto lexer = DLexer(range, config, cache);
    return lexer;
}
1900 
/**
 * Helper function used to avoid too many allocations while lexing.
 *
 * The count is based on one bucket per 32 bytes of source (rounded up),
 * raised to the next power of two and capped at `1 << 30`. The result is
 * always a power of two and never zero, so it is always a valid argument for
 * the `StringCache` constructor, including for an empty source.
 *
 * Params:
 *      size = The length in bytes of the source file.
 *
 * Returns:
 *      The optimal initial bucket count a `StringCache` should have.
 */
size_t optimalBucketCount(size_t size)
{
    import std.math : nextPow2;

    // Ceiling division written so that it cannot wrap around for huge sizes.
    immutable size_t chunks = size / 32U + (size % 32U != 0 ? 1 : 0);
    // nextPow2(0) is 0, which is not a usable bucket count.
    if (chunks == 0)
        return 1;
    return nextPow2(chunks).min(1U << 30U);
}
///
unittest
{
    // A single byte still needs one 32-byte chunk; the next power of two
    // after 1 is 2.
    assert(optimalBucketCount(1) == 2);
    // 9000 chunks round up to 16384 buckets.
    assert(optimalBucketCount(9000 * 32) == 16384);
    // The result is capped at 1 << 30 (only expressible with 64-bit size_t).
    static if (size_t.sizeof == ulong.sizeof)
        assert(optimalBucketCount(100_000_000_000UL) == 1 << 30);
}
1923 
/**
 * The string cache is used for string interning.
 *
 * It will only store a single copy of any string that it is asked to hold.
 * Interned strings can be compared for equality by comparing their $(B .ptr)
 * field.
 *
 * Default and postblit constructors are disabled. When a StringCache goes out
 * of scope, the memory held by it is freed.
 *
 * All storage comes from the C allocator. If an allocation fails,
 * `core.exception.onOutOfMemoryError` is called instead of dereferencing a
 * null pointer.
 *
 * See_also: $(LINK http://en.wikipedia.org/wiki/String_interning)
 */
struct StringCache
{
public pure nothrow @nogc:

    @disable this();
    @disable this(this);

    /**
     * Params: bucketCount = the initial number of buckets. Must be a
     * power of two
     */
    this(size_t bucketCount) nothrow @trusted @nogc
    in
    {
        import core.bitop : popcnt;
        static if (size_t.sizeof == 8)
        {
            immutable low = popcnt(cast(uint) bucketCount);
            immutable high = popcnt(cast(uint) (bucketCount >> 32));
            assert ((low == 0 && high == 1) || (low == 1 && high == 0));
        }
        else
        {
            static assert (size_t.sizeof == 4);
            assert (popcnt(cast(uint) bucketCount) == 1);
        }
    }
    do
    {
        import core.exception : onOutOfMemoryError;

        auto table = cast(Node**) calloc((Node*).sizeof, bucketCount);
        // calloc may legitimately return null for a zero-sized request, so
        // only a non-empty request counts as an allocation failure.
        if (table is null && bucketCount != 0)
            onOutOfMemoryError();
        buckets = table[0 .. bucketCount];
    }

    /// Releases every block, node and individually allocated string.
    ~this()
    {
        Block* current = rootBlock;
        while (current !is null)
        {
            Block* prev = current;
            current = current.next;
            free(cast(void*) prev);
        }
        foreach (nodePointer; buckets)
        {
            Node* currentNode = nodePointer;
            while (currentNode !is null)
            {
                // Big strings live outside the blocks and are freed one by one.
                if (currentNode.mallocated)
                    free(currentNode.str.ptr);
                Node* prev = currentNode;
                currentNode = currentNode.next;
                free(prev);
            }
        }
        rootBlock = null;
        free(buckets.ptr);
        buckets = null;
    }

    /**
     * Caches a string.
     *
     * Params: str = the bytes to intern.
     * Returns: the cached copy of `str`, or the empty string when `str` is
     *     null or empty.
     */
    string intern(const(ubyte)[] str) @safe
    {
        if (str is null || str.length == 0)
            return "";
        return _intern(str);
    }

    /**
     * ditto
     */
    string intern(string str) @trusted
    {
        return intern(cast(ubyte[]) str);
    }

    /**
     * The default bucket count for the string cache.
     */
    static enum defaultBucketCount = 4096;

private:

    // Looks `bytes` up and, when absent, copies it into cache-owned memory and
    // links a new node at the head of its bucket. `bytes` must be non-empty.
    string _intern(const(ubyte)[] bytes) @trusted
    {
        import core.exception : onOutOfMemoryError;

        immutable uint hash = hashBytes(bytes);
        immutable size_t index = hash & (buckets.length - 1);
        Node* s = find(bytes, hash);
        if (s !is null)
            return cast(string) s.str;
        ubyte[] mem = void;
        bool mallocated = bytes.length > BIG_STRING;
        if (mallocated)
        {
            auto raw = cast(ubyte*) malloc(bytes.length);
            if (raw is null)
                onOutOfMemoryError();
            mem = raw[0 .. bytes.length];
        }
        else
            mem = allocate(bytes.length);
        mem[] = bytes[];
        Node* node = cast(Node*) malloc(Node.sizeof);
        if (node is null)
            onOutOfMemoryError();
        node.str = mem;
        node.hash = hash;
        node.next = buckets[index];
        node.mallocated = mallocated;
        buckets[index] = node;
        return cast(string) mem;
    }

    // Returns the node holding `bytes`, or null when it is not cached. The
    // stored hash is compared first to skip most byte comparisons.
    Node* find(const(ubyte)[] bytes, uint hash) @trusted
    {
        immutable size_t index = hash & (buckets.length - 1);
        Node* node = buckets[index];
        while (node !is null)
        {
            if (node.hash == hash && bytes == node.str)
                return node;
            node = node.next;
        }
        return null;
    }

    // Hashes `data` four bytes at a time, then mixes in the remaining 1-3
    // bytes and applies a final avalanche step.
    static uint hashBytes(const(ubyte)[] data) pure nothrow @trusted @nogc
    in
    {
        assert (data !is null);
        assert (data.length > 0);
    }
    do
    {
        immutable uint m = 0x5bd1e995;
        immutable int r = 24;
        uint h = cast(uint) data.length;
        while (data.length >= 4)
        {
            uint k = (cast(ubyte) data[3]) << 24
                | (cast(ubyte) data[2]) << 16
                | (cast(ubyte) data[1]) << 8
                | (cast(ubyte) data[0]);
            k *= m;
            k ^= k >> r;
            k *= m;
            h *= m;
            h ^= k;
            data = data[4 .. $];
        }
        switch (data.length & 3)
        {
        case 3:
            h ^= data[2] << 16;
            goto case;
        case 2:
            h ^= data[1] << 8;
            goto case;
        case 1:
            h ^= data[0];
            h *= m;
            break;
        default:
            break;
        }
        h ^= h >> 13;
        h *= m;
        h ^= h >> 15;
        return h;
    }

    // Carves `numBytes` out of one of the first four blocks in the list, or
    // out of a new block pushed at the front when none of them has room.
    ubyte[] allocate(size_t numBytes) pure nothrow @trusted @nogc
    in
    {
        assert (numBytes != 0);
    }
    out (result)
    {
        assert (result.length == numBytes);
    }
    do
    {
        import core.exception : onOutOfMemoryError;

        Block* r = rootBlock;
        size_t i = 0;
        while  (i <= 3 && r !is null)
        {
            immutable size_t available = r.bytes.length;
            immutable size_t oldUsed = r.used;
            immutable size_t newUsed = oldUsed + numBytes;
            if (newUsed <= available)
            {
                r.used = newUsed;
                return r.bytes[oldUsed .. newUsed];
            }
            i++;
            r = r.next;
        }
        Block* b = cast(Block*) calloc(Block.sizeof, 1);
        if (b is null)
            onOutOfMemoryError();
        b.used = numBytes;
        b.next = rootBlock;
        rootBlock = b;
        return b.bytes[0 .. numBytes];
    }

    // One cached string; nodes of the same bucket form a singly linked list.
    static struct Node
    {
        ubyte[] str = void;
        Node* next = void;
        uint hash = void;
        // True when `str` was malloc'ed on its own rather than taken from a
        // block.
        bool mallocated = void;
    }

    // Fixed-size arena for small strings; `used` counts the bytes handed out.
    static struct Block
    {
        Block* next;
        size_t used;
        enum BLOCK_CAPACITY = BLOCK_SIZE - size_t.sizeof - (void*).sizeof;
        ubyte[BLOCK_CAPACITY] bytes;
    }

    static assert (BLOCK_SIZE == Block.sizeof);

    enum BLOCK_SIZE = 1024 * 16;

    // If a string would take up more than 1/4 of a block, allocate it outside
    // of the block.
    enum BIG_STRING = BLOCK_SIZE / 4;

    Node*[] buckets;
    Block* rootBlock;
}
2161 
// Private prototypes for the C allocator. They are declared `pure nothrow
// @nogc @trusted` here so that they can be called from StringCache, whose
// members carry the `pure nothrow @nogc` attributes.
private extern(C) void* calloc(size_t, size_t) nothrow pure @nogc @trusted;
private extern(C) void* malloc(size_t) nothrow pure @nogc @trusted;
private extern(C) void free(void*) nothrow pure @nogc @trusted;
2165 
// A simple import declaration lexes into the expected token types.
unittest
{
    // The token string ends at its closing brace; the `c` postfix marks it as
    // a UTF-8 string. (A second `}` here would close the unittest early and
    // break compilation.)
    auto source = cast(ubyte[]) q{ import std.stdio;}c;
    auto tokens = getTokensForParser(source, LexerConfig(),
        new StringCache(StringCache.defaultBucketCount));
    assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".",
        tok!"identifier", tok!";"]));
}
2174 
/// Test \x char sequence
unittest
{
    // Shorthand: lex a string with the default configuration.
    auto toks = (string s) => byToken(cast(ubyte[])s);

    // valid
    // Every combination of two hex digits, in either case, must lex as a
    // character literal and nothing else.
    immutable hex = ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f','A','B','C','D','E','F'];
    auto source = "";
    foreach (h1; hex)
        foreach (h2; hex)
            source ~= "'\\x" ~ h1 ~ h2 ~ "'";
    assert (toks(source).filter!(t => t.type != tok!"characterLiteral").empty);

    // invalid
    // Missing or non-hex digits must produce the "2 hex digits expected"
    // error at the line and column given as the first two Message arguments.
    assert (toks(`'\x'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true));
    assert (toks(`'\x_'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true));
    assert (toks(`'\xA'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true));
    assert (toks(`'\xAY'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true));
    assert (toks(`'\xXX'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true));
}
2195 
2196 version (X86_64)
2197 {
    // Selects which inline-assembler dialect `pcmpestri` is compiled with:
    // `true` picks the DMD-style asm block, `false` the GDC-style one.
    version (DigitalMars)
        private enum useDMDStyle = true;
    else version (LDC)
        private enum useDMDStyle = (__VERSION__ < 2092); // GDC-style supported since v1.22
    else
        private enum useDMDStyle = false; // not supported by GDC
2204 
    /**
     * Thin wrapper around the SSE 4.2 `pcmpestri` instruction.
     *
     * The 16 bytes starting at `bytes` are loaded (unaligned) into XMM1 and
     * compared against `chars`, which are packed into a 64-bit constant and
     * placed in XMM2. The explicit lengths passed to the instruction are
     * `chars.length` for `chars` and 16 for the input.
     *
     * Params:
     *     flags = the immediate control byte of the instruction.
     *     chars = at most 8 byte values to compare against.
     *     bytes = start of the input; the load reads 16 bytes from here.
     *
     * Returns: the index the instruction leaves in RCX.
     */
    private ulong pcmpestri(ubyte flags, chars...)(const ubyte* bytes) pure nothrow
        @trusted @nogc if (chars.length <= 8)
    {
        enum constant = ByteCombine!chars;
        enum charsLength = chars.length;

        static if (useDMDStyle)
        {
            // Naked function: no prologue/epilogue is generated, so the
            // argument is read straight from its ABI register and the asm
            // block returns by itself.
            asm pure nothrow @nogc
            {
                naked;
            }
            version (Windows) // `bytes` in RCX
                asm pure nothrow @nogc { movdqu XMM1, [RCX]; }
            else // `bytes` in RDI
                asm pure nothrow @nogc { movdqu XMM1, [RDI]; }
            asm pure nothrow @nogc
            {
                mov R10, constant;
                movq XMM2, R10;
                mov RAX, charsLength;
                mov RDX, 16;
                pcmpestri XMM2, XMM1, flags;
                mov RAX, RCX;
                ret;
            }
        }
        else // GDC-style inline asm (GCC basically)
        {
            ulong result;
            asm pure nothrow @nogc
            {
                `movdqu    %1, %%xmm1
                 movq      %3, %%xmm2
                 pcmpestri %5, %%xmm1, %%xmm2`
                : "=c" (result)   // %0: pcmpestri result in RCX, to be stored into `result`
                : "m" (*bytes),   // %1: address of `bytes` string
                  "d" (16),       // %2: length of `bytes` head in XMM1, as pcmpestri input in EDX
                  "r" (constant), // %3: max 8 `chars` to load into GP register, then XMM2
                  "a" (charsLength), // %4: length in XMM2, as pcmpestri input in EAX
                  "i" (flags)     // %5: `flags` immediate
                : "xmm1", "xmm2"; // clobbered registers
            }
            return result;
        }
    }
2251 
2252     /**
2253      * Skips between 0 and 16 bytes that match (or do not match) one of the
2254      * given $(B chars).
2255      */
2256     void skip(bool matching, chars...)(const ubyte* bytes, ulong* pindex, ulong* pcolumn) pure nothrow
2257         @trusted @nogc if (chars.length <= 8)
2258     {
2259         static if (matching)
2260             enum flags = 0b0001_0000;
2261         else
2262             enum flags = 0b0000_0000;
2263 
2264         const r = pcmpestri!(flags, chars)(bytes);
2265         *pindex += r;
2266         *pcolumn += r;
2267     }
2268 
2269     /**
2270      * Returns: the number of bytes starting at the given location that match
2271      *     (or do not match if $(B invert) is true) the byte ranges in $(B chars).
2272      */
2273     ulong rangeMatch(bool invert, chars...)(const ubyte* bytes) pure nothrow @trusted @nogc
2274     {
2275         static assert(chars.length % 2 == 0);
2276         static if (invert)
2277             enum rangeMatchFlags = 0b0000_0100;
2278         else
2279             enum rangeMatchFlags = 0b0001_0100;
2280 
2281         return pcmpestri!(rangeMatchFlags, chars)(bytes);
2282     }
2283 
2284     template ByteCombine(c...)
2285     {
2286         static assert (c.length <= 8);
2287         static if (c.length > 1)
2288             enum ulong ByteCombine = c[0] | (ByteCombine!(c[1..$]) << 8);
2289         else
2290             enum ulong ByteCombine = c[0];
2291     }
2292 }
2293 
// Input that ends right after a doc-comment opener must not make the lexer
// index past the end of the source.
unittest
{
    import core.exception : RangeError;
    import std.exception : assertNotThrown;

    static immutable src1 = "/++";
    static immutable src2 = "/**";

    LexerConfig cf;
    StringCache ca = StringCache(16);

    assertNotThrown!RangeError(getTokensForParser(src1, cf, &ca));
    assertNotThrown!RangeError(getTokensForParser(src2, cf, &ca));
}
2308 
// The string literal below is expected to be rejected: the first token has
// the error type `tok!""` and at least one message is recorded.
unittest
{
    static immutable src = `"\eeee"`;

    LexerConfig cf;
    StringCache ca = StringCache(16);

    auto l = DLexer(src, cf, &ca);
    assert(l.front().type == tok!"");
    assert(!l.messages.empty);
}
2320 
// Named character entities (`\&name;`) inside string literals: well-formed
// ones lex cleanly, malformed ones yield an error token plus exactly one
// message at the expected line and column.
unittest
{
    alias Msg = DLexer.Message;
    LexerConfig cf;
    StringCache ca = StringCache(16);

    // Single well-formed entity.
    {
        auto l = DLexer(`"\&copy;"`, cf, &ca);
        assert(l.front().type == tok!"stringLiteral");
        assert(l.messages == []);
    }
    // Two well-formed entities back to back.
    {
        auto l = DLexer(`"\&trade;\&urcorn;"`, cf, &ca);
        assert(l.front().type == tok!"stringLiteral");
        assert(l.messages == []);
    }
    // Missing terminating ';'.
    {
        auto l = DLexer(`"\&trade"`, cf, &ca);
        assert(l.front().type == tok!"");
        assert(l.messages == [ Msg(1, 9, "Error: invalid named character entity", true) ]);
    }
    // Second entity is missing its ';'.
    {
        auto l = DLexer(`"\&trade;\&urcorn"`, cf, &ca);
        assert(l.front().type == tok!"");
        assert(l.messages == [ Msg(1, 18, "Error: invalid named character entity", true) ]);
    }
    // Empty entity name.
    {
        auto l = DLexer(`"\&"`, cf, &ca);
        assert(l.front().type == tok!"");
        assert(l.messages == [ Msg(1, 4, "Error: invalid named character entity", true) ]);
    }
    // Entity name starting with a digit.
    {
        auto l = DLexer(`"\&0"`, cf, &ca);
        assert(l.front().type == tok!"");
        assert(l.messages == [ Msg(1, 5, "Error: invalid named character entity", true) ]);
    }
    // Input ends in the middle of the entity.
    {
        auto l = DLexer(`"\&copy`, cf, &ca);
        assert(l.front().type == tok!"");
        assert(l.messages == [ Msg(1, 8, "Error: invalid named character entity", true) ]);
    }
    // Entity is complete but the string literal is never closed.
    {
        auto l = DLexer(`"\&copy;`, cf, &ca);
        assert(l.front().type == tok!"");
        assert(l.messages == [ Msg(1, 9, "Error: unterminated string literal", true) ]);
    }
}
2368 
// legacy code using compatibility comment and trailingComment
// Checks, token by token, which doc comments end up in `Token.comment` and
// `Token.trailingComment` for a small module.
unittest
{
    import std.conv : to;
    import std.exception : enforce;

    static immutable src = `/// this is a module.
// mixed
/// it can do stuff
module foo.bar;

// hello

/**
 * some doc
 * hello
 */
int x; /// very nice

// TODO: do stuff
void main() {
    #line 40
    /// could be better
    writeln(":)");
}

/// end of file`;

    LexerConfig cf;
    StringCache ca = StringCache(16);

    const tokens = getTokensForParser(src, cf, &ca);

    // Like assert(a == b), but the failure names the compared property and
    // is reported at the caller's file and line.
    void assertEquals(T)(T a, T b, string what, string file = __FILE__, size_t line = __LINE__)
    {
        enforce(a == b, "Failed " ~ what ~ " '" ~ a.to!string ~ "' == '" ~ b.to!string ~ "'", file, line);
    }

    // Verifies type, comment and trailingComment of the token at `index`.
    void test(size_t index, IdType type, string comment, string trailingComment,
            string file = __FILE__, size_t line = __LINE__)
    {
        assertEquals(tokens[index].type, type, "type", file, line);
        assertEquals(tokens[index].comment, comment, "comment", file, line);
        assertEquals(tokens[index].trailingComment, trailingComment, "trailingComment", file, line);
    }

    // The two `///` lines before `module` are joined; the plain `//` comment
    // between them does not show up in the expected text.
    test(0, tok!"module", "this is a module.\nit can do stuff", "");
    test(1, tok!"identifier", "", "");
    test(2, tok!".", "", "");
    test(3, tok!"identifier", "", "");
    test(4, tok!";", "", "");
    test(5, tok!"int", "some doc\nhello", "");
    test(6, tok!"identifier", "", "");
    // A `///` comment after the statement is expected as its trailing comment.
    test(7, tok!";", "", "very nice");
    test(8, tok!"void", "", "");
    test(9, tok!"identifier", "", "");
    test(10, tok!"(", "", "");
    test(11, tok!")", "", "");
    test(12, tok!"{", "", "");
    test(13, tok!"identifier", "could be better", "");
    test(14, tok!"(", "", "");
    test(15, tok!"stringLiteral", "", "");
    test(16, tok!")", "", "");
    test(17, tok!";", "", "");
    test(18, tok!"}", "", "");
}
2435 
// dlang-community/D-Scanner#805
// Compile-time check: a Token read through a mutable, a const or an immutable
// class reference must be copyable into an immutable variable.
unittest
{
    final class SomeExpr
    {
        Token tok;
    }

    auto e1 = new SomeExpr();
    const e2 = new SomeExpr();
    immutable e3 = new immutable SomeExpr();

    immutable t1 = e1.tok;
    immutable t2 = e2.tok;
    immutable t3 = e3.tok;
}