1 module dparse.lexer;
2 
3 import std.typecons;
4 import std.typetuple;
5 import std.array;
6 import std.algorithm;
7 import std.range;
8 import std.experimental.lexer;
9 import std.traits;
10 import core.cpuid : sse42;
11 
12 public import dparse.trivia;
13 
14 /// Operators
15 private enum operators = [
16     ",", ".", "..", "...", "/", "/=", "!", "!<", "!<=", "!<>", "!<>=", "!=",
17     "!>", "!>=", "$", "%", "%=", "&", "&&", "&=", "(", ")", "*", "*=", "+", "++",
18     "+=", "-", "--", "-=", ":", ";", "<", "<<", "<<=", "<=", "<>", "<>=", "=",
19     "==", "=>", ">", ">=", ">>", ">>=", ">>>", ">>>=", "?", "@", "[", "]", "^",
20     "^=", "^^", "^^=", "{", "|", "|=", "||", "}", "~", "~="
21 ];
22 
/// Keywords
24 private enum keywords = [
25     "abstract", "alias", "align", "asm", "assert", "auto", "bool",
26     "break", "byte", "case", "cast", "catch", "cdouble", "cent", "cfloat",
27     "char", "class", "const", "continue", "creal", "dchar", "debug", "default",
28     "delegate", "delete", "deprecated", "do", "double", "else", "enum",
29     "export", "extern", "false", "final", "finally", "float", "for", "foreach",
30     "foreach_reverse", "function", "goto", "idouble", "if", "ifloat",
31     "immutable", "import", "in", "inout", "int", "interface", "invariant",
32     "ireal", "is", "lazy", "long", "macro", "mixin", "module", "new", "nothrow",
33     "null", "out", "override", "package", "pragma", "private", "protected",
34     "public", "pure", "real", "ref", "return", "scope", "shared", "short",
35     "static", "struct", "super", "switch", "synchronized", "template", "this",
36     "throw", "true", "try", "typedef", "typeid", "typeof", "ubyte", "ucent",
37     "uint", "ulong", "union", "unittest", "ushort", "version", "void",
38     "wchar", "while", "with", "__DATE__", "__EOF__", "__FILE__",
39     "__FILE_FULL_PATH__", "__FUNCTION__", "__gshared", "__LINE__", "__MODULE__",
40     "__parameters", "__PRETTY_FUNCTION__", "__TIME__", "__TIMESTAMP__", "__traits",
41     "__vector", "__VENDOR__", "__VERSION__"
42 ];
43 
44 /// Other tokens
45 private enum dynamicTokens = [
46     "specialTokenSequence", "comment", "identifier", "scriptLine",
47     "whitespace", "doubleLiteral", "floatLiteral", "idoubleLiteral",
48     "ifloatLiteral", "intLiteral", "longLiteral", "realLiteral",
49     "irealLiteral", "uintLiteral", "ulongLiteral", "characterLiteral",
50     "dstringLiteral", "stringLiteral", "wstringLiteral"
51 ];
52 
53 private enum pseudoTokenHandlers = [
54     "\"", "lexStringLiteral",
55     "`", "lexWysiwygString",
56     "//", "lexSlashSlashComment",
57     "/*", "lexSlashStarComment",
58     "/+", "lexSlashPlusComment",
59     ".", "lexDot",
60     "'", "lexCharacterLiteral",
61     "0", "lexNumber",
62     "1", "lexDecimal",
63     "2", "lexDecimal",
64     "3", "lexDecimal",
65     "4", "lexDecimal",
66     "5", "lexDecimal",
67     "6", "lexDecimal",
68     "7", "lexDecimal",
69     "8", "lexDecimal",
70     "9", "lexDecimal",
71     "q\"", "lexDelimitedString",
72     "q{", "lexTokenString",
73     "r\"", "lexWysiwygString",
74     "x\"", "lexHexString",
75     " ", "lexWhitespace",
76     "\t", "lexWhitespace",
77     "\r", "lexWhitespace",
78     "\n", "lexWhitespace",
79     "\v", "lexWhitespace",
80     "\f", "lexWhitespace",
81     "\u2028", "lexLongNewline",
82     "\u2029", "lexLongNewline",
83     "#!", "lexScriptLine",
84     "#line", "lexSpecialTokenSequence"
85 ];
86 
87 /// Token ID type for the D lexer.
88 public alias IdType = TokenIdType!(operators, dynamicTokens, keywords);
89 
90 /**
91  * Function used for converting an IdType to a string.
92  *
93  * Examples:
94  * ---
95  * IdType c = tok!"case";
96  * assert (str(c) == "case");
97  * ---
98  */
99 public alias str = tokenStringRepresentation!(IdType, operators, dynamicTokens, keywords);
100 
101 /**
102  * Template used to refer to D token types.
103  *
104  * See the $(B operators), $(B keywords), and $(B dynamicTokens) enums for
105  * values that can be passed to this template.
106  * Example:
107  * ---
108  * import dparse.lexer;
109  * IdType t = tok!"floatLiteral";
110  * ---
111  */
112 public template tok(string token)
113 {
114     alias tok = TokenId!(IdType, operators, dynamicTokens, keywords, token);
115 }
116 
117 mixin template TokenTriviaFields()
118 {
119     /**
120      * Whitespace and comment tokens attached to this token.
121      *
     * All trivia tokens have their text property set to the source text they
     * represent. This means you can map all trivia tokens to their .text
     * property and join them together to get the source code back without
     * any loss of information.
126      *
127      * Trivia is only included when calling getTokensForParser. When iterating
128      * over DLexer all tokens will be in their raw form and none will be
129      * converted to trivia.
130      *
131      * Note: in the future you might need to explicitly pass
132      * WhitespaceBehavior.include (or keep the default) as getTokensForParser
133      * currently overrides it to include.
134      *
135      * Contains: `comment`, `whitespace`, `specialTokenSequence`
136      */
137     immutable(typeof(this))[] leadingTrivia;
138     /// ditto
139     immutable(typeof(this))[] trailingTrivia;
140 
141     string memoizedLeadingComment = null;
142     string memoizedTrailingComment = null;
143 
    /// Legacy property returning the documentation comment attached to this
    /// token, with the comment borders stripped off.
146     string comment() const pure nothrow @safe @property {
147         import dparse.trivia : extractLeadingDdoc;
148         if (memoizedLeadingComment !is null)
149             return memoizedLeadingComment;
150         return (cast()memoizedLeadingComment) = this.extractLeadingDdoc;
151     }
152 
153     /// ditto
154     string trailingComment() const pure nothrow @safe @property {
155         import dparse.trivia : extractTrailingDdoc;
156         if (memoizedTrailingComment !is null)
157             return memoizedTrailingComment;
158         return (cast()memoizedTrailingComment) = this.extractTrailingDdoc;
159     }
160 
161     int opCmp(size_t i) const pure nothrow @safe @nogc {
162         if (index < i) return -1;
163         if (index > i) return 1;
164         return 0;
165     }
166 
167     int opCmp(ref const typeof(this) other) const pure nothrow @safe @nogc {
168         return opCmp(other.index);
169     }
170 }
171 
// Mixed in from dparse.lexer to keep error messages a manageable size, as the
// entire string is dumped when there is a type mismatch.
174 private enum extraFields = "import dparse.lexer:TokenTriviaFields; mixin TokenTriviaFields;";
175 
176 /// The token type in the D lexer
177 public alias Token = std.experimental.lexer.TokenStructure!(IdType, extraFields);
178 
179 /**
180  * Configure whitespace handling
181  */
182 public enum WhitespaceBehavior : ubyte
183 {
184     include = 0b0000_0000,
185     skip = 0b0000_0001,
186 }
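// A minimal usage sketch (not part of the original tests): with
// WhitespaceBehavior.skip the lexer never yields whitespace tokens.
// LexerConfig, DLexer, and StringCache are defined further down in this module.
unittest
{
    LexerConfig config;
    config.whitespaceBehavior = WhitespaceBehavior.skip;
    auto cache = StringCache(StringCache.defaultBucketCount);
    foreach (token; DLexer("int  x;", config, &cache))
        assert(token.type != tok!"whitespace");
}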
187 
188 private enum stringBehaviorNotWorking = "Automatic string parsing is not "
189     ~ "supported and was previously not working. To unescape strings use the "
190     ~ "`dparse.strings : unescapeString` function on the token texts instead.";
191 
192 /**
193  * Configure string lexing behavior
194  */
// This was an enum, but is now a struct so that individual members can be
// deprecated while still supporting old compilers.
196 public struct StringBehavior
197 {
198     /// Do not include quote characters, process escape sequences
199     deprecated(stringBehaviorNotWorking) static immutable StringBehavior compiler = StringBehavior(0b0000_0000);
200     /// Opening quotes, closing quotes, and string suffixes are included in
201     /// the string token
202     deprecated(stringBehaviorNotWorking) static immutable StringBehavior includeQuoteChars = StringBehavior(0b0000_0001);
203     /// String escape sequences are not replaced
204     deprecated(stringBehaviorNotWorking) static immutable StringBehavior notEscaped = StringBehavior(0b0000_0010);
205     /// Not modified at all. Useful for formatters or highlighters
206     static immutable StringBehavior source = StringBehavior(0b0000_0011);
207 
208     ubyte behavior;
209     alias behavior this;
210 }
211 
212 public enum CommentBehavior : bool
213 {
214     intern = true,
215     noIntern = false
216 }
217 /**
218  * Lexer configuration struct
219  */
220 public struct LexerConfig
221 {
222     string fileName;
223     StringBehavior stringBehavior;
224     WhitespaceBehavior whitespaceBehavior;
225     CommentBehavior commentBehavior = CommentBehavior.intern;
226 }
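// A minimal sketch (not part of the original tests): the default configuration
// includes whitespace tokens and interns comment text.
unittest
{
    LexerConfig config;
    assert(config.whitespaceBehavior == WhitespaceBehavior.include);
    assert(config.commentBehavior == CommentBehavior.intern);
    config.fileName = "example.d";
    config.stringBehavior = StringBehavior.source;
}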
227 
228 /**
229  * Basic type token types.
230  */
231 public alias BasicTypes = AliasSeq!(tok!"int", tok!"bool", tok!"byte",
232         tok!"cdouble", tok!"cent", tok!"cfloat", tok!"char", tok!"creal",
233         tok!"dchar", tok!"double", tok!"float", tok!"idouble",
234         tok!"ifloat", tok!"ireal", tok!"long", tok!"real", tok!"short",
235         tok!"ubyte", tok!"ucent", tok!"uint", tok!"ulong", tok!"ushort",
236         tok!"void", tok!"wchar");
237 
238 /**
239  * Returns: true if the given ID is for a basic type.
240  */
241 public bool isBasicType(IdType type) nothrow pure @safe @nogc
242 {
243     switch (type)
244     {
245     foreach (T; BasicTypes)
246     {
247     case T:
248         return true;
249     }
250     default:
251         return false;
252     }
253 }
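// A small sketch (not part of the original tests) of isBasicType.
unittest
{
    assert(isBasicType(tok!"int"));
    assert(!isBasicType(tok!"struct"));
}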
254 
255 /**
256  * Number literal token types.
257  */
258 public alias NumberLiterals = AliasSeq!(tok!"doubleLiteral",
259         tok!"floatLiteral", tok!"idoubleLiteral", tok!"ifloatLiteral",
260         tok!"intLiteral", tok!"longLiteral", tok!"realLiteral",
261         tok!"irealLiteral", tok!"uintLiteral", tok!"ulongLiteral");
262 
263 /**
264  * Returns: true if the given ID type is for a number literal.
265  */
266 public bool isNumberLiteral(IdType type) nothrow pure @safe @nogc
267 {
268     switch (type)
269     {
270     foreach (T; NumberLiterals)
271     {
272     case T:
273         return true;
274     }
275     default:
276         return false;
277     }
278 }
279 
280 /**
 * Integer literal token types.
282  */
283 public alias IntegerLiterals = AliasSeq!(tok!"intLiteral", tok!"longLiteral",
284         tok!"uintLiteral", tok!"ulongLiteral");
285 
286 /**
 * Returns: true if the given ID type is for an integer literal.
288  */
289 public bool isIntegerLiteral(IdType type) nothrow pure @safe @nogc
290 {
291     switch (type)
292     {
293     foreach (T; IntegerLiterals)
294     {
295     case T:
296         return true;
297     }
298     default:
299         return false;
300     }
301 }
302 
303 /**
304  * Operator token types.
305  */
306 public alias Operators = AliasSeq!(tok!",", tok!".", tok!"..", tok!"...",
307         tok!"/", tok!"/=", tok!"!", tok!"!<", tok!"!<=", tok!"!<>",
308         tok!"!<>=", tok!"!=", tok!"!>", tok!"!>=", tok!"$", tok!"%",
309         tok!"%=", tok!"&", tok!"&&", tok!"&=", tok!"(", tok!")",
310         tok!"*", tok!"*=", tok!"+", tok!"++", tok!"+=", tok!"-",
311         tok!"--", tok!"-=", tok!":", tok!";", tok!"<", tok!"<<",
312         tok!"<<=", tok!"<=", tok!"<>", tok!"<>=", tok!"=", tok!"==",
313         tok!"=>", tok!">", tok!">=", tok!">>", tok!">>=", tok!">>>",
314         tok!">>>=", tok!"?", tok!"@", tok!"[", tok!"]", tok!"^",
315         tok!"^=", tok!"^^", tok!"^^=", tok!"{", tok!"|", tok!"|=",
316         tok!"||", tok!"}", tok!"~", tok!"~=");
317 
318 /**
319  * Returns: true if the given ID type is for an operator.
320  */
321 public bool isOperator(IdType type) nothrow pure @safe @nogc
322 {
323     switch (type)
324     {
325     foreach (T; Operators)
326     {
327     case T:
328         return true;
329     }
330     default:
331         return false;
332     }
333 }
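// A small sketch (not part of the original tests) of isOperator.
unittest
{
    assert(isOperator(tok!"+="));
    assert(!isOperator(tok!"identifier"));
}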
334 
335 /**
336  * Keyword token types.
337  */
338 public alias Keywords = AliasSeq!(tok!"abstract", tok!"alias", tok!"align",
339         tok!"asm", tok!"assert", tok!"auto", tok!"break",
340         tok!"case", tok!"cast", tok!"catch", tok!"class", tok!"const",
341         tok!"continue", tok!"debug", tok!"default", tok!"delegate",
342         tok!"delete", tok!"deprecated", tok!"do", tok!"else", tok!"enum",
343         tok!"export", tok!"extern", tok!"false", tok!"final", tok!"finally",
344         tok!"for", tok!"foreach", tok!"foreach_reverse", tok!"function",
345         tok!"goto", tok!"if", tok!"immutable", tok!"import", tok!"in",
346         tok!"inout", tok!"interface", tok!"invariant", tok!"is",
347         tok!"lazy", tok!"macro", tok!"mixin", tok!"module", tok!"new",
348         tok!"nothrow", tok!"null", tok!"out", tok!"override", tok!"package",
349         tok!"pragma", tok!"private", tok!"protected", tok!"public",
350         tok!"pure", tok!"ref", tok!"return", tok!"scope", tok!"shared",
351         tok!"static", tok!"struct", tok!"super", tok!"switch", tok!"synchronized",
352         tok!"template", tok!"this", tok!"throw", tok!"true", tok!"try",
353         tok!"typedef", tok!"typeid", tok!"typeof", tok!"union", tok!"unittest",
354         tok!"version", tok!"while", tok!"with", tok!"__DATE__",
355         tok!"__EOF__", tok!"__FILE__", tok!"__FILE_FULL_PATH__", tok!"__FUNCTION__",
356         tok!"__gshared", tok!"__LINE__", tok!"__MODULE__", tok!"__parameters",
357         tok!"__PRETTY_FUNCTION__", tok!"__TIME__", tok!"__TIMESTAMP__",
358         tok!"__traits", tok!"__vector", tok!"__VENDOR__", tok!"__VERSION__");
359 
360 /**
361  * Returns: true if the given ID type is for a keyword.
362  */
363 public bool isKeyword(IdType type) pure nothrow @safe @nogc
364 {
365     switch (type)
366     {
367     foreach (T; Keywords)
368     {
369     case T:
370         return true;
371     }
372     default:
373         return false;
374     }
375 }
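// A small sketch (not part of the original tests) of isKeyword.
unittest
{
    assert(isKeyword(tok!"while"));
    assert(!isKeyword(tok!"identifier"));
}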
376 
377 /**
 * String literal token types.
379  */
380 public alias StringLiterals = AliasSeq!(tok!"dstringLiteral",
381         tok!"stringLiteral", tok!"wstringLiteral");
382 
383 /**
384  * Returns: true if the given ID type is for a string literal.
385  */
386 public bool isStringLiteral(IdType type) pure nothrow @safe @nogc
387 {
388     switch (type)
389     {
390     foreach (T; StringLiterals)
391     {
392     case T:
393         return true;
394     }
395     default:
396         return false;
397     }
398 }
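// A small sketch (not part of the original tests) of isStringLiteral.
unittest
{
    assert(isStringLiteral(tok!"wstringLiteral"));
    assert(!isStringLiteral(tok!"characterLiteral"));
}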
399 
400 /**
401  * Protection token types.
402  */
403 public alias Protections = AliasSeq!(tok!"export", tok!"package",
404         tok!"private", tok!"public", tok!"protected");
405 
406 /**
407  * Returns: true if the given ID type is for a protection attribute.
408  */
409 public bool isProtection(IdType type) pure nothrow @safe @nogc
410 {
411     switch (type)
412     {
413     foreach (T; Protections)
414     {
415     case T:
416         return true;
417     }
418     default:
419         return false;
420     }
421 }
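// A small sketch (not part of the original tests) of isProtection.
unittest
{
    assert(isProtection(tok!"private"));
    assert(!isProtection(tok!"static"));
}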
422 
423 public alias SpecialTokens = AliasSeq!(tok!"__DATE__", tok!"__TIME__",
424     tok!"__TIMESTAMP__", tok!"__VENDOR__", tok!"__VERSION__", tok!"__FILE__",
425     tok!"__FILE_FULL_PATH__", tok!"__LINE__", tok!"__MODULE__",
426     tok!"__FUNCTION__", tok!"__PRETTY_FUNCTION__");
427 
428 public bool isSpecialToken(IdType type) pure nothrow @safe @nogc
429 {
430     switch (type)
431     {
432     foreach (T; SpecialTokens)
433     {
434     case T:
435         return true;
436     }
437     default:
438         return false;
439     }
440 }
441 
442 public alias Literals = AliasSeq!(StringLiterals, NumberLiterals, tok!"characterLiteral",
443         SpecialTokens, tok!"true", tok!"false", tok!"null", tok!"$");
444 
445 public bool isLiteral(IdType type) pure nothrow @safe @nogc
446 {
447     switch (type)
448     {
449     foreach (T; Literals)
450     {
451     case T:
452         return true;
453     }
454     default:
455         return false;
456     }
457 }
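// A small sketch (not part of the original tests) of isLiteral.
unittest
{
    assert(isLiteral(tok!"stringLiteral"));
    assert(isLiteral(tok!"null"));
    assert(!isLiteral(tok!"identifier"));
}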
458 
459 /**
 * Returns: an array of tokens lexed from the given source code. All
 * whitespace, comment, and specialTokenSequence tokens (trivia) are attached
 * to the non-trivia token nearest to them instead of being returned as
 * separate tokens.
 *
 * Trivia that appears on the same line as the preceding token is attached to
 * that token as `trailingTrivia`; otherwise it becomes the `leadingTrivia` of
 * the next token. Any trivia remaining at the end of the input is attached to
 * the last token as `trailingTrivia`.
468  */
469 const(Token)[] getTokensForParser(R)(R sourceCode, LexerConfig config, StringCache* cache)
470 if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
471 {
472     config.whitespaceBehavior = WhitespaceBehavior.include;
473     config.commentBehavior = CommentBehavior.noIntern;
474 
475     auto leadingTriviaAppender = appender!(Token[])();
476     leadingTriviaAppender.reserve(128);
477     auto trailingTriviaAppender = appender!(Token[])();
478     trailingTriviaAppender.reserve(128);
479 
480     auto output = appender!(typeof(return))();
481     auto lexer = DLexer(sourceCode, config, cache);
482     loop: while (!lexer.empty) switch (lexer.front.type)
483     {
484     case tok!"specialTokenSequence":
485     case tok!"whitespace":
486     case tok!"comment":
487         if (!output.data.empty && lexer.front.line == output.data[$ - 1].line)
488             trailingTriviaAppender.put(lexer.front);
489         else
490             leadingTriviaAppender.put(lexer.front);
491         lexer.popFront();
492         break;
493     case tok!"__EOF__":
494         break loop;
495     default:
496         Token t = lexer.front;
497         lexer.popFront();
498 
499         if (!output.data.empty && !trailingTriviaAppender.data.empty)
500             (cast() output.data[$ - 1].trailingTrivia) = trailingTriviaAppender.data.idup;
501         t.leadingTrivia = leadingTriviaAppender.data.idup;
502         leadingTriviaAppender.clear();
503         trailingTriviaAppender.clear();
504 
505         output.put(t);
506         break;
507     }
508 
509     if (!output.data.empty)
510     {
511         trailingTriviaAppender.put(leadingTriviaAppender.data);
512         (cast() output.data[$ - 1].trailingTrivia) = trailingTriviaAppender.data.idup;
513     }
514 
515     return output.data;
516 }
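// A minimal sketch (not part of the original tests): whitespace and comments
// become trivia attached to the nearest token, and joining the trivia and
// token texts reproduces the original source. StringCache is defined further
// down in this module.
unittest
{
    auto cache = StringCache(StringCache.defaultBucketCount);
    LexerConfig config;
    string source = "int x; // done\n";
    auto tokens = getTokensForParser(source, config, &cache);
    assert(tokens.length == 3); // `int`, `x`, `;`

    string reconstructed;
    foreach (t; tokens)
    {
        foreach (tr; t.leadingTrivia)
            reconstructed ~= tr.text;
        reconstructed ~= t.text is null ? str(t.type) : t.text;
        foreach (tr; t.trailingTrivia)
            reconstructed ~= tr.text;
    }
    assert(reconstructed == source);
}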
517 
518 /**
519  * The D lexer struct.
520  */
521 public struct DLexer
522 {
523     mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens,
524         keywords, pseudoTokenHandlers);
525 
526     ///
527     @disable this();
528 
529     /**
530      * Params:
531      *     range = the bytes that compose the source code that will be lexed.
532      *     config = the lexer configuration to use.
533      *     cache = the string interning cache for de-duplicating identifiers and
534      *         other token text.
     *     haveSSE42 = whether to use the Streaming SIMD Extensions 4.2
     *         (inline assembly) fast paths while lexing
536      */
537     this(R)(R range, const LexerConfig config, StringCache* cache,
538         bool haveSSE42 = sse42()) pure nothrow @safe
539     if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
540     {
541         this.haveSSE42 = haveSSE42;
542         auto r = (range.length >= 3 && range[0] == 0xef && range[1] == 0xbb && range[2] == 0xbf)
543             ? range[3 .. $] : range;
544         static if (is(ElementEncodingType!R == immutable))
545             this.range = LexerRange(cast(const(ubyte)[]) r);
546         else
547             this.range = LexerRange(cast(const(ubyte)[]) r.idup);
548         this.config = config;
549         this.cache = cache;
550         popFront();
551     }
552 
553     ///
554     public void popFront()() pure nothrow @safe
555     {
556         do
557             _popFront();
558         while (config.whitespaceBehavior == WhitespaceBehavior.skip
559             && _front.type == tok!"whitespace");
560     }
561 
562     /**
563      * Lexer error/warning message.
564      */
565     static struct Message
566     {
567         /// 1-based line number
568         size_t line;
        /// 1-based column number (byte offset within the line)
570         size_t column;
571         /// Text of the message
572         string message;
573         /// `true` for an error, `false` for a warning
574         bool isError;
575     }
576 
577     /**
578      * Returns: An array of all of the warnings and errors generated so far
579      *     during lexing. It may make sense to only check this when `empty`
580      *     returns `true`.
581      */
582     const(Message[]) messages() const @property
583     {
584         return _messages;
585     }
586 
587 private pure nothrow @safe:
588 
589     bool isWhitespace()
590     {
591         switch (range.bytes[range.index])
592         {
593         case ' ':
594         case '\r':
595         case '\n':
596         case '\t':
597         case '\v':
598         case '\f':
599             return true;
600         case 0xe2:
601             auto peek = range.peek(2);
602             return peek.length == 2
603                 && peek[0] == 0x80
604                 && (peek[1] == 0xa8 || peek[1] == 0xa9);
605         default:
606             return false;
607         }
608     }
609 
610     void popFrontWhitespaceAware()
611     {
612         switch (range.bytes[range.index])
613         {
614         case '\r':
615             range.popFront();
616             if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n')
617             {
618                 range.popFront();
619                 range.incrementLine();
620             }
621             else
622                 range.incrementLine();
623             return;
624         case '\n':
625             range.popFront();
626             range.incrementLine();
627             return;
628         case 0xe2:
629             auto lookahead = range.peek(3);
630             if (lookahead.length == 3 && lookahead[1] == 0x80
631                 && (lookahead[2] == 0xa8 || lookahead[2] == 0xa9))
632             {
633                 range.index+=3;
634                 range.column+=3;
635                 range.incrementLine();
636                 return;
637             }
638             else
639             {
640                 range.popFront();
641                 return;
642             }
643         default:
644             range.popFront();
645             return;
646         }
647     }
648 
649     void lexWhitespace(ref Token token) @trusted
650     {
651         mixin (tokenStart);
652         loop: do
653         {
654             version (X86_64)
655             {
656                 if (haveSSE42 && range.index + 16 < range.bytes.length)
657                 {
658                     skip!(true, '\t', ' ', '\v', '\f')(range.bytes.ptr + range.index,
659                         &range.index, &range.column);
660                 }
661             }
662             switch (range.bytes[range.index])
663             {
664             case '\r':
665                 range.popFront();
666                 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n')
667                 {
668                     range.popFront();
669                 }
670                 range.column = 1;
671                 range.line += 1;
672                 break;
673             case '\n':
674                 range.popFront();
675                 range.column = 1;
676                 range.line += 1;
677                 break;
678             case ' ':
679             case '\t':
680             case '\v':
681             case '\f':
682                 range.popFront();
683                 break;
684             case 0xe2:
685                 if (range.index + 2 >= range.bytes.length)
686                     break loop;
687                 if (range.bytes[range.index + 1] != 0x80)
688                     break loop;
689                 if (range.bytes[range.index + 2] == 0xa8 || range.bytes[range.index + 2] == 0xa9)
690                 {
691                     range.index += 3;
692                     range.column += 3;
693                     range.column = 1;
694                     range.line += 1;
695                     break;
696                 }
697                 break loop;
698             default:
699                 break loop;
700             }
701         } while (!(range.index >= range.bytes.length));
702         string text = config.whitespaceBehavior == WhitespaceBehavior.include
703             ? cache.intern(range.slice(mark)) : "";
704         token = Token(tok!"whitespace", text, line, column, index);
705     }
706 
707     void lexNumber(ref Token token)
708     {
709         mixin (tokenStart);
710         if (range.bytes[range.index] == '0' && range.index + 1 < range.bytes.length)
711         {
712             immutable ahead = range.bytes[range.index + 1];
713             switch (ahead)
714             {
715             case 'x':
716             case 'X':
717                 range.index += 2;
718                 range.column += 2;
719                 lexHex(token, mark, line, column, index);
720                 return;
721             case 'b':
722             case 'B':
723                 range.index += 2;
724                 range.column += 2;
725                 lexBinary(token, mark, line, column, index);
726                 return;
727             default:
728                 lexDecimal(token, mark, line, column, index);
729                 return;
730             }
731         }
732         else
733             lexDecimal(token, mark, line, column, index);
734     }
735 
736     void lexHex(ref Token token)
737     {
738         mixin (tokenStart);
739         lexHex(token, mark, line, column, index);
740     }
741 
742     void lexHex(ref Token token, size_t mark, size_t line, size_t column,
743         size_t index) @trusted
744     {
745         IdType type = tok!"intLiteral";
746         bool foundDot;
747         hexLoop: while (!(range.index >= range.bytes.length))
748         {
749             switch (range.bytes[range.index])
750             {
751             case 'a': .. case 'f':
752             case 'A': .. case 'F':
753             case '0': .. case '9':
754             case '_':
755                 version (X86_64)
756                 {
757                     if (haveSSE42 && range.index + 16 < range.bytes.length)
758                     {
759                         immutable ulong i = rangeMatch!(false, '0', '9', 'a', 'f', 'A', 'F', '_', '_')
760                             (range.bytes.ptr + range.index);
761                         range.column += i;
762                         range.index += i;
763                     }
764                     else
765                         range.popFront();
766                 }
767                 else
768                     range.popFront();
769                 break;
770             case 'u':
771             case 'U':
772                 lexIntSuffix(type);
773                 break hexLoop;
774             case 'i':
775                 if (foundDot)
776                     lexFloatSuffix(type);
777                 break hexLoop;
778             case 'L':
779                 if (foundDot)
780                     lexFloatSuffix(type);
781                 else
782                     lexIntSuffix(type);
783                 break hexLoop;
784             case 'p':
785             case 'P':
786                 lexExponent(type);
787                 break hexLoop;
788             case '.':
789                 if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.')
790                     break hexLoop;
791                 else
792                 {
793                     // The following bit of silliness tries to tell the
794                     // difference between "int dot identifier" and
795                     // "double identifier".
796                     if (range.index + 1 < range.bytes.length)
797                     {
798                         switch (range.peekAt(1))
799                         {
800                         case '0': .. case '9':
801                         case 'A': .. case 'F':
802                         case 'a': .. case 'f':
803                             goto doubleLiteral;
804                         default:
805                             break hexLoop;
806                         }
807                     }
808                     else
809                     {
810                     doubleLiteral:
811                         range.popFront();
812                         foundDot = true;
813                         type = tok!"doubleLiteral";
814                     }
815                 }
816                 break;
817             default:
818                 break hexLoop;
819             }
820         }
821         token = Token(type, cache.intern(range.slice(mark)), line, column,
822             index);
823     }
824 
825     void lexBinary(ref Token token)
826     {
827         mixin (tokenStart);
828         return lexBinary(token, mark, line, column, index);
829     }
830 
831     void lexBinary(ref Token token, size_t mark, size_t line, size_t column,
832         size_t index) @trusted
833     {
834         IdType type = tok!"intLiteral";
835         binaryLoop: while (!(range.index >= range.bytes.length))
836         {
837             switch (range.bytes[range.index])
838             {
839             case '0':
840             case '1':
841             case '_':
842                 version (X86_64)
843                 {
844                     if (haveSSE42 && range.index + 16 < range.bytes.length)
845                     {
846                         immutable ulong i = rangeMatch!(false, '0', '1', '_', '_')(
847                             range.bytes.ptr + range.index);
848                         range.column += i;
849                         range.index += i;
850                     }
851                     else
852                         range.popFront();
853                 }
854                 else
855                     range.popFront();
856                 break;
857             case 'u':
858             case 'U':
859             case 'L':
860                 lexIntSuffix(type);
861                 break binaryLoop;
862             default:
863                 break binaryLoop;
864             }
865         }
866         token = Token(type, cache.intern(range.slice(mark)), line, column,
867             index);
868     }
869 
870     void lexDecimal(ref Token token)
871     {
872         mixin (tokenStart);
873         lexDecimal(token, mark, line, column, index);
874     }
875 
876     void lexDecimal(ref Token token, size_t mark, size_t line, size_t column,
877         size_t index) @trusted
878     {
879         bool foundDot = range.bytes[range.index] == '.';
880         IdType type = tok!"intLiteral";
881         if (foundDot)
882         {
883             range.popFront();
884             type = tok!"doubleLiteral";
885         }
886 
887         decimalLoop: while (!(range.index >= range.bytes.length))
888         {
889             switch (range.bytes[range.index])
890             {
891             case '0': .. case '9':
892             case '_':
893                 version (X86_64)
894                 {
895                     if (haveSSE42 && range.index + 16 < range.bytes.length)
896                     {
897                         immutable ulong i = rangeMatch!(false, '0', '9', '_', '_')(range.bytes.ptr + range.index);
898                         range.column += i;
899                         range.index += i;
900                     }
901                     else
902                         range.popFront();
903                 }
904                 else
905                     range.popFront();
906                 break;
907             case 'u':
908             case 'U':
909                 if (!foundDot)
910                     lexIntSuffix(type);
911                 break decimalLoop;
912             case 'i':
913                 lexFloatSuffix(type);
914                 break decimalLoop;
915             case 'L':
916                 if (foundDot)
917                     lexFloatSuffix(type);
918                 else
919                     lexIntSuffix(type);
920                 break decimalLoop;
921             case 'f':
922             case 'F':
923                 lexFloatSuffix(type);
924                 break decimalLoop;
925             case 'e':
926             case 'E':
927                 lexExponent(type);
928                 break decimalLoop;
929             case '.':
930                 if (foundDot || !(range.index + 1 < range.bytes.length) || range.peekAt(1) == '.')
931                     break decimalLoop;
932                 else
933                 {
934                     // The following bit of silliness tries to tell the
935                     // difference between "int dot identifier" and
936                     // "double identifier".
937                     if (range.index + 1 < range.bytes.length)
938                     {
939                         immutable ch = range.peekAt(1);
940                         if (ch <= 0x2f
941                             || (ch >= '0' && ch <= '9')
942                             || (ch >= ':' && ch <= '@')
943                             || (ch >= '[' && ch <= '^')
944                             || (ch >= '{' && ch <= '~')
945                             || ch == '`' || ch == '_')
946                         {
947                             goto doubleLiteral;
948                         }
949                         else
950                             break decimalLoop;
951                     }
952                     else
953                     {
954                     doubleLiteral:
955                         range.popFront();
956                         foundDot = true;
957                         type = tok!"doubleLiteral";
958                     }
959                 }
960                 break;
961             default:
962                 break decimalLoop;
963             }
964         }
965         token = Token(type, cache.intern(range.slice(mark)), line, column,
966             index);
967     }
968 
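    // Consumes integer literal suffixes (`u`/`U` and `L`, in either order,
    // plus the deprecated imaginary `i`) and upgrades `type` accordingly.
    // The labels and gotos let the two suffix letters be recognised in either
    // order without duplicating the handling code.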
969     void lexIntSuffix(ref IdType type) pure nothrow @safe
970     {
971         bool secondPass;
972         if (range.bytes[range.index] == 'u' || range.bytes[range.index] == 'U')
973         {
974     U:
975             if (type == tok!"intLiteral")
976                 type = tok!"uintLiteral";
977             else
978                 type = tok!"ulongLiteral";
979             range.popFront();
980             if (secondPass)
981                 return;
982             if (range.index < range.bytes.length
983                     && (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l'))
984                 goto L;
985             goto I;
986         }
987         if (range.bytes[range.index] == 'L' || range.bytes[range.index] == 'l')
988         {
989     L:
990             if (type == tok!"uintLiteral")
991                 type = tok!"ulongLiteral";
992             else
993                 type = tok!"longLiteral";
994             range.popFront();
995             if (range.index < range.bytes.length
996                     && (range.bytes[range.index] == 'U' || range.bytes[range.index] == 'u'))
997             {
998                 secondPass = true;
999                 goto U;
1000             }
1001             goto I;
1002         }
1003     I:
1004         if (range.index < range.bytes.length && range.bytes[range.index] == 'i')
1005         {
1006             warning("Complex number literals are deprecated");
1007             range.popFront();
1008             if (type == tok!"longLiteral" || type == tok!"ulongLiteral")
1009                 type = tok!"idoubleLiteral";
1010             else
1011                 type = tok!"ifloatLiteral";
1012         }
1013     }
1014 
1015     void lexFloatSuffix(ref IdType type) pure nothrow @safe
1016     {
1017         switch (range.bytes[range.index])
1018         {
1019         case 'L':
1020             range.popFront();
1021             type = tok!"doubleLiteral";
1022             break;
1023         case 'f':
1024         case 'F':
1025             range.popFront();
1026             type = tok!"floatLiteral";
1027             break;
1028         default:
1029             break;
1030         }
1031         if (range.index < range.bytes.length && range.bytes[range.index] == 'i')
1032         {
1033             warning("Complex number literals are deprecated");
1034             range.popFront();
1035             if (type == tok!"floatLiteral")
1036                 type = tok!"ifloatLiteral";
1037             else
1038                 type = tok!"idoubleLiteral";
1039         }
1040     }
1041 
1042     void lexExponent(ref IdType type) pure nothrow @safe
1043     {
1044         range.popFront();
1045         bool foundSign = false;
1046         bool foundDigit = false;
1047         while (range.index < range.bytes.length)
1048         {
1049             switch (range.bytes[range.index])
1050             {
1051             case '-':
1052             case '+':
1053                 if (foundSign)
1054                 {
                    if (!foundDigit)
                        error("Expected an exponent");
                    return;
1058                 }
1059                 foundSign = true;
1060                 range.popFront();
1061                 break;
1062             case '0': .. case '9':
1063             case '_':
1064                 foundDigit = true;
1065                 range.popFront();
1066                 break;
1067             case 'L':
1068             case 'f':
1069             case 'F':
1070             case 'i':
1071                 lexFloatSuffix(type);
1072                 return;
1073             default:
1074                 if (!foundDigit)
1075                     error("Expected an exponent");
1076                 return;
1077             }
1078         }
1079     }
1080 
1081     void lexScriptLine(ref Token token)
1082     {
1083         mixin (tokenStart);
1084         while (!(range.index >= range.bytes.length) && !isNewline)
1085         {
1086             range.popFront();
1087         }
1088         token = Token(tok!"scriptLine", cache.intern(range.slice(mark)),
1089             line, column, index);
1090     }
1091 
1092     void lexSpecialTokenSequence(ref Token token)
1093     {
1094         mixin (tokenStart);
1095         while (!(range.index >= range.bytes.length) && !isNewline)
1096         {
1097             range.popFront();
1098         }
1099         token = Token(tok!"specialTokenSequence", cache.intern(range.slice(mark)),
1100             line, column, index);
1101     }
1102 
1103     void lexSlashStarComment(ref Token token) @trusted
1104     {
1105         mixin (tokenStart);
1106         IdType type = tok!"comment";
1107         range.popFrontN(2);
1108         while (range.index < range.bytes.length)
1109         {
1110             version (X86_64)
1111             {
1112                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1113                     skip!(false, '\r', '\n', '/', '*', 0xe2)(range.bytes.ptr + range.index,
1114                         &range.index, &range.column);
1115             }
1116             if (range.bytes[range.index] == '*')
1117             {
1118                 range.popFront();
1119                 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/')
1120                 {
1121                     range.popFront();
1122                     break;
1123                 }
1124             }
1125             else
1126                 popFrontWhitespaceAware();
1127         }
1128         if (config.commentBehavior == CommentBehavior.intern)
1129             token = Token(type, cache.intern(range.slice(mark)), line, column, index);
1130         else
1131             token = Token(type, cast(string) range.slice(mark), line, column, index);
1132     }
1133 
1134     void lexSlashSlashComment(ref Token token) @trusted
1135     {
1136         mixin (tokenStart);
1137         IdType type = tok!"comment";
1138         range.popFrontN(2);
1139         while (range.index < range.bytes.length)
1140         {
1141             version (X86_64)
1142             {
1143                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1144                 {
1145                     skip!(false, '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
1146                         &range.index, &range.column);
1147                 }
1148             }
1149             if (range.bytes[range.index] == '\r' || range.bytes[range.index] == '\n')
1150                 break;
1151             range.popFront();
1152         }
1153         if (config.commentBehavior == CommentBehavior.intern)
1154             token = Token(type, cache.intern(range.slice(mark)), line, column, index);
1155         else
1156             token = Token(type, cast(string) range.slice(mark), line, column, index);
1157     }
1158 
1159     void lexSlashPlusComment(ref Token token) @trusted
1160     {
1161         mixin (tokenStart);
1162         IdType type = tok!"comment";
1163         range.index += 2;
1164         range.column += 2;
1165         int depth = 1;
1166         while (depth > 0 && !(range.index >= range.bytes.length))
1167         {
1168             version (X86_64)
1169             {
1170                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1171                 {
1172                     skip!(false, '+', '/', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
1173                         &range.index, &range.column);
1174                 }
1175             }
1176             if (range.bytes[range.index] == '+')
1177             {
1178                 range.popFront();
1179                 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '/')
1180                 {
1181                     range.popFront();
1182                     depth--;
1183                 }
1184             }
1185             else if (range.bytes[range.index] == '/')
1186             {
1187                 range.popFront();
1188                 if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '+')
1189                 {
1190                     range.popFront();
1191                     depth++;
1192                 }
1193             }
1194             else
1195                 popFrontWhitespaceAware();
1196         }
1197         if (config.commentBehavior == CommentBehavior.intern)
1198             token = Token(type, cache.intern(range.slice(mark)), line, column, index);
1199         else
1200             token = Token(type, cast(string) range.slice(mark), line, column, index);
1201     }
1202 
1203     void lexStringLiteral(ref Token token) @trusted
1204     {
1205         mixin (tokenStart);
1206         range.popFront();
1207         while (true)
1208         {
1209             if (range.index >= range.bytes.length)
1210             {
1211                 error(token, "Error: unterminated string literal");
1212                 return;
1213             }
1214             version (X86_64)
1215             {
1216                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1217                 {
1218                     skip!(false, '"', '\\', '\r', '\n', 0xe2)(range.bytes.ptr + range.index,
1219                         &range.index, &range.column);
1220                 }
1221             }
1222             if (range.bytes[range.index] == '"')
1223             {
1224                 range.popFront();
1225                 break;
1226             }
1227             else if (range.bytes[range.index] == '\\')
1228             {
1229                 if (!lexEscapeSequence())
1230                 {
1231                     token = Token.init;
1232                     return;
1233                 }
1234             }
1235             else
1236                 popFrontWhitespaceAware();
1237         }
1238         IdType type = tok!"stringLiteral";
1239         lexStringSuffix(type);
1240         token = Token(type, cache.intern(range.slice(mark)), line, column,
1241             index);
1242     }
1243 
1244     void lexWysiwygString(ref Token token) @trusted
1245     {
1246         mixin (tokenStart);
1247         IdType type = tok!"stringLiteral";
1248         immutable bool backtick = range.bytes[range.index] == '`';
1249         if (backtick)
1250         {
1251             range.popFront();
1252             while (true)
1253             {
1254                 if (range.index >= range.bytes.length)
1255                 {
1256                     error(token, "Error: unterminated string literal");
1257                     return;
1258                 }
1259                 version (X86_64)
1260                 {
1261                     if (haveSSE42 && range.index + 16 < range.bytes.length)
1262                     {
1263                         skip!(false, '\r', '\n', 0xe2, '`')(range.bytes.ptr + range.index,
1264                             &range.index, &range.column);
1265                     }
1266                 }
1267                 if (range.bytes[range.index] == '`')
1268                 {
1269                     range.popFront();
1270                     break;
1271                 }
1272                 else
1273                     popFrontWhitespaceAware();
1274             }
1275         }
1276         else
1277         {
1278             range.popFront();
1279             if (range.index >= range.bytes.length)
1280             {
1281                 error(token, "Error: unterminated string literal");
1282                 return;
1283             }
1284             range.popFront();
1285             while (true)
1286             {
1287                 if (range.index >= range.bytes.length)
1288                 {
1289                     error(token, "Error: unterminated string literal");
1290                     return;
1291                 }
1292                 else if (range.bytes[range.index] == '"')
1293                 {
1294                     range.popFront();
1295                     break;
1296                 }
1297                 else
1298                     popFrontWhitespaceAware();
1299             }
1300         }
1301         lexStringSuffix(type);
1302         token = Token(type, cache.intern(range.slice(mark)), line, column,
1303             index);
1304     }
1305 
1306     private ubyte lexStringSuffix(ref IdType type) pure nothrow @safe
1307     {
1308         if (range.index >= range.bytes.length)
1309         {
1310             type = tok!"stringLiteral";
1311             return 0;
1312         }
1313         else
1314         {
1315             switch (range.bytes[range.index])
1316             {
1317             case 'w': range.popFront(); type = tok!"wstringLiteral"; return 'w';
1318             case 'd': range.popFront(); type = tok!"dstringLiteral"; return 'd';
1319             case 'c': range.popFront(); type = tok!"stringLiteral"; return 'c';
1320             default: type = tok!"stringLiteral"; return 0;
1321             }
1322         }
1323     }
1324 
1325     void lexDelimitedString(ref Token token)
1326     {
1327         mixin (tokenStart);
1328         range.index += 2;
1329         range.column += 2;
1330         ubyte open;
1331         ubyte close;
1332         switch (range.bytes[range.index])
1333         {
1334         case '<':
1335             open = '<';
1336             close = '>';
1337             range.popFront();
1338             lexNormalDelimitedString(token, mark, line, column, index, open, close);
1339             break;
1340         case '{':
1341             open = '{';
1342             close = '}';
1343             range.popFront();
1344             lexNormalDelimitedString(token, mark, line, column, index, open, close);
1345             break;
1346         case '[':
1347             open = '[';
1348             close = ']';
1349             range.popFront();
1350             lexNormalDelimitedString(token, mark, line, column, index, open, close);
1351             break;
1352         case '(':
1353             open = '(';
1354             close = ')';
1355             range.popFront();
1356             lexNormalDelimitedString(token, mark, line, column, index, open, close);
1357             break;
1358         default:
1359             lexHeredocString(token, mark, line, column, index);
1360             break;
1361         }
1362     }
1363 
1364     void lexNormalDelimitedString(ref Token token, size_t mark, size_t line, size_t column,
1365         size_t index, ubyte open, ubyte close)
1366     {
1367         int depth = 1;
1368         while (!(range.index >= range.bytes.length) && depth > 0)
1369         {
1370             if (range.bytes[range.index] == open)
1371             {
1372                 depth++;
1373                 range.popFront();
1374             }
1375             else if (range.bytes[range.index] == close)
1376             {
1377                 depth--;
1378                 range.popFront();
1379                 if (depth <= 0)
1380                 {
1381                     if (range.bytes[range.index] == '"')
1382                     {
1383                         range.popFront();
1384                     }
1385                     else
1386                     {
1387                         error(token, "Error: `\"` expected to end delimited string literal");
1388                         return;
1389                     }
1390                 }
1391             }
1392             else
1393                 popFrontWhitespaceAware();
1394         }
1395         IdType type = tok!"stringLiteral";
1396         lexStringSuffix(type);
1397         token = Token(type, cache.intern(range.slice(mark)), line, column, index);
1398     }
1399 
1400     void lexHeredocString(ref Token token, size_t mark, size_t line, size_t column, size_t index)
1401     {
1402         Token ident;
1403         lexIdentifier(ident);
1404         if (!(range.index >= range.bytes.length) && isNewline())
1405             popFrontWhitespaceAware();
1406         else
1407             error("Newline expected");
1408         while (!(range.index >= range.bytes.length))
1409         {
1410             if (isNewline())
1411             {
1412                 popFrontWhitespaceAware();
1413                 if (!range.canPeek(ident.text.length))
1414                 {
1415                     error(ident.text ~ " expected");
1416                     break;
1417                 }
1418                 if (range.peek(ident.text.length - 1) == ident.text)
1419                 {
1420                     range.popFrontN(ident.text.length);
1421                     break;
1422                 }
1423             }
1424             else
1425             {
1426                 range.popFront();
1427             }
1428         }
1429         IdType type;
1430         if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '"')
1431         {
1432             type = tok!"stringLiteral";
1433             lexStringSuffix(type);
1434             range.popFront();
1435         }
1436         else
1437             error("`\"` expected");
1438         token = Token(type, cache.intern(range.slice(mark)), line, column, index);
1439     }
1440 
1441     void lexTokenString(ref Token token)
1442     {
1443         mixin (tokenStart);
1444         assert (range.bytes[range.index] == 'q');
1445         range.popFront();
1446         assert (range.bytes[range.index] == '{');
1447         range.popFront();
1448         auto app = appender!string();
1449         app.put("q{");
1450         int depth = 1;
1451 
1452         immutable WhitespaceBehavior oldWhitespace = config.whitespaceBehavior;
1453         immutable StringBehavior oldString = config.stringBehavior;
1454         config.whitespaceBehavior = WhitespaceBehavior.include;
1455         config.stringBehavior = StringBehavior.source;
1456         scope (exit)
1457         {
1458             config.whitespaceBehavior = oldWhitespace;
1459             config.stringBehavior = oldString;
1460         }
1461 
1462         advance(_front);
1463 
1464         if (range.index >= range.bytes.length)
1465         {
1466             error(token, "Error: unterminated token string literal");
1467             return;
1468         }
1469 
1470         while (depth > 0 && !empty)
1471         {
1472             auto t = front();
1473             if (t.text is null)
1474                 app.put(str(t.type));
1475             else
1476                 app.put(t.text);
1477             if (t.type == tok!"}")
1478             {
1479                 depth--;
                if (depth > 0)
                    popFront();
1482             }
1483             else if (t.type == tok!"{")
1484             {
1485                 depth++;
1486                 popFront();
1487             }
1488             else
1489                 popFront();
1490         }
1491         IdType type = tok!"stringLiteral";
1492         auto b = lexStringSuffix(type);
1493         if (b != 0)
1494             app.put(b);
1495         token = Token(type, cache.intern(cast(const(ubyte)[]) app.data), line,
1496             column, index);
1497     }
1498 
1499     void lexHexString(ref Token token)
1500     {
1501         mixin (tokenStart);
1502         range.index += 2;
1503         range.column += 2;
1504 
1505         loop: while (true)
1506         {
1507             if (range.index >= range.bytes.length)
1508             {
1509                 error(token, "Error: unterminated hex string literal");
1510                 return;
1511             }
1512             else if (isWhitespace())
1513                 popFrontWhitespaceAware();
1514             else switch (range.bytes[range.index])
1515             {
1516             case '0': .. case '9':
1517             case 'A': .. case 'F':
1518             case 'a': .. case 'f':
1519                 range.popFront();
1520                 break;
1521             case '"':
1522                 range.popFront();
1523                 break loop;
1524             default:
1525                 error(token, "Error: invalid character in hex string");
1526                 return;
1527             }
1528         }
1529 
1530         IdType type = tok!"stringLiteral";
1531         lexStringSuffix(type);
1532         token = Token(type, cache.intern(range.slice(mark)), line, column,
1533             index);
1534     }
1535 
1536     bool lexNamedEntity()
1537     in { assert (range.bytes[range.index] == '&'); }
1538     do
1539     {
1540         Token t;
1541         range.popFront();
1542         lexIdentifier(t, true);
1543         if (t.type != tok!"identifier" || range.empty || range.bytes[range.index] != ';')
1544         {
1545             error("Error: invalid named character entity");
1546             return false;
1547         }
1548         range.popFront();
1549         return true;
1550     }
1551 
1552     bool lexEscapeSequence()
1553     {
1554         range.popFront();
1555         if (range.index >= range.bytes.length)
1556         {
1557             error("Error: non-terminated character escape sequence.");
1558             return false;
1559         }
1560         switch (range.bytes[range.index])
1561         {
1562         case '&': return lexNamedEntity();
1563         case '\'':
1564         case '"':
1565         case '?':
1566         case '\\':
1567         case 'a':
1568         case 'b':
1569         case 'f':
1570         case 'n':
1571         case 'r':
1572         case 't':
1573         case 'v':
1574             range.popFront();
1575             break;
1576         case 'x':
1577             range.popFront();
1578             foreach (i; 0 .. 2)
1579             {
1580                 if (range.index >= range.bytes.length)
1581                 {
1582                     error("Error: 2 hex digits expected.");
1583                     return false;
1584                 }
1585                 switch (range.bytes[range.index])
1586                 {
1587                 case '0': .. case '9':
1588                 case 'a': .. case 'f':
1589                 case 'A': .. case 'F':
1590                     range.popFront();
1591                     break;
1592                 default:
1593                     error("Error: 2 hex digits expected.");
1594                     return false;
1595                 }
1596             }
1597             break;
1598         case '0':
1599             if (!(range.index + 1 < range.bytes.length)
1600                 || ((range.index + 1 < range.bytes.length) && range.peekAt(1) == '\''))
1601             {
1602                 range.popFront();
1603                 break;
1604             }
1605             goto case;
1606         case '1': .. case '7':
1607             for (size_t i = 0; i < 3 && !(range.index >= range.bytes.length)
1608                     && range.bytes[range.index] >= '0' && range.bytes[range.index] <= '7'; i++)
1609                 range.popFront();
1610             break;
1611         case 'u':
1612             range.popFront();
1613             foreach (i; 0 .. 4)
1614             {
1615                 if (range.index >= range.bytes.length)
1616                 {
1617                     error("Error: at least 4 hex digits expected.");
1618                     return false;
1619                 }
1620                 switch (range.bytes[range.index])
1621                 {
1622                 case '0': .. case '9':
1623                 case 'a': .. case 'f':
1624                 case 'A': .. case 'F':
1625                     range.popFront();
1626                     break;
1627                 default:
1628                     error("Error: at least 4 hex digits expected.");
1629                     return false;
1630                 }
1631             }
1632             break;
1633         case 'U':
1634             range.popFront();
1635             foreach (i; 0 .. 8)
1636             {
1637                 if (range.index >= range.bytes.length)
1638                 {
1639                     error("Error: at least 8 hex digits expected.");
1640                     return false;
1641                 }
1642                 switch (range.bytes[range.index])
1643                 {
1644                 case '0': .. case '9':
1645                 case 'a': .. case 'f':
1646                 case 'A': .. case 'F':
1647                     range.popFront();
1648                     break;
1649                 default:
1650                     error("Error: at least 8 hex digits expected.");
1651                     return false;
1652                 }
1653             }
1654             break;
1655         default:
1656             error("Invalid escape sequence");
1657             while (true)
1658             {
1659                 if (range.index >= range.bytes.length)
1660                 {
1661                     error("Error: non-terminated character escape sequence.");
1662                     break;
1663                 }
1664                 if (range.bytes[range.index] == ';')
1665                 {
1666                     range.popFront();
1667                     break;
1668                 }
1669                 else
1670                 {
1671                     range.popFront();
1672                 }
1673             }
1674             return false;
1675         }
1676         return true;
1677     }
1678 
1679     void lexCharacterLiteral(ref Token token)
1680     {
1681         mixin (tokenStart);
1682         range.popFront();
1683         if (range.empty)
1684             goto err;
1685         if (range.bytes[range.index] == '\\')
1686             lexEscapeSequence();
1687         else if (range.bytes[range.index] == '\'')
1688         {
1689             range.popFront();
1690             token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
1691                 line, column, index);
1692         }
1693         else if (range.bytes[range.index] & 0x80)
1694         {
1695             while (range.bytes[range.index] & 0x80)
1696                 range.popFront();
1697         }
1698         else
1699             popFrontWhitespaceAware();
1700 
1701         if (range.index < range.bytes.length && range.bytes[range.index] == '\'')
1702         {
1703             range.popFront();
1704             token = Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
1705                 line, column, index);
1706         }
1707         else
1708         {
1709     err:
1710             error(token, "Error: Expected `'` to end character literal");
1711         }
1712     }
1713 
1714     void lexIdentifier(ref Token token, const bool silent = false) @trusted
1715     {
1716         mixin (tokenStart);
1717 
1718         if (isSeparating(0))
1719         {
1720             if (silent) return;
1721 
1722             error("Invalid identifier");
1723             range.popFront();
1724         }
1725         while (true)
1726         {
1727             version (X86_64)
1728             {
1729                 if (haveSSE42 && range.index + 16 < range.bytes.length)
1730                 {
1731                     immutable ulong i = rangeMatch!(false, 'a', 'z', 'A', 'Z', '_', '_')
1732                         (range.bytes.ptr + range.index);
1733                     range.column += i;
1734                     range.index += i;
1735                 }
1736             }
1737             if (isSeparating(0))
1738                 break;
1739             else
1740                 range.popFront();
1741         }
1742         token = Token(tok!"identifier", cache.intern(range.slice(mark)), line,
1743             column, index);
1744     }
1745 
1746     void lexDot(ref Token token)
1747     {
1748         mixin (tokenStart);
1749         if (!(range.index + 1 < range.bytes.length))
1750         {
1751             range.popFront();
1752             token = Token(tok!".", null, line, column, index);
1753             return;
1754         }
1755         switch (range.peekAt(1))
1756         {
1757         case '0': .. case '9':
1758             lexNumber(token);
1759             return;
1760         case '.':
1761             range.popFront();
1762             range.popFront();
1763             if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '.')
1764             {
1765                 range.popFront();
1766                 token = Token(tok!"...", null, line, column, index);
1767             }
1768             else
1769                 token = Token(tok!"..", null, line, column, index);
1770             return;
1771         default:
1772             range.popFront();
1773             token = Token(tok!".", null, line, column, index);
1774             return;
1775         }
1776     }
1777 
1778     void lexLongNewline(ref Token token) @nogc
1779     {
1780         mixin (tokenStart);
1781         range.popFront();
1782         range.popFront();
1783         range.popFront();
1784         range.incrementLine();
1785         string text = config.whitespaceBehavior == WhitespaceBehavior.include
1786             ? cache.intern(range.slice(mark)) : "";
1787         token = Token(tok!"whitespace", text, line,
1788             column, index);
1789     }
1790 
1791     bool isNewline() @nogc
1792     {
1793         if (range.bytes[range.index] == '\n') return true;
1794         if (range.bytes[range.index] == '\r') return true;
1795         return (range.bytes[range.index] & 0x80) && (range.index + 2 < range.bytes.length)
1796             && (range.peek(2) == "\u2028" || range.peek(2) == "\u2029");
1797     }
1798 
1799     bool isSeparating(size_t offset) @nogc
1800     {
1801         enum : ubyte
1802         {
1803             n, y, m // no, yes, maybe
1804         }
1805 
1806         if (range.index + offset >= range.bytes.length)
1807             return true;
1808         auto c = range.bytes[range.index + offset];
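        // 256-entry table indexed by byte value (16 entries per row):
        // y = always separating (control characters, whitespace, operators),
        // n = identifier character (digits, letters, '_'),
        // m = maybe: a byte with the high bit set separates only when it
        //     starts the U+2028/U+2029 line separator sequences (checked below).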
1809         static immutable ubyte[256] LOOKUP_TABLE = [
1810             y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
1811             y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
1812             y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
1813             n, n, n, n, n, n, n, n, n, n, y, y, y, y, y, y,
1814             y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n,
1815             n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, n,
1816             y, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n,
1817             n, n, n, n, n, n, n, n, n, n, n, y, y, y, y, y,
1818             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1819             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1820             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1821             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1822             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1823             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1824             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m,
1825             m, m, m, m, m, m, m, m, m, m, m, m, m, m, m, m
1826         ];
1827         immutable ubyte result = LOOKUP_TABLE[c];
1828         if (result == n)
1829             return false;
1830         if (result == y)
1831             return true;
1832         if (result == m)
1833         {
1834             auto r = range;
            r.popFrontN(offset);
1836             return (r.canPeek(2) && (r.peek(2) == "\u2028"
1837                 || r.peek(2) == "\u2029"));
1838         }
1839         assert (false);
1840     }
1841 
1842 
1843 
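    // Mixed into the lex* functions to capture the index, column, line, and
    // byte mark at which the current token starts.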
1844     enum tokenStart = q{
1845         size_t index = range.index;
1846         size_t column = range.column;
1847         size_t line = range.line;
1848         auto mark = range.mark();
1849     };
1850 
1851     void error(ref Token token, string message)
1852     {
1853         token.type = tok!"";
1854         error(message);
1855     }
1856 
1857     void error(string message)
1858     {
1859         _messages ~= Message(range.line, range.column, message, true);
1860     }
1861 
1862     void warning(string message)
1863     {
1864         _messages ~= Message(range.line, range.column, message, false);
1865         assert (_messages.length > 0);
1866     }
1867 
1868     Message[] _messages;
1869     StringCache* cache;
1870     LexerConfig config;
1871     bool haveSSE42;
1872 }
1873 
1874 /**
1875  * Creates a token range from the given source code. Creates a default lexer
1876  * configuration and a GC-managed string cache.
1877  */
1878 public auto byToken(R)(R range)
1879 if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
1880 {
1881     LexerConfig config;
1882     StringCache* cache = new StringCache(range.length.optimalBucketCount);
1883     return DLexer(range, config, cache);
1884 }
1885 
1886 /**
1887  * Creates a token range from the given source code. Uses the given string
1888  * cache.
1889  */
1890 public auto byToken(R)(R range, StringCache* cache)
1891 if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
1892 {
1893     LexerConfig config;
1894     return DLexer(range, config, cache);
1895 }
1896 
1897 /**
1898  * Creates a token range from the given source code. Uses the provided lexer
1899  * configuration and string cache.
1900  */
1901 public auto byToken(R)(R range, const LexerConfig config, StringCache* cache)
1902 if (is(Unqual!(ElementEncodingType!R) : ubyte) && isDynamicArray!R)
1903 {
1904     return DLexer(range, config, cache);
1905 }
1906 
1907 /**
 * Helper function used to avoid too many allocations while lexing.
1909  *
1910  * Params:
1911  *      size = The length in bytes of the source file.
1912  *
1913  * Returns:
1914  *      The optimal initial bucket count a `StringCache` should have.
1915  */
1916 size_t optimalBucketCount(size_t size)
1917 {
1918     import std.math : nextPow2;
1919     return nextPow2((size + 31U) / 32U).min(1U << 30U);
1920 }
1921 ///
1922 unittest
1923 {
1924     assert(optimalBucketCount(1) == 2);
1925     assert(optimalBucketCount(9000 * 32) == 16384);
1926     static if (size_t.sizeof == ulong.sizeof)
1927         assert(optimalBucketCount(100_000_000_000UL) == 1 << 30);
1928 }
1929 
1930 /**
1931  * The string cache is used for string interning.
1932  *
1933  * It will only store a single copy of any string that it is asked to hold.
1934  * Interned strings can be compared for equality by comparing their $(B .ptr)
1935  * field.
1936  *
 * Default and postblit constructors are disabled. When a StringCache goes out
1938  * of scope, the memory held by it is freed.
1939  *
1940  * See_also: $(LINK http://en.wikipedia.org/wiki/String_interning)
1941  */
1942 struct StringCache
1943 {
1944 public pure nothrow @nogc:
1945 
1946     @disable this();
1947     @disable this(this);
1948 
1949     /**
1950      * Params: bucketCount = the initial number of buckets. Must be a
1951      * power of two
1952      */
1953     this(size_t bucketCount) nothrow @trusted @nogc
1954     in
1955     {
1956         import core.bitop : popcnt;
1957         static if (size_t.sizeof == 8)
1958         {
1959             immutable low = popcnt(cast(uint) bucketCount);
1960             immutable high = popcnt(cast(uint) (bucketCount >> 32));
1961             assert ((low == 0 && high == 1) || (low == 1 && high == 0));
1962         }
1963         else
1964         {
1965             static assert (size_t.sizeof == 4);
1966             assert (popcnt(cast(uint) bucketCount) == 1);
1967         }
1968     }
1969     do
1970     {
1971         buckets = (cast(Node**) calloc((Node*).sizeof, bucketCount))[0 .. bucketCount];
1972     }
1973 
1974     ~this()
1975     {
1976         Block* current = rootBlock;
1977         while (current !is null)
1978         {
1979             Block* prev = current;
1980             current = current.next;
1981             free(cast(void*) prev);
1982         }
1983         foreach (nodePointer; buckets)
1984         {
1985             Node* currentNode = nodePointer;
1986             while (currentNode !is null)
1987             {
1988                 if (currentNode.mallocated)
1989                     free(currentNode.str.ptr);
1990                 Node* prev = currentNode;
1991                 currentNode = currentNode.next;
1992                 free(prev);
1993             }
1994         }
1995         rootBlock = null;
1996         free(buckets.ptr);
1997         buckets = null;
1998     }
1999 
2000     /**
2001      * Caches a string.
2002      */
2003     string intern(const(ubyte)[] str) @safe
2004     {
2005         if (str is null || str.length == 0)
2006             return "";
2007         return _intern(str);
2008     }
2009 
2010     /**
2011      * ditto
2012      */
2013     string intern(string str) @trusted
2014     {
2015         return intern(cast(ubyte[]) str);
2016     }
2017 
2018     /**
2019      * The default bucket count for the string cache.
2020      */
2021     static enum defaultBucketCount = 4096;
2022 
2023 private:
2024 
2025     string _intern(const(ubyte)[] bytes) @trusted
2026     {
2027         immutable uint hash = hashBytes(bytes);
2028         immutable size_t index = hash & (buckets.length - 1);
2029         Node* s = find(bytes, hash);
2030         if (s !is null)
2031             return cast(string) s.str;
2032         ubyte[] mem = void;
2033         bool mallocated = bytes.length > BIG_STRING;
2034         if (mallocated)
2035             mem = (cast(ubyte*) malloc(bytes.length))[0 .. bytes.length];
2036         else
2037             mem = allocate(bytes.length);
2038         mem[] = bytes[];
2039         Node* node = cast(Node*) malloc(Node.sizeof);
2040         node.str = mem;
2041         node.hash = hash;
2042         node.next = buckets[index];
2043         node.mallocated = mallocated;
2044         buckets[index] = node;
2045         return cast(string) mem;
2046     }
2047 
2048     Node* find(const(ubyte)[] bytes, uint hash) @trusted
2049     {
2050         import std.algorithm : equal;
2051         immutable size_t index = hash & (buckets.length - 1);
2052         Node* node = buckets[index];
2053         while (node !is null)
2054         {
2055             if (node.hash == hash && bytes == cast(ubyte[]) node.str)
2056                 return node;
2057             node = node.next;
2058         }
2059         return node;
2060     }
2061 
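    // 32-bit hash of the interned string contents; the constants (0x5bd1e995,
    // r = 24) follow the MurmurHash2 scheme.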
2062     static uint hashBytes(const(ubyte)[] data) pure nothrow @trusted @nogc
2063     in
2064     {
2065         assert (data !is null);
2066         assert (data.length > 0);
2067     }
2068     do
2069     {
2070         immutable uint m = 0x5bd1e995;
2071         immutable int r = 24;
2072         uint h = cast(uint) data.length;
2073         while (data.length >= 4)
2074         {
2075             uint k = (cast(ubyte) data[3]) << 24
2076                 | (cast(ubyte) data[2]) << 16
2077                 | (cast(ubyte) data[1]) << 8
2078                 | (cast(ubyte) data[0]);
2079             k *= m;
2080             k ^= k >> r;
2081             k *= m;
2082             h *= m;
2083             h ^= k;
2084             data = data[4 .. $];
2085         }
2086         switch (data.length & 3)
2087         {
2088         case 3:
2089             h ^= data[2] << 16;
2090             goto case;
2091         case 2:
2092             h ^= data[1] << 8;
2093             goto case;
2094         case 1:
2095             h ^= data[0];
2096             h *= m;
2097             break;
2098         default:
2099             break;
2100         }
2101         h ^= h >> 13;
2102         h *= m;
2103         h ^= h >> 15;
2104         return h;
2105     }
2106 
2107     ubyte[] allocate(size_t numBytes) pure nothrow @trusted @nogc
2108     in
2109     {
2110         assert (numBytes != 0);
2111     }
2112     out (result)
2113     {
2114         assert (result.length == numBytes);
2115     }
2116     do
2117     {
2118         Block* r = rootBlock;
2119         size_t i = 0;
        while (i <= 3 && r !is null)
2121         {
2122             immutable size_t available = r.bytes.length;
2123             immutable size_t oldUsed = r.used;
2124             immutable size_t newUsed = oldUsed + numBytes;
2125             if (newUsed <= available)
2126             {
2127                 r.used = newUsed;
2128                 return r.bytes[oldUsed .. newUsed];
2129             }
2130             i++;
2131             r = r.next;
2132         }
2133         Block* b = cast(Block*) calloc(Block.sizeof, 1);
2134         b.used = numBytes;
2135         b.next = rootBlock;
2136         rootBlock = b;
2137         return b.bytes[0 .. numBytes];
2138     }
2139 
2140     static struct Node
2141     {
2142         ubyte[] str = void;
2143         Node* next = void;
2144         uint hash = void;
2145         bool mallocated = void;
2146     }
2147 
2148     static struct Block
2149     {
2150         Block* next;
2151         size_t used;
2152         enum BLOCK_CAPACITY = BLOCK_SIZE - size_t.sizeof - (void*).sizeof;
2153         ubyte[BLOCK_CAPACITY] bytes;
2154     }
2155 
2156     static assert (BLOCK_SIZE == Block.sizeof);
2157 
2158     enum BLOCK_SIZE = 1024 * 16;
2159 
2160     // If a string would take up more than 1/4 of a block, allocate it outside
2161     // of the block.
2162     enum BIG_STRING = BLOCK_SIZE / 4;
2163 
2164     Node*[] buckets;
2165     Block* rootBlock;
2166 }
2167 
2168 private extern(C) void* calloc(size_t, size_t) nothrow pure @nogc @trusted;
2169 private extern(C) void* malloc(size_t) nothrow pure @nogc @trusted;
2170 private extern(C) void free(void*) nothrow pure @nogc @trusted;
2171 
2172 unittest
2173 {
    auto source = cast(ubyte[]) q{ import std.stdio;};
2175     auto tokens = getTokensForParser(source, LexerConfig(),
2176         new StringCache(StringCache.defaultBucketCount));
2177     assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".",
2178         tok!"identifier", tok!";"]));
2179 }
2180 
/// Test \x escape sequences
2182 unittest
2183 {
2184     auto toks = (string s) => byToken(cast(ubyte[])s);
2185 
2186     // valid
2187     immutable hex = ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f','A','B','C','D','E','F'];
2188     auto source = "";
2189     foreach (h1; hex)
2190         foreach (h2; hex)
2191             source ~= "'\\x" ~ h1 ~ h2 ~ "'";
2192     assert (toks(source).filter!(t => t.type != tok!"characterLiteral").empty);
2193 
2194     // invalid
2195     assert (toks(`'\x'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true));
2196     assert (toks(`'\x_'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true));
2197     assert (toks(`'\xA'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true));
2198     assert (toks(`'\xAY'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true));
2199     assert (toks(`'\xXX'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true));
2200 }
2201 
2202 version (X86_64)
2203 {
2204     version (DigitalMars)
2205         private enum useDMDStyle = true;
2206     else version (LDC)
2207         private enum useDMDStyle = (__VERSION__ < 2092); // GDC-style supported since v1.22
2208     else
2209         private enum useDMDStyle = false; // not supported by GDC
2210 
2211     private ulong pcmpestri(ubyte flags, chars...)(const ubyte* bytes) pure nothrow
2212         @trusted @nogc if (chars.length <= 8)
2213     {
2214         enum constant = ByteCombine!chars;
2215         enum charsLength = chars.length;
2216 
2217         static if (useDMDStyle)
2218         {
2219             asm pure nothrow @nogc
2220             {
2221                 naked;
2222             }
2223             version (Windows) // `bytes` in RCX
2224                 asm pure nothrow @nogc { movdqu XMM1, [RCX]; }
2225             else // `bytes` in RDI
2226                 asm pure nothrow @nogc { movdqu XMM1, [RDI]; }
2227             asm pure nothrow @nogc
2228             {
2229                 mov R10, constant;
2230                 movq XMM2, R10;
2231                 mov RAX, charsLength;
2232                 mov RDX, 16;
2233                 pcmpestri XMM2, XMM1, flags;
2234                 mov RAX, RCX;
2235                 ret;
2236             }
2237         }
2238         else // GDC-style inline asm (GCC basically)
2239         {
2240             ulong result;
2241             asm pure nothrow @nogc
2242             {
2243                 `movdqu    %1, %%xmm1
2244                  movq      %3, %%xmm2
2245                  pcmpestri %5, %%xmm1, %%xmm2`
2246                 : "=c" (result)   // %0: pcmpestri result in RCX, to be stored into `result`
2247                 : "m" (*bytes),   // %1: address of `bytes` string
2248                   "d" (16),       // %2: length of `bytes` head in XMM1, as pcmpestri input in EDX
2249                   "r" (constant), // %3: max 8 `chars` to load into GP register, then XMM2
2250                   "a" (charsLength), // %4: length in XMM2, as pcmpestri input in EAX
2251                   "i" (flags)     // %5: `flags` immediate
2252                 : "xmm1", "xmm2"; // clobbered registers
2253             }
2254             return result;
2255         }
2256     }
2257 
2258     /**
2259      * Skips between 0 and 16 bytes that match (or do not match) one of the
2260      * given $(B chars).
2261      */
2262     void skip(bool matching, chars...)(const ubyte* bytes, ulong* pindex, ulong* pcolumn) pure nothrow
2263         @trusted @nogc if (chars.length <= 8)
2264     {
2265         static if (matching)
2266             enum flags = 0b0001_0000;
2267         else
2268             enum flags = 0b0000_0000;
2269 
2270         const r = pcmpestri!(flags, chars)(bytes);
2271         *pindex += r;
2272         *pcolumn += r;
2273     }
2274 
2275     /**
2276      * Returns: the number of bytes starting at the given location that match
2277      *     (or do not match if $(B invert) is true) the byte ranges in $(B chars).
2278      */
2279     ulong rangeMatch(bool invert, chars...)(const ubyte* bytes) pure nothrow @trusted @nogc
2280     {
2281         static assert(chars.length % 2 == 0);
2282         static if (invert)
2283             enum rangeMatchFlags = 0b0000_0100;
2284         else
2285             enum rangeMatchFlags = 0b0001_0100;
2286 
2287         return pcmpestri!(rangeMatchFlags, chars)(bytes);
2288     }
2289 
2290     template ByteCombine(c...)
2291     {
2292         static assert (c.length <= 8);
2293         static if (c.length > 1)
2294             enum ulong ByteCombine = c[0] | (ByteCombine!(c[1..$]) << 8);
2295         else
2296             enum ulong ByteCombine = c[0];
2297     }
2298 }
2299 
2300 unittest
2301 {
2302     import core.exception : RangeError;
2303     import std.exception : assertNotThrown;
2304 
2305     static immutable src1 = "/++";
2306     static immutable src2 = "/**";
2307 
2308     LexerConfig cf;
2309     StringCache ca = StringCache(16);
2310 
2311     assertNotThrown!RangeError(getTokensForParser(src1, cf, &ca));
2312     assertNotThrown!RangeError(getTokensForParser(src2, cf, &ca));
2313 }
2314 
2315 unittest
2316 {
2317     static immutable src = `"\eeee"`;
2318 
2319     LexerConfig cf;
2320     StringCache ca = StringCache(16);
2321 
2322     auto l = DLexer(src, cf, &ca);
2323     assert(l.front().type == tok!"");
2324     assert(!l.messages.empty);
2325 }
2326 
2327 unittest
2328 {
2329     alias Msg = DLexer.Message;
2330     LexerConfig cf;
2331     StringCache ca = StringCache(16);
2332 
2333     {
2334         auto l = DLexer(`"\&copy;"`, cf, &ca);
2335         assert(l.front().type == tok!"stringLiteral");
2336         assert(l.messages == []);
2337     }
2338     {
2339         auto l = DLexer(`"\&trade;\&urcorn;"`, cf, &ca);
2340         assert(l.front().type == tok!"stringLiteral");
2341         assert(l.messages == []);
2342     }
2343     {
2344         auto l = DLexer(`"\&trade"`, cf, &ca);
2345         assert(l.front().type == tok!"");
2346         assert(l.messages == [ Msg(1, 9, "Error: invalid named character entity", true) ]);
2347     }
2348     {
2349         auto l = DLexer(`"\&trade;\&urcorn"`, cf, &ca);
2350         assert(l.front().type == tok!"");
2351         assert(l.messages == [ Msg(1, 18, "Error: invalid named character entity", true) ]);
2352     }
2353     {
2354         auto l = DLexer(`"\&"`, cf, &ca);
2355         assert(l.front().type == tok!"");
2356         assert(l.messages == [ Msg(1, 4, "Error: invalid named character entity", true) ]);
2357     }
2358     {
2359         auto l = DLexer(`"\&0"`, cf, &ca);
2360         assert(l.front().type == tok!"");
2361         assert(l.messages == [ Msg(1, 5, "Error: invalid named character entity", true) ]);
2362     }
2363     {
2364         auto l = DLexer(`"\&copy`, cf, &ca);
2365         assert(l.front().type == tok!"");
2366         assert(l.messages == [ Msg(1, 8, "Error: invalid named character entity", true) ]);
2367     }
2368     {
2369         auto l = DLexer(`"\&copy;`, cf, &ca);
2370         assert(l.front().type == tok!"");
2371         assert(l.messages == [ Msg(1, 9, "Error: unterminated string literal", true) ]);
2372     }
2373 }
2374 
// legacy code path exercising the compatibility comment and trailingComment fields
2376 unittest
2377 {
2378     import std.conv : to;
2379     import std.exception : enforce;
2380 
2381     static immutable src = `/// this is a module.
2382 // mixed
2383 /// it can do stuff
2384 module foo.bar;
2385 
2386 // hello
2387 
2388 /**
2389  * some doc
2390  * hello
2391  */
2392 int x; /// very nice
2393 
2394 // TODO: do stuff
2395 void main() {
2396     #line 40
2397     /// could be better
2398     writeln(":)");
2399 }
2400 
2401 /// end of file`;
2402 
2403     LexerConfig cf;
2404     StringCache ca = StringCache(16);
2405 
2406     const tokens = getTokensForParser(src, cf, &ca);
2407 
2408     void assertEquals(T)(T a, T b, string what, string file = __FILE__, size_t line = __LINE__)
2409     {
2410         enforce(a == b, "Failed " ~ what ~ " '" ~ a.to!string ~ "' == '" ~ b.to!string ~ "'", file, line);
2411     }
2412 
2413     void test(size_t index, IdType type, string comment, string trailingComment,
2414             string file = __FILE__, size_t line = __LINE__)
2415     {
2416         assertEquals(tokens[index].type, type, "type", file, line);
2417         assertEquals(tokens[index].comment, comment, "comment", file, line);
2418         assertEquals(tokens[index].trailingComment, trailingComment, "trailingComment", file, line);
2419     }
2420 
2421     test(0, tok!"module", "this is a module.\nit can do stuff", "");
2422     test(1, tok!"identifier", "", "");
2423     test(2, tok!".", "", "");
2424     test(3, tok!"identifier", "", "");
2425     test(4, tok!";", "", "");
2426     test(5, tok!"int", "some doc\nhello", "");
2427     test(6, tok!"identifier", "", "");
2428     test(7, tok!";", "", "very nice");
2429     test(8, tok!"void", "", "");
2430     test(9, tok!"identifier", "", "");
2431     test(10, tok!"(", "", "");
2432     test(11, tok!")", "", "");
2433     test(12, tok!"{", "", "");
2434     test(13, tok!"identifier", "could be better", "");
2435     test(14, tok!"(", "", "");
2436     test(15, tok!"stringLiteral", "", "");
2437     test(16, tok!")", "", "");
2438     test(17, tok!";", "", "");
2439     test(18, tok!"}", "", "");
2440 }
2441 
2442 // dlang-community/D-Scanner#805
2443 unittest
2444 {
2445     final class SomeExpr
2446     {
2447         Token tok;
2448     }
2449 
2450     auto e1 = new SomeExpr();
2451     const e2 = new SomeExpr();
2452     immutable e3 = new immutable SomeExpr();
2453 
2454     immutable t1 = e1.tok;
2455     immutable t2 = e2.tok;
2456     immutable t3 = e3.tok;
2457 }
2458 
2459 /// empty '' is invalid syntax, but should still get parsed properly, with an
2460 /// error token and proper location info
2461 unittest
2462 {
2463     import std.conv : to;
2464     import std.exception : enforce;
2465 
2466     static immutable src = `module foo.bar;
2467 
2468 void main() {
2469     x = '';
2470 }
2471 `;
2472 
2473     LexerConfig cf;
2474     StringCache ca = StringCache(16);
2475 
2476     const tokens = getTokensForParser(src, cf, &ca);
2477 
2478     int i;
2479     assert(tokens[i++].type == tok!"module");
2480     assert(tokens[i++].type == tok!"identifier");
2481     assert(tokens[i++].type == tok!".");
2482     assert(tokens[i++].type == tok!"identifier");
2483     assert(tokens[i++].type == tok!";");
2484     assert(tokens[i++].type == tok!"void");
2485     assert(tokens[i++].type == tok!"identifier");
2486     assert(tokens[i++].type == tok!"(");
2487     assert(tokens[i++].type == tok!")");
2488     assert(tokens[i++].type == tok!"{");
2489     assert(tokens[i++].type == tok!"identifier");
2490     assert(tokens[i++].type == tok!"=");
2491     assert(tokens[i].type == tok!"");
2492     assert(tokens[i].line == tokens[i - 1].line);
2493     assert(tokens[i].column == tokens[i - 1].column + 2);
2494     i++;
2495     assert(tokens[i++].type == tok!";");
2496     assert(tokens[i++].type == tok!"}");
2497 
2498     void checkInvalidTrailingString(const Token[] tokens)
2499     {
2500         assert(tokens.length == 3);
2501         assert(tokens[2].index != 0);
2502         assert(tokens[2].column >= 4);
2503         assert(tokens[2].type == tok!"");
2504     }
2505 
2506     checkInvalidTrailingString(getTokensForParser(`x = "foo`, cf, &ca));
2507     checkInvalidTrailingString(getTokensForParser(`x = r"foo`, cf, &ca));
2508     checkInvalidTrailingString(getTokensForParser(`x = x"00`, cf, &ca));
2509     checkInvalidTrailingString(getTokensForParser("x = `foo", cf, &ca));
2510     checkInvalidTrailingString(getTokensForParser("x = q{foo", cf, &ca));
2511     checkInvalidTrailingString(getTokensForParser(`x = q"foo`, cf, &ca));
2512     checkInvalidTrailingString(getTokensForParser("x = '", cf, &ca));
2513 }