// Written in the D programming language

/**
 * $(H2 Summary)
 * This module contains a range-based compile-time _lexer generator.
 *
 * $(H2 Overview)
 * The _lexer generator consists of a template mixin, $(LREF Lexer), along with
 * several helper templates for generating such things as token identifiers.
 *
 * To write a _lexer using this API:
 * $(OL
 *     $(LI Create the string array constants for your language.
 *         $(UL
 *             $(LI $(LINK2 #.staticTokens, staticTokens))
 *             $(LI $(LINK2 #.dynamicTokens, dynamicTokens))
 *             $(LI $(LINK2 #.possibleDefaultTokens, possibleDefaultTokens))
 *             $(LI $(LINK2 #.tokenHandlers, tokenHandlers))
 *         ))
 *     $(LI Create aliases for the various token and token identifier types
 *         specific to your language.
 *         $(UL
 *             $(LI $(LREF TokenIdType))
 *             $(LI $(LREF tokenStringRepresentation))
 *             $(LI $(LREF TokenStructure))
 *             $(LI $(LREF TokenId))
 *         ))
 *     $(LI Create a struct that mixes in the Lexer template mixin and
 *         implements the necessary functions.
 *         $(UL
 *             $(LI $(LREF Lexer))
 *         ))
 * )
 * Examples:
 * $(UL
 * $(LI A _lexer for D is available $(LINK2 https://github.com/Hackerpilot/Dscanner/blob/master/std/d/lexer.d, here).)
 * $(LI A _lexer for Lua is available $(LINK2 https://github.com/Hackerpilot/lexer-demo/blob/master/lualexer.d, here).)
 * $(LI A _lexer for JSON is available $(LINK2 https://github.com/Hackerpilot/lexer-demo/blob/master/jsonlexer.d, here).)
 * )
 * $(DDOC_ANCHOR TemplateParameters) $(H2 Template Parameter Definitions)
 * $(DL
 * $(DT $(DDOC_ANCHOR defaultTokenFunction) $(B defaultTokenFunction))
 * $(DD A function that serves as the default token lexing function. For most
 *     languages this will be the identifier lexing function.)
 * $(DT $(DDOC_ANCHOR tokenSeparatingFunction) $(B tokenSeparatingFunction))
 * $(DD A function that is able to determine if an identifier/keyword has come
 *     to an end. This function must return bool and take a single size_t
 *     argument representing the number of bytes to skip over before looking for
 *     a separating character.)
 * $(DT $(DDOC_ANCHOR staticTokens) $(B staticTokens))
 * $(DD A listing of the tokens whose exact value never changes and which cannot
 *     possibly be a token handled by the default token lexing function. The
 *     most common example of this kind of token is an operator such as
 *     $(D_STRING "*"), or $(D_STRING "-") in a programming language.)
 * $(DT $(DDOC_ANCHOR dynamicTokens) $(B dynamicTokens))
 * $(DD A listing of tokens whose value is variable, such as whitespace,
 *     identifiers, number literals, and string literals.)
 * $(DT $(DDOC_ANCHOR possibleDefaultTokens) $(B possibleDefaultTokens))
 * $(DD A listing of tokens that could possibly be one of the tokens handled by
 *     the default token handling function. A common example of this is
 *     a keyword such as $(D_STRING "for"), which looks like the beginning of
 *     the identifier $(D_STRING "fortunate"). $(B tokenSeparatingFunction) is
 *     called to determine if the character after the $(D_STRING 'r') separates
 *     the identifier, indicating that the token is $(D_STRING "for"), or if
 *     lexing should be turned over to the $(B defaultTokenFunction).)
 * $(DT $(DDOC_ANCHOR tokenHandlers) $(B tokenHandlers))
 * $(DD A mapping of prefixes to custom token handling function names. The
 *     generated _lexer will search for the even-index elements of this array,
 *     and then call the function whose name is the element immediately after the
 *     even-indexed element. This is used for lexing complex tokens whose prefix
 *     is fixed.)
 * )
 *
 * Here are some example constants for a simple calculator _lexer:
 * ---
 * // There are a near infinite number of valid number literals, so numbers are
 * // dynamic tokens.
 * enum string[] dynamicTokens = ["numberLiteral", "whitespace"];
 *
 * // The operators are always the same, and cannot start a numberLiteral, so
 * // they are staticTokens
 * enum string[] staticTokens = ["-", "+", "*", "/"];
 *
 * // In this simple example there are no keywords or other tokens that could
 * // look like dynamic tokens, so this is blank.
 * enum string[] possibleDefaultTokens = [];
 *
 * // If any whitespace character or digit is encountered, pass lexing over to
 * // our custom handler functions. These will be demonstrated in an example
 * // later on.
 * enum string[] tokenHandlers = [
 *     "0", "lexNumber",
 *     "1", "lexNumber",
 *     "2", "lexNumber",
 *     "3", "lexNumber",
 *     "4", "lexNumber",
 *     "5", "lexNumber",
 *     "6", "lexNumber",
 *     "7", "lexNumber",
 *     "8", "lexNumber",
 *     "9", "lexNumber",
 *     " ", "lexWhitespace",
 *     "\n", "lexWhitespace",
 *     "\t", "lexWhitespace",
 *     "\r", "lexWhitespace"
 * ];
 * ---
 *
 * Copyright: Brian Schott 2013
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Authors: Brian Schott, with ideas shamelessly stolen from Andrei Alexandrescu
 * Source: $(PHOBOSSRC std/experimental/_lexer.d)
 */

module std.experimental.lexer;

/**
 * Template for determining the type used to store a token type identifier.
 *
 * Selects the smallest unsigned integral type that is able to hold the value
 * staticTokens.length + dynamicTokens.length + possibleDefaultTokens.length
 * + 1 (the extra value is reserved for the end-of-input token identifier).
 * For example, if there are 20 static tokens, 30 dynamic tokens,
 * and 10 possible default tokens, this template will alias itself to ubyte,
 * as 20 + 30 + 10 + 1 < $(D_KEYWORD ubyte).max.
 * Examples:
 * ---
 * // In our calculator example this means that IdType is an alias for ubyte.
 * alias IdType = TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens);
 * ---
 */
template TokenIdType(alias staticTokens, alias dynamicTokens,
    alias possibleDefaultTokens)
{
    immutable tokenCount = staticTokens.length + dynamicTokens.length
        + possibleDefaultTokens.length + 1;
    static if (tokenCount <= ubyte.max)
        alias TokenIdType = ubyte;
    else static if (tokenCount <= ushort.max)
        alias TokenIdType = ushort;
    else static if (tokenCount <= uint.max)
        alias TokenIdType = uint;
    else
        static assert (false, "The number of tokens must be less than uint.max");
}
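
// A minimal compile-time check of the selection rule described above; the
// token lists here are arbitrary placeholders, not part of any real language.
unittest
{
    // 4 + 1 token identifiers fit comfortably in a ubyte.
    static assert (is(TokenIdType!(["+", "-"], ["num"], ["if"]) == ubyte));
}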

/**
 * Looks up the string representation of the given token type.
 *
 * This is the inverse of the $(LREF TokenId) template.
 * Params: type = the token type identifier
 * Examples:
 * ---
 * alias str = tokenStringRepresentation!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens);
 * assert (str(tok!"*") == "*");
 * ---
 * See_also: $(LREF TokenId)
 */
string tokenStringRepresentation(IdType, alias staticTokens, alias dynamicTokens,
    alias possibleDefaultTokens)(IdType type) pure nothrow @property @nogc @safe
{
    // Concatenate the token tables in a @trusted helper so that the cast to
    // immutable stays out of the @safe caller.
    static auto f() pure nothrow @trusted
    {
        return cast(immutable) staticTokens ~ dynamicTokens ~ possibleDefaultTokens;
    }

    static immutable tokens = f();

    if (type == 0)
        return "!ERROR!";
    else if (type < tokens.length + 1)
        return tokens[type - 1];
    else
        return null;
}

unittest
{
    alias IdType = TokenIdType!(["foo"], ["bar"], ["doo"]);
    enum tok(string token) = TokenId!(IdType, ["foo"], ["bar"], ["doo"], token);
    alias str = tokenStringRepresentation!(IdType, ["foo"], ["bar"], ["doo"]);

    static assert (str(tok!"foo") == "foo");
    static assert (str(tok!"bar") == "bar");
    static assert (str(tok!"doo") == "doo");
}

/**
 * Generates the token type identifier for the given symbol.
 *
 * There are two special cases:
 * $(UL
 *     $(LI If symbol is $(D_STRING ""), then the token identifier will be 0)
 *     $(LI If symbol is $(D_STRING "\0"), then the token identifier will be the maximum
 *         valid token type identifier)
 * )
 * In all cases this template will alias itself to a constant of type IdType.
 * This template will fail at compile time if $(D_PARAM symbol) is not one of
 * the staticTokens, dynamicTokens, or possibleDefaultTokens.
 * Examples:
 * ---
 * template tok(string symbol)
 * {
 *     alias tok = TokenId!(IdType, staticTokens, dynamicTokens,
 *         possibleDefaultTokens, symbol);
 * }
 * // num and plus are of type ubyte.
 * IdType plus = tok!"+";
 * IdType num = tok!"numberLiteral";
 * ---
 */
template TokenId(IdType, alias staticTokens, alias dynamicTokens,
    alias possibleDefaultTokens, string symbol)
{
    enum tokens = staticTokens ~ dynamicTokens ~ possibleDefaultTokens;

    import std.algorithm;
    static if (symbol == "")
    {
        enum id = 0;
        alias TokenId = id;
    }
    else static if (symbol == "\0")
    {
        enum id = 1 + tokens.length;
        alias TokenId = id;
    }
    else
    {
        enum i = tokens.countUntil(symbol);
        static if (i != -1)
        {
            enum id = i + 1;
            static assert (id >= 0 && id < IdType.max, "Invalid token: " ~ symbol);
            alias TokenId = id;
        }
        else
            static assert (0, "Invalid token: " ~ symbol);
    }
}
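
// A minimal sketch of the two special cases described above, reusing the
// placeholder token lists from the unittest for tokenStringRepresentation.
unittest
{
    alias IdType = TokenIdType!(["foo"], ["bar"], ["doo"]);
    enum tok(string token) = TokenId!(IdType, ["foo"], ["bar"], ["doo"], token);

    static assert (tok!"" == 0);    // the error token identifier
    static assert (tok!"\0" == 4);  // the maximum identifier: 3 tokens + 1
    static assert (tok!"foo" == 1); // ordinary tokens are numbered from 1
}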

/**
 * The token that is returned by the lexer.
 * Params:
 *     IdType = The type used for the token's type field.
 *     extraFields = A string containing D code for any extra fields that should
 *         be included in the token structure body. This string is passed
 *         directly to a mixin statement.
 * Examples:
 * ---
 * // No extra struct fields are desired in this example, so leave it blank.
 * alias Token = TokenStructure!(IdType, "");
 * Token minusToken = Token(tok!"-");
 * ---
 */
struct TokenStructure(IdType, string extraFields = "")
{
    public pure nothrow @safe @nogc
    {

        /**
        * Returns: true if this token has the same type and text as the given
        * token, false otherwise.
        */
        bool opEquals(ref const typeof(this) other) const
        {
            return this.type == other.type && this.text == other.text;
        }

        /**
        * Returns: true if the token has the given type, false otherwise.
        */
        bool opEquals(IdType type) const
        {
            return this.type == type;
        }

        /**
        * Constructs a token from a token type.
        * Params: type = the token type
        */
        this(IdType type)
        {
            this.type = type;
        }

        /**
        * Constructs a token.
        * Params:
        *     type = the token type
        *     text = the text of the token, which may be null
        *     line = the line number at which this token occurs
        *     column = the column number at which this token occurs
        *     index = the byte offset from the beginning of the input at which this
        *         token occurs
        */
        this(IdType type, string text, size_t line, size_t column, size_t index)
        {
            this.text = text;
            this.line = line;
            this.column = column;
            this.type = type;
            this.index = index;
        }

        /**
        * The _text of the token.
        */
        string text;

        /**
        * The _line number at which this token occurs.
        */
        size_t line;

        /**
        * The _column number at which this token occurs. This is measured in bytes
        * and may not be correct when tab characters are involved.
        */
        size_t column;

        /**
        * The byte offset from the beginning of the input at which this token
        * occurs.
        */
        size_t index;

        /**
        * The token type.
        */
        IdType type;

    }

    mixin (extraFields);
}
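
// A short sketch of the extraFields parameter and the two opEquals overloads.
// The "comment" field is an arbitrary example, not part of this module's API.
unittest
{
    alias IdType = TokenIdType!(["foo"], ["bar"], ["doo"]);
    enum tok(string token) = TokenId!(IdType, ["foo"], ["bar"], ["doo"], token);

    // The extraFields string is mixed into the struct body verbatim.
    alias Token = TokenStructure!(IdType, "string comment;");

    Token a = Token(tok!"foo", "foo", 1, 1, 0);
    a.comment = "hello";
    Token b = Token(tok!"foo", "foo", 2, 2, 5);
    assert (a == tok!"foo"); // compares only the token type
    assert (a == b);         // compares type and text; positions are ignored
}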

/**
 * The implementation of the _lexer is contained within this mixin template.
 *
 * To use it, this template should be mixed in to a struct that represents the
 * _lexer for your language. This struct should implement the following methods:
 * $(UL
 *     $(LI popFront, which should call this mixin's _popFront() and
 *         additionally perform any token filtering or shuffling you deem
 *         necessary. For example, you can implement popFront to skip comment or
 *         whitespace tokens.)
 *     $(LI A function that serves as the default token lexing function. For
 *         most languages this will be the identifier lexing function. This
 *         should then be passed to the $(LREF Lexer) template mixin as the
 *         $(LINK2 #.defaultTokenFunction defaultTokenFunction) template
 *         parameter.)
 *     $(LI A function that is able to determine if an identifier/keyword has
 *         come to an end. This function must return $(D_KEYWORD bool) and take
 *         a single $(D_KEYWORD size_t) argument representing the number of
 *         bytes to skip over before looking for a separating character.)
 *     $(LI Any functions referred to in the tokenHandlers template parameter.
 *         These functions must be marked $(D_KEYWORD pure nothrow), take a
 *         single $(D_KEYWORD ref) Token argument, and return $(D_KEYWORD void);
 *         the lexed token is written through that argument.)
 *     $(LI A constructor that initializes the range field and calls
 *         popFront() exactly once (to initialize the _front field).)
 * )
 * Params:
 *     Token = $(LREF TokenStructure)
 *     defaultTokenFunction = $(LINK2 #.defaultTokenFunction, defaultTokenFunction)
 *     tokenSeparatingFunction = $(LINK2 #.tokenSeparatingFunction, tokenSeparatingFunction)
 *     staticTokens = $(LINK2 #.staticTokens, staticTokens)
 *     dynamicTokens = $(LINK2 #.dynamicTokens, dynamicTokens)
 *     possibleDefaultTokens = $(LINK2 #.possibleDefaultTokens, possibleDefaultTokens)
 *     tokenHandlers = $(LINK2 #.tokenHandlers, tokenHandlers)
 * Examples:
 * ---
 * struct CalculatorLexer
 * {
 *     mixin Lexer!(Token, defaultTokenFunction, isSeparating,
 *         staticTokens, dynamicTokens, possibleDefaultTokens, tokenHandlers);
 *
 *     this (ubyte[] bytes)
 *     {
 *         this.range = LexerRange(bytes);
 *         popFront();
 *     }
 *
 *     void popFront() pure
 *     {
 *         _popFront();
 *     }
 *
 *     void lexNumber(ref Token token) pure nothrow @safe
 *     {
 *         // implementation goes here
 *     }
 *
 *     void lexWhitespace(ref Token token) pure nothrow @safe
 *     {
 *         // implementation goes here
 *     }
 *
 *     void defaultTokenFunction(ref Token token) pure nothrow @safe
 *     {
 *         // There is no default token in the example calculator language, so
 *         // this is always an error.
 *         range.popFront();
 *         token = Token(tok!"");
 *     }
 *
 *     bool isSeparating(size_t offset) pure nothrow @safe
 *     {
 *         // For this example language, always return true.
 *         return true;
 *     }
 * }
 * ---
 */
mixin template Lexer(Token, alias defaultTokenFunction,
    alias tokenSeparatingFunction, alias staticTokens, alias dynamicTokens,
    alias possibleDefaultTokens, alias tokenHandlers)
{
    private alias _IDType = typeof(Token.type);
    private enum _tok(string symbol) = TokenId!(_IDType, staticTokens, dynamicTokens, possibleDefaultTokens, symbol);

    static assert (tokenHandlers.length % 2 == 0, "Each pseudo-token must"
        ~ " have a corresponding handler function name.");

    // Returns a hexadecimal ulong literal whose low bytes are the first (up
    // to eight) bytes of the given token, in little-endian order.
    static string generateMask(const ubyte[] arr)
    {
        import std.string : format;
        ulong u;
        for (size_t i = 0; i < arr.length && i < 8; i++)
        {
            u |= (cast(ulong) arr[i]) << (i * 8);
        }
        return format("0x%016x", u);
    }

    // Returns a hexadecimal ulong literal that masks in only the low l bytes.
    private static string generateByteMask(size_t l)
    {
        import std.string : format;
        return format("0x%016x", ulong.max >> ((8 - l) * 8));
    }

    // Returns the number of times a list of length a must be halved before
    // the pieces are smaller than b. This balances the depth of the generated
    // if/else tree against the size of the final switch statements.
    private static size_t calcSplitCount(size_t a, size_t b) pure nothrow
    {
        int i;
        while (true)
        {
            i++;
            a /= 2;
            if (a < b)
                break;
        }
        return i;
    }

    private static char[] getBeginningChars(string[] allTokens)
    {
        char[] beginningChars;
        for (size_t i = 0; i < allTokens.length; i++)
        {
            if (allTokens[i].length == 0)
                continue;
            beginningChars ~= allTokens[i][0];
            size_t j = i + 1;
            while (j < allTokens.length && allTokens[i][0] == allTokens[j][0])
                j++;
            i = j - 1;
        }
        return beginningChars;
    }

    private static string generateStatements()
    {
        import std.algorithm : sort, uniq;
        import std.array : array;
        import std.range : stride;

        string[] pseudoTokens = array(tokenHandlers.stride(2));
        string[] allTokens = array(sort(staticTokens ~ possibleDefaultTokens ~ pseudoTokens).uniq());
        // Array consisting of a sorted list of the first characters of the
        // tokens.
        char[] beginningChars = getBeginningChars(allTokens);
        size_t i = calcSplitCount(beginningChars.length, 8);
        return generateStatementsStep(allTokens, pseudoTokens, beginningChars, i);
    }

    private static string generateStatementsStep(string[] allTokens,
        string[] pseudoTokens, char[] chars, size_t i, string indent = "")
    {
        import std.string : format;
        string code;
        if (i > 0)
        {
            size_t p = chars.length / 2;
            code ~= indent ~ format("if (f < 0x%02x) // %s \n%s{\n", chars[p], chars[p], indent);
            code ~= generateStatementsStep(allTokens, pseudoTokens, chars[0 .. p], i - 1, indent ~ "    ");
            code ~= indent ~ "}\n" ~ indent ~ "else\n" ~ indent ~ "{\n";
            code ~= generateStatementsStep(allTokens, pseudoTokens, chars[p .. $], i - 1, indent ~ "    ");
            code ~= indent ~ "}\n";
        }
        else
        {
            code ~= indent ~ "switch (f)\n" ~ indent ~ "{\n";
            foreach (char c; chars)
            {
                size_t begin;
                size_t end;
                for (size_t j = 0; j < allTokens.length; j++)
                {
                    if (allTokens[j].length == 0 || allTokens[j][0] != c)
                        continue;
                    begin = j;
                    end = j + 1;
                    while (end < allTokens.length && allTokens[begin][0] == allTokens[end][0])
                        end++;
                    break;
                }
                code ~= format("%scase 0x%02x:\n", indent, c);
                code ~= printCase(allTokens[begin .. end], pseudoTokens, indent ~ "    ");
            }
            code ~= indent ~ "default: goto _defaultTokenFunction;\n";
            code ~= indent ~ "}\n";
        }

        return code;
    }

    private static string printCase(string[] tokens, string[] pseudoTokens, string indent)
    {
        import std.array : array;
        import std.algorithm : countUntil, sort;
        import std.conv : text;
        string[] sortedTokens = array(sort!"a.length > b.length"(tokens));

        if (tokens.length == 1 && tokens[0].length == 1)
        {
            if (pseudoTokens.countUntil(tokens[0]) >= 0)
            {
                return indent ~ tokenHandlers[tokenHandlers.countUntil(tokens[0]) + 1]
                    ~ "(token);\n" ~ indent ~ "return;\n";
            }
            else if (staticTokens.countUntil(tokens[0]) >= 0)
            {
                return indent ~ "range.index++; range.column++;\n"
                    ~ indent ~ "token = Token(_tok!\"" ~ escape(tokens[0]) ~ "\", null, line, column, index);\n"
                    ~ indent ~ "return;\n";
            }
        }

        string code;

        bool insertTrailingGoto = true;
        foreach (i, token; sortedTokens)
        {
            immutable mask = generateMask(cast (const ubyte[]) token);
            if (token.length >= 8)
                code ~= indent ~ "if (frontBytes == " ~ mask ~ ")\n";
            else if (token.length != 1)
                code ~= indent ~ "if ((frontBytes & " ~ generateByteMask(token.length) ~ ") == " ~ mask ~ ")\n";
            if (token.length != 1)
                code ~= indent ~ "{\n";
            if (pseudoTokens.countUntil(token) >= 0)
            {
                if (token.length <= 8)
                {
                    if (token.length == 1)
                        insertTrailingGoto = false;
                    code ~= (token.length == 1 ? indent : indent ~ "    ")
                        ~ tokenHandlers[tokenHandlers.countUntil(token) + 1]
                        ~ "(token);\n";
                    code ~= (token.length == 1 ? indent : indent ~ "    ") ~ "return;\n";
                }
                else
                {
                    code ~= indent ~ "    if (range.startsWith(cast (ubyte[]) \"" ~ escape(token) ~ "\"))\n";
                    code ~= indent ~ "        "
                        ~ tokenHandlers[tokenHandlers.countUntil(token) + 1]
                        ~ "(token);\n";
                    code ~= indent ~ "    return;\n";
                }
            }
            else if (staticTokens.countUntil(token) >= 0)
            {
                if (token.length <= 8)
                {
                    if (token.length == 1)
                        insertTrailingGoto = false;
                    code ~= indent ~ (token.length != 1 ? "    " : "") ~ "range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n";
                    code ~= indent ~ (token.length != 1 ? "    " : "") ~ "token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n";
                    code ~= indent ~ (token.length != 1 ? "    " : "") ~ "return;\n";
                }
                else
                {
                    code ~= indent ~ "    pragma(msg, \"long static tokens not supported\"); // " ~ escape(token) ~ "\n";
                }
            }
            else
            {
                // possible default
                if (token.length <= 8)
                {
                    code ~= indent ~ "    if (tokenSeparatingFunction(" ~ text(token.length) ~ "))\n";
                    code ~= indent ~ "    {\n";
                    code ~= indent ~ "        range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n";
                    code ~= indent ~ "        token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n";
                    code ~= indent ~ "        return;\n";
                    code ~= indent ~ "    }\n";
                    code ~= indent ~ "    else\n";
                    code ~= indent ~ "        goto _defaultTokenFunction;\n";
                }
                else
                {
                    code ~= indent ~ "    if (range.startsWith(cast (ubyte[]) \"" ~ escape(token) ~ "\") && tokenSeparatingFunction(" ~ text(token.length) ~ "))\n";
                    code ~= indent ~ "    {\n";
                    code ~= indent ~ "        range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n";
                    code ~= indent ~ "        token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n";
                    code ~= indent ~ "        return;\n";
                    code ~= indent ~ "    }\n";
                    code ~= indent ~ "    else\n";
                    code ~= indent ~ "        goto _defaultTokenFunction;\n";
                }
            }
            if (token.length != 1)
            {
                code ~= indent ~ "}\n";
            }
        }
        if (insertTrailingGoto)
            code ~= indent ~ "goto _defaultTokenFunction;\n";
        return code;
    }

    /**
     * Implements the range primitive _front.
     */
    ref const(Token) front()() pure nothrow const @property @safe
    {
        return _front;
    }

    /**
     * Advances the lexer to the next token and stores the new current token in
     * the _front variable.
     */
    void _popFront()() pure nothrow @safe
    {
        advance(_front);
    }

    /**
     * Implements the range primitive _empty.
     */
    bool empty()() pure const nothrow @property @safe @nogc
    {
        return _front.type == _tok!"\0";
    }

    static string escape(string input) pure @trusted
    {
        string retVal;
        foreach (ubyte c; cast(ubyte[]) input)
        {
            switch (c)
            {
            case '\\': retVal ~= `\\`; break;
            case '"': retVal ~= `\"`; break;
            case '\'': retVal ~= `\'`; break;
            case '\t': retVal ~= `\t`; break;
            case '\n': retVal ~= `\n`; break;
            case '\r': retVal ~= `\r`; break;
            default: retVal ~= c; break;
            }
        }
        return retVal;
    }

    enum tokenSearch = generateStatements();

    // Packs the first (up to eight) bytes of arr into a ulong, little-endian,
    // padding any unused high bytes with 0xFF.
    static ulong getFront(const ubyte[] arr) pure nothrow @trusted
    {
        static union ByteArr { ulong l; ubyte[8] arr; }
        static assert(ByteArr.sizeof == ulong.sizeof);
        ByteArr b;
        b.l = ulong.max;
        b.arr[0 .. arr.length] = arr[];
        return b.l;
    }

    void advance(ref Token token) pure nothrow @trusted
    {
        if (range.index >= range.bytes.length)
        {
            token.type = _tok!"\0";
            return;
        }
        immutable size_t index = range.index;
        immutable size_t column = range.column;
        immutable size_t line = range.line;
        immutable ulong frontBytes = range.index + 8 <= range.bytes.length
            ? getFront(range.bytes[range.index .. range.index + 8])
            : getFront(range.bytes[range.index .. $]);
        ubyte f = cast(ubyte) frontBytes;
//        pragma(msg, tokenSearch);
        mixin(tokenSearch);
    _defaultTokenFunction:
        defaultTokenFunction(token);
    }

    /**
     * The lexer input.
     */
    LexerRange range;

    /**
     * The token that is currently at the front of the range.
     */
    Token _front;
}
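
// An end-to-end sketch of the calculator language used throughout this
// module's documentation. The handler implementations below (lexNumber,
// lexWhitespace) are illustrative assumptions, not part of this module's API.
version (unittest)
{
    private enum string[] calcStaticTokens = ["-", "+", "*", "/"];
    private enum string[] calcDynamicTokens = ["numberLiteral", "whitespace"];
    private enum string[] calcPossibleDefaultTokens = [];
    private enum string[] calcTokenHandlers = [
        "0", "lexNumber", "1", "lexNumber", "2", "lexNumber", "3", "lexNumber",
        "4", "lexNumber", "5", "lexNumber", "6", "lexNumber", "7", "lexNumber",
        "8", "lexNumber", "9", "lexNumber",
        " ", "lexWhitespace", "\n", "lexWhitespace",
        "\t", "lexWhitespace", "\r", "lexWhitespace"
    ];

    private alias CalcIdType = TokenIdType!(calcStaticTokens,
        calcDynamicTokens, calcPossibleDefaultTokens);
    private enum calcTok(string symbol) = TokenId!(CalcIdType,
        calcStaticTokens, calcDynamicTokens, calcPossibleDefaultTokens, symbol);
    private alias CalcToken = TokenStructure!(CalcIdType, "");

    private struct CalculatorLexer
    {
        mixin Lexer!(CalcToken, defaultTokenFunction, isSeparating,
            calcStaticTokens, calcDynamicTokens, calcPossibleDefaultTokens,
            calcTokenHandlers);

        this(const(ubyte)[] bytes) pure nothrow @safe
        {
            this.range = LexerRange(bytes);
            popFront();
        }

        void popFront() pure nothrow @safe
        {
            _popFront();
        }

        void lexNumber(ref CalcToken token) pure nothrow @trusted
        {
            immutable size_t index = range.index;
            immutable size_t column = range.column;
            immutable size_t line = range.line;
            while (!range.empty && range.front >= '0' && range.front <= '9')
                range.popFront();
            // The cast is trusted: the token text aliases the input buffer.
            token = CalcToken(calcTok!"numberLiteral",
                cast(string) range.slice(index), line, column, index);
        }

        void lexWhitespace(ref CalcToken token) pure nothrow @trusted
        {
            immutable size_t index = range.index;
            immutable size_t column = range.column;
            immutable size_t line = range.line;
            while (!range.empty && (range.front == ' ' || range.front == '\t'
                || range.front == '\r' || range.front == '\n'))
            {
                immutable bool newline = range.front == '\n';
                range.popFront();
                if (newline)
                    range.incrementLine();
            }
            token = CalcToken(calcTok!"whitespace",
                cast(string) range.slice(index), line, column, index);
        }

        void defaultTokenFunction(ref CalcToken token) pure nothrow @safe
        {
            // There is no default token in this language, so anything that
            // reaches this point is an error.
            range.popFront();
            token = CalcToken(calcTok!"");
        }

        bool isSeparating(size_t offset) pure nothrow @safe
        {
            // There are no possibleDefaultTokens, so this is never called.
            return true;
        }
    }
}

unittest
{
    auto lexer = CalculatorLexer(cast(const(ubyte)[]) "12 + 34");
    assert (lexer.front == calcTok!"numberLiteral" && lexer.front.text == "12");
    lexer.popFront();
    assert (lexer.front == calcTok!"whitespace");
    lexer.popFront();
    assert (lexer.front == calcTok!"+");
    lexer.popFront();
    assert (lexer.front == calcTok!"whitespace");
    lexer.popFront();
    assert (lexer.front == calcTok!"numberLiteral" && lexer.front.text == "34");
    lexer.popFront();
    assert (lexer.empty);
}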

/**
 * Range structure that wraps the _lexer's input.
 */
struct LexerRange
{
    // TODO: When D gets @forceinline, the template inlining hack
    // (i.e. `void front()() { ... }`) should be removed.

public nothrow pure @safe @nogc:
    /**
     * Params:
     *     bytes = the _lexer input
     *     index = the initial offset from the beginning of $(D_PARAM bytes)
     *     column = the initial _column number
     *     line = the initial _line number
     */
    this(const(ubyte)[] bytes, size_t index = 0, size_t column = 1, size_t line = 1)
    {
        this.bytes = bytes;
        this.index = index;
        this.column = column;
        this.line = line;
    }

    /**
     * Returns: a mark at the current position that can then be used with slice.
     */
    size_t mark()() const
    {
        return index;
    }

    /**
     * Sets the range to the given position.
     * Params: m = the position to seek to
     */
    void seek()(size_t m)
    {
        index = m;
    }

    /**
     * Returns a slice of the input byte array between the given mark and the
     * current position.
     * Params: m = the beginning index of the slice to return
     */
    const(ubyte)[] slice()(size_t m) const
    {
        return bytes[m .. index];
    }

    /**
     * Implements the range primitive _empty.
     */
    bool empty()() const
    {
        return index >= bytes.length;
    }

    /**
     * Implements the range primitive _front.
     */
    ubyte front()() const
    {
        return bytes[index];
    }

    /**
     * Returns: the current item as well as the $(D_PARAM p) items following it.
     */
    const(ubyte)[] peek(size_t p) const
    {
        return index + p + 1 > bytes.length
            ? bytes[index .. $]
            : bytes[index .. index + p + 1];
    }

    /**
     * Returns: true if the range starts with the given byte sequence
     */
    bool startsWith(const(ubyte[]) needle) const
    {
        if (needle.length + index > bytes.length)
            return false;
        foreach (i; 0 .. needle.length)
            if (needle[i] != bytes[index + i])
                return false;
        return true;
    }

    /**
     * Returns: the byte at the given offset from the current position.
     */
    ubyte peekAt()(size_t offset) const
    {
        return bytes[index + offset];
    }

    /**
     * Returns: true if it is possible to peek $(D_PARAM p) bytes ahead.
     */
    bool canPeek()(size_t p) const
    {
        return index + p < bytes.length;
    }

    /**
     * Implements the range primitive _popFront.
     */
    void popFront()()
    {
        index++;
        column++;
    }

    /**
     * Implements the range algorithm _popFrontN more efficiently than the
     * generic implementation by advancing the index and column in a single
     * step. This function does not detect or handle newlines.
     */
    void popFrontN()(size_t n)
    {
        index += n;
        column += n;
    }

    /**
     * Increments the range's line number and resets the column counter.
     */
    void incrementLine()(size_t i = 1)
    {
        column = 1;
        line += i;
    }

    /**
     * The input _bytes.
     */
    const(ubyte)[] bytes;

    /**
     * The range's current position.
     */
    size_t index;

    /**
     * The current _column number.
     */
    size_t column;

    /**
     * The current _line number.
     */
    size_t line;
}
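
// A brief sketch of the mark/slice and peek primitives on a raw byte buffer.
unittest
{
    auto r = LexerRange(cast(const(ubyte)[]) "abc\ndef");
    assert (r.front == 'a');
    immutable m = r.mark();
    r.popFrontN(3); // consume "abc"; popFrontN does not track newlines
    assert (r.slice(m) == cast(const(ubyte)[]) "abc");
    assert (r.column == 4);
    r.popFront();       // consume the newline...
    r.incrementLine();  // ...and account for it manually
    assert (r.line == 2 && r.column == 1);
    assert (r.peek(2) == cast(const(ubyte)[]) "def");
    assert (r.canPeek(2) && !r.canPeek(3));
}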