/// Utility for unescaping D string literals of any kind
module dparse.strings;

import std.algorithm;
import std.array;
import std.ascii : isAlphaNum, isHexDigit, isWhite;
import std.conv;
import std.range;
import std.string;
import std.utf;

/**
 * Checks if a string literal input has correct start/end sequences (quotes) to
 * be any kind of D string literal.
 *
 * Params:
 *   literal = the source text of the literal, including prefix, quotes and
 *     optional postfix.
 *   stringCloseChar = receives the character that terminates the literal
 *     (`"`, $(BACKTICK) or `}`).
 *   hasPostfix = set to `true` if the literal ends with a `c`, `w` or `d`
 *     postfix.
 *   parseEscapes = set to `true` for plain double quoted strings, which are the
 *     only kind in which backslash escape sequences are interpreted.
 *   prefixLength = receives the number of characters before the content
 *     (1 for `"`/$(BACKTICK), 2 for `r"`, `x"`, `q"` and `q{`).
 *
 * Returns: `true` if the literal has a valid start and end sequence. The `out`
 * parameters are only meaningful when `true` is returned; on failure they may
 * be partially filled in.
 *
 * Bugs: doesn't check for validity of token strings.
 *
 * Standards: $(LINK https://dlang.org/spec/lex.html#string_literals)
 */
bool isStringLiteral(const(char)[] literal, out char stringCloseChar,
        out bool hasPostfix, out bool parseEscapes, out int prefixLength)
{
    // there are no 1 character strings
    if (literal.length < 2)
        return false;

    // check for valid start
    bool allowPostfix;
    switch (literal[0])
    {
    case 'r': // WysiwygString
    case 'x': // HexString
        if (literal[1] != '"')
            return false;
        stringCloseChar = '"';
        allowPostfix = true;
        prefixLength = 2;
        break;
    case 'q': // DelimitedString
        if (literal[1] == '{')
            stringCloseChar = '}';
        else if (literal[1] == '"')
            stringCloseChar = '"';
        else
            return false;

        allowPostfix = false;
        prefixLength = 2;
        break;
    case '`':
    case '"':
        stringCloseChar = literal[0];
        allowPostfix = true;
        parseEscapes = stringCloseChar == '"';
        prefixLength = 1;
        break;
    default:
        return false;
    }

    if (allowPostfix && literal[$ - 1].among!('c', 'w', 'd'))
    {
        hasPostfix = true;
        literal = literal[0 .. $ - 1];
    }

    if (literal.length <= prefixLength || literal[$ - 1] != stringCloseChar)
        return false;

    if (parseEscapes)
    {
        // check if end escapes the quote, making this an invalid string
        auto end = literal[0 .. $ - 1].lastIndexOfNeither("\\");
        if (end != -1)
        {
            // don't need to subtract 1: the real count is
            // `literal.length - end - 2`, which has the same parity
            size_t countBackslashes = literal.length - end;

            if ((countBackslashes % 2) != 0)
                return false; // uneven backslash count -> invalid end
        }
    }

    return true;
}

/// ditto
bool isStringLiteral(const(char)[] literal)
{
    char stringCloseChar;
    bool hasPostfix, parseEscapes;
    int prefixLength;
    return isStringLiteral(literal, stringCloseChar, hasPostfix, parseEscapes,
        prefixLength);
}

///
unittest
{
    assert(isStringLiteral(`"hello"`));
    assert(isStringLiteral(`"hello world!"`));
    assert(isStringLiteral(`r"hello world!"c`));
    assert(isStringLiteral(`r"hello world!"d`));
    assert(isStringLiteral(`q{cool}`));
    assert(isStringLiteral(`q{cool\}`));
    assert(isStringLiteral(`"\\"`));
    assert(!isStringLiteral(`"\\\"`));
    assert(isStringLiteral(`"\\\\"`));
    assert(isStringLiteral(`"a\\\\"`));
    assert(isStringLiteral(`""`));
    assert(isStringLiteral(`q""`));
    assert(isStringLiteral(`x""`));
    assert(!isStringLiteral(``));
    assert(!isStringLiteral(`"`));
    assert(!isStringLiteral(`w""`));
    assert(!isStringLiteral(`hello"`));
    assert(!isStringLiteral(`"hello`));
    assert(!isStringLiteral(`"hello world`));
    assert(!isStringLiteral(`hello world`));
    assert(!isStringLiteral(`r"`));
    assert(!isStringLiteral(`rr"ok"`));
    assert(!isStringLiteral(`x"`));
    assert(!isStringLiteral(`x" `));
    assert(!isStringLiteral(`qqqq`));
}

/// Defines different handler types what to do when invalid escape sequences are
/// found inside $(LREF unescapeString).
enum InvalidEscapeAction
{
    /// keep the backslash character as well as the escape characters in the
    /// string like in the input string.
    keep = 0,
    /// Ignore and skip offending characters, drop them from the output. Named
    /// character entities are still being included like $(LREF keep) as they
    /// are not currently implemented.
    skip,
    /// Throw a ConvException on invalid escape sequences. Does not throw
    /// anything on unknown named character entities as they are not currently
    /// implemented but instead treats them like $(LREF keep).
    error
}

/**
 * Unescapes a D string, effectively being the same as mixing in the string into
 * some function call, but only for single string literals.
 *
 * Strips quotes, prefixes and suffixes, interprets escape sequences in normal
 * double quoted strings and interprets hex strings. Returns simple slices for
 * non-escaped strings.
 *
 * It's undefined how invalid/malformed strings are evaluated.
 *
 * Params:
 *   invalidEscapeAction = what to do when an invalid escape sequence, hex
 *     string digit or heredoc/delimiter ending is encountered.
 *   input = the full source text of the literal. Must satisfy
 *     $(LREF isStringLiteral) (checked by the `in` contract).
 *
 * Returns: the value of the literal as UTF-8 string. For literals that need no
 * transformation this is a slice of `input`.
 *
 * Throws: `ConvException` on malformed content if `invalidEscapeAction` is
 * $(LREF InvalidEscapeAction.error).
 *
 * Bugs: doesn't check for validity of token strings, doesn't interpret named
 * character entity escape sequences, (HTML-kind escape sequences) doesn't check
 * nesting level of delimited strings.
 *
 * Standards: $(LINK https://dlang.org/spec/lex.html#string_literals)
 */
string unescapeString(
    InvalidEscapeAction invalidEscapeAction = InvalidEscapeAction.error
)(
    string input
)
in
{
    assert(isStringLiteral(input));
}
do
{
    char stringCloseChar;
    bool hasPostfix, parseEscapes;
    int prefixLength;
    isStringLiteral(input, stringCloseChar, hasPostfix, parseEscapes,
        prefixLength);

    if (hasPostfix)
        input = input[0 .. $ - 1];

    // everything between the opening sequence and the closing character
    auto content = input[prefixLength .. $ - 1];

    if (!content.length)
        return content;

    if (input[0] == 'x')
    {
        // hex string, obsolete but still implemented
        return parseHexStringContent!invalidEscapeAction(content);
    }
    else if (input[0] == 'q' && input[1] == '"')
    {
        content = content.normalizeNewLines;
        if (isIdentifierChar(content[0]))
        {
            // heredoc form: identifier, newline, text, newline, identifier
            auto ln = content.indexOf('\n');
            if (ln == -1)
            {
                final switch (invalidEscapeAction)
                {
                case InvalidEscapeAction.keep:
                    return content;
                case InvalidEscapeAction.skip:
                    return null;
                case InvalidEscapeAction.error:
                    throw new ConvException("Invalid delimited escape string");
                }
            }
            auto delimiter = content[0 .. ln];
            content = content[ln + 1 .. $];
            if (!content.endsWith(chain("\n", delimiter)))
            {
                final switch (invalidEscapeAction)
                {
                case InvalidEscapeAction.keep:
                    return content;
                case InvalidEscapeAction.skip:
                    auto lastNl = content.lastIndexOf('\n');
                    if (lastNl == -1)
                        return content;
                    else
                        return content[0 .. lastNl];
                case InvalidEscapeAction.error:
                    throw new ConvException("Delimited escape string not ending correctly");
                }
            }
            // only the delimiter is stripped, the newline in front of it is
            // part of the string value
            return content[0 .. $ - delimiter.length];
        }
        else
        {
            // single character delimiter, brackets close with their counterpart
            char delimiterChar = content[0];
            char endChar;
            switch (delimiterChar)
            {
            case '[': endChar = ']'; break;
            case '(': endChar = ')'; break;
            case '<': endChar = '>'; break;
            case '{': endChar = '}'; break;
            default: endChar = delimiterChar; break;
            }

            if (content[1 .. $].endsWith(endChar))
                return content[1 .. $ - 1];
            else
            {
                final switch (invalidEscapeAction)
                {
                case InvalidEscapeAction.keep:
                    return content;
                case InvalidEscapeAction.skip:
                    return content[1 .. $];
                case InvalidEscapeAction.error:
                    throw new ConvException("Invalid delimited escape string");
                }
            }
        }
    }
    else
    {
        if (!parseEscapes)
            return content.normalizeNewLines;
        else
            return unescapeDoubleQuotedContent!invalidEscapeAction(
                content.normalizeNewLines);
    }
}

///
unittest
{
    assert(unescapeString(q{r"I am Oz"}) == r"I am Oz");
    assert(unescapeString(q{r"c:\games\Sudoku.exe"}) == r"c:\games\Sudoku.exe");
    assert(unescapeString(q{r"ab\n"}) == r"ab\n");

    assert(unescapeString(q{`the Great and Powerful.`}) == `the Great and Powerful.`);
    assert(unescapeString(q{`c:\games\Empire.exe`}) == `c:\games\Empire.exe`);
    assert(unescapeString(q{`The "lazy" dog`}) == `The "lazy" dog`);
    assert(unescapeString(q{`a"b\n`}) == `a"b\n`);

    assert(unescapeString(q{"Who are you?"}) == "Who are you?");
    assert(unescapeString(q{"c:\\games\\Doom.exe"}) == "c:\\games\\Doom.exe");
    assert(unescapeString(q{"ab\n"}) == "ab\n");

    assert(unescapeString(`x"0A"`) == hexString!"0A");
    assert(unescapeString(`x"00 FBCD 32FD 0A"`) == hexString!"00 FBCD 32FD 0A");

    assert(unescapeString(`q"(foo(xxx))"`) == q"(foo(xxx))");
    assert(unescapeString(`q"[foo{]"`) == q"[foo{]");
    assert(unescapeString(`q"<foo{>"`) == q"<foo{>");
    assert(unescapeString(`q"{foo(}"`) == q"{foo(}");
    assert(unescapeString(`q"EOS
This
is a multi-line
heredoc string
EOS"`) == q"EOS
This
is a multi-line
heredoc string
EOS");
    assert(unescapeString(`q"/foo]/"`) == `foo]`);

    assert(unescapeString(`q{this is the voice of}`) == q{this is the voice of});
    assert(unescapeString(`q{/*}*/ }`) == q{/*}*/ });
    assert(unescapeString(`q{ world(q{control}); }`) == q{ world(q{control}); });
    assert(unescapeString(`q{ __TIME__ }`) == q{ __TIME__ });

    assert(unescapeString(q{"hello"c}) == "hello");
    assert(unescapeString(q{"hello"w}) == "hello");
    assert(unescapeString(q{"hello"d}) == "hello");

    assert(unescapeString(`""`) == "");
    assert(unescapeString(`"hello\'world\"cool\""`) == "hello\'world\"cool\"");
    assert(unescapeString(`"\x0A"`) == "\x0A");
    assert(unescapeString(`"\u200b"`) == "\u200b");
    assert(unescapeString(`"\U0001F4A9"`) == "\U0001F4A9");
    assert(unescapeString(`"\0"`) == "\0");
    assert(unescapeString(`"\1"`) == "\1");
    assert(unescapeString(`"\12"`) == "\12");
    assert(unescapeString(`"\127"`) == "\127");
    assert(unescapeString(`"\1278"`) == "\1278");
    assert(unescapeString(`"\12a8"`) == "\12a8");
    assert(unescapeString(`"\1a28"`) == "\1a28");
    assert(unescapeString(`x"afDE"`) == "\xaf\xDE");
    assert(unescapeString("\"hello\nworld\rfoo\r\nbar\u2028ok\u2029\"")
            == "hello\nworld\nfoo\nbar\nok\n");
}

unittest
{
    import std.exception : assertThrown;

    // unimplemented named characters
    assert(unescapeString(`"\&foo;"`) == "\\&foo;");

    assertThrown!ConvException(unescapeString(`"\&foo"`));
    assert(unescapeString!(InvalidEscapeAction.keep)(`"\&foo"`) == "\\&foo");
    assert(unescapeString!(InvalidEscapeAction.skip)(`"\&foo"`) == "");
}

unittest
{
    import std.exception : assertThrown;

    assertThrown!ConvException(unescapeString(`q"EOS"`));
    assert(unescapeString!(InvalidEscapeAction.keep)(`q"EOS"`) == "EOS");
    assert(unescapeString!(InvalidEscapeAction.skip)(`q"EOS"`) == "");

    assertThrown!ConvException(unescapeString(`q"EOS
hello"`));
    assert(unescapeString!(InvalidEscapeAction.keep)(`q"EOS
hello"`) == "hello");
    assert(unescapeString!(InvalidEscapeAction.skip)(`q"EOS
hello"`) == "hello");
    assert(unescapeString!(InvalidEscapeAction.skip)(`q"EOS
hello
world"`) == "hello");

    assertThrown!ConvException(unescapeString(`q"/xd"`));
    assert(unescapeString!(InvalidEscapeAction.keep)(`q"/xd"`) == "/xd");
    assert(unescapeString!(InvalidEscapeAction.skip)(`q"/xd"`) == "xd");

    assertThrown!ConvException(unescapeString(`"\x"`));
    assert(unescapeString!(InvalidEscapeAction.keep)(`"\x"`) == "\\x");
    assert(unescapeString!(InvalidEscapeAction.skip)(`"\x"`) == "");

    assertThrown!ConvException(unescapeString(`"\u0"`));
    assert(unescapeString!(InvalidEscapeAction.keep)(`"\u0"`) == "\\u0");
    assert(unescapeString!(InvalidEscapeAction.skip)(`"\u0"`) == "");

    assertThrown!ConvException(unescapeString(`"\U0000000"`));
    assert(unescapeString!(InvalidEscapeAction.keep)(`"\U0000000"`) == "\\U0000000");
    assert(unescapeString!(InvalidEscapeAction.skip)(`"\U0000000"`) == "");

    assertThrown!ConvException(unescapeString(`"\xAG"`));
    assert(unescapeString!(InvalidEscapeAction.keep)(`"\xAG"`) == "\\xAG");
    assert(unescapeString!(InvalidEscapeAction.skip)(`"\xAG"`) == "");

    assertThrown!ConvException(unescapeString(`"\u00AG"`));
    assert(unescapeString!(InvalidEscapeAction.keep)(`"\u00AG"`) == "\\u00AG");
    assert(unescapeString!(InvalidEscapeAction.skip)(`"\u00AG"`) == "");

    assertThrown!ConvException(unescapeDoubleQuotedContent(`a\`));
    assert(unescapeDoubleQuotedContent!(InvalidEscapeAction.keep)(`a\`) == "a\\");
    assert(unescapeDoubleQuotedContent!(InvalidEscapeAction.skip)(`a\`) == "a");

    assertThrown!ConvException(unescapeString(`"\z"`));
    assert(unescapeString!(InvalidEscapeAction.keep)(`"\z"`) == "\\z");
    assert(unescapeString!(InvalidEscapeAction.skip)(`"\z"`) == "z");

    assert(parseHexStringContent("") == "");

    assertThrown!ConvException(unescapeString(`x"AG"`));
    assert(unescapeString!(InvalidEscapeAction.keep)(`x"AG"`) == "AG");
    assert(unescapeString!(InvalidEscapeAction.skip)(`x"AG"`) == "");

    assertThrown!ConvException(unescapeString(`x"A"`));
    assert(unescapeString!(InvalidEscapeAction.keep)(`x"A"`) == "A");
    assert(unescapeString!(InvalidEscapeAction.skip)(`x"A"`) == "");
}

unittest
{
    import std.exception : assertThrown;

    // \u and \U escapes that are syntactically fine but don't denote a valid
    // code point (surrogates, values above U+10FFFF) are invalid escapes
    assertThrown!ConvException(unescapeString(`"\uD800"`));
    assert(unescapeString!(InvalidEscapeAction.keep)(`"\uD800"`) == "\\uD800");
    assert(unescapeString!(InvalidEscapeAction.skip)(`"\uD800"`) == "");
    assert(unescapeString!(InvalidEscapeAction.skip)(`"a\uD800b"`) == "ab");

    assertThrown!ConvException(unescapeString(`"\U00110000"`));
    assert(unescapeString!(InvalidEscapeAction.keep)(`"\U00110000"`) == "\\U00110000");
    assert(unescapeString!(InvalidEscapeAction.skip)(`"\U00110000"`) == "");
}

/**
 * Interprets the backslash escape sequences inside the content of a double
 * quoted string (quotes already stripped).
 *
 * Params:
 *   invalidEscapeAction = what to do with invalid or unfinished escapes.
 *   input = the string content without surrounding quotes.
 *
 * Returns: `input` itself if it contains no backslash, otherwise a newly built
 * string with all escape sequences replaced.
 *
 * Throws: `ConvException` on invalid escapes if `invalidEscapeAction` is
 * $(LREF InvalidEscapeAction.error).
 */
private string unescapeDoubleQuotedContent(
    InvalidEscapeAction invalidEscapeAction = InvalidEscapeAction.error
)(
    string input
)
{
    auto escape = input.indexOf('\\');
    if (escape == -1)
        return input;

    auto ret = appender!string;
    ret.reserve(input.length);
    // index of the first input character that has not been handled yet
    size_t start = 0;

    // Checks that `length` more characters follow the backslash at `escape`.
    // On failure the rest of the input is consumed according to
    // invalidEscapeAction and false is returned.
    bool requireMinLength(size_t length)
    {
        if (escape + length >= input.length)
        {
            final switch (invalidEscapeAction)
            {
            case InvalidEscapeAction.keep:
                ret ~= input[start .. $];
                start = input.length;
                return false;
            case InvalidEscapeAction.skip:
                start = input.length;
                return false;
            case InvalidEscapeAction.error:
                throw new ConvException("Unfinished escape at end of string");
            }
        }
        else
        {
            return true;
        }
    }

    // Handles the invalid escape spanning `start .. continueAt` according to
    // invalidEscapeAction and continues parsing at `continueAt`.
    void errorInvalidCharacter(size_t continueAt)
    {
        final switch (invalidEscapeAction)
        {
        case InvalidEscapeAction.keep:
            ret ~= input[start .. continueAt];
            start = continueAt;
            break;
        case InvalidEscapeAction.skip:
            start = continueAt;
            break;
        case InvalidEscapeAction.error:
            throw new ConvException("Invalid escape character before index "
                ~ continueAt.to!string);
        }
    }

    // Parses `length` hex digits following `\u` / `\U` and appends the UTF-8
    // encoding of that code point. Returns false if the escape was invalid.
    bool parseUnicode(size_t length)
    {
        auto c = input[escape + 2 .. escape + 2 + length];
        if (!c.all!isHexDigit)
        {
            errorInvalidCharacter(escape + 2 + length);
            return false;
        }
        dchar ch = cast(dchar) c.to!uint(16);
        // surrogates and values above U+10FFFF would make encode throw a
        // UTFException, bypassing invalidEscapeAction
        if (!isValidDchar(ch))
        {
            errorInvalidCharacter(escape + 2 + length);
            return false;
        }
        char[4] buf;
        auto size = encode(buf, ch);
        ret ~= buf[0 .. size];
        start = escape + 2 + length;
        return true;
    }

    Loop: while (escape != -1)
    {
        // copy everything up to the backslash verbatim
        ret ~= input[start .. escape];
        start = escape;

        if (!requireMinLength(1))
            break;

        Switch:
        switch (input[escape + 1])
        {
        case '\'':
        case '"':
        case '?':
        case '\\':
            ret ~= input[escape + 1];
            start = escape + 2;
            break;

        case 'a': ret ~= '\a'; start = escape + 2; break;
        case 'b': ret ~= '\b'; start = escape + 2; break;
        case 'f': ret ~= '\f'; start = escape + 2; break;
        case 'n': ret ~= '\n'; start = escape + 2; break;
        case 'r': ret ~= '\r'; start = escape + 2; break;
        case 't': ret ~= '\t'; start = escape + 2; break;
        case 'v': ret ~= '\v'; start = escape + 2; break;

        case 'x':
            if (!requireMinLength(3))
                break Loop;
            char a = input[escape + 2];
            char b = input[escape + 3];
            if (!a.isHexDigit || !b.isHexDigit)
            {
                errorInvalidCharacter(escape + 4);
                break;
            }
            ret ~= cast(char)(a.parseHexChar << 4 | b.parseHexChar);
            start = escape + 4;
            break;
        case 'u':
            if (!requireMinLength(1 + 4))
                break Loop;
            parseUnicode(4);
            break;
        case 'U':
            if (!requireMinLength(1 + 8))
                break Loop;
            parseUnicode(8);
            break;
        case '0': .. case '7':
            // octal escape with 1 to 3 digits
            int length = 1;
            foreach (n; 2 .. 4)
            {
                if (escape + 1 + n > input.length)
                    break;
                char c = input[escape + n];
                if (c >= '0' && c <= '7')
                    length = n;
                else
                    break;
            }
            int c = input[escape + 1 .. escape + 1 + length].to!int(8);
            ret ~= cast(char) c;
            start = escape + 1 + length;
            break;
        case '&':
            // named character entities are not implemented, they are copied
            // verbatim including the backslash
            auto end = input.indexOf(';', escape + 2);
            if (end == -1)
            {
                errorInvalidCharacter(input.length);
            }
            else
            {
                ret ~= input[escape .. end + 1];
                start = end + 1;
            }
            break;
        default:
            errorInvalidCharacter(escape + 1);
            break;
        }

        escape = input.indexOf('\\', start);
    }
    ret ~= input[start .. $];
    return ret.data;
}

unittest
{
    assert(unescapeDoubleQuotedContent(`hello world`) == "hello world");
    assert(unescapeDoubleQuotedContent(`hello\nworld`) == "hello\nworld");
    assert(unescapeDoubleQuotedContent(`hello\tworld`) == "hello\tworld");
    assert(unescapeDoubleQuotedContent(`hello\u200bworld`) == "hello\u200bworld");
    assert(unescapeDoubleQuotedContent(`hello \"\\ok`) == "hello \"\\ok");
}

/**
 * Converts the content of a hex string (`x"..."`, quotes already stripped)
 * into the bytes it describes. Whitespace between digits is ignored.
 *
 * Params:
 *   invalidEscapeAction = what to do with non-hex characters and with a
 *     dangling single hex digit at the end.
 *   input = the hex string content without prefix and quotes.
 *
 * Returns: `input` itself if it is empty, otherwise a new string containing one
 * byte per pair of hex digits.
 *
 * Throws: `ConvException` on invalid content if `invalidEscapeAction` is
 * $(LREF InvalidEscapeAction.error).
 */
private string parseHexStringContent(
    InvalidEscapeAction invalidEscapeAction = InvalidEscapeAction.error
)(
    string input
)
{
    if (!input.length)
        return input;

    auto ret = appender!string;
    ret.reserve(input.length / 3);
    // first digit of the current pair, char.init while no digit is pending
    char buf;
    foreach (i, char c; input)
    {
        if (c.isWhite)
            continue;

        if (!c.isHexDigit)
        {
            final switch (invalidEscapeAction)
            {
            case InvalidEscapeAction.keep:
                if (buf != char.init)
                {
                    ret ~= buf;
                    buf = char.init;
                }
                ret ~= c;
                break;
            case InvalidEscapeAction.skip:
                break;
            case InvalidEscapeAction.error:
                throw new ConvException("Invalid hex character at index "
                    ~ i.to!string);
            }
        }
        else
        {
            if (buf == char.init)
            {
                buf = c;
            }
            else
            {
                ret ~= cast(char)(buf.parseHexChar << 4 | c.parseHexChar);
                buf = char.init;
            }
        }
    }

    if (buf != char.init)
    {
        final switch (invalidEscapeAction)
        {
        case InvalidEscapeAction.keep:
            ret ~= buf;
            break;
        case InvalidEscapeAction.skip:
            break;
        case InvalidEscapeAction.error:
            throw new ConvException("Unterminated hex character at end of string");
        }
    }

    return ret.data;
}

/// Returns the numeric value (0 to 15) of a single hex digit. The argument
/// must be a hex digit, which is checked by the `in` contract.
private int parseHexChar(char c)
in
{
    assert(c.isHexDigit);
    assert('a' > 'A' && 'A' > '0'); // just checking that ASCII doesn't suddenly change
}
do
{
    // can omit range ends and digit check because of function preconditions
    if (c >= 'a')
        return (c - 'a') + 10;
    else if (c >= 'A')
        return (c - 'A') + 10;
    else
        return c - '0';
}

/// Returns true for ASCII letters, ASCII digits and the underscore.
private bool isIdentifierChar(char c)
{
    return isAlphaNum(c) || c == '_';
}

/// normalizes all line endings with \n, as parsed in D strings
///
/// Replaces `\r`, `\r\n`, U+2028 and U+2029 with a single `\n`. Returns `text`
/// itself, without allocating, if none of these occur in it.
private string normalizeNewLines(string text)
{
    import std.utf : codeLength;

    // U+2028 and U+2029 take the same number of UTF-8 code units
    enum exoticLineBreakLength = codeLength!char('\u2028');
    static immutable dchar[] nlCharacters = ['\r', '\u2028', '\u2029'];

    auto end = text.indexOfAny(nlCharacters);
    if (end == -1)
        return text;
    auto ret = appender!string;
    ret.reserve(text.length);
    size_t start = 0;
    while (end != -1)
    {
        ret ~= text[start .. end];
        ret ~= '\n';
        // move `end` onto the last code unit of the line break sequence
        if (end + 1 < text.length && text[end] == '\r' && text[end + 1] == '\n')
            end++;
        else if (text[end] != '\r')
            end += exoticLineBreakLength - 1;
        start = end + 1;
        end = text.indexOfAny(nlCharacters, start);
    }
    ret ~= text[start .. $];
    return ret.data;
}

///
unittest
{
    string testNoChange = "hello\nworld!";
    assert(normalizeNewLines(testNoChange).ptr is testNoChange.ptr);

    assert(normalizeNewLines("hello\rworld") == "hello\nworld");
    assert(normalizeNewLines("hello\r\nworld") == "hello\nworld");
    assert(normalizeNewLines("hello\r\n\nworld") == "hello\n\nworld");
    assert(normalizeNewLines("hello\u2028\nworld") == "hello\n\nworld");
    assert(normalizeNewLines("hello\u2029\nworld") == "hello\n\nworld");
    assert(normalizeNewLines("hello\r") == "hello\n");
}