From 143f55a6e5e7757827efcc8318764b73a74aa895 Mon Sep 17 00:00:00 2001 From: Rintaro Ishizaki Date: Fri, 7 Sep 2018 10:32:43 +0900 Subject: [PATCH 01/12] [Lexer] Add formStringLiteralToken dedicated for forming string literal --- include/swift/Parse/Lexer.h | 5 +++-- include/swift/Parse/Token.h | 30 ++++++++++++++++++------------ lib/Parse/Lexer.cpp | 27 ++++++++++++++++++--------- lib/Parse/Parser.cpp | 4 ++-- 4 files changed, 41 insertions(+), 25 deletions(-) diff --git a/include/swift/Parse/Lexer.h b/include/swift/Parse/Lexer.h index d71c0d6e8c9a2..35f172e150726 100644 --- a/include/swift/Parse/Lexer.h +++ b/include/swift/Parse/Lexer.h @@ -506,9 +506,10 @@ class Lexer { return diagnose(Loc, Diagnostic(DiagID, std::forward(Args)...)); } - void formToken(tok Kind, const char *TokStart, bool IsMultilineString = false, - unsigned CustomDelimiterLen = 0); + void formToken(tok Kind, const char *TokStart); void formEscapedIdentifierToken(const char *TokStart); + void formStringLiteralToken(const char *TokStart, bool IsMultilineString, + unsigned CustomDelimiterLen); /// Advance to the end of the line. /// If EatNewLine is true, CurPtr will be at end of newline character. diff --git a/include/swift/Parse/Token.h b/include/swift/Parse/Token.h index 6cd0fc95828c4..59e3e456ccd99 100644 --- a/include/swift/Parse/Token.h +++ b/include/swift/Parse/Token.h @@ -220,6 +220,21 @@ class Token { default: return false; } } + + /// \brief True if the string literal token is multiline. + bool isMultilineString() const { + return MultilineString; + } + /// \brief Count of extending escaping '#'. + unsigned getCustomDelimiterLen() const { + return CustomDelimiterLen; + } + /// \brief Set characteristics of string literal token. + void setStringLiteral(bool IsMultilineString, unsigned CustomDelimiterLen) { + assert(Kind == tok::string_literal); + this->MultilineString = IsMultilineString; + this->CustomDelimiterLen = CustomDelimiterLen; + } /// getLoc - Return a source location identifier for the specified /// offset in the current file. @@ -268,25 +283,16 @@ class Token { void setText(StringRef T) { Text = T; } /// \brief Set the token to the specified kind and source range. - void setToken(tok K, StringRef T, unsigned CommentLength = 0, - bool IsMultilineString = false, unsigned CustomDelimiterLen = 0) { + void setToken(tok K, StringRef T, unsigned CommentLength = 0) { Kind = K; Text = T; this->CommentLength = CommentLength; EscapedIdentifier = false; - this->MultilineString = IsMultilineString; - this->CustomDelimiterLen = CustomDelimiterLen; + this->MultilineString = false; + this->CustomDelimiterLen = 0; assert(this->CustomDelimiterLen == CustomDelimiterLen && "custom string delimiter length > 255"); } - - bool isMultilineString() const { - return MultilineString; - } - - unsigned getCustomDelimiterLen() const { - return CustomDelimiterLen; - } }; } // end namespace swift diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index 9192819bad844..5d4dcf4d097c1 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -272,8 +272,7 @@ Token Lexer::getTokenAt(SourceLoc Loc) { return Result; } -void Lexer::formToken(tok Kind, const char *TokStart, - bool IsMultilineString, unsigned CustomDelimiterLen) { +void Lexer::formToken(tok Kind, const char *TokStart) { assert(CurPtr >= BufferStart && CurPtr <= BufferEnd && "Current pointer out of range!"); @@ -305,8 +304,7 @@ void Lexer::formToken(tok Kind, const char *TokStart, lexTrivia(TrailingTrivia, /* IsForTrailingTrivia */ true); } - NextToken.setToken(Kind, TokenText, CommentLength, - IsMultilineString, CustomDelimiterLen); + NextToken.setToken(Kind, TokenText, CommentLength); } void Lexer::formEscapedIdentifierToken(const char *TokStart) { @@ -326,6 +324,20 @@ void Lexer::formEscapedIdentifierToken(const char *TokStart) { NextToken.setEscapedIdentifier(true); } +static void validateMultilineIndents(const Token &Str, DiagnosticEngine *Diags); + +void Lexer::formStringLiteralToken(const char *TokStart, + bool IsMultilineString, + unsigned CustomDelimiterLen) { + formToken(tok::string_literal, TokStart); + if (NextToken.is(tok::eof)) + return; + NextToken.setStringLiteral(IsMultilineString, CustomDelimiterLen); + + if (IsMultilineString && Diags) + validateMultilineIndents(NextToken, Diags); +} + Lexer::State Lexer::getStateForBeginningOfTokenLoc(SourceLoc Loc) const { const char *Ptr = getBufferPtrForSourceLoc(Loc); // Skip whitespace backwards until we hit a newline. This is needed to @@ -1827,11 +1839,8 @@ void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) { if (wasErroneous) return formToken(tok::unknown, TokStart); - formToken(tok::string_literal, TokStart, - IsMultilineString, CustomDelimiterLen); - if (IsMultilineString && Diags) - validateMultilineIndents(NextToken, Diags); - return; + return formStringLiteralToken(TokStart, IsMultilineString, + CustomDelimiterLen); } } } diff --git a/lib/Parse/Parser.cpp b/lib/Parse/Parser.cpp index e0fd9200c1de6..95521e0479329 100644 --- a/lib/Parse/Parser.cpp +++ b/lib/Parse/Parser.cpp @@ -240,8 +240,8 @@ static void getStringPartTokens(const Token &Tok, const LangOptions &LangOpts, StringRef Text = SM.extractText({ Loc, Len }); Token NewTok; - NewTok.setToken(tok::string_literal, Text, - IsMultiline, CustomDelimiterLen); + NewTok.setToken(tok::string_literal, Text); + NewTok.setStringLiteral(IsMultiline, CustomDelimiterLen); Toks.push_back(NewTok); } else { From 0e9b232755e92814a31a8844ae72f8dee854c9da Mon Sep 17 00:00:00 2001 From: Rintaro Ishizaki Date: Mon, 17 Sep 2018 16:33:16 +0900 Subject: [PATCH 02/12] [Lexer] Fix double diagnostics for unterminated string literal We should not emit diagnostics in skipToEndOfInterpolatedExpression() --- lib/Parse/Lexer.cpp | 14 +++++--------- test/Parse/string_literal_eof1.swift | 5 +++++ test/Parse/string_literal_eof2.swift | 5 +++++ 3 files changed, 15 insertions(+), 9 deletions(-) create mode 100644 test/Parse/string_literal_eof1.swift create mode 100644 test/Parse/string_literal_eof2.swift diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index 5d4dcf4d097c1..976ebb075aefd 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -1444,6 +1444,11 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, continue; // Will be diagnosed as an unterminated string literal. return CurPtr-1; + case 0: + if (CurPtr-1 != EndPtr) + continue; // CC token or random NUL character. + // Will be diagnosed as an unterminated string literal. + return CurPtr-1; case '#': if (inStringLiteral() || @@ -1509,15 +1514,6 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, } } continue; - case 0: - // If we hit EOF, we fail. - if (CurPtr-1 == EndPtr) { - if (Diags) - Diags->diagnose(Lexer::getSourceLoc(CurPtr-1), - diag::lex_unterminated_string); - return CurPtr-1; - } - continue; // Paren nesting deeper to support "foo = \((a+b)-(c*d)) bar". case '(': diff --git a/test/Parse/string_literal_eof1.swift b/test/Parse/string_literal_eof1.swift new file mode 100644 index 0000000000000..accc6a0287560 --- /dev/null +++ b/test/Parse/string_literal_eof1.swift @@ -0,0 +1,5 @@ +// RUN: %target-typecheck-verify-swift + +// NOTE: DO NOT add a newline at EOF. +// expected-error@+1 {{unterminated string literal}} +_ = "foo\( \ No newline at end of file diff --git a/test/Parse/string_literal_eof2.swift b/test/Parse/string_literal_eof2.swift new file mode 100644 index 0000000000000..c0495575a02d8 --- /dev/null +++ b/test/Parse/string_literal_eof2.swift @@ -0,0 +1,5 @@ +// RUN: %target-typecheck-verify-swift + +// NOTE: DO NOT add a newline at EOF. +// expected-error@+1 {{unterminated string literal}} +_ = "foo\("bar \ No newline at end of file From c9c69633c439461287b2f4ea6f6eb38289fa6111 Mon Sep 17 00:00:00 2001 From: Rintaro Ishizaki Date: Mon, 17 Sep 2018 16:57:23 +0900 Subject: [PATCH 03/12] [Lexer] Simplify handling quotes in skipToEndOfInterpolatedExpression() NFC --- lib/Parse/Lexer.cpp | 66 +++++++++++++--------------- test/Parse/string_literal_eof3.swift | 10 +++++ 2 files changed, 40 insertions(+), 36 deletions(-) create mode 100644 test/Parse/string_literal_eof3.swift diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index 976ebb075aefd..7cb769954ba2c 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -1454,67 +1454,61 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, if (inStringLiteral() || !(CustomDelimiterLen = advanceIfCustomDelimiter(CurPtr, Diags))) continue; + assert(CurPtr[-1] == '"' && + "advanceIfCustomDelimiter() must stop at after the quote"); LLVM_FALLTHROUGH; case '"': case '\'': { - if (!AllowNewline.back() && inStringLiteral()) { - if (OpenDelimiters.back() == CurPtr[-1] && - delimiterMatches(CustomDelimiter.back(), CurPtr, Diags, true)) { - // Closing single line string literal. - OpenDelimiters.pop_back(); - AllowNewline.pop_back(); - CustomDelimiter.pop_back(); - } - // Otherwise, it's just a quote in string literal. e.g. "foo's". - continue; - } - - bool isMultilineQuote = advanceIfMultilineDelimiter(CurPtr, Diags); - if (!inStringLiteral()) { - // Open string literal + // Open string literal. OpenDelimiters.push_back(CurPtr[-1]); - AllowNewline.push_back(isMultilineQuote); + AllowNewline.push_back(advanceIfMultilineDelimiter(CurPtr, Diags)); CustomDelimiter.push_back(CustomDelimiterLen); continue; } - // We are in multiline string literal. - assert(AllowNewline.back() && "other cases must be handled above"); - if (isMultilineQuote && - delimiterMatches(CustomDelimiter.back(), CurPtr, Diags, true)) { - // Close multiline string literal. - OpenDelimiters.pop_back(); - AllowNewline.pop_back(); - CustomDelimiter.pop_back(); - } + // In string literal. + + // Skip if it's an another kind of quote in string literal. e.g. "foo's". + if (OpenDelimiters.back() != CurPtr[-1]) + continue; + + // Multi-line string can only be closed by '"""'. + if (AllowNewline.back() && !advanceIfMultilineDelimiter(CurPtr, Diags)) + continue; + + // Check whether we have equivalent number of '#'s. + if (!delimiterMatches(CustomDelimiter.back(), CurPtr, Diags, true)) + continue; - // Otherwise, it's just a normal character in multiline string. + // Close string literal. + OpenDelimiters.pop_back(); + AllowNewline.pop_back(); + CustomDelimiter.pop_back(); continue; } case '\\': + // We ignore invalid escape sequence here. They should be diagnosed in + // the real lexer functions. if (inStringLiteral() && delimiterMatches(CustomDelimiter.back(), CurPtr, Diags)) { - char escapedChar = *CurPtr++; - switch (escapedChar) { + switch (*CurPtr++) { case '(': // Entering a recursive interpolated expression OpenDelimiters.push_back('('); continue; - case '\n': case '\r': - if (AllowNewline.back()) - continue; - LLVM_FALLTHROUGH; - case 0: - // Don't jump over newline/EOF due to preceding backslash! - return CurPtr-1; + case '\n': case '\r': case 0: + // Don't jump over newline/EOF due to preceding backslash. + // Let the outer switch to handle it. + --CurPtr; + continue; default: continue; } } continue; - + // Paren nesting deeper to support "foo = \((a+b)-(c*d)) bar". case '(': if (!inStringLiteral()) { diff --git a/test/Parse/string_literal_eof3.swift b/test/Parse/string_literal_eof3.swift new file mode 100644 index 0000000000000..a5425e6779a53 --- /dev/null +++ b/test/Parse/string_literal_eof3.swift @@ -0,0 +1,10 @@ +// RUN: %target-typecheck-verify-swift + +// expected-error@+2 {{unterminated string literal}} +// expected-error@+1 {{invalid escape sequence in literal}} +_ = "foo \ + +// NOTE: DO NOT add a newline at EOF. +// expected-error@+2 {{unterminated string literal}} +// expected-error@+1 {{invalid escape sequence in literal}} +_ = "foo \ \ No newline at end of file From 4536e69dd6ca8ad215cd3e4f4bdcd97166544dd7 Mon Sep 17 00:00:00 2001 From: Rintaro Ishizaki Date: Mon, 17 Sep 2018 17:28:41 +0900 Subject: [PATCH 04/12] [Lexer] Don't emit diagnostics in skipToEndOfInterpolatedExpression() Removed Diags parameter from it. Skipped bytes are revisited by main lexer function anyway. So emitting diagnostics in it causes duplicated errors. --- lib/Parse/Lexer.cpp | 23 ++++++++++------------- test/Parse/raw_string_errors.swift | 12 ++++++++---- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index 7cb769954ba2c..4ee40308639bd 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -1414,7 +1414,6 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, /// outstanding delimiters as it scans the string. static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, const char *EndPtr, - DiagnosticEngine *Diags, bool IsMultilineString) { SmallVector OpenDelimiters; SmallVector AllowNewline; @@ -1452,7 +1451,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, case '#': if (inStringLiteral() || - !(CustomDelimiterLen = advanceIfCustomDelimiter(CurPtr, Diags))) + !(CustomDelimiterLen = advanceIfCustomDelimiter(CurPtr, nullptr))) continue; assert(CurPtr[-1] == '"' && "advanceIfCustomDelimiter() must stop at after the quote"); @@ -1463,7 +1462,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, if (!inStringLiteral()) { // Open string literal. OpenDelimiters.push_back(CurPtr[-1]); - AllowNewline.push_back(advanceIfMultilineDelimiter(CurPtr, Diags)); + AllowNewline.push_back(advanceIfMultilineDelimiter(CurPtr, nullptr)); CustomDelimiter.push_back(CustomDelimiterLen); continue; } @@ -1475,11 +1474,11 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, continue; // Multi-line string can only be closed by '"""'. - if (AllowNewline.back() && !advanceIfMultilineDelimiter(CurPtr, Diags)) + if (AllowNewline.back() && !advanceIfMultilineDelimiter(CurPtr, nullptr)) continue; // Check whether we have equivalent number of '#'s. - if (!delimiterMatches(CustomDelimiter.back(), CurPtr, Diags, true)) + if (!delimiterMatches(CustomDelimiter.back(), CurPtr, nullptr, true)) continue; // Close string literal. @@ -1492,7 +1491,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, // We ignore invalid escape sequence here. They should be diagnosed in // the real lexer functions. if (inStringLiteral() && - delimiterMatches(CustomDelimiter.back(), CurPtr, Diags)) { + delimiterMatches(CustomDelimiter.back(), CurPtr, nullptr)) { switch (*CurPtr++) { case '(': // Entering a recursive interpolated expression @@ -1757,10 +1756,9 @@ void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) { && *TmpPtr == '(') { // Consume tokens until we hit the corresponding ')'. CurPtr = TmpPtr + 1; - const char *EndPtr = - skipToEndOfInterpolatedExpression(CurPtr, BufferEnd, - Diags, IsMultilineString); - + const char *EndPtr = skipToEndOfInterpolatedExpression(CurPtr, BufferEnd, + IsMultilineString); + if (*EndPtr == ')') { // Successfully scanned the body of the expression literal. CurPtr = EndPtr+1; @@ -2231,9 +2229,8 @@ void Lexer::getStringLiteralSegments( IsFirstSegment = false; // Find the closing ')'. - const char *End = skipToEndOfInterpolatedExpression(BytesPtr, - Str.getText().end(), - Diags, MultilineString); + const char *End = skipToEndOfInterpolatedExpression( + BytesPtr, Str.getText().end(), MultilineString); assert(*End == ')' && "invalid string literal interpolations should" " not be returned as string literals"); ++End; diff --git a/test/Parse/raw_string_errors.swift b/test/Parse/raw_string_errors.swift index ca19b41fe4dd2..4bc97a11999ea 100644 --- a/test/Parse/raw_string_errors.swift +++ b/test/Parse/raw_string_errors.swift @@ -1,14 +1,18 @@ // RUN: %target-typecheck-verify-swift -#"\##("invalid")"# +let _ = "foo\(#"bar"##)baz" +// expected-error@-1{{too many '#' characters in closing delimiter}} +// expected-error@-2{{expected ',' separator}} +// expected-error@-3{{expected expression in list of expressions}} + +let _ = #"\##("invalid")"# // expected-error@-1{{too many '#' characters in delimited escape}} // expected-error@-2{{invalid escape sequence in literal}} -####"invalid"### +let _ = ####"invalid"### // expected-error@-1{{unterminated string literal}} -###"invalid"#### +let _ = ###"invalid"#### // expected-error@-1{{too many '#' characters in closing delimiter}} // expected-error@-2{{consecutive statements on a line must be separated by ';'}} // expected-error@-3{{expected expression}} -// expected-warning@-4{{string literal is unused}} From 893524ea77c940d170a2544accdb1e8dcc89a512 Mon Sep 17 00:00:00 2001 From: Rintaro Ishizaki Date: Mon, 17 Sep 2018 17:35:00 +0900 Subject: [PATCH 05/12] [Lexer] Add assertion in advanceIfCustomDelimiter() CurPtr[-1] must be '#' when called. --- lib/Parse/Lexer.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index 4ee40308639bd..6202353d04f76 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -1250,9 +1250,11 @@ static bool advanceIfMultilineDelimiter(const char *&CurPtr, /// advanceIfCustomDelimiter - Extracts/detects any custom delimiter on /// opening a string literal, advances CurPtr if a delimiter is found and -/// returns a non-zero delimiter length. CurPtr[-1] generally '#' when called. +/// returns a non-zero delimiter length. CurPtr[-1] must be '#' when called. static unsigned advanceIfCustomDelimiter(const char *&CurPtr, DiagnosticEngine *Diags) { + assert(CurPtr[-1] == '#'); + const char *TmpPtr = CurPtr; unsigned CustomDelimiterLen = 1; while (diagnoseZeroWidthMatchAndAdvance('#', TmpPtr, Diags)) From ba5172738c554703b28bfc9d86c6c38faa8f1be7 Mon Sep 17 00:00:00 2001 From: Rintaro Ishizaki Date: Mon, 17 Sep 2018 18:53:44 +0900 Subject: [PATCH 06/12] [Lexer] Advance pointer to the end of end-quote in lexCharacter This simplifies main lexStringLiteral loop --- lib/Parse/Lexer.cpp | 60 ++++++++++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index 6202353d04f76..eac162d97428f 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -1292,9 +1292,9 @@ static bool delimiterMatches(unsigned CustomDelimiterLen, const char *&BytesPtr, /// lexCharacter - Read a character and return its UTF32 code. If this is the /// end of enclosing string/character sequence (i.e. the character is equal to -/// 'StopQuote'), this returns ~0U and leaves 'CurPtr' pointing to the terminal -/// quote. If this is a malformed character sequence, it emits a diagnostic -/// (when EmitDiagnostics is true) and returns ~1U. +/// 'StopQuote'), this returns ~0U and advances 'CurPtr' pointing to the end of +/// terminal quote. If this is a malformed character sequence, it emits a +/// diagnostic (when EmitDiagnostics is true) and returns ~1U. /// /// character_escape ::= [\][\] | [\]t | [\]n | [\]r | [\]" | [\]' | [\]0 /// character_escape ::= unicode_character_escape @@ -1305,6 +1305,7 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, switch (*CurPtr++) { default: {// Normal characters are part of the string. + // Normal characters are part of the string. // If this is a "high" UTF-8 character, validate it. if ((signed char)(CurPtr[-1]) >= 0) { if (isPrintable(CurPtr[-1]) == 0) @@ -1322,14 +1323,26 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, } case '"': case '\'': - // If we found a closing quote character, we're done. if (CurPtr[-1] == StopQuote) { - --CurPtr; + // Mutliline and custom escaping are only enabled for " quote. + if (LLVM_UNLIKELY(StopQuote != '"')) + return ~0U; + if (!IsMultilineString && !CustomDelimiterLen) + return ~0U; + + DiagnosticEngine *D = EmitDiagnostics ? Diags : nullptr; + auto TmpPtr = CurPtr; + if (IsMultilineString && !advanceIfMultilineDelimiter(TmpPtr, D)) + return '"'; + if (CustomDelimiterLen && + !delimiterMatches(CustomDelimiterLen, TmpPtr, D, /*IsClosing=*/true)) + return '"'; + CurPtr = TmpPtr; return ~0U; } // Otherwise, this is just a character. return CurPtr[-1]; - + case 0: if (CurPtr-1 != BufferEnd) { if (EmitDiagnostics) @@ -1738,10 +1751,12 @@ static void validateMultilineIndents(const Token &Str, /// string_literal ::= ["]["]["].*["]["]["] - approximately /// string_literal ::= (#+)("")?".*"(\2\1) - "raw" strings void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) { - const char *TokStart = CurPtr-1; - assert((*TokStart == '"' || *TokStart == '\'') && "Unexpected start"); + const char QuoteChar = CurPtr[-1]; + const char *TokStart = CurPtr - 1 - CustomDelimiterLen; + // NOTE: We only allow single-quote string literals so we can emit useful // diagnostics about changing them to double quotes. + assert((QuoteChar == '"' || QuoteChar == '\'') && "Unexpected start"); bool wasErroneous = false, IsMultilineString = false; @@ -1774,23 +1789,26 @@ void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) { // String literals cannot have \n or \r in them (unless multiline). if (((*CurPtr == '\r' || *CurPtr == '\n') && !IsMultilineString) || CurPtr == BufferEnd) { - TokStart -= CustomDelimiterLen; diagnose(TokStart, diag::lex_unterminated_string); return formToken(tok::unknown, TokStart); } - unsigned CharValue = lexCharacter(CurPtr, *TokStart, true, + unsigned CharValue = lexCharacter(CurPtr, QuoteChar, true, IsMultilineString, CustomDelimiterLen); wasErroneous |= CharValue == ~1U; // If this is the end of string, we are done. If it is a normal character // or an already-diagnosed error, just munch it. if (CharValue == ~0U) { - ++CurPtr; - if (*TokStart == '\'') { - // Complain about single-quote string and suggest replacement with - // double-quoted equivalent. + if (QuoteChar == '\'') { + // Emit diagnostics for single-quote string and suggest replacement + // with double-quoted equivalent. + assert( + !IsMultilineString && CustomDelimiterLen == 0 && + "Single quoted string cannot have custom delimitor, nor multiline"); + assert(*TokStart == '\'' && CurPtr[-1] == '\''); + StringRef orig(TokStart, CurPtr - TokStart); llvm::SmallString<32> replacement; replacement += '"'; @@ -1823,15 +1841,11 @@ void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) { } // Is this the end of multiline/custom-delimited string literal? - if ((!IsMultilineString || advanceIfMultilineDelimiter(CurPtr, Diags)) && - delimiterMatches(CustomDelimiterLen, CurPtr, Diags, true)) { - TokStart -= CustomDelimiterLen; - if (wasErroneous) - return formToken(tok::unknown, TokStart); - - return formStringLiteralToken(TokStart, IsMultilineString, - CustomDelimiterLen); - } + if (wasErroneous) + return formToken(tok::unknown, TokStart); + + return formStringLiteralToken(TokStart, IsMultilineString, + CustomDelimiterLen); } } } From 1a4d37597f1643be4e7434181008f40d36428b50 Mon Sep 17 00:00:00 2001 From: Rintaro Ishizaki Date: Mon, 17 Sep 2018 19:07:32 +0900 Subject: [PATCH 07/12] [Lexer] Remove unnecessary logic from lexCharacter EOF and newline (in non-multiline string literal) must be handled by call site. lexCharacter doesn't need to handle them. Added assertion instead. --- lib/Parse/Lexer.cpp | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index eac162d97428f..e67f6da574cb0 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -1344,23 +1344,14 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, return CurPtr[-1]; case 0: - if (CurPtr-1 != BufferEnd) { - if (EmitDiagnostics) - diagnose(CurPtr-1, diag::lex_nul_character); - return CurPtr[-1]; - } - // Move the pointer back to EOF. - --CurPtr; + assert(CurPtr - 1 != BufferEnd && "Caller must handle EOF"); if (EmitDiagnostics) - diagnose(CurPtr-1, diag::lex_unterminated_string); - return ~1U; + diagnose(CurPtr-1, diag::lex_nul_character); + return CurPtr[-1]; case '\n': // String literals cannot have \n or \r in them. case '\r': - if (IsMultilineString) // ... unless they are multiline - return CurPtr[-1]; - if (EmitDiagnostics) - diagnose(CurPtr-1, diag::lex_unterminated_string); - return ~1U; + assert(IsMultilineString && "Caller must handle newlines in non-multiline"); + return CurPtr[-1]; case '\\': // Escapes. if (!delimiterMatches(CustomDelimiterLen, CurPtr, EmitDiagnostics ? Diags : nullptr)) From e6e55c23e17f2d1c3d5f0da9076f7135db4f5366 Mon Sep 17 00:00:00 2001 From: Rintaro Ishizaki Date: Mon, 17 Sep 2018 19:55:18 +0900 Subject: [PATCH 08/12] [Lexer] Code tweaks in lexStringLiteral() NFC. Reorder code to improve readability. --- lib/Parse/Lexer.cpp | 128 +++++++++++++++++++++----------------------- 1 file changed, 62 insertions(+), 66 deletions(-) diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index e67f6da574cb0..6ba64f9c66006 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -1749,32 +1749,30 @@ void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) { // diagnostics about changing them to double quotes. assert((QuoteChar == '"' || QuoteChar == '\'') && "Unexpected start"); - bool wasErroneous = false, IsMultilineString = false; - - // Is this the start of a multiline string literal? - if ((IsMultilineString = advanceIfMultilineDelimiter(CurPtr, Diags))) { - if (*CurPtr != '\n' && *CurPtr != '\r') - diagnose(CurPtr, diag::lex_illegal_multiline_string_start) + bool IsMultilineString = advanceIfMultilineDelimiter(CurPtr, Diags); + if (IsMultilineString && *CurPtr != '\n' && *CurPtr != '\r') + diagnose(CurPtr, diag::lex_illegal_multiline_string_start) .fixItInsert(Lexer::getSourceLoc(CurPtr), "\n"); - } + bool wasErroneous = false; while (true) { + // Handle string interpolation. const char *TmpPtr = CurPtr + 1; - if (*CurPtr == '\\' && delimiterMatches(CustomDelimiterLen, TmpPtr, nullptr) - && *TmpPtr == '(') { + if (*CurPtr == '\\' && + delimiterMatches(CustomDelimiterLen, TmpPtr, nullptr) && + *TmpPtr++ == '(') { // Consume tokens until we hit the corresponding ')'. - CurPtr = TmpPtr + 1; - const char *EndPtr = skipToEndOfInterpolatedExpression(CurPtr, BufferEnd, - IsMultilineString); - - if (*EndPtr == ')') { + CurPtr = skipToEndOfInterpolatedExpression(TmpPtr, BufferEnd, + IsMultilineString); + if (*CurPtr == ')') { // Successfully scanned the body of the expression literal. - CurPtr = EndPtr+1; - } else { - CurPtr = EndPtr; - wasErroneous = true; + ++CurPtr; + continue; } - continue; + + // Being diagnosed below. + assert((*CurPtr == '\r' || *CurPtr == '\n' || CurPtr == BufferEnd) && + "Returned at unexpected position"); } // String literals cannot have \n or \r in them (unless multiline). @@ -1786,59 +1784,57 @@ void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) { unsigned CharValue = lexCharacter(CurPtr, QuoteChar, true, IsMultilineString, CustomDelimiterLen); + // This is the end of string, we are done. + if (CharValue == ~0U) + break; + + // Remember we had already-diagnosed invalid characters. wasErroneous |= CharValue == ~1U; + } - // If this is the end of string, we are done. If it is a normal character - // or an already-diagnosed error, just munch it. - if (CharValue == ~0U) { - - if (QuoteChar == '\'') { - // Emit diagnostics for single-quote string and suggest replacement - // with double-quoted equivalent. - assert( - !IsMultilineString && CustomDelimiterLen == 0 && - "Single quoted string cannot have custom delimitor, nor multiline"); - assert(*TokStart == '\'' && CurPtr[-1] == '\''); - - StringRef orig(TokStart, CurPtr - TokStart); - llvm::SmallString<32> replacement; - replacement += '"'; - std::string str = orig.slice(1, orig.size() - 1).str(); - std::string quot = "\""; - size_t pos = 0; - while (pos != str.length()) { - if (str.at(pos) == '\\') { - if (str.at(pos + 1) == '\'') { - // Un-escape escaped single quotes. - str.replace(pos, 2, "'"); - ++pos; - } else { - // Skip over escaped characters. - pos += 2; - } - } else if (str.at(pos) == '"') { - str.replace(pos, 1, "\\\""); - // Advance past the newly added ["\""]. - pos += 2; - } else { - ++pos; - } + if (QuoteChar == '\'') { + // Emit diagnostics for single-quote string and suggest replacement + // with double-quoted equivalent. + assert(!IsMultilineString && CustomDelimiterLen == 0 && + "Single quoted string cannot have custom delimitor, nor multiline"); + assert(*TokStart == '\'' && CurPtr[-1] == '\''); + + StringRef orig(TokStart, CurPtr - TokStart); + llvm::SmallString<32> replacement; + replacement += '"'; + std::string str = orig.slice(1, orig.size() - 1).str(); + std::string quot = "\""; + size_t pos = 0; + while (pos != str.length()) { + if (str.at(pos) == '\\') { + if (str.at(pos + 1) == '\'') { + // Un-escape escaped single quotes. + str.replace(pos, 2, "'"); + ++pos; + } else { + // Skip over escaped characters. + pos += 2; } - replacement += StringRef(str); - replacement += '"'; - diagnose(TokStart, diag::lex_single_quote_string) - .fixItReplaceChars(getSourceLoc(TokStart), getSourceLoc(CurPtr), - replacement); + } else if (str.at(pos) == '"') { + str.replace(pos, 1, "\\\""); + // Advance past the newly added ["\""]. + pos += 2; + } else { + ++pos; } - - // Is this the end of multiline/custom-delimited string literal? - if (wasErroneous) - return formToken(tok::unknown, TokStart); - - return formStringLiteralToken(TokStart, IsMultilineString, - CustomDelimiterLen); } + replacement += StringRef(str); + replacement += '"'; + diagnose(TokStart, diag::lex_single_quote_string) + .fixItReplaceChars(getSourceLoc(TokStart), getSourceLoc(CurPtr), + replacement); } + + if (wasErroneous) + return formToken(tok::unknown, TokStart); + + return formStringLiteralToken(TokStart, IsMultilineString, + CustomDelimiterLen); } From 7b701d57d3fe976375ec48c93e9e686a78c32d5c Mon Sep 17 00:00:00 2001 From: Rintaro Ishizaki Date: Tue, 18 Sep 2018 08:03:46 +0900 Subject: [PATCH 09/12] [Lexer] Improve diagnostics for single-quote string literal Ignore the contents of interpolation. --- lib/Parse/Lexer.cpp | 51 ++++++++++++++++++++----------------- test/expr/expressions.swift | 3 +++ 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index 6ba64f9c66006..dfc4546fc893d 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -1799,32 +1799,37 @@ void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) { "Single quoted string cannot have custom delimitor, nor multiline"); assert(*TokStart == '\'' && CurPtr[-1] == '\''); - StringRef orig(TokStart, CurPtr - TokStart); - llvm::SmallString<32> replacement; - replacement += '"'; - std::string str = orig.slice(1, orig.size() - 1).str(); - std::string quot = "\""; - size_t pos = 0; - while (pos != str.length()) { - if (str.at(pos) == '\\') { - if (str.at(pos + 1) == '\'') { - // Un-escape escaped single quotes. - str.replace(pos, 2, "'"); - ++pos; - } else { - // Skip over escaped characters. - pos += 2; + SmallString<32> replacement; + replacement.push_back('"'); + const char *Ptr = TokStart + 1; + const char *OutputPtr = Ptr; + + while (*Ptr++ != '\'') { + if (Ptr[-1] == '\\') { + if (*Ptr == '\'') { + replacement.append(OutputPtr, Ptr - 1); + OutputPtr = Ptr + 1; + // Un-escape single quotes. + replacement.push_back('\''); + } else if (*Ptr == '(') { + // Preserve the contents of interpolation. + Ptr = skipToEndOfInterpolatedExpression(Ptr + 1, replacement.end(), + /*IsMultiline=*/false); + assert(*Ptr == ')'); } - } else if (str.at(pos) == '"') { - str.replace(pos, 1, "\\\""); - // Advance past the newly added ["\""]. - pos += 2; - } else { - ++pos; + // Skip over escaped characters. + ++Ptr; + } else if (Ptr[-1] == '"') { + replacement.append(OutputPtr, Ptr - 1); + OutputPtr = Ptr; + // Escape double quotes. + replacement.append("\\\""); } } - replacement += StringRef(str); - replacement += '"'; + assert(Ptr == CurPtr); + replacement.append(OutputPtr, Ptr - 1); + replacement.push_back('"'); + diagnose(TokStart, diag::lex_single_quote_string) .fixItReplaceChars(getSourceLoc(TokStart), getSourceLoc(CurPtr), replacement); diff --git a/test/expr/expressions.swift b/test/expr/expressions.swift index 4a38054f8b5bb..85b1a624bb737 100644 --- a/test/expr/expressions.swift +++ b/test/expr/expressions.swift @@ -505,6 +505,9 @@ func testSingleQuoteStringLiterals() { _ = 'ab\nc' // expected-error{{single-quoted string literal found, use '"'}}{{7-14="ab\\nc"}} _ = "abc\('def')" // expected-error{{single-quoted string literal found, use '"'}}{{13-18="def"}} + _ = 'ab\("c")' // expected-error{{single-quoted string literal found, use '"'}}{{7-17="ab\\("c")"}} + _ = 'a\('b')c' // expected-error{{single-quoted string literal found, use '"'}}{{7-17="a\\('b')c"}} + // expected-error@-1{{single-quoted string literal found, use '"'}}{{11-14="b"}} _ = "abc' // expected-error{{unterminated string literal}} _ = 'abc" // expected-error{{unterminated string literal}} From aaa4110a7cf7c780f077dcb2b842548904631302 Mon Sep 17 00:00:00 2001 From: Rintaro Ishizaki Date: Tue, 18 Sep 2018 08:24:48 +0900 Subject: [PATCH 10/12] [Lexer] Factor out diagnostics for single-quoted string literal For readability. --- lib/Parse/Lexer.cpp | 84 +++++++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 38 deletions(-) diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index dfc4546fc893d..7ee822799f5c6 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -1737,6 +1737,51 @@ static void validateMultilineIndents(const Token &Str, commonIndentation); } +/// Emit diagnostics for single-quote string and suggest replacement +/// with double-quoted equivalent. +static void diagnoseSingleQuoteStringLiteral(const char *TokStart, + const char *TokEnd, + DiagnosticEngine *D) { + assert(*TokStart == '\'' && TokEnd[-1] == '\''); + if (!D) + return; + + SmallString<32> replacement; + replacement.push_back('"'); + const char *Ptr = TokStart + 1; + const char *OutputPtr = Ptr; + + while (*Ptr++ != '\'' && Ptr < TokEnd) { + if (Ptr[-1] == '\\') { + if (*Ptr == '\'') { + replacement.append(OutputPtr, Ptr - 1); + OutputPtr = Ptr + 1; + // Un-escape single quotes. + replacement.push_back('\''); + } else if (*Ptr == '(') { + // Preserve the contents of interpolation. + Ptr = skipToEndOfInterpolatedExpression(Ptr + 1, replacement.end(), + /*IsMultiline=*/false); + assert(*Ptr == ')'); + } + // Skip over escaped characters. + ++Ptr; + } else if (Ptr[-1] == '"') { + replacement.append(OutputPtr, Ptr - 1); + OutputPtr = Ptr; + // Escape double quotes. + replacement.append("\\\""); + } + } + assert(Ptr == TokEnd && Ptr[-1] == '\''); + replacement.append(OutputPtr, Ptr - 1); + replacement.push_back('"'); + + D->diagnose(Lexer::getSourceLoc(TokStart), diag::lex_single_quote_string) + .fixItReplaceChars(Lexer::getSourceLoc(TokStart), + Lexer::getSourceLoc(TokEnd), replacement); +} + /// lexStringLiteral: /// string_literal ::= ["]([^"\\\n\r]|character_escape)*["] /// string_literal ::= ["]["]["].*["]["]["] - approximately @@ -1793,46 +1838,9 @@ void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) { } if (QuoteChar == '\'') { - // Emit diagnostics for single-quote string and suggest replacement - // with double-quoted equivalent. assert(!IsMultilineString && CustomDelimiterLen == 0 && "Single quoted string cannot have custom delimitor, nor multiline"); - assert(*TokStart == '\'' && CurPtr[-1] == '\''); - - SmallString<32> replacement; - replacement.push_back('"'); - const char *Ptr = TokStart + 1; - const char *OutputPtr = Ptr; - - while (*Ptr++ != '\'') { - if (Ptr[-1] == '\\') { - if (*Ptr == '\'') { - replacement.append(OutputPtr, Ptr - 1); - OutputPtr = Ptr + 1; - // Un-escape single quotes. - replacement.push_back('\''); - } else if (*Ptr == '(') { - // Preserve the contents of interpolation. - Ptr = skipToEndOfInterpolatedExpression(Ptr + 1, replacement.end(), - /*IsMultiline=*/false); - assert(*Ptr == ')'); - } - // Skip over escaped characters. - ++Ptr; - } else if (Ptr[-1] == '"') { - replacement.append(OutputPtr, Ptr - 1); - OutputPtr = Ptr; - // Escape double quotes. - replacement.append("\\\""); - } - } - assert(Ptr == CurPtr); - replacement.append(OutputPtr, Ptr - 1); - replacement.push_back('"'); - - diagnose(TokStart, diag::lex_single_quote_string) - .fixItReplaceChars(getSourceLoc(TokStart), getSourceLoc(CurPtr), - replacement); + diagnoseSingleQuoteStringLiteral(TokStart, CurPtr, Diags); } if (wasErroneous) From a5759b73b58398cf379de59b39acd096c2259597 Mon Sep 17 00:00:00 2001 From: Rintaro Ishizaki Date: Tue, 18 Sep 2018 08:43:39 +0900 Subject: [PATCH 11/12] [Lexer] Improve fix-it to remove "too long" delimiter in string literal Fix-it to remove extra '#'s at once. --- lib/Parse/Lexer.cpp | 24 ++++++++++++++---------- test/Parse/raw_string_errors.swift | 4 ++-- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index 7ee822799f5c6..b614aa01c5c6e 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -1277,16 +1277,20 @@ static bool delimiterMatches(unsigned CustomDelimiterLen, const char *&BytesPtr, if (!CustomDelimiterLen) return true; const char *TmpPtr = BytesPtr; - while (CustomDelimiterLen--) - if (!diagnoseZeroWidthMatchAndAdvance('#', TmpPtr, Diags)) - return false; - BytesPtr = TmpPtr; - if (*BytesPtr == '#' && Diags) - Diags->diagnose(Lexer::getSourceLoc(BytesPtr), IsClosing ? - diag::lex_invalid_closing_delimiter : - diag::lex_invalid_escape_delimiter) - .fixItRemoveChars(Lexer::getSourceLoc(BytesPtr), - Lexer::getSourceLoc(BytesPtr + 1)); + while (diagnoseZeroWidthMatchAndAdvance('#', TmpPtr, Diags)) {} + + if (TmpPtr - BytesPtr < CustomDelimiterLen) + return false; + + BytesPtr += CustomDelimiterLen; + + if (Diags && TmpPtr > BytesPtr) { + Diag<> message = IsClosing ? diag::lex_invalid_closing_delimiter + : diag::lex_invalid_escape_delimiter; + Diags->diagnose(Lexer::getSourceLoc(BytesPtr), message) + .fixItRemoveChars(Lexer::getSourceLoc(BytesPtr), + Lexer::getSourceLoc(TmpPtr)); + } return true; } diff --git a/test/Parse/raw_string_errors.swift b/test/Parse/raw_string_errors.swift index 4bc97a11999ea..967ffffcf633a 100644 --- a/test/Parse/raw_string_errors.swift +++ b/test/Parse/raw_string_errors.swift @@ -12,7 +12,7 @@ let _ = #"\##("invalid")"# let _ = ####"invalid"### // expected-error@-1{{unterminated string literal}} -let _ = ###"invalid"#### -// expected-error@-1{{too many '#' characters in closing delimiter}} +let _ = ###"invalid"###### +// expected-error@-1{{too many '#' characters in closing delimiter}}{{24-27=}} // expected-error@-2{{consecutive statements on a line must be separated by ';'}} // expected-error@-3{{expected expression}} From 50497ff5a9ab50a7b5982684c47d1f90f9676a55 Mon Sep 17 00:00:00 2001 From: Rintaro Ishizaki Date: Tue, 18 Sep 2018 08:44:20 +0900 Subject: [PATCH 12/12] [Lexer] Add a couple of test cases for extended escaping str literal --- test/Parse/raw_string.swift | 3 +++ test/Parse/raw_string_errors.swift | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/test/Parse/raw_string.swift b/test/Parse/raw_string.swift index 953b92324b041..d96ba2fd24ac3 100644 --- a/test/Parse/raw_string.swift +++ b/test/Parse/raw_string.swift @@ -132,3 +132,6 @@ _ = #""" ] """# // CHECK: "[\n {\n \"id\": \"12345\",\n \"title\": \"A title that \\\"contains\\\" \\\\\\\"\"\n }\n]" + +_ = #"# #"# +// CHECK: "# #" diff --git a/test/Parse/raw_string_errors.swift b/test/Parse/raw_string_errors.swift index 967ffffcf633a..0ba693fc79f30 100644 --- a/test/Parse/raw_string_errors.swift +++ b/test/Parse/raw_string_errors.swift @@ -16,3 +16,9 @@ let _ = ###"invalid"###### // expected-error@-1{{too many '#' characters in closing delimiter}}{{24-27=}} // expected-error@-2{{consecutive statements on a line must be separated by ';'}} // expected-error@-3{{expected expression}} + +let _ = ##"""## + foobar + ##"""## +// expected-error@-3{{multi-line string literal content must begin on a new line}}{{14-14=\n}} +// expected-error@-2{{multi-line string literal closing delimiter must begin on a new line}}{{5-5=\n}}