From 14213b84bdac161cabcc13fe3e37725882885ef7 Mon Sep 17 00:00:00 2001 From: John Holdsworth Date: Mon, 2 Jul 2018 12:47:12 +0100 Subject: [PATCH 01/15] Revised implementation for raw strings --- include/swift/Parse/Lexer.h | 19 +++++---- include/swift/Parse/Token.h | 14 +++++-- lib/Parse/Lexer.cpp | 79 +++++++++++++++++++++++++------------ lib/Parse/Parser.cpp | 6 +-- test/Parse/raw_string.swift | 47 ++++++++++++++++++++++ 5 files changed, 126 insertions(+), 39 deletions(-) create mode 100644 test/Parse/raw_string.swift diff --git a/include/swift/Parse/Lexer.h b/include/swift/Parse/Lexer.h index d09502b03e2c9..7acae6e262e8a 100644 --- a/include/swift/Parse/Lexer.h +++ b/include/swift/Parse/Lexer.h @@ -356,11 +356,12 @@ class Lexer { // Loc+Length for the segment inside the string literal, without quotes. SourceLoc Loc; unsigned Length, IndentToStrip; - bool IsFirstSegment, IsLastSegment; + bool IsFirstSegment, IsLastSegment, RawString; static StringSegment getLiteral(SourceLoc Loc, unsigned Length, bool IsFirstSegment, bool IsLastSegment, - unsigned IndentToStrip) { + unsigned IndentToStrip, + bool RawString) { StringSegment Result; Result.Kind = Literal; Result.Loc = Loc; @@ -368,6 +369,7 @@ class Lexer { Result.IsFirstSegment = IsFirstSegment; Result.IsLastSegment = IsLastSegment; Result.IndentToStrip = IndentToStrip; + Result.RawString = RawString; return Result; } @@ -379,6 +381,7 @@ class Lexer { Result.IsFirstSegment = false; Result.IsLastSegment = false; Result.IndentToStrip = 0; + Result.RawString = false; return Result; } @@ -395,13 +398,14 @@ class Lexer { SmallVectorImpl &Buffer, bool IsFirstSegment = false, bool IsLastSegment = false, - unsigned IndentToStrip = 0); + unsigned IndentToStrip = 0, + bool RawString = false); StringRef getEncodedStringSegment(StringSegment Segment, SmallVectorImpl &Buffer) const { return getEncodedStringSegment( StringRef(getBufferPtrForSourceLoc(Segment.Loc), Segment.Length), Buffer, Segment.IsFirstSegment, Segment.IsLastSegment, - Segment.IndentToStrip); + Segment.IndentToStrip, Segment.RawString); } /// \brief Given a string literal token, separate it into string/expr segments @@ -465,7 +469,8 @@ class Lexer { return diagnose(Loc, Diagnostic(DiagID, std::forward(Args)...)); } - void formToken(tok Kind, const char *TokStart, bool MultilineString = false); + void formToken(tok Kind, const char *TokStart, bool MultilineString = false, + bool RawString = false, size_t DelimiterLength = 0); void formEscapedIdentifierToken(const char *TokStart); /// Advance to the end of the line. @@ -491,8 +496,8 @@ class Lexer { unsigned lexCharacter(const char *&CurPtr, char StopQuote, bool EmitDiagnostics, - bool MultilineString = false); - void lexStringLiteral(); + bool Multiline = false, bool RawString = false); + void lexStringLiteral(bool RawString = false, std::string Delimiter = ""); void lexEscapedIdentifier(); void tryLexEditorPlaceholder(); diff --git a/include/swift/Parse/Token.h b/include/swift/Parse/Token.h index 9d1a99a179751..f85090f5bd04c 100644 --- a/include/swift/Parse/Token.h +++ b/include/swift/Parse/Token.h @@ -44,8 +44,9 @@ class Token { /// Modifiers for string literals unsigned MultilineString : 1; + unsigned RawString : 1; - // Padding bits == 32 - sizeof(Kind) * 8 - 3; + // Padding bits == 32 - sizeof(Kind) * 8 - 4; /// \brief The length of the comment that precedes the token. unsigned CommentLength; @@ -62,8 +63,8 @@ class Token { public: Token(tok Kind, StringRef Text, unsigned CommentLength = 0) : Kind(Kind), AtStartOfLine(false), EscapedIdentifier(false), - MultilineString(false), CommentLength(CommentLength), - Text(Text) {} + MultilineString(false), RawString(false), + CommentLength(CommentLength), Text(Text) {} Token() : Token(tok::NUM_TOKENS, {}, 0) {} @@ -266,17 +267,22 @@ class Token { /// \brief Set the token to the specified kind and source range. void setToken(tok K, StringRef T, unsigned CommentLength = 0, - bool MultilineString = false) { + bool MultilineString = false, bool RawString = false) { Kind = K; Text = T; this->CommentLength = CommentLength; EscapedIdentifier = false; this->MultilineString = MultilineString; + this->RawString = RawString; } bool IsMultilineString() const { return MultilineString; } + + bool IsRawString() const { + return RawString; + } }; } // end namespace swift diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index 21ed3c6892bd5..753a75ccbd8b1 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -267,7 +267,9 @@ Token Lexer::getTokenAt(SourceLoc Loc) { return Result; } -void Lexer::formToken(tok Kind, const char *TokStart, bool MultilineString) { +void Lexer::formToken(tok Kind, const char *TokStart, + bool MultilineString, bool RawString, + size_t DelimiterLength) { assert(CurPtr >= BufferStart && CurPtr <= BufferEnd && "Current pointer out of range!"); @@ -293,7 +295,9 @@ void Lexer::formToken(tok Kind, const char *TokStart, bool MultilineString) { lexTrivia(TrailingTrivia, /* IsForTrailingTrivia */ true); } - NextToken.setToken(Kind, TokenText, CommentLength, MultilineString); + NextToken.setToken(Kind, TokenText, CommentLength, + MultilineString, RawString); + CurPtr += DelimiterLength; } void Lexer::formEscapedIdentifierToken(const char *TokStart) { @@ -1213,7 +1217,8 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) { /// character_escape ::= [\][\] | [\]t | [\]n | [\]r | [\]" | [\]' | [\]0 /// character_escape ::= unicode_character_escape unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, - bool EmitDiagnostics, bool MultilineString) { + bool EmitDiagnostics, bool MultilineString, + bool RawString) { const char *CharStart = CurPtr; switch (*CurPtr++) { @@ -1262,6 +1267,8 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, diagnose(CurPtr-1, diag::lex_unterminated_string); return ~1U; case '\\': // Escapes. + if (RawString) + return '\\'; break; } @@ -1489,7 +1496,7 @@ getMultilineTrailingIndent(const Token &Str, DiagnosticEngine *Diags) { auto string = StringRef(start, end - start); // Disallow escaped newline in the last line. - if (Diags) { + if (Diags && !Str.IsRawString()) { auto *Ptr = start - 1; if (*Ptr == '\n') --Ptr; if (*Ptr == '\r') --Ptr; @@ -1645,25 +1652,28 @@ static void validateMultilineIndents(const Token &Str, /// lexStringLiteral: /// string_literal ::= ["]([^"\\\n\r]|character_escape)*["] /// string_literal ::= ["]["]["].*["]["]["] - approximately -void Lexer::lexStringLiteral() { +void Lexer::lexStringLiteral(bool RawString, std::string Delimiter) { + CurPtr += Delimiter.length(); const char *TokStart = CurPtr-1; assert((*TokStart == '"' || *TokStart == '\'') && "Unexpected start"); // NOTE: We only allow single-quote string literals so we can emit useful // diagnostics about changing them to double quotes. bool wasErroneous = false, MultilineString = false; + Delimiter.insert(0, 1, *TokStart); // Is this the start of a multiline string literal? if (*TokStart == '"' && *CurPtr == '"' && *(CurPtr + 1) == '"') { MultilineString = true; CurPtr += 2; + Delimiter.insert(0, 2, *TokStart); if (*CurPtr != '\n' && *CurPtr != '\r') diagnose(CurPtr, diag::lex_illegal_multiline_string_start) .fixItInsert(Lexer::getSourceLoc(CurPtr), "\n"); } while (true) { - if (*CurPtr == '\\' && *(CurPtr + 1) == '(') { + if (*CurPtr == '\\' && *(CurPtr + 1) == '(' && !RawString) { // Consume tokens until we hit the corresponding ')'. CurPtr += 2; const char *EndPtr = @@ -1687,7 +1697,8 @@ void Lexer::lexStringLiteral() { return formToken(tok::unknown, TokStart); } - unsigned CharValue = lexCharacter(CurPtr, *TokStart, true, MultilineString); + unsigned CharValue = lexCharacter(CurPtr, *TokStart, true, + MultilineString, RawString); wasErroneous |= CharValue == ~1U; // If this is the end of string, we are done. If it is a normal character @@ -1731,20 +1742,20 @@ void Lexer::lexStringLiteral() { replacement); } - // Is this the end of a multiline string literal? - if (MultilineString) { - if (*CurPtr == '"' && *(CurPtr + 1) == '"' && *(CurPtr + 2) != '"') { + // Is this the end of a delimited/multiline string literal? + if(StringRef(CurPtr - 1, Delimiter.length()) == Delimiter) { + if (MultilineString) { CurPtr += 2; - formToken(tok::string_literal, TokStart, MultilineString); + formToken(tok::string_literal, TokStart, + MultilineString, RawString, Delimiter.length() - 3); if (Diags) validateMultilineIndents(NextToken, Diags); return; } else - continue; + return formToken(tok::string_literal, TokStart, + MultilineString, RawString, Delimiter.length() - 1); } - - return formToken(tok::string_literal, TokStart, MultilineString); } } } @@ -2009,7 +2020,8 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes, SmallVectorImpl &TempString, bool IsFirstSegment, bool IsLastSegment, - unsigned IndentToStrip) { + unsigned IndentToStrip, + bool RawString) { TempString.clear(); // Note that it is always safe to read one over the end of "Bytes" because @@ -2036,7 +2048,7 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes, continue; } - if (CurChar != '\\') { + if (CurChar != '\\' || RawString) { TempString.push_back(CurChar); continue; } @@ -2117,11 +2129,10 @@ void Lexer::getStringLiteralSegments( // range check subscripting on the StringRef. const char *SegmentStartPtr = Bytes.begin(); const char *BytesPtr = SegmentStartPtr; - // FIXME: Use SSE to scan for '\'. - while (BytesPtr != Bytes.end()) { - char CurChar = *BytesPtr++; - if (CurChar != '\\') - continue; + size_t pos; + while (!Str.IsRawString() && + (pos = Bytes.find('\\', BytesPtr-Bytes.begin())) != StringRef::npos) { + BytesPtr = Bytes.begin() + pos + 1; if (*BytesPtr++ != '(') continue; @@ -2132,7 +2143,7 @@ void Lexer::getStringLiteralSegments( Segments.push_back( StringSegment::getLiteral(getSourceLoc(SegmentStartPtr), BytesPtr-SegmentStartPtr-2, - IsFirstSegment, false, IndentToStrip)); + IsFirstSegment, false, IndentToStrip, false)); IsFirstSegment = false; // Find the closing ')'. @@ -2155,9 +2166,16 @@ void Lexer::getStringLiteralSegments( Segments.push_back( StringSegment::getLiteral(getSourceLoc(SegmentStartPtr), Bytes.end()-SegmentStartPtr, - IsFirstSegment, true, IndentToStrip)); + IsFirstSegment, true, IndentToStrip, + Str.IsRawString())); } +/// A custom delimiter is zero or more # characters surrounding a quoted string +static bool isDelimitedString(const char *CurPtr, std::string &delimiter) { + while (*CurPtr == '#') + delimiter.push_back(*CurPtr++); + return *CurPtr == '"'; +} //===----------------------------------------------------------------------===// // Main Lexer Loop @@ -2250,9 +2268,20 @@ void Lexer::lexImpl() { case ',': return formToken(tok::comma, TokStart); case ';': return formToken(tok::semi, TokStart); case ':': return formToken(tok::colon, TokStart); - case '\\': return formToken(tok::backslash, TokStart); + case '\\': { + std::string Delimiter; + if (isDelimitedString(CurPtr, Delimiter)) { + CurPtr++; + return lexStringLiteral(true, Delimiter); + } + } + return formToken(tok::backslash, TokStart); - case '#': + case '#': { + std::string Delimiter; + if (isDelimitedString(CurPtr - 1, Delimiter)) + return lexStringLiteral(false, Delimiter); + } return lexHash(); // Operator characters. diff --git a/lib/Parse/Parser.cpp b/lib/Parse/Parser.cpp index 5446f6156a122..84493b0b0f752 100644 --- a/lib/Parse/Parser.cpp +++ b/lib/Parse/Parser.cpp @@ -373,7 +373,7 @@ class TokenRecorder: public ConsumeTokenReceiver { } void relexComment(CharSourceRange CommentRange, - llvm::SmallVectorImpl &Scracth) { + llvm::SmallVectorImpl &Scratch) { Lexer L(Ctx.LangOpts, Ctx.SourceMgr, BufferID, nullptr, /*InSILMode=*/false, CommentRetentionMode::ReturnAsTokens, TriviaRetentionMode::WithoutTrivia, @@ -384,8 +384,8 @@ class TokenRecorder: public ConsumeTokenReceiver { L.lex(Result); if (Result.is(tok::eof)) break; - assert(Result.is(tok::comment)); - Scracth.push_back(Result); + if(Result.is(tok::comment)) // interacts badly with custom delimiters + Scratch.push_back(Result); } } diff --git a/test/Parse/raw_string.swift b/test/Parse/raw_string.swift new file mode 100644 index 0000000000000..0eeaea55949eb --- /dev/null +++ b/test/Parse/raw_string.swift @@ -0,0 +1,47 @@ +// RUN: %target-swift-frontend -dump-ast %s 2>&1 | %FileCheck %s + +import Swift + +// ===---------- Multiline RawString --------=== + +_ = ##""" + One + ""Alpha"" + """## +// CHECK: "One\n\"\"Alpha\"\"" + +_ = ##""" + Two + Beta + """## +// CHECK: " Two\nBeta" + +_ = \""" + Three\r + Gamma\ + """ +// CHECK: " Three\\r\n Gamma\\" + +_ = \###""" + Four \(foo) + Delta +"""### +// CHECK: " Four \\(foo)\n Delta" + +_ = ##""" + print(""" + Five\n\n\nEpsilon + """) + """## +// CHECK: "print(\"\"\"\n Five\n\n\nEpsilon\n \"\"\")" + +// ===---------- Single line --------=== + +_ = #""Zeta""# +// CHECK: "\"Zeta\"" + +_ = #""Eta"\n\n\n\""# +// CHECK: "\"Eta\"\n\n\n\" + +_ = \#""Iota"\n\n\n\""# +// CHECK: "\"Iota\"\\n\\n\\n\\\"" From 2317048f44a834356ca593fe286e8baae3a583bd Mon Sep 17 00:00:00 2001 From: John Holdsworth Date: Wed, 4 Jul 2018 16:59:59 +0100 Subject: [PATCH 02/15] Alternative implementation for raw strings --- include/swift/Parse/Lexer.h | 20 ++++---- include/swift/Parse/Token.h | 17 ++++--- lib/Parse/Lexer.cpp | 93 ++++++++++++++++++++----------------- test/Parse/raw_string.swift | 81 ++++++++++++++++++++++++++------ 4 files changed, 137 insertions(+), 74 deletions(-) diff --git a/include/swift/Parse/Lexer.h b/include/swift/Parse/Lexer.h index 7acae6e262e8a..e62ce66b15eb0 100644 --- a/include/swift/Parse/Lexer.h +++ b/include/swift/Parse/Lexer.h @@ -355,13 +355,13 @@ class Lexer { enum : char { Literal, Expr } Kind; // Loc+Length for the segment inside the string literal, without quotes. SourceLoc Loc; - unsigned Length, IndentToStrip; - bool IsFirstSegment, IsLastSegment, RawString; + unsigned Length, IndentToStrip, DelimiterLength; + bool IsFirstSegment, IsLastSegment; static StringSegment getLiteral(SourceLoc Loc, unsigned Length, bool IsFirstSegment, bool IsLastSegment, unsigned IndentToStrip, - bool RawString) { + unsigned DelimiterLength) { StringSegment Result; Result.Kind = Literal; Result.Loc = Loc; @@ -369,7 +369,7 @@ class Lexer { Result.IsFirstSegment = IsFirstSegment; Result.IsLastSegment = IsLastSegment; Result.IndentToStrip = IndentToStrip; - Result.RawString = RawString; + Result.DelimiterLength = DelimiterLength; return Result; } @@ -381,7 +381,7 @@ class Lexer { Result.IsFirstSegment = false; Result.IsLastSegment = false; Result.IndentToStrip = 0; - Result.RawString = false; + Result.DelimiterLength = 0; return Result; } @@ -399,13 +399,13 @@ class Lexer { bool IsFirstSegment = false, bool IsLastSegment = false, unsigned IndentToStrip = 0, - bool RawString = false); + unsigned DelimiterLength = 0); StringRef getEncodedStringSegment(StringSegment Segment, SmallVectorImpl &Buffer) const { return getEncodedStringSegment( StringRef(getBufferPtrForSourceLoc(Segment.Loc), Segment.Length), Buffer, Segment.IsFirstSegment, Segment.IsLastSegment, - Segment.IndentToStrip, Segment.RawString); + Segment.IndentToStrip, Segment.DelimiterLength); } /// \brief Given a string literal token, separate it into string/expr segments @@ -470,7 +470,7 @@ class Lexer { } void formToken(tok Kind, const char *TokStart, bool MultilineString = false, - bool RawString = false, size_t DelimiterLength = 0); + unsigned DelimiterLength = 0); void formEscapedIdentifierToken(const char *TokStart); /// Advance to the end of the line. @@ -496,8 +496,8 @@ class Lexer { unsigned lexCharacter(const char *&CurPtr, char StopQuote, bool EmitDiagnostics, - bool Multiline = false, bool RawString = false); - void lexStringLiteral(bool RawString = false, std::string Delimiter = ""); + bool Multiline = false, unsigned DelimiterLength = 0); + void lexStringLiteral(unsigned DelimiterLength = 0); void lexEscapedIdentifier(); void tryLexEditorPlaceholder(); diff --git a/include/swift/Parse/Token.h b/include/swift/Parse/Token.h index f85090f5bd04c..9b95390d3111d 100644 --- a/include/swift/Parse/Token.h +++ b/include/swift/Parse/Token.h @@ -44,9 +44,11 @@ class Token { /// Modifiers for string literals unsigned MultilineString : 1; - unsigned RawString : 1; - // Padding bits == 32 - sizeof(Kind) * 8 - 4; + /// Length of custom delimiter of "raw" string literals + unsigned StringDelimiterLength : 8; + + // Padding bits == 32 - 11; /// \brief The length of the comment that precedes the token. unsigned CommentLength; @@ -63,7 +65,7 @@ class Token { public: Token(tok Kind, StringRef Text, unsigned CommentLength = 0) : Kind(Kind), AtStartOfLine(false), EscapedIdentifier(false), - MultilineString(false), RawString(false), + MultilineString(false), StringDelimiterLength(0), CommentLength(CommentLength), Text(Text) {} Token() : Token(tok::NUM_TOKENS, {}, 0) {} @@ -267,21 +269,22 @@ class Token { /// \brief Set the token to the specified kind and source range. void setToken(tok K, StringRef T, unsigned CommentLength = 0, - bool MultilineString = false, bool RawString = false) { + bool MultilineString = false, unsigned DelimiterLength = 0) { Kind = K; Text = T; this->CommentLength = CommentLength; EscapedIdentifier = false; this->MultilineString = MultilineString; - this->RawString = RawString; + StringDelimiterLength = DelimiterLength; + assert(StringDelimiterLength == DelimiterLength && "delimiter too long"); } bool IsMultilineString() const { return MultilineString; } - bool IsRawString() const { - return RawString; + unsigned DelimiterLength() const { + return StringDelimiterLength; } }; diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index 753a75ccbd8b1..8f500428afdcc 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -268,8 +268,7 @@ Token Lexer::getTokenAt(SourceLoc Loc) { } void Lexer::formToken(tok Kind, const char *TokStart, - bool MultilineString, bool RawString, - size_t DelimiterLength) { + bool MultilineString, unsigned DelimiterLength) { assert(CurPtr >= BufferStart && CurPtr <= BufferEnd && "Current pointer out of range!"); @@ -296,7 +295,7 @@ void Lexer::formToken(tok Kind, const char *TokStart, } NextToken.setToken(Kind, TokenText, CommentLength, - MultilineString, RawString); + MultilineString, DelimiterLength); CurPtr += DelimiterLength; } @@ -1208,6 +1207,21 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) { } } +/// delimiterMatches - Does custom delimiter (# characters surrounding quotes) +/// match the number of # charatters after \ inside the string? This allows +/// interpolation inside a "raw" string. Normal/cooked string processing is +/// the degenerate case of there being no # characters surrounding the quotes. +/// If delimiter matches, advances byte pointer passed in and returns true. +static bool delimiterMatches(unsigned DelimiterLength, const char *&BytesPtr) { + if (!DelimiterLength) + return true; + for (unsigned i = 0; i < DelimiterLength ; i++) + if (BytesPtr[i] != '#') + return false; + BytesPtr += DelimiterLength; + return true; +} + /// lexCharacter - Read a character and return its UTF32 code. If this is the /// end of enclosing string/character sequence (i.e. the character is equal to /// 'StopQuote'), this returns ~0U and leaves 'CurPtr' pointing to the terminal @@ -1218,7 +1232,7 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) { /// character_escape ::= unicode_character_escape unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, bool EmitDiagnostics, bool MultilineString, - bool RawString) { + unsigned DelimiterLength) { const char *CharStart = CurPtr; switch (*CurPtr++) { @@ -1267,7 +1281,7 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, diagnose(CurPtr-1, diag::lex_unterminated_string); return ~1U; case '\\': // Escapes. - if (RawString) + if (!delimiterMatches(DelimiterLength, CurPtr)) return '\\'; break; } @@ -1496,7 +1510,7 @@ getMultilineTrailingIndent(const Token &Str, DiagnosticEngine *Diags) { auto string = StringRef(start, end - start); // Disallow escaped newline in the last line. - if (Diags && !Str.IsRawString()) { + if (Diags && Str.DelimiterLength() == 0) { auto *Ptr = start - 1; if (*Ptr == '\n') --Ptr; if (*Ptr == '\r') --Ptr; @@ -1652,30 +1666,34 @@ static void validateMultilineIndents(const Token &Str, /// lexStringLiteral: /// string_literal ::= ["]([^"\\\n\r]|character_escape)*["] /// string_literal ::= ["]["]["].*["]["]["] - approximately -void Lexer::lexStringLiteral(bool RawString, std::string Delimiter) { - CurPtr += Delimiter.length(); +void Lexer::lexStringLiteral(unsigned DelimiterLength) { const char *TokStart = CurPtr-1; assert((*TokStart == '"' || *TokStart == '\'') && "Unexpected start"); // NOTE: We only allow single-quote string literals so we can emit useful // diagnostics about changing them to double quotes. bool wasErroneous = false, MultilineString = false; - Delimiter.insert(0, 1, *TokStart); + std::string Delimiter; + Delimiter.push_back(*TokStart); // Is this the start of a multiline string literal? if (*TokStart == '"' && *CurPtr == '"' && *(CurPtr + 1) == '"') { MultilineString = true; CurPtr += 2; - Delimiter.insert(0, 2, *TokStart); + Delimiter.push_back(*TokStart); + Delimiter.push_back(*TokStart); if (*CurPtr != '\n' && *CurPtr != '\r') diagnose(CurPtr, diag::lex_illegal_multiline_string_start) .fixItInsert(Lexer::getSourceLoc(CurPtr), "\n"); } + Delimiter.insert(Delimiter.size(), DelimiterLength, '#'); while (true) { - if (*CurPtr == '\\' && *(CurPtr + 1) == '(' && !RawString) { + const char *TmpPtr = CurPtr + 1; + if (*CurPtr == '\\' && + delimiterMatches(DelimiterLength, TmpPtr) && *TmpPtr == '(') { // Consume tokens until we hit the corresponding ')'. - CurPtr += 2; + CurPtr = TmpPtr + 1; const char *EndPtr = skipToEndOfInterpolatedExpression(CurPtr, BufferEnd, Diags, MultilineString); @@ -1698,7 +1716,7 @@ void Lexer::lexStringLiteral(bool RawString, std::string Delimiter) { } unsigned CharValue = lexCharacter(CurPtr, *TokStart, true, - MultilineString, RawString); + MultilineString, DelimiterLength); wasErroneous |= CharValue == ~1U; // If this is the end of string, we are done. If it is a normal character @@ -1747,14 +1765,14 @@ void Lexer::lexStringLiteral(bool RawString, std::string Delimiter) { if (MultilineString) { CurPtr += 2; formToken(tok::string_literal, TokStart, - MultilineString, RawString, Delimiter.length() - 3); + MultilineString, DelimiterLength); if (Diags) validateMultilineIndents(NextToken, Diags); return; } else return formToken(tok::string_literal, TokStart, - MultilineString, RawString, Delimiter.length() - 1); + MultilineString, DelimiterLength); } } } @@ -2021,7 +2039,7 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes, bool IsFirstSegment, bool IsLastSegment, unsigned IndentToStrip, - bool RawString) { + unsigned DelmiterLength) { TempString.clear(); // Note that it is always safe to read one over the end of "Bytes" because @@ -2048,7 +2066,7 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes, continue; } - if (CurChar != '\\' || RawString) { + if (CurChar != '\\' || !delimiterMatches(DelmiterLength, BytesPtr)) { TempString.push_back(CurChar); continue; } @@ -2119,7 +2137,7 @@ void Lexer::getStringLiteralSegments( // Are substitutions required either for indent stripping or line ending // normalization? bool MultilineString = Str.IsMultilineString(), IsFirstSegment = true; - unsigned IndentToStrip = 0; + unsigned IndentToStrip = 0, DelimiterLength = Str.DelimiterLength(); if (MultilineString) IndentToStrip = std::get<0>(getMultilineTrailingIndent(Str, /*Diags=*/nullptr)).size(); @@ -2130,11 +2148,10 @@ void Lexer::getStringLiteralSegments( const char *SegmentStartPtr = Bytes.begin(); const char *BytesPtr = SegmentStartPtr; size_t pos; - while (!Str.IsRawString() && - (pos = Bytes.find('\\', BytesPtr-Bytes.begin())) != StringRef::npos) { + while ((pos = Bytes.find('\\', BytesPtr-Bytes.begin())) != StringRef::npos) { BytesPtr = Bytes.begin() + pos + 1; - if (*BytesPtr++ != '(') + if (!delimiterMatches(DelimiterLength, BytesPtr) || *BytesPtr++ != '(') continue; // String interpolation. @@ -2142,8 +2159,9 @@ void Lexer::getStringLiteralSegments( // Push the current segment. Segments.push_back( StringSegment::getLiteral(getSourceLoc(SegmentStartPtr), - BytesPtr-SegmentStartPtr-2, - IsFirstSegment, false, IndentToStrip, false)); + BytesPtr-SegmentStartPtr-2-DelimiterLength, + IsFirstSegment, false, IndentToStrip, + DelimiterLength)); IsFirstSegment = false; // Find the closing ')'. @@ -2167,14 +2185,7 @@ void Lexer::getStringLiteralSegments( StringSegment::getLiteral(getSourceLoc(SegmentStartPtr), Bytes.end()-SegmentStartPtr, IsFirstSegment, true, IndentToStrip, - Str.IsRawString())); -} - -/// A custom delimiter is zero or more # characters surrounding a quoted string -static bool isDelimitedString(const char *CurPtr, std::string &delimiter) { - while (*CurPtr == '#') - delimiter.push_back(*CurPtr++); - return *CurPtr == '"'; + DelimiterLength)); } //===----------------------------------------------------------------------===// @@ -2268,19 +2279,17 @@ void Lexer::lexImpl() { case ',': return formToken(tok::comma, TokStart); case ';': return formToken(tok::semi, TokStart); case ':': return formToken(tok::colon, TokStart); - case '\\': { - std::string Delimiter; - if (isDelimitedString(CurPtr, Delimiter)) { - CurPtr++; - return lexStringLiteral(true, Delimiter); - } - } - return formToken(tok::backslash, TokStart); + case '\\': return formToken(tok::backslash, TokStart); case '#': { - std::string Delimiter; - if (isDelimitedString(CurPtr - 1, Delimiter)) - return lexStringLiteral(false, Delimiter); + const char *Lookahead = CurPtr; + while (*Lookahead == '#') + Lookahead++; + if (*Lookahead++ == '"') { + unsigned DelimiterLength = Lookahead - CurPtr; + CurPtr = Lookahead; + return lexStringLiteral(DelimiterLength); + } } return lexHash(); diff --git a/test/Parse/raw_string.swift b/test/Parse/raw_string.swift index 0eeaea55949eb..0695922b64899 100644 --- a/test/Parse/raw_string.swift +++ b/test/Parse/raw_string.swift @@ -4,44 +4,95 @@ import Swift // ===---------- Multiline RawString --------=== -_ = ##""" +print(##""" One ""Alpha"" - """## + """##) // CHECK: "One\n\"\"Alpha\"\"" -_ = ##""" +print(##""" Two Beta - """## + """##) // CHECK: " Two\nBeta" -_ = \""" +print(#""" Three\r Gamma\ - """ + """#) // CHECK: " Three\\r\n Gamma\\" -_ = \###""" +print(###""" Four \(foo) Delta -"""### +"""###) // CHECK: " Four \\(foo)\n Delta" -_ = ##""" +print(##""" print(""" - Five\n\n\nEpsilon + Five\##n\##n\##nEpsilon """) - """## + """##) // CHECK: "print(\"\"\"\n Five\n\n\nEpsilon\n \"\"\")" // ===---------- Single line --------=== -_ = #""Zeta""# +print(#""Zeta""#) // CHECK: "\"Zeta\"" -_ = #""Eta"\n\n\n\""# -// CHECK: "\"Eta\"\n\n\n\" +print(#""Eta"\#n\#n\#n\#""#) +// CHECK: "\"Eta\"\n\n\n\"" -_ = \#""Iota"\n\n\n\""# +print(#""Iota"\n\n\n\""#) // CHECK: "\"Iota\"\\n\\n\\n\\\"" + +let foo = "Interpolation" +print(#"\b\b \#(foo)\#(foo) Kappa"#) +// CHECK: "\\b\\b " +// CHECK: " Kappa" + +// ===---------- From proposal --------=== + +_ = #"This is a string"# +// CHECK: "This is a string" + +_ = #####"This is a string"##### +// CHECK: "This is a string" + +_ = #"enum\s+.+\{.*case\s+[:upper:]"# +// CHECK: "enum\\s+.+\\{.*case\\s+[:upper:]" + +_ = #"Alice: "How long is forever?" White Rabbit: "Sometimes, just one second.""# +// CHECK: "Alice: \"How long is forever?\" White Rabbit: \"Sometimes, just one second.\"" + +_ = #"\#\#1"# +/// CHECK: "\\#1" + +_ = ##"\#1"## +/// CHECK: "\\#1" + +_ = #"c:\windows\system32"# +/// CHCECK: "c:\\windows\\system32" + +_ = #"\d{3) \d{3} \d{4}"# +///CHECK: "\\d{3) \\d{3} \\d{4}" + +_ = #""" + a string with + """ + in it + """# +/// CHECK: "a string with\n\"\"\"\nin it" + +_ = #"a raw string containing \r\n"# +/// CHECK "a raw string containing \\r\\n" + +_ = #""" + [ + { + "id": "12345", + "title": "A title that \"contains\" \\\"" + } + ] + """# +/// CHECK: "[\n {\n \"id\": \"12345\",\n \"title\": \"A title that \\\"contains\\\" \\\\\\\"\"\n }\n]" From 74dd71ca9bb13941f7983c9c3141166d72e1816b Mon Sep 17 00:00:00 2001 From: John Holdsworth Date: Fri, 10 Aug 2018 17:27:38 +0100 Subject: [PATCH 03/15] Delimited/Raw strings inside interpolations --- lib/Parse/Lexer.cpp | 56 +++++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 17 deletions(-) diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index 8f500428afdcc..d5d1b1f654798 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -1207,8 +1207,23 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) { } } +/// extractDelimiterLength - Extracts/detects any custom delimiter on opening +/// a string literal and advances CurPtr if a delimiter is found and returns +/// a non-zero delimiter length. CurPtr[-1] is generally '#' when called. +static unsigned extractDelimiterLength(const char *&CurPtr) { + const char *Lookahead = CurPtr; + while (*Lookahead == '#') + Lookahead++; + if (*Lookahead++ == '"') { + unsigned DelimiterLength = Lookahead - CurPtr; + CurPtr = Lookahead; + return DelimiterLength; + } + return 0; +} + /// delimiterMatches - Does custom delimiter (# characters surrounding quotes) -/// match the number of # charatters after \ inside the string? This allows +/// match the number of # characters after \ inside the string? This allows /// interpolation inside a "raw" string. Normal/cooked string processing is /// the degenerate case of there being no # characters surrounding the quotes. /// If delimiter matches, advances byte pointer passed in and returns true. @@ -1349,8 +1364,9 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, const char *EndPtr, DiagnosticEngine *Diags, bool MultilineString) { - llvm::SmallVector OpenDelimiters; - llvm::SmallVector AllowNewline; + SmallVector OpenDelimiters; + SmallVector AllowNewline; + SmallVector CustomDelimiter; AllowNewline.push_back(MultilineString); auto inStringLiteral = [&]() { @@ -1366,6 +1382,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, // On success scanning the expression body, the real lexer will be used to // relex the body when parsing the expressions. We let it diagnose any // issues with malformed tokens or other problems. + unsigned DelimiterLength = 0; switch (*CurPtr++) { // String literals in general cannot be split across multiple lines; // interpolated ones are no exception - unless multiline literals. @@ -1376,13 +1393,21 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, // Will be diagnosed as an unterminated string literal. return CurPtr-1; + case '#': + if (inStringLiteral() || + !(DelimiterLength = extractDelimiterLength(CurPtr))) + continue; + LLVM_FALLTHROUGH; + case '"': case '\'': { if (!AllowNewline.back() && inStringLiteral()) { - if (OpenDelimiters.back() == CurPtr[-1]) { + if (OpenDelimiters.back() == CurPtr[-1] && + delimiterMatches(CustomDelimiter.back(), CurPtr)) { // Closing single line string literal. OpenDelimiters.pop_back(); AllowNewline.pop_back(); + CustomDelimiter.pop_back(); } // Otherwise, it's just a quote in string literal. e.g. "foo's". continue; @@ -1397,22 +1422,26 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, // Open string literal OpenDelimiters.push_back(CurPtr[-1]); AllowNewline.push_back(isMultilineQuote); + CustomDelimiter.push_back(DelimiterLength); continue; } // We are in multiline string literal. assert(AllowNewline.back() && "other cases must be handled above"); - if (isMultilineQuote) { + if (isMultilineQuote && + delimiterMatches(CustomDelimiter.back(), CurPtr)) { // Close multiline string literal. OpenDelimiters.pop_back(); AllowNewline.pop_back(); + CustomDelimiter.pop_back(); } // Otherwise, it's just a normal character in multiline string. continue; } case '\\': - if (inStringLiteral()) { + if (inStringLiteral() && + delimiterMatches(CustomDelimiter.back(), CurPtr)) { char escapedChar = *CurPtr++; switch (escapedChar) { case '(': @@ -1761,7 +1790,7 @@ void Lexer::lexStringLiteral(unsigned DelimiterLength) { } // Is this the end of a delimited/multiline string literal? - if(StringRef(CurPtr - 1, Delimiter.length()) == Delimiter) { + if (StringRef(CurPtr - 1, Delimiter.length()) == Delimiter) { if (MultilineString) { CurPtr += 2; formToken(tok::string_literal, TokStart, @@ -2281,16 +2310,9 @@ void Lexer::lexImpl() { case ':': return formToken(tok::colon, TokStart); case '\\': return formToken(tok::backslash, TokStart); - case '#': { - const char *Lookahead = CurPtr; - while (*Lookahead == '#') - Lookahead++; - if (*Lookahead++ == '"') { - unsigned DelimiterLength = Lookahead - CurPtr; - CurPtr = Lookahead; - return lexStringLiteral(DelimiterLength); - } - } + case '#': + if (unsigned DelimiterLength = extractDelimiterLength(CurPtr)) + return lexStringLiteral(DelimiterLength); return lexHash(); // Operator characters. From 7866093ea5cd26e8993c96863f08f0e9a7905c02 Mon Sep 17 00:00:00 2001 From: John Holdsworth Date: Sat, 11 Aug 2018 16:47:08 +0100 Subject: [PATCH 04/15] Extend token boundary to include delimiter --- include/swift/Parse/Token.h | 2 +- lib/Parse/Lexer.cpp | 58 +++++++++++++++--------------- lib/Parse/Parser.cpp | 7 ++-- test/Parse/raw_string.swift | 72 ++++++++++++++++++++++++++----------- 4 files changed, 84 insertions(+), 55 deletions(-) diff --git a/include/swift/Parse/Token.h b/include/swift/Parse/Token.h index 9b95390d3111d..43bc0a4859149 100644 --- a/include/swift/Parse/Token.h +++ b/include/swift/Parse/Token.h @@ -283,7 +283,7 @@ class Token { return MultilineString; } - unsigned DelimiterLength() const { + unsigned getDelimiterLength() const { return StringDelimiterLength; } }; diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index d5d1b1f654798..37a7cc6b3c3d6 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -296,7 +296,6 @@ void Lexer::formToken(tok Kind, const char *TokStart, NextToken.setToken(Kind, TokenText, CommentLength, MultilineString, DelimiterLength); - CurPtr += DelimiterLength; } void Lexer::formEscapedIdentifierToken(const char *TokStart) { @@ -1207,10 +1206,10 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) { } } -/// extractDelimiterLength - Extracts/detects any custom delimiter on opening -/// a string literal and advances CurPtr if a delimiter is found and returns -/// a non-zero delimiter length. CurPtr[-1] is generally '#' when called. -static unsigned extractDelimiterLength(const char *&CurPtr) { +/// extractStringDelimiterLength - Extracts/detects any custom delimiter on +/// opening a string literal and advances CurPtr if a delimiter is found and +/// returns a non-zero delimiter length. CurPtr[-1] generally '#' when called. +static unsigned extractStringDelimiterLength(const char *&CurPtr) { const char *Lookahead = CurPtr; while (*Lookahead == '#') Lookahead++; @@ -1230,7 +1229,7 @@ static unsigned extractDelimiterLength(const char *&CurPtr) { static bool delimiterMatches(unsigned DelimiterLength, const char *&BytesPtr) { if (!DelimiterLength) return true; - for (unsigned i = 0; i < DelimiterLength ; i++) + for (unsigned i = 0; i < DelimiterLength; i++) if (BytesPtr[i] != '#') return false; BytesPtr += DelimiterLength; @@ -1395,7 +1394,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, case '#': if (inStringLiteral() || - !(DelimiterLength = extractDelimiterLength(CurPtr))) + !(DelimiterLength = extractStringDelimiterLength(CurPtr))) continue; LLVM_FALLTHROUGH; @@ -1501,6 +1500,9 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, static StringRef getStringLiteralContent(const Token &Str) { StringRef Bytes = Str.getText(); + if (unsigned DelimiterLength = Str.getDelimiterLength()) + Bytes = Bytes.drop_front(DelimiterLength).drop_back(DelimiterLength); + if (Str.IsMultilineString()) Bytes = Bytes.drop_front(3).drop_back(3); else @@ -1539,7 +1541,7 @@ getMultilineTrailingIndent(const Token &Str, DiagnosticEngine *Diags) { auto string = StringRef(start, end - start); // Disallow escaped newline in the last line. - if (Diags && Str.DelimiterLength() == 0) { + if (Diags && Str.getDelimiterLength() == 0) { auto *Ptr = start - 1; if (*Ptr == '\n') --Ptr; if (*Ptr == '\r') --Ptr; @@ -1702,20 +1704,18 @@ void Lexer::lexStringLiteral(unsigned DelimiterLength) { // diagnostics about changing them to double quotes. bool wasErroneous = false, MultilineString = false; - std::string Delimiter; - Delimiter.push_back(*TokStart); + std::string ExtraTermination; // Is this the start of a multiline string literal? if (*TokStart == '"' && *CurPtr == '"' && *(CurPtr + 1) == '"') { MultilineString = true; CurPtr += 2; - Delimiter.push_back(*TokStart); - Delimiter.push_back(*TokStart); if (*CurPtr != '\n' && *CurPtr != '\r') diagnose(CurPtr, diag::lex_illegal_multiline_string_start) .fixItInsert(Lexer::getSourceLoc(CurPtr), "\n"); + ExtraTermination.insert(ExtraTermination.size(), 2, *TokStart); } - Delimiter.insert(Delimiter.size(), DelimiterLength, '#'); + ExtraTermination.insert(ExtraTermination.size(), DelimiterLength, '#'); while (true) { const char *TmpPtr = CurPtr + 1; @@ -1752,8 +1752,6 @@ void Lexer::lexStringLiteral(unsigned DelimiterLength) { // or an already-diagnosed error, just munch it. if (CharValue == ~0U) { ++CurPtr; - if (wasErroneous) - return formToken(tok::unknown, TokStart); if (*TokStart == '\'') { // Complain about single-quote string and suggest replacement with @@ -1789,19 +1787,18 @@ void Lexer::lexStringLiteral(unsigned DelimiterLength) { replacement); } - // Is this the end of a delimited/multiline string literal? - if (StringRef(CurPtr - 1, Delimiter.length()) == Delimiter) { - if (MultilineString) { - CurPtr += 2; - formToken(tok::string_literal, TokStart, - MultilineString, DelimiterLength); - if (Diags) - validateMultilineIndents(NextToken, Diags); - return; - } - else - return formToken(tok::string_literal, TokStart, - MultilineString, DelimiterLength); + // Is this the end of multiline/delimited string literal? + if (StringRef(CurPtr, ExtraTermination.length()) == ExtraTermination) { + TokStart -= DelimiterLength; + CurPtr += ExtraTermination.length(); + if (wasErroneous) + return formToken(tok::unknown, TokStart); + + formToken(tok::string_literal, TokStart, + MultilineString, DelimiterLength); + if (MultilineString && Diags) + validateMultilineIndents(NextToken, Diags); + return; } } } @@ -2166,7 +2163,7 @@ void Lexer::getStringLiteralSegments( // Are substitutions required either for indent stripping or line ending // normalization? bool MultilineString = Str.IsMultilineString(), IsFirstSegment = true; - unsigned IndentToStrip = 0, DelimiterLength = Str.DelimiterLength(); + unsigned IndentToStrip = 0, DelimiterLength = Str.getDelimiterLength(); if (MultilineString) IndentToStrip = std::get<0>(getMultilineTrailingIndent(Str, /*Diags=*/nullptr)).size(); @@ -2217,6 +2214,7 @@ void Lexer::getStringLiteralSegments( DelimiterLength)); } + //===----------------------------------------------------------------------===// // Main Lexer Loop //===----------------------------------------------------------------------===// @@ -2311,7 +2309,7 @@ void Lexer::lexImpl() { case '\\': return formToken(tok::backslash, TokStart); case '#': - if (unsigned DelimiterLength = extractDelimiterLength(CurPtr)) + if (unsigned DelimiterLength = extractStringDelimiterLength(CurPtr)) return lexStringLiteral(DelimiterLength); return lexHash(); diff --git a/lib/Parse/Parser.cpp b/lib/Parse/Parser.cpp index 84493b0b0f752..449599afb02e0 100644 --- a/lib/Parse/Parser.cpp +++ b/lib/Parse/Parser.cpp @@ -221,7 +221,8 @@ static void getStringPartTokens(const Token &Tok, const LangOptions &LangOpts, int BufID, std::vector &Toks) { assert(Tok.is(tok::string_literal)); bool IsMultiline = Tok.IsMultilineString(); - unsigned QuoteLen = IsMultiline ? 3 : 1; + unsigned DelimiterLength = Tok.getDelimiterLength(); + unsigned QuoteLen = (IsMultiline ? 3 : 1) + DelimiterLength; SmallVector Segments; Lexer::getStringLiteralSegments(Tok, Segments, /*Diags=*/nullptr); for (unsigned i = 0, e = Segments.size(); i != e; ++i) { @@ -243,7 +244,7 @@ static void getStringPartTokens(const Token &Tok, const LangOptions &LangOpts, StringRef Text = SM.extractText({ Loc, Len }); Token NewTok; - NewTok.setToken(tok::string_literal, Text, IsMultiline); + NewTok.setToken(tok::string_literal, Text, IsMultiline, DelimiterLength); Toks.push_back(NewTok); } else { @@ -384,7 +385,7 @@ class TokenRecorder: public ConsumeTokenReceiver { L.lex(Result); if (Result.is(tok::eof)) break; - if(Result.is(tok::comment)) // interacts badly with custom delimiters + assert(Result.is(tok::comment)); Scratch.push_back(Result); } } diff --git a/test/Parse/raw_string.swift b/test/Parse/raw_string.swift index 0695922b64899..50e38b157e730 100644 --- a/test/Parse/raw_string.swift +++ b/test/Parse/raw_string.swift @@ -2,55 +2,85 @@ import Swift +_ = #""" +################################################################### +## This source file is part of the Swift.org open source project ## +################################################################### +"""# + +// CHECK: "###################################################################\n## This source file is part of the Swift.org open source project ##\n###################################################################" + // ===---------- Multiline RawString --------=== -print(##""" +_ = ##""" One ""Alpha"" - """##) + """## // CHECK: "One\n\"\"Alpha\"\"" -print(##""" +_ = ##""" Two Beta - """##) + """## // CHECK: " Two\nBeta" -print(#""" +_ = #""" Three\r Gamma\ - """#) + """# // CHECK: " Three\\r\n Gamma\\" -print(###""" +_ = ###""" Four \(foo) Delta -"""###) +"""### // CHECK: " Four \\(foo)\n Delta" -print(##""" +_ = ##""" print(""" Five\##n\##n\##nEpsilon """) - """##) + """## // CHECK: "print(\"\"\"\n Five\n\n\nEpsilon\n \"\"\")" // ===---------- Single line --------=== -print(#""Zeta""#) +_ = #""Zeta""# // CHECK: "\"Zeta\"" -print(#""Eta"\#n\#n\#n\#""#) +_ = #""Eta"\#n\#n\#n\#""# // CHECK: "\"Eta\"\n\n\n\"" -print(#""Iota"\n\n\n\""#) +_ = #""Iota"\n\n\n\""# // CHECK: "\"Iota\"\\n\\n\\n\\\"" +_ = #"a raw string with \" in it"# +// CHECK: "a raw string with \\\" in it" + +_ = ##""" + a raw string with """ in it + """## +// CHECK: "a raw string with \"\"\" in it" + let foo = "Interpolation" -print(#"\b\b \#(foo)\#(foo) Kappa"#) +_ = #"\b\b \#(foo)\#(foo) Kappa"# // CHECK: "\\b\\b " // CHECK: " Kappa" +_ = """ + interpolating \(##""" + delimited \##("string")\#n\##n + """##) + """ + +// CHECK: "interpolating " +// CHECK: "delimited " +// CHECK: "string" +// CHECK: "\\#n\n" + +#"unused literal"# +// CHECK: "unused literal" + // ===---------- From proposal --------=== _ = #"This is a string"# @@ -66,26 +96,26 @@ _ = #"Alice: "How long is forever?" White Rabbit: "Sometimes, just one second."" // CHECK: "Alice: \"How long is forever?\" White Rabbit: \"Sometimes, just one second.\"" _ = #"\#\#1"# -/// CHECK: "\\#1" +// CHECK: "\\#1" _ = ##"\#1"## -/// CHECK: "\\#1" +// CHECK: "\\#1" _ = #"c:\windows\system32"# -/// CHCECK: "c:\\windows\\system32" +// CHECK: "c:\\windows\\system32" _ = #"\d{3) \d{3} \d{4}"# -///CHECK: "\\d{3) \\d{3} \\d{4}" +// CHECK: "\\d{3) \\d{3} \\d{4}" _ = #""" a string with """ in it """# -/// CHECK: "a string with\n\"\"\"\nin it" +// CHECK: "a string with\n\"\"\"\nin it" _ = #"a raw string containing \r\n"# -/// CHECK "a raw string containing \\r\\n" +// CHECK: "a raw string containing \\r\\n" _ = #""" [ @@ -95,4 +125,4 @@ _ = #""" } ] """# -/// CHECK: "[\n {\n \"id\": \"12345\",\n \"title\": \"A title that \\\"contains\\\" \\\\\\\"\"\n }\n]" +// CHECK: "[\n {\n \"id\": \"12345\",\n \"title\": \"A title that \\\"contains\\\" \\\\\\\"\"\n }\n]" From 6bd7cb884ceebd6d7e4443629c3534e5635a4830 Mon Sep 17 00:00:00 2001 From: John Holdsworth Date: Fri, 17 Aug 2018 06:26:54 +0100 Subject: [PATCH 05/15] Pragmatic support of multiline/delimited in attributes --- include/swift/Parse/Lexer.h | 6 +++--- lib/Parse/Lexer.cpp | 25 +++++++++++++++++++++++-- lib/Sema/TypeChecker.h | 2 +- test/Parse/raw_string.swift | 10 ++++++++-- 4 files changed, 35 insertions(+), 8 deletions(-) diff --git a/include/swift/Parse/Lexer.h b/include/swift/Parse/Lexer.h index e62ce66b15eb0..37a27b9c8f5ad 100644 --- a/include/swift/Parse/Lexer.h +++ b/include/swift/Parse/Lexer.h @@ -494,9 +494,9 @@ class Lexer { void lexTrivia(syntax::Trivia &T, bool IsForTrailingTrivia); static unsigned lexUnicodeEscape(const char *&CurPtr, Lexer *Diags); - unsigned lexCharacter(const char *&CurPtr, - char StopQuote, bool EmitDiagnostics, - bool Multiline = false, unsigned DelimiterLength = 0); + unsigned lexCharacter(const char *&CurPtr, char StopQuote, + bool EmitDiagnostics, bool MultilineString = false, + unsigned DelimiterLength = 0); void lexStringLiteral(unsigned DelimiterLength = 0); void lexEscapedIdentifier(); diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index 37a7cc6b3c3d6..73d51e227a8c7 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -2065,13 +2065,34 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes, bool IsFirstSegment, bool IsLastSegment, unsigned IndentToStrip, - unsigned DelmiterLength) { + unsigned DelimiterLength) { TempString.clear(); // Note that it is always safe to read one over the end of "Bytes" because // we know that there is a terminating " character. Use BytesPtr to avoid a // range check subscripting on the StringRef. const char *BytesPtr = Bytes.begin(); + + // Special case when being called from EncodedDiagnosticMessage(...) + // This allows multiline and delimited strings to work in attributes. + // The string has already been validated by the initial parse. + if (IndentToStrip == ~0u && DelimiterLength == ~0u) { + IndentToStrip = DelimiterLength = 0; + + // restore trailing indent removal for multiline + const char *Backtrack = BytesPtr - 1; + if (Backtrack[-1] == '"' && Backtrack[-2] == '"') { + Backtrack -= 2; + for (const char *Trailing = Bytes.end() - 1; + *Trailing == ' ' || *Trailing == '\t'; Trailing--) + IndentToStrip++; + } + + // restore delimiter if any + while (*--Backtrack == '#') + DelimiterLength++; + } + bool IsEscapedNewline = false; while (BytesPtr < Bytes.end()) { char CurChar = *BytesPtr++; @@ -2092,7 +2113,7 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes, continue; } - if (CurChar != '\\' || !delimiterMatches(DelmiterLength, BytesPtr)) { + if (CurChar != '\\' || !delimiterMatches(DelimiterLength, BytesPtr)) { TempString.push_back(CurChar); continue; } diff --git a/lib/Sema/TypeChecker.h b/lib/Sema/TypeChecker.h index 2da25c8f49322..c82803d39e60e 100644 --- a/lib/Sema/TypeChecker.h +++ b/lib/Sema/TypeChecker.h @@ -2552,7 +2552,7 @@ class EncodedDiagnosticMessage { public: /// \param S A string with an encoded message EncodedDiagnosticMessage(StringRef S) - : Message(Lexer::getEncodedStringSegment(S, Buf)) {} + : Message(Lexer::getEncodedStringSegment(S, Buf, true, true, ~0, ~0)) {} /// The unescaped message to display to the user. const StringRef Message; diff --git a/test/Parse/raw_string.swift b/test/Parse/raw_string.swift index 50e38b157e730..953b92324b041 100644 --- a/test/Parse/raw_string.swift +++ b/test/Parse/raw_string.swift @@ -1,4 +1,4 @@ -// RUN: %target-swift-frontend -dump-ast %s 2>&1 | %FileCheck %s +// RUN: %target-swift-frontend -dump-ast %s 2>&1 | %FileCheck --strict-whitespace %s import Swift @@ -7,9 +7,15 @@ _ = #""" ## This source file is part of the Swift.org open source project ## ################################################################### """# - // CHECK: "###################################################################\n## This source file is part of the Swift.org open source project ##\n###################################################################" +_ = #""" + # H1 # + ## H2 ## + ### H3 ### + """# +// CHECK: "# H1 #\n## H2 ##\n### H3 ###" + // ===---------- Multiline RawString --------=== _ = ##""" From 4209b72a667945a32fb5321b19068a3d95b81abe Mon Sep 17 00:00:00 2001 From: John Holdsworth Date: Mon, 27 Aug 2018 10:15:50 +0100 Subject: [PATCH 06/15] Delimiter specific diagnostic --- include/swift/AST/DiagnosticsParse.def | 2 ++ lib/Parse/Lexer.cpp | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/include/swift/AST/DiagnosticsParse.def b/include/swift/AST/DiagnosticsParse.def index cc6ba014cafbd..71e7972e437f9 100644 --- a/include/swift/AST/DiagnosticsParse.def +++ b/include/swift/AST/DiagnosticsParse.def @@ -138,6 +138,8 @@ ERROR(lex_invalid_u_escape,none, "\\u{...} escape sequence expects between 1 and 8 hex digits", ()) ERROR(lex_invalid_u_escape_rbrace,none, "expected '}' in \\u{...} escape sequence", ()) +ERROR(lex_invalid_delimiter_escape,none, + "Too many # characters in delimited escape", ()) ERROR(lex_invalid_unicode_scalar,none, "invalid unicode scalar", ()) diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index 73d51e227a8c7..f076c34ee7fa9 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -1303,8 +1303,18 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, unsigned CharValue = 0; // Escape processing. We already ate the "\". switch (*CurPtr) { - case ' ': case '\t': case '\n': case '\r': - if (MultilineString && maybeConsumeNewlineEscape(CurPtr, 0)) + case ' ': case '\t': case '\n': case '\r': case '#': + if (*CurPtr == '#') { + if (DelimiterLength) { + if (EmitDiagnostics) + diagnose(CurPtr, diag::lex_invalid_delimiter_escape) + .fixItRemoveChars(Lexer::getSourceLoc(CurPtr), + Lexer::getSourceLoc(CurPtr + 1)); + CurPtr++; + return ~1U; + } + } + else if (MultilineString && maybeConsumeNewlineEscape(CurPtr, 0)) return '\n'; LLVM_FALLTHROUGH; default: // Invalid escape. From 9691076af0530d48ac96e062b8a087512185b87c Mon Sep 17 00:00:00 2001 From: John Holdsworth Date: Mon, 27 Aug 2018 19:14:26 +0100 Subject: [PATCH 07/15] Response to rintaro's review --- lib/Parse/Lexer.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index f076c34ee7fa9..4cee20f00c7fb 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -1707,6 +1707,7 @@ static void validateMultilineIndents(const Token &Str, /// lexStringLiteral: /// string_literal ::= ["]([^"\\\n\r]|character_escape)*["] /// string_literal ::= ["]["]["].*["]["]["] - approximately +/// string_literal ::= (#+)("")?".*"(\2\1) - "raw" strings void Lexer::lexStringLiteral(unsigned DelimiterLength) { const char *TokStart = CurPtr-1; assert((*TokStart == '"' || *TokStart == '\'') && "Unexpected start"); @@ -1714,7 +1715,7 @@ void Lexer::lexStringLiteral(unsigned DelimiterLength) { // diagnostics about changing them to double quotes. bool wasErroneous = false, MultilineString = false; - std::string ExtraTermination; + SmallString<8> ExtraTermination; // Is this the start of a multiline string literal? if (*TokStart == '"' && *CurPtr == '"' && *(CurPtr + 1) == '"') { @@ -1723,9 +1724,9 @@ void Lexer::lexStringLiteral(unsigned DelimiterLength) { if (*CurPtr != '\n' && *CurPtr != '\r') diagnose(CurPtr, diag::lex_illegal_multiline_string_start) .fixItInsert(Lexer::getSourceLoc(CurPtr), "\n"); - ExtraTermination.insert(ExtraTermination.size(), 2, *TokStart); + ExtraTermination.append(2, *TokStart); } - ExtraTermination.insert(ExtraTermination.size(), DelimiterLength, '#'); + ExtraTermination.append(DelimiterLength, '#'); while (true) { const char *TmpPtr = CurPtr + 1; @@ -1750,6 +1751,7 @@ void Lexer::lexStringLiteral(unsigned DelimiterLength) { // String literals cannot have \n or \r in them (unless multiline). if (((*CurPtr == '\r' || *CurPtr == '\n') && !MultilineString) || CurPtr == BufferEnd) { + TokStart -= DelimiterLength; diagnose(TokStart, diag::lex_unterminated_string); return formToken(tok::unknown, TokStart); } @@ -1798,9 +1800,9 @@ void Lexer::lexStringLiteral(unsigned DelimiterLength) { } // Is this the end of multiline/delimited string literal? - if (StringRef(CurPtr, ExtraTermination.length()) == ExtraTermination) { + if (StringRef(CurPtr, BufferEnd - CurPtr).startswith(ExtraTermination)) { TokStart -= DelimiterLength; - CurPtr += ExtraTermination.length(); + CurPtr += ExtraTermination.size(); if (wasErroneous) return formToken(tok::unknown, TokStart); From 032d865fa115fa207712083421d0116fe7bfd3b6 Mon Sep 17 00:00:00 2001 From: John Holdsworth Date: Thu, 30 Aug 2018 14:02:40 +0100 Subject: [PATCH 08/15] Response to rintaro's 2nd review --- lib/Parse/Lexer.cpp | 8 +++++--- test/Parse/raw_string.swift | 9 +++++++++ test/Parse/raw_string_errors.swift | 10 ++++++++++ 3 files changed, 24 insertions(+), 3 deletions(-) create mode 100644 test/Parse/raw_string_errors.swift diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index 4cee20f00c7fb..6786df509aa14 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -1411,8 +1411,9 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, case '"': case '\'': { if (!AllowNewline.back() && inStringLiteral()) { - if (OpenDelimiters.back() == CurPtr[-1] && - delimiterMatches(CustomDelimiter.back(), CurPtr)) { + unsigned InnerDelimiter = CustomDelimiter.back(); + if (OpenDelimiters.back() == CurPtr[-1] && (!InnerDelimiter || + (delimiterMatches(InnerDelimiter, CurPtr) && *CurPtr != '#'))) { // Closing single line string literal. OpenDelimiters.pop_back(); AllowNewline.pop_back(); @@ -1800,7 +1801,8 @@ void Lexer::lexStringLiteral(unsigned DelimiterLength) { } // Is this the end of multiline/delimited string literal? - if (StringRef(CurPtr, BufferEnd - CurPtr).startswith(ExtraTermination)) { + if (StringRef(CurPtr, BufferEnd - CurPtr).startswith(ExtraTermination) && + (!DelimiterLength || *(CurPtr + ExtraTermination.size()) != '#')) { TokStart -= DelimiterLength; CurPtr += ExtraTermination.size(); if (wasErroneous) diff --git a/test/Parse/raw_string.swift b/test/Parse/raw_string.swift index 953b92324b041..66858327348a7 100644 --- a/test/Parse/raw_string.swift +++ b/test/Parse/raw_string.swift @@ -49,6 +49,15 @@ _ = ##""" """## // CHECK: "print(\"\"\"\n Five\n\n\nEpsilon\n \"\"\")" +_ = #" "##" "# +// CHECK: " \"##\" " + +_ = #" \#(#" "######" "#) "# +// CHECK: " \"######\" " + +_ = #" \#(##"###""###"##) "# +// CHECK: "###\"\"###" + // ===---------- Single line --------=== _ = #""Zeta""# diff --git a/test/Parse/raw_string_errors.swift b/test/Parse/raw_string_errors.swift new file mode 100644 index 0000000000000..fd49ea17ba7d2 --- /dev/null +++ b/test/Parse/raw_string_errors.swift @@ -0,0 +1,10 @@ +// RUN: %target-typecheck-verify-swift + +#"\##("invalid")"# +// expected-error@-1{{Too many # characters in delimited escape}} + +####"invalid"### +// expected-error@-1{{unterminated string literal}} + +###"invalid"#### +// expected-error@-1{{unterminated string literal}} From 3fc43bcb80e7baab6cd37180c0b25b9425dd0847 Mon Sep 17 00:00:00 2001 From: John Holdsworth Date: Sat, 1 Sep 2018 15:18:20 +0100 Subject: [PATCH 09/15] Check for zero-width characters in delimiters --- include/swift/AST/DiagnosticsParse.def | 4 +- lib/Parse/Lexer.cpp | 75 ++++++++++++++++++-------- 2 files changed, 55 insertions(+), 24 deletions(-) diff --git a/include/swift/AST/DiagnosticsParse.def b/include/swift/AST/DiagnosticsParse.def index 71e7972e437f9..b25eb21461d68 100644 --- a/include/swift/AST/DiagnosticsParse.def +++ b/include/swift/AST/DiagnosticsParse.def @@ -139,7 +139,9 @@ ERROR(lex_invalid_u_escape,none, ERROR(lex_invalid_u_escape_rbrace,none, "expected '}' in \\u{...} escape sequence", ()) ERROR(lex_invalid_delimiter_escape,none, - "Too many # characters in delimited escape", ()) + "too many # characters in delimited escape", ()) +ERROR(lex_zerowidth_in_string_delimiter,none, + "zero-width character detected in string delimiter", ()) ERROR(lex_invalid_unicode_scalar,none, "invalid unicode scalar", ()) diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index 6786df509aa14..b0ecb9a6a9838 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -1206,6 +1206,38 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) { } } +/// diagnoseZeroWidth - check for and error zero-width characters in delimiters +static bool diagnoseZeroWidth(const char *&CurPtr, DiagnosticEngine *Diags) { + const unsigned char *TmpPtr = (const unsigned char *)CurPtr; + // zero-width set assumed to be: U+200B, U+200C, U+200D, U+2060, U+FEFF + while ((TmpPtr[0] == 0xE2 && ((TmpPtr[1] == 0x80 && + (TmpPtr[2] == 0x8B || TmpPtr[2] == 0x8C || TmpPtr[2] == 0x8D)) || + (TmpPtr[1] == 0x81 && TmpPtr[2] == 0xA0))) || + (TmpPtr[0] == 0xEF && TmpPtr[1] == 0xBB && TmpPtr[2] == 0xBF)) { + if (Diags) + Diags->diagnose(Lexer::getSourceLoc(CurPtr), + diag::lex_zerowidth_in_string_delimiter) + .fixItRemoveChars(Lexer::getSourceLoc(CurPtr), + Lexer::getSourceLoc(CurPtr + 3)); + TmpPtr += 3; + CurPtr += 3; + } + return true; +} + +/// advanceIfMultilineDelimiter - centralized check for multiline delimiter +static bool advanceIfMultilineDelimiter(const char *&CurPtr, + DiagnosticEngine *Diags) { + const char *TmpPtr = CurPtr - 1; + if (*TmpPtr++ == '"' && diagnoseZeroWidth(TmpPtr, Diags) && + *TmpPtr++ == '"' && diagnoseZeroWidth(TmpPtr, Diags) && + *TmpPtr++ == '"') { + CurPtr = TmpPtr; + return true; + } + return false; +} + /// extractStringDelimiterLength - Extracts/detects any custom delimiter on /// opening a string literal and advances CurPtr if a delimiter is found and /// returns a non-zero delimiter length. CurPtr[-1] generally '#' when called. @@ -1226,13 +1258,15 @@ static unsigned extractStringDelimiterLength(const char *&CurPtr) { /// interpolation inside a "raw" string. Normal/cooked string processing is /// the degenerate case of there being no # characters surrounding the quotes. /// If delimiter matches, advances byte pointer passed in and returns true. -static bool delimiterMatches(unsigned DelimiterLength, const char *&BytesPtr) { +static bool delimiterMatches(unsigned DelimiterLength, const char *&BytesPtr, + DiagnosticEngine *Diags) { if (!DelimiterLength) return true; + const char *TmpPtr = BytesPtr; for (unsigned i = 0; i < DelimiterLength; i++) - if (BytesPtr[i] != '#') + if (diagnoseZeroWidth(TmpPtr, Diags) && *TmpPtr++ != '#') return false; - BytesPtr += DelimiterLength; + BytesPtr = TmpPtr; return true; } @@ -1295,7 +1329,7 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, diagnose(CurPtr-1, diag::lex_unterminated_string); return ~1U; case '\\': // Escapes. - if (!delimiterMatches(DelimiterLength, CurPtr)) + if (!delimiterMatches(DelimiterLength, CurPtr, Diags)) return '\\'; break; } @@ -1413,7 +1447,8 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, if (!AllowNewline.back() && inStringLiteral()) { unsigned InnerDelimiter = CustomDelimiter.back(); if (OpenDelimiters.back() == CurPtr[-1] && (!InnerDelimiter || - (delimiterMatches(InnerDelimiter, CurPtr) && *CurPtr != '#'))) { + (delimiterMatches(InnerDelimiter, CurPtr, Diags) + && *CurPtr != '#'))) { // Closing single line string literal. OpenDelimiters.pop_back(); AllowNewline.pop_back(); @@ -1423,10 +1458,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, continue; } - bool isMultilineQuote = ( - *CurPtr == '"' && *(CurPtr + 1) == '"' && *(CurPtr - 1) == '"'); - if (isMultilineQuote) - CurPtr += 2; + bool isMultilineQuote = advanceIfMultilineDelimiter(CurPtr, Diags); if (!inStringLiteral()) { // Open string literal @@ -1439,7 +1471,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, // We are in multiline string literal. assert(AllowNewline.back() && "other cases must be handled above"); if (isMultilineQuote && - delimiterMatches(CustomDelimiter.back(), CurPtr)) { + delimiterMatches(CustomDelimiter.back(), CurPtr, Diags)) { // Close multiline string literal. OpenDelimiters.pop_back(); AllowNewline.pop_back(); @@ -1451,7 +1483,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, } case '\\': if (inStringLiteral() && - delimiterMatches(CustomDelimiter.back(), CurPtr)) { + delimiterMatches(CustomDelimiter.back(), CurPtr, Diags)) { char escapedChar = *CurPtr++; switch (escapedChar) { case '(': @@ -1716,23 +1748,18 @@ void Lexer::lexStringLiteral(unsigned DelimiterLength) { // diagnostics about changing them to double quotes. bool wasErroneous = false, MultilineString = false; - SmallString<8> ExtraTermination; // Is this the start of a multiline string literal? - if (*TokStart == '"' && *CurPtr == '"' && *(CurPtr + 1) == '"') { - MultilineString = true; - CurPtr += 2; + if ((MultilineString = advanceIfMultilineDelimiter(CurPtr, Diags))) { if (*CurPtr != '\n' && *CurPtr != '\r') diagnose(CurPtr, diag::lex_illegal_multiline_string_start) .fixItInsert(Lexer::getSourceLoc(CurPtr), "\n"); - ExtraTermination.append(2, *TokStart); } - ExtraTermination.append(DelimiterLength, '#'); while (true) { const char *TmpPtr = CurPtr + 1; if (*CurPtr == '\\' && - delimiterMatches(DelimiterLength, TmpPtr) && *TmpPtr == '(') { + delimiterMatches(DelimiterLength, TmpPtr, Diags) && *TmpPtr == '(') { // Consume tokens until we hit the corresponding ')'. CurPtr = TmpPtr + 1; const char *EndPtr = @@ -1801,10 +1828,10 @@ void Lexer::lexStringLiteral(unsigned DelimiterLength) { } // Is this the end of multiline/delimited string literal? - if (StringRef(CurPtr, BufferEnd - CurPtr).startswith(ExtraTermination) && - (!DelimiterLength || *(CurPtr + ExtraTermination.size()) != '#')) { + if ((!MultilineString || advanceIfMultilineDelimiter(CurPtr, Diags)) && + (!DelimiterLength || (delimiterMatches(DelimiterLength, CurPtr, Diags) + && *CurPtr != '#'))) { TokStart -= DelimiterLength; - CurPtr += ExtraTermination.size(); if (wasErroneous) return formToken(tok::unknown, TokStart); @@ -2127,7 +2154,8 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes, continue; } - if (CurChar != '\\' || !delimiterMatches(DelimiterLength, BytesPtr)) { + if (CurChar != '\\' || + !delimiterMatches(DelimiterLength, BytesPtr, nullptr)) { TempString.push_back(CurChar); continue; } @@ -2212,7 +2240,8 @@ void Lexer::getStringLiteralSegments( while ((pos = Bytes.find('\\', BytesPtr-Bytes.begin())) != StringRef::npos) { BytesPtr = Bytes.begin() + pos + 1; - if (!delimiterMatches(DelimiterLength, BytesPtr) || *BytesPtr++ != '(') + if (!delimiterMatches(DelimiterLength, BytesPtr, Diags) || + *BytesPtr++ != '(') continue; // String interpolation. From dc96342368904727a0fba4dd570cf0533fdca540 Mon Sep 17 00:00:00 2001 From: John Holdsworth Date: Sun, 2 Sep 2018 10:20:37 +0100 Subject: [PATCH 10/15] Response to xwu's review --- include/swift/AST/DiagnosticsParse.def | 2 +- include/swift/Parse/Lexer.h | 22 ++-- include/swift/Parse/Token.h | 19 ++-- lib/Parse/Lexer.cpp | 148 +++++++++++++------------ lib/Parse/ParseExpr.cpp | 2 +- lib/Parse/Parser.cpp | 9 +- test/Parse/raw_string.swift | 9 -- test/Parse/raw_string_errors.swift | 3 +- 8 files changed, 109 insertions(+), 105 deletions(-) diff --git a/include/swift/AST/DiagnosticsParse.def b/include/swift/AST/DiagnosticsParse.def index b25eb21461d68..35ab944902d2d 100644 --- a/include/swift/AST/DiagnosticsParse.def +++ b/include/swift/AST/DiagnosticsParse.def @@ -139,7 +139,7 @@ ERROR(lex_invalid_u_escape,none, ERROR(lex_invalid_u_escape_rbrace,none, "expected '}' in \\u{...} escape sequence", ()) ERROR(lex_invalid_delimiter_escape,none, - "too many # characters in delimited escape", ()) + "too many '#' characters in delimited escape", ()) ERROR(lex_zerowidth_in_string_delimiter,none, "zero-width character detected in string delimiter", ()) diff --git a/include/swift/Parse/Lexer.h b/include/swift/Parse/Lexer.h index 37a27b9c8f5ad..ac8a7b9ea9279 100644 --- a/include/swift/Parse/Lexer.h +++ b/include/swift/Parse/Lexer.h @@ -355,13 +355,13 @@ class Lexer { enum : char { Literal, Expr } Kind; // Loc+Length for the segment inside the string literal, without quotes. SourceLoc Loc; - unsigned Length, IndentToStrip, DelimiterLength; + unsigned Length, IndentToStrip, CustomDelimiterLen; bool IsFirstSegment, IsLastSegment; static StringSegment getLiteral(SourceLoc Loc, unsigned Length, bool IsFirstSegment, bool IsLastSegment, unsigned IndentToStrip, - unsigned DelimiterLength) { + unsigned CustomDelimiterLen) { StringSegment Result; Result.Kind = Literal; Result.Loc = Loc; @@ -369,7 +369,7 @@ class Lexer { Result.IsFirstSegment = IsFirstSegment; Result.IsLastSegment = IsLastSegment; Result.IndentToStrip = IndentToStrip; - Result.DelimiterLength = DelimiterLength; + Result.CustomDelimiterLen = CustomDelimiterLen; return Result; } @@ -381,7 +381,7 @@ class Lexer { Result.IsFirstSegment = false; Result.IsLastSegment = false; Result.IndentToStrip = 0; - Result.DelimiterLength = 0; + Result.CustomDelimiterLen = 0; return Result; } @@ -399,13 +399,13 @@ class Lexer { bool IsFirstSegment = false, bool IsLastSegment = false, unsigned IndentToStrip = 0, - unsigned DelimiterLength = 0); + unsigned CustomDelimiterLen = 0); StringRef getEncodedStringSegment(StringSegment Segment, SmallVectorImpl &Buffer) const { return getEncodedStringSegment( StringRef(getBufferPtrForSourceLoc(Segment.Loc), Segment.Length), Buffer, Segment.IsFirstSegment, Segment.IsLastSegment, - Segment.IndentToStrip, Segment.DelimiterLength); + Segment.IndentToStrip, Segment.CustomDelimiterLen); } /// \brief Given a string literal token, separate it into string/expr segments @@ -469,8 +469,8 @@ class Lexer { return diagnose(Loc, Diagnostic(DiagID, std::forward(Args)...)); } - void formToken(tok Kind, const char *TokStart, bool MultilineString = false, - unsigned DelimiterLength = 0); + void formToken(tok Kind, const char *TokStart, bool IsMultilineString = false, + unsigned CustomDelimiterLen = 0); void formEscapedIdentifierToken(const char *TokStart); /// Advance to the end of the line. @@ -495,9 +495,9 @@ class Lexer { static unsigned lexUnicodeEscape(const char *&CurPtr, Lexer *Diags); unsigned lexCharacter(const char *&CurPtr, char StopQuote, - bool EmitDiagnostics, bool MultilineString = false, - unsigned DelimiterLength = 0); - void lexStringLiteral(unsigned DelimiterLength = 0); + bool EmitDiagnostics, bool IsMultilineString = false, + unsigned CustomDelimiterLen = 0); + void lexStringLiteral(unsigned CustomDelimiterLen = 0); void lexEscapedIdentifier(); void tryLexEditorPlaceholder(); diff --git a/include/swift/Parse/Token.h b/include/swift/Parse/Token.h index 43bc0a4859149..9e06ec5e44aa1 100644 --- a/include/swift/Parse/Token.h +++ b/include/swift/Parse/Token.h @@ -46,7 +46,7 @@ class Token { unsigned MultilineString : 1; /// Length of custom delimiter of "raw" string literals - unsigned StringDelimiterLength : 8; + unsigned CustomDelimiterLen : 8; // Padding bits == 32 - 11; @@ -65,7 +65,7 @@ class Token { public: Token(tok Kind, StringRef Text, unsigned CommentLength = 0) : Kind(Kind), AtStartOfLine(false), EscapedIdentifier(false), - MultilineString(false), StringDelimiterLength(0), + MultilineString(false), CustomDelimiterLen(0), CommentLength(CommentLength), Text(Text) {} Token() : Token(tok::NUM_TOKENS, {}, 0) {} @@ -269,22 +269,23 @@ class Token { /// \brief Set the token to the specified kind and source range. void setToken(tok K, StringRef T, unsigned CommentLength = 0, - bool MultilineString = false, unsigned DelimiterLength = 0) { + bool IsMultilineString = false, unsigned CustomDelimiterLen = 0) { Kind = K; Text = T; this->CommentLength = CommentLength; EscapedIdentifier = false; - this->MultilineString = MultilineString; - StringDelimiterLength = DelimiterLength; - assert(StringDelimiterLength == DelimiterLength && "delimiter too long"); + this->MultilineString = IsMultilineString; + this->CustomDelimiterLen = CustomDelimiterLen; + assert(this->CustomDelimiterLen == CustomDelimiterLen && + "string custom delimiter too long"); } - bool IsMultilineString() const { + bool isMultilineString() const { return MultilineString; } - unsigned getDelimiterLength() const { - return StringDelimiterLength; + unsigned getCustomDelimiterLen() const { + return CustomDelimiterLen; } }; diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index b0ecb9a6a9838..cfccb5d62c665 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -268,7 +268,7 @@ Token Lexer::getTokenAt(SourceLoc Loc) { } void Lexer::formToken(tok Kind, const char *TokStart, - bool MultilineString, unsigned DelimiterLength) { + bool IsMultilineString, unsigned CustomDelimiterLen) { assert(CurPtr >= BufferStart && CurPtr <= BufferEnd && "Current pointer out of range!"); @@ -295,7 +295,7 @@ void Lexer::formToken(tok Kind, const char *TokStart, } NextToken.setToken(Kind, TokenText, CommentLength, - MultilineString, DelimiterLength); + IsMultilineString, CustomDelimiterLen); } void Lexer::formEscapedIdentifierToken(const char *TokStart) { @@ -1206,26 +1206,38 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) { } } -/// diagnoseZeroWidth - check for and error zero-width characters in delimiters +/// diagnoseZeroWidth - Check for and error zero-width characters in delimiters. +/// A non visible character in the middle of a delimter can be used to extend +/// the literal beyond what it would appear creating potential security bugs. static bool diagnoseZeroWidth(const char *&CurPtr, DiagnosticEngine *Diags) { - const unsigned char *TmpPtr = (const unsigned char *)CurPtr; - // zero-width set assumed to be: U+200B, U+200C, U+200D, U+2060, U+FEFF - while ((TmpPtr[0] == 0xE2 && ((TmpPtr[1] == 0x80 && - (TmpPtr[2] == 0x8B || TmpPtr[2] == 0x8C || TmpPtr[2] == 0x8D)) || - (TmpPtr[1] == 0x81 && TmpPtr[2] == 0xA0))) || - (TmpPtr[0] == 0xEF && TmpPtr[1] == 0xBB && TmpPtr[2] == 0xBF)) { - if (Diags) - Diags->diagnose(Lexer::getSourceLoc(CurPtr), - diag::lex_zerowidth_in_string_delimiter) - .fixItRemoveChars(Lexer::getSourceLoc(CurPtr), - Lexer::getSourceLoc(CurPtr + 3)); - TmpPtr += 3; - CurPtr += 3; - } + // A way needs to be found to find the complete set of zero width chars or + // this security mitigation will be in vain. Current list was taken from: + // https://www.ptiglobal.com/2018/04/26/the-beauty-of-unicode-zero-width-characters/ + // https://github.com/dblspk/web-app + // As this list may not be complete this code is not currently implemented. +// const char *TmpPtr = CurPtr; +// while (true) { +// switch (validateUTF8CharacterAndAdvance(TmpPtr, TmpPtr+3)) { +// case 0x200B: case 0x200C: case 0x200D: case 0x2060: +// case 0x2061: case 0x2062: case 0x2063: case 0x2064: +// case 0x206A: case 0x206B: case 0x206C: case 0x206D: +// case 0x206E: case 0x206F: case 0xFE00: case 0xFE01: +// case 0xFEFF: +// if (Diags) +// Diags->diagnose(Lexer::getSourceLoc(CurPtr), +// diag::lex_zerowidth_in_string_delimiter) +// .fixItRemoveChars(Lexer::getSourceLoc(CurPtr), +// Lexer::getSourceLoc(CurPtr + 3)); +// CurPtr = TmpPtr; +// break; +// default: +// return true; +// } +// } return true; } -/// advanceIfMultilineDelimiter - centralized check for multiline delimiter +/// advanceIfMultilineDelimiter - Centralized check for multiline delimiter. static bool advanceIfMultilineDelimiter(const char *&CurPtr, DiagnosticEngine *Diags) { const char *TmpPtr = CurPtr - 1; @@ -1238,17 +1250,17 @@ static bool advanceIfMultilineDelimiter(const char *&CurPtr, return false; } -/// extractStringDelimiterLength - Extracts/detects any custom delimiter on +/// advanceIfCustomDelimiterLen - Extracts/detects any custom delimiter on /// opening a string literal and advances CurPtr if a delimiter is found and /// returns a non-zero delimiter length. CurPtr[-1] generally '#' when called. -static unsigned extractStringDelimiterLength(const char *&CurPtr) { +static unsigned advanceIfCustomDelimiterLen(const char *&CurPtr) { const char *Lookahead = CurPtr; while (*Lookahead == '#') Lookahead++; if (*Lookahead++ == '"') { - unsigned DelimiterLength = Lookahead - CurPtr; + unsigned CustomDelimiterLen = Lookahead - CurPtr; CurPtr = Lookahead; - return DelimiterLength; + return CustomDelimiterLen; } return 0; } @@ -1258,12 +1270,12 @@ static unsigned extractStringDelimiterLength(const char *&CurPtr) { /// interpolation inside a "raw" string. Normal/cooked string processing is /// the degenerate case of there being no # characters surrounding the quotes. /// If delimiter matches, advances byte pointer passed in and returns true. -static bool delimiterMatches(unsigned DelimiterLength, const char *&BytesPtr, +static bool delimiterMatches(unsigned CustomDelimiterLen, const char *&BytesPtr, DiagnosticEngine *Diags) { - if (!DelimiterLength) + if (!CustomDelimiterLen) return true; const char *TmpPtr = BytesPtr; - for (unsigned i = 0; i < DelimiterLength; i++) + for (unsigned i = 0; i < CustomDelimiterLen; i++) if (diagnoseZeroWidth(TmpPtr, Diags) && *TmpPtr++ != '#') return false; BytesPtr = TmpPtr; @@ -1279,8 +1291,8 @@ static bool delimiterMatches(unsigned DelimiterLength, const char *&BytesPtr, /// character_escape ::= [\][\] | [\]t | [\]n | [\]r | [\]" | [\]' | [\]0 /// character_escape ::= unicode_character_escape unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, - bool EmitDiagnostics, bool MultilineString, - unsigned DelimiterLength) { + bool EmitDiagnostics, bool IsMultilineString, + unsigned CustomDelimiterLen) { const char *CharStart = CurPtr; switch (*CurPtr++) { @@ -1288,7 +1300,7 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, // If this is a "high" UTF-8 character, validate it. if ((signed char)(CurPtr[-1]) >= 0) { if (isPrintable(CurPtr[-1]) == 0) - if (!(MultilineString && (CurPtr[-1] == '\t'))) + if (!(IsMultilineString && (CurPtr[-1] == '\t'))) if (EmitDiagnostics) diagnose(CharStart, diag::lex_unprintable_ascii_character); return CurPtr[-1]; @@ -1323,13 +1335,13 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, return ~1U; case '\n': // String literals cannot have \n or \r in them. case '\r': - if (MultilineString) // ... unless they are multiline + if (IsMultilineString) // ... unless they are multiline return CurPtr[-1]; if (EmitDiagnostics) diagnose(CurPtr-1, diag::lex_unterminated_string); return ~1U; case '\\': // Escapes. - if (!delimiterMatches(DelimiterLength, CurPtr, Diags)) + if (!delimiterMatches(CustomDelimiterLen, CurPtr, Diags)) return '\\'; break; } @@ -1339,16 +1351,16 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, switch (*CurPtr) { case ' ': case '\t': case '\n': case '\r': case '#': if (*CurPtr == '#') { - if (DelimiterLength) { + if (CustomDelimiterLen) { if (EmitDiagnostics) diagnose(CurPtr, diag::lex_invalid_delimiter_escape) - .fixItRemoveChars(Lexer::getSourceLoc(CurPtr), - Lexer::getSourceLoc(CurPtr + 1)); + .fixItRemoveChars(Lexer::getSourceLoc(CurPtr), + Lexer::getSourceLoc(CurPtr + 1)); CurPtr++; return ~1U; } } - else if (MultilineString && maybeConsumeNewlineEscape(CurPtr, 0)) + else if (IsMultilineString && maybeConsumeNewlineEscape(CurPtr, 0)) return '\n'; LLVM_FALLTHROUGH; default: // Invalid escape. @@ -1406,11 +1418,11 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, const char *EndPtr, DiagnosticEngine *Diags, - bool MultilineString) { + bool IsMultilineString) { SmallVector OpenDelimiters; SmallVector AllowNewline; SmallVector CustomDelimiter; - AllowNewline.push_back(MultilineString); + AllowNewline.push_back(IsMultilineString); auto inStringLiteral = [&]() { return !OpenDelimiters.empty() && @@ -1425,7 +1437,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, // On success scanning the expression body, the real lexer will be used to // relex the body when parsing the expressions. We let it diagnose any // issues with malformed tokens or other problems. - unsigned DelimiterLength = 0; + unsigned CustomDelimiterLen = 0; switch (*CurPtr++) { // String literals in general cannot be split across multiple lines; // interpolated ones are no exception - unless multiline literals. @@ -1438,7 +1450,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, case '#': if (inStringLiteral() || - !(DelimiterLength = extractStringDelimiterLength(CurPtr))) + !(CustomDelimiterLen = advanceIfCustomDelimiterLen(CurPtr))) continue; LLVM_FALLTHROUGH; @@ -1446,9 +1458,8 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, case '\'': { if (!AllowNewline.back() && inStringLiteral()) { unsigned InnerDelimiter = CustomDelimiter.back(); - if (OpenDelimiters.back() == CurPtr[-1] && (!InnerDelimiter || - (delimiterMatches(InnerDelimiter, CurPtr, Diags) - && *CurPtr != '#'))) { + if (OpenDelimiters.back() == CurPtr[-1] && + delimiterMatches(InnerDelimiter, CurPtr, Diags)) { // Closing single line string literal. OpenDelimiters.pop_back(); AllowNewline.pop_back(); @@ -1464,7 +1475,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, // Open string literal OpenDelimiters.push_back(CurPtr[-1]); AllowNewline.push_back(isMultilineQuote); - CustomDelimiter.push_back(DelimiterLength); + CustomDelimiter.push_back(CustomDelimiterLen); continue; } @@ -1543,10 +1554,10 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, static StringRef getStringLiteralContent(const Token &Str) { StringRef Bytes = Str.getText(); - if (unsigned DelimiterLength = Str.getDelimiterLength()) - Bytes = Bytes.drop_front(DelimiterLength).drop_back(DelimiterLength); + if (unsigned CustomDelimiterLen = Str.getCustomDelimiterLen()) + Bytes = Bytes.drop_front(CustomDelimiterLen).drop_back(CustomDelimiterLen); - if (Str.IsMultilineString()) + if (Str.isMultilineString()) Bytes = Bytes.drop_front(3).drop_back(3); else Bytes = Bytes.drop_front().drop_back(); @@ -1584,7 +1595,7 @@ getMultilineTrailingIndent(const Token &Str, DiagnosticEngine *Diags) { auto string = StringRef(start, end - start); // Disallow escaped newline in the last line. - if (Diags && Str.getDelimiterLength() == 0) { + if (Diags && Str.getCustomDelimiterLen() == 0) { auto *Ptr = start - 1; if (*Ptr == '\n') --Ptr; if (*Ptr == '\r') --Ptr; @@ -1741,7 +1752,7 @@ static void validateMultilineIndents(const Token &Str, /// string_literal ::= ["]([^"\\\n\r]|character_escape)*["] /// string_literal ::= ["]["]["].*["]["]["] - approximately /// string_literal ::= (#+)("")?".*"(\2\1) - "raw" strings -void Lexer::lexStringLiteral(unsigned DelimiterLength) { +void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) { const char *TokStart = CurPtr-1; assert((*TokStart == '"' || *TokStart == '\'') && "Unexpected start"); // NOTE: We only allow single-quote string literals so we can emit useful @@ -1759,7 +1770,7 @@ void Lexer::lexStringLiteral(unsigned DelimiterLength) { while (true) { const char *TmpPtr = CurPtr + 1; if (*CurPtr == '\\' && - delimiterMatches(DelimiterLength, TmpPtr, Diags) && *TmpPtr == '(') { + delimiterMatches(CustomDelimiterLen, TmpPtr, Diags) && *TmpPtr == '(') { // Consume tokens until we hit the corresponding ')'. CurPtr = TmpPtr + 1; const char *EndPtr = @@ -1779,13 +1790,13 @@ void Lexer::lexStringLiteral(unsigned DelimiterLength) { // String literals cannot have \n or \r in them (unless multiline). if (((*CurPtr == '\r' || *CurPtr == '\n') && !MultilineString) || CurPtr == BufferEnd) { - TokStart -= DelimiterLength; + TokStart -= CustomDelimiterLen; diagnose(TokStart, diag::lex_unterminated_string); return formToken(tok::unknown, TokStart); } unsigned CharValue = lexCharacter(CurPtr, *TokStart, true, - MultilineString, DelimiterLength); + MultilineString, CustomDelimiterLen); wasErroneous |= CharValue == ~1U; // If this is the end of string, we are done. If it is a normal character @@ -1827,16 +1838,15 @@ void Lexer::lexStringLiteral(unsigned DelimiterLength) { replacement); } - // Is this the end of multiline/delimited string literal? + // Is this the end of multiline/custom-delimited string literal? if ((!MultilineString || advanceIfMultilineDelimiter(CurPtr, Diags)) && - (!DelimiterLength || (delimiterMatches(DelimiterLength, CurPtr, Diags) - && *CurPtr != '#'))) { - TokStart -= DelimiterLength; + delimiterMatches(CustomDelimiterLen, CurPtr, Diags)) { + TokStart -= CustomDelimiterLen; if (wasErroneous) return formToken(tok::unknown, TokStart); formToken(tok::string_literal, TokStart, - MultilineString, DelimiterLength); + MultilineString, CustomDelimiterLen); if (MultilineString && Diags) validateMultilineIndents(NextToken, Diags); return; @@ -2106,7 +2116,7 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes, bool IsFirstSegment, bool IsLastSegment, unsigned IndentToStrip, - unsigned DelimiterLength) { + unsigned CustomDelimiterLen) { TempString.clear(); // Note that it is always safe to read one over the end of "Bytes" because @@ -2114,11 +2124,11 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes, // range check subscripting on the StringRef. const char *BytesPtr = Bytes.begin(); - // Special case when being called from EncodedDiagnosticMessage(...) + // Special case when being called from EncodedDiagnosticMessage(...). // This allows multiline and delimited strings to work in attributes. // The string has already been validated by the initial parse. - if (IndentToStrip == ~0u && DelimiterLength == ~0u) { - IndentToStrip = DelimiterLength = 0; + if (IndentToStrip == ~0u && CustomDelimiterLen == ~0u) { + IndentToStrip = CustomDelimiterLen = 0; // restore trailing indent removal for multiline const char *Backtrack = BytesPtr - 1; @@ -2131,7 +2141,7 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes, // restore delimiter if any while (*--Backtrack == '#') - DelimiterLength++; + CustomDelimiterLen++; } bool IsEscapedNewline = false; @@ -2155,7 +2165,7 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes, } if (CurChar != '\\' || - !delimiterMatches(DelimiterLength, BytesPtr, nullptr)) { + !delimiterMatches(CustomDelimiterLen, BytesPtr, nullptr)) { TempString.push_back(CurChar); continue; } @@ -2225,8 +2235,8 @@ void Lexer::getStringLiteralSegments( // Are substitutions required either for indent stripping or line ending // normalization? - bool MultilineString = Str.IsMultilineString(), IsFirstSegment = true; - unsigned IndentToStrip = 0, DelimiterLength = Str.getDelimiterLength(); + bool MultilineString = Str.isMultilineString(), IsFirstSegment = true; + unsigned IndentToStrip = 0, CustomDelimiterLen = Str.getCustomDelimiterLen(); if (MultilineString) IndentToStrip = std::get<0>(getMultilineTrailingIndent(Str, /*Diags=*/nullptr)).size(); @@ -2240,7 +2250,7 @@ void Lexer::getStringLiteralSegments( while ((pos = Bytes.find('\\', BytesPtr-Bytes.begin())) != StringRef::npos) { BytesPtr = Bytes.begin() + pos + 1; - if (!delimiterMatches(DelimiterLength, BytesPtr, Diags) || + if (!delimiterMatches(CustomDelimiterLen, BytesPtr, Diags) || *BytesPtr++ != '(') continue; @@ -2249,9 +2259,9 @@ void Lexer::getStringLiteralSegments( // Push the current segment. Segments.push_back( StringSegment::getLiteral(getSourceLoc(SegmentStartPtr), - BytesPtr-SegmentStartPtr-2-DelimiterLength, + BytesPtr-SegmentStartPtr-2-CustomDelimiterLen, IsFirstSegment, false, IndentToStrip, - DelimiterLength)); + CustomDelimiterLen)); IsFirstSegment = false; // Find the closing ')'. @@ -2275,7 +2285,7 @@ void Lexer::getStringLiteralSegments( StringSegment::getLiteral(getSourceLoc(SegmentStartPtr), Bytes.end()-SegmentStartPtr, IsFirstSegment, true, IndentToStrip, - DelimiterLength)); + CustomDelimiterLen)); } @@ -2373,8 +2383,8 @@ void Lexer::lexImpl() { case '\\': return formToken(tok::backslash, TokStart); case '#': - if (unsigned DelimiterLength = extractStringDelimiterLength(CurPtr)) - return lexStringLiteral(DelimiterLength); + if (unsigned CustomDelimiterLen = advanceIfCustomDelimiterLen(CurPtr)) + return lexStringLiteral(CustomDelimiterLen); return lexHash(); // Operator characters. diff --git a/lib/Parse/ParseExpr.cpp b/lib/Parse/ParseExpr.cpp index 605bc130cc0e9..a1302d6f0b795 100644 --- a/lib/Parse/ParseExpr.cpp +++ b/lib/Parse/ParseExpr.cpp @@ -1961,7 +1961,7 @@ ParserResult Parser::parseExprStringLiteral() { LocalContext.setCreateSyntax(SyntaxKind::StringInterpolationExpr); StringRef Quote; tok QuoteKind; - std::tie(Quote, QuoteKind) = Tok.IsMultilineString() ? + std::tie(Quote, QuoteKind) = Tok.isMultilineString() ? std::make_tuple("\"\"\"", tok::multiline_string_quote) : std::make_tuple("\"", tok::string_quote); diff --git a/lib/Parse/Parser.cpp b/lib/Parse/Parser.cpp index 449599afb02e0..ed760b45e3caf 100644 --- a/lib/Parse/Parser.cpp +++ b/lib/Parse/Parser.cpp @@ -220,9 +220,9 @@ static void getStringPartTokens(const Token &Tok, const LangOptions &LangOpts, const SourceManager &SM, int BufID, std::vector &Toks) { assert(Tok.is(tok::string_literal)); - bool IsMultiline = Tok.IsMultilineString(); - unsigned DelimiterLength = Tok.getDelimiterLength(); - unsigned QuoteLen = (IsMultiline ? 3 : 1) + DelimiterLength; + bool IsMultiline = Tok.isMultilineString(); + unsigned CustomDelimiterLen = Tok.getCustomDelimiterLen(); + unsigned QuoteLen = (IsMultiline ? 3 : 1) + CustomDelimiterLen; SmallVector Segments; Lexer::getStringLiteralSegments(Tok, Segments, /*Diags=*/nullptr); for (unsigned i = 0, e = Segments.size(); i != e; ++i) { @@ -244,7 +244,8 @@ static void getStringPartTokens(const Token &Tok, const LangOptions &LangOpts, StringRef Text = SM.extractText({ Loc, Len }); Token NewTok; - NewTok.setToken(tok::string_literal, Text, IsMultiline, DelimiterLength); + NewTok.setToken(tok::string_literal, Text, + IsMultiline, CustomDelimiterLen); Toks.push_back(NewTok); } else { diff --git a/test/Parse/raw_string.swift b/test/Parse/raw_string.swift index 66858327348a7..953b92324b041 100644 --- a/test/Parse/raw_string.swift +++ b/test/Parse/raw_string.swift @@ -49,15 +49,6 @@ _ = ##""" """## // CHECK: "print(\"\"\"\n Five\n\n\nEpsilon\n \"\"\")" -_ = #" "##" "# -// CHECK: " \"##\" " - -_ = #" \#(#" "######" "#) "# -// CHECK: " \"######\" " - -_ = #" \#(##"###""###"##) "# -// CHECK: "###\"\"###" - // ===---------- Single line --------=== _ = #""Zeta""# diff --git a/test/Parse/raw_string_errors.swift b/test/Parse/raw_string_errors.swift index fd49ea17ba7d2..bb83c9b24d6ce 100644 --- a/test/Parse/raw_string_errors.swift +++ b/test/Parse/raw_string_errors.swift @@ -7,4 +7,5 @@ // expected-error@-1{{unterminated string literal}} ###"invalid"#### -// expected-error@-1{{unterminated string literal}} +// expected-error@-1{{consecutive statements on a line must be separated by ';'}} +// expected-error@-2{{expected expression}} From 02f7cd5db6f89af9c8bbfc9f4e713e7fccba2a96 Mon Sep 17 00:00:00 2001 From: John Holdsworth Date: Sun, 2 Sep 2018 13:03:17 +0100 Subject: [PATCH 11/15] generated zero-width characters --- lib/Parse/Lexer.cpp | 158 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 134 insertions(+), 24 deletions(-) diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index cfccb5d62c665..467a46d94c3c1 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -1206,35 +1206,145 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) { } } +const static uint32_t ZeroWidthC[] = { + // Characters which don't appear to be visible (sic) follow. + 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, + 0x000b, 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, + 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, + 0x001d, 0x001e, 0x001f, 0x007f, 0x0080, 0x0081, 0x0082, 0x0083, + 0x0084, 0x0086, 0x0087, 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, + 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, + 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, + 0x009d, 0x009e, 0x009f, 0x00ad, 0x0300, 0x0301, 0x0302, 0x0303, + 0x0304, 0x0306, 0x0307, 0x0308, 0x0309, 0x030a, 0x030b, 0x030c, + 0x030f, 0x0311, 0x031b, 0x0323, 0x0324, 0x0325, 0x0326, 0x0327, + 0x0328, 0x032d, 0x032e, 0x0330, 0x0331, 0x0332, 0x034f, 0x055f, + 0x0610, 0x0611, 0x0612, 0x0613, 0x0614, 0x0615, 0x0616, 0x0617, + 0x0618, 0x0619, 0x061a, 0x061c, 0x064b, 0x064c, 0x064d, 0x064e, + 0x064f, 0x0650, 0x0651, 0x0652, 0x0653, 0x0654, 0x0655, 0x0656, + 0x0657, 0x0658, 0x0659, 0x065a, 0x065b, 0x065c, 0x065d, 0x065e, + 0x065f, 0x0670, 0x06d6, 0x06d7, 0x06d8, 0x06d9, 0x06da, 0x06db, + 0x06dc, 0x06df, 0x06e0, 0x06e1, 0x06e2, 0x06e3, 0x06e4, 0x06e6, + 0x06e7, 0x06e8, 0x06ea, 0x06eb, 0x06ec, 0x06ed, 0x070f, 0x0711, + 0x0730, 0x0731, 0x0732, 0x0733, 0x0734, 0x0735, 0x0736, 0x0737, + 0x0738, 0x0739, 0x073a, 0x073b, 0x073c, 0x073d, 0x073e, 0x073f, + 0x0740, 0x0741, 0x0742, 0x0743, 0x0744, 0x0745, 0x0746, 0x0747, + 0x0748, 0x0749, 0x074a, 0x07a6, 0x07a7, 0x07a8, 0x07a9, 0x07aa, + 0x07ab, 0x07ac, 0x07ad, 0x07ae, 0x07af, 0x07b0, 0x07eb, 0x07ec, + 0x07ed, 0x07ee, 0x07ef, 0x07f0, 0x07f1, 0x07f2, 0x07f3, 0x0816, + 0x0817, 0x0818, 0x0819, 0x081b, 0x081c, 0x081d, 0x081e, 0x081f, + 0x0820, 0x0821, 0x0822, 0x0823, 0x0825, 0x0826, 0x0827, 0x0829, + 0x082a, 0x082b, 0x082c, 0x082d, 0x0858, 0x0859, 0x085a, 0x085b, + 0x08d5, 0x08d6, 0x08d7, 0x08d8, 0x08d9, 0x08e0, 0x08e1, 0x08e2, + 0x08e3, 0x08e4, 0x08e5, 0x08e6, 0x08e7, 0x08e8, 0x08e9, 0x08ea, + 0x08eb, 0x08ec, 0x08ed, 0x08ee, 0x08ef, 0x08f0, 0x08f1, 0x08f2, + 0x08f3, 0x08f4, 0x08f5, 0x08f6, 0x08f7, 0x08f8, 0x08f9, 0x08fb, + 0x08fc, 0x08fd, 0x08fe, 0x08ff, 0x0f18, 0x0f19, 0x0f35, 0x0f37, + 0x0f39, 0x0f72, 0x0f7a, 0x0f7b, 0x0f7c, 0x0f7d, 0x0f7e, 0x0f80, + 0x0f82, 0x0f83, 0x0f84, 0x0f86, 0x0f87, 0x0fc6, 0x115f, 0x1160, + 0x1712, 0x1713, 0x1714, 0x1732, 0x1733, 0x1752, 0x1753, 0x1772, + 0x1773, 0x17b4, 0x17b5, 0x180b, 0x180c, 0x180d, 0x180e, 0x1920, + 0x1921, 0x1922, 0x1927, 0x1928, 0x192a, 0x1932, 0x193a, 0x193b, + 0x1a17, 0x1a18, 0x1a1b, 0x1a55, 0x1a56, 0x1a59, 0x1a5a, 0x1a5b, + 0x1a5c, 0x1a5d, 0x1a5e, 0x1a60, 0x1a62, 0x1a65, 0x1a66, 0x1a67, + 0x1a68, 0x1a69, 0x1a6a, 0x1a6c, 0x1a73, 0x1a74, 0x1a75, 0x1a76, + 0x1a77, 0x1a78, 0x1a79, 0x1a7a, 0x1a7b, 0x1a7c, 0x1a7f, 0x1abe, + 0x1b80, 0x1b81, 0x1ba1, 0x1ba2, 0x1ba3, 0x1ba4, 0x1ba5, 0x1ba8, + 0x1ba9, 0x1bac, 0x1bad, 0x1be6, 0x1be8, 0x1be9, 0x1bed, 0x1bee, + 0x1bef, 0x1bf0, 0x1bf1, 0x1c2c, 0x1c2d, 0x1c2e, 0x1c2f, 0x1c30, + 0x1c31, 0x1c32, 0x1c33, 0x1c36, 0x1c37, 0x1ce1, 0x1cf2, 0x1cf3, + 0x1cf7, 0x200b, 0x200c, 0x200d, 0x200e, 0x200f, 0x202a, 0x202b, + 0x202c, 0x202d, 0x202e, 0x2060, 0x2061, 0x2062, 0x2063, 0x2064, + 0x2065, 0x2066, 0x2067, 0x2068, 0x2069, 0x206a, 0x206b, 0x206c, + 0x206d, 0x206e, 0x206f, 0x2cef, 0x2cf0, 0x2cf1, 0x2d7f, 0x3164, + 0xa6f0, 0xa6f1, 0xa802, 0xa806, 0xa80b, 0xa825, 0xa826, 0xa8b6, + 0xa8c4, 0xa948, 0xa949, 0xa94a, 0xa94b, 0xa94c, 0xa94d, 0xa94e, + 0xa94f, 0xa950, 0xa951, 0xa953, 0xa961, 0xa962, 0xa963, 0xa964, + 0xa965, 0xa966, 0xa967, 0xa968, 0xa969, 0xa96a, 0xa96b, 0xa96c, + 0xa96d, 0xa96e, 0xa96f, 0xa970, 0xa971, 0xa972, 0xa973, 0xa974, + 0xa975, 0xa976, 0xa977, 0xa978, 0xa979, 0xa97a, 0xa97b, 0xa97c, + 0xa9e5, 0xaa7b, 0xaa7c, 0xaa7d, 0xaab0, 0xaab2, 0xaab3, 0xaab4, + 0xaab7, 0xaab8, 0xaabe, 0xaabf, 0xaac1, 0xaaec, 0xaaed, 0xaaf6, + 0xabe5, 0xabe8, 0xabe9, 0xabea, 0xabed, 0xd7b1, 0xd7b2, 0xd7b3, + 0xd7b4, 0xd7b5, 0xd7b6, 0xd7b7, 0xd7b8, 0xd7b9, 0xd7ba, 0xd7bb, + 0xd7bc, 0xd7bd, 0xd7be, 0xd7bf, 0xd7c0, 0xd7c1, 0xd7c2, 0xd7c3, + 0xd7c4, 0xd7c5, 0xd7c6, 0xd7cc, 0xd7cd, 0xd7ce, 0xd7cf, 0xd7d0, + 0xd7d1, 0xd7d2, 0xd7d3, 0xd7d4, 0xd7d5, 0xd7d6, 0xd7d7, 0xd7d8, + 0xd7d9, 0xd7da, 0xd7db, 0xd7dc, 0xd7dd, 0xd7de, 0xd7df, 0xd7e0, + 0xd7e1, 0xd7e2, 0xd7e3, 0xd7e4, 0xd7e5, 0xd7e6, 0xd7e7, 0xd7e8, + 0xd7e9, 0xd7ea, 0xd7eb, 0xd7ec, 0xd7ed, 0xd7ee, 0xd7ef, 0xd7f0, + 0xd7f1, 0xd7f2, 0xd7f3, 0xd7f4, 0xd7f5, 0xd7f6, 0xd7f7, 0xd7f8, + 0xd7f9, 0xd7fa, 0xd7fb, 0xf850, 0xf85f, 0xf860, 0xf861, 0xf862, + 0xf863, 0xf864, 0xf865, 0xf866, 0xf867, 0xf868, 0xf869, 0xf86a, + 0xf86b, 0xf86c, 0xf86d, 0xf86e, 0xf86f, 0xf884, 0xf885, 0xf886, + 0xf887, 0xf888, 0xf889, 0xf88a, 0xf88b, 0xf88c, 0xf88d, 0xf88e, + 0xf88f, 0xf890, 0xf891, 0xf892, 0xf893, 0xf894, 0xf895, 0xf896, + 0xf897, 0xf898, 0xf899, 0xf89f, 0xfbb2, 0xfbb3, 0xfbb4, 0xfbb5, + 0xfbb6, 0xfbb7, 0xfbb8, 0xfbb9, 0xfbba, 0xfbbb, 0xfbbd, 0xfbbe, + 0xfbbf, 0xfbc1, 0xfc5e, 0xfc5f, 0xfc60, 0xfc61, 0xfc62, 0xfc63, + 0xfe0f, 0xfe20, 0xfe21, 0xfe22, 0xfe23, 0xfeff, 0xffa0, 0xfff0, + 0xfff1, 0xfff2, 0xfff3, 0xfff4, 0xfff5, 0xfff6, 0xfff7, 0xfff8, + 0xfff9, 0xfffa, 0xfffb, 0xfffc, 0x10a01, 0x10a02, 0x10a03, 0x10a05, + 0x10a06, 0x10a0c, 0x10a0d, 0x10a0e, 0x10a0f, 0x10a38, 0x10a39, 0x10a3a, + 0x11038, 0x11039, 0x1103a, 0x1103b, 0x1103c, 0x1103d, 0x1103e, 0x1103f, + 0x11040, 0x11041, 0x11042, 0x11043, 0x11044, 0x11045, 0x11046, 0x11080, + 0x11081, 0x110b1, 0x110b3, 0x110b4, 0x110b5, 0x110b6, 0x110b9, 0x110ba, + 0x11a01, 0x11a02, 0x11a03, 0x11a04, 0x11a05, 0x11a06, 0x11a07, 0x11a08, + 0x11a09, 0x11a0a, 0x11a33, 0x11a35, 0x11a36, 0x11a37, 0x11a38, 0x11a39, + 0x11a3b, 0x11a3c, 0x11a3d, 0x11a3e, 0x11a51, 0x11a52, 0x11a53, 0x11a54, + 0x11a55, 0x11a56, 0x11a57, 0x11a58, 0x11a59, 0x11a5a, 0x11a5b, 0x11a8a, + 0x11a8b, 0x11a8c, 0x11a8d, 0x11a8e, 0x11a8f, 0x11a90, 0x11a91, 0x11a92, + 0x11a93, 0x11a94, 0x11a95, 0x11a96, 0x11a97, 0x11a98, 0x11d31, 0x11d32, + 0x11d33, 0x11d34, 0x11d35, 0x11d36, 0x11d3a, 0x11d3c, 0x11d3d, 0x11d3f, + 0x11d40, 0x11d41, 0x11d43, 0x11d47, 0x1bc9d, 0x1bca0, 0x1bca1, 0x1bca2, + 0x1bca3, 0x1d173, 0x1d174, 0x1d175, 0x1d176, 0x1d177, 0x1d178, 0x1d179, + 0x1d17a, 0x1da00, 0x1da01, 0x1da02, 0x1da03, 0x1da04, 0x1da05, 0x1da06, + 0x1da07, 0x1da08, 0x1da09, 0x1da0a, 0x1da0b, 0x1da0c, 0x1da0d, 0x1da0e, + 0x1da0f, 0x1da10, 0x1da11, 0x1da12, 0x1da13, 0x1da14, 0x1da15, 0x1da16, + 0x1da17, 0x1da18, 0x1da19, 0x1da1a, 0x1da1b, 0x1da1c, 0x1da1d, 0x1da1e, + 0x1da1f, 0x1da20, 0x1da21, 0x1da22, 0x1da23, 0x1da24, 0x1da25, 0x1da26, + 0x1da27, 0x1da28, 0x1da29, 0x1da2a, 0x1da2b, 0x1da2c, 0x1da2d, 0x1da2e, + 0x1da2f, 0x1da30, 0x1da31, 0x1da32, 0x1da33, 0x1da34, 0x1da35, 0x1da36, + 0x1da3b, 0x1da3c, 0x1da3d, 0x1da3e, 0x1da3f, 0x1da40, 0x1da41, 0x1da42, + 0x1da43, 0x1da44, 0x1da45, 0x1da46, 0x1da47, 0x1da48, 0x1da49, 0x1da4a, + 0x1da4b, 0x1da4c, 0x1da4d, 0x1da4e, 0x1da4f, 0x1da50, 0x1da51, 0x1da52, + 0x1da53, 0x1da54, 0x1da55, 0x1da56, 0x1da57, 0x1da58, 0x1da59, 0x1da5a, + 0x1da5b, 0x1da5c, 0x1da5d, 0x1da5e, 0x1da5f, 0x1da60, 0x1da61, 0x1da62, + 0x1da63, 0x1da64, 0x1da65, 0x1da66, 0x1da67, 0x1da68, 0x1da69, 0x1da6a, + 0x1da6b, 0x1da6c, 0x1da75, 0x1da84, 0x1da9b, 0x1da9c, 0x1da9d, 0x1da9e, + 0x1da9f, 0x1daa1, 0x1daa2, 0x1daa3, 0x1daa4, 0x1daa5, 0x1daa6, 0x1daa7, + 0x1daa8, 0x1daa9, 0x1daaa, 0x1daab, 0x1daac, 0x1daad, 0x1daae, 0x1daaf, +}; + /// diagnoseZeroWidth - Check for and error zero-width characters in delimiters. /// A non visible character in the middle of a delimter can be used to extend /// the literal beyond what it would appear creating potential security bugs. static bool diagnoseZeroWidth(const char *&CurPtr, DiagnosticEngine *Diags) { // A way needs to be found to find the complete set of zero width chars or - // this security mitigation will be in vain. Current list was taken from: - // https://www.ptiglobal.com/2018/04/26/the-beauty-of-unicode-zero-width-characters/ - // https://github.com/dblspk/web-app - // As this list may not be complete this code is not currently implemented. -// const char *TmpPtr = CurPtr; -// while (true) { -// switch (validateUTF8CharacterAndAdvance(TmpPtr, TmpPtr+3)) { -// case 0x200B: case 0x200C: case 0x200D: case 0x2060: -// case 0x2061: case 0x2062: case 0x2063: case 0x2064: -// case 0x206A: case 0x206B: case 0x206C: case 0x206D: -// case 0x206E: case 0x206F: case 0xFE00: case 0xFE01: -// case 0xFEFF: -// if (Diags) -// Diags->diagnose(Lexer::getSourceLoc(CurPtr), -// diag::lex_zerowidth_in_string_delimiter) -// .fixItRemoveChars(Lexer::getSourceLoc(CurPtr), -// Lexer::getSourceLoc(CurPtr + 3)); -// CurPtr = TmpPtr; -// break; -// default: -// return true; -// } -// } - return true; + // this security mitigation will be in vain. Current list is generated using + // the display width of attributed strings checking when it does not change. + // The font used was SF Mono, 11pt, the default font of the Xcode editor. + static std::vector ZeroWidthV; + if (!ZeroWidthV.size()) + ZeroWidthV.assign(ZeroWidthC, + ZeroWidthC + sizeof ZeroWidthC/sizeof ZeroWidthC[0]); + + const char *TmpPtr = CurPtr; + while (true) { + uint32_t NextChar = validateUTF8CharacterAndAdvance(TmpPtr, TmpPtr + 6); + if (NextChar != '"' && NextChar != '#' && + (NextChar == ~0U || (NextChar >= 0xe0000 && NextChar <= 0xe0fff) || + std::binary_search(ZeroWidthV.begin(), ZeroWidthV.end(), NextChar))) { + if (Diags) + Diags->diagnose(Lexer::getSourceLoc(CurPtr), + diag::lex_zerowidth_in_string_delimiter) + .fixItRemoveChars(Lexer::getSourceLoc(CurPtr), + Lexer::getSourceLoc(TmpPtr)); + CurPtr = TmpPtr; + continue; + } + return true; + } } /// advanceIfMultilineDelimiter - Centralized check for multiline delimiter. From 999bb40294d68bf86b97f70b681a73bea58c8cfd Mon Sep 17 00:00:00 2001 From: John Holdsworth Date: Mon, 3 Sep 2018 12:49:16 +0100 Subject: [PATCH 12/15] New diagnostic for closing delimiter --- include/swift/AST/DiagnosticsParse.def | 4 +- include/swift/Parse/Token.h | 2 +- lib/Parse/Lexer.cpp | 95 +++++++++++++------------- 3 files changed, 51 insertions(+), 50 deletions(-) diff --git a/include/swift/AST/DiagnosticsParse.def b/include/swift/AST/DiagnosticsParse.def index 35ab944902d2d..2a243d0639822 100644 --- a/include/swift/AST/DiagnosticsParse.def +++ b/include/swift/AST/DiagnosticsParse.def @@ -138,8 +138,10 @@ ERROR(lex_invalid_u_escape,none, "\\u{...} escape sequence expects between 1 and 8 hex digits", ()) ERROR(lex_invalid_u_escape_rbrace,none, "expected '}' in \\u{...} escape sequence", ()) -ERROR(lex_invalid_delimiter_escape,none, +ERROR(lex_invalid_escape_delimiter,none, "too many '#' characters in delimited escape", ()) +ERROR(lex_invalid_closing_delimiter,none, + "too many '#' characters in closing delimiter", ()) ERROR(lex_zerowidth_in_string_delimiter,none, "zero-width character detected in string delimiter", ()) diff --git a/include/swift/Parse/Token.h b/include/swift/Parse/Token.h index 9e06ec5e44aa1..6cd0fc95828c4 100644 --- a/include/swift/Parse/Token.h +++ b/include/swift/Parse/Token.h @@ -277,7 +277,7 @@ class Token { this->MultilineString = IsMultilineString; this->CustomDelimiterLen = CustomDelimiterLen; assert(this->CustomDelimiterLen == CustomDelimiterLen && - "string custom delimiter too long"); + "custom string delimiter length > 255"); } bool isMultilineString() const { diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index 467a46d94c3c1..39b0bfd719d23 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -1207,7 +1207,7 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) { } const static uint32_t ZeroWidthC[] = { - // Characters which don't appear to be visible (sic) follow. + // "Zero-Width" characters which don't appear to be visible follow. 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x000b, 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, @@ -1316,10 +1316,11 @@ const static uint32_t ZeroWidthC[] = { 0x1daa8, 0x1daa9, 0x1daaa, 0x1daab, 0x1daac, 0x1daad, 0x1daae, 0x1daaf, }; -/// diagnoseZeroWidth - Check for and error zero-width characters in delimiters. +/// diagnoseZeroWidthMatchAndAdvance - Error zerowidth characters in delimiters. /// A non visible character in the middle of a delimter can be used to extend /// the literal beyond what it would appear creating potential security bugs. -static bool diagnoseZeroWidth(const char *&CurPtr, DiagnosticEngine *Diags) { +static bool diagnoseZeroWidthMatchAndAdvance(char Target, const char *&CurPtr, + DiagnosticEngine *Diags) { // A way needs to be found to find the complete set of zero width chars or // this security mitigation will be in vain. Current list is generated using // the display width of attributed strings checking when it does not change. @@ -1332,10 +1333,10 @@ static bool diagnoseZeroWidth(const char *&CurPtr, DiagnosticEngine *Diags) { const char *TmpPtr = CurPtr; while (true) { uint32_t NextChar = validateUTF8CharacterAndAdvance(TmpPtr, TmpPtr + 6); - if (NextChar != '"' && NextChar != '#' && + if (NextChar != (uint32_t)Target && (NextChar == ~0U || (NextChar >= 0xe0000 && NextChar <= 0xe0fff) || std::binary_search(ZeroWidthV.begin(), ZeroWidthV.end(), NextChar))) { - if (Diags) + if (Diags && *TmpPtr == Target) Diags->diagnose(Lexer::getSourceLoc(CurPtr), diag::lex_zerowidth_in_string_delimiter) .fixItRemoveChars(Lexer::getSourceLoc(CurPtr), @@ -1343,33 +1344,35 @@ static bool diagnoseZeroWidth(const char *&CurPtr, DiagnosticEngine *Diags) { CurPtr = TmpPtr; continue; } - return true; + + return *CurPtr == Target && CurPtr++; } } /// advanceIfMultilineDelimiter - Centralized check for multiline delimiter. static bool advanceIfMultilineDelimiter(const char *&CurPtr, DiagnosticEngine *Diags) { - const char *TmpPtr = CurPtr - 1; - if (*TmpPtr++ == '"' && diagnoseZeroWidth(TmpPtr, Diags) && - *TmpPtr++ == '"' && diagnoseZeroWidth(TmpPtr, Diags) && - *TmpPtr++ == '"') { + const char *TmpPtr = CurPtr; + if (*(TmpPtr - 1) == '"' && + diagnoseZeroWidthMatchAndAdvance('"', TmpPtr, Diags) && + diagnoseZeroWidthMatchAndAdvance('"', TmpPtr, Diags)) { CurPtr = TmpPtr; return true; } return false; } -/// advanceIfCustomDelimiterLen - Extracts/detects any custom delimiter on -/// opening a string literal and advances CurPtr if a delimiter is found and +/// advanceIfCustomDelimiter - Extracts/detects any custom delimiter on +/// opening a string literal, advances CurPtr if a delimiter is found and /// returns a non-zero delimiter length. CurPtr[-1] generally '#' when called. -static unsigned advanceIfCustomDelimiterLen(const char *&CurPtr) { - const char *Lookahead = CurPtr; - while (*Lookahead == '#') - Lookahead++; - if (*Lookahead++ == '"') { - unsigned CustomDelimiterLen = Lookahead - CurPtr; - CurPtr = Lookahead; +static unsigned advanceIfCustomDelimiter(const char *&CurPtr, + DiagnosticEngine *Diags) { + const char *TmpPtr = CurPtr; + unsigned CustomDelimiterLen = 1; + while (diagnoseZeroWidthMatchAndAdvance('#', TmpPtr, Diags)) + CustomDelimiterLen++; + if (diagnoseZeroWidthMatchAndAdvance('"', TmpPtr, Diags)) { + CurPtr = TmpPtr; return CustomDelimiterLen; } return 0; @@ -1380,15 +1383,22 @@ static unsigned advanceIfCustomDelimiterLen(const char *&CurPtr) { /// interpolation inside a "raw" string. Normal/cooked string processing is /// the degenerate case of there being no # characters surrounding the quotes. /// If delimiter matches, advances byte pointer passed in and returns true. +/// Also used to detect the final delimiter of a string when IsClosing == true. static bool delimiterMatches(unsigned CustomDelimiterLen, const char *&BytesPtr, - DiagnosticEngine *Diags) { + DiagnosticEngine *Diags, bool IsClosing = false) { if (!CustomDelimiterLen) return true; const char *TmpPtr = BytesPtr; - for (unsigned i = 0; i < CustomDelimiterLen; i++) - if (diagnoseZeroWidth(TmpPtr, Diags) && *TmpPtr++ != '#') + while (CustomDelimiterLen--) + if (!diagnoseZeroWidthMatchAndAdvance('#', TmpPtr, Diags)) return false; BytesPtr = TmpPtr; + if (*BytesPtr == '#' && Diags) + Diags->diagnose(Lexer::getSourceLoc(BytesPtr), IsClosing ? + diag::lex_invalid_closing_delimiter : + diag::lex_invalid_escape_delimiter) + .fixItRemoveChars(Lexer::getSourceLoc(BytesPtr), + Lexer::getSourceLoc(BytesPtr + 1)); return true; } @@ -1459,18 +1469,8 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, unsigned CharValue = 0; // Escape processing. We already ate the "\". switch (*CurPtr) { - case ' ': case '\t': case '\n': case '\r': case '#': - if (*CurPtr == '#') { - if (CustomDelimiterLen) { - if (EmitDiagnostics) - diagnose(CurPtr, diag::lex_invalid_delimiter_escape) - .fixItRemoveChars(Lexer::getSourceLoc(CurPtr), - Lexer::getSourceLoc(CurPtr + 1)); - CurPtr++; - return ~1U; - } - } - else if (IsMultilineString && maybeConsumeNewlineEscape(CurPtr, 0)) + case ' ': case '\t': case '\n': case '\r': + if (IsMultilineString && maybeConsumeNewlineEscape(CurPtr, 0)) return '\n'; LLVM_FALLTHROUGH; default: // Invalid escape. @@ -1560,16 +1560,15 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, case '#': if (inStringLiteral() || - !(CustomDelimiterLen = advanceIfCustomDelimiterLen(CurPtr))) + !(CustomDelimiterLen = advanceIfCustomDelimiter(CurPtr, Diags))) continue; LLVM_FALLTHROUGH; case '"': case '\'': { if (!AllowNewline.back() && inStringLiteral()) { - unsigned InnerDelimiter = CustomDelimiter.back(); if (OpenDelimiters.back() == CurPtr[-1] && - delimiterMatches(InnerDelimiter, CurPtr, Diags)) { + delimiterMatches(CustomDelimiter.back(), CurPtr, Diags, true)) { // Closing single line string literal. OpenDelimiters.pop_back(); AllowNewline.pop_back(); @@ -1592,7 +1591,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr, // We are in multiline string literal. assert(AllowNewline.back() && "other cases must be handled above"); if (isMultilineQuote && - delimiterMatches(CustomDelimiter.back(), CurPtr, Diags)) { + delimiterMatches(CustomDelimiter.back(), CurPtr, Diags, true)) { // Close multiline string literal. OpenDelimiters.pop_back(); AllowNewline.pop_back(); @@ -1868,10 +1867,10 @@ void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) { // NOTE: We only allow single-quote string literals so we can emit useful // diagnostics about changing them to double quotes. - bool wasErroneous = false, MultilineString = false; + bool wasErroneous = false, IsMultilineString = false; // Is this the start of a multiline string literal? - if ((MultilineString = advanceIfMultilineDelimiter(CurPtr, Diags))) { + if ((IsMultilineString = advanceIfMultilineDelimiter(CurPtr, Diags))) { if (*CurPtr != '\n' && *CurPtr != '\r') diagnose(CurPtr, diag::lex_illegal_multiline_string_start) .fixItInsert(Lexer::getSourceLoc(CurPtr), "\n"); @@ -1885,7 +1884,7 @@ void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) { CurPtr = TmpPtr + 1; const char *EndPtr = skipToEndOfInterpolatedExpression(CurPtr, BufferEnd, - Diags, MultilineString); + Diags, IsMultilineString); if (*EndPtr == ')') { // Successfully scanned the body of the expression literal. @@ -1898,7 +1897,7 @@ void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) { } // String literals cannot have \n or \r in them (unless multiline). - if (((*CurPtr == '\r' || *CurPtr == '\n') && !MultilineString) + if (((*CurPtr == '\r' || *CurPtr == '\n') && !IsMultilineString) || CurPtr == BufferEnd) { TokStart -= CustomDelimiterLen; diagnose(TokStart, diag::lex_unterminated_string); @@ -1906,7 +1905,7 @@ void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) { } unsigned CharValue = lexCharacter(CurPtr, *TokStart, true, - MultilineString, CustomDelimiterLen); + IsMultilineString, CustomDelimiterLen); wasErroneous |= CharValue == ~1U; // If this is the end of string, we are done. If it is a normal character @@ -1949,15 +1948,15 @@ void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) { } // Is this the end of multiline/custom-delimited string literal? - if ((!MultilineString || advanceIfMultilineDelimiter(CurPtr, Diags)) && - delimiterMatches(CustomDelimiterLen, CurPtr, Diags)) { + if ((!IsMultilineString || advanceIfMultilineDelimiter(CurPtr, Diags)) && + delimiterMatches(CustomDelimiterLen, CurPtr, Diags, true)) { TokStart -= CustomDelimiterLen; if (wasErroneous) return formToken(tok::unknown, TokStart); formToken(tok::string_literal, TokStart, - MultilineString, CustomDelimiterLen); - if (MultilineString && Diags) + IsMultilineString, CustomDelimiterLen); + if (IsMultilineString && Diags) validateMultilineIndents(NextToken, Diags); return; } @@ -2493,7 +2492,7 @@ void Lexer::lexImpl() { case '\\': return formToken(tok::backslash, TokStart); case '#': - if (unsigned CustomDelimiterLen = advanceIfCustomDelimiterLen(CurPtr)) + if (unsigned CustomDelimiterLen = advanceIfCustomDelimiter(CurPtr, Diags)) return lexStringLiteral(CustomDelimiterLen); return lexHash(); From 3baf04177097121941c7396fe5aa0da664a8a3b5 Mon Sep 17 00:00:00 2001 From: John Holdsworth Date: Tue, 4 Sep 2018 23:03:01 +0100 Subject: [PATCH 13/15] Fix error tests --- test/Parse/raw_string_errors.swift | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/Parse/raw_string_errors.swift b/test/Parse/raw_string_errors.swift index bb83c9b24d6ce..ca19b41fe4dd2 100644 --- a/test/Parse/raw_string_errors.swift +++ b/test/Parse/raw_string_errors.swift @@ -1,11 +1,14 @@ // RUN: %target-typecheck-verify-swift #"\##("invalid")"# -// expected-error@-1{{Too many # characters in delimited escape}} +// expected-error@-1{{too many '#' characters in delimited escape}} +// expected-error@-2{{invalid escape sequence in literal}} ####"invalid"### // expected-error@-1{{unterminated string literal}} ###"invalid"#### -// expected-error@-1{{consecutive statements on a line must be separated by ';'}} -// expected-error@-2{{expected expression}} +// expected-error@-1{{too many '#' characters in closing delimiter}} +// expected-error@-2{{consecutive statements on a line must be separated by ';'}} +// expected-error@-3{{expected expression}} +// expected-warning@-4{{string literal is unused}} From f0f08e1e86d0c24921fe8ca71901a5ef6bd39811 Mon Sep 17 00:00:00 2001 From: John Holdsworth Date: Tue, 4 Sep 2018 23:26:06 +0100 Subject: [PATCH 14/15] Remove zero width detection for now --- include/swift/AST/DiagnosticsParse.def | 2 - lib/Parse/Lexer.cpp | 145 +------------------------ 2 files changed, 6 insertions(+), 141 deletions(-) diff --git a/include/swift/AST/DiagnosticsParse.def b/include/swift/AST/DiagnosticsParse.def index 2a243d0639822..10b072cde19fb 100644 --- a/include/swift/AST/DiagnosticsParse.def +++ b/include/swift/AST/DiagnosticsParse.def @@ -142,8 +142,6 @@ ERROR(lex_invalid_escape_delimiter,none, "too many '#' characters in delimited escape", ()) ERROR(lex_invalid_closing_delimiter,none, "too many '#' characters in closing delimiter", ()) -ERROR(lex_zerowidth_in_string_delimiter,none, - "zero-width character detected in string delimiter", ()) ERROR(lex_invalid_unicode_scalar,none, "invalid unicode scalar", ()) diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index 39b0bfd719d23..1967d0f4d35b7 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -1206,147 +1206,13 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) { } } -const static uint32_t ZeroWidthC[] = { - // "Zero-Width" characters which don't appear to be visible follow. - 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, - 0x000b, 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, - 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, - 0x001d, 0x001e, 0x001f, 0x007f, 0x0080, 0x0081, 0x0082, 0x0083, - 0x0084, 0x0086, 0x0087, 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, - 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, - 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, - 0x009d, 0x009e, 0x009f, 0x00ad, 0x0300, 0x0301, 0x0302, 0x0303, - 0x0304, 0x0306, 0x0307, 0x0308, 0x0309, 0x030a, 0x030b, 0x030c, - 0x030f, 0x0311, 0x031b, 0x0323, 0x0324, 0x0325, 0x0326, 0x0327, - 0x0328, 0x032d, 0x032e, 0x0330, 0x0331, 0x0332, 0x034f, 0x055f, - 0x0610, 0x0611, 0x0612, 0x0613, 0x0614, 0x0615, 0x0616, 0x0617, - 0x0618, 0x0619, 0x061a, 0x061c, 0x064b, 0x064c, 0x064d, 0x064e, - 0x064f, 0x0650, 0x0651, 0x0652, 0x0653, 0x0654, 0x0655, 0x0656, - 0x0657, 0x0658, 0x0659, 0x065a, 0x065b, 0x065c, 0x065d, 0x065e, - 0x065f, 0x0670, 0x06d6, 0x06d7, 0x06d8, 0x06d9, 0x06da, 0x06db, - 0x06dc, 0x06df, 0x06e0, 0x06e1, 0x06e2, 0x06e3, 0x06e4, 0x06e6, - 0x06e7, 0x06e8, 0x06ea, 0x06eb, 0x06ec, 0x06ed, 0x070f, 0x0711, - 0x0730, 0x0731, 0x0732, 0x0733, 0x0734, 0x0735, 0x0736, 0x0737, - 0x0738, 0x0739, 0x073a, 0x073b, 0x073c, 0x073d, 0x073e, 0x073f, - 0x0740, 0x0741, 0x0742, 0x0743, 0x0744, 0x0745, 0x0746, 0x0747, - 0x0748, 0x0749, 0x074a, 0x07a6, 0x07a7, 0x07a8, 0x07a9, 0x07aa, - 0x07ab, 0x07ac, 0x07ad, 0x07ae, 0x07af, 0x07b0, 0x07eb, 0x07ec, - 0x07ed, 0x07ee, 0x07ef, 0x07f0, 0x07f1, 0x07f2, 0x07f3, 0x0816, - 0x0817, 0x0818, 0x0819, 0x081b, 0x081c, 0x081d, 0x081e, 0x081f, - 0x0820, 0x0821, 0x0822, 0x0823, 0x0825, 0x0826, 0x0827, 0x0829, - 0x082a, 0x082b, 0x082c, 0x082d, 0x0858, 0x0859, 0x085a, 0x085b, - 0x08d5, 0x08d6, 0x08d7, 0x08d8, 0x08d9, 0x08e0, 0x08e1, 0x08e2, - 0x08e3, 0x08e4, 0x08e5, 0x08e6, 0x08e7, 0x08e8, 0x08e9, 0x08ea, - 0x08eb, 0x08ec, 0x08ed, 0x08ee, 0x08ef, 0x08f0, 0x08f1, 0x08f2, - 0x08f3, 0x08f4, 0x08f5, 0x08f6, 0x08f7, 0x08f8, 0x08f9, 0x08fb, - 0x08fc, 0x08fd, 0x08fe, 0x08ff, 0x0f18, 0x0f19, 0x0f35, 0x0f37, - 0x0f39, 0x0f72, 0x0f7a, 0x0f7b, 0x0f7c, 0x0f7d, 0x0f7e, 0x0f80, - 0x0f82, 0x0f83, 0x0f84, 0x0f86, 0x0f87, 0x0fc6, 0x115f, 0x1160, - 0x1712, 0x1713, 0x1714, 0x1732, 0x1733, 0x1752, 0x1753, 0x1772, - 0x1773, 0x17b4, 0x17b5, 0x180b, 0x180c, 0x180d, 0x180e, 0x1920, - 0x1921, 0x1922, 0x1927, 0x1928, 0x192a, 0x1932, 0x193a, 0x193b, - 0x1a17, 0x1a18, 0x1a1b, 0x1a55, 0x1a56, 0x1a59, 0x1a5a, 0x1a5b, - 0x1a5c, 0x1a5d, 0x1a5e, 0x1a60, 0x1a62, 0x1a65, 0x1a66, 0x1a67, - 0x1a68, 0x1a69, 0x1a6a, 0x1a6c, 0x1a73, 0x1a74, 0x1a75, 0x1a76, - 0x1a77, 0x1a78, 0x1a79, 0x1a7a, 0x1a7b, 0x1a7c, 0x1a7f, 0x1abe, - 0x1b80, 0x1b81, 0x1ba1, 0x1ba2, 0x1ba3, 0x1ba4, 0x1ba5, 0x1ba8, - 0x1ba9, 0x1bac, 0x1bad, 0x1be6, 0x1be8, 0x1be9, 0x1bed, 0x1bee, - 0x1bef, 0x1bf0, 0x1bf1, 0x1c2c, 0x1c2d, 0x1c2e, 0x1c2f, 0x1c30, - 0x1c31, 0x1c32, 0x1c33, 0x1c36, 0x1c37, 0x1ce1, 0x1cf2, 0x1cf3, - 0x1cf7, 0x200b, 0x200c, 0x200d, 0x200e, 0x200f, 0x202a, 0x202b, - 0x202c, 0x202d, 0x202e, 0x2060, 0x2061, 0x2062, 0x2063, 0x2064, - 0x2065, 0x2066, 0x2067, 0x2068, 0x2069, 0x206a, 0x206b, 0x206c, - 0x206d, 0x206e, 0x206f, 0x2cef, 0x2cf0, 0x2cf1, 0x2d7f, 0x3164, - 0xa6f0, 0xa6f1, 0xa802, 0xa806, 0xa80b, 0xa825, 0xa826, 0xa8b6, - 0xa8c4, 0xa948, 0xa949, 0xa94a, 0xa94b, 0xa94c, 0xa94d, 0xa94e, - 0xa94f, 0xa950, 0xa951, 0xa953, 0xa961, 0xa962, 0xa963, 0xa964, - 0xa965, 0xa966, 0xa967, 0xa968, 0xa969, 0xa96a, 0xa96b, 0xa96c, - 0xa96d, 0xa96e, 0xa96f, 0xa970, 0xa971, 0xa972, 0xa973, 0xa974, - 0xa975, 0xa976, 0xa977, 0xa978, 0xa979, 0xa97a, 0xa97b, 0xa97c, - 0xa9e5, 0xaa7b, 0xaa7c, 0xaa7d, 0xaab0, 0xaab2, 0xaab3, 0xaab4, - 0xaab7, 0xaab8, 0xaabe, 0xaabf, 0xaac1, 0xaaec, 0xaaed, 0xaaf6, - 0xabe5, 0xabe8, 0xabe9, 0xabea, 0xabed, 0xd7b1, 0xd7b2, 0xd7b3, - 0xd7b4, 0xd7b5, 0xd7b6, 0xd7b7, 0xd7b8, 0xd7b9, 0xd7ba, 0xd7bb, - 0xd7bc, 0xd7bd, 0xd7be, 0xd7bf, 0xd7c0, 0xd7c1, 0xd7c2, 0xd7c3, - 0xd7c4, 0xd7c5, 0xd7c6, 0xd7cc, 0xd7cd, 0xd7ce, 0xd7cf, 0xd7d0, - 0xd7d1, 0xd7d2, 0xd7d3, 0xd7d4, 0xd7d5, 0xd7d6, 0xd7d7, 0xd7d8, - 0xd7d9, 0xd7da, 0xd7db, 0xd7dc, 0xd7dd, 0xd7de, 0xd7df, 0xd7e0, - 0xd7e1, 0xd7e2, 0xd7e3, 0xd7e4, 0xd7e5, 0xd7e6, 0xd7e7, 0xd7e8, - 0xd7e9, 0xd7ea, 0xd7eb, 0xd7ec, 0xd7ed, 0xd7ee, 0xd7ef, 0xd7f0, - 0xd7f1, 0xd7f2, 0xd7f3, 0xd7f4, 0xd7f5, 0xd7f6, 0xd7f7, 0xd7f8, - 0xd7f9, 0xd7fa, 0xd7fb, 0xf850, 0xf85f, 0xf860, 0xf861, 0xf862, - 0xf863, 0xf864, 0xf865, 0xf866, 0xf867, 0xf868, 0xf869, 0xf86a, - 0xf86b, 0xf86c, 0xf86d, 0xf86e, 0xf86f, 0xf884, 0xf885, 0xf886, - 0xf887, 0xf888, 0xf889, 0xf88a, 0xf88b, 0xf88c, 0xf88d, 0xf88e, - 0xf88f, 0xf890, 0xf891, 0xf892, 0xf893, 0xf894, 0xf895, 0xf896, - 0xf897, 0xf898, 0xf899, 0xf89f, 0xfbb2, 0xfbb3, 0xfbb4, 0xfbb5, - 0xfbb6, 0xfbb7, 0xfbb8, 0xfbb9, 0xfbba, 0xfbbb, 0xfbbd, 0xfbbe, - 0xfbbf, 0xfbc1, 0xfc5e, 0xfc5f, 0xfc60, 0xfc61, 0xfc62, 0xfc63, - 0xfe0f, 0xfe20, 0xfe21, 0xfe22, 0xfe23, 0xfeff, 0xffa0, 0xfff0, - 0xfff1, 0xfff2, 0xfff3, 0xfff4, 0xfff5, 0xfff6, 0xfff7, 0xfff8, - 0xfff9, 0xfffa, 0xfffb, 0xfffc, 0x10a01, 0x10a02, 0x10a03, 0x10a05, - 0x10a06, 0x10a0c, 0x10a0d, 0x10a0e, 0x10a0f, 0x10a38, 0x10a39, 0x10a3a, - 0x11038, 0x11039, 0x1103a, 0x1103b, 0x1103c, 0x1103d, 0x1103e, 0x1103f, - 0x11040, 0x11041, 0x11042, 0x11043, 0x11044, 0x11045, 0x11046, 0x11080, - 0x11081, 0x110b1, 0x110b3, 0x110b4, 0x110b5, 0x110b6, 0x110b9, 0x110ba, - 0x11a01, 0x11a02, 0x11a03, 0x11a04, 0x11a05, 0x11a06, 0x11a07, 0x11a08, - 0x11a09, 0x11a0a, 0x11a33, 0x11a35, 0x11a36, 0x11a37, 0x11a38, 0x11a39, - 0x11a3b, 0x11a3c, 0x11a3d, 0x11a3e, 0x11a51, 0x11a52, 0x11a53, 0x11a54, - 0x11a55, 0x11a56, 0x11a57, 0x11a58, 0x11a59, 0x11a5a, 0x11a5b, 0x11a8a, - 0x11a8b, 0x11a8c, 0x11a8d, 0x11a8e, 0x11a8f, 0x11a90, 0x11a91, 0x11a92, - 0x11a93, 0x11a94, 0x11a95, 0x11a96, 0x11a97, 0x11a98, 0x11d31, 0x11d32, - 0x11d33, 0x11d34, 0x11d35, 0x11d36, 0x11d3a, 0x11d3c, 0x11d3d, 0x11d3f, - 0x11d40, 0x11d41, 0x11d43, 0x11d47, 0x1bc9d, 0x1bca0, 0x1bca1, 0x1bca2, - 0x1bca3, 0x1d173, 0x1d174, 0x1d175, 0x1d176, 0x1d177, 0x1d178, 0x1d179, - 0x1d17a, 0x1da00, 0x1da01, 0x1da02, 0x1da03, 0x1da04, 0x1da05, 0x1da06, - 0x1da07, 0x1da08, 0x1da09, 0x1da0a, 0x1da0b, 0x1da0c, 0x1da0d, 0x1da0e, - 0x1da0f, 0x1da10, 0x1da11, 0x1da12, 0x1da13, 0x1da14, 0x1da15, 0x1da16, - 0x1da17, 0x1da18, 0x1da19, 0x1da1a, 0x1da1b, 0x1da1c, 0x1da1d, 0x1da1e, - 0x1da1f, 0x1da20, 0x1da21, 0x1da22, 0x1da23, 0x1da24, 0x1da25, 0x1da26, - 0x1da27, 0x1da28, 0x1da29, 0x1da2a, 0x1da2b, 0x1da2c, 0x1da2d, 0x1da2e, - 0x1da2f, 0x1da30, 0x1da31, 0x1da32, 0x1da33, 0x1da34, 0x1da35, 0x1da36, - 0x1da3b, 0x1da3c, 0x1da3d, 0x1da3e, 0x1da3f, 0x1da40, 0x1da41, 0x1da42, - 0x1da43, 0x1da44, 0x1da45, 0x1da46, 0x1da47, 0x1da48, 0x1da49, 0x1da4a, - 0x1da4b, 0x1da4c, 0x1da4d, 0x1da4e, 0x1da4f, 0x1da50, 0x1da51, 0x1da52, - 0x1da53, 0x1da54, 0x1da55, 0x1da56, 0x1da57, 0x1da58, 0x1da59, 0x1da5a, - 0x1da5b, 0x1da5c, 0x1da5d, 0x1da5e, 0x1da5f, 0x1da60, 0x1da61, 0x1da62, - 0x1da63, 0x1da64, 0x1da65, 0x1da66, 0x1da67, 0x1da68, 0x1da69, 0x1da6a, - 0x1da6b, 0x1da6c, 0x1da75, 0x1da84, 0x1da9b, 0x1da9c, 0x1da9d, 0x1da9e, - 0x1da9f, 0x1daa1, 0x1daa2, 0x1daa3, 0x1daa4, 0x1daa5, 0x1daa6, 0x1daa7, - 0x1daa8, 0x1daa9, 0x1daaa, 0x1daab, 0x1daac, 0x1daad, 0x1daae, 0x1daaf, -}; - /// diagnoseZeroWidthMatchAndAdvance - Error zerowidth characters in delimiters. /// A non visible character in the middle of a delimter can be used to extend /// the literal beyond what it would appear creating potential security bugs. static bool diagnoseZeroWidthMatchAndAdvance(char Target, const char *&CurPtr, DiagnosticEngine *Diags) { - // A way needs to be found to find the complete set of zero width chars or - // this security mitigation will be in vain. Current list is generated using - // the display width of attributed strings checking when it does not change. - // The font used was SF Mono, 11pt, the default font of the Xcode editor. - static std::vector ZeroWidthV; - if (!ZeroWidthV.size()) - ZeroWidthV.assign(ZeroWidthC, - ZeroWidthC + sizeof ZeroWidthC/sizeof ZeroWidthC[0]); - - const char *TmpPtr = CurPtr; - while (true) { - uint32_t NextChar = validateUTF8CharacterAndAdvance(TmpPtr, TmpPtr + 6); - if (NextChar != (uint32_t)Target && - (NextChar == ~0U || (NextChar >= 0xe0000 && NextChar <= 0xe0fff) || - std::binary_search(ZeroWidthV.begin(), ZeroWidthV.end(), NextChar))) { - if (Diags && *TmpPtr == Target) - Diags->diagnose(Lexer::getSourceLoc(CurPtr), - diag::lex_zerowidth_in_string_delimiter) - .fixItRemoveChars(Lexer::getSourceLoc(CurPtr), - Lexer::getSourceLoc(TmpPtr)); - CurPtr = TmpPtr; - continue; - } - - return *CurPtr == Target && CurPtr++; - } + // Detect, diagnose and skip over zero-width characters here if required. + return *CurPtr == Target && CurPtr++; } /// advanceIfMultilineDelimiter - Centralized check for multiline delimiter. @@ -1461,7 +1327,8 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote, diagnose(CurPtr-1, diag::lex_unterminated_string); return ~1U; case '\\': // Escapes. - if (!delimiterMatches(CustomDelimiterLen, CurPtr, Diags)) + if (!delimiterMatches(CustomDelimiterLen, CurPtr, + EmitDiagnostics ? Diags : nullptr)) return '\\'; break; } @@ -1878,8 +1745,8 @@ void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) { while (true) { const char *TmpPtr = CurPtr + 1; - if (*CurPtr == '\\' && - delimiterMatches(CustomDelimiterLen, TmpPtr, Diags) && *TmpPtr == '(') { + if (*CurPtr == '\\' && delimiterMatches(CustomDelimiterLen, TmpPtr, nullptr) + && *TmpPtr == '(') { // Consume tokens until we hit the corresponding ')'. CurPtr = TmpPtr + 1; const char *EndPtr = From 9208c5bca64c27d02253466dd7ec2c6ddde4967f Mon Sep 17 00:00:00 2001 From: John Holdsworth Date: Thu, 6 Sep 2018 14:20:19 +0100 Subject: [PATCH 15/15] Final nits in comments. --- lib/Parse/Lexer.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp index 1967d0f4d35b7..95d45bcb6fc66 100644 --- a/lib/Parse/Lexer.cpp +++ b/lib/Parse/Lexer.cpp @@ -1206,12 +1206,13 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) { } } -/// diagnoseZeroWidthMatchAndAdvance - Error zerowidth characters in delimiters. -/// A non visible character in the middle of a delimter can be used to extend +/// diagnoseZeroWidthMatchAndAdvance - Error invisible characters in delimiters. +/// An invisible character in the middle of a delimiter can be used to extend /// the literal beyond what it would appear creating potential security bugs. static bool diagnoseZeroWidthMatchAndAdvance(char Target, const char *&CurPtr, DiagnosticEngine *Diags) { - // Detect, diagnose and skip over zero-width characters here if required. + // TODO: Detect, diagnose and skip over zero-width characters if required. + // See https://github.com/apple/swift/pull/17668 for possible implementation. return *CurPtr == Target && CurPtr++; } @@ -1244,10 +1245,10 @@ static unsigned advanceIfCustomDelimiter(const char *&CurPtr, return 0; } -/// delimiterMatches - Does custom delimiter (# characters surrounding quotes) -/// match the number of # characters after \ inside the string? This allows +/// delimiterMatches - Does custom delimiter ('#' characters surrounding quotes) +/// match the number of '#' characters after '\' inside the string? This allows /// interpolation inside a "raw" string. Normal/cooked string processing is -/// the degenerate case of there being no # characters surrounding the quotes. +/// the degenerate case of there being no '#' characters surrounding the quotes. /// If delimiter matches, advances byte pointer passed in and returns true. /// Also used to detect the final delimiter of a string when IsClosing == true. static bool delimiterMatches(unsigned CustomDelimiterLen, const char *&BytesPtr, @@ -2106,7 +2107,7 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes, if (IndentToStrip == ~0u && CustomDelimiterLen == ~0u) { IndentToStrip = CustomDelimiterLen = 0; - // restore trailing indent removal for multiline + // Restore trailing indent removal for multiline. const char *Backtrack = BytesPtr - 1; if (Backtrack[-1] == '"' && Backtrack[-2] == '"') { Backtrack -= 2; @@ -2115,7 +2116,7 @@ StringRef Lexer::getEncodedStringSegment(StringRef Bytes, IndentToStrip++; } - // restore delimiter if any + // Restore delimiter if any. while (*--Backtrack == '#') CustomDelimiterLen++; }