Skip to content

Commit

Permalink
[Lexer] Finding beginning of token with escaped new line
Browse files Browse the repository at this point in the history
Summary:
Lexer::GetBeginningOfToken produced an invalid location when
backtracking across escaped new lines.

This fixes PR26228

Reviewers: akyrtzi, alexfh, rsmith, doug.gregor

Reviewed By: alexfh

Subscribers: alexfh, cfe-commits

Patch by Paweł Żukowski!

Differential Revision: https://reviews.llvm.org/D30748

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@310576 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
alexfh committed Aug 10, 2017
1 parent 3685dba commit a2fc7db
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 28 deletions.
4 changes: 4 additions & 0 deletions include/clang/Lex/Lexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,10 @@ class Lexer : public PreprocessorLexer {
/// \brief Returns true if the given character could appear in an identifier.
static bool isIdentifierBodyChar(char c, const LangOptions &LangOpts);

/// \brief Checks whether the new line pointed to by \p Str is escaped, i.e.
/// preceded by a backslash that may be followed by horizontal whitespace.
///
/// \p Str must point at a vertical whitespace character. When it is the
/// second character of a CR LF or LF CR pair, the pair is treated as a
/// single new line. \p BufferStart marks the beginning of the buffer; the
/// backward scan stops there.
static bool isNewLineEscaped(const char *BufferStart, const char *Str);

/// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
/// emit a warning.
static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
Expand Down
72 changes: 44 additions & 28 deletions lib/Lex/Lexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -463,19 +463,15 @@ static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
const char *BufStart = Buffer.data();
if (Offset >= Buffer.size())
return nullptr;
const char *StrData = BufStart + Offset;

if (StrData[0] == '\n' || StrData[0] == '\r')
return StrData;

const char *LexStart = StrData;
while (LexStart != BufStart) {
if (LexStart[0] == '\n' || LexStart[0] == '\r') {
const char *LexStart = BufStart + Offset;
for (; LexStart != BufStart; --LexStart) {
if (isVerticalWhitespace(LexStart[0]) &&
!Lexer::isNewLineEscaped(BufStart, LexStart)) {
// LexStart should point at first character of logical line.
++LexStart;
break;
}

--LexStart;
}
return LexStart;
}
Expand All @@ -487,7 +483,7 @@ static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
if (LocInfo.first.isInvalid())
return Loc;

bool Invalid = false;
StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
if (Invalid)
Expand All @@ -499,52 +495,52 @@ static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
if (!LexStart || LexStart == StrData)
return Loc;

// Create a lexer starting at the beginning of this token.
SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
Buffer.end());
TheLexer.SetCommentRetentionState(true);

// Lex tokens until we find the token that contains the source location.
Token TheTok;
do {
TheLexer.LexFromRawLexer(TheTok);

if (TheLexer.getBufferLocation() > StrData) {
// Lexing this token has taken the lexer past the source location we're
// looking for. If the current token encompasses our source location,
// return the beginning of that token.
if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
return TheTok.getLocation();

// We ended up skipping over the source location entirely, which means
// that it points into whitespace. We're done here.
break;
}
} while (TheTok.getKind() != tok::eof);

// We've passed our source location; just return the original source location.
return Loc;
}

/// Returns the location of the beginning of the token that contains \p Loc.
///
/// File locations are resolved directly by getBeginningOfFileToken. Macro
/// locations that are not macro-argument expansions are returned unchanged.
/// For a macro-argument expansion, the beginning of the token is looked up at
/// the spelling location, and the difference between the two file offsets is
/// then applied to \p Loc itself.
SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
const SourceManager &SM,
const LangOptions &LangOpts) {
if (Loc.isFileID())
return getBeginningOfFileToken(Loc, SM, LangOpts);
if (!SM.isMacroArgExpansion(Loc))
return Loc;
// NOTE(review): from here on each statement group appears twice; this looks
// like the old and the re-formatted copy of the same code shown side by side
// by the diff rendering -- confirm which copy is meant to remain.
if (Loc.isFileID())
return getBeginningOfFileToken(Loc, SM, LangOpts);

if (!SM.isMacroArgExpansion(Loc))
return Loc;

SourceLocation FileLoc = SM.getSpellingLoc(Loc);
SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
std::pair<FileID, unsigned> BeginFileLocInfo
= SM.getDecomposedLoc(BeginFileLoc);
// The token start must lie in the same file as, and not after, the spelling
// location it was derived from.
assert(FileLocInfo.first == BeginFileLocInfo.first &&
FileLocInfo.second >= BeginFileLocInfo.second);
return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
SourceLocation FileLoc = SM.getSpellingLoc(Loc);
SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
std::pair<FileID, unsigned> BeginFileLocInfo =
SM.getDecomposedLoc(BeginFileLoc);
assert(FileLocInfo.first == BeginFileLocInfo.first &&
FileLocInfo.second >= BeginFileLocInfo.second);
return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
}

namespace {
Expand Down Expand Up @@ -1032,6 +1028,26 @@ bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
return isIdentifierBody(c, LangOpts.DollarIdents);
}

/// Checks whether the new line that \p Str points at is escaped, i.e. whether
/// it is preceded by a backslash that may be followed by horizontal
/// whitespace.
///
/// \param BufferStart First character of the buffer; the backward scan never
///        moves before it.
/// \param Str Pointer to a vertical whitespace character inside the buffer.
///        If it is the second character of a CR LF or LF CR pair, the pair
///        is treated as a single new line.
/// \returns true if the character found by skipping horizontal whitespace
///          backwards from the new line is a backslash.
bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
  assert(isVerticalWhitespace(Str[0]));
  // A new line at the very start of the buffer has nothing in front of it.
  // Compare the pointers directly instead of forming `Str - 1`, which would
  // already point before the buffer.
  if (Str <= BufferStart)
    return false;

  if ((Str[0] == '\n' && Str[-1] == '\r') ||
      (Str[0] == '\r' && Str[-1] == '\n')) {
    // Two-character line ending: one more character is needed in front of
    // the pair, so Str must be at least two characters into the buffer.
    if (Str - BufferStart < 2)
      return false;
    --Str;
  }
  --Str;

  // Rewind to the first non-space character.
  while (Str > BufferStart && isHorizontalWhitespace(*Str))
    --Str;

  return *Str == '\\';
}

StringRef Lexer::getIndentationForLine(SourceLocation Loc,
const SourceManager &SM) {
if (Loc.isInvalid() || Loc.isMacroID())
Expand Down
53 changes: 53 additions & 0 deletions unittests/Lex/LexerTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -420,4 +420,57 @@ TEST_F(LexerTest, DontOverallocateStringifyArgs) {
#endif
}

// Verifies Lexer::isNewLineEscaped for new lines that are, and are not,
// preceded by a backslash.
TEST_F(LexerTest, IsNewLineEscapedValid) {
  // Uses the whole literal as the buffer and queries its last character.
  auto lastCharIsEscapedNewLine = [](const char *Text) -> bool {
    const char *LastChar = Text + strlen(Text) - 1;
    return Lexer::isNewLineEscaped(Text, LastChar);
  };

  // A backslash directly before the line ending, or separated from it only
  // by horizontal whitespace, escapes the new line.
  EXPECT_TRUE(lastCharIsEscapedNewLine("\\\r"));
  EXPECT_TRUE(lastCharIsEscapedNewLine("\\\n"));
  EXPECT_TRUE(lastCharIsEscapedNewLine("\\\r\n"));
  EXPECT_TRUE(lastCharIsEscapedNewLine("\\\n\r"));
  EXPECT_TRUE(lastCharIsEscapedNewLine("\\ \t\v\f\r"));
  EXPECT_TRUE(lastCharIsEscapedNewLine("\\ \t\v\f\r\n"));

  // A second line ending after the escaped one, or a line ending with no
  // backslash in front of it, is not escaped.
  EXPECT_FALSE(lastCharIsEscapedNewLine("\\\r\r"));
  EXPECT_FALSE(lastCharIsEscapedNewLine("\\\r\r\n"));
  EXPECT_FALSE(lastCharIsEscapedNewLine("\\\n\n"));
  EXPECT_FALSE(lastCharIsEscapedNewLine("\r"));
  EXPECT_FALSE(lastCharIsEscapedNewLine("\n"));
  EXPECT_FALSE(lastCharIsEscapedNewLine("\r\n"));
  EXPECT_FALSE(lastCharIsEscapedNewLine("\n\r"));
  EXPECT_FALSE(lastCharIsEscapedNewLine("\r\r"));
  EXPECT_FALSE(lastCharIsEscapedNewLine("\n\n"));
}

// Verifies that Lexer::GetBeginningOfToken maps every position inside an
// identifier that contains an escaped new line back to the identifier start.
TEST_F(LexerTest, GetBeginningOfTokenWithEscapedNewLine) {
  // Every identifier below spans exactly eight characters (escaped new lines
  // included), so the same offset range can be probed for each token.
  const unsigned IdentifierLength = 8;
  std::string TextToLex = "rabarbar\n"
                          "foo\\\nbar\n"
                          "foo\\\rbar\n"
                          "fo\\\r\nbar\n"
                          "foo\\\n\rba\n";
  // Five identifier tokens are expected: count/value construction.
  std::vector<tok::TokenKind> ExpectedTokens(5, tok::identifier);
  std::vector<Token> LexedTokens = CheckLex(TextToLex, ExpectedTokens);

  for (const Token &Tok : LexedTokens) {
    const SourceLocation TokenStart = Tok.getLocation();
    const unsigned ExpectedOffset =
        SourceMgr.getDecomposedLoc(TokenStart).second;

    for (unsigned Delta = 0; Delta != IdentifierLength; ++Delta) {
      // Ask for the token start from a position Delta characters into it.
      const SourceLocation Begin = Lexer::GetBeginningOfToken(
          TokenStart.getLocWithOffset(Delta), SourceMgr, LangOpts);

      // The answer must match the token location reported by the lexer.
      EXPECT_EQ(SourceMgr.getDecomposedExpansionLoc(Begin).second,
                ExpectedOffset);
    }
  }
}

} // anonymous namespace

0 comments on commit a2fc7db

Please sign in to comment.