Revert "Refactor and cleanup Assembly Parsing / Lexing"

Reverting for unexpected crashes on various platforms. This reverts commit r272953. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@272957 91177308-0d34-0410-b5e6-96231b3b80d8
larryv · Jun 16, 2016 · afacc94 · afacc94
1 parent f3108f6
commit afacc94
Showing 9 changed files with 135 additions and 165 deletions.
diff --git a/include/llvm/MC/MCParser/AsmLexer.h b/include/llvm/MC/MCParser/AsmLexer.h
@@ -29,8 +29,7 @@ class AsmLexer : public MCAsmLexer {
 
   const char *CurPtr;
   StringRef CurBuf;
-  bool IsAtStartOfLine;
-  bool IsAtStartOfStatement;
+  bool isAtStartOfLine;
 
   void operator=(const AsmLexer&) = delete;
   AsmLexer(const AsmLexer&) = delete;
@@ -46,15 +45,17 @@ class AsmLexer : public MCAsmLexer {
   void setBuffer(StringRef Buf, const char *ptr = nullptr);
 
   StringRef LexUntilEndOfStatement() override;
+  StringRef LexUntilEndOfLine();
 
   size_t peekTokens(MutableArrayRef<AsmToken> Buf,
                     bool ShouldSkipSpace = true) override;
 
+  bool isAtStartOfComment(const char *Ptr);
+  bool isAtStatementSeparator(const char *Ptr);
+
   const MCAsmInfo &getMAI() const { return MAI; }
 
 private:
-  bool isAtStartOfComment(const char *Ptr);
-  bool isAtStatementSeparator(const char *Ptr);
   int getNextChar();
   AsmToken ReturnError(const char *Loc, const std::string &Msg);
 
@@ -66,8 +67,6 @@ class AsmLexer : public MCAsmLexer {
   AsmToken LexQuote();
   AsmToken LexFloatLiteral();
   AsmToken LexHexFloatLiteral(bool NoIntDigits);
-
-  StringRef LexUntilEndOfLine();
 };
 
 } // end namespace llvm

diff --git a/include/llvm/MC/MCParser/MCAsmLexer.h b/include/llvm/MC/MCParser/MCAsmLexer.h
@@ -39,15 +39,12 @@ class AsmToken {
     // Real values.
     Real,
 
-    // Comments
-    Comment,
-    HashDirective,
     // No-value.
     EndOfStatement,
     Colon,
     Space,
     Plus, Minus, Tilde,
-    Slash,     // '/'
+    Slash,    // '/'
     BackSlash, // '\'
     LParen, RParen, LBrac, RBrac, LCurly, RCurly,
     Star, Dot, Comma, Dollar, Equal, EqualEqual,
@@ -156,9 +153,8 @@ class MCAsmLexer {
   const AsmToken &Lex() {
     assert(!CurTok.empty());
     CurTok.erase(CurTok.begin());
-    // Always place in front as LexToken may generate multiple tokens via UnLex.
     if (CurTok.empty())
-      CurTok.insert(CurTok.begin(), LexToken());
+      CurTok.emplace_back(LexToken());
     return CurTok.front();
   }
 

diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp
@@ -23,8 +23,7 @@ using namespace llvm;
 
 AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
   CurPtr = nullptr;
-  IsAtStartOfLine = true;
-  IsAtStartOfStatement = true;
+  isAtStartOfLine = true;
   AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@");
 }
 
@@ -51,9 +50,20 @@ AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
 }
 
 int AsmLexer::getNextChar() {
-  if (CurPtr == CurBuf.end())
+  char CurChar = *CurPtr++;
+  switch (CurChar) {
+  default:
+    return (unsigned char)CurChar;
+  case 0:
+    // A nul character in the stream is either the end of the current buffer or
+    // a random nul in the file.  Disambiguate that here.
+    if (CurPtr - 1 != CurBuf.end())
+      return 0;  // Just whitespace.
+
+    // Otherwise, return end of file.
+    --CurPtr;  // Another call to lex will return EOF again.
     return EOF;
-  return (unsigned char)*CurPtr++;
+  }
 }
 
 /// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)?
@@ -159,52 +169,43 @@ AsmToken AsmLexer::LexIdentifier() {
 AsmToken AsmLexer::LexSlash() {
   switch (*CurPtr) {
   case '*':
-    IsAtStartOfStatement = false;
     break; // C style comment.
   case '/':
     ++CurPtr;
     return LexLineComment();
   default:
-    IsAtStartOfStatement = false;
-    return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
+    return AsmToken(AsmToken::Slash, StringRef(CurPtr - 1, 1));
   }
 
   // C Style comment.
   ++CurPtr;  // skip the star.
-  while (CurPtr != CurBuf.end()) {
-    switch (*CurPtr++) {
+  while (1) {
+    int CurChar = getNextChar();
+    switch (CurChar) {
+    case EOF:
+      return ReturnError(TokStart, "unterminated comment");
     case '*':
       // End of the comment?
-      if (*CurPtr != '/')
-        break;
+      if (CurPtr[0] != '/') break;
+
       ++CurPtr;   // End the */.
-      return AsmToken(AsmToken::Comment,
-                      StringRef(TokStart, CurPtr - TokStart));
+      return LexToken();
     }
   }
-  return ReturnError(TokStart, "unterminated comment");
 }
 
 /// LexLineComment: Comment: #[^\n]*
 ///                        : //[^\n]*
 AsmToken AsmLexer::LexLineComment() {
-  // Mark This as an end of statement with a body of the
-  // comment. While it would be nicer to leave this two tokens,
-  // backwards compatability with TargetParsers makes keeping this in this form
-  // better.
+  // FIXME: This is broken if we happen to a comment at the end of a file, which
+  // was .included, and which doesn't end with a newline.
   int CurChar = getNextChar();
   while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
     CurChar = getNextChar();
 
-  IsAtStartOfLine = true;
-  // Whis is a whole line comment. leave newline
-  if (IsAtStartOfStatement)
-    return AsmToken(AsmToken::EndOfStatement,
-                    StringRef(TokStart, CurPtr - TokStart));
-  IsAtStartOfStatement = true;
-
-  return AsmToken(AsmToken::EndOfStatement,
-                  StringRef(TokStart, CurPtr - 1 - TokStart));
+  if (CurChar == EOF)
+    return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
+  return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
 }
 
 static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
@@ -422,7 +423,8 @@ StringRef AsmLexer::LexUntilEndOfStatement() {
 
   while (!isAtStartOfComment(CurPtr) &&     // Start of line comment.
          !isAtStatementSeparator(CurPtr) && // End of statement marker.
-         *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
+         *CurPtr != '\n' && *CurPtr != '\r' &&
+         (*CurPtr != 0 || CurPtr != CurBuf.end())) {
     ++CurPtr;
   }
   return StringRef(TokStart, CurPtr-TokStart);
@@ -431,7 +433,8 @@ StringRef AsmLexer::LexUntilEndOfStatement() {
 StringRef AsmLexer::LexUntilEndOfLine() {
   TokStart = CurPtr;
 
-  while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
+  while (*CurPtr != '\n' && *CurPtr != '\r' &&
+         (*CurPtr != 0 || CurPtr != CurBuf.end())) {
     ++CurPtr;
   }
   return StringRef(TokStart, CurPtr-TokStart);
@@ -441,8 +444,7 @@ size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
                             bool ShouldSkipSpace) {
   const char *SavedTokStart = TokStart;
   const char *SavedCurPtr = CurPtr;
-  bool SavedAtStartOfLine = IsAtStartOfLine;
-  bool SavedAtStartOfStatement = IsAtStartOfStatement;
+  bool SavedAtStartOfLine = isAtStartOfLine;
   bool SavedSkipSpace = SkipSpace;
 
   std::string SavedErr = getErr();
@@ -463,8 +465,7 @@ size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
   SetError(SavedErrLoc, SavedErr);
 
   SkipSpace = SavedSkipSpace;
-  IsAtStartOfLine = SavedAtStartOfLine;
-  IsAtStartOfStatement = SavedAtStartOfStatement;
+  isAtStartOfLine = SavedAtStartOfLine;
   CurPtr = SavedCurPtr;
   TokStart = SavedTokStart;
 
@@ -494,45 +495,29 @@ AsmToken AsmLexer::LexToken() {
   // This always consumes at least one character.
   int CurChar = getNextChar();
 
-  if (CurChar == '#' && IsAtStartOfStatement) {
-    // If this starts with a '#', this may be a cpp
-    // hash directive and otherwise a line comment.
-    AsmToken TokenBuf[2];
-    MutableArrayRef<AsmToken> Buf(TokenBuf, 2);
-    size_t num = peekTokens(Buf, true);
-    // There cannot be a space preceeding this
-    if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(AsmToken::Integer) &&
-        TokenBuf[1].is(AsmToken::String)) {
-      CurPtr = TokStart; // reset curPtr;
-      StringRef s = LexUntilEndOfLine();
-      UnLex(TokenBuf[1]);
-      UnLex(TokenBuf[0]);
-      return AsmToken(AsmToken::HashDirective, s);
-    }
+  if (isAtStartOfComment(TokStart)) {
+    // If this comment starts with a '#', then return the Hash token and let
+    // the assembler parser see if it can be parsed as a cpp line filename
+    // comment. We do this only if we are at the start of a line.
+    if (CurChar == '#' && isAtStartOfLine)
+      return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
+    isAtStartOfLine = true;
     return LexLineComment();
   }
-
-  if (isAtStartOfComment(TokStart))
-    return LexLineComment();
-
   if (isAtStatementSeparator(TokStart)) {
     CurPtr += strlen(MAI.getSeparatorString()) - 1;
-    IsAtStartOfLine = true;
-    IsAtStartOfStatement = true;
     return AsmToken(AsmToken::EndOfStatement,
                     StringRef(TokStart, strlen(MAI.getSeparatorString())));
   }
 
   // If we're missing a newline at EOF, make sure we still get an
   // EndOfStatement token before the Eof token.
-  if (CurChar == EOF && !IsAtStartOfStatement) {
-    IsAtStartOfLine = true;
-    IsAtStartOfStatement = true;
+  if (CurChar == EOF && !isAtStartOfLine) {
+    isAtStartOfLine = true;
     return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
   }
-  IsAtStartOfLine = false;
-  bool OldIsAtStartOfStatement = IsAtStartOfStatement;
-  IsAtStartOfStatement = false;
+
+  isAtStartOfLine = false;
   switch (CurChar) {
   default:
     // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
@@ -541,24 +526,24 @@ AsmToken AsmLexer::LexToken() {
 
     // Unknown character, emit an error.
     return ReturnError(TokStart, "invalid character in input");
-  case EOF:
-    IsAtStartOfLine = true;
-    IsAtStartOfStatement = true;
-    return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
+  case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
   case 0:
   case ' ':
   case '\t':
-    IsAtStartOfStatement = OldIsAtStartOfStatement;
-    while (*CurPtr == ' ' || *CurPtr == '\t')
-      CurPtr++;
-    if (SkipSpace)
-      return LexToken(); // Ignore whitespace.
-    else
-      return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart));
-  case '\n':
+    if (SkipSpace) {
+      // Ignore whitespace.
+      return LexToken();
+    } else {
+      int len = 1;
+      while (*CurPtr==' ' || *CurPtr=='\t') {
+        CurPtr++;
+        len++;
+      }
+      return AsmToken(AsmToken::Space, StringRef(TokStart, len));
+    }
+  case '\n': // FALL THROUGH.
   case '\r':
-    IsAtStartOfLine = true;
-    IsAtStartOfStatement = true;
+    isAtStartOfLine = true;
     return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
   case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
   case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
@@ -601,9 +586,7 @@ AsmToken AsmLexer::LexToken() {
     }
     return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
   case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
-  case '/':
-    IsAtStartOfStatement = OldIsAtStartOfStatement;
-    return LexSlash();
+  case '/': return LexSlash();
   case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
   case '\'': return LexSingleQuote();
   case '"': return LexQuote();