Skip to content

Commit

Permalink
Preserve whitespace and comments during lexing as Trivia
Browse files Browse the repository at this point in the history
Store leading and trailing "trivia" around a token, such as whitespace,
comments, doc comments, and escaping backticks. These are syntactically
important for preserving formatting when printing ASTs but don't
semantically affect the program.

Tokens take all trailing trivia up to, but not including, the next
newline. This is important to maintain checks that statements without
semicolon separators start on a new line, among other things.

Trivia are now data attached to the ends of tokens, not tokens
themselves.

Create a new Syntax sublibrary for upcoming immutable, persistent,
thread-safe ASTs, which will contain only the syntactic information
about source structure, as well as for generating new source code, and
structural editing. Proactively move swift::Token into there.

Since this patch is getting a bit large, a token fuzzer which checks
for round-trip equivalence with the workflow:

fuzzer => token stream => file1
  => Lexer => token stream => file 2 => diff(file1, file2)

Will arrive in a subsequent commit.

This patch does not change the grammar.
  • Loading branch information
bitjammer committed Nov 16, 2016
1 parent 6a9298c commit d6e2b58
Show file tree
Hide file tree
Showing 56 changed files with 1,817 additions and 1,049 deletions.
2 changes: 0 additions & 2 deletions include/swift/AST/ASTContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -783,8 +783,6 @@ class ASTContext {

private:
friend class Decl;
Optional<RawComment> getRawComment(const Decl *D);
void setRawComment(const Decl *D, RawComment RC);

Optional<StringRef> getBriefComment(const Decl *D);
void setBriefComment(const Decl *D, StringRef Comment);
Expand Down
9 changes: 5 additions & 4 deletions include/swift/AST/Attr.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "swift/AST/KnownProtocols.h"
#include "swift/AST/Ownership.h"
#include "swift/AST/PlatformKind.h"
#include "swift/AST/RawComment.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ErrorHandling.h"
Expand Down Expand Up @@ -958,15 +959,15 @@ class OwnershipAttr : public DeclAttribute {
class RawDocCommentAttr : public DeclAttribute {
/// Source range of the attached comment. This comment is located before
/// the declaration.
CharSourceRange CommentRange;
const RawComment Comment;

public:
RawDocCommentAttr(CharSourceRange CommentRange)
RawDocCommentAttr(RawComment Comment)
: DeclAttribute(DAK_RawDocComment, SourceLoc(), SourceRange(),
/*Implicit=*/false),
CommentRange(CommentRange) {}
Comment(Comment) {}

CharSourceRange getCommentRange() const { return CommentRange; }
const RawComment &getComment() const { return Comment; }

static bool classof(const DeclAttribute *DA) {
return DA->getKind() == DAK_RawDocComment;
Expand Down
94 changes: 94 additions & 0 deletions include/swift/Basic/String.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
//===--- String.h - String storage ------------------------------*- C++ -*-===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2016 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
//
// This file defines the 'String' storage wrapper, which can hold its own
// unique copy of a string, or merely hold a reference to some point in a
// source buffer, which is assumed to live at least as long as a value of
// this type.
//
//===----------------------------------------------------------------------===//
#ifndef SWIFT_BASIC_STRING_H
#define SWIFT_BASIC_STRING_H

#include "llvm/ADT/IntrusiveRefCntPtr.h"
#include "llvm/ADT/StringRef.h"

#include <cstddef>
#include <cstdlib>
#include <cstring>

using llvm::StringRef;

namespace swift {

/// A string that either owns a private heap copy of its characters
/// ("managed") or merely refers to characters stored elsewhere ("unmanaged").
///
/// Managed storage is allocated with malloc() and released with free() by the
/// destructor. Unmanaged storage is never freed by this class, so it must
/// outlive every String that refers to it.
///
/// Copying a managed String duplicates its buffer; copying an unmanaged
/// String copies only the pointer and length.
class String {
  const char *Data;
  size_t Length;
  bool Managed;

  /// Return a malloc'ed copy of the \p Size bytes starting at \p Source.
  static const char *copyBuffer(const char *Source, size_t Size) {
    auto *Buffer = static_cast<char *>(malloc(Size));
    // A zero-length request may yield a null buffer, and Source may itself be
    // null; there is nothing to copy in that case.
    if (Size != 0)
      memcpy(Buffer, Source, Size);
    return Buffer;
  }

  /// Free the buffer if this value owns it. The fields are left untouched, so
  /// the caller must overwrite them (or be the destructor).
  void release() {
    if (Managed)
      free(const_cast<char *>(Data));
  }

  /// Reset to the empty, unmanaged state without freeing anything.
  void forget() {
    Data = nullptr;
    Length = 0;
    Managed = false;
  }

public:
  /// Construct an empty, unmanaged string.
  String() : Data(nullptr), Length(0), Managed(false) {}

  /// Wrap \p Length bytes at \p Data. If \p Managed is true, this value takes
  /// ownership of \p Data and will pass it to free() on destruction.
  String(const char *Data, size_t Length, bool Managed)
      : Data(Data), Length(Length), Managed(Managed) {}

  /// Wrap the bytes of \p Str. If \p IsManaged is true, this value takes
  /// ownership of Str.data() and will pass it to free() on destruction.
  String(StringRef Str, bool IsManaged = false)
      : String(Str.data(), Str.size(), IsManaged) {}

  /// Copy \p Other, duplicating its buffer when it is managed.
  String(const String &Other)
      : Data(Other.Managed ? copyBuffer(Other.Data, Other.Length)
                           : Other.Data),
        Length(Other.Length), Managed(Other.Managed) {}

  /// Take over the contents of \p Other, leaving it empty and unmanaged.
  String(String &&Other) noexcept
      : Data(Other.Data), Length(Other.Length), Managed(Other.Managed) {
    Other.forget();
  }

  /// Replace the contents with a copy of \p Other.
  ///
  /// Without this operator the implicit member-wise assignment would leak the
  /// buffer currently owned and leave two managed values freeing the same
  /// pointer.
  String &operator=(const String &Other) {
    if (this != &Other) {
      // Duplicate first so that the old buffer is only dropped once the new
      // contents are in hand.
      const char *NewData =
          Other.Managed ? copyBuffer(Other.Data, Other.Length) : Other.Data;
      release();
      Data = NewData;
      Length = Other.Length;
      Managed = Other.Managed;
    }
    return *this;
  }

  /// Replace the contents with those of \p Other, leaving it empty and
  /// unmanaged.
  String &operator=(String &&Other) noexcept {
    if (this != &Other) {
      release();
      Data = Other.Data;
      Length = Other.Length;
      Managed = Other.Managed;
      Other.forget();
    }
    return *this;
  }

  /// Create a managed string holding its own copy of \p Length bytes at
  /// \p Str.
  static String createManaged(const char *Str, size_t Length) {
    return String { copyBuffer(Str, Length), Length, /* Managed */ true };
  }

  /// Create a managed string holding its own copy of \p Str.
  static String createManaged(StringRef Str) {
    return createManaged(Str.data(), Str.size());
  }

  /// Create an unmanaged string referring to the bytes of \p Str.
  static String createUnmanaged(StringRef Str) {
    return String { Str, /* Managed */ false };
  }

  /// The number of bytes in the string.
  size_t size() const {
    return Length;
  }

  /// True if the string has no bytes.
  bool empty() const {
    return Length == 0;
  }

  /// A non-owning view of the bytes; valid only while this value (or, for
  /// unmanaged strings, the underlying storage) is alive.
  StringRef str() const {
    return StringRef { Data, Length };
  }

  /// Compare by contents, regardless of whether either side is managed.
  bool operator==(const String &Right) const {
    return str() == Right.str();
  }

  /// Free the buffer if it is managed.
  ~String() {
    release();
  }
};

} // end namespace swift

#endif // SWIFT_BASIC_STRING_H

6 changes: 6 additions & 0 deletions include/swift/IDE/SyntaxModel.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ namespace swift {
class ModuleDecl;
class SourceFile;

namespace syntax {
class Trivia;
}

namespace ide {

enum class SyntaxNodeKind : uint8_t {
Expand Down Expand Up @@ -182,6 +186,8 @@ class SyntaxModelContext {
struct Implementation;
Implementation &Impl;

void addTrivia(const syntax::Trivia &T, std::vector<SyntaxNode> &Nodes);

public:
explicit SyntaxModelContext(SourceFile &SrcFile);
~SyntaxModelContext();
Expand Down
42 changes: 30 additions & 12 deletions include/swift/Parse/Lexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,13 @@

#include "swift/Basic/SourceLoc.h"
#include "swift/Basic/SourceManager.h"
#include "swift/Parse/Token.h"
#include "swift/Syntax/Token.h"
#include "swift/Syntax/Syntax.h"
#include "swift/AST/DiagnosticEngine.h"
#include "llvm/ADT/SmallVector.h"

#include <deque>

namespace swift {
class DiagnosticEngine;
class InFlightDiagnostic;
Expand Down Expand Up @@ -88,7 +91,7 @@ class Lexer {

/// @}

Token NextToken;
syntax::Token NextToken;

/// \brief This is true if we're lexing a .sil file instead of a .swift
/// file. This enables the 'sil' keyword.
Expand All @@ -99,6 +102,13 @@ class Lexer {
/// InSILBody - This is true when we're lexing the body of a SIL declaration
/// in a SIL file. This enables some context-sensitive lexing.
bool InSILBody = false;

/// The source trivia leading up to the current token.
std::deque<syntax::Trivia> LeadingTrivia;

/// The source trivia after the current token, up to but not including the
/// first newline after the token.
std::deque<syntax::Trivia> TrailingTrivia;

public:
/// \brief Lexer state can be saved/restored to/from objects of this class.
Expand Down Expand Up @@ -192,10 +202,11 @@ class Lexer {
return CodeCompletionPtr != nullptr;
}

void lex(Token &Result) {
Result = NextToken;
syntax::Token lex() {
auto Result = NextToken;
if (Result.isNot(tok::eof))
lexImpl();
return Result;
}

bool isKeepingComments() const {
Expand All @@ -206,7 +217,7 @@ class Lexer {

/// peekNextToken - Return the next token to be returned by Lex without
/// actually lexing it.
const Token &peekNextToken() const { return NextToken; }
const syntax::Token &peekNextToken() const { return NextToken; }

/// \brief Returns the lexer state for the beginning of the given token
/// location. After restoring the state, lexer will return this token and
Expand All @@ -216,11 +227,11 @@ class Lexer {
/// \brief Returns the lexer state for the beginning of the given token.
/// After restoring the state, lexer will return this token and continue from
/// there.
State getStateForBeginningOfToken(const Token &Tok) const {
State getStateForBeginningOfToken(syntax::Token Tok) const {
// If the token has a comment attached to it, rewind to before the comment,
// not just the start of the token. This ensures that we will re-lex and
// reattach the comment to the token if rewound to this state.
SourceLoc TokStart = Tok.getCommentStart();
auto TokStart = Tok.getAbsoluteTriviaStart();
if (TokStart.isInvalid())
TokStart = Tok.getLoc();
return getStateForBeginningOfTokenLoc(TokStart);
Expand Down Expand Up @@ -256,8 +267,8 @@ class Lexer {
/// resides.
///
/// \param Loc The source location of the beginning of a token.
static Token getTokenAtLocation(const SourceManager &SM, SourceLoc Loc);

static Optional<syntax::Token>
getTokenAtLocation(const SourceManager &SM, SourceLoc Loc);

/// \brief Retrieve the source location that points just past the
/// end of the token referred to by \c Loc.
Expand Down Expand Up @@ -368,11 +379,11 @@ class Lexer {
/// \brief Given a string literal token, separate it into string/expr segments
/// of a potentially interpolated string.
static void getStringLiteralSegments(
const Token &Str,
const syntax::Token &Str,
SmallVectorImpl<StringSegment> &Segments,
DiagnosticEngine *Diags);

void getStringLiteralSegments(const Token &Str,
void getStringLiteralSegments(const syntax::Token &Str,
SmallVectorImpl<StringSegment> &Segments) {
return getStringLiteralSegments(Str, Segments, Diags);
}
Expand All @@ -382,7 +393,7 @@ class Lexer {
}

/// Get the token that starts at the given location.
Token getTokenAt(SourceLoc Loc);
syntax::Token getTokenAt(SourceLoc Loc);

/// SILBodyRAII - This helper class is used when parsing a SIL body to inform
/// the lexer that SIL-specific lexing should be enabled.
Expand Down Expand Up @@ -426,6 +437,7 @@ class Lexer {

void formToken(tok Kind, const char *TokStart);

void skipUpToEndOfLine();
void skipToEndOfLine();

/// Skip to the end of the line of a // comment.
Expand All @@ -441,6 +453,12 @@ class Lexer {
void lexOperatorIdentifier();
void lexHexNumber();
void lexNumber();
void lexTrivia(std::deque<syntax::Trivia> &T, bool StopAtFirstNewline = false);
Optional<syntax::Trivia> lexWhitespace(bool StopAtFirstNewline);
Optional<syntax::Trivia> lexComment();
Optional<syntax::Trivia> lexSingleLineComment(syntax::TriviaKind Kind);
Optional<syntax::Trivia> lexBlockComment(syntax::TriviaKind Kind);
Optional<syntax::Trivia> lexDocComment();
static unsigned lexUnicodeEscape(const char *&CurPtr, Lexer *Diags);

unsigned lexCharacter(const char *&CurPtr,
Expand Down
Loading

0 comments on commit d6e2b58

Please sign in to comment.