Skip to content

Commit

Permalink
FilePiece now falls back to read automatically.
Browse files Browse the repository at this point in the history
git-svn-id: file:///dev/shm/somefilter.svn@314 e102df66-1e2e-11dd-9b44-c24451a4db5e
  • Loading branch information
kpu committed Sep 12, 2010
1 parent a01eb40 commit 892df48
Show file tree
Hide file tree
Showing 5 changed files with 144 additions and 34 deletions.
91 changes: 78 additions & 13 deletions util/file_piece.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@

#include "util/exception.hh"

#include <iostream>
#include <string>
#include <limits>

#include <assert.h>
#include <cstdlib>
#include <ctype.h>
#include <fcntl.h>
#include <string.h>
Expand All @@ -29,22 +32,29 @@ int OpenOrThrow(const char *name) {
if (ret == -1) UTIL_THROW(ErrnoException, "in open (" << name << ") for reading.");
return ret;
}
const off_t kBadSize = std::numeric_limits<off_t>::max();

off_t SizeOrThrow(int fd, const char *name) {
off_t SizeFile(int fd) {
struct stat sb;
if (fstat(fd, &sb) == -1) UTIL_THROW(ErrnoException, "in stat " << name);
if (fstat(fd, &sb) == -1) return kBadSize;
return sb.st_size;
}
} // namespace

FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) :
file_(OpenOrThrow(name)), total_size_(SizeOrThrow(file_.get(), name)), page_(sysconf(_SC_PAGE_SIZE)),
progress_(show_progress, std::string("Reading ") + name, total_size_) {

file_(OpenOrThrow(name)), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)),
progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) {
if (total_size_ == kBadSize) {
fallback_to_read_ = true;
if (show_progress)
*show_progress << "Couldn't stat " << name << ". Using slower read() instead of mmap(). No progress bar." << std::endl;
} else {
fallback_to_read_ = false;
}
default_map_size_ = page_ * std::max<off_t>((min_buffer / page_ + 1), 2);
position_ = NULL;
position_end_ = NULL;
mapped_offset_ = data_.begin() - position_end_;
mapped_offset_ = 0;
at_end_ = false;
Shift();
}
Expand All @@ -53,6 +63,7 @@ float FilePiece::ReadFloat() throw(EndOfFileException, ParseNumberException) {
SkipSpaces();
while (last_space_ < position_) {
if (at_end_) {
// Hallucinate a null off the end of the file.
std::string buffer(position_, position_end_);
char *end;
float ret = std::strtof(buffer.c_str(), &end);
Expand Down Expand Up @@ -113,28 +124,82 @@ void FilePiece::Shift() throw(EndOfFileException) {
if (at_end_) throw EndOfFileException();
off_t desired_begin = position_ - data_.begin() + mapped_offset_;
progress_.Set(desired_begin);

if (!fallback_to_read_) MMapShift(desired_begin);
// Notice an mmap failure might set the fallback.
if (fallback_to_read_) ReadShift(desired_begin);

for (last_space_ = position_end_ - 1; last_space_ >= position_; --last_space_) {
if (isspace(*last_space_)) break;
}
}

void FilePiece::MMapShift(off_t desired_begin) throw() {
// Use mmap.
off_t ignore = desired_begin % page_;
// Duplicate request for Shift means give more data.
if (position_ == data_.begin() + ignore) {
default_map_size_ *= 2;
}
mapped_offset_ = desired_begin - ignore;
// Local version so that in case of failure it doesn't overwrite the class variable.
off_t mapped_offset = desired_begin - ignore;

off_t mapped_size;
if (default_map_size_ >= total_size_ - mapped_offset_) {
if (default_map_size_ >= total_size_ - mapped_offset) {
at_end_ = true;
mapped_size = total_size_ - mapped_offset_;
mapped_size = total_size_ - mapped_offset;
} else {
mapped_size = default_map_size_;
}

// Forcibly clear the existing mmap first.
data_.reset();
data_.reset(mmap(NULL, mapped_size, PROT_READ, MAP_PRIVATE, *file_, mapped_offset_), mapped_size);
if (data_.get() == MAP_FAILED) UTIL_THROW(ErrnoException, "mmap language model file for reading")
data_.reset(mmap(NULL, mapped_size, PROT_READ, MAP_PRIVATE, *file_, mapped_offset), mapped_size, scoped_memory::MMAP_ALLOCATED);
if (data_.get() == MAP_FAILED) {
fallback_to_read_ = true;
return;
}
mapped_offset_ = mapped_offset;
position_ = data_.begin() + ignore;
position_end_ = data_.begin() + mapped_size;
for (last_space_ = position_end_ - 1; last_space_ >= position_; --last_space_) {
if (isspace(*last_space_)) break;
}

// Backend to Shift() used when mmap() is not available: maintains a malloc()ed
// buffer of default_map_size_ bytes and refills it with read().
//
// desired_begin is the file offset of the first byte that has not been
// consumed yet, as computed by Shift().
//
// Buffer invariants while in read mode:
//   [data_.begin(), position_)     already consumed
//   [position_, position_end_)     read but not yet consumed
//   mapped_offset_                 file offset of data_.begin()
//
// Sets at_end_ when read() reports end of file.
//
// NOTE(review): the function is declared throw() yet reports malloc, realloc,
// lseek and read failures through UTIL_THROW.  An exception escaping a throw()
// function does not reach the caller; the exception specification here and in
// the header declaration should be revisited together.
void FilePiece::ReadShift(off_t desired_begin) throw() {
  assert(fallback_to_read_);
  if (data_.source() != scoped_memory::MALLOC_ALLOCATED) {
    // First call: replace whatever was held before (nothing, or an mmap) with
    // a heap buffer.
    data_.reset();
    data_.reset(malloc(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED);
    if (!data_.get()) UTIL_THROW(ErrnoException, "malloc failed for " << default_map_size_);
    // mmap() does not advance the descriptor's file offset, so after falling
    // back in the middle of a file read() would restart from the beginning.
    // Seek to the first unconsumed byte.  Skipped for offset 0 so unseekable
    // inputs that never mapped successfully keep working.
    if (desired_begin != 0 && lseek(file_.get(), desired_begin, SEEK_SET) == static_cast<off_t>(-1)) {
      UTIL_THROW(ErrnoException, "lseek failed while falling back to read()");
    }
    // The new buffer starts exactly at the first unconsumed byte.
    mapped_offset_ = desired_begin;
    // MMapShift may have flagged the end of file before its mmap() failed;
    // only read() returning 0 decides that from now on.
    at_end_ = false;
    position_ = data_.begin();
    position_end_ = position_;
  } else if (position_end_ == data_.end()) {
    // The buffer is full, so room has to be made before reading.
    if (position_ == data_.begin()) {
      // Nothing was consumed since the last call: this is a duplicate request
      // for more data, so double the buffer.
      const std::size_t valid_length = position_end_ - position_;
      default_map_size_ *= 2;
      data_.call_realloc(default_map_size_);
      if (!data_.get()) UTIL_THROW(ErrnoException, "realloc failed for " << default_map_size_);
      position_ = data_.begin();
      position_end_ = position_ + valid_length;
    } else if (position_ != position_end_) {
      // A consumed prefix is wasting space: slide the pending bytes to the
      // front of the buffer and account for the prefix in mapped_offset_ so
      // that Offset() keeps reporting the same file position.
      const std::size_t consumed = position_ - data_.begin();
      const std::size_t pending = position_end_ - position_;
      memmove(data_.get(), position_, pending);
      mapped_offset_ += consumed;
      position_ = data_.begin();
      position_end_ = position_ + pending;
    }
  }

  // Start at the beginning of the buffer if there's nothing useful in it.
  if (position_ == position_end_) {
    mapped_offset_ += (position_end_ - data_.begin());
    position_ = data_.begin();
    position_end_ = position_;
  }

  // Append to the pending bytes; every path above leaves free space.
  const std::size_t already_read = position_end_ - data_.begin();
  assert(already_read < default_map_size_);
  const ssize_t read_return = read(file_.get(), static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read);
  if (read_return == -1) UTIL_THROW(ErrnoException, "read failed");
  if (read_return == 0) at_end_ = true;
  position_end_ += read_return;
}

} // namespace util
11 changes: 10 additions & 1 deletion util/file_piece.hh
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,11 @@ class FilePiece {
// Current read position expressed as an offset: the cursor's distance into the
// buffer (position_ - data_.begin()) plus mapped_offset_.
off_t Offset() const {
return position_ - data_.begin() + mapped_offset_;
}

// Only for testing.  Sets fallback_to_read_ so that later buffer refills go
// through the read() backend instead of mmap().
void ForceFallbackToRead() {
fallback_to_read_ = true;
}

private:
StringPiece Consume(const char *to) {
Expand All @@ -64,6 +69,9 @@ class FilePiece {
const char *FindDelimiterOrEOF() throw(EndOfFileException);

void Shift() throw (EndOfFileException);
// Backends to Shift().
void MMapShift(off_t desired_begin) throw ();
void ReadShift(off_t desired_begin) throw ();

const char *position_, *last_space_, *position_end_;

Expand All @@ -75,9 +83,10 @@ class FilePiece {
off_t mapped_offset_;

// Order matters: file_ should always be destroyed after this.
scoped_mmap data_;
scoped_memory data_;

bool at_end_;
bool fallback_to_read_;

ErsatzProgress progress_;
};
Expand Down
26 changes: 21 additions & 5 deletions util/file_piece_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,32 @@
namespace util {
namespace {

/* mmap implementation: FilePiece::ReadLine() must yield the same sequence of
 * lines as std::getline() over the same file (file_piece.cc). */
BOOST_AUTO_TEST_CASE(MMapLine) {
// Reference reader over the same file.
std::fstream ref("file_piece.cc", std::ios::in);
// NULL: no progress output.  1: the min_buffer argument.
FilePiece test("file_piece.cc", NULL, 1);
std::string ref_line;
while (getline(ref, ref_line)) {
StringPiece test_line(test.ReadLine());
// I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924
// Lines that are empty on both sides are skipped rather than compared.
if (!test_line.empty() || !ref_line.empty()) {
BOOST_CHECK_EQUAL(ref_line, test_line);
}
}
}

/* read() implementation */
BOOST_AUTO_TEST_CASE(ReadLine) {
std::fstream ref("file_piece.hh", std::ios::in);
FilePiece test("file_piece.hh", 1);
std::fstream ref("file_piece.cc", std::ios::in);
FilePiece test("file_piece.cc", NULL, 1);
test.ForceFallbackToRead();
std::string ref_line;
while (getline(ref, ref_line)) {
StringPiece test_line(test.ReadLine());
if (test_line != ref_line) {
std::cerr << test_line.size() << " " << ref_line.size() << std::endl;
// I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924
if (!test_line.empty() || !ref_line.empty()) {
BOOST_CHECK_EQUAL(ref_line, test_line);
}
BOOST_CHECK_EQUAL(ref_line, test_line);
}
}

Expand Down
19 changes: 17 additions & 2 deletions util/scoped.cc
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
#include "util/scoped.hh"

#include <assert.h>
#include <err.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <unistd.h>

namespace util {
Expand All @@ -17,20 +19,33 @@ scoped_mmap::~scoped_mmap() {
}
}

void scoped_mmap_or_array::internal_reset(void *data, std::size_t size, Alloc source) {
void scoped_memory::reset(void *data, std::size_t size, Alloc source) {
switch(source_) {
case MMAP_ALLOCATED:
scoped_mmap(data_, size_);
break;
case ARRAY_ALLOCATED:
delete [] reinterpret_cast<char*>(data_);
break;
case NONE:
case MALLOC_ALLOCATED:
free(data_);
break;
case NONE_ALLOCATED:
break;
}
data_ = data;
size_ = size;
source_ = source;
}

void scoped_memory::call_realloc(std::size_t size) {
assert(source_ == MALLOC_ALLOCATED || source_ == NONE_ALLOCATED);
void *new_data = realloc(data_, size);
if (!new_data) {
reset();
} else {
reset(new_data, size, MALLOC_ALLOCATED);
}
}

} // namespace util
31 changes: 18 additions & 13 deletions util/scoped.hh
Original file line number Diff line number Diff line change
Expand Up @@ -86,37 +86,42 @@ class scoped_mmap {
scoped_mmap &operator=(const scoped_mmap &);
};

/* For when the memory might come from mmap or new char[] */
class scoped_mmap_or_array {
private:
typedef enum {MMAP_ALLOCATED, ARRAY_ALLOCATED, NONE} Alloc;

/* For when the memory might come from mmap, new char[], or malloc. Uses NULL
* and 0 for blanks even though mmap signals errors with (void*)-1. The reset
* function checks that blank for mmap.
*/
class scoped_memory {
public:
scoped_mmap_or_array() : data_(NULL), size_(0), source_(NONE) {}
typedef enum {MMAP_ALLOCATED, ARRAY_ALLOCATED, MALLOC_ALLOCATED, NONE_ALLOCATED} Alloc;

~scoped_mmap_or_array() { reset(); }
scoped_memory() : data_(NULL), size_(0), source_(NONE_ALLOCATED) {}

~scoped_memory() { reset(); }

void *get() const { return data_; }
const char *begin() const { return reinterpret_cast<char*>(data_); }
const char *end() const { return reinterpret_cast<char*>(data_) + size_; }
std::size_t size() const { return size_; }

void reset() { internal_reset(NULL, 0, NONE); }
Alloc source() const { return source_; }

void reset() { reset(NULL, 0, NONE_ALLOCATED); }

void reset_mmap(void *data, std::size_t size) { internal_reset(data, size, MMAP_ALLOCATED); }
void reset(void *data, std::size_t size, Alloc from);

void reset_array(void *data, std::size_t size) { internal_reset(data, size, ARRAY_ALLOCATED); }
// realloc allows the current data to escape hence the need for this call
// If realloc fails, destroys the original too and get() returns NULL.
void call_realloc(std::size_t to);

private:
void internal_reset(void *data, std::size_t size, Alloc from);

void *data_;
std::size_t size_;

Alloc source_;

scoped_mmap_or_array(const scoped_mmap_or_array &);
scoped_mmap_or_array &operator=(const scoped_mmap_or_array &);
scoped_memory(const scoped_memory &);
scoped_memory &operator=(const scoped_memory &);
};

} // namespace util
Expand Down

0 comments on commit 892df48

Please sign in to comment.