Skip to content

Commit

Permalink
apacheGH-37041: [MATLAB] Implement Feather V1 Reader using new MATLAB…
Browse files Browse the repository at this point in the history
… Interface APIs (apache#37044)

### Rationale for this change

Now that we have the basic building blocks for tabular IO in the MATLAB Interface (Array, Schema, RecordBatch), we can implement a Feather V1 reader in terms of the new APIs.

This is a follow up to apache#37043, where a new Feather V1 internal `Writer` object was added.

### What changes are included in this PR?

1. Added a new class called arrow.internal.io.feather.Reader which can be used to read Feather V1 files. It has one public property named `Filename` and one public method named `read`.

**Example Usage:**

```matlab
>> T = array2table(rand(3))       

T =

  3x3 table

     Var1        Var2       Var3  
    _______    ________    _______

    0.79221    0.035712    0.67874
    0.95949     0.84913    0.75774
    0.65574     0.93399    0.74313

>> filename = "test.feather";

>> featherwrite(filename, T)

>> reader = arrow.internal.io.feather.Reader(filename)

reader = 

  Reader with properties:

    Filename: "test.feather"

>> T = reader.read()

T =

  3x3 table

     Var1        Var2       Var3  
    _______    ________    _______

    0.79221    0.035712    0.67874
    0.95949     0.84913    0.75774
    0.65574     0.93399    0.74313
```

### Are these changes tested?

Yes.

1. Added `Reader` to `feather/tRoundTrip.m`.

### Are there any user-facing changes?

No.

These are only internal objects right now. 

### Future Directions

1. Re-implement `featherread` in terms of the new `Reader` object.
2. Remove legacy feather code and infrastructure.

### Notes

1. For conciseness, I renamed the C++ Proxy class `FeatherWriter` to `Writer` since it is already inside of a `feather` namespace / "package".
* Closes: apache#37041

Authored-by: Kevin Gurney <[email protected]>
Signed-off-by: Kevin Gurney <[email protected]>
  • Loading branch information
kevingurney authored Aug 7, 2023
1 parent 71329ce commit 152be67
Show file tree
Hide file tree
Showing 10 changed files with 219 additions and 17 deletions.
6 changes: 6 additions & 0 deletions matlab/src/cpp/arrow/matlab/error/error.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,13 @@ namespace arrow::matlab::error {
static const char* RECORD_BATCH_NUMERIC_INDEX_WITH_EMPTY_RECORD_BATCH = "arrow:tabular:recordbatch:NumericIndexWithEmptyRecordBatch";
static const char* RECORD_BATCH_INVALID_NUMERIC_COLUMN_INDEX = "arrow:tabular:recordbatch:InvalidNumericColumnIndex";
static const char* FAILED_TO_OPEN_FILE_FOR_WRITE = "arrow:io:FailedToOpenFileForWrite";
static const char* FAILED_TO_OPEN_FILE_FOR_READ = "arrow:io:FailedToOpenFileForRead";
static const char* FEATHER_FAILED_TO_WRITE_TABLE = "arrow:io:feather:FailedToWriteTable";
static const char* TABLE_FROM_RECORD_BATCH = "arrow:table:FromRecordBatch";
static const char* FEATHER_FAILED_TO_CREATE_READER = "arrow:io:feather:FailedToCreateReader";
static const char* FEATHER_VERSION_2 = "arrow:io:feather:FeatherVersion2";
static const char* FEATHER_VERSION_UNKNOWN = "arrow:io:feather:FeatherVersionUnknown";
static const char* FEATHER_FAILED_TO_READ_TABLE = "arrow:io:feather:FailedToReadTable";
static const char* FEATHER_FAILED_TO_READ_RECORD_BATCH = "arrow:io:feather:FailedToReadRecordBatch";

}
98 changes: 98 additions & 0 deletions matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "libmexclass/proxy/ProxyManager.h"

#include "arrow/matlab/error/error.h"
#include "arrow/matlab/io/feather/proxy/reader.h"
#include "arrow/matlab/tabular/proxy/record_batch.h"

#include "arrow/util/utf8.h"

#include "arrow/result.h"

#include "arrow/io/file.h"
#include "arrow/ipc/feather.h"
#include "arrow/table.h"

namespace arrow::matlab::io::feather::proxy {

// Builds a Reader proxy for the given file path (Reader::make passes the
// path already converted to UTF-8) and registers the proxy methods that the
// MATLAB Reader class calls: read and getFilename.
Reader::Reader(const std::string& filename) : filename(filename) {
    // Registration must happen at construction so the methods are available
    // as soon as the proxy exists.
    REGISTER_METHOD(Reader, read);
    REGISTER_METHOD(Reader, getFilename);
}

// Factory invoked when a Reader proxy is created from MATLAB.
//
// constructor_arguments[0] is expected to be a struct whose "Filename" field
// holds a MATLAB string. The string is converted from UTF-16 to UTF-8 before
// the proxy is constructed; a failed conversion is reported with
// error::UNICODE_CONVERSION_ERROR_ID.
libmexclass::proxy::MakeResult Reader::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) {
    namespace mda = ::matlab::data;

    // Pull the "Filename" field out of the options struct.
    mda::StructArray options = constructor_arguments[0];
    const mda::StringArray filename_mda = options[0]["Filename"];

    // Arrow's file APIs take UTF-8, so convert the UTF-16 MATLAB string.
    const auto filename_utf16 = std::u16string(filename_mda[0]);
    MATLAB_ASSIGN_OR_ERROR(const auto filename_utf8, arrow::util::UTF16StringToUTF8(filename_utf16), error::UNICODE_CONVERSION_ERROR_ID);

    return std::make_shared<Reader>(filename_utf8);
}

// Reads the Feather V1 file named by `filename` and hands the data back to
// MATLAB as a RecordBatch proxy.
//
// On success, context.outputs[0] holds the scalar ID of a newly managed
// arrow::matlab::tabular::proxy::RecordBatch wrapping the file's contents.
//
// On failure, the error is reported through `context` with one of:
//   - error::FAILED_TO_OPEN_FILE_FOR_READ        (file could not be opened)
//   - error::FEATHER_FAILED_TO_CREATE_READER     (not a readable Feather file)
//   - error::FEATHER_VERSION_2                   (Feather V2 is not supported)
//   - error::FEATHER_VERSION_UNKNOWN             (unrecognized format version)
//   - error::FEATHER_FAILED_TO_READ_TABLE        (reading the Table failed)
//   - error::FEATHER_FAILED_TO_READ_RECORD_BATCH (Table -> RecordBatch failed)
void Reader::read(libmexclass::proxy::method::Context& context) {
    namespace mda = ::matlab::data;
    using namespace libmexclass::proxy;
    using RecordBatchProxy = arrow::matlab::tabular::proxy::RecordBatch;

    mda::ArrayFactory factory;

    // Create a file input stream.
    MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto source, arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool()), context, error::FAILED_TO_OPEN_FILE_FOR_READ);

    // Create a Reader from the file input stream.
    MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto reader, arrow::ipc::feather::Reader::Open(source), context, error::FEATHER_FAILED_TO_CREATE_READER);

    // Error if not Feather V1.
    const auto version = reader->version();
    if (version == ipc::feather::kFeatherV2Version) {
        MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(Status::NotImplemented("Support for Feather V2 has not been implemented."), context, error::FEATHER_VERSION_2);
    } else if (version != ipc::feather::kFeatherV1Version) {
        MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(Status::Invalid("Unknown Feather format version."), context, error::FEATHER_VERSION_UNKNOWN);
    }

    // Read a Table from the file.
    std::shared_ptr<arrow::Table> table = nullptr;
    MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(reader->Read(&table), context, error::FEATHER_FAILED_TO_READ_TABLE);

    // Convert the whole Table into a single RecordBatch.
    //
    // TableBatchReader::ReadNext is deliberately not used here: it yields a
    // null RecordBatch for a Table with zero rows (which would then be wrapped
    // in a proxy and dereferenced later), and it returns only the first chunk
    // of a multi-chunk Table, silently dropping the remaining rows.
    // CombineChunksToBatch always produces a valid RecordBatch containing
    // every row of the Table.
    MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto record_batch, table->CombineChunksToBatch(arrow::default_memory_pool()), context, error::FEATHER_FAILED_TO_READ_RECORD_BATCH);

    // Create a Proxy from the RecordBatch and register it with the
    // ProxyManager so that MATLAB can refer to it by ID.
    auto record_batch_proxy = std::make_shared<RecordBatchProxy>(record_batch);
    const auto record_batch_proxy_id = ProxyManager::manageProxy(record_batch_proxy);

    const auto record_batch_proxy_id_mda = factory.createScalar(record_batch_proxy_id);

    context.outputs[0] = record_batch_proxy_id_mda;
}

// Returns the file path this Reader was constructed with.
//
// The stored UTF-8 path is converted to UTF-16 and written to
// context.outputs[0] as a scalar; a failed conversion is reported through
// `context` with error::UNICODE_CONVERSION_ERROR_ID.
void Reader::getFilename(libmexclass::proxy::method::Context& context) {
    namespace mda = ::matlab::data;

    MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto utf16_filename, arrow::util::UTF8StringToUTF16(filename), context, error::UNICODE_CONVERSION_ERROR_ID);

    mda::ArrayFactory factory;
    context.outputs[0] = factory.createScalar(utf16_filename);
}

}
39 changes: 39 additions & 0 deletions matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include "libmexclass/proxy/Proxy.h"

namespace arrow::matlab::io::feather::proxy {

// C++ proxy backing the MATLAB class arrow.internal.io.feather.Reader.
//
// Holds the path of a Feather file and exposes two methods to MATLAB:
//   - read:        reads the file and returns the ID of a RecordBatch proxy.
//   - getFilename: returns the stored file path.
class Reader : public libmexclass::proxy::Proxy {
    public:
        // `explicit` prevents a std::string from being silently converted
        // into a Reader; proxies are only meant to be created via make().
        explicit Reader(const std::string& filename);

        virtual ~Reader() {}

        // Factory entry point: builds a Reader from the constructor
        // arguments supplied by MATLAB (a struct with a "Filename" field).
        static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments);

    protected:
        // Proxy methods registered in the constructor; results and errors
        // are communicated through `context`.
        void read(libmexclass::proxy::method::Context& context);
        void getFilename(libmexclass::proxy::method::Context& context);

        // Path of the Feather file to read; fixed at construction.
        const std::string filename;
};

}
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.

#include "arrow/matlab/io/feather/proxy/feather_writer.h"
#include "arrow/matlab/io/feather/proxy/writer.h"
#include "arrow/matlab/tabular/proxy/record_batch.h"
#include "arrow/matlab/error/error.h"

Expand All @@ -30,12 +30,12 @@

namespace arrow::matlab::io::feather::proxy {

FeatherWriter::FeatherWriter(const std::string& filename) : filename{filename} {
REGISTER_METHOD(FeatherWriter, getFilename);
REGISTER_METHOD(FeatherWriter, write);
Writer::Writer(const std::string& filename) : filename{filename} {
REGISTER_METHOD(Writer, getFilename);
REGISTER_METHOD(Writer, write);
}

libmexclass::proxy::MakeResult FeatherWriter::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) {
libmexclass::proxy::MakeResult Writer::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) {
namespace mda = ::matlab::data;
mda::StructArray opts = constructor_arguments[0];
const mda::StringArray filename_mda = opts[0]["Filename"];
Expand All @@ -45,10 +45,10 @@ namespace arrow::matlab::io::feather::proxy {
arrow::util::UTF16StringToUTF8(filename_utf16),
error::UNICODE_CONVERSION_ERROR_ID);

return std::make_shared<FeatherWriter>(filename_utf8);
return std::make_shared<Writer>(filename_utf8);
}

void FeatherWriter::getFilename(libmexclass::proxy::method::Context& context) {
void Writer::getFilename(libmexclass::proxy::method::Context& context) {
namespace mda = ::matlab::data;
MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto utf16_filename,
arrow::util::UTF8StringToUTF16(filename),
Expand All @@ -59,7 +59,7 @@ namespace arrow::matlab::io::feather::proxy {
context.outputs[0] = str_mda;
}

void FeatherWriter::write(libmexclass::proxy::method::Context& context) {
void Writer::write(libmexclass::proxy::method::Context& context) {
namespace mda = ::matlab::data;
mda::StructArray opts = context.inputs[0];
const mda::TypedArray<uint64_t> record_batch_proxy_id_mda = opts[0]["RecordBatchProxyID"];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@

namespace arrow::matlab::io::feather::proxy {

class FeatherWriter : public libmexclass::proxy::Proxy {
class Writer : public libmexclass::proxy::Proxy {
public:
FeatherWriter(const std::string& filename);
Writer(const std::string& filename);

~FeatherWriter() {}
~Writer() {}

static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments);

Expand Down
6 changes: 4 additions & 2 deletions matlab/src/cpp/arrow/matlab/proxy/factory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@
#include "arrow/matlab/type/proxy/string_type.h"
#include "arrow/matlab/type/proxy/timestamp_type.h"
#include "arrow/matlab/type/proxy/field.h"
#include "arrow/matlab/io/feather/proxy/feather_writer.h"
#include "arrow/matlab/io/feather/proxy/writer.h"
#include "arrow/matlab/io/feather/proxy/reader.h"

#include "factory.h"

Expand Down Expand Up @@ -61,7 +62,8 @@ libmexclass::proxy::MakeResult Factory::make_proxy(const ClassName& class_name,
REGISTER_PROXY(arrow.type.proxy.BooleanType , arrow::matlab::type::proxy::PrimitiveCType<bool>);
REGISTER_PROXY(arrow.type.proxy.StringType , arrow::matlab::type::proxy::StringType);
REGISTER_PROXY(arrow.type.proxy.TimestampType , arrow::matlab::type::proxy::TimestampType);
REGISTER_PROXY(arrow.io.feather.proxy.FeatherWriter , arrow::matlab::io::feather::proxy::FeatherWriter);
REGISTER_PROXY(arrow.io.feather.proxy.Writer , arrow::matlab::io::feather::proxy::Writer);
REGISTER_PROXY(arrow.io.feather.proxy.Reader , arrow::matlab::io::feather::proxy::Reader);

return libmexclass::error::Error{error::UNKNOWN_PROXY_ERROR_ID, "Did not find matching C++ proxy for " + class_name};
};
Expand Down
52 changes: 52 additions & 0 deletions matlab/src/matlab/+arrow/+internal/+io/+feather/Reader.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
% Licensed to the Apache Software Foundation (ASF) under one or more
% contributor license agreements. See the NOTICE file distributed with
% this work for additional information regarding copyright ownership.
% The ASF licenses this file to you under the Apache License, Version
% 2.0 (the "License"); you may not use this file except in compliance
% with the License. You may obtain a copy of the License at
%
% http://www.apache.org/licenses/LICENSE-2.0
%
% Unless required by applicable law or agreed to in writing, software
% distributed under the License is distributed on an "AS IS" BASIS,
% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
% implied. See the License for the specific language governing
% permissions and limitations under the License.

classdef Reader
%READER An internal Reader object for reading Feather files.
%
%   READER = arrow.internal.io.feather.Reader(FILENAME) creates a Reader
%   for the Feather V1 file FILENAME. FILENAME must be a nonmissing,
%   nonempty scalar text value.
%
%   Reader properties:
%       Filename - Name of the file to read.
%
%   Reader methods:
%       read     - Read the file and return its contents as a MATLAB table.

    properties (GetAccess=public, SetAccess=private, Hidden)
        % Handle to the underlying C++ proxy (arrow.io.feather.proxy.Reader).
        Proxy
    end

    properties (Dependent, SetAccess=private, GetAccess=public)
        % Name of the file to read.
        Filename
    end

    methods

        function obj = Reader(filename)
            arguments
                % The "string" class converts character vectors and scalar
                % cellstr inputs to a string scalar. The C++ proxy reads
                % the Filename field as a string array, so every accepted
                % input must reach it as a string.
                filename(1, 1) string {mustBeNonmissing, mustBeNonzeroLengthText}
            end

            args = struct(Filename=filename);
            obj.Proxy = arrow.internal.proxy.create("arrow.io.feather.proxy.Reader", args);
        end

        function T = read(obj)
            %READ Read the Feather file and return its contents.
            %   The C++ proxy returns the ID of a RecordBatch proxy, which
            %   is wrapped in an arrow.tabular.RecordBatch and converted
            %   to its MATLAB representation.
            recordBatchProxyID = obj.Proxy.read();
            proxy = libmexclass.proxy.Proxy(Name="arrow.tabular.proxy.RecordBatch", ID=recordBatchProxyID);
            recordBatch = arrow.tabular.RecordBatch(proxy);
            T = recordBatch.toMATLAB();
        end

        function filename = get.Filename(obj)
            % The file name is stored on the C++ side; query the proxy.
            filename = obj.Proxy.getFilename();
        end

    end

end
4 changes: 2 additions & 2 deletions matlab/src/matlab/+arrow/+internal/+io/+feather/Writer.m
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
end

args = struct(Filename=filename);
proxyName = "arrow.io.feather.proxy.FeatherWriter";
proxyName = "arrow.io.feather.proxy.Writer";
obj.Proxy = arrow.internal.proxy.create(proxyName, args);
end

Expand All @@ -45,4 +45,4 @@ function write(obj, T)
filename = obj.Proxy.getFilename();
end
end
end
end
5 changes: 5 additions & 0 deletions matlab/test/arrow/io/feather/tRoundTrip.m
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,9 @@ function Basic(testCase)
function featherwrite(T, filename)
% Test helper: write T to the file FILENAME using the internal Feather
% Writer object.
featherWriter = arrow.internal.io.feather.Writer(filename);
featherWriter.write(T);
end

function T = featherread(filename)
% Test helper: read the file FILENAME using the internal Feather Reader
% object and return the result.
featherReader = arrow.internal.io.feather.Reader(filename);
T = featherReader.read();
end
4 changes: 2 additions & 2 deletions matlab/tools/cmake/BuildMatlabArrowInterface.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/a
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/timestamp_type.cc"
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/field.cc"
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/wrap.cc"
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/feather/proxy/feather_writer.cc")

"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/feather/proxy/writer.cc"
"${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/feather/proxy/reader.cc")


set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy")
Expand Down

0 comments on commit 152be67

Please sign in to comment.