Skip to content

Commit

Permalink
[DataTransform] Add TextDataTransform and miscellaneous Modifications (
Browse files Browse the repository at this point in the history
…#369)

* Remove ambiguity for function SetPredMargin() by adding this ref

* tmp save

* Add Support for Texttransformer in DataTransform, in support of MultiColumnTfIdfVectorizer

* Reformatting with clang-format-10

* Reformatting with clang-format-10 without googlestyle

* Migrating C-based functions to their STL equivalents. Reformatting for some convention mismatches

Co-authored-by: Ubuntu <[email protected]>
  • Loading branch information
CloudManX and Ubuntu authored Sep 29, 2021
1 parent ea8e928 commit 373bfad
Show file tree
Hide file tree
Showing 4 changed files with 203 additions and 44 deletions.
77 changes: 56 additions & 21 deletions include/dlr_data_transform.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <tvm/runtime/ndarray.h>

#include <ctime>
#include <memory>
#include <nlohmann/json.hpp>

#include "dlr_common.h"
Expand All @@ -19,15 +20,17 @@ class DLR_DLL Transformer {
virtual void MapToNDArray(const nlohmann::json& input_json, const nlohmann::json& transform,
tvm::runtime::NDArray& input_array) const = 0;

/*! \brief Helper function for TransformInput. Allocates NDArray to store mapped input data. */
/*! \brief Helper function for TransformInput. Allocates NDArray to store
* mapped input data. */
virtual void InitNDArray(const nlohmann::json& input_json, const nlohmann::json& transform,
DLDataType dtype, DLContext ctx,
tvm::runtime::NDArray& input_array) const;
};

class DLR_DLL FloatTransformer : public Transformer {
private:
/*! \brief When there is a value stof cannot convert to float, this value is used. */
/*! \brief When there is a value stof cannot convert to float, this value is
* used. */
const float kBadValue = std::numeric_limits<float>::quiet_NaN();

public:
Expand All @@ -37,7 +40,8 @@ class DLR_DLL FloatTransformer : public Transformer {

class DLR_DLL CategoricalStringTransformer : public Transformer {
private:
/*! \brief When there is no mapping entry for TransformInput, this value is used. */
/*! \brief When there is no mapping entry for TransformInput, this value is
* used. */
const float kMissingValue = -1.0f;

public:
Expand All @@ -47,11 +51,11 @@ class DLR_DLL CategoricalStringTransformer : public Transformer {

class DLR_DLL DateTimeTransformer : public Transformer {
private:
/*! \brief Number of columns defined by Autopilot Sagemaker-Scikit-Learn-Extension for
* DateTimeVectorizer */
/*! \brief Number of columns defined by Autopilot
* Sagemaker-Scikit-Learn-Extension for DateTimeVectorizer */
const int kNumDateTimeCols = 7;

const std::vector<std::string> datetime_templates = {
const std::array<std::string, 10> datetime_templates = {
"%h %dth, %Y, %I:%M:%S%p",
"%h %dth, %Y, %I:%M%p",
"%h %dth, %Y, %I%p",
Expand All @@ -68,8 +72,6 @@ class DLR_DLL DateTimeTransformer : public Transformer {
* YEAR, HOUR, MINUTE, SECOND, MONTH, WEEK_OF_YEAR*/
void DigitizeDateTime(std::string& input_string, std::vector<int64_t>& datetime_digits) const;

bool isLeap(int64_t year) const;

int64_t GetWeekNumber(std::tm tm) const;

public:
Expand All @@ -80,16 +82,46 @@ class DLR_DLL DateTimeTransformer : public Transformer {
DLDataType dtype, DLContext ctx, tvm::runtime::NDArray& input_array) const;
};

/*! \brief Transformer for text columns. Splits each text cell on
 * non-alphanumeric delimiters and writes per-vocabulary-term counts
 * (bag-of-words) into the input NDArray, in support of Autopilot's
 * MultiColumnTfIdfVectorizer. */
class DLR_DLL TextTransformer : public Transformer {
 public:
  /*! \brief Builds the delimiter set (all non-alphanumeric characters). */
  TextTransformer();

  /*! \brief Tokenizes each input string and accumulates token counts into
   * input_array. */
  virtual void MapToNDArray(const nlohmann::json& input_json, const nlohmann::json& transform,
                            tvm::runtime::NDArray& input_array) const override;

  /*! \brief Reads the vocabulary from the transform metadata and allocates an
   * NDArray of shape (num_rows, vocabulary_size). */
  virtual void InitNDArray(const nlohmann::json& input_json, const nlohmann::json& transform,
                           DLDataType dtype, DLContext ctx,
                           tvm::runtime::NDArray& input_array) const override;

  /*! \brief Records which input column holds the text being transformed.
   * Declared const so it can be called from the const transform path; it
   * mutates the mutable member column_idx_. */
  inline void SetIndex(int idx) const { column_idx_ = idx; };

 private:
  /*! \brief Size of the (extended ASCII) character set scanned when building
   * the delimiter string. */
  const static int kCharNum = 256;

  /*! \brief Every non-alphanumeric character; used to split text into
   * tokens. */
  std::string delims;

  /*! \brief One vocabulary map per text column: token -> output column
   * index. */
  std::unique_ptr<std::vector<std::unordered_map<std::string, int>>> vocab_to_cols_;

  /*! \brief Maps an input text column index to its entry in vocab_to_cols_. */
  std::unique_ptr<std::unordered_map<int, int>> col_to_id_;

  /*! \brief Input column currently being transformed; set via SetIndex(). */
  mutable int column_idx_;

  /*! \brief Lowercases the string in place. */
  static void LowerStr(std::string& data) {
    std::transform(data.begin(), data.end(), data.begin(),
                   [](unsigned char c) { return std::tolower(c); });
  }
};

/*! \brief Handles transformations of input and output data. */
class DLR_DLL DataTransform {
private:
/*! \brief When there is no mapping entry for TransformOutput, this value is used. */
/*! \brief When there is no mapping entry for TransformOutput, this value is
* used. */
const char* kUnknownLabel = "<unseen_label>";

/*! \brief Buffers to store transformed outputs. Maps output index to transformed data. */
/*! \brief Buffers to store transformed outputs. Maps output index to
* transformed data. */
std::unordered_map<int, std::string> transformed_outputs_;

/*! \brief Helper function for TransformInput. Interpets 1-D char input as JSON. */
/*! \brief Helper function for TransformInput. Interprets 1-D char input as
 * JSON. */
nlohmann::json GetAsJson(const int64_t* shape, const void* input, int dim) const;

const std::shared_ptr<std::unordered_map<std::string, std::shared_ptr<Transformer>>>
Expand All @@ -110,21 +142,24 @@ class DLR_DLL DataTransform {
/*! \brief Returns true if the output requires a data transform */
bool HasOutputTransform(const nlohmann::json& metadata, int index) const;

/*! \brief Transform string input using CategoricalString input DataTransform. When
* this map is present in the metadata file, the user is expected to provide string inputs to
* SetDLRInput as 1-D vector. This function will interpret the user's input as JSON, apply the
* mapping to convert strings to numbers, and produce a numeric NDArray which can be given to TVM
* for the model input.
/*! \brief Transform string input using CategoricalString input DataTransform.
* When this map is present in the metadata file, the user is expected to
* provide string inputs to SetDLRInput as 1-D vector. This function will
* interpret the user's input as JSON, apply the mapping to convert strings to
* numbers, and produce a numeric NDArray which can be given to TVM for the
* model input.
*/
void TransformInput(const nlohmann::json& metadata, const int64_t* shape, const void* input,
int dim, const std::vector<DLDataType>& dtypes, DLContext ctx,
std::vector<tvm::runtime::NDArray>* tvm_inputs) const;

/*! \brief Transform integer output using CategoricalString output DataTransform. When this map is
* present in the metadata file, the model's output will be converted from an integer array to a
* JSON string, where numbers are mapped back to strings according to the CategoricalString map in
* the metadata file. A buffer is created to store the transformed output, and it's contents can
* be accessed using the GetOutputShape, GetOutputSizeDim, GetOutput and GetOutputPtr methods.
/*! \brief Transform integer output using CategoricalString output
* DataTransform. When this map is present in the metadata file, the model's
* output will be converted from an integer array to a JSON string, where
* numbers are mapped back to strings according to the CategoricalString map
* in the metadata file. A buffer is created to store the transformed output,
* and it's contents can be accessed using the GetOutputShape,
* GetOutputSizeDim, GetOutput and GetOutputPtr methods.
*/
void TransformOutput(const nlohmann::json& metadata, int index,
const tvm::runtime::NDArray& output_array);
Expand Down
2 changes: 1 addition & 1 deletion include/dlr_treelite.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ class DLR_DLL TreeliteModel : public DLRModel {
virtual void SetNumThreads(int threads) override;
virtual void UseCPUAffinity(bool use) override;

inline void SetPredMargin(bool pred_margin) { pred_margin = int(pred_margin); };
inline void SetPredMargin(bool pred_margin) { this->pred_margin = int(pred_margin); };
};

} // namespace dlr
Expand Down
94 changes: 77 additions & 17 deletions src/dlr_data_transform.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ void DataTransform::TransformInput(const nlohmann::json& metadata, const int64_t
auto it = GetTransformerMap()->find(transformer_type);
CHECK(it != GetTransformerMap()->end())
<< transformer_type << " is not a valid DataTransform type.";
const auto transformer = it->second;
const Transformer* transformer = &(*it->second);

transformer->InitNDArray(input_json, transforms[i], dtypes[i], ctx, tvm_inputs->at(i));
transformer->MapToNDArray(input_json, transforms[i], tvm_inputs->at(i));
Expand Down Expand Up @@ -61,9 +61,10 @@ void Transformer::InitNDArray(const nlohmann::json& input_json, const nlohmann::
std::vector<int64_t> arr_shape = {static_cast<int64_t>(input_json.size()),
static_cast<int64_t>(input_json[0].size())};
CHECK(dtype.code == kDLFloat && dtype.bits == 32 && dtype.lanes == 1)
<< "DataTransform CategoricalString is only supported for float32 inputs.";
// Only allocate new buffer if not initialized or if shape or dtype has changed. Context will
// always match.
<< "DataTransform CategoricalString is only supported for float32 "
"inputs.";
// Only allocate new buffer if not initialized or if shape or dtype has
// changed. Context will always match.
if (input_array == empty_ || input_array.Shape() != arr_shape) {
input_array = tvm::runtime::NDArray::Empty(arr_shape, dtype, ctx);
}
Expand Down Expand Up @@ -98,8 +99,9 @@ void CategoricalStringTransformer::MapToNDArray(const nlohmann::json& input_json
tvm::runtime::NDArray& input_array) const {
const nlohmann::json& mapping = transform["Map"];
DLTensor* input_tensor = const_cast<DLTensor*>(input_array.operator->());
// Writing directly to the DLTensor will only work for CPU context. For other contexts, we would
// need to create an intermediate buffer on CPU and copy that to the context.
// Writing directly to the DLTensor will only work for CPU context. For other
// contexts, we would need to create an intermediate buffer on CPU and copy
// that to the context.
CHECK_EQ(input_tensor->ctx.device_type, DLDeviceType::kDLCPU)
<< "DataTransform CategoricalString is only supported for CPU.";
CHECK_EQ(input_json[0].size(), mapping.size())
Expand Down Expand Up @@ -146,16 +148,6 @@ void DateTimeTransformer::InitNDArray(const nlohmann::json& input_json,
}
}

bool DateTimeTransformer::isLeap(int64_t year) const {
if (year % 4 == 0) {
if (year % 100 == 0 && year % 400 != 0)
return false;
else
return true;
}
return false;
}

int64_t DateTimeTransformer::GetWeekNumber(std::tm tm) const {
// mktime(&tm);
int day_of_the_week = (tm.tm_wday + 6) % 7;
Expand Down Expand Up @@ -223,6 +215,7 @@ DataTransform::GetTransformerMap() const {
map->emplace("Float", std::make_shared<FloatTransformer>());
map->emplace("CategoricalString", std::make_shared<CategoricalStringTransformer>());
map->emplace("DateTime", std::make_shared<DateTimeTransformer>());
map->emplace("Text", std::make_shared<TextTransformer>());
return map;
}

Expand All @@ -245,6 +238,71 @@ nlohmann::json DataTransform::TransformOutputHelper1D(const nlohmann::json& tran
return output_json;
}

/*! \brief Allocates the per-column vocabulary tables and builds the delimiter
 * set used to tokenize text: every character in [1, 255] that is not
 * alphanumeric. The scan starts at 1 to skip '\0'. */
TextTransformer::TextTransformer() {
  vocab_to_cols_ = std::make_unique<std::vector<std::unordered_map<std::string, int>>>();
  col_to_id_ = std::make_unique<std::unordered_map<int, int>>();

  for (int c = 1; c < kCharNum; ++c) {
    if (!isalnum(c)) {
      // Explicit cast: the original `delims += i` implicitly narrowed a
      // size_t loop counter to char.
      delims += static_cast<char>(c);
    }
  }
}

/*! \brief Reads the vocabulary for this transform's text column and ensures
 * input_array is an NDArray of shape (num_rows, vocabulary_size).
 *
 * The vocabulary->column map is built only once per text column: previously,
 * every call appended a duplicate map to vocab_to_cols_ (unbounded memory
 * growth across repeated inference calls) while col_to_id_->emplace silently
 * kept pointing at the first copy. */
void TextTransformer::InitNDArray(const nlohmann::json& input_json, const nlohmann::json& transform,
                                  DLDataType dtype, DLContext ctx,
                                  tvm::runtime::NDArray& input_array) const {
  auto vocabularies = transform["Vocabularies"].get<std::vector<std::string>>();
  auto text_col = transform["TextCol"].get<int>();
  SetIndex(text_col);

  if (col_to_id_->find(text_col) == col_to_id_->end()) {
    std::unordered_map<std::string, int> vocab_to_col;
    for (size_t i = 0; i < vocabularies.size(); ++i) {
      vocab_to_col[vocabularies[i]] = static_cast<int>(i);
    }
    col_to_id_->emplace(text_col, static_cast<int>(vocab_to_cols_->size()));
    vocab_to_cols_->push_back(vocab_to_col);
  }

  // One row per input sample, one output column per vocabulary term.
  std::vector<int64_t> arr_shape = {static_cast<int64_t>(input_json.size()),
                                    static_cast<int64_t>(vocabularies.size())};

  CHECK(dtype.code == kDLFloat && dtype.bits == 32 && dtype.lanes == 1)
      << "DataTransform TextTransformer is only supported for float32 inputs.";
  // Only allocate a new buffer if not initialized or if the shape changed.
  if (input_array == empty_ || input_array.Shape() != arr_shape) {
    input_array = tvm::runtime::NDArray::Empty(arr_shape, dtype, ctx);
  }
}

/*! \brief Tokenizes each row's text cell and writes per-token counts into the
 * NDArray row (one column per vocabulary term).
 *
 * Fixes: the previous erase-based loop stopped at the last delimiter, so a
 * trailing token ("world" in "hello world") was never counted; it also made
 * an O(n^2) pass via repeated erase(0, ...) and left an unused local
 * (`vocab`). This version scans by position and processes the tail. */
void TextTransformer::MapToNDArray(const nlohmann::json& input_json,
                                   const nlohmann::json& transform,
                                   tvm::runtime::NDArray& input_array) const {
  DLTensor* input_tensor = const_cast<DLTensor*>(input_array.operator->());
  // Writing directly to the DLTensor only works for CPU context.
  CHECK_EQ(input_tensor->ctx.device_type, DLDeviceType::kDLCPU)
      << "DataTransform TfIdfVectorizer is only supported for CPU.";

  const int id = col_to_id_->at(column_idx_);
  const std::unordered_map<std::string, int>& vocab_to_col = vocab_to_cols_->at(id);
  const size_t num_col = vocab_to_col.size();
  float* data = static_cast<float*>(input_tensor->data);

  for (size_t r = 0; r < input_json.size(); ++r) {
    float* row = data + r * num_col;
    std::fill_n(row, num_col, 0.f);
    // Case-fold a copy of the cell, then count vocabulary hits per token.
    std::string entry = input_json[r][column_idx_].get_ref<const std::string&>();
    LowerStr(entry);
    size_t start = 0;
    while (start < entry.size()) {
      size_t end = entry.find_first_of(delims, start);
      if (end == std::string::npos) end = entry.size();  // trailing token
      if (end > start) {  // skip empty tokens between consecutive delimiters
        auto it = vocab_to_col.find(entry.substr(start, end - start));
        if (it != vocab_to_col.end()) {
          row[it->second] += 1;
        }
      }
      start = end + 1;
    }
  }
}

template <typename T>
nlohmann::json DataTransform::TransformOutputHelper2D(const nlohmann::json& transform,
const T* data,
Expand Down Expand Up @@ -273,7 +331,9 @@ void DataTransform::TransformOutput(const nlohmann::json& metadata, int index,
} else if (shape.size() == 2) {
output_json = TransformOutputHelper2D<int>(transform, static_cast<int*>(tensor->data), shape);
} else {
throw dmlc::Error("DataTransform CategoricalString is only supported for 1-D or 2-D inputs.");
throw dmlc::Error(
"DataTransform CategoricalString is only supported for 1-D or 2-D "
"inputs.");
}
transformed_outputs_[index] = output_json.dump();
}
Expand Down
Loading

0 comments on commit 373bfad

Please sign in to comment.