
Commit

[Other] faster_tokenizer->fast_tokenizer (PaddlePaddle#636)
* faster_tokenizer->fast_tokenizer

* ErnieFasterTokenizer->ErnieFastTokenizer

* update the fastdeploy_init

Co-authored-by: Jason <[email protected]>
joey12300 and jiangjiajun authored Nov 21, 2022
1 parent 3e1fc69 commit eeae48d
Showing 14 changed files with 170 additions and 175 deletions.
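For downstream C++ code the rename is mechanical: the header path, namespace, and class name simply drop the "er" (faster_tokenizer becomes fast_tokenizer, ErnieFasterTokenizer becomes ErnieFastTokenizer); on the Python side the pip requirement becomes fast-tokenizer-python and the module is imported as fast_tokenizer. A minimal before/after sketch, using only identifiers that appear in the ERNIE example diff below (the vocab path is a placeholder):

// Before this commit (old names, shown for comparison only):
//   #include "faster_tokenizer/tokenizers/ernie_faster_tokenizer.h"
//   paddlenlp::faster_tokenizer::tokenizers_impl::ErnieFasterTokenizer tokenizer("vocab.txt");

// After this commit:
#include "fast_tokenizer/tokenizers/ernie_fast_tokenizer.h"

int main() {
  // "vocab.txt" is a placeholder path to an ERNIE vocabulary file.
  paddlenlp::fast_tokenizer::tokenizers_impl::ErnieFastTokenizer tokenizer("vocab.txt");
  return 0;
}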
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -415,14 +415,14 @@ endif()
if(ANDROID OR IOS)
if(ENABLE_TEXT)
set(ENABLE_TEXT OFF CACHE BOOL "Force ENABLE_TEXT OFF" FORCE)
-message(STATUS "Found Android or IOS, force ENABLE_TEXT OFF. We do not support faster_tokenizer with Android/IOS now.")
+message(STATUS "Found Android or IOS, force ENABLE_TEXT OFF. We do not support fast_tokenizer with Android/IOS now.")
endif()
endif()

if(ENABLE_TEXT)
add_definitions(-DENABLE_TEXT)
list(APPEND ALL_DEPLOY_SRCS ${DEPLOY_TEXT_SRCS})
-include(${PROJECT_SOURCE_DIR}/cmake/faster_tokenizer.cmake)
+include(${PROJECT_SOURCE_DIR}/cmake/fast_tokenizer.cmake)
endif()

if(ENABLE_PADDLE_FRONTEND)
8 changes: 4 additions & 4 deletions FastDeploy.cmake.in
@@ -213,10 +213,10 @@ if (ENABLE_TEXT)
message(FATAL_ERROR "Not support fastdeploy text APIs with Android now!")
endif()
# Add dependency libs later
-find_library(FASTER_TOKENIZER_LIB core_tokenizers ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/faster_tokenizer/lib NO_DEFAULT_PATH)
-list(APPEND FASTDEPLOY_LIBS ${FASTER_TOKENIZER_LIB})
-list(APPEND FASTDEPLOY_INCS ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/faster_tokenizer/include)
-list(APPEND FASTDEPLOY_INCS ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/faster_tokenizer/third_party/include)
+find_library(FAST_TOKENIZER_LIB core_tokenizers ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/fast_tokenizer/lib NO_DEFAULT_PATH)
+list(APPEND FASTDEPLOY_LIBS ${FAST_TOKENIZER_LIB})
+list(APPEND FASTDEPLOY_INCS ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/fast_tokenizer/include)
+list(APPEND FASTDEPLOY_INCS ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/fast_tokenizer/third_party/include)
endif()

if(ENABLE_PADDLE_FRONTEND)
108 changes: 108 additions & 0 deletions cmake/fast_tokenizer.cmake
@@ -0,0 +1,108 @@


# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include(ExternalProject)

set(FASTTOKENIZER_PROJECT "extern_fast_tokenizer")
set(FASTTOKENIZER_PREFIX_DIR ${THIRD_PARTY_PATH}/fast_tokenizer)
set(FASTTOKENIZER_SOURCE_DIR
${THIRD_PARTY_PATH}/fast_tokenizer/src/${FASTTOKENIZER_PROJECT})
set(FASTTOKENIZER_INSTALL_DIR ${THIRD_PARTY_PATH}/install/fast_tokenizer)
set(FASTTOKENIZER_INC_DIR
"${FASTTOKENIZER_INSTALL_DIR}/include"
"${FASTTOKENIZER_INSTALL_DIR}/third_party/include"
CACHE PATH "fast_tokenizer include directory." FORCE)
set(FASTTOKENIZER_LIB_DIR
"${FASTTOKENIZER_INSTALL_DIR}/lib/"
CACHE PATH "fast_tokenizer lib directory." FORCE)
set(FASTTOKENIZER_THIRD_LIB_DIR
"${FASTTOKENIZER_INSTALL_DIR}/third_party/lib/"
CACHE PATH "fast_tokenizer lib directory." FORCE)
set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}"
"${FASTTOKENIZER_LIB_DIR}")

include_directories(${FASTTOKENIZER_INC_DIR})

# Set lib path
if(WIN32)
set(FASTTOKENIZER_COMPILE_LIB "${FASTTOKENIZER_LIB_DIR}/core_tokenizers.lib"
CACHE FILEPATH "fast_tokenizer compile library." FORCE)
message("FASTTOKENIZER_COMPILE_LIB = ${FASTTOKENIZER_COMPILE_LIB}")
set(ICUDT_LIB "${FASTTOKENIZER_THIRD_LIB_DIR}/icudt.lib")
set(ICUUC_LIB "${FASTTOKENIZER_THIRD_LIB_DIR}/icuuc.lib")

elseif(APPLE)
set(FASTTOKENIZER_COMPILE_LIB "${FASTTOKENIZER_LIB_DIR}/libcore_tokenizers.dylib"
CACHE FILEPATH "fast_tokenizer compile library." FORCE)
else()

set(FASTTOKENIZER_COMPILE_LIB "${FASTTOKENIZER_LIB_DIR}/libcore_tokenizers.so"
CACHE FILEPATH "fast_tokenizer compile library." FORCE)
message("FASTTOKENIZER_COMPILE_LIB = ${FASTTOKENIZER_COMPILE_LIB}")
endif(WIN32)

set(FASTTOKENIZER_URL_BASE "https://bj.bcebos.com/paddlenlp/fast_tokenizer/")
set(FASTTOKENIZER_VERSION "1.0.0")

# Set download url
if(WIN32)
set(FASTTOKENIZER_FILE "fast_tokenizer-win-x64-${FASTTOKENIZER_VERSION}.zip")
if(NOT CMAKE_CL_64)
set(FASTTOKENIZER_FILE "fast_tokenizer-win-x86-${FASTTOKENIZER_VERSION}.zip")
endif()
elseif(APPLE)
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "arm64")
set(FASTTOKENIZER_FILE "fast_tokenizer-osx-arm64-${FASTTOKENIZER_VERSION}.tgz")
else()
set(FASTTOKENIZER_FILE "fast_tokenizer-osx-x86_64-${FASTTOKENIZER_VERSION}.tgz")
endif()
else()
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
set(FASTTOKENIZER_FILE "fast_tokenizer-linux-aarch64-${FASTTOKENIZER_VERSION}.tgz")
else()
set(FASTTOKENIZER_FILE "fast_tokenizer-linux-x64-${FASTTOKENIZER_VERSION}.tgz")
endif()
endif()
set(FASTTOKENIZER_URL "${FASTTOKENIZER_URL_BASE}${FASTTOKENIZER_FILE}")

ExternalProject_Add(
${FASTTOKENIZER_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
URL ${FASTTOKENIZER_URL}
PREFIX ${FASTTOKENIZER_PREFIX_DIR}
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND
${CMAKE_COMMAND} -E copy_directory ${FASTTOKENIZER_SOURCE_DIR} ${FASTTOKENIZER_INSTALL_DIR}
BUILD_BYPRODUCTS ${FASTTOKENIZER_COMPILE_LIB})

add_library(fast_tokenizer STATIC IMPORTED GLOBAL)
set_property(TARGET fast_tokenizer PROPERTY IMPORTED_LOCATION ${FASTTOKENIZER_COMPILE_LIB})
add_dependencies(fast_tokenizer ${FASTTOKENIZER_PROJECT})
list(APPEND DEPEND_LIBS fast_tokenizer)

if (WIN32)
add_library(icudt STATIC IMPORTED GLOBAL)
set_property(TARGET icudt PROPERTY IMPORTED_LOCATION ${ICUDT_LIB})
add_dependencies(icudt ${FASTTOKENIZER_PROJECT})
list(APPEND DEPEND_LIBS icudt)

add_library(icuuc STATIC IMPORTED GLOBAL)
set_property(TARGET icuuc PROPERTY IMPORTED_LOCATION ${ICUUC_LIB})
add_dependencies(icuuc ${FASTTOKENIZER_PROJECT})
list(APPEND DEPEND_LIBS icuuc)
endif()
108 changes: 0 additions & 108 deletions cmake/faster_tokenizer.cmake

This file was deleted.

14 changes: 7 additions & 7 deletions examples/text/ernie-3.0/cpp/seq_cls_infer.cc
@@ -18,11 +18,11 @@
#include "fastdeploy/function/softmax.h"
#include "fastdeploy/runtime.h"
#include "fastdeploy/utils/path.h"
#include "faster_tokenizer/tokenizers/ernie_faster_tokenizer.h"
#include "fast_tokenizer/tokenizers/ernie_fast_tokenizer.h"
#include "gflags/gflags.h"

using namespace paddlenlp;
-using namespace faster_tokenizer::tokenizers_impl;
+using namespace fast_tokenizer::tokenizers_impl;
#ifdef WIN32
const char sep = '\\';
#else
@@ -124,19 +124,19 @@ struct SeqClsResult {

struct ErnieForSequenceClassificationPredictor {
fastdeploy::Runtime runtime_;
-ErnieFasterTokenizer tokenizer_;
+ErnieFastTokenizer tokenizer_;
ErnieForSequenceClassificationPredictor(
const fastdeploy::RuntimeOption& option,
-const ErnieFasterTokenizer& tokenizer)
+const ErnieFastTokenizer& tokenizer)
: tokenizer_(tokenizer) {
runtime_.Init(option);
}

bool Preprocess(const std::vector<std::string>& texts,
const std::vector<std::string>& texts_pair,
std::vector<fastdeploy::FDTensor>* inputs) {
-std::vector<faster_tokenizer::core::Encoding> encodings;
-std::vector<faster_tokenizer::core::EncodeInput> text_pair_input;
+std::vector<fast_tokenizer::core::Encoding> encodings;
+std::vector<fast_tokenizer::core::EncodeInput> text_pair_input;
// 1. Tokenize the text or (text, text_pair)
if (texts_pair.empty()) {
for (int i = 0; i < texts.size(); ++i) {
@@ -242,7 +242,7 @@ int main(int argc, char* argv[]) {
return -1;
}
}
-ErnieFasterTokenizer tokenizer(vocab_path);
+ErnieFastTokenizer tokenizer(vocab_path);

ErnieForSequenceClassificationPredictor predictor(option, tokenizer);

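For reference, a compact sketch of driving the renamed tokenizer the way the predictor above does: build it from a vocab file, batch-encode single texts or (text, text_pair) pairs, and read back the token ids. The EncodeBatchStrings call and the GetIds accessor are assumptions about the fast_tokenizer API rather than lines from this diff, so treat this as an illustration only:

#include <iostream>
#include <string>
#include <utility>
#include <vector>

#include "fast_tokenizer/tokenizers/ernie_fast_tokenizer.h"

int main() {
  using paddlenlp::fast_tokenizer::tokenizers_impl::ErnieFastTokenizer;
  namespace core = paddlenlp::fast_tokenizer::core;

  // Placeholder vocab path; the example program builds this from --vocab_path.
  ErnieFastTokenizer tokenizer("vocab.txt");

  // EncodeInput holds either a single text or a (text, text_pair) pair,
  // mirroring the two branches in Preprocess() above.
  std::vector<core::EncodeInput> inputs;
  inputs.emplace_back(std::string("input text"));
  inputs.emplace_back(std::pair<std::string, std::string>("text", "text pair"));

  // Assumed API: batch-encode all inputs into Encoding objects.
  std::vector<core::Encoding> encodings;
  tokenizer.EncodeBatchStrings(inputs, &encodings);

  for (const auto& enc : encodings) {
    for (auto id : enc.GetIds()) {  // assumed accessor returning token ids
      std::cout << id << ' ';
    }
    std::cout << '\n';
  }
  return 0;
}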
2 changes: 1 addition & 1 deletion examples/text/ernie-3.0/python/requirements.txt
@@ -1,2 +1,2 @@
-faster_tokenizer
+fast-tokenizer-python
paddlenlp
2 changes: 1 addition & 1 deletion examples/text/ernie-3.0/python/seq_cls_infer.py
@@ -15,7 +15,7 @@
import distutils.util

import numpy as np
-import faster_tokenizer
+import fast_tokenizer
from paddlenlp.transformers import AutoTokenizer
import fastdeploy as fd
