Commit 82bb917

Merge remote-tracking branch 'ups/develop' into fix/op/elewise_add
tensor-tang committed Aug 20, 2018
2 parents 0507f7b + 99f74be commit 82bb917
Showing 23 changed files with 376 additions and 172 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -212,6 +212,7 @@ elseif()
set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
endif()

include(flags) # set paddle compile flags
include(cudnn) # set cudnn libraries, must before configure
include(cupti)
include(configure) # add paddle env configuration
@@ -220,7 +221,6 @@ include(package) # set paddle packages
include(ccache) # set ccache for compilation
include(util) # set unittest and link libs
include(rdma) # set rdma libraries
include(flags) # set paddle compile flags
include(version) # set PADDLE_VERSION
include(coveralls) # set code coverage
include(inference_lib) # add paddle fluid inference libraries
10 changes: 5 additions & 5 deletions cmake/configure.cmake
@@ -50,16 +50,16 @@ if(NOT WITH_PROFILER)
endif(NOT WITH_PROFILER)

if(NOT CMAKE_CROSSCOMPILING)
if(WITH_AVX AND AVX_FOUND)
if(WITH_AVX AND AVX512F_FOUND)
set(SIMD_FLAG ${AVX512F_FLAG})
elseif(WITH_AVX AND AVX2_FOUND)
set(SIMD_FLAG ${AVX2_FLAG})
elseif(WITH_AVX AND AVX_FOUND)
set(SIMD_FLAG ${AVX_FLAG})
elseif(SSE3_FOUND)
set(SIMD_FLAG ${SSE3_FLAG})
endif()
endif()
if(UNIX AND NOT APPLE)
# exclude Apple from the *nix OS family
set(LINUX TRUE)
endif(UNIX AND NOT APPLE)

if(NOT WITH_GOLANG)
add_definitions(-DPADDLE_WITHOUT_GOLANG)
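A note on the new cascade: each branch prefers the widest instruction set the build machine supports, so SIMD_FLAG ends up as at most one of -mavx512f, -mavx2, -mavx, or -msse3. A standalone C++ sketch (hypothetical probe file; relies on GCC/Clang predefined macros) to confirm which flag actually reached the compiler:

```cpp
// simd_probe.cc — hypothetical; build with the chosen SIMD_FLAG, e.g.
//   g++ -mavx512f simd_probe.cc && ./a.out
#include <cstdio>

int main() {
  // Each -m flag defines a corresponding predefined macro in GCC/Clang.
#if defined(__AVX512F__)
  std::puts("compiled with AVX-512F");
#elif defined(__AVX2__)
  std::puts("compiled with AVX2");
#elif defined(__AVX__)
  std::puts("compiled with AVX");
#elif defined(__SSE3__)
  std::puts("compiled with SSE3");
#else
  std::puts("no SIMD flag detected");
#endif
  return 0;
}
```

Running this probe prints the highest extension that was enabled at compile time, mirroring the order of the if/elseif chain above.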
21 changes: 19 additions & 2 deletions cmake/cudnn.cmake
@@ -25,8 +25,25 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
$ENV{CUDNN_ROOT}
$ENV{CUDNN_ROOT}/lib64
$ENV{CUDNN_ROOT}/lib
/usr/lib)
find_library(CUDNN_LIBRARY NAMES libcudnn.so libcudnn.dylib # libcudnn_static.a
/usr/lib
${CUDA_TOOLKIT_ROOT_DIR}
${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
)
set(CUDNN_LIB_NAME "")
if (LINUX)
set(CUDNN_LIB_NAME "libcudnn.so")
endif(LINUX)

if(WIN32)
# only support cudnn7
set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll")
endif(WIN32)

if(APPLE)
set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so")
endif(APPLE)

find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a
PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist}
NO_DEFAULT_PATH
DOC "Path to cuDNN library.")
5 changes: 5 additions & 0 deletions cmake/flags.cmake
@@ -142,6 +142,11 @@ else()
${GPU_COMMON_FLAGS})
endif()

if(UNIX AND NOT APPLE)
# exclude Apple from the *nix OS family
set(LINUX TRUE)
endif(UNIX AND NOT APPLE)


foreach(flag ${COMMON_FLAGS})
safe_set_cflag(CMAKE_C_FLAGS ${flag})
14 changes: 13 additions & 1 deletion cmake/simd.cmake
@@ -10,6 +10,7 @@ if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID
set(SSE3_FLAG "-msse3")
set(AVX_FLAG "-mavx")
set(AVX2_FLAG "-mavx2")
set(AVX512F_FLAG "-mavx512f")
elseif(MSVC)
set(MMX_FLAG "/arch:MMX")
set(SSE2_FLAG "/arch:SSE2")
@@ -81,5 +82,16 @@ int main()
return 0;
}" AVX2_FOUND)

# Check AVX512F
set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m512i a = _mm512_undefined_epi32();
return 0;
}" AVX512F_FOUND)

set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND)
mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
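For context: CHECK_CXX_SOURCE_RUNS compiles and executes the given snippet with ${AVX512F_FLAG} in CMAKE_REQUIRED_FLAGS and records success in AVX512F_FOUND. A slightly fuller standalone sketch of the same probe idea (hypothetical file name; assumes a GCC-compatible compiler and AVX-512F hardware):

```cpp
// avx512_probe.cc — hypothetical; build and run with the detected flag, e.g.
//   g++ -mavx512f avx512_probe.cc && ./a.out
#include <immintrin.h>
#include <cstdio>

int main() {
  // One AVX-512F operation: add two 16-lane vectors of 32-bit ints.
  __m512i a = _mm512_set1_epi32(1);
  __m512i b = _mm512_set1_epi32(2);
  __m512i c = _mm512_add_epi32(a, b);

  alignas(64) int out[16];
  _mm512_store_si512(out, c);            // aligned store; needs 64-byte alignment
  std::printf("lane 0 = %d\n", out[0]);  // prints 3 on AVX-512F hardware
  return 0;
}
```

If the probe binary runs and exits with 0, the feature test passes, and configure.cmake then picks ${AVX512F_FLAG} as SIMD_FLAG.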
3 changes: 2 additions & 1 deletion paddle/fluid/framework/CMakeLists.txt
@@ -99,12 +99,13 @@ else()
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method)
endif()


if (NOT WIN32)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
graph graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass
fast_threaded_ssa_graph_executor)
endif() # NOT WIN32

cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
34 changes: 21 additions & 13 deletions paddle/fluid/framework/program_desc.cc
@@ -55,11 +55,20 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {
auto all_ops = blocks_[block_id]->AllOps();
for (size_t op_id = 0; op_id < all_ops.size(); ++op_id) {
auto &op = all_ops[op_id];

for (const std::string &attr_name : op->AttrNames()) {
if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) {
int sub_block_id =
o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name);
op->SetBlockAttr(attr_name, MutableBlock(sub_block_id));
} else if (op->GetAttrType(attr_name) == proto::AttrType::BLOCKS) {
std::vector<int> sub_block_ids =
o.Block(block_id).Op(op_id)->GetBlocksAttrIds(attr_name);
std::vector<BlockDesc *> block_descs;
for (int block_id : sub_block_ids) {
block_descs.push_back(MutableBlock(block_id));
}
op->SetBlocksAttr(attr_name, block_descs);
}
}
}
@@ -68,24 +77,16 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {

ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
desc_ = desc;
for (auto &block_desc : *desc_.mutable_blocks()) {
blocks_.emplace_back(new BlockDesc(this, &block_desc));
}
for (auto &block : blocks_) {
for (auto *op : block->AllOps()) {
for (const auto &attr : op->Proto()->attrs()) {
if (attr.type() == proto::AttrType::BLOCK) {
size_t blk_idx = attr.block_idx();
op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx));
}
}
}
}
InitFromProto();
}

ProgramDesc::ProgramDesc(const std::string &binary_str) {
PADDLE_ENFORCE(desc_.ParseFromString(binary_str),
"Fail to parse program_desc from binary string.");
InitFromProto();
}

void ProgramDesc::InitFromProto() {
for (auto &block_desc : *desc_.mutable_blocks()) {
blocks_.emplace_back(new BlockDesc(this, &block_desc));
}
@@ -95,6 +96,13 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) {
if (attr.type() == proto::AttrType::BLOCK) {
size_t blk_idx = attr.block_idx();
op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx));
} else if (attr.type() == proto::AttrType::BLOCKS) {
auto blks_idx = attr.blocks_idx();
std::vector<BlockDesc *> block_descs;
for (int blk_idx : blks_idx) {
block_descs.push_back(this->MutableBlock(blk_idx));
}
op->SetBlocksAttr(attr.name(), block_descs);
}
}
}
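The net effect of the refactor above: both the proto constructor and the binary-string constructor now funnel through InitFromProto(), which re-binds BLOCK and BLOCKS attributes to the new program's own BlockDescs. A rough usage sketch of the two paths (the RoundTrip helper and includes are assumptions for illustration):

```cpp
#include <string>
#include "paddle/fluid/framework/program_desc.h"

using paddle::framework::ProgramDesc;

// Hypothetical helper: exercises both constructors that share InitFromProto().
void RoundTrip(ProgramDesc *original) {
  // Path 1: copy from a raw proto; sub-block (BLOCK) and multi-block
  // (BLOCKS) attributes are re-pointed at the new program's blocks.
  ProgramDesc from_proto(*original->Proto());

  // Path 2: parse from a serialized string, as when loading a saved model;
  // InitFromProto() performs the same attribute re-binding.
  std::string binary = original->Proto()->SerializeAsString();
  ProgramDesc from_string(binary);
}
```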
2 changes: 2 additions & 0 deletions paddle/fluid/framework/program_desc.h
@@ -76,6 +76,8 @@ class ProgramDesc {
void SetFetchHolderName(const std::string &fetch_holder_name);

private:
void InitFromProto();

proto::ProgramDesc desc_;

std::vector<std::unique_ptr<BlockDesc>> blocks_;
26 changes: 25 additions & 1 deletion paddle/fluid/framework/program_desc_test.cc
@@ -42,6 +42,19 @@ TEST(ProgramDesc, copy_ctor) {
out->SetType(proto::VarType::LOD_TENSOR);
op->SetOutput("Y", {out->Name()});

BlockDesc* new_block = program.AppendBlock(*global_block);
op = new_block->AppendOp();
op->SetType("mul");

op = global_block->AppendOp();
op->SetType("op_with_subblock");
op->SetAttr("sub_block", new_block);

std::vector<BlockDesc*> sub_blocks;
sub_blocks.push_back(program.AppendBlock(*global_block));
sub_blocks.push_back(program.AppendBlock(*global_block));
op->SetAttr("sub_blocks", sub_blocks);

ProgramDesc program_copy(program);

auto* global_block_copy = program_copy.MutableBlock(0);
@@ -64,6 +77,8 @@
assert_same_var("Y", y);
assert_same_var("Out", out);

bool found_sub_block = false;
bool found_sub_blocks = false;
for (size_t i = 0; i < global_block->OpSize(); ++i) {
auto op_origin = global_block->Op(i);
auto op_copy = global_block_copy->Op(i);
@@ -74,8 +89,17 @@

ASSERT_EQ(op_copy->Proto()->SerializeAsString(),
op_origin->Proto()->SerializeAsString());
}

if (op->Type() == "op_with_subblock") {
ASSERT_EQ(1, op->GetBlockAttrId("sub_block"));
found_sub_block = true;

ASSERT_EQ(2, op->GetBlocksAttrIds("sub_blocks").size());
found_sub_blocks = true;
}
}
ASSERT_TRUE(found_sub_block);
ASSERT_TRUE(found_sub_blocks);
// Do not check that the blocks' proto strings match: the order of vars
// can differ between them while both are still correct.
}
6 changes: 3 additions & 3 deletions paddle/fluid/inference/api/high_level_api_cn.md
@@ -65,13 +65,13 @@ config.model_dir = "xxx";
config.use_gpu = false;
// Create a native PaddlePredictor
auto predictor =
paddle::CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
paddle::CreatePaddlePredictor<paddle::NativeConfig, paddle::PaddleEngineKind::kNative>(config);
// Create the input tensor
int64_t data[4] = {1, 2, 3, 4};
paddle::PaddleTensor tensor{.name = "",
.shape = std::vector<int>({4, 1}),
.data = PaddleBuf(data, sizeof(data)),
.dtype = PaddleDType::INT64};
.data = paddle::PaddleBuf(data, sizeof(data)),
.dtype = paddle::PaddleDType::INT64};
// Create the output tensors; the output tensor memory can be reused
std::vector<paddle::PaddleTensor> outputs;
// Run the prediction
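Two caveats for readers of the snippet above: the diff cuts off just before the actual run call, and the {.name = ...} designated-initializer syntax is a GNU extension. A portable end-to-end sketch of the same flow (the Run(inputs, &outputs) signature is assumed from the 2018-era API; error handling elided):

```cpp
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::NativeConfig config;
  config.model_dir = "xxx";  // model directory, as in the doc
  config.use_gpu = false;

  // Create a native PaddlePredictor.
  auto predictor = paddle::CreatePaddlePredictor<
      paddle::NativeConfig, paddle::PaddleEngineKind::kNative>(config);

  // Build the input tensor.
  int64_t data[4] = {1, 2, 3, 4};
  paddle::PaddleTensor tensor;
  tensor.name = "";
  tensor.shape = std::vector<int>({4, 1});
  tensor.data = paddle::PaddleBuf(data, sizeof(data));
  tensor.dtype = paddle::PaddleDType::INT64;

  // Run inference; the output tensor memory can be reused across calls.
  std::vector<paddle::PaddleTensor> outputs;
  predictor->Run({tensor}, &outputs);  // assumed: Run(inputs, &outputs)
  return 0;
}
```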
2 changes: 1 addition & 1 deletion paddle/fluid/inference/tensorrt/CMakeLists.txt
@@ -1,4 +1,4 @@
nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto)
nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto device_context)
nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
add_subdirectory(convert)
19 changes: 15 additions & 4 deletions paddle/fluid/operators/CMakeLists.txt
@@ -84,6 +84,15 @@ function(op_library TARGET)
message(FATAL_ERROR "The op library ${TARGET} should contain at least one .cc file")
endif()

# remove ops that are unsupported on Windows
if (WIN32)
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op")
if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
return()
endif()
endforeach()
endif(WIN32)

list(LENGTH op_library_DEPS op_library_DEPS_len)
if (${op_library_DEPS_len} GREATER 0)
set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
@@ -181,19 +190,19 @@
endfunction()

add_subdirectory(math)
if (NOT WIN32)
add_subdirectory(nccl)

if(WITH_GPU)
op_library(nccl_op DEPS nccl_common)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
else()
set(DEPS_OPS ${DEPS_OPS} nccl_op)
endif()
endif() # NOT WIN32

set(DISTRIBUTE_DEPS "")
if(WITH_DISTRIBUTE)
add_subdirectory(distributed)

set(DISTRIBUTE_DEPS "")
if(WITH_GRPC)
set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
@@ -222,7 +231,7 @@ if(WITH_DISTRIBUTE)
#set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
#cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op
# listen_and_serv_op sum_op executor SERIAL)
if(WITH_GPU)
if(WITH_GPU AND NOT WIN32)
set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op ${DISTRIBUTE_DEPS} executor SERIAL)
if(WITH_GRPC)
Expand All @@ -233,7 +242,7 @@ if(WITH_DISTRIBUTE)
set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else()
set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op)
endif()
endif() # WITH_GPU AND NOT WIN32
else()
set(DEPS_OPS ${DEPS_OPS} checkpoint_notify_op prefetch_op recv_op listen_and_serv_op send_op send_barrier_op fetch_barrier_op gen_nccl_id_op)
endif()
@@ -331,5 +340,7 @@ cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_sea
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
if(NOT WIN32)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
(Diffs for the remaining changed files did not load and are not shown.)
