downpour_worker: add try/catch mechanism, print all program parameters (PaddlePaddle#24700)
* test=develop, add try_catch for debug
123malin authored Jun 3, 2020
1 parent b2ba830 commit 9d2bd0a
Showing 4 changed files with 49 additions and 5 deletions.
7 changes: 3 additions & 4 deletions paddle/fluid/framework/device_worker.cc
@@ -25,7 +25,7 @@ void DeviceWorker::SetDataFeed(DataFeed* data_feed) {
 }
 
 template <typename T>
-std::string PrintLodTensorType(LoDTensor* tensor, int64_t start, int64_t end) {
+std::string PrintLodTensorType(Tensor* tensor, int64_t start, int64_t end) {
   auto count = tensor->numel();
   if (start < 0 || end > count) {
     VLOG(3) << "access violation";
@@ -38,8 +38,7 @@ std::string PrintLodTensorType(LoDTensor* tensor, int64_t start, int64_t end) {
   return os.str();
 }
 
-std::string PrintLodTensorIntType(LoDTensor* tensor, int64_t start,
-                                  int64_t end) {
+std::string PrintLodTensorIntType(Tensor* tensor, int64_t start, int64_t end) {
   auto count = tensor->numel();
   if (start < 0 || end > count) {
     VLOG(3) << "access violation";
@@ -52,7 +51,7 @@ std::string PrintLodTensorIntType(LoDTensor* tensor, int64_t start,
   return os.str();
 }
 
-std::string PrintLodTensor(LoDTensor* tensor, int64_t start, int64_t end) {
+std::string PrintLodTensor(Tensor* tensor, int64_t start, int64_t end) {
   std::string out_val;
   if (tensor->type() == proto::VarType::FP32) {
     out_val = PrintLodTensorType<float>(tensor, start, end);
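
The signature change from LoDTensor* to Tensor* widens these print helpers so that the catch handler added below can also pass the value tensor of a SelectedRows variable, which is a Tensor but not a LoDTensor. For context, here is a minimal standalone sketch of the serialization pattern the helpers follow; the loop bodies are collapsed in the diff, so the separator and exact formatting are assumptions rather than the Paddle source:

```cpp
#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical stand-in for the collapsed body of PrintLodTensorType<T>:
// walk a flat buffer over [start, end) and join the values into one string.
// The real helper reads from the tensor's buffer and reports out-of-range
// access via VLOG(3); the comma separator here is an assumption.
template <typename T>
std::string PrintRange(const std::vector<T>& buf, int64_t start, int64_t end) {
  if (start < 0 || end > static_cast<int64_t>(buf.size())) {
    return "access violation";
  }
  std::ostringstream os;
  for (int64_t i = start; i < end; ++i) {
    if (i > start) os << ",";
    os << buf[i];
  }
  return os.str();
}

int main() {
  // PrintLodTensor dispatches on dtype (proto::VarType::FP32 selects the
  // float instantiation, as the hunk above shows); here we call it directly.
  std::vector<float> values = {0.5f, 1.5f, 2.5f};
  std::cout << PrintRange<float>(values, 0, 3) << "\n";  // prints 0.5,1.5,2.5
  return 0;
}
```
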
3 changes: 2 additions & 1 deletion paddle/fluid/framework/device_worker.h
@@ -45,7 +45,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-std::string PrintLodTensor(LoDTensor* tensor, int64_t start, int64_t end);
+std::string PrintLodTensor(Tensor* tensor, int64_t start, int64_t end);
 std::pair<int64_t, int64_t> GetTensorBound(LoDTensor* tensor, int index);
 bool CheckValidOutput(LoDTensor* tensor, size_t batch_size);
 
@@ -171,6 +171,7 @@ class DeviceWorker {
   bool need_dump_field_;
   const std::vector<std::string>* dump_param_;
   const std::vector<std::string>* dump_fields_;
+  std::vector<std::string> all_param_;
 
   int dump_mode_ = 0;
   int dump_interval_ = 10000;
43 changes: 43 additions & 0 deletions paddle/fluid/framework/downpour_worker.cc
@@ -771,7 +771,50 @@ void DownpourWorker::TrainFiles() {
         }
       }
       if (!need_skip) {
+#ifdef PADDLE_WITH_PSLIB
+        try {
+          op->Run(*thread_scope_, place_);
+        } catch (std::exception& e) {
+          fprintf(stderr, "error message: %s\n", e.what());
+          auto& ins_id_vec = device_reader_->GetInsIdVec();
+          size_t batch_size = device_reader_->GetCurBatchSize();
+          std::string s = "";
+          for (auto& ins_id : ins_id_vec) {
+            if (s != "") s += ",";
+            s += ins_id;
+          }
+          fprintf(stderr, "batch_size: %zu, ins_ids_vec: %s\n", batch_size,
+                  s.c_str());
+          s = "";
+          for (auto& param : all_param_) {
+            Variable* var = thread_scope_->FindVar(param);
+            if (var == nullptr) {
+              continue;
+            }
+            Tensor* tensor = nullptr;
+            int64_t len = 0;
+            if (var->IsType<framework::LoDTensor>()) {
+              tensor = var->GetMutable<LoDTensor>();
+              len = tensor->numel();
+            } else if (var->IsType<SelectedRows>()) {
+              auto selected_rows = var->GetMutable<SelectedRows>();
+              tensor = selected_rows->mutable_value();
+              len = tensor->numel();
+            }
+            if (!tensor->IsInitialized()) {
+              continue;
+            }
+            s += param + ":" + std::to_string(len) + ":";
+            s += PrintLodTensor(tensor, 0, len);
+            fprintf(stderr, "%s\n", s.c_str());
+            fflush(stderr);
+            s = "";
+          }
+          throw e;
+        }
+#else
         op->Run(*thread_scope_, place_);
+#endif
       }
     }
 
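
This hunk is the core of the commit: under PADDLE_WITH_PSLIB, each op runs inside a try block, and on failure the worker prints the exception, the failing batch's instance ids, and every parameter recorded in all_param_ before rethrowing. A condensed, self-contained sketch of that pattern follows; RunOpOrDump and FakeTensor are hypothetical stand-ins for op->Run, the scope lookup, and device_reader_:

```cpp
#include <cstdint>
#include <cstdio>
#include <exception>
#include <functional>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-in for a scope variable's tensor.
struct FakeTensor {
  std::vector<float> data;
  bool initialized = false;
  int64_t numel() const { return static_cast<int64_t>(data.size()); }
};

// Sketch of the committed pattern: run the op; on any std::exception, dump
// the error, the batch's instance ids, and each initialized parameter's
// name and length, then rethrow so the failure still propagates.
void RunOpOrDump(
    const std::function<void()>& run_op,
    const std::vector<std::string>& ins_id_vec,
    const std::vector<std::pair<std::string, FakeTensor>>& params) {
  try {
    run_op();
  } catch (std::exception& e) {
    fprintf(stderr, "error message: %s\n", e.what());
    std::string s;
    for (const auto& ins_id : ins_id_vec) {
      if (!s.empty()) s += ",";
      s += ins_id;
    }
    fprintf(stderr, "batch_size: %zu, ins_ids_vec: %s\n", ins_id_vec.size(),
            s.c_str());
    for (const auto& p : params) {
      if (!p.second.initialized) continue;  // mirrors the IsInitialized() check
      fprintf(stderr, "%s:%lld\n", p.first.c_str(),
              static_cast<long long>(p.second.numel()));
      fflush(stderr);
    }
    throw;  // bare `throw` keeps the dynamic exception type; the committed
            // code uses `throw e;`, which rethrows a sliced copy
  }
}
```

One detail worth noting in the committed hunk: a variable that is neither a LoDTensor nor a SelectedRows leaves tensor as nullptr, so the subsequent tensor->IsInitialized() call would dereference a null pointer; the sketch above checks initialization on a concrete object instead.
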
1 change: 1 addition & 0 deletions paddle/fluid/framework/hogwild_worker.cc
@@ -58,6 +58,7 @@ void HogwildWorker::CreateThreadScope(const ProgramDesc &program) {
   thread_scope_ = &root_scope_->NewScope();
 
   for (auto &var : block.AllVars()) {
+    all_param_.push_back(var->Name());
     if (var->Persistable()) {
       auto *ptr = root_scope_->Var(var->Name());
       InitializeVariable(ptr, var->GetType());
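
This one-line addition is what feeds the dump above: when the thread scope is created, the name of every variable in the block, persistable or not, is recorded in all_param_ so the catch handler can later look each one up via thread_scope_->FindVar. A minimal sketch of that two-step flow, with hypothetical names:

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  // Hypothetical scope: variable name -> element count.
  std::unordered_map<std::string, int64_t> scope = {{"fc_0.w_0", 4096},
                                                    {"fc_0.b_0", 64}};
  // Step 1: collect every name (mirrors all_param_.push_back(var->Name())).
  std::vector<std::string> all_param;
  for (const auto& kv : scope) all_param.push_back(kv.first);
  // Step 2: at dump time, resolve each name back to its variable
  // (mirrors thread_scope_->FindVar(param) in the catch handler).
  for (const auto& name : all_param) {
    auto it = scope.find(name);
    if (it == scope.end()) continue;
    std::cout << name << ":" << it->second << "\n";
  }
  return 0;
}
```
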
