Commit e82f7ec (parent f7c34d1): 98 changed files, 30,406 additions, 0 deletions.
Binary file added (+2.19 KB, not shown): ops_pytorch/2d_conv_random_k/__pycache__/fused_conv_random_k.cpython-36.pyc
Binary file added (+2.19 KB, not shown): ops_pytorch/2d_conv_random_k/__pycache__/fused_conv_select_k.cpython-36.pyc
@@ -0,0 +1,177 @@
#include <cstdio>
#include <ctime>
#include <cstring>   // memset
#include <cstdlib>   // rand, RAND_MAX
#include <cmath>     // sqrtf, ceil
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/common_shape_fns.h"
#include <cuda_runtime.h>

using namespace tensorflow;

REGISTER_OP("FusedConvRandomK")
    .Attr("H: int")
    .Attr("W: int")
    .Attr("npoints: int")
    .Attr("kernel_size_H: int")
    .Attr("kernel_size_W: int")
    .Attr("K: int")
    .Attr("flag_copy: int")
    .Attr("distance: float")
    .Attr("stride_h: int")
    .Attr("stride_w: int")
    .Input("xyz1: float32")      // (batch_size, H, W, 3) central points
    .Input("xyz2: float32")      // (batch_size, H/stride_h, W/stride_w, 3) queried points
    .Input("idx_n2: int32")      // (batch_size, npoints, 2)
    .Input("random_hw: int32")   // (kernel_h * kernel_w), 1-D shuffled kernel offsets
    .Output("selected_bhw_idx: int32")            // (batch_size, npoints, K, 3)
    .Output("selected_valid_idx: float32")        // (batch_size, npoints, kernel_h * kernel_w, 1)
    .Output("selected_valid_in_dis_idx: float32") // (batch_size, npoints, kernel_h * kernel_w, 1)
    .Output("selected_mask: float32")             // (batch_size, npoints, K, 1)
    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
        ::tensorflow::shape_inference::ShapeHandle dims1; // (batch_size, H, W, 3)
        TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 4, &dims1));

        int H, W, npoints, kernel_size_H, kernel_size_W, K, flag_copy;
        float distance;
        int stride_h, stride_w;

        TF_RETURN_IF_ERROR(c->GetAttr("H", &H));
        TF_RETURN_IF_ERROR(c->GetAttr("W", &W));
        TF_RETURN_IF_ERROR(c->GetAttr("npoints", &npoints));
        TF_RETURN_IF_ERROR(c->GetAttr("kernel_size_H", &kernel_size_H));
        TF_RETURN_IF_ERROR(c->GetAttr("kernel_size_W", &kernel_size_W));
        TF_RETURN_IF_ERROR(c->GetAttr("K", &K));
        TF_RETURN_IF_ERROR(c->GetAttr("flag_copy", &flag_copy));
        TF_RETURN_IF_ERROR(c->GetAttr("distance", &distance));
        TF_RETURN_IF_ERROR(c->GetAttr("stride_h", &stride_h));
        TF_RETURN_IF_ERROR(c->GetAttr("stride_w", &stride_w));

        ::tensorflow::shape_inference::ShapeHandle output_bhw_idx =
            c->MakeShape({ c->Dim(dims1, 0), npoints, K, 3 });
        ::tensorflow::shape_inference::ShapeHandle output_valid_idx =
            c->MakeShape({ c->Dim(dims1, 0), npoints, kernel_size_H * kernel_size_W, 1 });
        ::tensorflow::shape_inference::ShapeHandle output_valid_in_dis_idx =
            c->MakeShape({ c->Dim(dims1, 0), npoints, kernel_size_H * kernel_size_W, 1 });
        ::tensorflow::shape_inference::ShapeHandle output_mask =
            c->MakeShape({ c->Dim(dims1, 0), npoints, K, 1 });
        c->set_output(0, output_bhw_idx);
        c->set_output(1, output_valid_idx);
        c->set_output(2, output_valid_in_dis_idx);
        c->set_output(3, output_mask);
        return Status::OK();
    });

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////

void FusedConvRandomKLauncher(int batch_size, int H, int W, int npoints, int kernel_size_H, int kernel_size_W, int K, int flag_copy, float distance, int stride_h, int stride_w, const float *xyz1, const float *xyz2, const int *idx_n2, const int *random_hw, int *selected_bhw_idx, float *valid_idx, float *valid_in_dis_idx, float *selected_mask, int small_h, int small_w);

class FusedConvRandomKGpuOp : public OpKernel {
public:
    explicit FusedConvRandomKGpuOp(OpKernelConstruction* context) : OpKernel(context) {
        OP_REQUIRES_OK(context, context->GetAttr("npoints", &npoints_));
        OP_REQUIRES(context, npoints_ > 0, errors::InvalidArgument("FusedConv expects positive npoints"));

        OP_REQUIRES_OK(context, context->GetAttr("kernel_size_H", &kernel_size_H_));
        OP_REQUIRES(context, kernel_size_H_ > 0, errors::InvalidArgument("FusedConv expects positive kernel_size_H"));

        OP_REQUIRES_OK(context, context->GetAttr("kernel_size_W", &kernel_size_W_));
        OP_REQUIRES(context, kernel_size_W_ > 0, errors::InvalidArgument("FusedConv expects positive kernel_size_W"));

        OP_REQUIRES_OK(context, context->GetAttr("K", &K_));
        OP_REQUIRES(context, K_ > 0, errors::InvalidArgument("FusedConv expects positive K"));

        OP_REQUIRES_OK(context, context->GetAttr("flag_copy", &flag_copy_));
        OP_REQUIRES(context, flag_copy_ == 0 || flag_copy_ == 1, errors::InvalidArgument("FusedConv expects flag_copy to be 0 or 1"));

        OP_REQUIRES_OK(context, context->GetAttr("distance", &distance_));
        OP_REQUIRES(context, distance_ > 0, errors::InvalidArgument("FusedConv expects positive distance"));

        OP_REQUIRES_OK(context, context->GetAttr("stride_h", &stride_h_));
        OP_REQUIRES(context, stride_h_ > 0, errors::InvalidArgument("FusedConv expects positive stride_h"));

        OP_REQUIRES_OK(context, context->GetAttr("stride_w", &stride_w_));
        OP_REQUIRES(context, stride_w_ > 0, errors::InvalidArgument("FusedConv expects positive stride_w"));
    }

    void Compute(OpKernelContext* context) override {
        const Tensor& xyz1_tensor = context->input(0);
        OP_REQUIRES(context, xyz1_tensor.dims() == 4 && xyz1_tensor.shape().dim_size(3) == 3, errors::InvalidArgument("FusedConvRandomK expects (batch_size, H, W, 3) xyz1 shape."));
        int batch_size = xyz1_tensor.shape().dim_size(0);
        int H = xyz1_tensor.shape().dim_size(1);
        int W = xyz1_tensor.shape().dim_size(2);

        int H2 = ceil(H / double(stride_h_));
        int W2 = ceil(W / double(stride_w_));

        const Tensor& xyz2_tensor = context->input(1);
        OP_REQUIRES(context, xyz2_tensor.dims() == 4 && xyz2_tensor.shape().dim_size(1) == H2, errors::InvalidArgument("FusedConvRandomK expects (batch_size, H/stride_h, W/stride_w, 3) xyz2 shape."));

        const Tensor& idx_n2_tensor = context->input(2);
        OP_REQUIRES(context, idx_n2_tensor.shape().dim_size(2) == 2 && idx_n2_tensor.shape().dim_size(1) == npoints_, errors::InvalidArgument("FusedConv expects (batch_size, npoints, 2) idx_n2 shape."));

        const Tensor& random_hw_tensor = context->input(3);
        OP_REQUIRES(context, random_hw_tensor.shape().dim_size(0) == kernel_size_H_ * kernel_size_W_, errors::InvalidArgument("FusedConv expects (kernel_size_h * kernel_size_w) random_hw shape."));

        Tensor *selected_bhw_idx_tensor = nullptr;
        OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape{batch_size, npoints_, K_, 3}, &selected_bhw_idx_tensor));

        Tensor *valid_idx_tensor = nullptr;
        OP_REQUIRES_OK(context, context->allocate_output(1, TensorShape{batch_size, npoints_, kernel_size_H_ * kernel_size_W_, 1}, &valid_idx_tensor));

        Tensor *valid_in_dis_idx_tensor = nullptr;
        OP_REQUIRES_OK(context, context->allocate_output(2, TensorShape{batch_size, npoints_, kernel_size_H_ * kernel_size_W_, 1}, &valid_in_dis_idx_tensor));

        Tensor *selected_mask_tensor = nullptr;
        OP_REQUIRES_OK(context, context->allocate_output(3, TensorShape{batch_size, npoints_, K_, 1}, &selected_mask_tensor));

        auto xyz1_flat = xyz1_tensor.flat<float>();
        const float *xyz1 = &(xyz1_flat(0));

        auto xyz2_flat = xyz2_tensor.flat<float>();
        const float *xyz2 = &(xyz2_flat(0));

        auto idx_n2_flat = idx_n2_tensor.flat<int>();
        const int *idx_n2 = &(idx_n2_flat(0));

        auto random_hw_flat = random_hw_tensor.flat<int>();
        const int *random_hw = &(random_hw_flat(0));

        // Zero-initialize all outputs on the device before the kernel fills them.
        auto selected_bhw_idx_flat = selected_bhw_idx_tensor->flat<int>();
        int *selected_bhw_idx = &(selected_bhw_idx_flat(0));
        cudaMemset(selected_bhw_idx, 0, sizeof(int) * batch_size * npoints_ * K_ * 3);

        auto valid_idx_flat = valid_idx_tensor->flat<float>();
        float *valid_idx = &(valid_idx_flat(0));
        cudaMemset(valid_idx, 0, sizeof(float) * batch_size * npoints_ * kernel_size_H_ * kernel_size_W_);

        auto valid_in_dis_idx_flat = valid_in_dis_idx_tensor->flat<float>();
        float *valid_in_dis_idx = &(valid_in_dis_idx_flat(0));
        cudaMemset(valid_in_dis_idx, 0, sizeof(float) * batch_size * npoints_ * kernel_size_H_ * kernel_size_W_);

        auto selected_mask_flat = selected_mask_tensor->flat<float>();
        float *selected_mask = &(selected_mask_flat(0));
        cudaMemset(selected_mask, 0, sizeof(float) * batch_size * npoints_ * K_);

        FusedConvRandomKLauncher(batch_size, H, W, npoints_, kernel_size_H_, kernel_size_W_, K_, flag_copy_, distance_, stride_h_, stride_w_, xyz1, xyz2, idx_n2, random_hw, selected_bhw_idx, valid_idx, valid_in_dis_idx, selected_mask, H2, W2);
    }

private:
    int kernel_size_H_, kernel_size_W_, K_, flag_copy_, npoints_;
    float distance_;
    int stride_h_, stride_w_;
};

REGISTER_KERNEL_BUILDER(Name("FusedConvRandomK").Device(DEVICE_GPU), FusedConvRandomKGpuOp);
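For orientation, here is a minimal Python sketch of how the compiled op might be driven. It is hedged: TF 1.x graph mode and a visible GPU are assumed (the kernel is registered for DEVICE_GPU only), the `.so` name comes from the build script below, and all shapes and values (`H`, `W`, `npoints`, `distance`, and so on) are illustrative, not taken from the repo; the snake_case wrapper name `fused_conv_random_k` is what TensorFlow generates from `REGISTER_OP("FusedConvRandomK")`.

```python
import numpy as np
import tensorflow as tf

# Hypothetical sizes; the op itself fixes the shape contracts:
# xyz1 (b, H, W, 3), xyz2 (b, ceil(H/stride_h), ceil(W/stride_w), 3),
# idx_n2 (b, npoints, 2), random_hw (kernel_h * kernel_w,).
b, H, W, npoints, kh, kw, K, sh, sw = 1, 64, 512, 1024, 3, 3, 16, 1, 1

mod = tf.load_op_library('./fused_conv_so.so')  # built by the script below

xyz1 = tf.constant(np.random.rand(b, H, W, 3), dtype=tf.float32)
xyz2 = tf.constant(np.random.rand(b, H, W, 3), dtype=tf.float32)  # stride 1 -> same grid
idx_n2 = tf.constant(np.stack([np.random.randint(0, H, npoints),
                               np.random.randint(0, W, npoints)], -1)[None],
                     dtype=tf.int32)
random_hw = tf.constant(np.random.permutation(kh * kw), dtype=tf.int32)

sel_bhw, valid, valid_in_dis, mask = mod.fused_conv_random_k(
    xyz1=xyz1, xyz2=xyz2, idx_n2=idx_n2, random_hw=random_hw,
    H=H, W=W, npoints=npoints, kernel_size_H=kh, kernel_size_W=kw,
    K=K, flag_copy=0, distance=0.7, stride_h=sh, stride_w=sw)

with tf.Session() as sess:
    out = sess.run(sel_bhw)  # (b, npoints, K, 3) int32 (batch, h, w) indices
```

The `(batch, h, w)` triples in `selected_bhw_idx` are presumably consumed downstream with something like `tf.gather_nd` over the strided feature map, with `selected_mask` zeroing out unfilled slots.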
@@ -0,0 +1,6 @@
CUDA_PATH=/usr/local/cuda-9.0
$CUDA_PATH/bin/nvcc fused_conv_g.cu -o fused_conv_g.cu.o -c -O2 -DGOOGLE_CUDA=1 -x cu -Xcompiler -fPIC
TF_INC=$(python -c 'import tensorflow as tf; print(tf.sysconfig.get_include())')
TF_LIB=$(python -c 'import tensorflow as tf; print(tf.sysconfig.get_lib())')
g++ -std=c++11 fused_conv.cpp fused_conv_g.cu.o -o fused_conv_so.so -shared -fPIC -I $TF_INC -I $CUDA_PATH/include -L$TF_LIB -I$TF_INC/external/nsync/public -lcudart -L $CUDA_PATH/lib64/ -ltensorflow_framework -O2 -D_GLIBCXX_USE_CXX11_ABI=0
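After the build, a quick smoke test is to load the library and check that the generated wrapper exists. This short sketch assumes `fused_conv_so.so` sits in the current directory and that you run it with the same Python/TensorFlow that supplied `TF_INC` and `TF_LIB` above:

```python
import tensorflow as tf

# Raises tf.errors.NotFoundError on missing symbols or an ABI mismatch.
mod = tf.load_op_library('./fused_conv_so.so')

# CamelCase op names are exposed as snake_case Python functions.
print(mod.fused_conv_random_k)
```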
@@ -0,0 +1,168 @@
// input:  kernel_size(h,w), stride_size(h,w), distance(float), flag_padding, xyz (b,H,W,3), bhw_idx (b,H,W,3)
// output: selected_xyz (b, npoints, h*w, 3), selected_feature (b, npoints, h*w, 3)
#include <algorithm>
#include <stdio.h>
#include <stdlib.h>   /* srand, rand */
#include <time.h>     /* time */
#include <math.h>
#include <cuda_runtime.h>

__global__ void fused_conv_random_k_gpu(int batch_size, int H, int W, int npoints, int kernel_size_H, int kernel_size_W, int K, int flag_copy, float distance, int stride_h, int stride_w, const float *xyz1, const float *xyz2, const int *idx_n2, const int *random_hw, int *selected_bhw_idx, float *valid_idx, float *valid_in_dis_idx, float *selected_mask, int small_h, int small_w)
{
    int batch_index = blockIdx.x;  // one thread block per batch element
    int index_thread = threadIdx.x;
    int stride_thread = blockDim.x;

    int kernel_total = kernel_size_H * kernel_size_W;
    int selected_W_idx = 0, selected_H_idx = 0;

    float dist_square = distance * distance;

    int kernel_half_H = kernel_size_H / 2;
    int kernel_half_W = kernel_size_W / 2;

    // Advance all pointers to this batch element.
    xyz1 += batch_index * H * W * 3;
    xyz2 += batch_index * small_h * small_w * 3;
    idx_n2 += batch_index * npoints * 2;
    selected_bhw_idx += batch_index * npoints * K * 3;        // (b, npoints, K, 3)

    valid_idx += batch_index * npoints * kernel_total;        // (b, npoints, kernel_total, 1)
    valid_in_dis_idx += batch_index * npoints * kernel_total; // (b, npoints, kernel_total, 1)

    selected_mask += batch_index * npoints * K;               // (b, npoints, K, 1)

    ////////////// Fused Conv Between

    for (int current_n = index_thread; current_n < npoints; current_n += stride_thread) // threads stride over the sampled central points
    {
        // Fixed-size scratch buffers (assumes kernel_total <= 500); initialized
        // here but not otherwise used in this random-k variant.
        int idx_w[500], idx_h[500];
        float Dist[500];

        for (int ii = 0; ii < 500; ++ii)
        {
            idx_w[ii] = 0;
            idx_h[ii] = 0;
            Dist[ii] = 1e10f;
        }

        int num_select = 0;    // the number of selected points in each kernel
        int num_valid_idx = 0; // the number of valid points in each kernel

        selected_H_idx = idx_n2[current_n * 2 + 0]; // H index of the central point in the input 2D frame
        selected_W_idx = idx_n2[current_n * 2 + 1]; // W index of the central point in the input 2D frame

        float x_c = xyz1[selected_H_idx * W * 3 + selected_W_idx * 3 + 0];
        float y_c = xyz1[selected_H_idx * W * 3 + selected_W_idx * 3 + 1];
        float z_c = xyz1[selected_H_idx * W * 3 + selected_W_idx * 3 + 2];

        float Dist_c = max(x_c * x_c + y_c * y_c + z_c * z_c, 1e-10f);

        if (Dist_c <= 1e-10f) // central point of xyz1 sits at the origin: not valid
        {
            continue;
        }

        // The central point is valid; scan the kernel window over xyz2.
        for (int current_HW_idx = 0; current_HW_idx < kernel_total; ++current_HW_idx) // visit every kernel position, in shuffled order
        {
            int kernel_HW_idx = random_hw[current_HW_idx];

            int kernel_select_H_idx = selected_H_idx / stride_h + kernel_HW_idx / kernel_size_W - kernel_half_H; // randomly ordered kernel row
            int kernel_select_W_idx = selected_W_idx / stride_w + kernel_HW_idx % kernel_size_W - kernel_half_W; // randomly ordered kernel column

            if ((kernel_select_H_idx < 0) || (kernel_select_H_idx >= small_h)) // vertical padding region: not valid
            {
                continue;
            }

            if (kernel_select_W_idx < 0)
            {
                kernel_select_W_idx = small_w + kernel_select_W_idx; // cylindrical projection: wrap past the left edge
            }

            if (kernel_select_W_idx >= small_w)
            {
                kernel_select_W_idx = kernel_select_W_idx - small_w; // cylindrical projection: wrap past the right edge
            }

            // Not a padding position: read the queried point.
            float x_q = xyz2[kernel_select_H_idx * small_w * 3 + kernel_select_W_idx * 3 + 0];
            float y_q = xyz2[kernel_select_H_idx * small_w * 3 + kernel_select_W_idx * 3 + 1];
            float z_q = xyz2[kernel_select_H_idx * small_w * 3 + kernel_select_W_idx * 3 + 2];

            float Dist_q_0 = x_q * x_q + y_q * y_q + z_q * z_q;

            if (Dist_q_0 <= 1e-10f) // queried xyz2 point sits at the origin: not valid
            {
                continue;
            }

            // Valid xyz2 point: mark it and compute its distance to the center.
            valid_idx[current_n * kernel_total + num_valid_idx] = 1.0;
            ++num_valid_idx;

            float Dist_q = max((x_c - x_q) * (x_c - x_q) + (y_c - y_q) * (y_c - y_q) + (z_c - z_q) * (z_c - z_q), 1e-10f);

            if (Dist_q > dist_square) // too far from the central point: treated as not valid
            {
                continue;
            }

            if ((flag_copy == 1) && (num_select == 0)) // pre-fill all K slots with the first selected point
            {
                for (int k_idx = 0; k_idx < K; ++k_idx)
                {
                    selected_bhw_idx[current_n * K * 3 + k_idx * 3 + 0] = batch_index;
                    selected_bhw_idx[current_n * K * 3 + k_idx * 3 + 1] = kernel_select_H_idx;
                    selected_bhw_idx[current_n * K * 3 + k_idx * 3 + 2] = kernel_select_W_idx;
                    selected_mask[current_n * K + k_idx] = 1.0;
                }
            } // copy done

            selected_bhw_idx[current_n * K * 3 + num_select * 3 + 0] = batch_index;
            selected_bhw_idx[current_n * K * 3 + num_select * 3 + 1] = kernel_select_H_idx;
            selected_bhw_idx[current_n * K * 3 + num_select * 3 + 2] = kernel_select_W_idx;
            selected_mask[current_n * K + num_select] = 1.0;

            valid_in_dis_idx[current_n * kernel_total + num_select] = 1.0;

            ++num_select;

            if (num_select >= K) // K points collected: stop scanning this window
                break;
        }
    }
}

void FusedConvRandomKLauncher(int batch_size, int H, int W, int npoints, int kernel_size_H, int kernel_size_W, int K, int flag_copy, float distance, int stride_h, int stride_w, const float *xyz1, const float *xyz2, const int *idx_n2, const int *random_hw, int *selected_bhw_idx, float *valid_idx, float *valid_in_dis_idx, float *selected_mask, int small_h, int small_w)
{
    // One block per batch element, 256 threads striding over npoints.
    fused_conv_random_k_gpu<<<batch_size, 256>>>(batch_size, H, W, npoints, kernel_size_H, kernel_size_W, K, flag_copy, distance, stride_h, stride_w, xyz1, xyz2, idx_n2, random_hw, selected_bhw_idx, valid_idx, valid_in_dis_idx, selected_mask, small_h, small_w);

    //cudaDeviceSynchronize();
}
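Reading the kernel's control flow out of the flat index arithmetic can be fiddly, so here is a hedged NumPy restatement of the per-point selection for a single batch element with `flag_copy = 0` (the function and all names are mine for illustration, not part of the repo; the CUDA code above remains the reference). It mirrors the vertical-padding rejection, the cylindrical wrap on W, the origin-validity tests, the squared-distance gate, and the stop after K hits:

```python
import numpy as np

def select_k_reference(xyz1, xyz2, idx_n2, random_hw, kh, kw, K, distance, sh, sw):
    """NumPy sketch of fused_conv_random_k_gpu (batch size 1, flag_copy = 0)."""
    small_h, small_w, _ = xyz2.shape
    n = idx_n2.shape[0]
    sel = np.zeros((n, K, 3), np.int32)    # (batch, h, w) per neighbor; batch fixed to 0
    mask = np.zeros((n, K), np.float32)
    for i in range(n):
        hc, wc = idx_n2[i]
        c = xyz1[hc, wc]
        if (c * c).sum() <= 1e-10:         # central point at the origin: invalid
            continue
        num = 0
        for r in random_hw:                # kernel offsets, pre-shuffled by the caller
            h = hc // sh + r // kw - kh // 2
            w = wc // sw + r % kw - kw // 2
            if h < 0 or h >= small_h:      # vertical padding region
                continue
            w %= small_w                   # cylindrical wrap on the horizontal axis
            q = xyz2[h, w]
            if (q * q).sum() <= 1e-10:     # queried point at the origin: invalid
                continue
            if ((c - q) ** 2).sum() > distance * distance:
                continue                   # outside the distance gate
            sel[i, num] = (0, h, w)
            mask[i, num] = 1.0
            num += 1
            if num >= K:                   # K neighbors found: stop early
                break
    return sel, mask
```

Because the kernel stops at the first K in-range hits of a shuffled scan rather than sorting by distance, the result is a random K-subset of the in-range window, which is what the `random_k` in the op's name refers to.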