@@ -40,6 +40,11 @@ DEVICE void PrRoIPoolingDistributeDiffCUDA(T* diff, const T top_diff,
40
40
}
41
41
}
42
42
43
// Device-side accumulation callback: forwards (offset, data) to
// paddle::platform::CudaAtomicAdd. Going by that helper's name this is
// presumably an atomic `*offset += data`; the helper is defined outside this
// view -- TODO confirm its exact semantics and supported types for T.
// GPUPRROIPoolBackward hands this function to PrRoIPoolingCoorBackward
// together with the per-ROI gradient pointer (input_roi_grad_data + n * 4).
// NOTE(review): several loop iterations likely target the same ROI slot,
// which would be why an atomic update is used here -- verify against the
// index decomposition in the kernel.
+ template <typename T>
44
+ DEVICE void GPUAccumulateRois (T* offset, T data) {
45
+ paddle::platform::CudaAtomicAdd (offset, data);
46
+ }
47
+
43
48
template <typename T>
44
49
__global__ void GPUPRROIPoolForward (
45
50
const int nthreads, const T* input_data, const T* input_rois,
@@ -78,7 +83,7 @@ __global__ void GPUPRROIPoolForward(
78
83
T win_end_h = win_start_h + bin_size_h;
79
84
80
85
T win_size = max (static_cast <T>(0.0 ), bin_size_w * bin_size_h);
81
- int input_channel = (c * pooled_height + ph) * pooled_width + pw ;
86
+ int input_channel = c ;
82
87
const T* offset_input_data =
83
88
input_data +
84
89
(roi_batch_id * input_channels + input_channel) * height * width;
@@ -110,10 +115,12 @@ __global__ void GPUPRROIPoolForward(
110
115
111
116
template <typename T>
112
117
__global__ void GPUPRROIPoolBackward (
113
- const int nthreads, const T* input_rois, const T* output_grad_data,
114
- const float spatial_scale, const int input_channels, const int height,
115
- const int width, const int output_channels, const int pooled_height,
116
- const int pooled_width, const int * rois_batch_id_data, T* input_grad_data) {
118
+ const int nthreads, const T* in_data, const T* input_rois,
119
+ const T* output_grad_data, const float spatial_scale,
120
+ const int input_channels, const int height, const int width,
121
+ const int output_channels, const int pooled_height, const int pooled_width,
122
+ const int * rois_batch_id_data, T* input_grad_data, const T* out_data,
123
+ T* input_roi_grad_data) {
117
124
int index = blockIdx .x * blockDim .x + threadIdx .x ;
118
125
int offset = blockDim .x * gridDim .x ;
119
126
for (int i = index ; i < nthreads; i += offset) {
@@ -125,7 +132,7 @@ __global__ void GPUPRROIPoolBackward(
125
132
126
133
// set roi_batch_id
127
134
int roi_batch_id = rois_batch_id_data[n];
128
- int input_channel = (c * pooled_height + ph) * pooled_width + pw ;
135
+ int input_channel = c ;
129
136
int input_offset =
130
137
(roi_batch_id * input_channels + input_channel) * height * width;
131
138
T* offset_input_grad_data = input_grad_data + input_offset;
@@ -137,6 +144,7 @@ __global__ void GPUPRROIPoolBackward(
137
144
T roi_start_h = static_cast <T>(offset_input_rois[1 ]) * spatial_scale;
138
145
T roi_end_w = static_cast <T>(offset_input_rois[2 ]) * spatial_scale;
139
146
T roi_end_h = static_cast <T>(offset_input_rois[3 ]) * spatial_scale;
147
+ T* offset_input_roi_grad_data = input_roi_grad_data + n * 4 ;
140
148
141
149
T roi_width = max (roi_end_w - roi_start_w, static_cast <T>(0.0 ));
142
150
T roi_height = max (roi_end_h - roi_start_h, static_cast <T>(0.0 ));
@@ -171,6 +179,16 @@ __global__ void GPUPRROIPoolBackward(
171
179
height, width, PrRoIPoolingDistributeDiffCUDA<T>);
172
180
}
173
181
}
182
+
183
+ const T* offset_out_data = out_data + i;
184
+ const T* offset_in_data = in_data + input_offset;
185
+ PrRoIPoolingCoorBackward (
186
+ s_w, e_w, s_h, e_h, width, height, win_start_w, win_start_h, win_end_w,
187
+ win_end_h, pw, ph, pooled_width, pooled_height, win_size, spatial_scale,
188
+ offset_in_data, offset_out_data, offset_input_grad_data,
189
+ offset_input_roi_grad_data, GPUAccumulateRois<T>,
190
+ [](const T x, const T y) { return max (x, y); },
191
+ [](const T x, const T y) { return min (x, y); });
174
192
}
175
193
}
176
194
@@ -184,20 +202,15 @@ class GPUPRROIPoolOpKernel : public framework::OpKernel<T> {
184
202
185
203
auto pooled_height = ctx.Attr <int >(" pooled_height" );
186
204
auto pooled_width = ctx.Attr <int >(" pooled_width" );
187
- auto output_channels = ctx.Attr <int >(" output_channels" );
188
205
auto spatial_scale = ctx.Attr <float >(" spatial_scale" );
189
206
190
207
auto in_dims = in->dims ();
191
208
int batch_size = in_dims[0 ];
192
209
int input_channels = in_dims[1 ];
210
+ auto output_channels = input_channels;
193
211
int height = in_dims[2 ];
194
212
int width = in_dims[3 ];
195
213
196
- PADDLE_ENFORCE_EQ (input_channels,
197
- output_channels * pooled_height * pooled_width,
198
- " the channels of input X should equal the product of "
199
- " output_channels x pooled_height x pooled_width" );
200
-
201
214
int rois_num = rois->dims ()[0 ];
202
215
if (rois_num == 0 ) return ;
203
216
@@ -245,17 +258,20 @@ class GPUPRROIPoolGradOpKernel : public framework::OpKernel<T> {
245
258
void Compute (const framework::ExecutionContext& ctx) const override {
246
259
auto * in = ctx.Input <Tensor>(" X" );
247
260
auto * rois = ctx.Input <LoDTensor>(" ROIs" );
261
+ auto * out = ctx.Input <framework::Tensor>(" Out" );
248
262
249
263
auto * output_grad = ctx.Input <Tensor>(framework::GradVarName (" Out" ));
250
264
auto * input_grad = ctx.Output <Tensor>(framework::GradVarName (" X" ));
265
+ auto * input_roi_grad =
266
+ ctx.Output <LoDTensor>(framework::GradVarName (" ROIs" ));
251
267
252
268
auto pooled_height = ctx.Attr <int >(" pooled_height" );
253
269
auto pooled_width = ctx.Attr <int >(" pooled_width" );
254
- auto output_channels = ctx.Attr <int >(" output_channels" );
255
270
auto spatial_scale = ctx.Attr <float >(" spatial_scale" );
256
271
257
272
int rois_num = rois->dims ()[0 ];
258
273
int input_channels = in->dims ()[1 ];
274
+ auto output_channels = input_channels;
259
275
int height = in->dims ()[2 ];
260
276
int width = in->dims ()[3 ];
261
277
@@ -280,6 +296,8 @@ class GPUPRROIPoolGradOpKernel : public framework::OpKernel<T> {
280
296
input_grad->mutable_data <T>(ctx.GetPlace ());
281
297
math::SetConstant<DeviceContext, T> set_zero;
282
298
set_zero (ctx.cuda_device_context (), input_grad, static_cast <T>(0 ));
299
+ input_roi_grad->mutable_data <T>(ctx.GetPlace ());
300
+ set_zero (ctx.cuda_device_context (), input_roi_grad, static_cast <T>(0 ));
283
301
284
302
int output_grad_size = output_grad->numel ();
285
303
int blocks = NumBlocks (output_grad_size);
@@ -288,10 +306,12 @@ class GPUPRROIPoolGradOpKernel : public framework::OpKernel<T> {
288
306
if (output_grad_size > 0 ) {
289
307
GPUPRROIPoolBackward<
290
308
T><<<blocks, threads, 0 , ctx.cuda_device_context().stream()>>> (
291
- output_grad_size, rois->data <T>(), output_grad->data <T>(),
292
- spatial_scale, input_channels, height, width, output_channels,
293
- pooled_height, pooled_width, rois_batch_id_list_gpu.data <int >(),
294
- input_grad->mutable_data <T>(ctx.GetPlace ()));
309
+ output_grad_size, in->data <T>(), rois->data <T>(),
310
+ output_grad->data <T>(), spatial_scale, input_channels, height,
311
+ width, output_channels, pooled_height, pooled_width,
312
+ rois_batch_id_list_gpu.data <int >(),
313
+ input_grad->mutable_data <T>(ctx.GetPlace ()), out->data <T>(),
314
+ input_roi_grad->mutable_data <T>(ctx.GetPlace ()));
295
315
}
296
316
}
297
317
}
0 commit comments