@@ -11,28 +11,13 @@ extern "C" void cwc_bench_runtime(ccv_convnet_t* convnet, ccv_array_t* categoriz
11
11
{
12
12
int batch = params.mini_batch ;
13
13
int i;
14
+ const int device_id = 0 ;
14
15
_cwc_convnet_alloc_reserved_both (convnet, batch, 0 , params.layer_params );
15
- int out_rows, out_cols, out_partition;
16
- for (i = 0 ; i < convnet->count ; i++)
17
- {
18
- ccv_convnet_layer_t *layer = convnet->layers + i;
19
- _ccv_convnet_layer_derive_output (layer, layer->input .matrix .rows , layer->input .matrix .cols , &out_rows, &out_cols, &out_partition);
20
- switch (layer->type )
21
- {
22
- case CCV_CONVNET_CONVOLUTIONAL:
23
- cudaFree (GPU (convnet)->device [0 ].forwards [i]);
24
- GPU (convnet)->device [0 ].forwards [i] = 0 ;
25
- // since for the benchmark, output doesn't match input, this chooses the maximum ones to allocate
26
- cudaMalloc (&GPU (convnet)->device [0 ].forwards [i], sizeof (float ) * ccv_max (out_rows * out_cols * layer->net .convolutional .count , layer->input .matrix .rows * layer->input .matrix .cols * layer->input .matrix .channels ) * batch);
27
- assert (GPU (convnet)->device [0 ].forwards [i]);
28
- break ;
29
- }
30
- }
31
16
cwc_convnet_context_t * context = GPU (convnet)->contexts ;
32
17
for (i = 0 ; i < convnet->rows * convnet->cols * convnet->channels ; i++)
33
18
convnet->mean_activity ->data .f32 [i] = 128 ;
34
- _cwc_convnet_batch_formation (0 , categorizeds, convnet->mean_activity , 0 , 0 , 0 , 0 , ccv_size (128 , 128 ), convnet->rows , convnet->cols , convnet->channels , 1000 , 0 , batch, 0 , batch, context->host [0 ].input , context->host [0 ].c );
35
- cudaMemcpy (context->device [0 ].input , context->host [0 ].input , sizeof (float ) * convnet->rows * convnet->cols * convnet->channels * batch, cudaMemcpyHostToDevice);
19
+ cwc_convnet_batch_formation (0 , categorizeds, convnet->mean_activity , 0 , 0 , 0 , 0 , 0 , ccv_size (128 , 128 ), 128 , 128 , convnet->rows , convnet->cols , convnet->channels , 1000 , 0 , batch, 0 , batch, context->host [device_id ].input , context->host [device_id ].c );
20
+ cudaMemcpy (context->device [device_id ].input , context->host [device_id ].input , sizeof (float ) * convnet->rows * convnet->cols * convnet->channels * batch, cudaMemcpyHostToDevice);
36
21
37
22
cudaEvent_t overallStart;
38
23
cudaEvent_t overallStop;
@@ -43,21 +28,21 @@ extern "C" void cwc_bench_runtime(ccv_convnet_t* convnet, ccv_array_t* categoriz
43
28
cudaEventCreate (&start);
44
29
cudaEventCreate (&stop);
45
30
float elapsed_time;
46
- VARY (GPU (convnet)->device [0 ].layers + 0 )->convolutional .forward .x = 4 ;
47
- VARY (GPU (convnet)->device [0 ].layers + 0 )->convolutional .forward .y = 8 ;
48
- VARY (GPU (convnet)->device [0 ].layers + 0 )->convolutional .forward .z = 32 ;
49
- VARY (GPU (convnet)->device [0 ].layers + 1 )->convolutional .forward .x = 4 ;
50
- VARY (GPU (convnet)->device [0 ].layers + 1 )->convolutional .forward .y = 8 ;
51
- VARY (GPU (convnet)->device [0 ].layers + 1 )->convolutional .forward .z = 32 ;
52
- VARY (GPU (convnet)->device [0 ].layers + 2 )->convolutional .forward .x = 4 ;
53
- VARY (GPU (convnet)->device [0 ].layers + 2 )->convolutional .forward .y = 8 ;
54
- VARY (GPU (convnet)->device [0 ].layers + 2 )->convolutional .forward .z = 32 ;
55
- VARY (GPU (convnet)->device [0 ].layers + 3 )->convolutional .forward .x = 4 ;
56
- VARY (GPU (convnet)->device [0 ].layers + 3 )->convolutional .forward .y = 8 ;
57
- VARY (GPU (convnet)->device [0 ].layers + 3 )->convolutional .forward .z = 32 ;
58
- VARY (GPU (convnet)->device [0 ].layers + 4 )->convolutional .forward .x = 4 ;
59
- VARY (GPU (convnet)->device [0 ].layers + 4 )->convolutional .forward .y = 8 ;
60
- VARY (GPU (convnet)->device [0 ].layers + 4 )->convolutional .forward .z = 32 ;
31
+ EXTRA (GPU (convnet)->device [0 ].layers + 0 )->vary . convolutional .forward .x = 4 ;
32
+ EXTRA (GPU (convnet)->device [0 ].layers + 0 )->vary . convolutional .forward .y = 8 ;
33
+ EXTRA (GPU (convnet)->device [0 ].layers + 0 )->vary . convolutional .forward .z = 32 ;
34
+ EXTRA (GPU (convnet)->device [0 ].layers + 1 )->vary . convolutional .forward .x = 4 ;
35
+ EXTRA (GPU (convnet)->device [0 ].layers + 1 )->vary . convolutional .forward .y = 8 ;
36
+ EXTRA (GPU (convnet)->device [0 ].layers + 1 )->vary . convolutional .forward .z = 32 ;
37
+ EXTRA (GPU (convnet)->device [0 ].layers + 2 )->vary . convolutional .forward .x = 4 ;
38
+ EXTRA (GPU (convnet)->device [0 ].layers + 2 )->vary . convolutional .forward .y = 8 ;
39
+ EXTRA (GPU (convnet)->device [0 ].layers + 2 )->vary . convolutional .forward .z = 32 ;
40
+ EXTRA (GPU (convnet)->device [0 ].layers + 3 )->vary . convolutional .forward .x = 4 ;
41
+ EXTRA (GPU (convnet)->device [0 ].layers + 3 )->vary . convolutional .forward .y = 8 ;
42
+ EXTRA (GPU (convnet)->device [0 ].layers + 3 )->vary . convolutional .forward .z = 32 ;
43
+ EXTRA (GPU (convnet)->device [0 ].layers + 4 )->vary . convolutional .forward .x = 4 ;
44
+ EXTRA (GPU (convnet)->device [0 ].layers + 4 )->vary . convolutional .forward .y = 8 ;
45
+ EXTRA (GPU (convnet)->device [0 ].layers + 4 )->vary . convolutional .forward .z = 32 ;
61
46
cudaEventRecord (overallStart, context->device [0 ].data_stream );
62
47
for (i = 0 ; i < convnet->count ; i++)
63
48
{
@@ -68,7 +53,7 @@ extern "C" void cwc_bench_runtime(ccv_convnet_t* convnet, ccv_array_t* categoriz
68
53
cudaEventSynchronize (stop);
69
54
cudaEventElapsedTime (&elapsed_time, start, stop);
70
55
if (layer->type == CCV_CONVNET_CONVOLUTIONAL)
71
- printf (" %d %d %d, elapsed time for layer %d fprop: %f milliseconds\n " , VARY (layer)->convolutional .forward .x , VARY (layer)->convolutional .forward .y , VARY (layer)->convolutional .forward .z , i + 1 , elapsed_time);
56
+ printf (" %d %d %d, elapsed time for layer %d fprop: %f milliseconds\n " , EXTRA (layer)->vary . convolutional .forward .x , EXTRA (layer)->vary . convolutional .forward .y , EXTRA (layer)->vary . convolutional .forward .z , i + 1 , elapsed_time);
72
57
else
73
58
printf (" elapsed time for layer %d fprop: %f milliseconds\n " , i + 1 , elapsed_time);
74
59
}
@@ -78,33 +63,33 @@ extern "C" void cwc_bench_runtime(ccv_convnet_t* convnet, ccv_array_t* categoriz
78
63
printf (" forward pass %f milliseconds\n " , elapsed_time);
79
64
80
65
/*
81
- VARY (GPU(convnet)->device[0].layers + 0)->convolutional.backward.coefficient.x = 1;
82
- VARY (GPU(convnet)->device[0].layers + 0)->convolutional.backward.coefficient.y = 3;
83
- VARY (GPU(convnet)->device[0].layers + 0)->convolutional.backward.coefficient.z = 1;
84
- VARY (GPU(convnet)->device[0].layers + 3)->convolutional.backward.coefficient.x = 4;
85
- VARY (GPU(convnet)->device[0].layers + 3)->convolutional.backward.coefficient.y = 4;
86
- VARY (GPU(convnet)->device[0].layers + 3)->convolutional.backward.coefficient.z = 16;
87
- VARY (GPU(convnet)->device[0].layers + 3)->convolutional.backward.gradient.x = 4;
88
- VARY (GPU(convnet)->device[0].layers + 3)->convolutional.backward.gradient.y = 6;
89
- VARY (GPU(convnet)->device[0].layers + 3)->convolutional.backward.gradient.z = 24;
90
- VARY (GPU(convnet)->device[0].layers + 6)->convolutional.backward.coefficient.x = 8;
91
- VARY (GPU(convnet)->device[0].layers + 6)->convolutional.backward.coefficient.y = 3;
92
- VARY (GPU(convnet)->device[0].layers + 6)->convolutional.backward.coefficient.z = 32;
93
- VARY (GPU(convnet)->device[0].layers + 6)->convolutional.backward.gradient.x = 4;
94
- VARY (GPU(convnet)->device[0].layers + 6)->convolutional.backward.gradient.y = 8;
95
- VARY (GPU(convnet)->device[0].layers + 6)->convolutional.backward.gradient.z = 32;
96
- VARY (GPU(convnet)->device[0].layers + 7)->convolutional.backward.coefficient.x = 8;
97
- VARY (GPU(convnet)->device[0].layers + 7)->convolutional.backward.coefficient.y = 3;
98
- VARY (GPU(convnet)->device[0].layers + 7)->convolutional.backward.coefficient.z = 32;
99
- VARY (GPU(convnet)->device[0].layers + 7)->convolutional.backward.gradient.x = 4;
100
- VARY (GPU(convnet)->device[0].layers + 7)->convolutional.backward.gradient.y = 8;
101
- VARY (GPU(convnet)->device[0].layers + 7)->convolutional.backward.gradient.z = 32;
102
- VARY (GPU(convnet)->device[0].layers + 8)->convolutional.backward.coefficient.x = 8;
103
- VARY (GPU(convnet)->device[0].layers + 8)->convolutional.backward.coefficient.y = 4;
104
- VARY (GPU(convnet)->device[0].layers + 8)->convolutional.backward.coefficient.z = 32;
105
- VARY (GPU(convnet)->device[0].layers + 8)->convolutional.backward.gradient.x = 4;
106
- VARY (GPU(convnet)->device[0].layers + 8)->convolutional.backward.gradient.y = 8;
107
- VARY (GPU(convnet)->device[0].layers + 8)->convolutional.backward.gradient.z = 32;
66
+ EXTRA (GPU(convnet)->device[0].layers + 0)->vary. convolutional.backward.coefficient.x = 1;
67
+ EXTRA (GPU(convnet)->device[0].layers + 0)->vary. convolutional.backward.coefficient.y = 3;
68
+ EXTRA (GPU(convnet)->device[0].layers + 0)->vary. convolutional.backward.coefficient.z = 1;
69
+ EXTRA (GPU(convnet)->device[0].layers + 3)->vary. convolutional.backward.coefficient.x = 4;
70
+ EXTRA (GPU(convnet)->device[0].layers + 3)->vary. convolutional.backward.coefficient.y = 4;
71
+ EXTRA (GPU(convnet)->device[0].layers + 3)->vary. convolutional.backward.coefficient.z = 16;
72
+ EXTRA (GPU(convnet)->device[0].layers + 3)->vary. convolutional.backward.gradient.x = 4;
73
+ EXTRA (GPU(convnet)->device[0].layers + 3)->vary. convolutional.backward.gradient.y = 6;
74
+ EXTRA (GPU(convnet)->device[0].layers + 3)->vary. convolutional.backward.gradient.z = 24;
75
+ EXTRA (GPU(convnet)->device[0].layers + 6)->vary. convolutional.backward.coefficient.x = 8;
76
+ EXTRA (GPU(convnet)->device[0].layers + 6)->vary. convolutional.backward.coefficient.y = 3;
77
+ EXTRA (GPU(convnet)->device[0].layers + 6)->vary. convolutional.backward.coefficient.z = 32;
78
+ EXTRA (GPU(convnet)->device[0].layers + 6)->vary. convolutional.backward.gradient.x = 4;
79
+ EXTRA (GPU(convnet)->device[0].layers + 6)->vary. convolutional.backward.gradient.y = 8;
80
+ EXTRA (GPU(convnet)->device[0].layers + 6)->vary. convolutional.backward.gradient.z = 32;
81
+ EXTRA (GPU(convnet)->device[0].layers + 7)->vary. convolutional.backward.coefficient.x = 8;
82
+ EXTRA (GPU(convnet)->device[0].layers + 7)->vary. convolutional.backward.coefficient.y = 3;
83
+ EXTRA (GPU(convnet)->device[0].layers + 7)->vary. convolutional.backward.coefficient.z = 32;
84
+ EXTRA (GPU(convnet)->device[0].layers + 7)->vary. convolutional.backward.gradient.x = 4;
85
+ EXTRA (GPU(convnet)->device[0].layers + 7)->vary. convolutional.backward.gradient.y = 8;
86
+ EXTRA (GPU(convnet)->device[0].layers + 7)->vary. convolutional.backward.gradient.z = 32;
87
+ EXTRA (GPU(convnet)->device[0].layers + 8)->vary. convolutional.backward.coefficient.x = 8;
88
+ EXTRA (GPU(convnet)->device[0].layers + 8)->vary. convolutional.backward.coefficient.y = 4;
89
+ EXTRA (GPU(convnet)->device[0].layers + 8)->vary. convolutional.backward.coefficient.z = 32;
90
+ EXTRA (GPU(convnet)->device[0].layers + 8)->vary. convolutional.backward.gradient.x = 4;
91
+ EXTRA (GPU(convnet)->device[0].layers + 8)->vary. convolutional.backward.gradient.y = 8;
92
+ EXTRA (GPU(convnet)->device[0].layers + 8)->vary. convolutional.backward.gradient.z = 32;
108
93
float* a = 0;
109
94
cudaMalloc(&a, sizeof(float) * 1000 * batch);
110
95
cudaMemcpy(a, GPU(convnet)->device[0].forwards[convnet->count - 1], sizeof(float) * 1000 * batch, cudaMemcpyDeviceToDevice);
@@ -120,7 +105,7 @@ extern "C" void cwc_bench_runtime(ccv_convnet_t* convnet, ccv_array_t* categoriz
120
105
if (context->device[0].dor[i])
121
106
{
122
107
int out_rows, out_cols, out_partition;
123
- _ccv_convnet_layer_derive_output (layer, layer->input.matrix.rows, layer->input.matrix.cols, &out_rows, &out_cols, &out_partition);
108
+ ccv_convnet_make_output (layer, layer->input.matrix.rows, layer->input.matrix.cols, &out_rows, &out_cols, &out_partition);
124
109
_cwc_kern_mute_neuron
125
110
<<<out_rows * out_cols * layer->net.convolutional.count, batch, 0, context->device[0].data_stream>>>
126
111
(i == convnet->count - 1 ? a : GPU(convnet)->device[0].backwards[i + 1], context->device[0].dor[i]);
@@ -153,7 +138,7 @@ extern "C" void cwc_bench_runtime(ccv_convnet_t* convnet, ccv_array_t* categoriz
153
138
cudaEventSynchronize(stop);
154
139
cudaEventElapsedTime(&elapsed_time, start, stop);
155
140
if (layer->type == CCV_CONVNET_CONVOLUTIONAL)
156
- printf("%d %d %d, %d %d %d, elapsed time for layer %d bprop: %f milliseconds\n", VARY (layer)->convolutional.backward.coefficient.x, VARY (layer)->convolutional.backward.coefficient.y, VARY (layer)->convolutional.backward.coefficient.z, VARY (layer)->convolutional.backward.gradient.x, VARY (layer)->convolutional.backward.gradient.y, VARY (layer)->convolutional.backward.gradient.z, i + 1, elapsed_time);
141
+ printf("%d %d %d, %d %d %d, elapsed time for layer %d bprop: %f milliseconds\n", EXTRA (layer)->vary. convolutional.backward.coefficient.x, EXTRA (layer)->vary. convolutional.backward.coefficient.y, EXTRA (layer)->vary. convolutional.backward.coefficient.z, EXTRA (layer)->vary. convolutional.backward.gradient.x, EXTRA (layer)->vary. convolutional.backward.gradient.y, EXTRA (layer)->vary. convolutional.backward.gradient.z, i + 1, elapsed_time);
157
142
else
158
143
printf("elapsed time for layer %d bprop: %f milliseconds\n", i + 1, elapsed_time);
159
144
}
0 commit comments