-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathctranspose_conj.cu
271 lines (227 loc) · 7.36 KB
/
ctranspose_conj.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
/*
-- MAGMA (version 2.1.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
@date August 2016
@generated from magmablas/ztranspose_conj.cu, normal z -> c, Tue Aug 30 09:38:34 2016
@author Stan Tomov
@author Mark Gates
*/
#include "magma_internal.h"
#define PRECISION_c
#if defined(PRECISION_z)
#define NX 16
#else
#define NX 32
#endif
#define NB 32
#define NY 8
// nearly same code in ctranspose.cu
// tile M-by-N matrix with ceil(M/NB) by ceil(N/NB) tiles sized NB-by-NB.
// uses NX-by-NY threads, where NB/NX, NB/NY, and NX/NY divide evenly.
// subtile each NB-by-NB tile with (NB/NX) subtiles sized NX-by-NB
// for each subtile
// load NX-by-NB subtile transposed from A into sA, as (NB/NY) blocks sized NX-by-NY
// save NB-by-NX subtile from sA into AT, as (NB/NX)*(NX/NY) blocks sized NX-by-NY
// A += NX
// AT += NX*ldat
//
// e.g., with NB=32, NX=32, NY=8 ([sdc] precisions)
// load 32x32 subtile as 4 blocks of 32x8 columns: (A11 A12 A13 A14 )
// save 32x32 subtile as 1*4 blocks of 32x8 columns: (AT11 AT12 AT13 AT14)
//
// e.g., with NB=32, NX=16, NY=8 (z precision)
// load 16x32 subtile as 4 blocks of 16x8 columns: (A11 A12 A13 A14)
// save 32x16 subtile as 2*2 blocks of 16x8 columns: (AT11 AT12)
// (AT21 AT22)
// Conjugate-transpose one NB-by-NB tile of the m-by-n matrix A into AT.
// The tile is selected by (blockIdx.x, blockIdx.y); the block runs NX-by-NY
// threads and processes the tile in NB/NX subtiles (see the comment above).
// Partial edge tiles are handled by the (i < m) / (j < n) guards.
static __device__ void
ctranspose_conj_device(
    int m, int n,
    const magmaFloatComplex *A, int lda,
    magmaFloatComplex *AT, int ldat)
{
    // Staging buffer for one NX-by-NB subtile, stored transposed.
    // The "+1" pads the inner dimension to avoid shared-memory bank
    // conflicts when columns of sA are read back below.
    __shared__ magmaFloatComplex sA[NB][NX+1];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int ibx = blockIdx.x*NB;    // global row offset of this block's tile in A
    int iby = blockIdx.y*NB;    // global column offset of this block's tile in A
    int i, j;

    // Point A at this thread's first element of the tile, and AT at the
    // mirrored position in the transposed output.
    A += ibx + tx + (iby + ty)*lda;
    AT += iby + tx + (ibx + ty)*ldat;

    #pragma unroll
    for( int tile=0; tile < NB/NX; ++tile ) {
        // load NX-by-NB subtile transposed from A into sA
        i = ibx + tx + tile*NX;     // global row in A this thread reads
        j = iby + ty;               // global column (first of NB/NY strided reads)
        if (i < m) {
            #pragma unroll
            for( int j2=0; j2 < NB; j2 += NY ) {
                if (j + j2 < n) {
                    // Conjugate on load; the transpose happens via the
                    // swapped [ty+j2][tx] indexing.
                    sA[ty + j2][tx] = MAGMA_C_CONJ( A[j2*lda] );
                }
            }
        }
        // All threads must finish writing sA before anyone reads it.
        __syncthreads();

        // save NB-by-NX subtile from sA into AT
        i = iby + tx;               // global row in AT (column of A)
        j = ibx + ty + tile*NX;     // global column in AT (row of A)
        #pragma unroll
        for( int i2=0; i2 < NB; i2 += NX ) {
            if (i + i2 < n) {
                #pragma unroll
                for( int j2=0; j2 < NX; j2 += NY ) {
                    if (j + j2 < m) {
                        AT[i2 + j2*ldat] = sA[tx + i2][ty + j2];
                    }
                }
            }
        }
        // sA is reused by the next subtile; wait until all reads are done.
        __syncthreads();

        // move to next subtile
        A += NX;
        AT += NX*ldat;
    }
}
/*
    Kernel wrapper to call the device function for a single matrix.
    Expected launch configuration (set up by magmablas_ctranspose_conj_q):
    grid = ( ceil(m/NB), ceil(n/NB) ), block = ( NX, NY ).
*/
__global__
void ctranspose_conj_kernel(
    int m, int n,
    const magmaFloatComplex *A, int lda,
    magmaFloatComplex *AT, int ldat)
{
    ctranspose_conj_device(m, n, A, lda, AT, ldat);
}
/*
    Batched kernel wrapper: each z-slice of the grid handles one matrix
    of the batch, dispatching to the same per-matrix device routine.
    Expected launch: grid = ( ceil(m/NB), ceil(n/NB), batchCount ),
    block = ( NX, NY ).
*/
__global__
void ctranspose_conj_kernel_batched(
    int m, int n,
    magmaFloatComplex **dA_array, int lda,
    magmaFloatComplex **dAT_array, int ldat)
{
    // gridDim.z indexes the batch; pick out this block's matrix pair.
    const int batch = blockIdx.z;
    ctranspose_conj_device( m, n, dA_array[batch], lda, dAT_array[batch], ldat );
}
/***************************************************************************//**
Purpose
-------
ctranspose_conj_q copies and conjugate-transposes a matrix dA to matrix dAT.
Same as ctranspose_conj, but adds queue argument.
Arguments
---------
@param[in]
m INTEGER
The number of rows of the matrix dA. M >= 0.
@param[in]
n INTEGER
The number of columns of the matrix dA. N >= 0.
@param[in]
dA COMPLEX array, dimension (LDDA,N)
The M-by-N matrix dA.
@param[in]
ldda INTEGER
The leading dimension of the array dA. LDDA >= M.
@param[out]
dAT COMPLEX array, dimension (LDDAT,M)
On exit, the N-by-M matrix dAT, the conjugate-transpose of dA.
@param[in]
lddat INTEGER
The leading dimension of the array dAT. LDDAT >= N.
@param[in]
queue magma_queue_t
Queue to execute in.
@ingroup magma_transpose
*******************************************************************************/
extern "C" void
magmablas_ctranspose_conj_q(
magma_int_t m, magma_int_t n,
magmaFloatComplex_const_ptr dA, magma_int_t ldda,
magmaFloatComplex_ptr dAT, magma_int_t lddat,
magma_queue_t queue )
{
magma_int_t info = 0;
if ( m < 0 )
info = -1;
else if ( n < 0 )
info = -2;
else if ( ldda < m )
info = -4;
else if ( lddat < n )
info = -6;
if ( info != 0 ) {
magma_xerbla( __func__, -(info) );
return; //info;
}
/* Quick return */
if ( (m == 0) || (n == 0) )
return;
dim3 threads( NX, NY );
dim3 grid( magma_ceildiv( m, NB ), magma_ceildiv( n, NB ) );
ctranspose_conj_kernel<<< grid, threads, 0, queue->cuda_stream() >>>
( m, n, dA, ldda, dAT, lddat );
}
/***************************************************************************//**
Purpose
-------
ctranspose_conj_batched copies and conjugate-transposes each matrix dA_array[i] to the corresponding matrix dAT_array[i], executing on the given queue.
Arguments
---------
@param[in]
m INTEGER
The number of rows of the matrix dA. M >= 0.
@param[in]
n INTEGER
The number of columns of the matrix dA. N >= 0.
@param[in]
dA_array
COMPLEX* array, dimension (batchCount)
array of pointers to the matrices dA, where each dA is of dimension (LDDA,N)
The M-by-N matrix dA.
@param[in]
ldda INTEGER
The leading dimension of the array dA. LDDA >= M.
@param[out]
dAT_array
COMPLEX* array, dimension (batchCount)
array of pointers to the matrices dAT, where each dAT is of dimension (LDDAT,M)
On exit, each N-by-M matrix dAT is the conjugate-transpose of the corresponding dA.
@param[in]
lddat INTEGER
The leading dimension of the array dAT. LDDAT >= N.
@param[in]
queue magma_queue_t
Queue to execute in.
@param[in]
batchCount Number of matrices in dA_array and dAT_array
@ingroup magma_transpose_batched
*******************************************************************************/
extern "C" void
magmablas_ctranspose_conj_batched(
magma_int_t m, magma_int_t n,
magmaFloatComplex **dA_array, magma_int_t ldda,
magmaFloatComplex **dAT_array, magma_int_t lddat,
magma_int_t batchCount, magma_queue_t queue )
{
magma_int_t info = 0;
if ( m < 0 )
info = -1;
else if ( n < 0 )
info = -2;
else if ( ldda < m )
info = -4;
else if ( lddat < n )
info = -6;
if ( info != 0 ) {
magma_xerbla( __func__, -(info) );
return; //info;
}
/* Quick return */
if ( (m == 0) || (n == 0) )
return;
dim3 threads( NX, NY );
dim3 grid( magma_ceildiv( m, NB ), magma_ceildiv( n, NB ), batchCount );
ctranspose_conj_kernel_batched<<< grid, threads, 0, queue->cuda_stream() >>>
( m, n, dA_array, ldda, dAT_array, lddat );
}