-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathclarf.cu
156 lines (130 loc) · 4.7 KB
/
clarf.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
/*
-- MAGMA (version 2.1.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
@date August 2016
@generated from magmablas/zlarf.cu, normal z -> c, Tue Aug 30 09:38:30 2016
@author Azzam Haidar
*/
#include "magma_internal.h"
#include "magma_templates.h"
// 512 is maximum number of threads for CUDA capability 1.x
#define BLOCK_SIZE 512
#define BLOCK_SIZEx 32
#define BLOCK_SIZEy 16
/******************************************************************************/
__global__
void magma_clarf_kernel(
int m, const magmaFloatComplex *dv, const magmaFloatComplex *dtau,
magmaFloatComplex *dc, int lddc )
{
if ( !MAGMA_C_EQUAL(*dtau, MAGMA_C_ZERO) ) {
const int tx = threadIdx.x;
dc = dc + blockIdx.x * lddc;
__shared__ magmaFloatComplex sum[ BLOCK_SIZE ];
magmaFloatComplex tmp;
/* perform w := v**H * C */
if (tx == 0)
tmp = dc[0]; //since V[0] should be one
else
tmp = MAGMA_C_ZERO;
for( int j = tx+1; j < m; j += BLOCK_SIZE ) {
tmp += MAGMA_C_MUL( MAGMA_C_CONJ( dv[j] ), dc[j] );
}
sum[tx] = tmp;
magma_sum_reduce< BLOCK_SIZE >( tx, sum );
/* C := C - v * w */
__syncthreads();
tmp = - MAGMA_C_CONJ(*dtau) * sum[0];
for( int j = m-tx-1; j > 0; j -= BLOCK_SIZE )
dc[j] += tmp * dv[j];
if (tx == 0) dc[0] += tmp;
}
}
/******************************************************************************/
__global__
void magma_clarf_smkernel(
int m, int n, magmaFloatComplex *dv, magmaFloatComplex *dtau,
magmaFloatComplex *dc, int lddc )
{
if ( ! MAGMA_C_EQUAL(*dtau, MAGMA_C_ZERO) ) {
const int i = threadIdx.x, col= threadIdx.y;
for( int k = col; k < n; k += BLOCK_SIZEy ) {
dc = dc + k * lddc;
__shared__ magmaFloatComplex sum[ BLOCK_SIZEx ][ BLOCK_SIZEy + 1];
magmaFloatComplex lsum;
/* w := v**H * C */
lsum = MAGMA_C_ZERO;
for( int j = i; j < m; j += BLOCK_SIZEx ) {
if (j == 0)
lsum += MAGMA_C_MUL( MAGMA_C_ONE, dc[j] );
else
lsum += MAGMA_C_MUL( MAGMA_C_CONJ( dv[j] ), dc[j] );
}
sum[i][col] = lsum;
magma_sum_reduce_2d< BLOCK_SIZEx, BLOCK_SIZEy+1 >( i, col, sum );
/* C := C - v * w */
__syncthreads();
magmaFloatComplex z__1 = - MAGMA_C_CONJ(*dtau) * sum[0][col];
for( int j = m-i-1; j >= 0; j -= BLOCK_SIZEx ) {
if (j == 0)
dc[j] += z__1;
else
dc[j] += z__1 * dv[j];
}
}
}
}
/******************************************************************************/
/*
Apply a complex elementary reflector H to a complex M-by-N
matrix C from the left. H is represented in the form
H = I - tau * v * v**H
where tau is a complex scalar and v is a complex vector.
If tau = 0, then H is taken to be the unit matrix.
To apply H**H (the conjugate transpose of H), supply conjg(tau)
instead tau.
This routine uses only one SM (block).
*/
extern "C" void
magma_clarf_sm(
magma_int_t m, magma_int_t n,
magmaFloatComplex *dv, magmaFloatComplex *dtau,
magmaFloatComplex *dc, magma_int_t lddc,
magma_queue_t queue )
{
dim3 blocks( 1 );
dim3 threads( BLOCK_SIZEx, BLOCK_SIZEy );
magma_clarf_smkernel
<<< blocks, threads, 0, queue->cuda_stream() >>>
( m, n, dv, dtau, dc, lddc );
}
/***************************************************************************//**
Apply a complex elementary reflector H to a complex M-by-N
matrix C from the left. H is represented in the form
H = I - tau * v * v**H
where tau is a complex scalar and v is a complex vector.
If tau = 0, then H is taken to be the unit matrix.
To apply H**H (the conjugate transpose of H), supply conjg(tau)
instead tau.
*******************************************************************************/
extern "C" magma_int_t
magma_clarf_gpu(
magma_int_t m, magma_int_t n,
magmaFloatComplex_const_ptr dv,
magmaFloatComplex_const_ptr dtau,
magmaFloatComplex_ptr dC, magma_int_t lddc,
magma_queue_t queue )
{
dim3 grid( n, 1, 1 );
dim3 threads( BLOCK_SIZE );
if ( n > 0 ) {
magma_clarf_kernel
<<< grid, threads, 0, queue->cuda_stream() >>>
( m, dv, dtau, dC, lddc);
}
// The computation can be done on 1 SM with the following routine.
// magma_clarf_sm(m, n, dv, dtau, dc, lddc);
return MAGMA_SUCCESS;
}