-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathclascl_2x2.cu
142 lines (117 loc) · 4.11 KB
/
clascl_2x2.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
/*
-- MAGMA (version 2.1.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
@date August 2016
@generated from magmablas/zlascl_2x2.cu, normal z -> c, Tue Aug 30 09:38:32 2016
@author Ichitaro Yamazaki
*/
#include "magma_internal.h"
#define NB 64
#define A(i,j) (A[(i) + (j)*lda])
#define W(i,j) (W[(i) + (j)*ldw])
// Scales the m x 2 panel A by the inverse of the 2-by-2 pivot stored at the
// top of W (lower-triangular storage: pivot entries W(0,0), W(1,0), W(1,1);
// the panel to be scaled starts at row 2 of W).
// Launch: grid of ceil(m/NB) blocks, NB threads each (see magmablas_clascl_2x2_q);
// each thread handles one row of A. No shared memory, no synchronization.
__global__ void
clascl_2x2_lower(
    int m,
    const magmaFloatComplex* W, int ldw,
    magmaFloatComplex* A, int lda)
{
    int ind = blockIdx.x * NB + threadIdx.x;

    // Tail-block threads past row m have nothing to do; returning early also
    // skips the three complex divisions below. Safe: no __syncthreads() here.
    if (ind >= m)
        return;

    // Factor the 2x2 pivot. D21 is the off-diagonal entry; D11, D22 are the
    // diagonal entries divided by D21 (resp. conj(D21)).
    magmaFloatComplex D21 = W( 1, 0 );
    magmaFloatComplex D11 = MAGMA_C_DIV( W( 1, 1 ), D21 );
    magmaFloatComplex D22 = MAGMA_C_DIV( W( 0, 0 ), MAGMA_C_CONJ( D21 ) );
    // Float literals keep this in single precision (a bare 1.0 would promote
    // the division to double on the device).
    float T = 1.0f / ( MAGMA_C_REAL( D11*D22 ) - 1.0f );
    D21 = MAGMA_C_DIV( MAGMA_C_MAKE(T,0.0f), D21 );

    // Apply inv(pivot) to row ind of the 2-column panel W(2:m+1, 0:1).
    A( ind, 0 ) = MAGMA_C_CONJ( D21 )*( D11*W( 2+ind, 0 )-W( 2+ind, 1 ) );
    A( ind, 1 ) = D21*( D22*W( 2+ind, 1 )-W( 2+ind, 0 ) );
}
// Scales the m x 2 panel A by the inverse of the 2-by-2 pivot stored at the
// bottom of W (upper-triangular storage: pivot entries W(m,0), W(m,1),
// W(m+1,1); the panel to be scaled occupies rows 0..m-1 of W).
// Launch: grid of ceil(m/NB) blocks, NB threads each (see magmablas_clascl_2x2_q);
// each thread handles one row of A. No shared memory, no synchronization.
__global__ void
clascl_2x2_upper(
    int m,
    const magmaFloatComplex *W, int ldw,
    magmaFloatComplex* A, int lda)
{
    int ind = blockIdx.x * NB + threadIdx.x;

    // Tail-block threads past row m have nothing to do; returning early also
    // skips the three complex divisions below. Safe: no __syncthreads() here.
    if (ind >= m)
        return;

    // Factor the 2x2 pivot (mirror of the lower-storage kernel: here D11 is
    // divided by conj(D21) and D22 by D21).
    magmaFloatComplex D21 = W( m, 1 );
    magmaFloatComplex D11 = MAGMA_C_DIV( W( m+1, 1 ), MAGMA_C_CONJ( D21 ) );
    magmaFloatComplex D22 = MAGMA_C_DIV( W( m, 0 ), D21 );
    // Float literals keep this in single precision (a bare 1.0 would promote
    // the division to double on the device).
    float T = 1.0f / ( MAGMA_C_REAL( D11*D22 ) - 1.0f );
    D21 = MAGMA_C_DIV( MAGMA_C_MAKE(T,0.0f), D21 );

    // Apply inv(pivot) to row ind of the 2-column panel W(0:m-1, 0:1).
    A( ind, 0 ) = D21*( D11*W( ind, 0 )-W( ind, 1 ) );
    A( ind, 1 ) = MAGMA_C_CONJ( D21 )*( D22*W( ind, 1 )-W( ind, 0 ) );
}
/***************************************************************************//**
Purpose
-------
CLASCL_2x2 scales the M by M complex matrix A by the 2-by-2 pivot.
TYPE specifies that A may be upper or lower triangular.
Arguments
---------
@param[in]
type magma_type_t
TYPE indicates the storage type of the input matrix A.
= MagmaLower: lower triangular matrix.
= MagmaUpper: upper triangular matrix.
Other formats that LAPACK supports, MAGMA does not currently support.
@param[in]
m INTEGER
The number of rows of the matrix A. M >= 0.
@param[in]
dW COMPLEX array, dimension (2*lddw)
The matrix containing the 2-by-2 pivot.
@param[in]
lddw INTEGER
The leading dimension of the array W. LDDW >= max(1,M).
@param[in,out]
dA COMPLEX array, dimension (LDDA,N)
The matrix to be scaled by dW. See TYPE for the
storage type.
@param[in]
ldda INTEGER
The leading dimension of the array A. LDDA >= max(1,M).
@param[out]
info INTEGER
- = 0: successful exit
- < 0: if INFO = -i, the i-th argument had an illegal value.
@param[in]
queue magma_queue_t
Queue to execute in.
@ingroup magma_lascl_2x2
*******************************************************************************/
// Host wrapper: validates arguments, then launches the lower- or upper-storage
// kernel on the given queue's stream. On an illegal argument, sets *info = -i
// (i = position of the offending argument) and reports via magma_xerbla.
extern "C" void
magmablas_clascl_2x2_q(
    magma_type_t type, magma_int_t m,
    magmaFloatComplex_const_ptr dW, magma_int_t lddw,
    magmaFloatComplex_ptr dA, magma_int_t ldda,
    magma_queue_t queue,
    magma_int_t *info )
{
    *info = 0;
    if ( type != MagmaLower && type != MagmaUpper )
        *info = -1;
    else if ( m < 0 )
        *info = -2;
    else if ( lddw < max(1,m) )
        *info = -4;   // lddw is the 4th argument
    else if ( ldda < max(1,m) )
        *info = -6;   // ldda is the 6th argument (was wrongly reported as -4)

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return;
    }

    // Quick return: a grid dimension of 0 (magma_ceildiv(0,NB) == 0) is an
    // invalid CUDA launch configuration, so bail out before launching.
    if ( m == 0 )
        return;

    dim3 threads( NB );
    dim3 grid( magma_ceildiv( m, NB ) );

    if (type == MagmaLower) {
        clascl_2x2_lower <<< grid, threads, 0, queue->cuda_stream() >>> (m, dW, lddw, dA, ldda);
    }
    else {
        clascl_2x2_upper <<< grid, threads, 0, queue->cuda_stream() >>> (m, dW, lddw, dA, ldda);
    }
}