clarfg_devicesfunc.cuh
/*
    -- MAGMA (version 2.1.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date August 2016

       @author Tingxing Dong

       @generated from magmablas/zlarfg_devicesfunc.cuh, normal z -> c, Tue Aug 30 09:39:22 2016
*/

#include "magma_templates.h"

#ifndef MAGMABLAS_CLARFG_DEVICES_Z_H
#define MAGMABLAS_CLARFG_DEVICES_Z_H

#define COMPLEX
/******************************************************************************/
/*
    LAPACK clarfg: compute the norm, scale, and generate the Householder vector.
    Assumes swork, sscale, and scale are already allocated in shared memory.
    BLOCK_SIZE is set outside this file; swork has size BLOCK_SIZE.
*/
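/*
    Arguments (as used by the code below):
      n       length of [dalpha, dx]; dx itself has n-1 entries.
      dalpha  pointer to alpha; overwritten with beta on exit.
      dx      vector of length n-1 with stride incx; when the norm is nonzero,
              overwritten with x/(alpha - beta), the tail of the Householder
              vector v (v(1) = 1 is implicit).
      dtau    on exit, the Householder scalar tau.
      swork   shared-memory workspace of BLOCK_SIZE floats for the reductions.
      sscale  shared-memory float holding the max-abs scaling factor.
      scale   shared-memory complex scalar holding 1/(alpha - beta).
*/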
static __device__ void
clarfg_device(
    magma_int_t n,
    magmaFloatComplex* dalpha, magmaFloatComplex* dx, int incx,
    magmaFloatComplex* dtau, float* swork, float* sscale, magmaFloatComplex* scale)
{
    const int tx = threadIdx.x;

    magmaFloatComplex tmp;

    // find max of [dalpha, dx], to use as scaling to avoid unnecessary under- and overflow
    if ( tx == 0 ) {
        tmp = *dalpha;
        #ifdef COMPLEX
        swork[tx] = max( fabs(real(tmp)), fabs(imag(tmp)) );
        #else
        swork[tx] = fabs(tmp);
        #endif
    }
    else {
        swork[tx] = 0;
    }
    if (tx < BLOCK_SIZE)
    {
        for( int j = tx; j < n-1; j += BLOCK_SIZE ) {
            tmp = dx[j*incx];
            #ifdef COMPLEX
            swork[tx] = max( swork[tx], max( fabs(real(tmp)), fabs(imag(tmp)) ));
            #else
            swork[tx] = max( swork[tx], fabs(tmp) );
            #endif
        }
    }

    magma_max_reduce<BLOCK_SIZE>( tx, swork );

    if ( tx == 0 )
        *sscale = swork[0];
    __syncthreads();

    // sum norm^2 of dx/sscale
    // dx has length n-1
    if (tx < BLOCK_SIZE) swork[tx] = 0;
    if ( *sscale > 0 ) {
        if (tx < BLOCK_SIZE)
        {
            for( int j = tx; j < n-1; j += BLOCK_SIZE ) {
                tmp = dx[j*incx] / *sscale;
                swork[tx] += real(tmp)*real(tmp) + imag(tmp)*imag(tmp);
            }
        }
        magma_sum_reduce<BLOCK_SIZE>( tx, swork );
    }
    if ( tx == 0 ) {
        magmaFloatComplex alpha = *dalpha;

        if ( swork[0] == 0 && imag(alpha) == 0 ) {
            // H = I
            *dtau = MAGMA_C_ZERO;
        }
        else {
            // beta = norm( [dalpha, dx] )
            float beta;
            tmp  = alpha / *sscale;
            beta = *sscale * sqrt( real(tmp)*real(tmp) + imag(tmp)*imag(tmp) + swork[0] );
            beta = -copysign( beta, real(alpha) );

            // todo: deal with badly scaled vectors (see lapack's larfg)
            *dtau   = MAGMA_C_MAKE( (beta - real(alpha)) / beta, -imag(alpha) / beta );
            *dalpha = MAGMA_C_MAKE( beta, 0 );
            *scale  = 1 / (alpha - beta);
        }
    }

    // scale x (if norm was not 0)
    __syncthreads();
    if ( swork[0] != 0 ) {
        if (tx < BLOCK_SIZE)
        {
            for( int j = tx; j < n-1; j += BLOCK_SIZE ) {
                dx[j*incx] *= *scale;
            }
        }
    }
}
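
/******************************************************************************/
/*
    Usage sketch (illustrative only, compiled out by default): a minimal kernel
    showing how clarfg_device can be driven to generate one Householder
    reflector. The guard macro and kernel name are assumptions for this
    example, not part of MAGMA; the including .cu file is expected to define
    BLOCK_SIZE before including this header.
*/
#ifdef CLARFG_USAGE_SKETCH
// launch with a single thread block of BLOCK_SIZE threads, e.g.
//     clarfg_sketch_kernel<<< 1, BLOCK_SIZE >>>( n, dalpha, dx, dtau );
static __global__ void
clarfg_sketch_kernel(
    magma_int_t n,
    magmaFloatComplex* dalpha, magmaFloatComplex* dx,
    magmaFloatComplex* dtau)
{
    // shared workspace expected by clarfg_device
    __shared__ float             swork[BLOCK_SIZE];  // reduction buffer
    __shared__ float             sscale;             // max-abs scaling factor
    __shared__ magmaFloatComplex sscale2;            // holds 1/(alpha - beta)

    clarfg_device( n, dalpha, dx, 1, dtau, swork, &sscale, &sscale2 );
}
#endif /* CLARFG_USAGE_SKETCH */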
#endif /* MAGMABLAS_CLARFG_DEVICES_Z_H */