-
Notifications
You must be signed in to change notification settings - Fork 251
/
Copy pathmul_waksman.c
131 lines (105 loc) · 3.98 KB
/
mul_waksman.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
/*
Copyright (C) 2024 Éric Schost
Copyright (C) 2024 Vincent Neiger
This file is part of FLINT.
FLINT is free software: you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License (LGPL) as published
by the Free Software Foundation; either version 3 of the License, or
(at your option) any later version. See <https://www.gnu.org/licenses/>.
*/
#include "fmpz.h"
#include "fmpz_vec.h"
#include "fmpz_mat.h"
/** ------------------------------------------------------------ */
/** Waksman's algorithm for matrix multiplication                */
/** does n^3/2+O(n^2) products, but many additions               */
/** good for small matrices with large entries                   */
/**                                                              */
/** Sets C = A * B with m = A->r, n = B->r, p = B->c, so that A  */
/** is read as m x n, B as n x p and C is written as m x p.      */
/** C may be the same object as A or B: the product is then      */
/** computed into a temporary and swapped into C.                */
/**                                                              */
/** Principle: inner indices are processed in pairs. For         */
/**   a = A[l][2t], a' = A[l][2t+1],                             */
/**   b = B[2t][k], b' = B[2t+1][k],                             */
/** we have                                                      */
/**   (a + b')(a' + b) = a a' + b b' + (a b + a' b')             */
/**   (a - b')(a' - b) = a a' + b b' - (a b + a' b')             */
/** Summing the first product over all pairs gives               */
/**   P[l][k] = R_l + S_k + D[l][k]                              */
/** where D is the wanted product (on paired indices),           */
/** R_l = sum a a' depends only on the row and S_k = sum b b'    */
/** only on the column. The second product is evaluated only on  */
/** row 0 and column 0; this is enough to recover R_l + S_0 and  */
/** S_k - S_0 and hence to remove R_l + S_k from every entry.    */
/** If n is odd, the last inner index is handled classically.    */
/** ------------------------------------------------------------ */
void fmpz_mat_mul_waksman(fmpz_mat_t C, const fmpz_mat_t A, const fmpz_mat_t B)
{
    slong m = A->r;
    slong n = B->r;
    slong p = B->c;

    if (m == 0 || n == 0 || p == 0)
    {
        fmpz_mat_zero(C);
        return;
    }

    /* C is zeroed and then accumulated into while A and B are still being
       read, so an aliased output must go through a temporary. */
    if (C == A || C == B)
    {
        fmpz_mat_t T;
        fmpz_mat_init(T, m, p);
        fmpz_mat_mul_waksman(T, A, B);
        fmpz_mat_swap_entrywise(C, T);
        fmpz_mat_clear(T);
        return;
    }

    slong l, j, k;

    /* Single scratch vector: the first p entries hold the "minus" products
       for row 0, the next m entries those for column 0 (entry 0 of Ccol is
       unused). _fmpz_vec_init returns entries already set to zero. */
    fmpz * Crow = _fmpz_vec_init(p + m);
    fmpz * Ccol = Crow + p;

    /* number of pairs of inner indices */
    slong np = n >> 1;

    fmpz_t val0, val1, val2, crow;

    fmpz_init(val0);
    fmpz_init(val1);
    fmpz_init(val2);
    fmpz_init(crow);

    /* C is used as an accumulator below */
    fmpz_mat_zero(C);

    for (j = 1; j <= np; j++)
    {
        /* the current pair of inner indices is (j2-1, j2) */
        slong j2 = (j << 1) - 1;

        /* row 0: both the "plus" product (into C) and the "minus" product
           (into Crow) */
        for (k = 0; k < p; k++)
        {
            fmpz_add(val1, fmpz_mat_entry(A, 0, j2-1), fmpz_mat_entry(B, j2, k));
            fmpz_add(val2, fmpz_mat_entry(A, 0, j2), fmpz_mat_entry(B, j2-1, k));
            fmpz_addmul(fmpz_mat_entry(C, 0, k), val1, val2);

            fmpz_sub(val1, fmpz_mat_entry(A, 0, j2-1), fmpz_mat_entry(B, j2, k));
            fmpz_sub(val2, fmpz_mat_entry(A, 0, j2), fmpz_mat_entry(B, j2-1, k));
            fmpz_addmul(Crow + k, val1, val2);
        }

        /* column 0 (rows >= 1): "plus" product into C, "minus" into Ccol */
        for (l = 1; l < m; l++)
        {
            fmpz_add(val1, fmpz_mat_entry(A, l, j2-1), fmpz_mat_entry(B, j2, 0));
            fmpz_add(val2, fmpz_mat_entry(A, l, j2), fmpz_mat_entry(B, j2-1, 0));
            fmpz_addmul(fmpz_mat_entry(C, l, 0), val1, val2);

            fmpz_sub(val1, fmpz_mat_entry(A, l, j2-1), fmpz_mat_entry(B, j2, 0));
            fmpz_sub(val2, fmpz_mat_entry(A, l, j2), fmpz_mat_entry(B, j2-1, 0));
            fmpz_addmul(Ccol + l, val1, val2);
        }

        /* remaining entries: only the "plus" product, i.e. one
           multiplication per entry and per pair */
        for (k = 1; k < p; k++)
        {
            for (l = 1; l < m; l++)
            {
                fmpz_add(val1, fmpz_mat_entry(A, l, j2-1), fmpz_mat_entry(B, j2, k));
                fmpz_add(val2, fmpz_mat_entry(A, l, j2), fmpz_mat_entry(B, j2-1, k));
                fmpz_addmul(fmpz_mat_entry(C, l, k), val1, val2);
            }
        }
    }

    /* Column 0: plus + minus = 2 (R_l + S_0) is even, so the truncating
       shift is an exact halving. Ccol[l] becomes R_l + S_0 and C[l][0]
       becomes D[l][0]. */
    for (l = 1; l < m; l++)
    {
        fmpz_add(val1, Ccol + l, fmpz_mat_entry(C, l, 0));
        fmpz_tdiv_q_2exp(Ccol+ l, val1, 1);
        fmpz_sub(fmpz_mat_entry(C, l, 0), fmpz_mat_entry(C, l, 0), Ccol + l);
    }

    /* Entry (0,0): val0 = R_0 + S_0, kept for the column corrections */
    fmpz_add(val1, Crow, fmpz_mat_entry(C, 0, 0));
    fmpz_tdiv_q_2exp(val0, val1, 1);
    fmpz_sub(fmpz_mat_entry(C, 0, 0), fmpz_mat_entry(C, 0, 0), val0);

    for (k = 1; k < p; k++)
    {
        /* val1 = R_0 + S_k; fixes C[0][k] */
        fmpz_add(crow, Crow + k, fmpz_mat_entry(C, 0, k));
        fmpz_tdiv_q_2exp(val1, crow, 1);
        fmpz_sub(fmpz_mat_entry(C, 0, k), fmpz_mat_entry(C, 0, k), val1);

        /* crow = S_k - S_0, so crow + Ccol[l] = R_l + S_k */
        fmpz_sub(crow, val1, val0);
        for (l = 1; l < m; l++)
        {
            fmpz_sub(val2, fmpz_mat_entry(C, l, k), crow);
            fmpz_sub(fmpz_mat_entry(C, l, k), val2, Ccol + l);
        }
    }

    /* odd inner dimension: the last index has no partner, add its
       contribution classically */
    if ((n & 1) == 1)
    {
        for (l = 0; l < m; l++)
        {
            for (k = 0; k < p; k++)
            {
                fmpz_addmul(fmpz_mat_entry(C, l, k), fmpz_mat_entry(A, l, n-1), fmpz_mat_entry(B, n-1, k));
            }
        }
    }

    _fmpz_vec_clear(Crow, p + m);

    fmpz_clear(val0);
    fmpz_clear(val1);
    fmpz_clear(val2);
    fmpz_clear(crow);
}