forked from pytorch/FBGEMM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCodeGenHelpers.h
128 lines (119 loc) · 3.38 KB
/
CodeGenHelpers.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
/*
* Copyright (c) Facebook, Inc. and its affiliates.
* All rights reserved.
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <asmjit/asmjit.h>
namespace fbgemm {
namespace x86 = asmjit::x86;
/**
 * @brief Create instruction sequence to generate 16-bit 1s
 * @tparam T Register type of destination, e.g., x86::Ymm or x86::Zmm
 *
 * @param a    Emitter the instruction sequence is appended to.
 * @param dest Once the instruction sequence is executed,
 *             dest[0:15] will have 0x0001, dest[16:31]
 *             will have 0x0001 and so on
 */
template <typename T>
void gen16BitVectorOne(x86::Emitter* a, T dest) {
  // Step 1: set every bit of dest.
  if (dest.isZmm()) {
    // The 512-bit form of vpcmpeqw writes its result to a mask (k) register,
    // so it cannot be used to fill a zmm register. vpternlogd with truth
    // table 0xff produces all ones regardless of the input bits.
    a->vpternlogd(dest, dest, dest, 0xff);
  } else {
    // Comparing a register with itself is always "equal", which sets all bits.
    a->vpcmpeqw(dest, dest, dest);
  }
  // Step 2: a logical right shift by 15 leaves only the lowest bit of every
  // 16-bit lane set, i.e. 0x0001.
  a->vpsrlw(dest, dest, 15);
}
/**
 * @brief Create instruction sequence to generate 8-bit 1s
 * @tparam T Register type of destination, e.g., x86::Ymm or x86::Zmm
 *
 * @param a    Emitter the instruction sequence is appended to.
 * @param dest Once the instruction sequence is executed,
 *             dest[0:7] will have 0x01, dest[8:15]
 *             will have 0x01 and so on
 */
template <typename T>
void gen8BitVectorOne(x86::Emitter* a, T dest) {
  // Step 1: set every bit of dest, so that each byte holds 0xFF (-1).
  if (dest.isZmm()) {
    // The 512-bit form of vpcmpeqw writes its result to a mask (k) register,
    // so it cannot be used to fill a zmm register. vpternlogd with truth
    // table 0xff produces all ones regardless of the input bits.
    a->vpternlogd(dest, dest, dest, 0xff);
  } else {
    // Comparing a register with itself is always "equal", which sets all bits.
    a->vpcmpeqw(dest, dest, dest);
  }
  // Step 2: the absolute value of -1 in every byte is 0x01.
  a->vpabsb(dest, dest);
}
/**
 * @brief Emit the three-instruction sequence that computes s32 += U8 * I8
 * @tparam T Register type of the operands, e.g., x86::Ymm or x86::Zmm
 *
 * @param a           Emitter the instruction sequence is appended to.
 * @param aReg        First multiplicand; passed to vpmaddubsw as the operand
 *                    whose bytes are treated as unsigned.
 * @param bReg        Second multiplicand; passed to vpmaddubsw as the operand
 *                    whose bytes are treated as signed.
 * @param cReg        32-bit accumulator; the products are added into it, so
 *                    it contains the result.
 * @param oneReg16Bit Multiplier for the 16-bit to 32-bit widening step.
 *                    Expected to hold 0x0001 in every 16-bit lane (e.g. as
 *                    produced by gen16BitVectorOne) -- not checked here.
 * @param tmpReg      Scratch register, overwritten by this sequence.
 */
template <typename T>
void genU8I8S32FMA(
    x86::Emitter* a,
    T aReg,
    T bReg,
    T cReg,
    T oneReg16Bit,
    T tmpReg) {
  // u8 x s8 products; adjacent pairs are summed into 16-bit lanes of tmpReg.
  a->vpmaddubsw(tmpReg, aReg, bReg);

  // Multiply the 16-bit lanes by oneReg16Bit and sum adjacent pairs,
  // producing 32-bit lanes in tmpReg.
  a->vpmaddwd(tmpReg, oneReg16Bit, tmpReg);

  // Accumulate the 32-bit partial sums into cReg.
  a->vpaddd(cReg, tmpReg, cReg);
}
/**
 * @brief Sum every group of 4 consecutive uint8 values of src into a 32-bit
 *        number and add that sum to the matching 32-bit lane of dest.
 *        i.e., dest[0:31] is incremented by
 *        src[0:7] + src[8:15] + src[16:23] + src[24:31]
 * @tparam T Register type of destination, e.g., x86::Ymm or x86::Zmm
 *
 * @param a           Emitter the instruction sequence is appended to.
 * @param src         Register holding the uint8 values to be summed.
 * @param dest        Accumulator; contains the result.
 * @param oneReg16Bit Multiplier for the 16-bit to 32-bit widening step.
 *                    Expected to hold 0x0001 in every 16-bit lane (e.g. as
 *                    produced by gen16BitVectorOne) -- not checked here.
 * @param tmpReg      Scratch register, overwritten by this sequence.
 */
template <typename T>
void genU8Sum4(
    x86::Emitter* a,
    T src,
    T dest,
    T oneReg16Bit,
    T tmpReg) {
  // Fill tmpReg with 8-bit ones so the multiply below acts as a plain sum.
  gen8BitVectorOne(a, tmpReg);

  // src bytes (unsigned) times 1; adjacent pairs are summed into 16-bit lanes.
  a->vpmaddubsw(tmpReg, src, tmpReg);

  // Multiply the 16-bit lanes by oneReg16Bit and sum adjacent pairs,
  // producing 32-bit lanes.
  a->vpmaddwd(tmpReg, tmpReg, oneReg16Bit);

  // Accumulate the 32-bit sums into dest.
  a->vpaddd(dest, tmpReg, dest);

  // Disabled alternative sequence, kept for reference:
  //   a->vxorps(tmpReg, tmpReg, tmpReg);
  //   a->vmpsadbw(tmpReg, src, tmpReg, static_cast<asmjit::Imm>(0));
  //   a->vpermilps(tmpReg, tmpReg, static_cast<asmjit::Imm>(4));
  //   a->vpmovzxwd(tmpReg, tmpReg.half());
  //   a->vpaddd(dest, tmpReg, dest);
}
/**
 * @brief Sum every group of 8 consecutive uint8 values of src into a 16-bit
 *        number and add that sum into dest.
 *        i.e., dest[0:15] is incremented by
 *        src[0:7] + src[8:15] + src[16:23] + src[24:31] +
 *        src[32:39] + src[40:47] + src[48:55] + src[56:63]
 *
 *        and
 *
 *        dest[64:79] is incremented by
 *        src[64:71] + src[72:79] + src[80:87] + src[88:95] +
 *        src[96:103] + src[104:111] + src[112:119] + src[120:127]
 *
 *        and so on for each following 64-bit group.
 *
 * @tparam T Register type of destination, e.g., x86::Ymm or x86::Zmm
 *
 * @param a      Emitter the instruction sequence is appended to.
 * @param src    Register holding the uint8 values to be summed.
 * @param dest   Accumulator; contains the result.
 * @param tmpReg Scratch register, overwritten by this sequence.
 */
template <typename T>
void genU8Sum8(x86::Emitter* a, T src, T dest, T tmpReg) {
  // Zero the scratch register.
  a->vxorps(tmpReg, tmpReg, tmpReg);

  // Sum of absolute differences against zero is simply the sum of the
  // unsigned bytes in each 8-byte group.
  a->vpsadbw(tmpReg, src, tmpReg);

  // Accumulate the per-group sums into dest.
  a->vpaddd(dest, tmpReg, dest);
}
/**
 * @brief Broadcast lower 8-bits of src to destination vector
 *        register.
 * @tparam T Register type of destination, e.g., x86::Ymm or x86::Zmm
 *
 * @param a    Emitter the instruction sequence is appended to.
 * @param src  General-purpose register whose lowest byte is broadcast.
 * @param dest Vector register that receives the byte in every 8-bit lane.
 */
template <typename T>
void broadcast8Bit(x86::Emitter* a, x86::Gp src, T dest) {
  // Both movq and the register-source form of vpbroadcastb operate on an XMM
  // register. half() only yields an XMM when T is Ymm (for Zmm it yields a
  // YMM), so take the 128-bit view of dest explicitly.
  const x86::Xmm destXmm = dest.xmm();

  // Move src into the low lanes of dest.
  a->movq(destXmm, src);

  // Replicate the lowest byte across every 8-bit lane of dest.
  a->vpbroadcastb(dest, destXmm);
}
} // namespace fbgemm