-
Notifications
You must be signed in to change notification settings - Fork 234
/
Copy pathdevice.c
253 lines (231 loc) · 8.63 KB
/
device.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <rte_config.h>
#include <rte_ethdev.h>
#include <rte_mempool.h>
#include <rte_ether.h>
#include <rte_cycles.h>
#include <rte_mbuf.h>
#include <ixgbe_type.h>
// default number of descriptors per queue
// configure_device() falls back to these when it is called with rx_descs == 0 / tx_descs == 0
#define DEFAULT_RX_DESCS 512
#define DEFAULT_TX_DESCS 256
// values taken from the DPDK-L2FWD example, optimized for 82599 chips
// they fill the .pthresh/.hthresh/.wthresh fields of rx_thresh and tx_thresh in configure_device()
#define RX_PTHRESH 8
#define RX_HTHRESH 8
#define RX_WTHRESH 4
#define TX_PTHRESH 36
#define TX_HTHRESH 0
#define TX_WTHRESH 0
// NOTE(review): not referenced anywhere in the visible part of this file; registers[] is sized by RTE_MAX_ETHPORTS instead
#define MAX_ETH_DEVICES 64
// per-port base address of the NIC's register file, indexed by port id
// an entry is written by configure_device() and read by read_reg32(), write_reg32() and get_reg_addr()
static uint8_t* registers[RTE_MAX_ETHPORTS];
// Configures and starts an ethernet port and remembers the address of its register file.
//
// port:       port id, must be in the range 0 .. RTE_MAX_ETHPORTS - 1
// rx_queues:  number of rx queues to set up
// tx_queues:  number of tx queues to set up
// rx_descs:   descriptors per rx queue, 0 selects DEFAULT_RX_DESCS
// tx_descs:   descriptors per tx queue, 0 selects DEFAULT_TX_DESCS
// mempool:    mempool handed to every rx queue
//
// Returns -1 for an invalid port id, otherwise the return code of the first DPDK call
// that failed, or the return code of rte_eth_dev_start().
int configure_device(int port, int rx_queues, int tx_queues, int rx_descs, int tx_descs, struct rte_mempool* mempool) {
    // registers[] has RTE_MAX_ETHPORTS entries, so port == RTE_MAX_ETHPORTS is already out of range
    if (port < 0 || port >= RTE_MAX_ETHPORTS) {
        printf("error: Maximum number of supported ports is %d\n This can be changed with the DPDK compile-time configuration variable RTE_MAX_ETHPORTS\n", RTE_MAX_ETHPORTS);
        return -1;
    }
    // TODO: enable FDIR perfect filter to forward specific streams to cores
    struct rte_eth_conf port_conf = {
        .rxmode = {
            .split_hdr_size = 0,
            .header_split = 0,
            .hw_ip_checksum = 1,
            .hw_vlan_filter = 0,
            .jumbo_frame = 0,
            .hw_strip_crc = 1,
        },
        .txmode = {
            .mq_mode = ETH_MQ_TX_NONE,
        },
    };
    int rc = rte_eth_dev_configure(port, rx_queues, tx_queues, &port_conf);
    if (rc) {
        return rc;
    }
    // DPDK documentation suggests that the tx queues should be set up before the rx queues
    struct rte_eth_txconf tx_conf = {
        // TODO: this should use different values for older GbE NICs
        .tx_thresh = {
            .pthresh = TX_PTHRESH,
            .hthresh = TX_HTHRESH,
            .wthresh = TX_WTHRESH,
        },
        .tx_free_thresh = 0, // 0 = default
        .tx_rs_thresh = 0, // 0 = default
        .txq_flags = ETH_TXQ_FLAGS_NOMULTSEGS,
    };
    for (int i = 0; i < tx_queues; i++) {
        // TODO: get socket id for the NIC
        rc = rte_eth_tx_queue_setup(port, i, tx_descs ? tx_descs : DEFAULT_TX_DESCS, SOCKET_ID_ANY, &tx_conf);
        if (rc) {
            printf("could not configure tx queue %d\n", i);
            return rc;
        }
    }
    struct rte_eth_rxconf rx_conf = {
        .rx_drop_en = 0,
        .rx_thresh = {
            .pthresh = RX_PTHRESH,
            .hthresh = RX_HTHRESH,
            .wthresh = RX_WTHRESH,
        },
    };
    for (int i = 0; i < rx_queues; i++) {
        // TODO: get socket id for the NIC
        rc = rte_eth_rx_queue_setup(port, i, rx_descs ? rx_descs : DEFAULT_RX_DESCS, SOCKET_ID_ANY, &rx_conf, mempool);
        if (rc) {
            printf("could not configure rx queue %d\n", i);
            return rc;
        }
    }
    rte_eth_promiscuous_enable(port);
    rc = rte_eth_dev_start(port);
    // save memory address of the register file
    // pci_dev is pre-set to NULL so that it has a defined value even if the info call does not fill it in
    struct rte_eth_dev_info dev_info = { .pci_dev = NULL };
    rte_eth_dev_info_get(port, &dev_info);
    if (dev_info.pci_dev) {
        registers[port] = (uint8_t*) dev_info.pci_dev->mem_resource[0].addr;
    } else {
        // no PCI device behind this port (presumably a virtual PMD such as ring or bond): no register file
        registers[port] = NULL;
    }
    return rc;
}
// Reads a 32 bit value from the register file of a port.
// port: index into registers[], the entry must have been set by configure_device()
// reg:  byte offset of the register inside the register file
// The access goes through a volatile pointer; port is not bounds-checked here.
uint32_t read_reg32(uint8_t port, uint32_t reg) {
    uint8_t* base = registers[port];
    volatile uint32_t* reg_ptr = (volatile uint32_t*) (base + reg);
    return *reg_ptr;
}
// Writes a 32 bit value to the register file of a port.
// port: index into registers[], the entry must have been set by configure_device()
// reg:  byte offset of the register inside the register file
// val:  value to store
// The access goes through a volatile pointer; port is not bounds-checked here.
void write_reg32(uint8_t port, uint32_t reg, uint32_t val) {
    uint8_t* base = registers[port];
    volatile uint32_t* reg_ptr = (volatile uint32_t*) (base + reg);
    *reg_ptr = val;
}
// Returns a volatile pointer to the 32 bit register at byte offset reg in the register file of port.
// Callers can keep the pointer to repeat accesses without recomputing the address.
static inline volatile uint32_t* get_reg_addr(uint8_t port, uint32_t reg) {
    uint8_t* reg_bytes = registers[port] + reg;
    return (volatile uint32_t*) reg_bytes;
}
uint64_t get_mac_addr(int port, char* buf) {
struct ether_addr addr;
rte_eth_macaddr_get(port, &addr);
if (buf) {
sprintf(buf, "%02X:%02X:%02X:%02X:%02X:%02X", addr.addr_bytes[0], addr.addr_bytes[1], addr.addr_bytes[2], addr.addr_bytes[3], addr.addr_bytes[4], addr.addr_bytes[5]);
}
return addr.addr_bytes[0] | (addr.addr_bytes[1] << 8) | (addr.addr_bytes[2] << 16) | ((uint64_t) addr.addr_bytes[3] << 24) | ((uint64_t) addr.addr_bytes[4] << 32) | ((uint64_t) addr.addr_bytes[5] << 40);
}
uint32_t get_pci_id(uint8_t port) {
struct rte_eth_dev_info dev_info;
rte_eth_dev_info_get(port, &dev_info);
return dev_info.pci_dev->id.vendor_id << 16 | dev_info.pci_dev->id.device_id;
}
// Placeholder for looking up the socket a port is attached to.
// The port argument is currently ignored and the result is always 0.
uint8_t get_socket(uint8_t port) {
    // TODO: figure out how to get this information
    (void) port;
    const uint8_t socket_id = 0;
    return socket_id;
}
// qsort() comparator that orders int32_t clock offsets ascending
static int compare_clock_offsets(const void* e1, const void* e2) {
    int32_t offs1 = *(const int32_t*) e1;
    int32_t offs2 = *(const int32_t*) e2;
    return offs1 < offs2 ? -1 : offs1 > offs2 ? 1 : 0;
}

// Resets the SYSTIM registers of both ports and then adjusts the clock of port2
// by the median measured offset between the two clocks.
// If port1 == port2, the timer is only reset.
void sync_clocks(uint8_t port1, uint8_t port2) {
    // resetting SYSTIML twice prevents a race-condition when SYSTIML is just about to overflow into SYSTIMH
    write_reg32(port1, IXGBE_SYSTIML, 0);
    write_reg32(port2, IXGBE_SYSTIML, 0);
    write_reg32(port1, IXGBE_SYSTIMH, 0);
    write_reg32(port2, IXGBE_SYSTIMH, 0);
    if (port1 == port2) {
        // just reset timers if port1 == port2
        return;
    }
    // to avoid potential unnecessary overhead between the two accesses; especially if compiler optimizations are disabled for some reason
    // this is probably completely unnecessary on a modern OoO cpu
    volatile uint32_t* port1time = get_reg_addr(port1, IXGBE_SYSTIML);
    volatile uint32_t* port2time = get_reg_addr(port2, IXGBE_SYSTIML);
    // number of measurements; must be odd so that the median is a single element
    enum { NUM_RUNS = 7 };
    int32_t offsets[NUM_RUNS];
    *port1time = 0;
    *port2time = 0; // the clocks now differ by offs, the time for the write access which is calculated in the following loop
    for (int i = 0; i < NUM_RUNS; i++) {
        // the order of these four reads matters: port1, port2, port2, port1
        uint32_t x1 = *port1time;
        uint32_t x2 = *port2time;
        uint32_t y1 = *port2time;
        uint32_t y2 = *port1time;
        int32_t delta_t = abs(((int64_t) x1 - x2 - ((int64_t) y2 - y1)) / 2); // time between two reads
        int32_t offs = delta_t + x1 - x2;
        offsets[i] = offs;
        //printf("%d: delta_t: %d\toffs: %d\n", i, delta_t, offs);
    }
    // use the median offset
    qsort(offsets, NUM_RUNS, sizeof offsets[0], compare_clock_offsets);
    int32_t offs = offsets[NUM_RUNS / 2];
    if (offs) {
        // offs of 0 is not supported
        // the magnitude goes into TIMADJL and bit 31 of TIMADJH is set for a negative offset
        // the negation is done on the unsigned value so that INT32_MIN does not overflow
        uint32_t magnitude = offs < 0 ? 0u - (uint32_t) offs : (uint32_t) offs;
        uint32_t sign_bit = offs < 0 ? (uint32_t) 1 << 31 : 0;
        write_reg32(port2, IXGBE_TIMADJL, magnitude);
        write_reg32(port2, IXGBE_TIMADJH, sign_bit);
        // verification that the clocks are synced: the two clocks should only differ by a constant caused by the read operation
        // i.e. x2 - x1 = y2 - y1 iff clock1 == clock2
        /*uint32_t x1 = *port1time;
        uint32_t x2 = *port2time;
        uint32_t y1 = *port2time;
        uint32_t y2 = *port1time;
        printf("%d %d\n", x2 - x1, y2 - y1);*/
    }
}
// for calibration
// Returns the low 32 bits of (clock of port1 - clock of port2), where each clock is the
// 64 bit value formed from the SYSTIMH and SYSTIML registers of the port.
uint32_t get_clock_difference(uint8_t port1, uint8_t port2) {
    // TODO: this should take the delay between reading the two registers into account
    // however, this is not necessary for the current use case (measuring clock drift)
    // read order is kept as low1, low2, high1, high2
    uint32_t p1_low = read_reg32(port1, IXGBE_SYSTIML);
    uint32_t p2_low = read_reg32(port2, IXGBE_SYSTIML);
    uint32_t p1_high = read_reg32(port1, IXGBE_SYSTIMH);
    uint32_t p2_high = read_reg32(port2, IXGBE_SYSTIMH);
    // unsigned 64 bit arithmetic: shifting a high word with bit 31 set is well-defined
    // and the subtraction wraps instead of overflowing
    uint64_t p1_time = ((uint64_t) p1_high << 32) | p1_low;
    uint64_t p2_time = ((uint64_t) p2_high << 32) | p2_low;
    return (uint32_t) (p1_time - p2_time);
}
// Transmits all num_pkts packets of pkts on the given port and queue.
// rte_eth_tx_burst() is called repeatedly with the not yet sent rest of the array
// until the number of sent packets reaches num_pkts; there is no timeout.
void send_all_packets(uint8_t port_id, uint16_t queue_id, struct rte_mbuf** pkts, uint16_t num_pkts) {
    uint32_t done = 0;
    do {
        struct rte_mbuf** next = pkts + done;
        uint16_t remaining = num_pkts - done;
        done += rte_eth_tx_burst(port_id, queue_id, next, remaining);
    } while (done < num_pkts);
}
// registers all poll mode drivers
// this should be done on startup via a __attribute__((__constructor__)) function
// however, there seems to be a bug: the init functions don't seem to work if called in the wrong order (note that the order depends on the linker)
// calling devinitfn_bond_drv() last causes problems
// so we just call them here again in an order that actually works independent of the link order
void devinitfn_rte_vmxnet3_driver(void);
void devinitfn_rte_virtio_driver(void);
void devinitfn_pmd_ring_drv(void);
void devinitfn_rte_ixgbe_driver(void);
void devinitfn_rte_ixgbevf_driver(void);
void devinitfn_rte_i40evf_driver(void);
void devinitfn_rte_i40e_driver(void);
void devinitfn_pmd_igb_drv(void);
void devinitfn_pmd_igbvf_drv(void);
void devinitfn_em_pmd_drv(void);
void devinitfn_bond_drv(void);
void devinitfn_pmd_xenvirt_drv(void);
void devinitfn_pmd_pcap_drv(void);

// Calls the driver init functions in a fixed order, bond driver first.
// Do not reorder the calls, see the comment above the declarations.
void register_pmd_drivers(void) {
    devinitfn_bond_drv();
    devinitfn_rte_vmxnet3_driver();
    devinitfn_rte_virtio_driver();
    devinitfn_pmd_ring_drv();
    devinitfn_rte_ixgbevf_driver();
    devinitfn_rte_ixgbe_driver();
    devinitfn_rte_i40evf_driver();
    devinitfn_rte_i40e_driver();
    devinitfn_pmd_igb_drv();
    devinitfn_pmd_igbvf_drv();
    devinitfn_em_pmd_drv();
    // TODO: what's wrong with these two?
    //devinitfn_pmd_xenvirt_drv();
    //devinitfn_pmd_pcap_drv();
}
// the following functions are static inline functions in header files
// wrapping them is the easiest/least ugly way to make them available to luajit (#defining static before including the header breaks stuff)
// Exported wrapper around rte_eth_rx_burst().
// rx_pkts is passed on as the packet array and nb_pkts as the burst size;
// the return value of rte_eth_rx_burst() is returned unchanged.
uint16_t rte_eth_rx_burst_export(uint8_t port_id, uint16_t queue_id, void* rx_pkts, uint16_t nb_pkts) {
    const uint16_t received = rte_eth_rx_burst(port_id, queue_id, rx_pkts, nb_pkts);
    return received;
}
// Exported wrapper around rte_eth_tx_burst().
// tx_pkts is passed on as the packet array and nb_pkts as the burst size;
// the return value of rte_eth_tx_burst() is returned unchanged.
uint16_t rte_eth_tx_burst_export(uint8_t port_id, uint16_t queue_id, void* tx_pkts, uint16_t nb_pkts) {
    const uint16_t sent = rte_eth_tx_burst(port_id, queue_id, tx_pkts, nb_pkts);
    return sent;
}
// Exported wrapper around rte_pktmbuf_free(); m is handed over as the mbuf to free.
void rte_pktmbuf_free_export(void* m) {
    void* mbuf = m;
    rte_pktmbuf_free(mbuf);
}
// Exported wrapper around rte_delay_ms(); ms is passed through unchanged.
void rte_delay_ms_export(uint32_t ms) {
    const uint32_t delay = ms;
    rte_delay_ms(delay);
}
// Exported wrapper around rte_delay_us(); us is passed through unchanged.
void rte_delay_us_export(uint32_t us) {
    const uint32_t delay = us;
    rte_delay_us(delay);
}