forked from freebsd/freebsd-src
-
Notifications
You must be signed in to change notification settings - Fork 0
/
iflib.c
7101 lines (6313 loc) · 190 KB
/
iflib.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*-
* Copyright (c) 2014-2018, Matthew Macy <[email protected]>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Neither the name of Matthew Macy nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_acpi.h"
#include "opt_sched.h"
#include <sys/param.h>
#include <sys/types.h>
#include <sys/bus.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/module.h>
#include <sys/kobj.h>
#include <sys/rman.h>
#include <sys/sbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/taskqueue.h>
#include <sys/limits.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_private.h>
#include <net/if_types.h>
#include <net/if_media.h>
#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/mp_ring.h>
#include <net/debugnet.h>
#include <net/pfil.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp_lro.h>
#include <netinet/in_systm.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/ip_var.h>
#include <netinet6/ip6_var.h>
#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <dev/led/led.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_private.h>
#include <net/iflib.h>
#include "ifdi_if.h"
#ifdef PCI_IOV
#include <dev/pci/pci_iov.h>
#endif
#include <sys/bitstring.h>
/*
* enable accounting of every mbuf as it comes in to and goes out of
* iflib's software descriptor references
*/
#define MEMORY_LOGGING 0
/*
* Enable mbuf vectors for compressing long mbuf chains
*/
/*
* NB:
* - Prefetching in tx cleaning should perhaps be a tunable. The distance ahead
* we prefetch needs to be determined by the time spent in m_free vis a vis
* the cost of a prefetch. This will of course vary based on the workload:
* - NFLX's m_free path is dominated by vm-based M_EXT manipulation which
* is quite expensive, thus suggesting very little prefetch.
* - small packet forwarding which is just returning a single mbuf to
* UMA will typically be very fast vis a vis the cost of a memory
* access.
*/
/*
* File organization:
* - private structures
* - iflib private utility functions
* - ifnet functions
* - vlan registry and other exported functions
* - iflib public core functions
*
*
*/
static MALLOC_DEFINE(M_IFLIB, "iflib", "ifnet library");
#define IFLIB_RXEOF_MORE (1U << 0)
#define IFLIB_RXEOF_EMPTY (2U << 0)
struct iflib_txq;
typedef struct iflib_txq *iflib_txq_t;
struct iflib_rxq;
typedef struct iflib_rxq *iflib_rxq_t;
struct iflib_fl;
typedef struct iflib_fl *iflib_fl_t;
struct iflib_ctx;
static void iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid);
static void iflib_timer(void *arg);
static void iflib_tqg_detach(if_ctx_t ctx);
typedef struct iflib_filter_info {
driver_filter_t *ifi_filter;
void *ifi_filter_arg;
struct grouptask *ifi_task;
void *ifi_ctx;
} *iflib_filter_info_t;
struct iflib_ctx {
KOBJ_FIELDS;
/*
* Pointer to hardware driver's softc
*/
void *ifc_softc;
device_t ifc_dev;
if_t ifc_ifp;
cpuset_t ifc_cpus;
if_shared_ctx_t ifc_sctx;
struct if_softc_ctx ifc_softc_ctx;
struct sx ifc_ctx_sx;
struct mtx ifc_state_mtx;
iflib_txq_t ifc_txqs;
iflib_rxq_t ifc_rxqs;
uint32_t ifc_if_flags;
uint32_t ifc_flags;
uint32_t ifc_max_fl_buf_size;
uint32_t ifc_rx_mbuf_sz;
int ifc_link_state;
int ifc_watchdog_events;
struct cdev *ifc_led_dev;
struct resource *ifc_msix_mem;
struct if_irq ifc_legacy_irq;
struct grouptask ifc_admin_task;
struct grouptask ifc_vflr_task;
struct iflib_filter_info ifc_filter_info;
struct ifmedia ifc_media;
struct ifmedia *ifc_mediap;
struct sysctl_oid *ifc_sysctl_node;
uint16_t ifc_sysctl_ntxqs;
uint16_t ifc_sysctl_nrxqs;
uint16_t ifc_sysctl_qs_eq_override;
uint16_t ifc_sysctl_rx_budget;
uint16_t ifc_sysctl_tx_abdicate;
uint16_t ifc_sysctl_core_offset;
#define CORE_OFFSET_UNSPECIFIED 0xffff
uint8_t ifc_sysctl_separate_txrx;
uint8_t ifc_sysctl_use_logical_cores;
bool ifc_cpus_are_physical_cores;
qidx_t ifc_sysctl_ntxds[8];
qidx_t ifc_sysctl_nrxds[8];
struct if_txrx ifc_txrx;
#define isc_txd_encap ifc_txrx.ift_txd_encap
#define isc_txd_flush ifc_txrx.ift_txd_flush
#define isc_txd_credits_update ifc_txrx.ift_txd_credits_update
#define isc_rxd_available ifc_txrx.ift_rxd_available
#define isc_rxd_pkt_get ifc_txrx.ift_rxd_pkt_get
#define isc_rxd_refill ifc_txrx.ift_rxd_refill
#define isc_rxd_flush ifc_txrx.ift_rxd_flush
#define isc_legacy_intr ifc_txrx.ift_legacy_intr
#define isc_txq_select ifc_txrx.ift_txq_select
#define isc_txq_select_v2 ifc_txrx.ift_txq_select_v2
eventhandler_tag ifc_vlan_attach_event;
eventhandler_tag ifc_vlan_detach_event;
struct ether_addr ifc_mac;
};
void *
iflib_get_softc(if_ctx_t ctx)
{
return (ctx->ifc_softc);
}
device_t
iflib_get_dev(if_ctx_t ctx)
{
return (ctx->ifc_dev);
}
if_t
iflib_get_ifp(if_ctx_t ctx)
{
return (ctx->ifc_ifp);
}
struct ifmedia *
iflib_get_media(if_ctx_t ctx)
{
return (ctx->ifc_mediap);
}
void
iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN])
{
bcopy(mac, ctx->ifc_mac.octet, ETHER_ADDR_LEN);
}
if_softc_ctx_t
iflib_get_softc_ctx(if_ctx_t ctx)
{
return (&ctx->ifc_softc_ctx);
}
if_shared_ctx_t
iflib_get_sctx(if_ctx_t ctx)
{
return (ctx->ifc_sctx);
}
#define IP_ALIGNED(m) ((((uintptr_t)(m)->m_data) & 0x3) == 0x2)
#define CACHE_PTR_INCREMENT (CACHE_LINE_SIZE/sizeof(void*))
#define CACHE_PTR_NEXT(ptr) ((void *)(((uintptr_t)(ptr)+CACHE_LINE_SIZE-1) & (CACHE_LINE_SIZE-1)))
#define LINK_ACTIVE(ctx) ((ctx)->ifc_link_state == LINK_STATE_UP)
#define CTX_IS_VF(ctx) ((ctx)->ifc_sctx->isc_flags & IFLIB_IS_VF)
typedef struct iflib_sw_rx_desc_array {
bus_dmamap_t *ifsd_map; /* bus_dma maps for packet */
struct mbuf **ifsd_m; /* pkthdr mbufs */
caddr_t *ifsd_cl; /* direct cluster pointer for rx */
bus_addr_t *ifsd_ba; /* bus addr of cluster for rx */
} iflib_rxsd_array_t;
typedef struct iflib_sw_tx_desc_array {
bus_dmamap_t *ifsd_map; /* bus_dma maps for packet */
bus_dmamap_t *ifsd_tso_map; /* bus_dma maps for TSO packet */
struct mbuf **ifsd_m; /* pkthdr mbufs */
} if_txsd_vec_t;
/* magic number that should be high enough for any hardware */
#define IFLIB_MAX_TX_SEGS 128
#define IFLIB_RX_COPY_THRESH 128
#define IFLIB_MAX_RX_REFRESH 32
/* The minimum descriptors per second before we start coalescing */
#define IFLIB_MIN_DESC_SEC 16384
#define IFLIB_DEFAULT_TX_UPDATE_FREQ 16
#define IFLIB_QUEUE_IDLE 0
#define IFLIB_QUEUE_HUNG 1
#define IFLIB_QUEUE_WORKING 2
/* maximum number of txqs that can share an rx interrupt */
#define IFLIB_MAX_TX_SHARED_INTR 4
/* this should really scale with ring size - this is a fairly arbitrary value */
#define TX_BATCH_SIZE 32
#define IFLIB_RESTART_BUDGET 8
#define IFC_LEGACY 0x001
#define IFC_QFLUSH 0x002
#define IFC_MULTISEG 0x004
#define IFC_SPARE1 0x008
#define IFC_SC_ALLOCATED 0x010
#define IFC_INIT_DONE 0x020
#define IFC_PREFETCH 0x040
#define IFC_DO_RESET 0x080
#define IFC_DO_WATCHDOG 0x100
#define IFC_SPARE0 0x200
#define IFC_SPARE2 0x400
#define IFC_IN_DETACH 0x800
#define IFC_NETMAP_TX_IRQ 0x80000000
#define CSUM_OFFLOAD (CSUM_IP_TSO|CSUM_IP6_TSO|CSUM_IP| \
CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP| \
CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP)
struct iflib_txq {
qidx_t ift_in_use;
qidx_t ift_cidx;
qidx_t ift_cidx_processed;
qidx_t ift_pidx;
uint8_t ift_gen;
uint8_t ift_br_offset;
uint16_t ift_npending;
uint16_t ift_db_pending;
uint16_t ift_rs_pending;
/* implicit pad */
uint8_t ift_txd_size[8];
uint64_t ift_processed;
uint64_t ift_cleaned;
uint64_t ift_cleaned_prev;
#if MEMORY_LOGGING
uint64_t ift_enqueued;
uint64_t ift_dequeued;
#endif
uint64_t ift_no_tx_dma_setup;
uint64_t ift_no_desc_avail;
uint64_t ift_mbuf_defrag_failed;
uint64_t ift_mbuf_defrag;
uint64_t ift_map_failed;
uint64_t ift_txd_encap_efbig;
uint64_t ift_pullups;
uint64_t ift_last_timer_tick;
struct mtx ift_mtx;
struct mtx ift_db_mtx;
/* constant values */
if_ctx_t ift_ctx;
struct ifmp_ring *ift_br;
struct grouptask ift_task;
qidx_t ift_size;
uint16_t ift_id;
struct callout ift_timer;
#ifdef DEV_NETMAP
struct callout ift_netmap_timer;
#endif /* DEV_NETMAP */
if_txsd_vec_t ift_sds;
uint8_t ift_qstatus;
uint8_t ift_closed;
uint8_t ift_update_freq;
struct iflib_filter_info ift_filter_info;
bus_dma_tag_t ift_buf_tag;
bus_dma_tag_t ift_tso_buf_tag;
iflib_dma_info_t ift_ifdi;
#define MTX_NAME_LEN 32
char ift_mtx_name[MTX_NAME_LEN];
bus_dma_segment_t ift_segs[IFLIB_MAX_TX_SEGS] __aligned(CACHE_LINE_SIZE);
#ifdef IFLIB_DIAGNOSTICS
uint64_t ift_cpu_exec_count[256];
#endif
} __aligned(CACHE_LINE_SIZE);
struct iflib_fl {
qidx_t ifl_cidx;
qidx_t ifl_pidx;
qidx_t ifl_credits;
uint8_t ifl_gen;
uint8_t ifl_rxd_size;
#if MEMORY_LOGGING
uint64_t ifl_m_enqueued;
uint64_t ifl_m_dequeued;
uint64_t ifl_cl_enqueued;
uint64_t ifl_cl_dequeued;
#endif
/* implicit pad */
bitstr_t *ifl_rx_bitmap;
qidx_t ifl_fragidx;
/* constant */
qidx_t ifl_size;
uint16_t ifl_buf_size;
uint16_t ifl_cltype;
uma_zone_t ifl_zone;
iflib_rxsd_array_t ifl_sds;
iflib_rxq_t ifl_rxq;
uint8_t ifl_id;
bus_dma_tag_t ifl_buf_tag;
iflib_dma_info_t ifl_ifdi;
uint64_t ifl_bus_addrs[IFLIB_MAX_RX_REFRESH] __aligned(CACHE_LINE_SIZE);
qidx_t ifl_rxd_idxs[IFLIB_MAX_RX_REFRESH];
} __aligned(CACHE_LINE_SIZE);
static inline qidx_t
get_inuse(int size, qidx_t cidx, qidx_t pidx, uint8_t gen)
{
qidx_t used;
if (pidx > cidx)
used = pidx - cidx;
else if (pidx < cidx)
used = size - cidx + pidx;
else if (gen == 0 && pidx == cidx)
used = 0;
else if (gen == 1 && pidx == cidx)
used = size;
else
panic("bad state");
return (used);
}
#define TXQ_AVAIL(txq) (txq->ift_size - get_inuse(txq->ift_size, txq->ift_cidx, txq->ift_pidx, txq->ift_gen))
#define IDXDIFF(head, tail, wrap) \
((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head))
struct iflib_rxq {
if_ctx_t ifr_ctx;
iflib_fl_t ifr_fl;
uint64_t ifr_rx_irq;
struct pfil_head *pfil;
/*
* If there is a separate completion queue (IFLIB_HAS_RXCQ), this is
* the completion queue consumer index. Otherwise it's unused.
*/
qidx_t ifr_cq_cidx;
uint16_t ifr_id;
uint8_t ifr_nfl;
uint8_t ifr_ntxqirq;
uint8_t ifr_txqid[IFLIB_MAX_TX_SHARED_INTR];
uint8_t ifr_fl_offset;
struct lro_ctrl ifr_lc;
struct grouptask ifr_task;
struct callout ifr_watchdog;
struct iflib_filter_info ifr_filter_info;
iflib_dma_info_t ifr_ifdi;
/* dynamically allocate if any drivers need a value substantially larger than this */
struct if_rxd_frag ifr_frags[IFLIB_MAX_RX_SEGS] __aligned(CACHE_LINE_SIZE);
#ifdef IFLIB_DIAGNOSTICS
uint64_t ifr_cpu_exec_count[256];
#endif
} __aligned(CACHE_LINE_SIZE);
typedef struct if_rxsd {
caddr_t *ifsd_cl;
iflib_fl_t ifsd_fl;
} *if_rxsd_t;
/* multiple of word size */
#ifdef __LP64__
#define PKT_INFO_SIZE 6
#define RXD_INFO_SIZE 5
#define PKT_TYPE uint64_t
#else
#define PKT_INFO_SIZE 11
#define RXD_INFO_SIZE 8
#define PKT_TYPE uint32_t
#endif
#define PKT_LOOP_BOUND ((PKT_INFO_SIZE/3)*3)
#define RXD_LOOP_BOUND ((RXD_INFO_SIZE/4)*4)
typedef struct if_pkt_info_pad {
PKT_TYPE pkt_val[PKT_INFO_SIZE];
} *if_pkt_info_pad_t;
typedef struct if_rxd_info_pad {
PKT_TYPE rxd_val[RXD_INFO_SIZE];
} *if_rxd_info_pad_t;
CTASSERT(sizeof(struct if_pkt_info_pad) == sizeof(struct if_pkt_info));
CTASSERT(sizeof(struct if_rxd_info_pad) == sizeof(struct if_rxd_info));
static inline void
pkt_info_zero(if_pkt_info_t pi)
{
if_pkt_info_pad_t pi_pad;
pi_pad = (if_pkt_info_pad_t)pi;
pi_pad->pkt_val[0] = 0; pi_pad->pkt_val[1] = 0; pi_pad->pkt_val[2] = 0;
pi_pad->pkt_val[3] = 0; pi_pad->pkt_val[4] = 0; pi_pad->pkt_val[5] = 0;
#ifndef __LP64__
pi_pad->pkt_val[6] = 0; pi_pad->pkt_val[7] = 0; pi_pad->pkt_val[8] = 0;
pi_pad->pkt_val[9] = 0; pi_pad->pkt_val[10] = 0;
#endif
}
static inline void
rxd_info_zero(if_rxd_info_t ri)
{
if_rxd_info_pad_t ri_pad;
int i;
ri_pad = (if_rxd_info_pad_t)ri;
for (i = 0; i < RXD_LOOP_BOUND; i += 4) {
ri_pad->rxd_val[i] = 0;
ri_pad->rxd_val[i+1] = 0;
ri_pad->rxd_val[i+2] = 0;
ri_pad->rxd_val[i+3] = 0;
}
#ifdef __LP64__
ri_pad->rxd_val[RXD_INFO_SIZE-1] = 0;
#endif
}
/*
* Only allow a single packet to take up most 1/nth of the tx ring
*/
#define MAX_SINGLE_PACKET_FRACTION 12
#define IF_BAD_DMA (bus_addr_t)-1
#define CTX_ACTIVE(ctx) ((if_getdrvflags((ctx)->ifc_ifp) & IFF_DRV_RUNNING))
#define CTX_LOCK_INIT(_sc) sx_init(&(_sc)->ifc_ctx_sx, "iflib ctx lock")
#define CTX_LOCK(ctx) sx_xlock(&(ctx)->ifc_ctx_sx)
#define CTX_UNLOCK(ctx) sx_xunlock(&(ctx)->ifc_ctx_sx)
#define CTX_LOCK_DESTROY(ctx) sx_destroy(&(ctx)->ifc_ctx_sx)
#define STATE_LOCK_INIT(_sc, _name) mtx_init(&(_sc)->ifc_state_mtx, _name, "iflib state lock", MTX_DEF)
#define STATE_LOCK(ctx) mtx_lock(&(ctx)->ifc_state_mtx)
#define STATE_UNLOCK(ctx) mtx_unlock(&(ctx)->ifc_state_mtx)
#define STATE_LOCK_DESTROY(ctx) mtx_destroy(&(ctx)->ifc_state_mtx)
#define CALLOUT_LOCK(txq) mtx_lock(&txq->ift_mtx)
#define CALLOUT_UNLOCK(txq) mtx_unlock(&txq->ift_mtx)
/* Our boot-time initialization hook */
static int iflib_module_event_handler(module_t, int, void *);
static moduledata_t iflib_moduledata = {
"iflib",
iflib_module_event_handler,
NULL
};
DECLARE_MODULE(iflib, iflib_moduledata, SI_SUB_INIT_IF, SI_ORDER_ANY);
MODULE_VERSION(iflib, 1);
MODULE_DEPEND(iflib, pci, 1, 1, 1);
MODULE_DEPEND(iflib, ether, 1, 1, 1);
TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1);
TASKQGROUP_DEFINE(if_config_tqg, 1, 1);
#ifndef IFLIB_DEBUG_COUNTERS
#ifdef INVARIANTS
#define IFLIB_DEBUG_COUNTERS 1
#else
#define IFLIB_DEBUG_COUNTERS 0
#endif /* !INVARIANTS */
#endif
static SYSCTL_NODE(_net, OID_AUTO, iflib, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"iflib driver parameters");
/*
* XXX need to ensure that this can't accidentally cause the head to be moved backwards
*/
static int iflib_min_tx_latency = 0;
SYSCTL_INT(_net_iflib, OID_AUTO, min_tx_latency, CTLFLAG_RW,
&iflib_min_tx_latency, 0, "minimize transmit latency at the possible expense of throughput");
static int iflib_no_tx_batch = 0;
SYSCTL_INT(_net_iflib, OID_AUTO, no_tx_batch, CTLFLAG_RW,
&iflib_no_tx_batch, 0, "minimize transmit latency at the possible expense of throughput");
static int iflib_timer_default = 1000;
SYSCTL_INT(_net_iflib, OID_AUTO, timer_default, CTLFLAG_RW,
&iflib_timer_default, 0, "number of ticks between iflib_timer calls");
#if IFLIB_DEBUG_COUNTERS
static int iflib_tx_seen;
static int iflib_tx_sent;
static int iflib_tx_encap;
static int iflib_rx_allocs;
static int iflib_fl_refills;
static int iflib_fl_refills_large;
static int iflib_tx_frees;
SYSCTL_INT(_net_iflib, OID_AUTO, tx_seen, CTLFLAG_RD,
&iflib_tx_seen, 0, "# TX mbufs seen");
SYSCTL_INT(_net_iflib, OID_AUTO, tx_sent, CTLFLAG_RD,
&iflib_tx_sent, 0, "# TX mbufs sent");
SYSCTL_INT(_net_iflib, OID_AUTO, tx_encap, CTLFLAG_RD,
&iflib_tx_encap, 0, "# TX mbufs encapped");
SYSCTL_INT(_net_iflib, OID_AUTO, tx_frees, CTLFLAG_RD,
&iflib_tx_frees, 0, "# TX frees");
SYSCTL_INT(_net_iflib, OID_AUTO, rx_allocs, CTLFLAG_RD,
&iflib_rx_allocs, 0, "# RX allocations");
SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills, CTLFLAG_RD,
&iflib_fl_refills, 0, "# refills");
SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills_large, CTLFLAG_RD,
&iflib_fl_refills_large, 0, "# large refills");
static int iflib_txq_drain_flushing;
static int iflib_txq_drain_oactive;
static int iflib_txq_drain_notready;
SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_flushing, CTLFLAG_RD,
&iflib_txq_drain_flushing, 0, "# drain flushes");
SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_oactive, CTLFLAG_RD,
&iflib_txq_drain_oactive, 0, "# drain oactives");
SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_notready, CTLFLAG_RD,
&iflib_txq_drain_notready, 0, "# drain notready");
static int iflib_encap_load_mbuf_fail;
static int iflib_encap_pad_mbuf_fail;
static int iflib_encap_txq_avail_fail;
static int iflib_encap_txd_encap_fail;
SYSCTL_INT(_net_iflib, OID_AUTO, encap_load_mbuf_fail, CTLFLAG_RD,
&iflib_encap_load_mbuf_fail, 0, "# busdma load failures");
SYSCTL_INT(_net_iflib, OID_AUTO, encap_pad_mbuf_fail, CTLFLAG_RD,
&iflib_encap_pad_mbuf_fail, 0, "# runt frame pad failures");
SYSCTL_INT(_net_iflib, OID_AUTO, encap_txq_avail_fail, CTLFLAG_RD,
&iflib_encap_txq_avail_fail, 0, "# txq avail failures");
SYSCTL_INT(_net_iflib, OID_AUTO, encap_txd_encap_fail, CTLFLAG_RD,
&iflib_encap_txd_encap_fail, 0, "# driver encap failures");
static int iflib_task_fn_rxs;
static int iflib_rx_intr_enables;
static int iflib_fast_intrs;
static int iflib_rx_unavail;
static int iflib_rx_ctx_inactive;
static int iflib_rx_if_input;
static int iflib_rxd_flush;
static int iflib_verbose_debug;
SYSCTL_INT(_net_iflib, OID_AUTO, task_fn_rx, CTLFLAG_RD,
&iflib_task_fn_rxs, 0, "# task_fn_rx calls");
SYSCTL_INT(_net_iflib, OID_AUTO, rx_intr_enables, CTLFLAG_RD,
&iflib_rx_intr_enables, 0, "# RX intr enables");
SYSCTL_INT(_net_iflib, OID_AUTO, fast_intrs, CTLFLAG_RD,
&iflib_fast_intrs, 0, "# fast_intr calls");
SYSCTL_INT(_net_iflib, OID_AUTO, rx_unavail, CTLFLAG_RD,
&iflib_rx_unavail, 0, "# times rxeof called with no available data");
SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLFLAG_RD,
&iflib_rx_ctx_inactive, 0, "# times rxeof called with inactive context");
SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD,
&iflib_rx_if_input, 0, "# times rxeof called if_input");
SYSCTL_INT(_net_iflib, OID_AUTO, rxd_flush, CTLFLAG_RD,
&iflib_rxd_flush, 0, "# times rxd_flush called");
SYSCTL_INT(_net_iflib, OID_AUTO, verbose_debug, CTLFLAG_RW,
&iflib_verbose_debug, 0, "enable verbose debugging");
#define DBG_COUNTER_INC(name) atomic_add_int(&(iflib_ ## name), 1)
static void
iflib_debug_reset(void)
{
iflib_tx_seen = iflib_tx_sent = iflib_tx_encap = iflib_rx_allocs =
iflib_fl_refills = iflib_fl_refills_large = iflib_tx_frees =
iflib_txq_drain_flushing = iflib_txq_drain_oactive =
iflib_txq_drain_notready =
iflib_encap_load_mbuf_fail = iflib_encap_pad_mbuf_fail =
iflib_encap_txq_avail_fail = iflib_encap_txd_encap_fail =
iflib_task_fn_rxs = iflib_rx_intr_enables = iflib_fast_intrs =
iflib_rx_unavail =
iflib_rx_ctx_inactive = iflib_rx_if_input =
iflib_rxd_flush = 0;
}
#else
#define DBG_COUNTER_INC(name)
static void iflib_debug_reset(void) {}
#endif
#define IFLIB_DEBUG 0
static void iflib_tx_structures_free(if_ctx_t ctx);
static void iflib_rx_structures_free(if_ctx_t ctx);
static int iflib_queues_alloc(if_ctx_t ctx);
static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq);
static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget);
static int iflib_qset_structures_setup(if_ctx_t ctx);
static int iflib_msix_init(if_ctx_t ctx);
static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filterarg, int *rid, const char *str);
static void iflib_txq_check_drain(iflib_txq_t txq, int budget);
static uint32_t iflib_txq_can_drain(struct ifmp_ring *);
#ifdef ALTQ
static void iflib_altq_if_start(if_t ifp);
static int iflib_altq_if_transmit(if_t ifp, struct mbuf *m);
#endif
static int iflib_register(if_ctx_t);
static void iflib_deregister(if_ctx_t);
static void iflib_unregister_vlan_handlers(if_ctx_t ctx);
static uint16_t iflib_get_mbuf_size_for(unsigned int size);
static void iflib_init_locked(if_ctx_t ctx);
static void iflib_add_device_sysctl_pre(if_ctx_t ctx);
static void iflib_add_device_sysctl_post(if_ctx_t ctx);
static void iflib_ifmp_purge(iflib_txq_t txq);
static void _iflib_pre_assert(if_softc_ctx_t scctx);
static void iflib_stop(if_ctx_t ctx);
static void iflib_if_init_locked(if_ctx_t ctx);
static void iflib_free_intr_mem(if_ctx_t ctx);
#ifndef __NO_STRICT_ALIGNMENT
static struct mbuf * iflib_fixup_rx(struct mbuf *m);
#endif
static SLIST_HEAD(cpu_offset_list, cpu_offset) cpu_offsets =
SLIST_HEAD_INITIALIZER(cpu_offsets);
struct cpu_offset {
SLIST_ENTRY(cpu_offset) entries;
cpuset_t set;
unsigned int refcount;
uint16_t next_cpuid;
};
static struct mtx cpu_offset_mtx;
MTX_SYSINIT(iflib_cpu_offset, &cpu_offset_mtx, "iflib_cpu_offset lock",
MTX_DEF);
DEBUGNET_DEFINE(iflib);
static int
iflib_num_rx_descs(if_ctx_t ctx)
{
if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
if_shared_ctx_t sctx = ctx->ifc_sctx;
uint16_t first_rxq = (sctx->isc_flags & IFLIB_HAS_RXCQ) ? 1 : 0;
return scctx->isc_nrxd[first_rxq];
}
static int
iflib_num_tx_descs(if_ctx_t ctx)
{
if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
if_shared_ctx_t sctx = ctx->ifc_sctx;
uint16_t first_txq = (sctx->isc_flags & IFLIB_HAS_TXCQ) ? 1 : 0;
return scctx->isc_ntxd[first_txq];
}
#ifdef DEV_NETMAP
#include <sys/selinfo.h>
#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
MODULE_DEPEND(iflib, netmap, 1, 1, 1);
static int netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, bool init);
static void iflib_netmap_timer(void *arg);
/*
* device-specific sysctl variables:
*
* iflib_crcstrip: 0: keep CRC in rx frames (default), 1: strip it.
* During regular operations the CRC is stripped, but on some
* hardware reception of frames not multiple of 64 is slower,
* so using crcstrip=0 helps in benchmarks.
*
* iflib_rx_miss, iflib_rx_miss_bufs:
* count packets that might be missed due to lost interrupts.
*/
SYSCTL_DECL(_dev_netmap);
/*
* The xl driver by default strips CRCs and we do not override it.
*/
int iflib_crcstrip = 1;
SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_crcstrip,
CTLFLAG_RW, &iflib_crcstrip, 1, "strip CRC on RX frames");
int iflib_rx_miss, iflib_rx_miss_bufs;
SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss,
CTLFLAG_RW, &iflib_rx_miss, 0, "potentially missed RX intr");
SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss_bufs,
CTLFLAG_RW, &iflib_rx_miss_bufs, 0, "potentially missed RX intr bufs");
/*
* Register/unregister. We are already under netmap lock.
* Only called on the first register or the last unregister.
*/
static int
iflib_netmap_register(struct netmap_adapter *na, int onoff)
{
if_t ifp = na->ifp;
if_ctx_t ctx = if_getsoftc(ifp);
int status;
CTX_LOCK(ctx);
if (!CTX_IS_VF(ctx))
IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip);
iflib_stop(ctx);
/*
* Enable (or disable) netmap flags, and intercept (or restore)
* ifp->if_transmit. This is done once the device has been stopped
* to prevent race conditions. Also, this must be done after
* calling netmap_disable_all_rings() and before calling
* netmap_enable_all_rings(), so that these two functions see the
* updated state of the NAF_NETMAP_ON bit.
*/
if (onoff) {
nm_set_native_flags(na);
} else {
nm_clear_native_flags(na);
}
iflib_init_locked(ctx);
IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip); // XXX why twice ?
status = if_getdrvflags(ifp) & IFF_DRV_RUNNING ? 0 : 1;
if (status)
nm_clear_native_flags(na);
CTX_UNLOCK(ctx);
return (status);
}
static int
iflib_netmap_config(struct netmap_adapter *na, struct nm_config_info *info)
{
if_t ifp = na->ifp;
if_ctx_t ctx = if_getsoftc(ifp);
iflib_rxq_t rxq = &ctx->ifc_rxqs[0];
iflib_fl_t fl = &rxq->ifr_fl[0];
info->num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets;
info->num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets;
info->num_tx_descs = iflib_num_tx_descs(ctx);
info->num_rx_descs = iflib_num_rx_descs(ctx);
info->rx_buf_maxsize = fl->ifl_buf_size;
nm_prinf("txr %u rxr %u txd %u rxd %u rbufsz %u",
info->num_tx_rings, info->num_rx_rings, info->num_tx_descs,
info->num_rx_descs, info->rx_buf_maxsize);
return 0;
}
static int
netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, bool init)
{
struct netmap_adapter *na = kring->na;
u_int const lim = kring->nkr_num_slots - 1;
struct netmap_ring *ring = kring->ring;
bus_dmamap_t *map;
struct if_rxd_update iru;
if_ctx_t ctx = rxq->ifr_ctx;
iflib_fl_t fl = &rxq->ifr_fl[0];
u_int nic_i_first, nic_i;
u_int nm_i;
int i, n;
#if IFLIB_DEBUG_COUNTERS
int rf_count = 0;
#endif
/*
* This function is used both at initialization and in rxsync.
* At initialization we need to prepare (with isc_rxd_refill())
* all the netmap buffers currently owned by the kernel, in
* such a way to keep fl->ifl_pidx and kring->nr_hwcur in sync
* (except for kring->nkr_hwofs). These may be less than
* kring->nkr_num_slots if netmap_reset() was called while
* an application using the kring that still owned some
* buffers.
* At rxsync time, both indexes point to the next buffer to be
* refilled.
* In any case we publish (with isc_rxd_flush()) up to
* (fl->ifl_pidx - 1) % N (included), to avoid the NIC tail/prod
* pointer to overrun the head/cons pointer, although this is
* not necessary for some NICs (e.g. vmx).
*/
if (__predict_false(init)) {
n = kring->nkr_num_slots - nm_kr_rxspace(kring);
} else {
n = kring->rhead - kring->nr_hwcur;
if (n == 0)
return (0); /* Nothing to do. */
if (n < 0)
n += kring->nkr_num_slots;
}
iru_init(&iru, rxq, 0 /* flid */);
map = fl->ifl_sds.ifsd_map;
nic_i = fl->ifl_pidx;
nm_i = netmap_idx_n2k(kring, nic_i);
if (__predict_false(init)) {
/*
* On init/reset, nic_i must be 0, and we must
* start to refill from hwtail (see netmap_reset()).
*/
MPASS(nic_i == 0);
MPASS(nm_i == kring->nr_hwtail);
} else
MPASS(nm_i == kring->nr_hwcur);
DBG_COUNTER_INC(fl_refills);
while (n > 0) {
#if IFLIB_DEBUG_COUNTERS
if (++rf_count == 9)
DBG_COUNTER_INC(fl_refills_large);
#endif
nic_i_first = nic_i;
for (i = 0; n > 0 && i < IFLIB_MAX_RX_REFRESH; n--, i++) {
struct netmap_slot *slot = &ring->slot[nm_i];
uint64_t paddr;
void *addr = PNMB(na, slot, &paddr);
MPASS(i < IFLIB_MAX_RX_REFRESH);
if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
return netmap_ring_reinit(kring);
fl->ifl_bus_addrs[i] = paddr +
nm_get_offset(kring, slot);
fl->ifl_rxd_idxs[i] = nic_i;
if (__predict_false(init)) {
netmap_load_map(na, fl->ifl_buf_tag,
map[nic_i], addr);
} else if (slot->flags & NS_BUF_CHANGED) {
/* buffer has changed, reload map */
netmap_reload_map(na, fl->ifl_buf_tag,
map[nic_i], addr);
}
bus_dmamap_sync(fl->ifl_buf_tag, map[nic_i],
BUS_DMASYNC_PREREAD);
slot->flags &= ~NS_BUF_CHANGED;
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
iru.iru_pidx = nic_i_first;
iru.iru_count = i;
ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
}
fl->ifl_pidx = nic_i;
/*
* At the end of the loop we must have refilled everything
* we could possibly refill.
*/
MPASS(nm_i == kring->rhead);
kring->nr_hwcur = nm_i;
bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id,
nm_prev(nic_i, lim));
DBG_COUNTER_INC(rxd_flush);
return (0);
}
#define NETMAP_TX_TIMER_US 90
/*
* Reconcile kernel and user view of the transmit ring.
*
* All information is in the kring.
* Userspace wants to send packets up to the one before kring->rhead,
* kernel knows kring->nr_hwcur is the first unsent packet.
*
* Here we push packets out (as many as possible), and possibly
* reclaim buffers from previously completed transmission.
*
* The caller (netmap) guarantees that there is only one instance
* running at any time. Any interference with other driver
* methods should be handled by the individual drivers.
*/
static int
iflib_netmap_txsync(struct netmap_kring *kring, int flags)
{
struct netmap_adapter *na = kring->na;
if_t ifp = na->ifp;
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap kring */
u_int nic_i; /* index into the NIC ring */
u_int const lim = kring->nkr_num_slots - 1;
u_int const head = kring->rhead;
struct if_pkt_info pi;
int tx_pkts = 0, tx_bytes = 0;
/*
* interrupts on every tx packet are expensive so request
* them every half ring, or where NS_REPORT is set
*/
u_int report_frequency = kring->nkr_num_slots >> 1;
/* device-specific */
if_ctx_t ctx = if_getsoftc(ifp);
iflib_txq_t txq = &ctx->ifc_txqs[kring->ring_id];
bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);