liquidio: improve UDP TX performance
Improve UDP TX performance by:
* reducing the ring size from 2K to 512
* replacing the numerous streaming DMA allocations for info buffers and
  gather lists with one large consistent DMA allocation per ring

BQL is not effective here.  We reduced the ring size because of the heavy
overhead of dma_map_single.  With iommu=on, dma_map_single in the PF Tx
data path would intermittently take much longer (~700 usec), roughly once
every 250 packets.  Debugging the intel_iommu code showed that the PF
driver was using too many static IO virtual address mapping entries (for
gather list entries and info buffers): about 100K entries for two PFs,
each using 8 rings.  Finding a free entry in the rbtree of the device
domain's IOVA mappings then periodically becomes a bottleneck in the Tx
path: the search loop can run for over 40K iterations, and that was the
major overhead.  The overhead stays low only when this loop exits quickly.
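
As a rough sketch of the second bullet above, the change amounts to one
consistent (coherent) DMA block per Tx ring, carved into fixed-size
gather-list slots, instead of a kmalloc() plus dma_map_single() per gather
list.  The helper and struct names below (setup_ring_glist, ring_glist,
glist_entry) are invented for illustration and use the generic DMA API;
the driver itself uses lio_dma_alloc() and stores the per-ring bases in
lio->glists_virt_base[] / lio->glists_dma_base[], as in the diff below.

#include <linux/dma-mapping.h>
#include <linux/slab.h>

/* Illustrative types only; the real driver keeps these fields in
 * struct lio and struct octnic_gather.
 */
struct glist_entry {
        void *sg;               /* gather list inside the per-ring block */
        dma_addr_t sg_dma_ptr;  /* bus address of that same gather list */
};

struct ring_glist {
        void *virt_base;        /* CPU address of the per-ring block */
        dma_addr_t dma_base;    /* bus address of the per-ring block */
        struct glist_entry *entries;
};

/* One consistent allocation per ring; every gather list is a fixed,
 * 8-byte-aligned offset into it (entry_size is assumed to be rounded up
 * to a multiple of 8, as the driver does with ROUNDUP8).  The Tx hot
 * path then never calls dma_map_single()/dma_unmap_single() for gather
 * lists, so it no longer touches the IOMMU's IOVA allocator.
 */
static int setup_ring_glist(struct device *dev, struct ring_glist *r,
                            int tx_qsize, size_t entry_size)
{
        int j;

        r->entries = kcalloc(tx_qsize, sizeof(*r->entries), GFP_KERNEL);
        if (!r->entries)
                return -ENOMEM;

        r->virt_base = dma_alloc_coherent(dev, entry_size * tx_qsize,
                                          &r->dma_base, GFP_KERNEL);
        if (!r->virt_base) {
                kfree(r->entries);
                r->entries = NULL;
                return -ENOMEM;
        }

        /* Each gather list is a fixed offset into the one big block. */
        for (j = 0; j < tx_qsize; j++) {
                r->entries[j].sg = r->virt_base + j * entry_size;
                r->entries[j].sg_dma_ptr = r->dma_base + j * entry_size;
        }
        return 0;
}

Teardown is the mirror image: a single dma_free_coherent() (lio_dma_free()
in the driver) per ring in delete_glists().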

Netperf benchmark numbers before and after patch:

PF UDP TX
+--------+--------+------------+------------+---------+
|        |        |  Before    |  After     |         |
| Number |        |  Patch     |  Patch     |         |
|  of    | Packet | Throughput | Throughput | Percent |
| Flows  |  Size  |  (Gbps)    |  (Gbps)    | Change  |
+--------+--------+------------+------------+---------+
|        |   360  |   0.52     |   0.93     |  +78.9  |
|   1    |  1024  |   1.62     |   2.84     |  +75.3  |
|        |  1518  |   2.44     |   4.21     |  +72.5  |
+--------+--------+------------+------------+---------+
|        |   360  |   0.45     |   1.59     | +253.3  |
|   4    |  1024  |   1.34     |   5.48     | +308.9  |
|        |  1518  |   2.27     |   8.31     | +266.1  |
+--------+--------+------------+------------+---------+
|        |   360  |   0.40     |   1.61     | +302.5  |
|   8    |  1024  |   1.64     |   4.24     | +158.5  |
|        |  1518  |   2.87     |   6.52     | +127.2  |
+--------+--------+------------+------------+---------+

VF UDP TX
+--------+--------+------------+------------+---------+
|        |        |  Before    |  After     |         |
| Number |        |  Patch     |  Patch     |         |
|  of    | Packet | Throughput | Throughput | Percent |
| Flows  |  Size  |  (Gbps)    |  (Gbps)    | Change  |
+--------+--------+------------+------------+---------+
|        |   360  |   1.28     |   1.49     |  +16.4  |
|   1    |  1024  |   4.44     |   4.39     |   -1.1  |
|        |  1518  |   6.08     |   6.51     |   +7.1  |
+--------+--------+------------+------------+---------+
|        |   360  |   2.35     |   2.35     |    0.0  |
|   4    |  1024  |   6.41     |   8.07     |  +25.9  |
|        |  1518  |   9.56     |   9.54     |   -0.2  |
+--------+--------+------------+------------+---------+
|        |   360  |   3.41     |   3.65     |   +7.0  |
|   8    |  1024  |   9.35     |   9.34     |   -0.1  |
|        |  1518  |   9.56     |   9.57     |   +0.1  |
+--------+--------+------------+------------+---------+

Signed-off-by: VSR Burru <[email protected]>
Signed-off-by: Felix Manlunas <[email protected]>
Signed-off-by: Derek Chickles <[email protected]>
Signed-off-by: Raghu Vatsavayi <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
VSR Burru authored and davem330 committed Mar 9, 2017
1 parent 5be083c commit 67e303e
Showing 7 changed files with 144 additions and 182 deletions.
110 changes: 55 additions & 55 deletions drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -152,7 +152,7 @@ struct octnic_gather {
*/
struct octeon_sg_entry *sg;

u64 sg_dma_ptr;
dma_addr_t sg_dma_ptr;
};

struct handshake {
@@ -734,30 +734,36 @@ static void delete_glists(struct lio *lio)
struct octnic_gather *g;
int i;

kfree(lio->glist_lock);
lio->glist_lock = NULL;

if (!lio->glist)
return;

for (i = 0; i < lio->linfo.num_txpciq; i++) {
do {
g = (struct octnic_gather *)
list_delete_head(&lio->glist[i]);
if (g) {
if (g->sg) {
dma_unmap_single(&lio->oct_dev->
pci_dev->dev,
g->sg_dma_ptr,
g->sg_size,
DMA_TO_DEVICE);
kfree((void *)((unsigned long)g->sg -
g->adjust));
}
if (g)
kfree(g);
}
} while (g);

if (lio->glists_virt_base && lio->glists_virt_base[i]) {
lio_dma_free(lio->oct_dev,
lio->glist_entry_size * lio->tx_qsize,
lio->glists_virt_base[i],
lio->glists_dma_base[i]);
}
}

kfree((void *)lio->glist);
kfree((void *)lio->glist_lock);
kfree(lio->glists_virt_base);
lio->glists_virt_base = NULL;

kfree(lio->glists_dma_base);
lio->glists_dma_base = NULL;

kfree(lio->glist);
lio->glist = NULL;
}

/**
@@ -772,13 +778,30 @@ static int setup_glists(struct octeon_device *oct, struct lio *lio, int num_iqs)
lio->glist_lock = kcalloc(num_iqs, sizeof(*lio->glist_lock),
GFP_KERNEL);
if (!lio->glist_lock)
return 1;
return -ENOMEM;

lio->glist = kcalloc(num_iqs, sizeof(*lio->glist),
GFP_KERNEL);
if (!lio->glist) {
kfree((void *)lio->glist_lock);
return 1;
kfree(lio->glist_lock);
lio->glist_lock = NULL;
return -ENOMEM;
}

lio->glist_entry_size =
ROUNDUP8((ROUNDUP4(OCTNIC_MAX_SG) >> 2) * OCT_SG_ENTRY_SIZE);

/* allocate memory to store virtual and dma base address of
* per glist consistent memory
*/
lio->glists_virt_base = kcalloc(num_iqs, sizeof(*lio->glists_virt_base),
GFP_KERNEL);
lio->glists_dma_base = kcalloc(num_iqs, sizeof(*lio->glists_dma_base),
GFP_KERNEL);

if (!lio->glists_virt_base || !lio->glists_dma_base) {
delete_glists(lio);
return -ENOMEM;
}

for (i = 0; i < num_iqs; i++) {
@@ -788,6 +811,16 @@ static int setup_glists(struct octeon_device *oct, struct lio *lio, int num_iqs)

INIT_LIST_HEAD(&lio->glist[i]);

lio->glists_virt_base[i] =
lio_dma_alloc(oct,
lio->glist_entry_size * lio->tx_qsize,
&lio->glists_dma_base[i]);

if (!lio->glists_virt_base[i]) {
delete_glists(lio);
return -ENOMEM;
}

for (j = 0; j < lio->tx_qsize; j++) {
g = kzalloc_node(sizeof(*g), GFP_KERNEL,
numa_node);
@@ -796,43 +829,18 @@ static int setup_glists(struct octeon_device *oct, struct lio *lio, int num_iqs)
if (!g)
break;

g->sg_size = ((ROUNDUP4(OCTNIC_MAX_SG) >> 2) *
OCT_SG_ENTRY_SIZE);
g->sg = lio->glists_virt_base[i] +
(j * lio->glist_entry_size);

g->sg = kmalloc_node(g->sg_size + 8,
GFP_KERNEL, numa_node);
if (!g->sg)
g->sg = kmalloc(g->sg_size + 8, GFP_KERNEL);
if (!g->sg) {
kfree(g);
break;
}

/* The gather component should be aligned on 64-bit
* boundary
*/
if (((unsigned long)g->sg) & 7) {
g->adjust = 8 - (((unsigned long)g->sg) & 7);
g->sg = (struct octeon_sg_entry *)
((unsigned long)g->sg + g->adjust);
}
g->sg_dma_ptr = dma_map_single(&oct->pci_dev->dev,
g->sg, g->sg_size,
DMA_TO_DEVICE);
if (dma_mapping_error(&oct->pci_dev->dev,
g->sg_dma_ptr)) {
kfree((void *)((unsigned long)g->sg -
g->adjust));
kfree(g);
break;
}
g->sg_dma_ptr = lio->glists_dma_base[i] +
(j * lio->glist_entry_size);

list_add_tail(&g->list, &lio->glist[i]);
}

if (j != lio->tx_qsize) {
delete_glists(lio);
return 1;
return -ENOMEM;
}
}

@@ -1885,9 +1893,6 @@ static void free_netsgbuf(void *buf)
i++;
}

dma_sync_single_for_cpu(&lio->oct_dev->pci_dev->dev,
g->sg_dma_ptr, g->sg_size, DMA_TO_DEVICE);

iq = skb_iq(lio, skb);
spin_lock(&lio->glist_lock[iq]);
list_add_tail(&g->list, &lio->glist[iq]);
@@ -1933,9 +1938,6 @@ static void free_netsgbuf_with_resp(void *buf)
i++;
}

dma_sync_single_for_cpu(&lio->oct_dev->pci_dev->dev,
g->sg_dma_ptr, g->sg_size, DMA_TO_DEVICE);

iq = skb_iq(lio, skb);

spin_lock(&lio->glist_lock[iq]);
@@ -3273,8 +3275,6 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev)
i++;
}

dma_sync_single_for_device(&oct->pci_dev->dev, g->sg_dma_ptr,
g->sg_size, DMA_TO_DEVICE);
dptr = g->sg_dma_ptr;

if (OCTEON_CN23XX_PF(oct))
104 changes: 55 additions & 49 deletions drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -108,6 +108,8 @@ struct octnic_gather {
* received from the IP layer.
*/
struct octeon_sg_entry *sg;

dma_addr_t sg_dma_ptr;
};

struct octeon_device_priv {
@@ -490,24 +492,36 @@ static void delete_glists(struct lio *lio)
struct octnic_gather *g;
int i;

kfree(lio->glist_lock);
lio->glist_lock = NULL;

if (!lio->glist)
return;

for (i = 0; i < lio->linfo.num_txpciq; i++) {
do {
g = (struct octnic_gather *)
list_delete_head(&lio->glist[i]);
if (g) {
if (g->sg)
kfree((void *)((unsigned long)g->sg -
g->adjust));
if (g)
kfree(g);
}
} while (g);

if (lio->glists_virt_base && lio->glists_virt_base[i]) {
lio_dma_free(lio->oct_dev,
lio->glist_entry_size * lio->tx_qsize,
lio->glists_virt_base[i],
lio->glists_dma_base[i]);
}
}

kfree(lio->glists_virt_base);
lio->glists_virt_base = NULL;

kfree(lio->glists_dma_base);
lio->glists_dma_base = NULL;

kfree(lio->glist);
kfree(lio->glist_lock);
lio->glist = NULL;
}

/**
@@ -522,48 +536,64 @@ static int setup_glists(struct lio *lio, int num_iqs)
lio->glist_lock =
kzalloc(sizeof(*lio->glist_lock) * num_iqs, GFP_KERNEL);
if (!lio->glist_lock)
return 1;
return -ENOMEM;

lio->glist =
kzalloc(sizeof(*lio->glist) * num_iqs, GFP_KERNEL);
if (!lio->glist) {
kfree(lio->glist_lock);
return 1;
lio->glist_lock = NULL;
return -ENOMEM;
}

lio->glist_entry_size =
ROUNDUP8((ROUNDUP4(OCTNIC_MAX_SG) >> 2) * OCT_SG_ENTRY_SIZE);

/* allocate memory to store virtual and dma base address of
* per glist consistent memory
*/
lio->glists_virt_base = kcalloc(num_iqs, sizeof(*lio->glists_virt_base),
GFP_KERNEL);
lio->glists_dma_base = kcalloc(num_iqs, sizeof(*lio->glists_dma_base),
GFP_KERNEL);

if (!lio->glists_virt_base || !lio->glists_dma_base) {
delete_glists(lio);
return -ENOMEM;
}

for (i = 0; i < num_iqs; i++) {
spin_lock_init(&lio->glist_lock[i]);

INIT_LIST_HEAD(&lio->glist[i]);

lio->glists_virt_base[i] =
lio_dma_alloc(lio->oct_dev,
lio->glist_entry_size * lio->tx_qsize,
&lio->glists_dma_base[i]);

if (!lio->glists_virt_base[i]) {
delete_glists(lio);
return -ENOMEM;
}

for (j = 0; j < lio->tx_qsize; j++) {
g = kzalloc(sizeof(*g), GFP_KERNEL);
if (!g)
break;

g->sg_size = ((ROUNDUP4(OCTNIC_MAX_SG) >> 2) *
OCT_SG_ENTRY_SIZE);
g->sg = lio->glists_virt_base[i] +
(j * lio->glist_entry_size);

g->sg = kmalloc(g->sg_size + 8, GFP_KERNEL);
if (!g->sg) {
kfree(g);
break;
}
g->sg_dma_ptr = lio->glists_dma_base[i] +
(j * lio->glist_entry_size);

/* The gather component should be aligned on 64-bit
* boundary
*/
if (((unsigned long)g->sg) & 7) {
g->adjust = 8 - (((unsigned long)g->sg) & 7);
g->sg = (struct octeon_sg_entry *)
((unsigned long)g->sg + g->adjust);
}
list_add_tail(&g->list, &lio->glist[i]);
}

if (j != lio->tx_qsize) {
delete_glists(lio);
return 1;
return -ENOMEM;
}
}

@@ -1324,10 +1354,6 @@ static void free_netsgbuf(void *buf)
i++;
}

dma_unmap_single(&lio->oct_dev->pci_dev->dev,
finfo->dptr, g->sg_size,
DMA_TO_DEVICE);

iq = skb_iq(lio, skb);

spin_lock(&lio->glist_lock[iq]);
@@ -1374,10 +1400,6 @@ static void free_netsgbuf_with_resp(void *buf)
i++;
}

dma_unmap_single(&lio->oct_dev->pci_dev->dev,
finfo->dptr, g->sg_size,
DMA_TO_DEVICE);

iq = skb_iq(lio, skb);

spin_lock(&lio->glist_lock[iq]);
@@ -2382,23 +2404,7 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev)
i++;
}

dptr = dma_map_single(&oct->pci_dev->dev,
g->sg, g->sg_size,
DMA_TO_DEVICE);
if (dma_mapping_error(&oct->pci_dev->dev, dptr)) {
dev_err(&oct->pci_dev->dev, "%s DMA mapping error 4\n",
__func__);
dma_unmap_single(&oct->pci_dev->dev, g->sg[0].ptr[0],
skb->len - skb->data_len,
DMA_TO_DEVICE);
for (j = 1; j <= frags; j++) {
frag = &skb_shinfo(skb)->frags[j - 1];
dma_unmap_page(&oct->pci_dev->dev,
g->sg[j >> 2].ptr[j & 3],
frag->size, DMA_TO_DEVICE);
}
return NETDEV_TX_BUSY;
}
dptr = g->sg_dma_ptr;

ndata.cmd.cmd3.dptr = dptr;
finfo->dptr = dptr;
6 changes: 3 additions & 3 deletions drivers/net/ethernet/cavium/liquidio/octeon_config.h
@@ -71,17 +71,17 @@
#define CN23XX_MAX_RINGS_PER_VF 8

#define CN23XX_MAX_INPUT_QUEUES CN23XX_MAX_RINGS_PER_PF
#define CN23XX_MAX_IQ_DESCRIPTORS 2048
#define CN23XX_MAX_IQ_DESCRIPTORS 512
#define CN23XX_DB_MIN 1
#define CN23XX_DB_MAX 8
#define CN23XX_DB_TIMEOUT 1

#define CN23XX_MAX_OUTPUT_QUEUES CN23XX_MAX_RINGS_PER_PF
#define CN23XX_MAX_OQ_DESCRIPTORS 2048
#define CN23XX_MAX_OQ_DESCRIPTORS 512
#define CN23XX_OQ_BUF_SIZE 1536
#define CN23XX_OQ_PKTSPER_INTR 128
/*#define CAVIUM_ONLY_CN23XX_RX_PERF*/
#define CN23XX_OQ_REFIL_THRESHOLD 128
#define CN23XX_OQ_REFIL_THRESHOLD 16

#define CN23XX_OQ_INTR_PKT 64
#define CN23XX_OQ_INTR_TIME 100
(diffs for the remaining 4 changed files not shown)
