diff --git a/Documentation/automake.mk b/Documentation/automake.mk
index bc728dff315..24447949061 100644
--- a/Documentation/automake.mk
+++ b/Documentation/automake.mk
@@ -36,6 +36,7 @@ DOC_SOURCE = \
Documentation/topics/dpdk/index.rst \
Documentation/topics/dpdk/bridge.rst \
Documentation/topics/dpdk/jumbo-frames.rst \
+ Documentation/topics/dpdk/memory.rst \
Documentation/topics/dpdk/pdump.rst \
Documentation/topics/dpdk/phy.rst \
Documentation/topics/dpdk/pmd.rst \
diff --git a/Documentation/intro/install/dpdk.rst b/Documentation/intro/install/dpdk.rst
index 9e885ec5284..2468c641b08 100644
--- a/Documentation/intro/install/dpdk.rst
+++ b/Documentation/intro/install/dpdk.rst
@@ -170,6 +170,12 @@ Mount the hugepages, if not already mounted by default::
$ mount -t hugetlbfs none /dev/hugepages``
+.. note::
+
+ The amount of hugepage memory required can be affected by various
+ aspects of the datapath and device configuration. Refer to
+ :doc:`/topics/dpdk/memory` for more details.
+
.. _dpdk-vfio:
Setup DPDK devices using VFIO
diff --git a/Documentation/topics/dpdk/index.rst b/Documentation/topics/dpdk/index.rst
index 181f61abbab..cf24a7b6d47 100644
--- a/Documentation/topics/dpdk/index.rst
+++ b/Documentation/topics/dpdk/index.rst
@@ -40,3 +40,4 @@ The DPDK Datapath
/topics/dpdk/qos
/topics/dpdk/pdump
/topics/dpdk/jumbo-frames
+ /topics/dpdk/memory
diff --git a/Documentation/topics/dpdk/memory.rst b/Documentation/topics/dpdk/memory.rst
new file mode 100644
index 00000000000..e5fb166d5f2
--- /dev/null
+++ b/Documentation/topics/dpdk/memory.rst
@@ -0,0 +1,216 @@
+..
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License"); you may
+ not use this file except in compliance with the License. You may obtain
+ a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ License for the specific language governing permissions and limitations
+ under the License.
+
+ Convention for heading levels in Open vSwitch documentation:
+
+ ======= Heading 0 (reserved for the title in a document)
+ ------- Heading 1
+ ~~~~~~~ Heading 2
+ +++++++ Heading 3
+ ''''''' Heading 4
+
+ Avoid deeper levels because they do not render well.
+
+=========================
+DPDK Device Memory Models
+=========================
+
+DPDK device memory can be allocated in one of two ways in OVS DPDK,
+**shared memory** or **per port memory**. The specifics of both are
+detailed below.
+
+Shared Memory
+-------------
+
+By default OVS DPDK uses a shared memory model. This means that multiple
+ports can share the same mempool. For example when a port is added it will
+have a given MTU and socket ID associated with it. If a mempool has been
+created previously for an existing port that has the same MTU and socket ID,
+that mempool is used for both ports. If there is no existing mempool
+supporting these parameters then a new mempool is created.
+
+Per Port Memory
+---------------
+
+In the per port memory model, mempools are created per device and are not
+shared. The benefit of this is a more transparent memory model where mempools
+will not be exhausted by other DPDK devices. However this comes at a potential
+increase in cost for memory dimensioning for a given deployment. Users should
+be aware of the memory requirements for their deployment before using this
+model and allocate the required hugepage memory.
+
+Per port mempool support may be enabled via a global config value,
+``per-port-memory``. Setting this to true enables the per port memory
+model for all DPDK devices in OVS::
+
+ $ ovs-vsctl set Open_vSwitch . other_config:per-port-memory=true
+
+.. important::
+
+ This value should be set before setting dpdk-init=true. If set after
+ dpdk-init=true then the daemon must be restarted to use per-port-memory.
+
+Calculating Memory Requirements
+-------------------------------
+
+The amount of memory required for a given mempool can be calculated by the
+**number of mbufs in the mempool \* mbuf size**.
+
+Users should be aware of the following:
+
+* The **number of mbufs** per mempool will differ between memory models.
+
+* The **size of each mbuf** will be affected by the requested **MTU** size.
+
+.. important::
+
+ An mbuf size in bytes is always larger than the requested MTU size due to
+ alignment and rounding needed in OVS DPDK.
+
+Below are a number of examples of memory requirement calculations for both
+shared and per port memory models.
+
+Shared Memory Calculations
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In the shared memory model the number of mbufs requested is directly
+affected by the requested MTU size as described in the table below.
+
++--------------------+-------------+
+| MTU Size | Num MBUFS |
++====================+=============+
+| 1500 or greater | 262144 |
++--------------------+-------------+
+| Less than 1500 | 16384 |
++--------------------+-------------+
+
+.. important::
+
+ If a deployment does not have enough memory to provide 262144 mbufs then
+ the requested amount is halved until a minimum of 16384 is reached.
+
+Example 1
++++++++++
+::
+
+ MTU = 1500 Bytes
+ Number of mbufs = 262144
+ Mbuf size = 3008 Bytes
+ Memory required = 262144 * 3008 = 788 MB
+
+Example 2
++++++++++
+::
+
+ MTU = 1800 Bytes
+ Number of mbufs = 262144
+ Mbuf size = 3008 Bytes
+ Memory required = 262144 * 3008 = 788 MB
+
+.. note::
+
+ Assuming the same socket is in use for example 1 and 2 the same mempool
+ would be shared.
+
+Example 3
++++++++++
+::
+
+ MTU = 6000 Bytes
+ Number of mbufs = 262144
+ Mbuf size = 8128 Bytes
+ Memory required = 262144 * 8128 = 2130 MB
+
+Example 4
++++++++++
+::
+
+ MTU = 9000 Bytes
+ Number of mbufs = 262144
+ Mbuf size = 10176 Bytes
+ Memory required = 262144 * 10176 = 2667 MB
+
+Per Port Memory Calculations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The number of mbufs requested in the per port model is more complicated and
+accounts for multiple dynamic factors in the datapath and device
+configuration.
+
+A rough estimation of the number of mbufs required for a port is:
+::
+
+ packets required to fill the device rxqs +
+ packets that could be stuck on other ports txqs +
+ packets on the pmd threads +
+ additional corner case memory.
+
+The algorithm in OVS used to calculate this is as follows:
+::
+
+ requested number of rxqs * requested rxq size +
+ requested number of txqs * requested txq size +
+ min(RTE_MAX_LCORE, requested number of rxqs) * netdev_max_burst +
+ MIN_NB_MBUF.
+
+where:
+
+* **requested number of rxqs**: Number of requested receive queues for a
+ device.
+* **requested rxq size**: The number of descriptors requested for a rx queue.
+* **requested number of txqs**: Number of requested transmit queues for a
+ device. Calculated as the number of PMDs configured, plus one.
+* **requested txq size**: the number of descriptors requested for a tx queue.
+* **min(RTE_MAX_LCORE, requested number of rxqs)**: Compare the maximum
+ number of lcores supported by DPDK to the number of requested receive
+ queues for the device and use the variable of lesser value.
+* **NETDEV_MAX_BURST**: Maximum number of packets in a burst, defined as
+ 32.
+* **MIN_NB_MBUF**: Additional memory for corner case, defined as 16384.
+
+For all examples below assume the following values:
+
+* requested_rxq_size = 2048
+* requested_txq_size = 2048
+* RTE_MAX_LCORE = 128
+* netdev_max_burst = 32
+* MIN_NB_MBUF = 16384
+
+Example 1: (1 rxq, 1 PMD, 1500 MTU)
++++++++++++++++++++++++++++++++++++
+::
+
+ MTU = 1500
+ Number of mbufs = (1 * 2048) + (2 * 2048) + (1 * 32) + (16384) = 22560
+ Mbuf size = 3008 Bytes
+ Memory required = 22560 * 3008 = 67 MB
+
+Example 2: (1 rxq, 2 PMD, 6000 MTU)
++++++++++++++++++++++++++++++++++++
+::
+
+ MTU = 6000
+ Number of mbufs = (1 * 2048) + (3 * 2048) + (1 * 32) + (16384) = 24608
+ Mbuf size = 8128 Bytes
+ Memory required = 24608 * 8128 = 200 MB
+
+Example 3: (2 rxq, 2 PMD, 9000 MTU)
++++++++++++++++++++++++++++++++++++
+::
+
+ MTU = 9000
+ Number of mbufs = (2 * 2048) + (3 * 2048) + (1 * 32) + (16384) = 26656
+ Mbuf size = 10176 Bytes
+ Memory required = 26656 * 10176 = 271 MB
diff --git a/NEWS b/NEWS
index 74dca151660..b832cae7017 100644
--- a/NEWS
+++ b/NEWS
@@ -35,6 +35,7 @@ Post-v2.9.0
* Add LSC interrupt support for DPDK physical devices.
* Allow init to fail and record DPDK status/version in OVS database.
* Add experimental flow hardware offload support
+ * Support both shared and per port mempools for DPDK devices.
- Userspace datapath:
* Commands ovs-appctl dpif-netdev/pmd-*-show can now work on a single PMD
* Detailed PMD performance metrics available with new command
diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c
index 1df1c5848a3..1e0f46101d9 100644
--- a/lib/dpdk-stub.c
+++ b/lib/dpdk-stub.c
@@ -56,6 +56,12 @@ dpdk_vhost_iommu_enabled(void)
return false;
}
+bool
+dpdk_per_port_memory(void)
+{
+ return false;
+}
+
void
print_dpdk_version(void)
{
diff --git a/lib/dpdk.c b/lib/dpdk.c
index 5c68ce430d6..0ee3e19c63b 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -48,6 +48,7 @@ static char *vhost_sock_dir = NULL; /* Location of vhost-user sockets */
static bool vhost_iommu_enabled = false; /* Status of vHost IOMMU support */
static bool dpdk_initialized = false; /* Indicates successful initialization
* of DPDK. */
+static bool per_port_memory = false; /* Status of per port memory support */
static int
process_vhost_flags(char *flag, const char *default_val, int size,
@@ -384,6 +385,11 @@ dpdk_init__(const struct smap *ovs_other_config)
VLOG_INFO("IOMMU support for vhost-user-client %s.",
vhost_iommu_enabled ? "enabled" : "disabled");
+ per_port_memory = smap_get_bool(ovs_other_config,
+ "per-port-memory", false);
+ VLOG_INFO("Per port memory for DPDK devices %s.",
+ per_port_memory ? "enabled" : "disabled");
+
argv = grow_argv(&argv, 0, 1);
argc = 1;
argv[0] = xstrdup(ovs_get_program_name());
@@ -541,6 +547,12 @@ dpdk_vhost_iommu_enabled(void)
return vhost_iommu_enabled;
}
+bool
+dpdk_per_port_memory(void)
+{
+ return per_port_memory;
+}
+
void
dpdk_set_lcore_id(unsigned cpu)
{
diff --git a/lib/dpdk.h b/lib/dpdk.h
index efdaa637c2b..bbb89d4e6f0 100644
--- a/lib/dpdk.h
+++ b/lib/dpdk.h
@@ -39,6 +39,7 @@ void dpdk_init(const struct smap *ovs_other_config);
void dpdk_set_lcore_id(unsigned cpu);
const char *dpdk_get_vhost_sock_dir(void);
bool dpdk_vhost_iommu_enabled(void);
+bool dpdk_per_port_memory(void);
void print_dpdk_version(void);
void dpdk_status(const struct ovsrec_open_vswitch *);
#endif /* dpdk.h */
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 48e10f3cdc5..bb4d60f26cc 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -95,13 +95,24 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
#define NETDEV_DPDK_MBUF_ALIGN 1024
#define NETDEV_DPDK_MAX_PKT_LEN 9728
-/* Min number of packets in the mempool. OVS tries to allocate a mempool with
- * roughly estimated number of mbufs: if this fails (because the system doesn't
- * have enough hugepages) we keep halving the number until the allocation
- * succeeds or we reach MIN_NB_MBUF */
+/* Max and min number of packets in the mempool. OVS tries to allocate a
+ * mempool with MAX_NB_MBUF: if this fails (because the system doesn't have
+ * enough hugepages) we keep halving the number until the allocation succeeds
+ * or we reach MIN_NB_MBUF */
+
+#define MAX_NB_MBUF (4096 * 64)
#define MIN_NB_MBUF (4096 * 4)
#define MP_CACHE_SZ RTE_MEMPOOL_CACHE_MAX_SIZE
+/* MAX_NB_MBUF can be divided by 2 many times, until MIN_NB_MBUF */
+BUILD_ASSERT_DECL(MAX_NB_MBUF % ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF)
+ == 0);
+
+/* The smallest possible NB_MBUF that we're going to try should be a multiple
+ * of MP_CACHE_SZ. This is advised by DPDK documentation. */
+BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF))
+ % MP_CACHE_SZ == 0);
+
/*
* DPDK XSTATS Counter names definition
*/
@@ -312,12 +323,14 @@ static struct ovs_mutex dpdk_mp_mutex OVS_ACQ_AFTER(dpdk_mutex)
= OVS_MUTEX_INITIALIZER;
/* Contains all 'struct dpdk_mp's. */
-static struct ovs_list dpdk_mp_free_list OVS_GUARDED_BY(dpdk_mp_mutex)
- = OVS_LIST_INITIALIZER(&dpdk_mp_free_list);
+static struct ovs_list dpdk_mp_list OVS_GUARDED_BY(dpdk_mp_mutex)
+ = OVS_LIST_INITIALIZER(&dpdk_mp_list);
-/* Wrapper for a mempool released but not yet freed. */
struct dpdk_mp {
struct rte_mempool *mp;
+ int mtu;
+ int socket_id;
+ int refcount;
struct ovs_list list_node OVS_GUARDED_BY(dpdk_mp_mutex);
};
@@ -399,7 +412,7 @@ struct netdev_dpdk {
PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1,
struct ovs_mutex mutex OVS_ACQ_AFTER(dpdk_mutex);
- struct rte_mempool *mp;
+ struct dpdk_mp *dpdk_mp;
/* virtio identifier for vhost devices */
ovsrcu_index vid;
@@ -565,68 +578,89 @@ dpdk_mp_full(const struct rte_mempool *mp) OVS_REQUIRES(dpdk_mp_mutex)
/* Free unused mempools. */
static void
-dpdk_mp_sweep(void)
+dpdk_mp_sweep(void) OVS_REQUIRES(dpdk_mp_mutex)
{
struct dpdk_mp *dmp, *next;
- ovs_mutex_lock(&dpdk_mp_mutex);
- LIST_FOR_EACH_SAFE (dmp, next, list_node, &dpdk_mp_free_list) {
- if (dpdk_mp_full(dmp->mp)) {
+ LIST_FOR_EACH_SAFE (dmp, next, list_node, &dpdk_mp_list) {
+ if (!dmp->refcount && dpdk_mp_full(dmp->mp)) {
VLOG_DBG("Freeing mempool \"%s\"", dmp->mp->name);
ovs_list_remove(&dmp->list_node);
rte_mempool_free(dmp->mp);
rte_free(dmp);
}
}
- ovs_mutex_unlock(&dpdk_mp_mutex);
}
-/* Ensure a mempool will not be freed. */
-static void
-dpdk_mp_do_not_free(struct rte_mempool *mp) OVS_REQUIRES(dpdk_mp_mutex)
+/* Calculating the required number of mbufs differs depending on the
+ * mempool model being used. Check if per port memory is in use before
+ * calculating.
+ */
+static uint32_t
+dpdk_calculate_mbufs(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
{
- struct dpdk_mp *dmp, *next;
+ uint32_t n_mbufs;
- LIST_FOR_EACH_SAFE (dmp, next, list_node, &dpdk_mp_free_list) {
- if (dmp->mp == mp) {
- VLOG_DBG("Removing mempool \"%s\" from free list", dmp->mp->name);
- ovs_list_remove(&dmp->list_node);
- rte_free(dmp);
- break;
+ if (!per_port_mp) {
+ /* Shared memory is being used.
+ * XXX: this is a really rough method of provisioning memory.
+ * It's impossible to determine what the exact memory requirements are
+ * when the number of ports and rxqs that utilize a particular mempool
+ * can change dynamically at runtime. For now, use this rough
+ * heuristic.
+ */
+ if (mtu >= ETHER_MTU) {
+ n_mbufs = MAX_NB_MBUF;
+ } else {
+ n_mbufs = MIN_NB_MBUF;
}
+ } else {
+ /* Per port memory is being used.
+ * XXX: rough estimation of number of mbufs required for this port:
+ *
+ * packets required to fill the device rxqs +
+ * packets that could be stuck on other ports txqs +
+ * packets on the pmd threads +
+ * additional corner case memory.
+ */