Skip to content

Commit

Permalink
tunneling: Avoid datapath-recirc by combining recirc actions at xlate.
Browse files Browse the repository at this point in the history
This patch set removes the recirculation of encapsulated tunnel packets
if possible. It is done by computing the post tunnel actions at the time of
translation. The combined nested action set is programmed in the datapath
using the CLONE action.

The following test results show the performance improvement offered by
this optimization for tunnel encap.

          +-------------+
      dpdk0 |             |
         -->o    br-in    |
            |             o--> gre0
            +-------------+

                   --> LOCAL
            +-----------o-+
            |             | dpdk1
            |    br-p1    o-->
            |             |
            +-------------+

Test result on OVS master with DPDK 16.11.2 (Without optimization):

 # dpdk0

 RX packets         : 7037641.60  / sec
 RX packet errors   : 0  / sec
 RX packets dropped : 7730632.90  / sec
 RX rate            : 402.69 MB/sec

 # dpdk1

 TX packets         : 7037641.60  / sec
 TX packet errors   : 0  / sec
 TX packets dropped : 0  / sec
 TX rate            : 657.73 MB/sec
 TX processing cost per TX packets in nsec : 142.09

Test result on OVS master + DPDK 16.11.2 (With optimization):

 # dpdk0

 RX packets         : 9386809.60  / sec
 RX packet errors   : 0  / sec
 RX packets dropped : 5381496.40  / sec
 RX rate            : 537.11 MB/sec

 # dpdk1

 TX packets         : 9386809.60  / sec
 TX packet errors   : 0  / sec
 TX packets dropped : 0  / sec
 TX rate            : 877.29 MB/sec
 TX processing cost per TX packets in nsec : 106.53

The offered performance gain is approx 30%.

Signed-off-by: Sugesh Chandran <[email protected]>
Signed-off-by: Zoltán Balogh <[email protected]>
Co-authored-by: Zoltán Balogh <[email protected]>
Signed-off-by: Joe Stringer <[email protected]>
  • Loading branch information
2 people authored and joestringer committed Jul 19, 2017
1 parent ce8bbd3 commit 7c12dfc
Show file tree
Hide file tree
Showing 8 changed files with 295 additions and 62 deletions.
18 changes: 1 addition & 17 deletions lib/dpif-netdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -5048,24 +5048,8 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,

case OVS_ACTION_ATTR_TUNNEL_PUSH:
if (*depth < MAX_RECIRC_DEPTH) {
struct dp_packet_batch tnl_pkt;
struct dp_packet_batch *orig_packets_ = packets_;
int err;

if (!may_steal) {
dp_packet_batch_clone(&tnl_pkt, packets_);
packets_ = &tnl_pkt;
dp_packet_batch_reset_cutlen(orig_packets_);
}

dp_packet_batch_apply_cutlen(packets_);

err = push_tnl_action(pmd, a, packets_);
if (!err) {
(*depth)++;
dp_netdev_recirculate(pmd, packets_);
(*depth)--;
}
push_tnl_action(pmd, a, packets_);
return;
}
break;
Expand Down
2 changes: 1 addition & 1 deletion lib/netdev-native-tnl.c
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ netdev_tnl_ip_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
*
* This function sets the IP header's ip_tot_len field (which should be zeroed
* as part of 'header') and puts its value into '*ip_tot_size' as well. Also
* updates IP header checksum.
* updates IP header checksum, as well as the l3 and l4 offsets in 'packet'.
*
* Return pointer to the L4 header added to 'packet'. */
void *
Expand Down
33 changes: 31 additions & 2 deletions ofproto/ofproto-dpif-xlate-cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ xlate_cache_netdev(struct xc_entry *entry, const struct dpif_flow_stats *stats)
/* Push stats and perform side effects of flow translation. */
void
xlate_push_stats_entry(struct xc_entry *entry,
const struct dpif_flow_stats *stats)
struct dpif_flow_stats *stats)
{
struct eth_addr dmac;

Expand Down Expand Up @@ -159,6 +159,14 @@ xlate_push_stats_entry(struct xc_entry *entry,
entry->controller.am);
entry->controller.am = NULL; /* One time only. */
}
break;
case XC_TUNNEL_HEADER:
if (entry->tunnel_hdr.operation == ADD) {
stats->n_bytes += stats->n_packets * entry->tunnel_hdr.hdr_size;
} else {
stats->n_bytes -= stats->n_packets * entry->tunnel_hdr.hdr_size;
}

break;
default:
OVS_NOT_REACHED();
Expand All @@ -167,7 +175,7 @@ xlate_push_stats_entry(struct xc_entry *entry,

void
xlate_push_stats(struct xlate_cache *xcache,
const struct dpif_flow_stats *stats)
struct dpif_flow_stats *stats)
{
if (!stats->n_packets) {
return;
Expand Down Expand Up @@ -245,6 +253,8 @@ xlate_cache_clear_entry(struct xc_entry *entry)
entry->controller.am = NULL;
}
break;
case XC_TUNNEL_HEADER:
break;
default:
OVS_NOT_REACHED();
}
Expand Down Expand Up @@ -279,3 +289,22 @@ xlate_cache_delete(struct xlate_cache *xcache)
xlate_cache_uninit(xcache);
free(xcache);
}

/* Moves every cached entry from 'src' into 'dst', leaving 'src' empty.
 * The caller must own both caches.  The 'src' cache itself is not freed
 * here; that remains the caller's responsibility. */
void
xlate_cache_steal_entries(struct xlate_cache *dst, struct xlate_cache *src)
{
    if (!dst || !src) {
        return;
    }

    struct ofpbuf *from = &src->entries;
    size_t nbytes = from->size;

    memcpy(ofpbuf_put_uninit(&dst->entries, nbytes), from->data, nbytes);
    ofpbuf_clear(from);
}
13 changes: 11 additions & 2 deletions ofproto/ofproto-dpif-xlate-cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ enum xc_type {
XC_GROUP,
XC_TNL_NEIGH,
XC_CONTROLLER,
XC_TUNNEL_HEADER,
};

/* xlate_cache entries hold enough information to perform the side effects of
Expand Down Expand Up @@ -119,6 +120,13 @@ struct xc_entry {
struct ofproto_dpif *ofproto;
struct ofproto_async_msg *am;
} controller;
struct {
enum {
ADD,
REMOVE,
} operation;
uint16_t hdr_size;
} tunnel_hdr;
};
};

Expand All @@ -134,11 +142,12 @@ struct xlate_cache {
void xlate_cache_init(struct xlate_cache *);
struct xlate_cache *xlate_cache_new(void);
struct xc_entry *xlate_cache_add_entry(struct xlate_cache *, enum xc_type);
void xlate_push_stats_entry(struct xc_entry *, const struct dpif_flow_stats *);
void xlate_push_stats(struct xlate_cache *, const struct dpif_flow_stats *);
void xlate_push_stats_entry(struct xc_entry *, struct dpif_flow_stats *);
void xlate_push_stats(struct xlate_cache *, struct dpif_flow_stats *);
void xlate_cache_clear_entry(struct xc_entry *);
void xlate_cache_clear(struct xlate_cache *);
void xlate_cache_uninit(struct xlate_cache *);
void xlate_cache_delete(struct xlate_cache *);
void xlate_cache_steal_entries(struct xlate_cache *, struct xlate_cache *);

#endif /* ofproto-dpif-xlate-cache.h */
237 changes: 232 additions & 5 deletions ofproto/ofproto-dpif-xlate.c
Original file line number Diff line number Diff line change
Expand Up @@ -3147,6 +3147,206 @@ tnl_send_arp_request(struct xlate_ctx *ctx, const struct xport *out_dev,
dp_packet_uninit(&packet);
}

/* Overwrites the L2/L3/L4 fields of 'dst_flow' with the outer-header values
 * taken from 'src_flow''s tunnel metadata plus the resolved neighbor
 * addresses ('dmac', 'smac', 's_ip6'/'s_ip').  'dst_flow' may alias
 * 'src_flow': only non-tunnel fields are written, so the tunnel metadata
 * being read stays intact. */
static void
propagate_tunnel_data_to_flow__(struct flow *dst_flow,
                                const struct flow *src_flow,
                                struct eth_addr dmac, struct eth_addr smac,
                                struct in6_addr s_ip6, ovs_be32 s_ip,
                                bool is_tnl_ipv6, uint8_t nw_proto)
{
    const struct flow_tnl *tnl = &src_flow->tunnel;

    dst_flow->dl_dst = dmac;
    dst_flow->dl_src = smac;
    dst_flow->packet_type = htonl(PT_ETH);

    dst_flow->nw_dst = tnl->ip_dst;
    dst_flow->nw_src = tnl->ip_src;
    dst_flow->ipv6_dst = tnl->ipv6_dst;
    dst_flow->ipv6_src = tnl->ipv6_src;

    dst_flow->nw_tos = tnl->ip_tos;
    dst_flow->nw_ttl = tnl->ip_ttl;
    dst_flow->tp_dst = tnl->tp_dst;
    dst_flow->tp_src = tnl->tp_src;

    if (is_tnl_ipv6) {
        dst_flow->dl_type = htons(ETH_TYPE_IPV6);
        /* Fall back to the route's source address when the tunnel did not
         * specify an IPv6 source. */
        if (ipv6_mask_is_any(&dst_flow->ipv6_src)
            && !ipv6_mask_is_any(&s_ip6)) {
            dst_flow->ipv6_src = s_ip6;
        }
    } else {
        dst_flow->dl_type = htons(ETH_TYPE_IP);
        if (!dst_flow->nw_src && s_ip) {
            dst_flow->nw_src = s_ip;
        }
    }
    dst_flow->nw_proto = nw_proto;
}

/* Rewrites ctx->xin->flow and ctx->base_flow so that the post-tunnel-push
 * translation sees the outer (encapsulating) header rather than the inner
 * packet's one. */
static void
propagate_tunnel_data_to_flow(struct xlate_ctx *ctx, struct eth_addr dmac,
                              struct eth_addr smac, struct in6_addr s_ip6,
                              ovs_be32 s_ip, bool is_tnl_ipv6,
                              enum ovs_vport_type tnl_type)
{
    uint8_t nw_proto = 0;

    /* Outer IP protocol implied by the tunnel type. */
    switch (tnl_type) {
    case OVS_VPORT_TYPE_GRE:
        nw_proto = IPPROTO_GRE;
        break;
    case OVS_VPORT_TYPE_VXLAN:
    case OVS_VPORT_TYPE_GENEVE:
        nw_proto = IPPROTO_UDP;
        break;
    case OVS_VPORT_TYPE_LISP:
    case OVS_VPORT_TYPE_STT:
    case OVS_VPORT_TYPE_UNSPEC:
    case OVS_VPORT_TYPE_NETDEV:
    case OVS_VPORT_TYPE_INTERNAL:
    case __OVS_VPORT_TYPE_MAX:
    default:
        OVS_NOT_REACHED();
        break;
    }

    /* 'flow' is the source for both updates, so refresh base_flow before
     * 'flow' itself is overwritten in place. */
    struct flow *flow = &ctx->xin->flow;
    propagate_tunnel_data_to_flow__(&ctx->base_flow, flow, dmac, smac,
                                    s_ip6, s_ip, is_tnl_ipv6, nw_proto);
    propagate_tunnel_data_to_flow__(flow, flow, dmac, smac, s_ip6, s_ip,
                                    is_tnl_ipv6, nw_proto);
}

/* Validates whether the translated combined actions are OK to proceed.
 * If the actions contain a TRUNC action, the tunnel_push combine is not
 * allowed, as stats cannot be updated correctly in that case.
 */
static bool
is_tunnel_actions_clone_ready(struct xlate_ctx *ctx)
{
    struct nlattr *tnl_actions;
    const struct nlattr *a;
    unsigned int left;
    size_t actions_len;
    struct ofpbuf *actions = ctx->odp_actions;

    if (!actions) {
        /* No actions, no harm in doing combine. */
        return true;
    }

    /* Cannot perform tunnel push on slow path action CONTROLLER_OUTPUT. */
    if (ctx->xout->slow & SLOW_CONTROLLER) {
        return false;
    }
    actions_len = actions->size;

    tnl_actions = (struct nlattr *) actions->data;
    NL_ATTR_FOR_EACH_UNSAFE (a, left, tnl_actions, actions_len) {
        int type = nl_attr_type(a);
        if (type == OVS_ACTION_ATTR_TRUNC) {
            VLOG_DBG("Cannot do tunnel action-combine on trunc action");
            return false;
        }
    }
    return true;
}

/* Tries to translate the actions that would apply to the packet after the
 * tunnel push (the bridge reached through 'out_dev') and nest them, together
 * with the tunnel push itself, inside a single CLONE action in
 * ctx->odp_actions, so the datapath need not recirculate.
 *
 * Returns true and leaves the combined CLONE action in ctx->odp_actions on
 * success; on failure cancels the partially built CLONE and returns false so
 * the caller can fall back to an explicit recirculation.  The translation
 * context (wildcards, resubmit stats, xcache, side-effect flag and packet
 * pointer) is saved on entry and restored before returning. */
static bool
validate_and_combine_post_tnl_actions(struct xlate_ctx *ctx,
                                      const struct xport *xport,
                                      struct xport *out_dev,
                                      struct ovs_action_push_tnl tnl_push_data)
{
    const struct dpif_flow_stats *backup_resubmit_stats;
    struct xlate_cache *backup_xcache;
    bool nested_act_flag = false;
    struct flow_wildcards tmp_flow_wc;
    struct flow_wildcards *backup_flow_wc_ptr;
    bool backup_side_effects;
    const struct dp_packet *backup_pkt;

    /* Save the translation context; the nested translation below must not
     * affect the caller's wildcards, stats, cache, or side effects. */
    memset(&tmp_flow_wc, 0 , sizeof tmp_flow_wc);
    backup_flow_wc_ptr = ctx->wc;
    ctx->wc = &tmp_flow_wc;
    ctx->xin->wc = NULL;
    backup_resubmit_stats = ctx->xin->resubmit_stats;
    backup_xcache = ctx->xin->xcache;
    backup_side_effects = ctx->xin->allow_side_effects;
    backup_pkt = ctx->xin->packet;

    /* Open the CLONE attribute and emit the tunnel push inside it; record
     * the sizes needed later to tell whether nested actions were added. */
    size_t push_action_size = 0;
    size_t clone_ofs = nl_msg_start_nested(ctx->odp_actions,
                                           OVS_ACTION_ATTR_CLONE);
    odp_put_tnl_push_action(ctx->odp_actions, &tnl_push_data);
    push_action_size = ctx->odp_actions->size;

    ctx->xin->resubmit_stats = NULL;
    ctx->xin->xcache = xlate_cache_new(); /* Use new temporary cache. */
    ctx->xin->allow_side_effects = false;
    ctx->xin->packet = NULL;

    /* Push the cache entry for the tunnel first, so stats pushed through
     * this cache account for the added header bytes. */
    struct xc_entry *entry;
    entry = xlate_cache_add_entry(ctx->xin->xcache, XC_TUNNEL_HEADER);
    entry->tunnel_hdr.hdr_size = tnl_push_data.header_len;
    entry->tunnel_hdr.operation = ADD;

    apply_nested_clone_actions(ctx, xport, out_dev);
    nested_act_flag = is_tunnel_actions_clone_ready(ctx);

    if (nested_act_flag) {
         /* Similar to the stats update in revalidation, the xcache entries
          * populated by the nested translation above are used here to
          * update the stats correctly.
          */
        if (backup_resubmit_stats) {
            struct dpif_flow_stats tmp_resubmit_stats;
            memcpy(&tmp_resubmit_stats, backup_resubmit_stats,
                   sizeof tmp_resubmit_stats);
            xlate_push_stats(ctx->xin->xcache, &tmp_resubmit_stats);
        }
        /* Hand the temporary cache's entries to the caller's cache so they
         * survive the xlate_cache_delete() below. */
        xlate_cache_steal_entries(backup_xcache, ctx->xin->xcache);
    } else {
        /* Combine is not valid. */
        nl_msg_cancel_nested(ctx->odp_actions, clone_ofs);
        goto out;
    }
    if (ctx->odp_actions->size > push_action_size) {
        /* Update the CLONE action only when combined. */
        nl_msg_end_nested(ctx->odp_actions, clone_ofs);
    } else {
        nl_msg_cancel_nested(ctx->odp_actions, clone_ofs);
        /* XXX : There is no real use-case for a tunnel push without
         * any post actions. However keeping it now
         * as is to make the 'make check' happy. Should remove when all the
         * make check tunnel test case does something meaningful on a
         * tunnel encap packets.
         */
        odp_put_tnl_push_action(ctx->odp_actions, &tnl_push_data);
    }

out:
    /* Restore context status. */
    ctx->xin->resubmit_stats = backup_resubmit_stats;
    xlate_cache_delete(ctx->xin->xcache);
    ctx->xin->xcache = backup_xcache;
    ctx->xin->allow_side_effects = backup_side_effects;
    ctx->xin->packet = backup_pkt;
    ctx->wc = backup_flow_wc_ptr;
    return nested_act_flag;
}

static int
build_tunnel_send(struct xlate_ctx *ctx, const struct xport *xport,
const struct flow *flow, odp_port_t tunnel_odp_port)
Expand All @@ -3163,6 +3363,14 @@ build_tunnel_send(struct xlate_ctx *ctx, const struct xport *xport,
char buf_sip6[INET6_ADDRSTRLEN];
char buf_dip6[INET6_ADDRSTRLEN];

/* Structures to backup Ethernet and IP of base_flow. */
struct flow old_base_flow;
struct flow old_flow;

/* Backup flow & base_flow data. */
memcpy(&old_base_flow, &ctx->base_flow, sizeof old_base_flow);
memcpy(&old_flow, &ctx->xin->flow, sizeof old_flow);

err = tnl_route_lookup_flow(flow, &d_ip6, &s_ip6, &out_dev);
if (err) {
xlate_report(ctx, OFT_WARN, "native tunnel routing failed");
Expand Down Expand Up @@ -3222,12 +3430,31 @@ build_tunnel_send(struct xlate_ctx *ctx, const struct xport *xport,
tnl_push_data.tnl_port = tunnel_odp_port;
tnl_push_data.out_port = out_dev->odp_port;

/* After tunnel header has been added, packet_type of flow and base_flow
* need to be set to PT_ETH. */
ctx->xin->flow.packet_type = htonl(PT_ETH);
ctx->base_flow.packet_type = htonl(PT_ETH);
/* After tunnel header has been added, MAC and IP data of flow and
* base_flow need to be set properly, since there is not recirculation
* any more when sending packet to tunnel. */

odp_put_tnl_push_action(ctx->odp_actions, &tnl_push_data);
propagate_tunnel_data_to_flow(ctx, dmac, smac, s_ip6, s_ip,
tnl_params.is_ipv6, tnl_push_data.tnl_type);


/* Try to see if its possible to apply nested clone actions on tunnel.
* Revert the combined actions on tunnel if its not valid.
*/
if (!validate_and_combine_post_tnl_actions(ctx, xport, out_dev,
tnl_push_data)) {
/* Datapath is not doing the recirculation now, so lets make it
* happen explicitly.
*/
size_t clone_ofs = nl_msg_start_nested(ctx->odp_actions,
OVS_ACTION_ATTR_CLONE);
odp_put_tnl_push_action(ctx->odp_actions, &tnl_push_data);
nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_RECIRC, 0);
nl_msg_end_nested(ctx->odp_actions, clone_ofs);
}
/* Restore the flows after the translation. */
memcpy(&ctx->xin->flow, &old_flow, sizeof ctx->xin->flow);
memcpy(&ctx->base_flow, &old_base_flow, sizeof ctx->base_flow);
return 0;
}

Expand Down
Loading

0 comments on commit 7c12dfc

Please sign in to comment.