From 18721c4a4d49416a0c63ca985988b000fc48f217 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jrajahalme@nicira.com>
Date: Fri, 12 Jun 2015 16:12:56 -0700
Subject: [PATCH 01/19] classifier: Simplify versioning.

After all, there are some cases in which both the insertion version
and removal version of a rule need to be considered.  This makes the
cls_match a bit bigger, but makes classifier versioning much simpler
to understand.

Also, avoid using type larger than int in an enum, as it is not
portable C.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
---
 lib/classifier-private.h     | 49 +++++++++--------------
 lib/classifier.c             | 76 +++++++++++++-----------------------
 lib/classifier.h             | 45 +++++++++++----------
 ofproto/ofproto-dpif-xlate.c |  4 +-
 ofproto/ofproto-dpif.c       | 17 ++++----
 ofproto/ofproto-dpif.h       |  5 +--
 ofproto/ofproto-provider.h   |  7 ++--
 ofproto/ofproto.c            | 15 ++++---
 tests/test-classifier.c      | 12 +++---
 9 files changed, 97 insertions(+), 133 deletions(-)

diff --git a/lib/classifier-private.h b/lib/classifier-private.h
index a5403713774..77745027ab8 100644
--- a/lib/classifier-private.h
+++ b/lib/classifier-private.h
@@ -87,19 +87,13 @@ struct cls_match {
     /* Accessed by all readers. */
     struct cmap_node cmap_node; /* Within struct cls_subtable 'rules'. */
 
-    /* Controls rule's visibility to lookups.
+    /* Rule versioning.
      *
-     * When 'visibility' is:
-     *
-     * > 0  - rule is visible starting from version 'visibility'
-     * <= 0 - rule is invisible starting from version '-(visibility)'
-     *
-     * The minimum version number used in lookups is 1 (== CLS_NO_VERSION),
-     * which implies that when 'visibility' is:
-     *
-     * 1    - rule is visible in all lookup versions
-     * 0    - rule is invisible in all lookup versions. */
-    atomic_llong visibility;
+     * CLS_NOT_REMOVED_VERSION has a special meaning for 'remove_version',
+     * meaningthat the rule has been added but not yet removed.
+     */
+    const cls_version_t add_version;        /* Version rule was added in. */
+    ATOMIC(cls_version_t) remove_version;   /* Version rule is removed in. */
 
     const struct cls_rule *cls_rule;
     const struct miniflow flow; /* Matching rule. Mask is in the subtable. */
@@ -110,39 +104,34 @@ struct cls_match {
 void cls_match_free_cb(struct cls_match *);
 
 static inline void
-cls_match_set_visibility(struct cls_match *rule, long long version)
+cls_match_set_remove_version(struct cls_match *rule, cls_version_t version)
 {
-    atomic_store_relaxed(&rule->visibility, version);
+    atomic_store_relaxed(&rule->remove_version, version);
 }
 
 static inline bool
-cls_match_visible_in_version(const struct cls_match *rule, long long version)
+cls_match_visible_in_version(const struct cls_match *rule,
+                             cls_version_t version)
 {
-    long long visibility;
+    cls_version_t remove_version;
 
     /* C11 does not want to access an atomic via a const object pointer. */
-    atomic_read_relaxed(&CONST_CAST(struct cls_match *, rule)->visibility,
-                        &visibility);
-
-    if (OVS_LIKELY(visibility > 0)) {
-        /* Rule is visible starting from version 'visibility'. */
-        return version >= visibility;
-    } else {
-        /* Rule is invisible starting from version '-visibility'. */
-        return version < -visibility;
-    }
+    atomic_read_relaxed(&CONST_CAST(struct cls_match *, rule)->remove_version,
+                        &remove_version);
+
+    return rule->add_version <= version && version < remove_version;
 }
 
 static inline bool
 cls_match_is_eventually_invisible(const struct cls_match *rule)
 {
-    long long visibility;
+    cls_version_t remove_version;
 
     /* C11 does not want to access an atomic via a const object pointer. */
-    atomic_read_relaxed(&CONST_CAST(struct cls_match *, rule)->visibility,
-                        &visibility);
+    atomic_read_relaxed(&CONST_CAST(struct cls_match *, rule)->remove_version,
+                        &remove_version);
 
-    return visibility <= 0;
+    return remove_version <= CLS_MAX_VERSION;
 }
 
 
diff --git a/lib/classifier.c b/lib/classifier.c
index 66a2655cb50..db6a1691656 100644
--- a/lib/classifier.c
+++ b/lib/classifier.c
@@ -99,7 +99,9 @@ cls_match_alloc(const struct cls_rule *rule,
     ovsrcu_init(&cls_match->next, NULL);
     *CONST_CAST(const struct cls_rule **, &cls_match->cls_rule) = rule;
     *CONST_CAST(int *, &cls_match->priority) = rule->priority;
-    atomic_init(&cls_match->visibility, 0);   /* Initially invisible. */
+    *CONST_CAST(cls_version_t *, &cls_match->add_version) = rule->version;
+    atomic_init(&cls_match->remove_version, rule->version);  /* Initially
+                                                                invisible. */
     miniflow_clone_inline(CONST_CAST(struct miniflow *, &cls_match->flow),
                           &rule->match.flow, count);
     ovsrcu_set_hidden(&cls_match->conj_set,
@@ -115,7 +117,7 @@ static struct cls_subtable *insert_subtable(struct classifier *cls,
 static void destroy_subtable(struct classifier *cls, struct cls_subtable *);
 
 static const struct cls_match *find_match_wc(const struct cls_subtable *,
-                                             long long version,
+                                             cls_version_t version,
                                              const struct flow *,
                                              struct trie_ctx *,
                                              unsigned int n_tries,
@@ -128,15 +130,11 @@ static struct cls_match *find_equal(const struct cls_subtable *,
  * versioning is used at most one of them is ever visible for lookups on any
  * given 'version'. */
 static inline const struct cls_match *
-next_visible_rule_in_list(const struct cls_match *rule, long long version)
+next_visible_rule_in_list(const struct cls_match *rule, cls_version_t version)
 {
     do {
         rule = cls_match_next(rule);
-        if (!rule) {
-            /* We have reached the head of the list, stop. */
-            break;
-        }
-    } while (!cls_match_visible_in_version(rule, version));
+    } while (rule && !cls_match_visible_in_version(rule, version));
 
     return rule;
 }
@@ -166,13 +164,11 @@ static bool mask_prefix_bits_set(const struct flow_wildcards *,
 
 static inline void
 cls_rule_init__(struct cls_rule *rule, unsigned int priority,
-                long long version)
+                cls_version_t version)
 {
-    ovs_assert(version > 0);
-
     rculist_init(&rule->node);
     *CONST_CAST(int *, &rule->priority) = priority;
-    *CONST_CAST(long long *, &rule->version) = version;
+    *CONST_CAST(cls_version_t *, &rule->version) = version;
     rule->cls_match = NULL;
 }
 
@@ -186,7 +182,7 @@ cls_rule_init__(struct cls_rule *rule, unsigned int priority,
  * 0 and UINT16_MAX, inclusive.) */
 void
 cls_rule_init(struct cls_rule *rule, const struct match *match, int priority,
-              long long version)
+              cls_version_t version)
 {
     cls_rule_init__(rule, priority, version);
     minimatch_init(CONST_CAST(struct minimatch *, &rule->match), match);
@@ -196,7 +192,7 @@ cls_rule_init(struct cls_rule *rule, const struct match *match, int priority,
 void
 cls_rule_init_from_minimatch(struct cls_rule *rule,
                              const struct minimatch *match, int priority,
-                             long long version)
+                             cls_version_t version)
 {
     cls_rule_init__(rule, priority, version);
     minimatch_clone(CONST_CAST(struct minimatch *, &rule->match), match);
@@ -207,7 +203,7 @@ cls_rule_init_from_minimatch(struct cls_rule *rule,
  * The caller must eventually destroy 'dst' with cls_rule_destroy(). */
 void
 cls_rule_clone_in_version(struct cls_rule *dst, const struct cls_rule *src,
-                          long long version)
+                          cls_version_t version)
 {
     cls_rule_init__(dst, src->priority, version);
     minimatch_clone(CONST_CAST(struct minimatch *, &dst->match), &src->match);
@@ -308,28 +304,12 @@ cls_rule_is_catchall(const struct cls_rule *rule)
  *
  * 'rule_' must be in a classifier. */
 void
-cls_rule_make_invisible_in_version(const struct cls_rule *rule_,
-                                   long long version, long long lookup_version)
-{
-    struct cls_match *rule = rule_->cls_match;
-
-    /* XXX: Adjust when versioning is actually used. */
-    ovs_assert(version >= rule_->version && version >= lookup_version);
-
-    /* Normally, we call this when deleting a rule that is already visible to
-     * lookups.  However, sometimes a bundle transaction will add a rule and
-     * then delete it before the rule has ever become visible.  If we set such
-     * a rule to become invisible in a future 'version', it would become
-     * visible to all prior versions.  So, in this case we must set the rule
-     * visibility to 0 (== never visible). */
-    if (cls_match_visible_in_version(rule, lookup_version)) {
-        /* Make invisible starting at 'version'. */
-        atomic_store_relaxed(&rule->visibility, -version);
-    } else {
-        /* Rule has not yet been visible to lookups, make invisible in all
-         * version. */
-        atomic_store_relaxed(&rule->visibility, 0);
-    }
+cls_rule_make_invisible_in_version(const struct cls_rule *rule,
+                                   cls_version_t remove_version)
+{
+    ovs_assert(remove_version >= rule->cls_match->add_version);
+
+    cls_match_set_remove_version(rule->cls_match, remove_version);
 }
 
 /* This undoes the change made by cls_rule_make_invisible_after_version().
@@ -338,14 +318,14 @@ cls_rule_make_invisible_in_version(const struct cls_rule *rule_,
 void
 cls_rule_restore_visibility(const struct cls_rule *rule)
 {
-    atomic_store_relaxed(&rule->cls_match->visibility, rule->version);
+    cls_match_set_remove_version(rule->cls_match, CLS_NOT_REMOVED_VERSION);
 }
 
 /* Return true if 'rule' is visible in 'version'.
  *
  * 'rule' must be in a classifier. */
 bool
-cls_rule_visible_in_version(const struct cls_rule *rule, long long version)
+cls_rule_visible_in_version(const struct cls_rule *rule, cls_version_t version)
 {
     return cls_match_visible_in_version(rule->cls_match, version);
 }
@@ -620,8 +600,6 @@ classifier_replace(struct classifier *cls, const struct cls_rule *rule,
     uint32_t hash;
     int i;
 
-    ovs_assert(rule->version > 0);
-
     /* 'new' is initially invisible to lookups. */
     new = cls_match_alloc(rule, conjs, n_conjs);
 
@@ -736,7 +714,7 @@ classifier_replace(struct classifier *cls, const struct cls_rule *rule,
                 /* No change in subtable's max priority or max count. */
 
                 /* Make 'new' visible to lookups in the appropriate version. */
-                cls_match_set_visibility(new, rule->version);
+                cls_match_set_remove_version(new, CLS_NOT_REMOVED_VERSION);
 
                 /* Make rule visible to iterators (immediately). */
                 rculist_replace(CONST_CAST(struct rculist *, &rule->node),
@@ -753,7 +731,7 @@ classifier_replace(struct classifier *cls, const struct cls_rule *rule,
     }
 
     /* Make 'new' visible to lookups in the appropriate version. */
-    cls_match_set_visibility(new, rule->version);
+    cls_match_set_remove_version(new, CLS_NOT_REMOVED_VERSION);
 
     /* Make rule visible to iterators (immediately). */
     rculist_push_back(&subtable->rules_list,
@@ -1047,7 +1025,7 @@ free_conjunctive_matches(struct hmap *matches,
  * 'flow' is non-const to allow for temporary modifications during the lookup.
  * Any changes are restored before returning. */
 static const struct cls_rule *
-classifier_lookup__(const struct classifier *cls, long long version,
+classifier_lookup__(const struct classifier *cls, cls_version_t version,
                     struct flow *flow, struct flow_wildcards *wc,
                     bool allow_conjunctive_matches)
 {
@@ -1307,7 +1285,7 @@ classifier_lookup__(const struct classifier *cls, long long version,
  * 'flow' is non-const to allow for temporary modifications during the lookup.
  * Any changes are restored before returning. */
 const struct cls_rule *
-classifier_lookup(const struct classifier *cls, long long version,
+classifier_lookup(const struct classifier *cls, cls_version_t version,
                   struct flow *flow, struct flow_wildcards *wc)
 {
     return classifier_lookup__(cls, version, flow, wc, true);
@@ -1354,7 +1332,7 @@ classifier_find_rule_exactly(const struct classifier *cls,
 const struct cls_rule *
 classifier_find_match_exactly(const struct classifier *cls,
                               const struct match *target, int priority,
-                              long long version)
+                              cls_version_t version)
 {
     const struct cls_rule *retval;
     struct cls_rule cr;
@@ -1480,7 +1458,7 @@ search_subtable(const struct cls_subtable *subtable,
  * first matching cls_rule via '*pnode', or NULL if there are no matches.
  *
  *     - If 'target' is null, or if the 'target' is a catchall target and the
- *       target's version is CLS_NO_VERSION, the cursor will visit every rule
+ *       target's version is CLS_MAX_VERSION, the cursor will visit every rule
  *       in 'cls' that is not invisible in any version.
  *
  *     - If 'target' is nonnull, the cursor will visit each 'rule' in 'cls'
@@ -1749,7 +1727,7 @@ miniflow_and_mask_matches_flow(const struct miniflow *flow,
 }
 
 static inline const struct cls_match *
-find_match(const struct cls_subtable *subtable, long long version,
+find_match(const struct cls_subtable *subtable, cls_version_t version,
            const struct flow *flow, uint32_t hash)
 {
     const struct cls_match *head, *rule;
@@ -1818,7 +1796,7 @@ fill_range_wc(const struct cls_subtable *subtable, struct flow_wildcards *wc,
 }
 
 static const struct cls_match *
-find_match_wc(const struct cls_subtable *subtable, long long version,
+find_match_wc(const struct cls_subtable *subtable, cls_version_t version,
               const struct flow *flow, struct trie_ctx trie_ctx[CLS_MAX_TRIES],
               unsigned int n_tries, struct flow_wildcards *wc)
 {
diff --git a/lib/classifier.h b/lib/classifier.h
index ef5744631cd..8bbc7366b52 100644
--- a/lib/classifier.h
+++ b/lib/classifier.h
@@ -222,21 +222,16 @@
  * invisible to lookups.  This means that lookups won't find the rule, but the
  * rule is immediately available to classifier iterations.
  *
- * Similarly, a rule can be marked as to be deleted in a future version, or
- * more precisely, to be visible upto a given version number.  To delete a rule
- * in a way to not remove the rule before all ongoing lookups are finished, the
- * rule should be marked as "to be deleted" by setting the rule's visibility to
- * the negation of the last version number in which it should be visible.
+ * Similarly, a rule can be marked as to be deleted in a future version.  To
+ * delete a rule in a way to not remove the rule before all ongoing lookups are
+ * finished, the rule should be made invisible in a specific version number.
  * Then, when all the lookups use a later version number, the rule can be
- * actually deleted from the classifier.  A rule that is marked for deletion
- * after a future version will not appear in iterations, although it will still
- * be found by lookups using a lookup version number up to that future version
- * number.
+ * actually removed from the classifier.
  *
  * Classifiers can hold duplicate rules (rules with the same match criteria and
- * priority) when at most one of the duplicates with the same priority is
- * visible in any given lookup version.  The caller responsible for classifier
- * modifications must maintain this invariant.
+ * priority) when at most one of these duplicates is visible in any given
+ * lookup version.  The caller responsible for classifier modifications must
+ * maintain this invariant.
  *
  * The classifier supports versioning for two reasons:
  *
@@ -308,6 +303,7 @@
 #include "meta-flow.h"
 #include "pvector.h"
 #include "rculist.h"
+#include "type-props.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -326,9 +322,13 @@ struct cls_trie {
     rcu_trie_ptr root;            /* NULL if none. */
 };
 
+typedef uint64_t cls_version_t;
+
+#define CLS_MIN_VERSION 0                  /* Default version number to use. */
+#define CLS_MAX_VERSION (TYPE_MAXIMUM(cls_version_t) - 1)
+#define CLS_NOT_REMOVED_VERSION TYPE_MAXIMUM(cls_version_t)
+
 enum {
-    CLS_MIN_VERSION = 1,   /* Default version number to use. */
-    CLS_MAX_VERSION = LLONG_MAX, /* Last possible version number. */
     CLS_MAX_INDICES = 3,   /* Maximum number of lookup indices per subtable. */
     CLS_MAX_TRIES = 3      /* Maximum number of prefix trees per classifier. */
 };
@@ -357,18 +357,18 @@ struct cls_conjunction {
 struct cls_rule {
     struct rculist node;          /* In struct cls_subtable 'rules_list'. */
     const int priority;           /* Larger numbers are higher priorities. */
-    const long long version;      /* Version in which the rule was added. */
+    const cls_version_t version;  /* Version in which the rule was added. */
     struct cls_match *cls_match;  /* NULL if not in a classifier. */
     const struct minimatch match; /* Matching rule. */
 };
 
 void cls_rule_init(struct cls_rule *, const struct match *, int priority,
-                   long long version);
+                   cls_version_t);
 void cls_rule_init_from_minimatch(struct cls_rule *, const struct minimatch *,
-                                  int priority, long long version);
+                                  int priority, cls_version_t);
 void cls_rule_clone(struct cls_rule *, const struct cls_rule *);
 void cls_rule_clone_in_version(struct cls_rule *, const struct cls_rule *,
-        long long version);
+        cls_version_t);
 void cls_rule_move(struct cls_rule *dst, struct cls_rule *src);
 void cls_rule_destroy(struct cls_rule *);
 
@@ -381,10 +381,9 @@ void cls_rule_format(const struct cls_rule *, struct ds *);
 bool cls_rule_is_catchall(const struct cls_rule *);
 bool cls_rule_is_loose_match(const struct cls_rule *rule,
                              const struct minimatch *criteria);
-bool cls_rule_visible_in_version(const struct cls_rule *, long long version);
+bool cls_rule_visible_in_version(const struct cls_rule *, cls_version_t);
 void cls_rule_make_invisible_in_version(const struct cls_rule *,
-                                        long long version,
-                                        long long lookup_version);
+                                        cls_version_t);
 void cls_rule_restore_visibility(const struct cls_rule *);
 
 /* Constructor/destructor.  Must run single-threaded. */
@@ -409,7 +408,7 @@ static inline void classifier_publish(struct classifier *);
 /* Lookups.  These are RCU protected and may run concurrently with modifiers
  * and each other. */
 const struct cls_rule *classifier_lookup(const struct classifier *,
-                                         long long version, struct flow *,
+                                         cls_version_t, struct flow *,
                                          struct flow_wildcards *);
 bool classifier_rule_overlaps(const struct classifier *,
                               const struct cls_rule *);
@@ -418,7 +417,7 @@ const struct cls_rule *classifier_find_rule_exactly(const struct classifier *,
 const struct cls_rule *classifier_find_match_exactly(const struct classifier *,
                                                      const struct match *,
                                                      int priority,
-                                                     long long version);
+                                                     cls_version_t);
 bool classifier_is_empty(const struct classifier *);
 int classifier_count(const struct classifier *);
 
diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index 337d6f8c2b3..f5dc2726be8 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -160,7 +160,7 @@ struct xlate_ctx {
     const struct xbridge *xbridge;
 
     /* Flow tables version at the beginning of the translation. */
-    long long tables_version;
+    cls_version_t tables_version;
 
     /* Flow at the last commit. */
     struct flow base_flow;
@@ -2777,7 +2777,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port,
         const struct xport *peer = xport->peer;
         struct flow old_flow = ctx->xin->flow;
         bool old_was_mpls = ctx->was_mpls;
-        long long old_version = ctx->tables_version;
+        cls_version_t old_version = ctx->tables_version;
         enum slow_path_reason special;
         struct ofpbuf old_stack = ctx->stack;
         union mf_subvalue new_stack[1024 / sizeof(union mf_subvalue)];
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index 55fea0f0563..369e0b9755b 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -296,7 +296,7 @@ struct ofproto_dpif {
     struct ofproto up;
     struct dpif_backer *backer;
 
-    atomic_llong tables_version;  /* Version # to use in classifier lookups. */
+    ATOMIC(cls_version_t) tables_version;  /* For classifier lookups. */
 
     uint64_t dump_seq; /* Last read of udpif_dump_seq(). */
 
@@ -1618,7 +1618,7 @@ query_tables(struct ofproto *ofproto,
 }
 
 static void
-set_tables_version(struct ofproto *ofproto_, long long version)
+set_tables_version(struct ofproto *ofproto_, cls_version_t version)
 {
     struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_);
 
@@ -3734,10 +3734,10 @@ rule_set_recirc_id(struct rule *rule_, uint32_t id)
     ovs_mutex_unlock(&rule->up.mutex);
 }
 
-long long
+cls_version_t
 ofproto_dpif_get_tables_version(struct ofproto_dpif *ofproto OVS_UNUSED)
 {
-    long long version;
+    cls_version_t version;
 
     atomic_read_relaxed(&ofproto->tables_version, &version);
 
@@ -3751,7 +3751,7 @@ ofproto_dpif_get_tables_version(struct ofproto_dpif *ofproto OVS_UNUSED)
  * 'flow' is non-const to allow for temporary modifications during the lookup.
  * Any changes are restored before returning. */
 static struct rule_dpif *
-rule_dpif_lookup_in_table(struct ofproto_dpif *ofproto, long long version,
+rule_dpif_lookup_in_table(struct ofproto_dpif *ofproto, cls_version_t version,
                           uint8_t table_id, struct flow *flow,
                           struct flow_wildcards *wc, bool take_ref)
 {
@@ -3797,9 +3797,10 @@ rule_dpif_lookup_in_table(struct ofproto_dpif *ofproto, long long version,
  * 'flow' is non-const to allow for temporary modifications during the lookup.
  * Any changes are restored before returning. */
 struct rule_dpif *
-rule_dpif_lookup_from_table(struct ofproto_dpif *ofproto, long long version,
-                            struct flow *flow, struct flow_wildcards *wc,
-                            bool take_ref, const struct dpif_flow_stats *stats,
+rule_dpif_lookup_from_table(struct ofproto_dpif *ofproto,
+                            cls_version_t version, struct flow *flow,
+                            struct flow_wildcards *wc, bool take_ref,
+                            const struct dpif_flow_stats *stats,
                             uint8_t *table_id, ofp_port_t in_port,
                             bool may_packet_in, bool honor_table_miss)
 {
diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h
index aaaf6719101..bb6df5e7339 100644
--- a/ofproto/ofproto-dpif.h
+++ b/ofproto/ofproto-dpif.h
@@ -102,11 +102,10 @@ size_t ofproto_dpif_get_max_mpls_depth(const struct ofproto_dpif *);
 bool ofproto_dpif_get_enable_recirc(const struct ofproto_dpif *);
 bool ofproto_dpif_get_enable_ufid(struct dpif_backer *backer);
 
-long long ofproto_dpif_get_tables_version(struct ofproto_dpif *);
+cls_version_t ofproto_dpif_get_tables_version(struct ofproto_dpif *);
 
 struct rule_dpif *rule_dpif_lookup_from_table(struct ofproto_dpif *,
-                                              long long version,
-                                              struct flow *,
+                                              cls_version_t, struct flow *,
                                               struct flow_wildcards *,
                                               bool take_ref,
                                               const struct dpif_flow_stats *,
diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h
index fd66e494b66..f248f595a28 100644
--- a/ofproto/ofproto-provider.h
+++ b/ofproto/ofproto-provider.h
@@ -93,8 +93,8 @@ struct ofproto {
     long long int eviction_group_timer; /* For rate limited reheapification. */
     struct oftable *tables;
     int n_tables;
-    long long tables_version;   /* Controls which rules are visible to
-                                 * table lookups. */
+    cls_version_t tables_version;   /* Controls which rules are visible to
+                                     * table lookups. */
 
     /* Rules indexed on their cookie values, in all flow tables. */
     struct hindex cookies OVS_GUARDED_BY(ofproto_mutex);
@@ -843,8 +843,7 @@ struct ofproto_class {
 
     /* Sets the current tables version the provider should use for classifier
      * lookups. */
-    void (*set_tables_version)(struct ofproto *ofproto,
-                               long long version);
+    void (*set_tables_version)(struct ofproto *ofproto, cls_version_t version);
 /* ## ---------------- ## */
 /* ## ofport Functions ## */
 /* ## ---------------- ## */
diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c
index cb8a941af14..2e14e8ca9e4 100644
--- a/ofproto/ofproto.c
+++ b/ofproto/ofproto.c
@@ -150,7 +150,7 @@ struct rule_criteria {
 
 static void rule_criteria_init(struct rule_criteria *, uint8_t table_id,
                                const struct match *match, int priority,
-                               long long version,
+                               cls_version_t version,
                                ovs_be64 cookie, ovs_be64 cookie_mask,
                                ofp_port_t out_port, uint32_t out_group);
 static void rule_criteria_require_rw(struct rule_criteria *,
@@ -3725,9 +3725,10 @@ next_matching_table(const struct ofproto *ofproto,
  * supplied as 0. */
 static void
 rule_criteria_init(struct rule_criteria *criteria, uint8_t table_id,
-                   const struct match *match, int priority, long long version,
-                   ovs_be64 cookie, ovs_be64 cookie_mask,
-                   ofp_port_t out_port, uint32_t out_group)
+                   const struct match *match, int priority,
+                   cls_version_t version, ovs_be64 cookie,
+                   ovs_be64 cookie_mask, ofp_port_t out_port,
+                   uint32_t out_group)
 {
     criteria->table_id = table_id;
     cls_rule_init(&criteria->cr, match, priority, version);
@@ -4664,8 +4665,7 @@ replace_rule_start(struct ofproto *ofproto,
     if (old_rule) {
         /* Mark the old rule for removal in the next version. */
         cls_rule_make_invisible_in_version(&old_rule->cr,
-                                           ofproto->tables_version + 1,
-                                           ofproto->tables_version);
+                                           ofproto->tables_version + 1);
     } else {
         table->n_flows++;
     }
@@ -4943,8 +4943,7 @@ delete_flows_start__(struct ofproto *ofproto,
 
         table->n_flows--;
         cls_rule_make_invisible_in_version(&rule->cr,
-                                           ofproto->tables_version + 1,
-                                           ofproto->tables_version);
+                                           ofproto->tables_version + 1);
     }
 }
 
diff --git a/tests/test-classifier.c b/tests/test-classifier.c
index 9e837c369d7..2fe9a5d555f 100644
--- a/tests/test-classifier.c
+++ b/tests/test-classifier.c
@@ -975,7 +975,8 @@ test_many_rules_in_one_list (struct ovs_cmdl_context *ctx OVS_UNUSED)
                     tcls_rules[j] = tcls_insert(&tcls, rules[j]);
                     if (versioned) {
                         /* Insert the new rule in the next version. */
-                        *CONST_CAST(long long *, &rules[j]->cls_rule.version)
+                        *CONST_CAST(cls_version_t *,
+                                    &rules[j]->cls_rule.version)
                             = ++version;
 
                         displaced_rule = test_rule_from_cls_rule(
@@ -985,8 +986,7 @@ test_many_rules_in_one_list (struct ovs_cmdl_context *ctx OVS_UNUSED)
                             /* Mark the old rule for removal after the current
                              * version. */
                             cls_rule_make_invisible_in_version(
-                                &displaced_rule->cls_rule, version,
-                                version - 1);
+                                &displaced_rule->cls_rule, version);
                             n_invisible_rules++;
                             removable_rule = &displaced_rule->cls_rule;
                         }
@@ -1011,7 +1011,7 @@ test_many_rules_in_one_list (struct ovs_cmdl_context *ctx OVS_UNUSED)
                         /* Mark the rule for removal after the current
                          * version. */
                         cls_rule_make_invisible_in_version(
-                            &rules[j]->cls_rule, version + 1, version);
+                            &rules[j]->cls_rule, version + 1);
                         ++version;
                         n_invisible_rules++;
                         removable_rule = &rules[j]->cls_rule;
@@ -1131,7 +1131,7 @@ test_many_rules_in_one_table(struct ovs_cmdl_context *ctx OVS_UNUSED)
             if (versioned) {
                 /* Mark the rule for removal after the current version. */
                 cls_rule_make_invisible_in_version(&rules[i]->cls_rule,
-                                                   version + 1, version);
+                                                   version + 1);
                 ++version;
                 n_invisible_rules++;
             } else {
@@ -1219,7 +1219,7 @@ test_many_rules_in_n_tables(int n_tables)
                 if (versioned) {
                     /* Mark the rule for removal after the current version. */
                     cls_rule_make_invisible_in_version(&rule->cls_rule,
-                                                       version + 1, version);
+                                                       version + 1);
                     n_removable_rules++;
                     compare_classifiers(&cls, n_invisible_rules, version,
                                         &tcls);

From f695ebfae56c981332b9030b9ed905f90de8b0c5 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jrajahalme@nicira.com>
Date: Fri, 12 Jun 2015 16:12:56 -0700
Subject: [PATCH 02/19] ofproto: Postpone sending flow removed messages.

The final flow stats are available only after there are no references
to the rule.  Postpone sending the flow removed message until the
final stats are available.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
---
 include/openflow/openflow-common.h |  2 ++
 lib/ofp-print.c                    |  1 +
 ofproto/ofproto-provider.h         |  5 ++++
 ofproto/ofproto.c                  | 37 ++++++++++++++++++------------
 4 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/include/openflow/openflow-common.h b/include/openflow/openflow-common.h
index e4fecceef7d..d32213fb971 100644
--- a/include/openflow/openflow-common.h
+++ b/include/openflow/openflow-common.h
@@ -301,6 +301,8 @@ enum ofp_flow_removed_reason {
     OFPRR_GROUP_DELETE,         /* Group was removed. */
     OFPRR_METER_DELETE,         /* Meter was removed. */
     OFPRR_EVICTION,             /* Switch eviction to free resources. */
+
+    OVS_OFPRR_NONE              /* OVS internal_use only, keep last!. */
 };
 
 /* What changed about the physical port */
diff --git a/lib/ofp-print.c b/lib/ofp-print.c
index 96e65a7e7a6..2ac11b1ea39 100644
--- a/lib/ofp-print.c
+++ b/lib/ofp-print.c
@@ -865,6 +865,7 @@ ofp_flow_removed_reason_to_string(enum ofp_flow_removed_reason reason,
         return "eviction";
     case OFPRR_METER_DELETE:
         return "meter_delete";
+    case OVS_OFPRR_NONE:
     default:
         snprintf(reasonbuf, bufsize, "%d", (int) reason);
         return reasonbuf;
diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h
index f248f595a28..0d25f68f1cd 100644
--- a/ofproto/ofproto-provider.h
+++ b/ofproto/ofproto-provider.h
@@ -356,6 +356,11 @@ struct rule {
     /* Eviction precedence. */
     uint16_t importance OVS_GUARDED;
 
+    /* Removal reason for sending flow removed message.
+     * Used only if 'flags' has OFPUTIL_FF_SEND_FLOW_REM set and if the
+     * value is not OVS_OFPRR_NONE. */
+    uint8_t removed_reason;
+
     /* Eviction groups (see comment on struct eviction_group for explanation) .
      *
      * 'eviction_group' is this rule's eviction group, or NULL if it is not in
diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c
index 2e14e8ca9e4..cf5fa34f76c 100644
--- a/ofproto/ofproto.c
+++ b/ofproto/ofproto.c
@@ -232,7 +232,8 @@ struct ofport_usage {
 };
 
 /* rule. */
-static void ofproto_rule_send_removed(struct rule *, uint8_t reason);
+static void ofproto_rule_send_removed(struct rule *)
+        OVS_EXCLUDED(ofproto_mutex);
 static bool rule_is_readonly(const struct rule *);
 static void ofproto_rule_insert__(struct ofproto *, struct rule *)
     OVS_REQUIRES(ofproto_mutex);
@@ -2758,7 +2759,14 @@ ofproto_rule_destroy__(struct rule *rule)
 
 static void
 rule_destroy_cb(struct rule *rule)
+    OVS_NO_THREAD_SAFETY_ANALYSIS
 {
+    /* Send rule removed if needed. */
+    if (rule->flags & OFPUTIL_FF_SEND_FLOW_REM
+        && rule->removed_reason != OVS_OFPRR_NONE
+        && !rule_is_hidden(rule)) {
+        ofproto_rule_send_removed(rule);
+    }
     rule->ofproto->ofproto_class->rule_destruct(rule);
     ofproto_rule_destroy__(rule);
 }
@@ -4607,6 +4615,7 @@ replace_rule_create(struct ofproto *ofproto, struct ofputil_flow_mod *fm,
     rule->idle_timeout = fm->idle_timeout;
     rule->hard_timeout = fm->hard_timeout;
     rule->importance = fm->importance;
+    rule->removed_reason = OVS_OFPRR_NONE;
 
     *CONST_CAST(uint8_t *, &rule->table_id) = table_id;
     rule->flags = fm->flags & OFPUTIL_FF_STATE;
@@ -4753,9 +4762,7 @@ replace_rule_finish(struct ofproto *ofproto, struct ofputil_flow_mod *fm,
         } else {
             /* XXX: This is slight duplication with delete_flows_finish__() */
 
-            /* XXX: This call should done when rule's refcount reaches
-             * zero to get accurate stats in the flow removed message. */
-            ofproto_rule_send_removed(old_rule, OFPRR_EVICTION);
+            old_rule->removed_reason = OFPRR_EVICTION;
 
             ofmonitor_report(ofproto->connmgr, old_rule, NXFME_DELETED,
                              OFPRR_EVICTION,
@@ -4960,7 +4967,10 @@ delete_flows_finish__(struct ofproto *ofproto,
         for (size_t i = 0; i < rules->n; i++) {
             struct rule *rule = rules->rules[i];
 
-            ofproto_rule_send_removed(rule, reason);
+            /* This value will be used to send the flow removed message right
+             * before the rule is actually destroyed. */
+            rule->removed_reason = reason;
+
             ofmonitor_report(ofproto->connmgr, rule, NXFME_DELETED, reason,
                              req ? req->ofconn : NULL,
                              req ? req->request->xid : 0, NULL);
@@ -5071,23 +5081,20 @@ delete_flow_start_strict(struct ofproto *ofproto,
     return error;
 }
 
-/* XXX: This should be sent right when the rule refcount gets to zero! */
+/* This may only be called by rule_destroy_cb()! */
 static void
-ofproto_rule_send_removed(struct rule *rule, uint8_t reason)
-    OVS_REQUIRES(ofproto_mutex)
+ofproto_rule_send_removed(struct rule *rule)
+    OVS_EXCLUDED(ofproto_mutex)
 {
     struct ofputil_flow_removed fr;
     long long int used;
 
-    if (rule_is_hidden(rule) ||
-        !(rule->flags & OFPUTIL_FF_SEND_FLOW_REM)) {
-        return;
-    }
-
     minimatch_expand(&rule->cr.match, &fr.match);
     fr.priority = rule->cr.priority;
+
+    ovs_mutex_lock(&ofproto_mutex);
     fr.cookie = rule->flow_cookie;
-    fr.reason = reason;
+    fr.reason = rule->removed_reason;
     fr.table_id = rule->table_id;
     calc_duration(rule->created, time_msec(),
                   &fr.duration_sec, &fr.duration_nsec);
@@ -5097,8 +5104,8 @@ ofproto_rule_send_removed(struct rule *rule, uint8_t reason)
     ovs_mutex_unlock(&rule->mutex);
     rule->ofproto->ofproto_class->rule_get_stats(rule, &fr.packet_count,
                                                  &fr.byte_count, &used);
-
     connmgr_send_flow_removed(rule->ofproto->connmgr, &fr);
+    ovs_mutex_unlock(&ofproto_mutex);
 }
 
 /* Sends an OpenFlow "flow removed" message with the given 'reason' (either

From 1c38055de17b4ef00e12e0573fd433989309dc96 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jrajahalme@nicira.com>
Date: Fri, 12 Jun 2015 16:12:56 -0700
Subject: [PATCH 03/19] ofproto: Support port mods in bundles.

Add support for port mods in an OpenFlow 1.4 bundle, as required for
the minimum support level by the OpenFlow 1.4 specification.  If the
bundle includes port mods, it may not specify the OFPBF_ATOMIC flag.
Port mods and flow mods in a bundle are always applied in order and
the consecutive flow mods between port mods are made available to
lookups atomically.

Note that ovs-ofctl does not support creating bundles with port mods.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Ben Pfaff <blp@nicira.com>
---
 DESIGN.md                  |  53 ++++++++++++++++++-
 NEWS                       |  11 ++--
 OPENFLOW-1.1+.md           |   6 +++
 ofproto/bundles.h          |   6 ++-
 ofproto/ofproto-provider.h |   4 +-
 ofproto/ofproto.c          | 104 +++++++++++++++++++++++++++----------
 6 files changed, 147 insertions(+), 37 deletions(-)

diff --git a/DESIGN.md b/DESIGN.md
index 4d94c2d092b..e533b7c29d5 100644
--- a/DESIGN.md
+++ b/DESIGN.md
@@ -280,11 +280,60 @@ OpenFlow 1.4
 ------------
 
 OpenFlow 1.4 adds the "importance" field to flow_mods, but it does not
-explicitly specify which kinds of flow_mods set the importance.For
+explicitly specify which kinds of flow_mods set the importance.  For
 consistency, Open vSwitch uses the same rule for importance as for
 idle_timeout and hard_timeout, that is, only an "ADD" flow_mod sets
 the importance.  (This issue has been filed with the ONF as EXT-496.)
 
+
+OpenFlow 1.4 Bundles
+====================
+
+Open vSwitch makes all flow table modifications atomically, i.e., any
+datapath packet only sees flow table configurations either before or
+after any change made by any flow_mod.  For example, if a controller
+removes all flows with a single OpenFlow "flow_mod", no packet sees an
+intermediate version of the OpenFlow pipeline where only some of the
+flows have been deleted.
+
+It should be noted that Open vSwitch caches datapath flows, and that
+the cached flows are NOT flushed immediately when a flow table
+changes.  Instead, the datapath flows are revalidated against the new
+flow table as soon as possible, and usually within one second of the
+modification.  This design amortizes the cost of datapath cache
+flushing across multiple flow table changes, and has a significant
+performance effect during simultaneous heavy flow table churn and high
+traffic load.  This means that different cached datapath flows may
+have been computed based on a different flow table configurations, but
+each of the datapath flows is guaranteed to have been computed over a
+coherent view of the flow tables, as described above.
+
+With OpenFlow 1.4 bundles this atomicity can be extended across an
+arbitrary set of flow_mods.  Bundles are supported for flow_mod and
+port_mod messages only.  For flow_mods, both 'atomic' and 'ordered'
+bundle flags are trivially supported, as all bundled messages are
+executed in the order they were added and all flow table modifications
+are now atomic to the datapath.  Port mods may not appear in atomic
+bundles, as port status modifications are not atomic.
+
+To support bundles, ovs-ofctl has a '--bundle' option that makes the
+flow mod commands ('add-flow', 'add-flows', 'mod-flows', 'del-flows',
+and 'replace-flows') use an OpenFlow 1.4 bundle to operate the
+modifications as a single atomic transaction.  If any of the flow mods
+in a transaction fail, none of them are executed.  All flow mods in a
+bundle appear to datapath lookups simultaneously.
+
+Furthermore, ovs-ofctl 'add-flow' and 'add-flows' commands now accept
+arbitrary flow mods as an input by allowing the flow specification to
+start with an explicit 'add', 'modify', 'modify_strict', 'delete', or
+'delete_strict' keyword.  A missing keyword is treated as 'add', so
+this is fully backwards compatible.  With the new '--bundle' option
+all the flow mods are executed as a single atomic transaction using an
+OpenFlow 1.4 bundle.  Without the '--bundle' option the flow mods are
+executed in order up to the first failing flow_mod, and in case of an
+error the earlier successful flow_mods are not rolled back.
+
+
 OFPT_PACKET_IN
 ==============
 
@@ -844,7 +893,7 @@ not know the MAC address of the local port that is sending the traffic
 or the MAC address of the remote in the guest VM.
 
 With a few notable exceptions below, in-band should work in most
-network setups.  The following are considered "supported' in the
+network setups.  The following are considered "supported" in the
 current implementation:
 
  - Locally Connected.  The switch and remote are on the same
diff --git a/NEWS b/NEWS
index a3eeed52b34..90d9a293462 100644
--- a/NEWS
+++ b/NEWS
@@ -32,11 +32,12 @@ Post-v2.3.0
      commands are now redundant and will be removed in a future
      release.  See ovs-vswitchd(8) for details.
    - OpenFlow:
-     * OpenFlow 1.4 bundles are now supported, but for flow mod
-       messages only.  Both 'atomic' and 'ordered' bundle flags are
-       trivially supported, as all bundled messages are executed in
-       the order they were added and all flow table modifications are
-       now atomic to the datapath.
+     * OpenFlow 1.4 bundles are now supported for flow mods and port
+       mods.  For flow mods, both 'atomic' and 'ordered' bundle flags
+       are trivially supported, as all bundled messages are executed
+       in the order they were added and all flow table modifications
+       are now atomic to the datapath.  Port mods may not appear in
+       atomic bundles, as port status modifications are not atomic.
      * IPv6 flow label and neighbor discovery fields are now modifiable.
      * OpenFlow 1.5 extended registers are now supported.
      * The OpenFlow 1.5 actset_output field is now supported.
diff --git a/OPENFLOW-1.1+.md b/OPENFLOW-1.1+.md
index 79114069722..0b2f0f91a90 100644
--- a/OPENFLOW-1.1+.md
+++ b/OPENFLOW-1.1+.md
@@ -148,6 +148,12 @@ parallel in OVS.
     Transactional modification.  OpenFlow 1.4 requires to support
     flow_mods and port_mods in a bundle if bundle is supported.
     (Not related to OVS's 'ofbundle' stuff.)
+    Implemented as an OpenFlow 1.4 feature.  Only flow_mods and
+    port_mods are supported in a bundle.  If the bundle includes port
+    mods, it may not specify the OFPBF_ATOMIC flag.  Nevertheless,
+    port mods and flow mods in a bundle are always applied in order
+    and consecutive flow mods between port mods are made available to
+    lookups atomically.
     [EXT-230]
     [optional for OF1.4+]
 
diff --git a/ofproto/bundles.h b/ofproto/bundles.h
index 0c7daf28841..65717df2285 100644
--- a/ofproto/bundles.h
+++ b/ofproto/bundles.h
@@ -34,14 +34,17 @@ extern "C" {
 struct ofp_bundle_entry {
     struct ovs_list   node;
     enum ofptype      type;  /* OFPTYPE_FLOW_MOD or OFPTYPE_PORT_MOD. */
+    long long         version;          /* Version in which the changes take
+                                         * effect. */
     union {
         struct ofputil_flow_mod fm;   /* 'fm.ofpacts' must be malloced. */
         struct ofputil_port_mod pm;
     };
 
     /* Used during commit. */
+    struct ofport *port;                /* Affected port. */
     struct rule_collection old_rules;   /* Affected rules. */
-    struct rule_collection new_rules;   /* Affected rules. */
+    struct rule_collection new_rules;   /* Replacement rules. */
 
     /* OpenFlow header and some of the message contents for error reporting. */
     struct ofp_header ofp_msg[DIV_ROUND_UP(64, sizeof(struct ofp_header))];
@@ -80,6 +83,7 @@ ofp_bundle_entry_alloc(enum ofptype type, const struct ofp_header *oh)
     struct ofp_bundle_entry *entry = xmalloc(sizeof *entry);
 
     entry->type = type;
+    entry->version = 0;
 
     /* Max 64 bytes for error reporting. */
     memcpy(entry->ofp_msg, oh, MIN(ntohs(oh->length), sizeof entry->ofp_msg));
diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h
index 0d25f68f1cd..527823aacb2 100644
--- a/ofproto/ofproto-provider.h
+++ b/ofproto/ofproto-provider.h
@@ -93,8 +93,8 @@ struct ofproto {
     long long int eviction_group_timer; /* For rate limited reheapification. */
     struct oftable *tables;
     int n_tables;
-    cls_version_t tables_version;   /* Controls which rules are visible to
-                                     * table lookups. */
+    cls_version_t tables_version;  /* Controls which rules are visible to
+                                    * table lookups. */
 
     /* Rules indexed on their cookie values, in all flow tables. */
     struct hindex cookies OVS_GUARDED_BY(ofproto_mutex);
diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c
index cf5fa34f76c..08ba043079b 100644
--- a/ofproto/ofproto.c
+++ b/ofproto/ofproto.c
@@ -3433,9 +3433,34 @@ update_port_config(struct ofconn *ofconn, struct ofport *port,
 }
 
 static enum ofperr
-handle_port_mod(struct ofconn *ofconn, const struct ofp_header *oh)
+port_mod_start(struct ofconn *ofconn, struct ofputil_port_mod *pm,
+               struct ofport **port)
 {
     struct ofproto *p = ofconn_get_ofproto(ofconn);
+
+    *port = ofproto_get_port(p, pm->port_no);
+    if (!*port) {
+        return OFPERR_OFPPMFC_BAD_PORT;
+    }
+    if (!eth_addr_equals((*port)->pp.hw_addr, pm->hw_addr)) {
+        return OFPERR_OFPPMFC_BAD_HW_ADDR;
+    }
+    return 0;
+}
+
+static void
+port_mod_finish(struct ofconn *ofconn, struct ofputil_port_mod *pm,
+                struct ofport *port)
+{
+    update_port_config(ofconn, port, pm->config, pm->mask);
+    if (pm->advertise) {
+        netdev_set_advertisements(port->netdev, pm->advertise);
+    }
+}
+
+static enum ofperr
+handle_port_mod(struct ofconn *ofconn, const struct ofp_header *oh)
+{
     struct ofputil_port_mod pm;
     struct ofport *port;
     enum ofperr error;
@@ -3450,18 +3475,11 @@ handle_port_mod(struct ofconn *ofconn, const struct ofp_header *oh)
         return error;
     }
 
-    port = ofproto_get_port(p, pm.port_no);
-    if (!port) {
-        return OFPERR_OFPPMFC_BAD_PORT;
-    } else if (!eth_addr_equals(port->pp.hw_addr, pm.hw_addr)) {
-        return OFPERR_OFPPMFC_BAD_HW_ADDR;
-    } else {
-        update_port_config(ofconn, port, pm.config, pm.mask);
-        if (pm.advertise) {
-            netdev_set_advertisements(port->netdev, pm.advertise);
-        }
+    error = port_mod_start(ofconn, &pm, &port);
+    if (!error) {
+        port_mod_finish(ofconn, &pm, port);
     }
-    return 0;
+    return error;
 }
 
 static enum ofperr
@@ -6668,17 +6686,16 @@ do_bundle_flow_mod_finish(struct ofproto *ofproto, struct ofputil_flow_mod *fm,
  * possible.  No visible changes were made, so rollback is minimal (remove
  * added invisible rules, restore visibility of rules marked for removal).
  *
- * 3. Bump the version visible to lookups.
- *
- * 4. Finish: Insert replacement rules to the ofproto provider. Remove replaced
- * and deleted rules from ofproto data structures, and Schedule postponed
- * removal of deleted rules from the classifier.  Send notifications, buffered
- * packets, etc.
+ * 3. Finish: Make the changes visible for lookups. Insert replacement rules to
+ * the ofproto provider. Remove replaced and deleted rules from ofproto data
+ * structures, and Schedule postponed removal of deleted rules from the
+ * classifier.  Send notifications, buffered packets, etc.
  */
 static enum ofperr
 do_bundle_commit(struct ofconn *ofconn, uint32_t id, uint16_t flags)
 {
     struct ofproto *ofproto = ofconn_get_ofproto(ofconn);
+    cls_version_t visible_version = ofproto->tables_version;
     struct ofp_bundle *bundle;
     struct ofp_bundle_entry *be;
     enum ofperr error;
@@ -6691,23 +6708,42 @@ do_bundle_commit(struct ofconn *ofconn, uint32_t id, uint16_t flags)
     if (bundle->flags != flags) {
         error = OFPERR_OFPBFC_BAD_FLAGS;
     } else {
+        bool prev_is_port_mod = false;
+
         error = 0;
         ovs_mutex_lock(&ofproto_mutex);
 
         /* 1. Begin. */
         LIST_FOR_EACH (be, node, &bundle->msg_list) {
             if (be->type == OFPTYPE_PORT_MOD) {
-                /* Not supported yet. */
-                error = OFPERR_OFPBFC_MSG_FAILED;
+                /* Our port mods are not atomic. */
+                if (flags & OFPBF_ATOMIC) {
+                    error = OFPERR_OFPBFC_MSG_FAILED;
+                } else {
+                    prev_is_port_mod = true;
+                    error = port_mod_start(ofconn, &be->pm, &be->port);
+                }
             } else if (be->type == OFPTYPE_FLOW_MOD) {
+                /* Flow mods between port mods are applied as a single
+                 * version, but the versions are published only after
+                 * we know the commit is successful. */
+                if (prev_is_port_mod) {
+                    ++ofproto->tables_version;
+                }
+                prev_is_port_mod = false;
                 error = do_bundle_flow_mod_start(ofproto, &be->fm, be);
             } else {
                 OVS_NOT_REACHED();
             }
             if (error) {
                 break;
+            } else {
+                /* Store the version in which the changes should take
+                 * effect. */
+                be->version = ofproto->tables_version + 1;
             }
         }
+
         if (error) {
             /* Send error referring to the original message. */
             if (error) {
@@ -6720,24 +6756,38 @@ do_bundle_commit(struct ofconn *ofconn, uint32_t id, uint16_t flags)
                 if (be->type == OFPTYPE_FLOW_MOD) {
                     do_bundle_flow_mod_revert(ofproto, &be->fm, be);
                 }
+                /* Nothing needs to be reverted for a port mod. */
             }
         } else {
-            /* 3. Bump the version.  This makes all the changes in the bundle
-             * visible to the lookups at once.  For this to work an upcall must
-             * read the tables_version once at the beginning and keep using the
-             * same version number for the whole duration of the upcall
-             * processing. */
-            ofproto_bump_tables_version(ofproto);
-
             /* 4. Finish. */
             LIST_FOR_EACH (be, node, &bundle->msg_list) {
+                /* Bump the lookup version to the one of the current message.
+                 * This makes all the changes in the bundle at this version
+                 * visible to lookups at once. */
+                if (visible_version < be->version) {
+                    visible_version = be->version;
+                    ofproto->ofproto_class->set_tables_version(
+                        ofproto, visible_version);
+                }
                 if (be->type == OFPTYPE_FLOW_MOD) {
                     struct flow_mod_requester req = { ofconn, be->ofp_msg };
 
                     do_bundle_flow_mod_finish(ofproto, &be->fm, &req, be);
+                } else if (be->type == OFPTYPE_PORT_MOD) {
+                    /* Perform the actual port mod. This is not atomic, i.e.,
+                     * the effects will be immediately seen by upcall
+                     * processing regardless of the lookup version.  It should
+                     * be noted that port configuration changes can originate
+                     * also from OVSDB changes asynchronously to all upcall
+                     * processing. */
+                    port_mod_finish(ofconn, &be->pm, be->port);
                 }
             }
         }
+
+        /* Reset the tables_version. */
+        ofproto->tables_version = visible_version;
+
         ofmonitor_flush(ofproto->connmgr);
         ovs_mutex_unlock(&ofproto_mutex);
 

From 7d1ced01772de541d6692c7d5604210e274bcd37 Mon Sep 17 00:00:00 2001
From: Ciara Loftus <ciara.loftus@intel.com>
Date: Thu, 4 Jun 2015 06:51:40 -0700
Subject: [PATCH 04/19] netdev-dpdk: add dpdk vhost-user ports

This patch adds support for a new port type to the userspace
datapath called dpdkvhostuser.

A new dpdkvhostuser port will create a unix domain socket which
when provided to QEMU is used to facilitate communication between
the virtio-net device on the VM and the OVS port on the host.

vhost-cuse ('dpdkvhost') ports are still available as 'dpdkvhostcuse'
ports and will be enabled if vhost-cuse support is detected in the
DPDK build specified during compilation of the switch. Otherwise,
vhost-user ports are enabled.

Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
Acked-by: Flavio Leitner <fbl@redhat.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 INSTALL.DPDK.md         | 194 +++++++++++++++++++++++++++++++++-------
 acinclude.m4            |   3 +
 lib/netdev-dpdk.c       | 161 ++++++++++++++++++++++++++-------
 lib/netdev.c            |   3 +-
 vswitchd/ovs-vswitchd.c |  19 +++-
 5 files changed, 310 insertions(+), 70 deletions(-)

diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md
index 462ba0e4c62..cdef6cfcb6f 100644
--- a/INSTALL.DPDK.md
+++ b/INSTALL.DPDK.md
@@ -16,7 +16,9 @@ OVS needs a system with 1GB hugepages support.
 Building and Installing:
 ------------------------
 
-Required DPDK 2.0, `fuse`, `fuse-devel` (`libfuse-dev` on Debian/Ubuntu)
+Required: DPDK 2.0
+Optional (if building with vhost-cuse): `fuse`, `fuse-devel` (`libfuse-dev`
+on Debian/Ubuntu)
 
 1. Configure build & install DPDK:
   1. Set `$DPDK_DIR`
@@ -32,12 +34,9 @@ Required DPDK 2.0, `fuse`, `fuse-devel` (`libfuse-dev` on Debian/Ubuntu)
      `CONFIG_RTE_BUILD_COMBINE_LIBS=y`
 
      Update `config/common_linuxapp` so that DPDK is built with vhost
-     libraries; currently, OVS only supports vhost-cuse, so DPDK vhost-user
-     libraries should be explicitly turned off (they are enabled by default
-     in DPDK 2.0).
+     libraries.
 
      `CONFIG_RTE_LIBRTE_VHOST=y`
-     `CONFIG_RTE_LIBRTE_VHOST_USER=n`
 
      Then run `make install` to build and install the library.
      For default install without IVSHMEM:
@@ -316,40 +315,164 @@ the vswitchd.
 DPDK vhost:
 -----------
 
-vhost-cuse is only supported at present i.e. not using the standard QEMU
-vhost-user interface. It is intended that vhost-user support will be added
-in future releases when supported in DPDK and that vhost-cuse will eventually
-be deprecated. See [DPDK Docs] for more info on vhost.
+DPDK 2.0 supports two types of vhost:
 
-Prerequisites:
-1.  Insert the Cuse module:
+1. vhost-user
+2. vhost-cuse
 
-      `modprobe cuse`
+Whatever type of vhost is enabled in the DPDK build specified, is the type
+that will be enabled in OVS. By default, vhost-user is enabled in DPDK.
+Therefore, unless vhost-cuse has been enabled in DPDK, vhost-user ports
+will be enabled in OVS.
+Please note that support for vhost-cuse is intended to be deprecated in OVS
+in a future release.
 
-2.  Build and insert the `eventfd_link` module:
+DPDK vhost-user:
+----------------
 
-     `cd $DPDK_DIR/lib/librte_vhost/eventfd_link/`
-     `make`
-     `insmod $DPDK_DIR/lib/librte_vhost/eventfd_link.ko`
+The following sections describe the use of vhost-user 'dpdkvhostuser' ports
+with OVS.
 
-Following the steps above to create a bridge, you can now add DPDK vhost
-as a port to the vswitch.
+DPDK vhost-user Prerequisites:
+-------------------------
 
-`ovs-vsctl add-port br0 dpdkvhost0 -- set Interface dpdkvhost0 type=dpdkvhost`
+1. DPDK 2.0 with vhost support enabled as documented in the "Building and
+   Installing section"
 
-Unlike DPDK ring ports, DPDK vhost ports can have arbitrary names:
+2. QEMU version v2.1.0+
 
-`ovs-vsctl add-port br0 port123ABC -- set Interface port123ABC type=dpdkvhost`
+   QEMU v2.1.0 will suffice, but it is recommended to use v2.2.0 if providing
+   your VM with memory greater than 1GB due to potential issues with memory
+   mapping larger areas.
 
-However, please note that when attaching userspace devices to QEMU, the
-name provided during the add-port operation must match the ifname parameter
-on the QEMU command line.
+Adding DPDK vhost-user ports to the Switch:
+--------------------------------------
 
+Following the steps above to create a bridge, you can now add DPDK vhost-user
+as a port to the vswitch. Unlike DPDK ring ports, DPDK vhost-user ports can
+have arbitrary names.
 
-DPDK vhost VM configuration:
-----------------------------
+  -  For vhost-user, the name of the port type is `dpdkvhostuser`
 
-   vhost ports use a Linux* character device to communicate with QEMU.
+     ```
+     ovs-ofctl add-port br0 vhost-user-1 -- set Interface vhost-user-1
+     type=dpdkvhostuser
+     ```
+
+     This action creates a socket located at
+     `/usr/local/var/run/openvswitch/vhost-user-1`, which you must provide
+     to your VM on the QEMU command line. More instructions on this can be
+     found in the next section "DPDK vhost-user VM configuration"
+     Note: If you wish for the vhost-user sockets to be created in a
+     directory other than `/usr/local/var/run/openvswitch`, you may specify
+     another location on the ovs-vswitchd command line like so:
+
+      `./vswitchd/ovs-vswitchd --dpdk -vhost_sock_dir /my-dir -c 0x1 ...`
+
+DPDK vhost-user VM configuration:
+---------------------------------
+Follow the steps below to attach vhost-user port(s) to a VM.
+
+1. Configure sockets.
+   Pass the following parameters to QEMU to attach a vhost-user device:
+
+   ```
+   -chardev socket,id=char1,path=/usr/local/var/run/openvswitch/vhost-user-1
+   -netdev type=vhost-user,id=mynet1,chardev=char1,vhostforce
+   -device virtio-net-pci,mac=00:00:00:00:00:01,netdev=mynet1
+   ```
+
+   ...where vhost-user-1 is the name of the vhost-user port added
+   to the switch.
+   Repeat the above parameters for multiple devices, changing the
+   chardev path and id as necessary. Note that a separate and different
+   chardev path needs to be specified for each vhost-user device. For
+   example you have a second vhost-user port named 'vhost-user-2', you
+   append your QEMU command line with an additional set of parameters:
+
+   ```
+   -chardev socket,id=char2,path=/usr/local/var/run/openvswitch/vhost-user-2
+   -netdev type=vhost-user,id=mynet2,chardev=char2,vhostforce
+   -device virtio-net-pci,mac=00:00:00:00:00:02,netdev=mynet2
+   ```
+
+2. Configure huge pages.
+   QEMU must allocate the VM's memory on hugetlbfs. vhost-user ports access
+   a virtio-net device's virtual rings and packet buffers mapping the VM's
+   physical memory on hugetlbfs. To enable vhost-user ports to map the VM's
+   memory into their process address space, pass the following paramters
+   to QEMU:
+
+   ```
+   -object memory-backend-file,id=mem,size=4096M,mem-path=/dev/hugepages,
+   share=on
+   -numa node,memdev=mem -mem-prealloc
+   ```
+
+DPDK vhost-cuse:
+----------------
+
+The following sections describe the use of vhost-cuse 'dpdkvhostcuse' ports
+with OVS.
+
+DPDK vhost-cuse Prerequisites:
+-------------------------
+
+1. DPDK 2.0 with vhost support enabled as documented in the "Building and
+   Installing section"
+   As an additional step, you must enable vhost-cuse in DPDK by setting the
+   following additional flag in `config/common_linuxapp`:
+
+   `CONFIG_RTE_LIBRTE_VHOST_USER=n`
+
+   Following this, rebuild DPDK as per the instructions in the "Building and
+   Installing" section. Finally, rebuild OVS as per step 3 in the "Building
+   and Installing" section - OVS will detect that DPDK has vhost-cuse libraries
+   compiled and in turn will enable support for it in the switch and disable
+   vhost-user support.
+
+2. Insert the Cuse module:
+
+     `modprobe cuse`
+
+3. Build and insert the `eventfd_link` module:
+
+     ```
+     cd $DPDK_DIR/lib/librte_vhost/eventfd_link/
+     make
+     insmod $DPDK_DIR/lib/librte_vhost/eventfd_link.ko
+     ```
+
+4. QEMU version v2.1.0+
+
+   vhost-cuse will work with QEMU v2.1.0 and above, however it is recommended to
+   use v2.2.0 if providing your VM with memory greater than 1GB due to potential
+   issues with memory mapping larger areas.
+   Note: QEMU v1.6.2 will also work, with slightly different command line parameters,
+   which are specified later in this document.
+
+Adding DPDK vhost-cuse ports to the Switch:
+--------------------------------------
+
+Following the steps above to create a bridge, you can now add DPDK vhost-cuse
+as a port to the vswitch. Unlike DPDK ring ports, DPDK vhost-cuse ports can have
+arbitrary names.
+
+  -  For vhost-cuse, the name of the port type is `dpdkvhostcuse`
+
+     ```
+     ovs-ofctl add-port br0 vhost-cuse-1 -- set Interface vhost-cuse-1
+     type=dpdkvhostcuse
+     ```
+
+     When attaching vhost-cuse ports to QEMU, the name provided during the
+     add-port operation must match the ifname parameter on the QEMU command
+     line. More instructions on this can be found in the next section.
+
+DPDK vhost-cuse VM configuration:
+---------------------------------
+
+   vhost-cuse ports use a Linux* character device to communicate with QEMU.
    By default it is set to `/dev/vhost-net`. It is possible to reuse this
    standard device for DPDK vhost, which makes setup a little simpler but it
    is better practice to specify an alternative character device in order to
@@ -415,16 +538,19 @@ DPDK vhost VM configuration:
    QEMU must allocate the VM's memory on hugetlbfs. Vhost ports access a
    virtio-net device's virtual rings and packet buffers mapping the VM's
    physical memory on hugetlbfs. To enable vhost-ports to map the VM's
-   memory into their process address space, pass the following paramters
+   memory into their process address space, pass the following parameters
    to QEMU:
 
      `-object memory-backend-file,id=mem,size=4096M,mem-path=/dev/hugepages,
       share=on -numa node,memdev=mem -mem-prealloc`
 
+   Note: For use with an earlier QEMU version such as v1.6.2, use the
+   following to configure hugepages instead:
 
-DPDK vhost VM configuration with QEMU wrapper:
-----------------------------------------------
+     `-mem-path /dev/hugepages -mem-prealloc`
 
+DPDK vhost-cuse VM configuration with QEMU wrapper:
+---------------------------------------------------
 The QEMU wrapper script automatically detects and calls QEMU with the
 necessary parameters. It performs the following actions:
 
@@ -450,8 +576,8 @@ qemu-wrap.py -cpu host -boot c -hda <disk image> -m 4096 -smp 4
   netdev=net1,mac=00:00:00:00:00:01
 ```
 
-DPDK vhost VM configuration with libvirt:
------------------------------------------
+DPDK vhost-cuse VM configuration with libvirt:
+----------------------------------------------
 
 If you are using libvirt, you must enable libvirt to access the character
 device by adding it to controllers cgroup for libvirtd using the following
@@ -525,7 +651,7 @@ Now you may launch your VM using virt-manager, or like so:
 
     `virsh create my_vhost_vm.xml`
 
-DPDK vhost VM configuration with libvirt and QEMU wrapper:
+DPDK vhost-cuse VM configuration with libvirt and QEMU wrapper:
 ----------------------------------------------------------
 
 To use the qemu-wrapper script in conjuntion with libvirt, follow the
@@ -553,7 +679,7 @@ steps in the previous section before proceeding with the following steps:
   the correct emulator location and set any additional options. If you are
   using a alternative character device name, please set "us_vhost_path" to the
   location of that device. The script will automatically detect and insert
-  the correct "vhostfd" value in the QEMU command line arguements.
+  the correct "vhostfd" value in the QEMU command line arguments.
 
   5. Use virt-manager to launch the VM
 
diff --git a/acinclude.m4 b/acinclude.m4
index d09a73fc104..20391eca6c9 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -220,6 +220,9 @@ AC_DEFUN([OVS_CHECK_DPDK], [
     DPDK_vswitchd_LDFLAGS=-Wl,--whole-archive,$DPDK_LIB,--no-whole-archive
     AC_SUBST([DPDK_vswitchd_LDFLAGS])
     AC_DEFINE([DPDK_NETDEV], [1], [System uses the DPDK module.])
+
+    OVS_GREP_IFELSE([$RTE_SDK/include/rte_config.h], [define RTE_LIBRTE_VHOST_USER 1],
+                    [], [AC_DEFINE([VHOST_CUSE], [1], [DPDK vhost-cuse support enabled, vhost-user disabled.])])
   else
     RTE_SDK=
   fi
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 63243d8167e..3af1ee788a4 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -16,7 +16,6 @@
 
 #include <config.h>
 
-#include <stdio.h>
 #include <string.h>
 #include <signal.h>
 #include <stdlib.h>
@@ -26,8 +25,12 @@
 #include <sched.h>
 #include <stdlib.h>
 #include <unistd.h>
+#include <sys/stat.h>
 #include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
 
+#include "dirs.h"
 #include "dp-packet.h"
 #include "dpif-netdev.h"
 #include "list.h"
@@ -90,8 +93,8 @@ BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF/MIN_NB_MBUF))
 #define NIC_PORT_RX_Q_SIZE 2048  /* Size of Physical NIC RX Queue, Max (n+32<=4096)*/
 #define NIC_PORT_TX_Q_SIZE 2048  /* Size of Physical NIC TX Queue, Max (n+32<=4096)*/
 
-/* Character device cuse_dev_name. */
-static char *cuse_dev_name = NULL;
+char *cuse_dev_name = NULL;    /* Character device cuse_dev_name. */
+char *vhost_sock_dir = NULL;   /* Location of vhost-user sockets */
 
 /*
  * Maximum amount of time in micro seconds to try and enqueue to vhost.
@@ -126,7 +129,7 @@ enum { DRAIN_TSC = 200000ULL };
 
 enum dpdk_dev_type {
     DPDK_DEV_ETH = 0,
-    DPDK_DEV_VHOST = 1
+    DPDK_DEV_VHOST = 1,
 };
 
 static int rte_eal_init_ret = ENODEV;
@@ -221,6 +224,9 @@ struct netdev_dpdk {
     /* virtio-net structure for vhost device */
     OVSRCU_TYPE(struct virtio_net *) virtio_dev;
 
+    /* Identifier used to distinguish vhost devices from each other */
+    char vhost_id[PATH_MAX];
+
     /* In dpdk_list. */
     struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
 };
@@ -594,21 +600,51 @@ dpdk_dev_parse_name(const char dev_name[], const char prefix[],
 }
 
 static int
-netdev_dpdk_vhost_construct(struct netdev *netdev_)
+vhost_construct_helper(struct netdev *netdev_)
 {
     struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
-    int err;
 
     if (rte_eal_init_ret) {
         return rte_eal_init_ret;
     }
 
+    rte_spinlock_init(&netdev->vhost_tx_lock);
+    return netdev_dpdk_init(netdev_, -1, DPDK_DEV_VHOST);
+}
+
+static int
+netdev_dpdk_vhost_cuse_construct(struct netdev *netdev_)
+{
+    struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
+    int err;
+
     ovs_mutex_lock(&dpdk_mutex);
-    err = netdev_dpdk_init(netdev_, -1, DPDK_DEV_VHOST);
+    strncpy(netdev->vhost_id, netdev->up.name, sizeof(netdev->vhost_id));
+    err = vhost_construct_helper(netdev_);
     ovs_mutex_unlock(&dpdk_mutex);
+    return err;
+}
 
-    rte_spinlock_init(&netdev->vhost_tx_lock);
+static int
+netdev_dpdk_vhost_user_construct(struct netdev *netdev_)
+{
+    struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
+    int err;
 
+    ovs_mutex_lock(&dpdk_mutex);
+    /* Take the name of the vhost-user port and append it to the location where
+     * the socket is to be created, then register the socket.
+     */
+    snprintf(netdev->vhost_id, sizeof(netdev->vhost_id), "%s/%s",
+            vhost_sock_dir, netdev_->name);
+    err = rte_vhost_driver_register(netdev->vhost_id);
+    if (err) {
+        VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
+                 netdev->vhost_id);
+    }
+    VLOG_INFO("Socket %s created for vhost-user port %s\n", netdev->vhost_id, netdev_->name);
+    err = vhost_construct_helper(netdev_);
+    ovs_mutex_unlock(&dpdk_mutex);
     return err;
 }
 
@@ -1607,7 +1643,7 @@ new_device(struct virtio_net *dev)
     ovs_mutex_lock(&dpdk_mutex);
     /* Add device to the vhost port with the same name as that passed down. */
     LIST_FOR_EACH(netdev, list_node, &dpdk_list) {
-        if (strncmp(dev->ifname, netdev->up.name, IFNAMSIZ) == 0) {
+        if (strncmp(dev->ifname, netdev->vhost_id, IF_NAME_SZ) == 0) {
             ovs_mutex_lock(&netdev->mutex);
             ovsrcu_set(&netdev->virtio_dev, dev);
             ovs_mutex_unlock(&netdev->mutex);
@@ -1687,7 +1723,7 @@ static const struct virtio_net_device_ops virtio_net_device_ops =
 };
 
 static void *
-start_cuse_session_loop(void *dummy OVS_UNUSED)
+start_vhost_loop(void *dummy OVS_UNUSED)
 {
      pthread_detach(pthread_self());
      /* Put the cuse thread into quiescent state. */
@@ -1698,10 +1734,17 @@ start_cuse_session_loop(void *dummy OVS_UNUSED)
 
 static int
 dpdk_vhost_class_init(void)
+{
+    rte_vhost_driver_callback_register(&virtio_net_device_ops);
+    ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
+    return 0;
+}
+
+static int
+dpdk_vhost_cuse_class_init(void)
 {
     int err = -1;
 
-    rte_vhost_driver_callback_register(&virtio_net_device_ops);
 
     /* Register CUSE device to handle IOCTLs.
      * Unless otherwise specified on the vswitchd command line, cuse_dev_name
@@ -1714,7 +1757,14 @@ dpdk_vhost_class_init(void)
         return -1;
     }
 
-    ovs_thread_create("cuse_thread", start_cuse_session_loop, NULL);
+    dpdk_vhost_class_init();
+    return 0;
+}
+
+static int
+dpdk_vhost_user_class_init(void)
+{
+    dpdk_vhost_class_init();
     return 0;
 }
 
@@ -1923,6 +1973,33 @@ netdev_dpdk_ring_construct(struct netdev *netdev)
     NULL,                       /* rxq_drain */               \
 }
 
+static int
+process_vhost_flags(char *flag, char *default_val, int size,
+                    char **argv, char **new_val)
+{
+    int changed = 0;
+
+    /* Depending on which version of vhost is in use, process the vhost-specific
+     * flag if it is provided on the vswitchd command line, otherwise resort to
+     * a default value.
+     *
+     * For vhost-user: Process "-cuse_dev_name" to set the custom location of
+     * the vhost-user socket(s).
+     * For vhost-cuse: Process "-vhost_sock_dir" to set the custom name of the
+     * vhost-cuse character device.
+     */
+    if (!strcmp(argv[1], flag) && (strlen(argv[2]) <= size)) {
+        changed = 1;
+        *new_val = strdup(argv[2]);
+        VLOG_INFO("User-provided %s in use: %s", flag, *new_val);
+    } else {
+        VLOG_INFO("No %s provided - defaulting to %s", flag, default_val);
+        *new_val = default_val;
+    }
+
+    return changed;
+}
+
 int
 dpdk_init(int argc, char **argv)
 {
@@ -1937,27 +2014,29 @@ dpdk_init(int argc, char **argv)
     argc--;
     argv++;
 
-    /* If the cuse_dev_name parameter has been provided, set 'cuse_dev_name' to
-     * this string if it meets the correct criteria. Otherwise, set it to the
-     * default (vhost-net).
-     */
-    if (!strcmp(argv[1], "--cuse_dev_name") &&
-        (strlen(argv[2]) <= NAME_MAX)) {
-
-        cuse_dev_name = strdup(argv[2]);
+#ifdef VHOST_CUSE
+    if (process_vhost_flags("-cuse_dev_name", strdup("vhost-net"),
+                            PATH_MAX, argv, &cuse_dev_name)) {
+#else
+    if (process_vhost_flags("-vhost_sock_dir", strdup(ovs_rundir()),
+                            NAME_MAX, argv, &vhost_sock_dir)) {
+        struct stat s;
+        int err;
 
-        /* Remove the cuse_dev_name configuration parameters from the argument
+        err = stat(vhost_sock_dir, &s);
+        if (err) {
+            VLOG_ERR("vHostUser socket DIR '%s' does not exist.",
+                     vhost_sock_dir);
+            return err;
+        }
+#endif
+        /* Remove the vhost flag configuration parameters from the argument
          * list, so that the correct elements are passed to the DPDK
          * initialization function
          */
         argc -= 2;
-        argv += 2;    /* Increment by two to bypass the cuse_dev_name arguments */
+        argv += 2;    /* Increment by two to bypass the vhost flag arguments */
         base = 2;
-
-        VLOG_ERR("User-provided cuse_dev_name in use: /dev/%s", cuse_dev_name);
-    } else {
-        cuse_dev_name = "vhost-net";
-        VLOG_INFO("No cuse_dev_name provided - defaulting to /dev/vhost-net");
     }
 
     /* Keep the program name argument as this is needed for call to
@@ -2012,11 +2091,25 @@ static const struct netdev_class dpdk_ring_class =
         netdev_dpdk_get_status,
         netdev_dpdk_rxq_recv);
 
-static const struct netdev_class dpdk_vhost_class =
+static const struct netdev_class dpdk_vhost_cuse_class =
     NETDEV_DPDK_CLASS(
-        "dpdkvhost",
-        dpdk_vhost_class_init,
-        netdev_dpdk_vhost_construct,
+        "dpdkvhostcuse",
+        dpdk_vhost_cuse_class_init,
+        netdev_dpdk_vhost_cuse_construct,
+        netdev_dpdk_vhost_destruct,
+        netdev_dpdk_vhost_set_multiq,
+        netdev_dpdk_vhost_send,
+        netdev_dpdk_vhost_get_carrier,
+        netdev_dpdk_vhost_get_stats,
+        NULL,
+        NULL,
+        netdev_dpdk_vhost_rxq_recv);
+
+const struct netdev_class dpdk_vhost_user_class =
+    NETDEV_DPDK_CLASS(
+        "dpdkvhostuser",
+        dpdk_vhost_user_class_init,
+        netdev_dpdk_vhost_user_construct,
         netdev_dpdk_vhost_destruct,
         netdev_dpdk_vhost_set_multiq,
         netdev_dpdk_vhost_send,
@@ -2039,7 +2132,11 @@ netdev_dpdk_register(void)
         dpdk_common_init();
         netdev_register_provider(&dpdk_class);
         netdev_register_provider(&dpdk_ring_class);
-        netdev_register_provider(&dpdk_vhost_class);
+#ifdef VHOST_CUSE
+        netdev_register_provider(&dpdk_vhost_cuse_class);
+#else
+        netdev_register_provider(&dpdk_vhost_user_class);
+#endif
         ovsthread_once_done(&once);
     }
 }
diff --git a/lib/netdev.c b/lib/netdev.c
index 03a75497906..186c1e2e4c4 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -111,7 +111,8 @@ netdev_is_pmd(const struct netdev *netdev)
 {
     return (!strcmp(netdev->netdev_class->type, "dpdk") ||
             !strcmp(netdev->netdev_class->type, "dpdkr") ||
-            !strcmp(netdev->netdev_class->type, "dpdkvhost"));
+            !strcmp(netdev->netdev_class->type, "dpdkvhostcuse") ||
+            !strcmp(netdev->netdev_class->type, "dpdkvhostuser"));
 }
 
 static void
diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c
index a1b33dad929..96bb1d8ca17 100644
--- a/vswitchd/ovs-vswitchd.c
+++ b/vswitchd/ovs-vswitchd.c
@@ -72,6 +72,10 @@ main(int argc, char *argv[])
 
     set_program_name(argv[0]);
     retval = dpdk_init(argc,argv);
+    if (retval < 0) {
+        return retval;
+    }
+
     argc -= retval;
     argv += retval;
 
@@ -252,9 +256,18 @@ usage(void)
     daemon_usage();
     vlog_usage();
     printf("\nDPDK options:\n"
-           "  --dpdk options            Initialize DPDK datapath.\n"
-           "  --cuse_dev_name BASENAME  override default character device name\n"
-           "                            for use with userspace vHost.\n");
+           "  --dpdk [VHOST] [DPDK]     Initialize DPDK datapath.\n"
+           "  where DPDK are options for initializing DPDK lib and VHOST is\n"
+#ifdef VHOST_CUSE
+           "  option to override default character device name used for\n"
+           "  for use with userspace vHost\n"
+           "    -cuse_dev_name NAME\n"
+#else
+           "  option to override default directory where vhost-user\n"
+           "  sockets are created.\n"
+           "    -vhost_sock_dir DIR\n"
+#endif
+           );
     printf("\nOther options:\n"
            "  --unixctl=SOCKET          override default control socket name\n"
            "  -h, --help                display this help message\n"

From 268eca112d919c6e9cd03be74f4deb5a91a945bc Mon Sep 17 00:00:00 2001
From: Ben Pfaff <blp@nicira.com>
Date: Tue, 9 Jun 2015 11:32:24 -0700
Subject: [PATCH 05/19] flow: Make assertions about offsets within struct flow
 easier to follow.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme@nicira.com>
---
 lib/flow.c | 60 ++++++++++++++++++++++++++----------------------------
 lib/util.h |  3 +++
 2 files changed, 32 insertions(+), 31 deletions(-)

diff --git a/lib/flow.c b/lib/flow.c
index d2dcc460e0e..3e99d5e6c99 100644
--- a/lib/flow.c
+++ b/lib/flow.c
@@ -50,25 +50,32 @@ const uint8_t flow_segment_u64s[4] = {
     FLOW_U64S
 };
 
+/* Asserts that field 'f1' follows immediately after 'f0' in struct flow,
+ * without any intervening padding. */
+#define ASSERT_SEQUENTIAL(f0, f1)                       \
+    BUILD_ASSERT_DECL(offsetof(struct flow, f0)         \
+                      + MEMBER_SIZEOF(struct flow, f0)  \
+                      == offsetof(struct flow, f1))
+
+/* Asserts that fields 'f0' and 'f1' are in the same 32-bit aligned word within
+ * struct flow. */
+#define ASSERT_SAME_WORD(f0, f1)                        \
+    BUILD_ASSERT_DECL(offsetof(struct flow, f0) / 4     \
+                      == offsetof(struct flow, f1) / 4)
+
+/* Asserts that 'f0' and 'f1' are both sequential and within the same 32-bit
+ * aligned word in struct flow. */
+#define ASSERT_SEQUENTIAL_SAME_WORD(f0, f1)     \
+    ASSERT_SEQUENTIAL(f0, f1);                  \
+    ASSERT_SAME_WORD(f0, f1)
+
 /* miniflow_extract() assumes the following to be true to optimize the
  * extraction process. */
-BUILD_ASSERT_DECL(offsetof(struct flow, dl_type) + 2
-                  == offsetof(struct flow, vlan_tci) &&
-                  offsetof(struct flow, dl_type) / 4
-                  == offsetof(struct flow, vlan_tci) / 4 );
-
-BUILD_ASSERT_DECL(offsetof(struct flow, nw_frag) + 3
-                  == offsetof(struct flow, nw_proto) &&
-                  offsetof(struct flow, nw_tos) + 2
-                  == offsetof(struct flow, nw_proto) &&
-                  offsetof(struct flow, nw_ttl) + 1
-                  == offsetof(struct flow, nw_proto) &&
-                  offsetof(struct flow, nw_frag) / 4
-                  == offsetof(struct flow, nw_tos) / 4 &&
-                  offsetof(struct flow, nw_ttl) / 4
-                  == offsetof(struct flow, nw_tos) / 4 &&
-                  offsetof(struct flow, nw_proto) / 4
-                  == offsetof(struct flow, nw_tos) / 4);
+ASSERT_SEQUENTIAL_SAME_WORD(dl_type, vlan_tci);
+
+ASSERT_SEQUENTIAL_SAME_WORD(nw_frag, nw_tos);
+ASSERT_SEQUENTIAL_SAME_WORD(nw_tos, nw_ttl);
+ASSERT_SEQUENTIAL_SAME_WORD(nw_ttl, nw_proto);
 
 /* TCP flags in the middle of a BE64, zeroes in the other half. */
 BUILD_ASSERT_DECL(offsetof(struct flow, tcp_flags) % 8 == 4);
@@ -80,10 +87,7 @@ BUILD_ASSERT_DECL(offsetof(struct flow, tcp_flags) % 8 == 4);
 #define TCP_FLAGS_BE32(tcp_ctl) ((OVS_FORCE ovs_be32)TCP_FLAGS_BE16(tcp_ctl))
 #endif
 
-BUILD_ASSERT_DECL(offsetof(struct flow, tp_src) + 2
-                  == offsetof(struct flow, tp_dst) &&
-                  offsetof(struct flow, tp_src) / 4
-                  == offsetof(struct flow, tp_dst) / 4);
+ASSERT_SEQUENTIAL_SAME_WORD(tp_src, tp_dst);
 
 /* Removes 'size' bytes from the head end of '*datap', of size '*sizep', which
  * must contain at least 'size' bytes of data.  Returns the first byte of data
@@ -458,8 +462,7 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst)
         ovs_be16 vlan_tci;
 
         /* Link layer. */
-        BUILD_ASSERT(offsetof(struct flow, dl_dst) + 6
-                     == offsetof(struct flow, dl_src));
+        ASSERT_SEQUENTIAL(dl_dst, dl_src);
         miniflow_push_macs(mf, dl_dst, data);
         /* dl_type, vlan_tci. */
         vlan_tci = parse_vlan(&data, &size);
@@ -645,8 +648,7 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst)
                 }
 
                 /* Must be adjacent. */
-                BUILD_ASSERT(offsetof(struct flow, arp_sha) + 6
-                             == offsetof(struct flow, arp_tha));
+                ASSERT_SEQUENTIAL(arp_sha, arp_tha);
 
                 memcpy(arp_buf[0], arp->ar_sha, ETH_ADDR_LEN);
                 memcpy(arp_buf[1], arp->ar_tha, ETH_ADDR_LEN);
@@ -1252,12 +1254,8 @@ miniflow_hash_5tuple(const struct miniflow *flow, uint32_t basis)
     return hash;
 }
 
-BUILD_ASSERT_DECL(offsetof(struct flow, tp_src) + 2
-                  == offsetof(struct flow, tp_dst) &&
-                  offsetof(struct flow, tp_src) / 4
-                  == offsetof(struct flow, tp_dst) / 4);
-BUILD_ASSERT_DECL(offsetof(struct flow, ipv6_src) + 16
-                  == offsetof(struct flow, ipv6_dst));
+ASSERT_SEQUENTIAL_SAME_WORD(tp_src, tp_dst);
+ASSERT_SEQUENTIAL(ipv6_src, ipv6_dst);
 
 /* Calculates the 5-tuple hash from the given flow. */
 uint32_t
diff --git a/lib/util.h b/lib/util.h
index 59276e35721..0d1e727ed43 100644
--- a/lib/util.h
+++ b/lib/util.h
@@ -199,6 +199,9 @@ ovs_prefetch_range(const void *start, size_t size)
     ((char *) &(OBJECT)->MEMBER - (char *) (OBJECT))
 #endif
 
+/* Yields the size of MEMBER within STRUCT. */
+#define MEMBER_SIZEOF(STRUCT, MEMBER) (sizeof(((STRUCT *) NULL)->MEMBER))
+
 /* Given POINTER, the address of the given MEMBER in a STRUCT object, returns
    the STRUCT object. */
 #define CONTAINER_OF(POINTER, STRUCT, MEMBER)                           \

From 9fc789b9062818161890d165e0bc71621209404a Mon Sep 17 00:00:00 2001
From: Sabyasachi Sengupta <Sabyasachi.Sengupta@alcatel-lucent.com>
Date: Mon, 15 Jun 2015 14:57:53 -0700
Subject: [PATCH 06/19] ovs-ctl: let openvswitch startup to NOT hold up system
 boot upon error

Abort openvswitch startup script if ovsdb startup fails for
some reason. This helps in getting the system startup to NOT hang
indefinitely, as was seen in a recent report when ovsdb failed with
"I/O error: /etc/openvswitch/conf.db: failed to lock lockfile
(Resource temporarily unavailable)" and system remained in hung state
forever, unless manually rebooted from console.

Signed-off-by: Sabyasachi Sengupta <sabyasachi.sengupta@alcatel-lucent.com>
[blp@nicira.com changed an 'if' statement to '||']
Signed-off-by: Ben Pfaff <blp@nicira.com>
---
 utilities/ovs-ctl.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utilities/ovs-ctl.in b/utilities/ovs-ctl.in
index 97716e9ff48..99d789c68b0 100755
--- a/utilities/ovs-ctl.in
+++ b/utilities/ovs-ctl.in
@@ -717,7 +717,7 @@ do
 done
 case $command in
     start)
-        start_ovsdb
+        start_ovsdb || exit 1
         start_forwarding
         add_managers
         ;;

From 72a5e2b8fc8c7c316c0b9feb96b7be5b19265c7c Mon Sep 17 00:00:00 2001
From: Daniele Di Proietto <diproiettod@vmware.com>
Date: Mon, 15 Jun 2015 19:06:39 +0100
Subject: [PATCH 07/19] dpif-netdev: Prefetch next packet before
 miniflow_extract().

It appears that miniflow_extract() in emc_processing() spends a lot of
cycles waiting for the packet's data to be read.

Prefetching the next packet's data while parsing removes this delay.
For a single flow pipeline the throughput improves by ~10%.  With a
more realistic pipeline the change has a much smaller effect (~0.5%
improvement)

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
Acked-by: Ethan Jackson <ethan@nicira.com>
---
 lib/dpif-netdev.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 5b82c8b6392..f13169c14eb 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3150,6 +3150,11 @@ emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet **packets,
             continue;
         }
 
+        if (i != cnt - 1) {
+            /* Prefetch next packet data */
+            OVS_PREFETCH(dp_packet_data(packets[i+1]));
+        }
+
         miniflow_extract(packets[i], &key.mf);
         key.len = 0; /* Not computed yet. */
         key.hash = dpif_netdev_packet_get_rss_hash(packets[i], &key.mf);

From 8e2fe95e533b2b9277843baeccc3d00cefdf653e Mon Sep 17 00:00:00 2001
From: Sorin Vinturis <svinturis@cloudbasesolutions.com>
Date: Thu, 28 May 2015 20:30:57 +0000
Subject: [PATCH 08/19] datapath-windows: BSOD when disabling the extension

When the filter detach routine is called while there are packets
still in processing, the OvsUninitSwitchContext function call will
decrement the switch context reference count without releasing the
switch context structure. This behaviour is correct and expected,
but the BSOD is caused in this case because the gOvsSwitchContext
variable is set to NULL, which is wrong.

The gOvsSwitchContext global variable must be set to NULL only when
the switch context structure is actually released.

Signed-off-by: Sorin Vinturis <svinturis@cloudbasesolutions.com>
Reported-by: Sorin Vinturis <svinturis@cloudbasesolutions.com>
Reported-at: https://github.com/openvswitch/ovs-issues/issues/80
Acked-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>
---
 datapath-windows/ovsext/Switch.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/datapath-windows/ovsext/Switch.c b/datapath-windows/ovsext/Switch.c
index f8778546c1c..99a306d266b 100644
--- a/datapath-windows/ovsext/Switch.c
+++ b/datapath-windows/ovsext/Switch.c
@@ -201,6 +201,7 @@ OvsCreateSwitch(NDIS_HANDLE ndisFilterHandle,
     status = OvsInitSwitchContext(switchContext);
     if (status != NDIS_STATUS_SUCCESS) {
         OvsFreeMemoryWithTag(switchContext, OVS_SWITCH_POOL_TAG);
+        switchContext = NULL;
         goto create_switch_done;
     }
 
@@ -240,7 +241,6 @@ OvsExtDetach(NDIS_HANDLE filterModuleContext)
     }
     OvsDeleteSwitch(switchContext);
     OvsCleanupIpHelper();
-    gOvsSwitchContext = NULL;
     /* This completes the cleanup, and a new attach can be handled now. */
 
     OVS_LOG_TRACE("Exit: OvsDetach Successfully");
@@ -495,6 +495,7 @@ OvsReleaseSwitchContext(POVS_SWITCH_CONTEXT switchContext)
 
     if (ref == 1) {
         OvsDeleteSwitchContext(switchContext);
+        gOvsSwitchContext = NULL;
     }
 }
 

From 69f8f7e1388486e098ef95dba650345a91361cd1 Mon Sep 17 00:00:00 2001
From: Daniele Di Proietto <diproiettod@vmware.com>
Date: Tue, 16 Jun 2015 16:25:24 +0100
Subject: [PATCH 09/19] ovs-vtep: Support userspace datapaths.

With this commit, the VTEP emulator detects the datapath_type of the
bridge used as a "physical" switch, and creates subsequent bridges
with the same type.  This allows ovs-vtep to work with the userspace
datapath.

Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
Acked-by: Gurucharan Shetty <gshetty@nicira.com>
---
 vtep/ovs-vtep | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/vtep/ovs-vtep b/vtep/ovs-vtep
index 60dbb9593cb..a774e028195 100755
--- a/vtep/ovs-vtep
+++ b/vtep/ovs-vtep
@@ -40,6 +40,7 @@ vlog = ovs.vlog.Vlog("ovs-vtep")
 exiting = False
 
 ps_name = ""
+ps_type = ""
 Tunnel_Ip = ""
 Lswitches = {}
 Bindings = {}
@@ -103,7 +104,12 @@ class Logical_Switch(object):
             self.tunnel_key = 0
             vlog.warn("invalid tunnel key for %s, using 0" % self.name)
 
-        ovs_vsctl("--may-exist add-br %s" % self.short_name)
+        if ps_type:
+            ovs_vsctl("--may-exist add-br %s -- set Bridge %s datapath_type=%s"
+                      % (self.short_name, self.short_name, ps_type))
+        else:
+            ovs_vsctl("--may-exist add-br %s" % self.short_name)
+
         ovs_vsctl("br-set-external-id %s vtep_logical_switch true"
                   % self.short_name)
         ovs_vsctl("br-set-external-id %s logical_switch_name %s"
@@ -595,6 +601,9 @@ def setup():
     if (ps_name not in br_list):
         ovs.util.ovs_fatal(0, "couldn't find OVS bridge %s" % ps_name, vlog)
 
+    global ps_type
+    ps_type = ovs_vsctl("get Bridge %s datapath_type" % ps_name).strip('"')
+
     call_prog("vtep-ctl", ["set", "physical_switch", ps_name,
                            'description="OVS VTEP Emulator"'])
 
@@ -636,7 +645,11 @@ def setup():
 
             ovs_vsctl("del-br %s" % br)
 
-    ovs_vsctl("add-br %s" % bfd_bridge)
+    if ps_type:
+        ovs_vsctl("add-br %s -- set Bridge %s datapath_type=%s"
+                  % (bfd_bridge, bfd_bridge, ps_type))
+    else:
+        ovs_vsctl("add-br %s" % bfd_bridge)
 
 
 def main():

From 84072381c60d112c49ecbb634898069d682e23cb Mon Sep 17 00:00:00 2001
From: "Mark D. Gray" <mark.d.gray@intel.com>
Date: Wed, 17 Jun 2015 12:49:25 +0100
Subject: [PATCH 10/19] docs: Fix alignment for diagram in native-tunneling.md.

Markdown was not formatted correctly and, as a result,
was displaying incorrectly on github.

Signed-off-by: Mark D. Gray <mark.d.gray@intel.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>
---
 README-native-tunneling.md | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/README-native-tunneling.md b/README-native-tunneling.md
index 746d089c41c..63b02099702 100644
--- a/README-native-tunneling.md
+++ b/README-native-tunneling.md
@@ -26,24 +26,23 @@ Configure OVS bridges as follows.
    ovs-vsctl add-port int-br vxlan0 -- set interface vxlan0 type=vxlan  options:remote_ip=172.168.1.2
 4. Assign IP address to int-br, So final topology looks like:
 
-
-       192.168.1.1/24
-       +--------------+
-       |    int-br    |                                   192.168.1.2/24
-       +--------------+                                  +--------------+
-       |    vxlan0    |                                  |    vxlan0    |
-       +--------------+                                  +--------------+
-             |                                                 |
-             |                                                 |
-             |                                                 |
-        172.168.1.1/24                                         |
-       +--------------+                                        |
-       |    br-eth1   |                                  172.168.1.2/24
-       +--------------+                                  +---------------+
-       |    eth1      |----------------------------------|    eth1       |
-       +--------------+                                  +----------------
-
-       Host A with OVS.                                      Remote host.
+                 192.168.1.1/24
+                +--------------+
+                |    int-br    |                                    192.168.1.2/24
+                +--------------+                                   +--------------+
+                |    vxlan0    |                                   |    vxlan0    |
+                +--------------+                                   +--------------+
+                       |                                                  |
+                       |                                                  |
+                       |                                                  |
+                 172.168.1.1/24                                           |
+                +--------------+                                          |
+                |    br-eth1   |                                   172.168.1.2/24
+                +--------------+                                  +---------------+
+                |    eth1      |----------------------------------|      eth1     |
+                +--------------+                                  +---------------+
+
+                Host A with OVS.                                      Remote host.
 
 With this setup, ping to VXLAN target device (192.168.1.2) should work
 There are following commands that shows internal tables:

From 86b68e90b3bf96f10da90e53872b17240f6c8271 Mon Sep 17 00:00:00 2001
From: Dennis Flynn <drflynn@avaya.com>
Date: Tue, 16 Jun 2015 17:33:35 -0400
Subject: [PATCH 11/19] auto-attach: Cleanup i-sid/vlan mappings associated
 with lldp-enabled port.

This commit fixes a bug where the i-sid/vlan mapping structures associated with
an lldp-enabled port were not being freed during general port cleanup.

Signed-off-by: Dennis Flynn <drflynn@avaya.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>
---
 lib/lldp/lldpd-structs.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/lib/lldp/lldpd-structs.c b/lib/lldp/lldpd-structs.c
index b78c2e19ad4..71c4f5e0b12 100644
--- a/lib/lldp/lldpd-structs.c
+++ b/lib/lldp/lldpd-structs.c
@@ -93,17 +93,41 @@ lldpd_remote_cleanup(struct lldpd_hardware *hw,
     }
 }
 
+/* Cleanup the auto-attach mappings attached to port.
+ */
+static void
+lldpd_aa_maps_cleanup(struct lldpd_port *port)
+{
+    struct lldpd_aa_isid_vlan_maps_tlv *isid_vlan_map = NULL;
+    struct lldpd_aa_isid_vlan_maps_tlv *isid_vlan_map_next = NULL;
+
+    if (!list_is_empty(&port->p_isid_vlan_maps)) {
+
+        LIST_FOR_EACH_SAFE (isid_vlan_map, isid_vlan_map_next, m_entries,
+                            &port->p_isid_vlan_maps) {
+
+            list_remove(&isid_vlan_map->m_entries);
+            free(isid_vlan_map);
+        }
+
+        list_init(&port->p_isid_vlan_maps);
+    }
+}
+
 /* If `all' is true, clear all information, including information that
    are not refreshed periodically. Port should be freed manually. */
 void
 lldpd_port_cleanup(struct lldpd_port *port, bool all)
 {
     /* We set these to NULL so we don't free wrong memory */
-
     free(port->p_id);
     port->p_id = NULL;
     free(port->p_descr);
     port->p_descr = NULL;
+
+    /* Cleanup auto-attach mappings */
+    lldpd_aa_maps_cleanup(port);
+
     if (all) {
         free(port->p_lastframe);
         /* Chassis may not have been attributed, yet.*/

From f303f87e0fb6609f450dc556dc2b001b6845552c Mon Sep 17 00:00:00 2001
From: Ben Pfaff <blp@nicira.com>
Date: Tue, 16 Jun 2015 08:47:34 -0700
Subject: [PATCH 12/19] lacp: Remove packed attribute from struct lacp_pdu.

The packed annotation doesn't do anything here because all of the members
in the structure are naturally aligned.

Signed-off-by: Ben Pfaff <blp@nicira.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Ethan Jackson <ethan@nicira.com>
---
 lib/lacp.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/lacp.c b/lib/lacp.c
index 65149fdb3cf..29b8b438715 100644
--- a/lib/lacp.c
+++ b/lib/lacp.c
@@ -62,7 +62,6 @@ struct lacp_info {
 BUILD_ASSERT_DECL(LACP_INFO_LEN == sizeof(struct lacp_info));
 
 #define LACP_PDU_LEN 110
-OVS_PACKED(
 struct lacp_pdu {
     uint8_t subtype;          /* Always 1. */
     uint8_t version;          /* Always 1. */
@@ -81,7 +80,7 @@ struct lacp_pdu {
     uint8_t collector_len;    /* Always 16. */
     ovs_be16 collector_delay; /* Maximum collector delay. Set to UINT16_MAX. */
     uint8_t z3[64];           /* Combination of several fields.  Always 0. */
-});
+};
 BUILD_ASSERT_DECL(LACP_PDU_LEN == sizeof(struct lacp_pdu));
 
 /* Implementation. */

From d29f137b65e8051f7da0eacf1901bcb7245b6b4e Mon Sep 17 00:00:00 2001
From: Thadeu Lima de Souza Cascardo <cascardo@redhat.com>
Date: Wed, 17 Jun 2015 14:12:19 -0300
Subject: [PATCH 13/19] ofproto-dpif-xlate: Make IGMP packets always take slow
 path.

IGMP packets need to take the slow path. Otherwise, packets that match
the same flow will not be processed by OVS. That might prevent OVS from
updating the expire time for entries already in the mdb, but also to
lose packets with different addresses in the payload.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@redhat.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Ben Pfaff <blp@nicira.com>
---
 ofproto/ofproto-dpif-xlate.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index f5dc2726be8..a0d13c2655c 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -2269,12 +2269,18 @@ xlate_normal(struct xlate_ctx *ctx)
         struct mcast_group *grp;
 
         if (flow->nw_proto == IPPROTO_IGMP) {
-            if (ctx->xin->may_learn) {
-                if (mcast_snooping_is_membership(flow->tp_src) ||
-                    mcast_snooping_is_query(flow->tp_src)) {
+            if (mcast_snooping_is_membership(flow->tp_src) ||
+                mcast_snooping_is_query(flow->tp_src)) {
+                if (ctx->xin->may_learn) {
                     update_mcast_snooping_table(ctx->xbridge, flow, vlan,
                                                 in_xbundle);
-                    }
+                }
+                /*
+                 * IGMP packets need to take the slow path, in order to be
+                 * processed for mdb updates. That will prevent expires
+                 * firing off even after hosts have sent reports.
+                 */
+                ctx->xout->slow |= SLOW_ACTION;
             }
 
             if (mcast_snooping_is_membership(flow->tp_src)) {

From e3102e42ef2afdeecdbdaf95f8dd119df3e1de7e Mon Sep 17 00:00:00 2001
From: Thadeu Lima de Souza Cascardo <cascardo@redhat.com>
Date: Wed, 17 Jun 2015 14:12:20 -0300
Subject: [PATCH 14/19] Add IGMPv3 support.

Support IGMPv3 messages with multiple records. Make sure all IGMPv3
messages go through slow path, since they may carry multiple multicast
addresses, unlike IGMPv2.

Tests done:

* multiple addresses in IGMPv3 report are inserted in mdb;
* address is removed from IGMPv3 if record is INCLUDE_MODE;
* reports sent on a burst with same flow all go to userspace;
* IGMPv3 reports go to mrouters, i.e., ports that have issued a query.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@redhat.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Ben Pfaff <blp@nicira.com>
---
 NEWS                         |  2 +-
 lib/mcast-snooping.c         | 52 ++++++++++++++++++++++++++++++++++++
 lib/mcast-snooping.h         |  5 ++++
 lib/packets.h                | 26 ++++++++++++++++++
 ofproto/ofproto-dpif-xlate.c | 19 ++++++++++---
 vswitchd/vswitch.xml         |  2 +-
 6 files changed, 100 insertions(+), 6 deletions(-)

diff --git a/NEWS b/NEWS
index 90d9a293462..43461b27c79 100644
--- a/NEWS
+++ b/NEWS
@@ -87,7 +87,7 @@ Post-v2.3.0
      with Docker, the wrapper script will be retired.
    - Added support for DPDK Tunneling. VXLAN, GRE, and Geneve are supported
      protocols. This is generic tunneling mechanism for userspace datapath.
-   - Support for multicast snooping (IGMPv1 and IGMPv2)
+   - Support for multicast snooping (IGMPv1, IGMPv2 and IGMPv3)
    - Support for Linux kernels up to 4.0.x
    - The documentation now use the term 'destination' to mean one of syslog,
      console or file for vlog logging instead of the previously used term
diff --git a/lib/mcast-snooping.c b/lib/mcast-snooping.c
index c3ffd6b77bc..7b927aa27cb 100644
--- a/lib/mcast-snooping.c
+++ b/lib/mcast-snooping.c
@@ -69,6 +69,7 @@ mcast_snooping_is_membership(ovs_be16 igmp_type)
     switch (ntohs(igmp_type)) {
     case IGMP_HOST_MEMBERSHIP_REPORT:
     case IGMPV2_HOST_MEMBERSHIP_REPORT:
+    case IGMPV3_HOST_MEMBERSHIP_REPORT:
     case IGMP_HOST_LEAVE_MESSAGE:
         return true;
     }
@@ -416,6 +417,57 @@ mcast_snooping_add_group(struct mcast_snooping *ms, ovs_be32 ip4,
     return learned;
 }
 
+int
+mcast_snooping_add_report(struct mcast_snooping *ms,
+                          const struct dp_packet *p,
+                          uint16_t vlan, void *port)
+{
+    ovs_be32 ip4;
+    size_t offset;
+    const struct igmpv3_header *igmpv3;
+    const struct igmpv3_record *record;
+    int count = 0;
+    int ngrp;
+
+    offset = (char *) dp_packet_l4(p) - (char *) dp_packet_data(p);
+    igmpv3 = dp_packet_at(p, offset, IGMPV3_HEADER_LEN);
+    if (!igmpv3) {
+        return 0;
+    }
+    ngrp = ntohs(igmpv3->ngrp);
+    offset += IGMPV3_HEADER_LEN;
+    while (ngrp--) {
+        bool ret;
+        record = dp_packet_at(p, offset, sizeof(struct igmpv3_record));
+        if (!record) {
+            break;
+        }
+        /* Only consider known record types. */
+        if (record->type < IGMPV3_MODE_IS_INCLUDE
+            || record->type > IGMPV3_BLOCK_OLD_SOURCES) {
+            continue;
+        }
+        ip4 = get_16aligned_be32(&record->maddr);
+        /*
+         * If record is INCLUDE MODE and there are no sources, it's equivalent
+         * to a LEAVE.
+         */
+        if (ntohs(record->nsrcs) == 0
+            && (record->type == IGMPV3_MODE_IS_INCLUDE
+                || record->type == IGMPV3_CHANGE_TO_INCLUDE_MODE)) {
+            ret = mcast_snooping_leave_group(ms, ip4, vlan, port);
+        } else {
+            ret = mcast_snooping_add_group(ms, ip4, vlan, port);
+        }
+        if (ret) {
+            count++;
+        }
+        offset += sizeof(*record)
+                  + ntohs(record->nsrcs) * sizeof(ovs_be32) + record->aux_len;
+    }
+    return count;
+}
+
 bool
 mcast_snooping_leave_group(struct mcast_snooping *ms, ovs_be32 ip4,
                            uint16_t vlan, void *port)
diff --git a/lib/mcast-snooping.h b/lib/mcast-snooping.h
index 979b2aa602b..f4bc8fb0386 100644
--- a/lib/mcast-snooping.h
+++ b/lib/mcast-snooping.h
@@ -20,6 +20,7 @@
 #define MCAST_SNOOPING_H 1
 
 #include <time.h>
+#include "dp-packet.h"
 #include "hmap.h"
 #include "list.h"
 #include "ovs-atomic.h"
@@ -181,6 +182,10 @@ mcast_snooping_lookup(const struct mcast_snooping *ms, ovs_be32 dip,
 bool mcast_snooping_add_group(struct mcast_snooping *ms, ovs_be32 ip4,
                               uint16_t vlan, void *port)
     OVS_REQ_WRLOCK(ms->rwlock);
+int mcast_snooping_add_report(struct mcast_snooping *ms,
+                              const struct dp_packet *p,
+                              uint16_t vlan, void *port)
+    OVS_REQ_WRLOCK(ms->rwlock);
 bool mcast_snooping_leave_group(struct mcast_snooping *ms, ovs_be32 ip4,
                                 uint16_t vlan, void *port)
     OVS_REQ_WRLOCK(ms->rwlock);
diff --git a/lib/packets.h b/lib/packets.h
index e22267efc38..63ad2ff04b1 100644
--- a/lib/packets.h
+++ b/lib/packets.h
@@ -540,12 +540,38 @@ struct igmp_header {
 };
 BUILD_ASSERT_DECL(IGMP_HEADER_LEN == sizeof(struct igmp_header));
 
+#define IGMPV3_HEADER_LEN 8
+struct igmpv3_header {
+    uint8_t type;
+    uint8_t rsvr1;
+    ovs_be16 csum;
+    ovs_be16 rsvr2;
+    ovs_be16 ngrp;
+};
+BUILD_ASSERT_DECL(IGMPV3_HEADER_LEN == sizeof(struct igmpv3_header));
+
+#define IGMPV3_RECORD_LEN 8
+struct igmpv3_record {
+    uint8_t type;
+    uint8_t aux_len;
+    ovs_be16 nsrcs;
+    ovs_16aligned_be32 maddr;
+};
+BUILD_ASSERT_DECL(IGMPV3_RECORD_LEN == sizeof(struct igmpv3_record));
+
 #define IGMP_HOST_MEMBERSHIP_QUERY    0x11 /* From RFC1112 */
 #define IGMP_HOST_MEMBERSHIP_REPORT   0x12 /* Ditto */
 #define IGMPV2_HOST_MEMBERSHIP_REPORT 0x16 /* V2 version of 0x12 */
 #define IGMP_HOST_LEAVE_MESSAGE       0x17
 #define IGMPV3_HOST_MEMBERSHIP_REPORT 0x22 /* V3 version of 0x12 */
 
+#define IGMPV3_MODE_IS_INCLUDE 1
+#define IGMPV3_MODE_IS_EXCLUDE 2
+#define IGMPV3_CHANGE_TO_INCLUDE_MODE 3
+#define IGMPV3_CHANGE_TO_EXCLUDE_MODE 4
+#define IGMPV3_ALLOW_NEW_SOURCES 5
+#define IGMPV3_BLOCK_OLD_SOURCES 6
+
 #define SCTP_HEADER_LEN 12
 struct sctp_header {
     ovs_be16 sctp_src;
diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index a0d13c2655c..5c1e63c5606 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -1997,10 +1997,12 @@ update_mcast_snooping_table__(const struct xbridge *xbridge,
                               const struct flow *flow,
                               struct mcast_snooping *ms,
                               ovs_be32 ip4, int vlan,
-                              struct xbundle *in_xbundle)
+                              struct xbundle *in_xbundle,
+                              const struct dp_packet *packet)
     OVS_REQ_WRLOCK(ms->rwlock)
 {
     static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 30);
+    int count;
 
     switch (ntohs(flow->tp_src)) {
     case IGMP_HOST_MEMBERSHIP_REPORT:
@@ -2027,6 +2029,14 @@ update_mcast_snooping_table__(const struct xbridge *xbridge,
                         in_xbundle->name, vlan);
         }
         break;
+    case IGMPV3_HOST_MEMBERSHIP_REPORT:
+        if ((count = mcast_snooping_add_report(ms, packet, vlan,
+                                               in_xbundle->ofbundle))) {
+            VLOG_DBG_RL(&rl, "bridge %s: multicast snooping processed %d "
+                        "addresses on port %s in VLAN %d",
+                        xbridge->name, count, in_xbundle->name, vlan);
+        }
+        break;
     }
 }
 
@@ -2035,7 +2045,8 @@ update_mcast_snooping_table__(const struct xbridge *xbridge,
 static void
 update_mcast_snooping_table(const struct xbridge *xbridge,
                             const struct flow *flow, int vlan,
-                            struct xbundle *in_xbundle)
+                            struct xbundle *in_xbundle,
+                            const struct dp_packet *packet)
 {
     struct mcast_snooping *ms = xbridge->ms;
     struct xlate_cfg *xcfg;
@@ -2060,7 +2071,7 @@ update_mcast_snooping_table(const struct xbridge *xbridge,
 
     if (!mcast_xbundle || mcast_xbundle != in_xbundle) {
         update_mcast_snooping_table__(xbridge, flow, ms, flow->igmp_group_ip4,
-                                      vlan, in_xbundle);
+                                      vlan, in_xbundle, packet);
     }
     ovs_rwlock_unlock(&ms->rwlock);
 }
@@ -2273,7 +2284,7 @@ xlate_normal(struct xlate_ctx *ctx)
                 mcast_snooping_is_query(flow->tp_src)) {
                 if (ctx->xin->may_learn) {
                     update_mcast_snooping_table(ctx->xbridge, flow, vlan,
-                                                in_xbundle);
+                                                in_xbundle, ctx->xin->packet);
                 }
                 /*
                  * IGMP packets need to take the slow path, in order to be
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 8a604744aca..c43bfd1a78f 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -940,7 +940,7 @@
       Protocol (IGMP) traffic between hosts and multicast routers.  The
       switch uses what IGMP snooping learns to forward multicast traffic
       only to interfaces that are connected to interested receivers.
-      Currently it supports IGMPv1 and IGMPv2 protocols.
+      Currently it supports IGMPv1, IGMPv2 and IGMPv3 protocols.
 
       <column name="mcast_snooping_enable">
         Enable multicast snooping on the bridge. For now, the default

From 85bed1db2d0f2c015a7a154b30bd7c45a83ffd63 Mon Sep 17 00:00:00 2001
From: Ben Pfaff <blp@nicira.com>
Date: Wed, 17 Jun 2015 11:14:31 -0700
Subject: [PATCH 15/19] AUTHORS: Add Thadeu Lima de Souza Cascardo.

Signed-off-by: Ben Pfaff <blp@nicira.com>
---
 AUTHORS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/AUTHORS b/AUTHORS
index 936742fe94c..cebab48a74d 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -166,6 +166,7 @@ SUGYO Kazushi           sugyo.org@gmail.com
 Tadaaki Nagao           nagao@stratosphere.co.jp
 Terry Wilson            twilson@redhat.com
 Tetsuo NAKAGAWA         nakagawa@mxc.nes.nec.co.jp
+Thadeu Lima de Souza Cascardo    cascardo@redhat.com
 Thomas F. Herbert       thomasfherbert@gmail.com
 Thomas Goirand          zigo@debian.org
 Thomas Graf             tgraf@noironetworks.com

From cc84898c4c1b52be8fff4431b6dd5d0fdd8721e0 Mon Sep 17 00:00:00 2001
From: Sorin Vinturis <svinturis@cloudbasesolutions.com>
Date: Thu, 18 Jun 2015 13:48:09 +0000
Subject: [PATCH 16/19] datapath-windows: Return pending for IRPs completed
 later

Return STATUS_PENDING for IRPs that are completed later in another
thread.

Signed-off-by: Sorin Vinturis <svinturis@cloudbasesolutions.com>
Reported-by: Sorin Vinturis <svinturis@cloudbasesolutions.com>
Reported-at: https://github.com/openvswitch/ovs-issues/issues/83
Acked-by: Alin Gabriel Serdean <aserdean@cloudbasesolutions.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>
---
 datapath-windows/ovsext/Datapath.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/datapath-windows/ovsext/Datapath.c b/datapath-windows/ovsext/Datapath.c
index b5832de3b38..d8024c85fe2 100644
--- a/datapath-windows/ovsext/Datapath.c
+++ b/datapath-windows/ovsext/Datapath.c
@@ -921,6 +921,7 @@ OvsDeviceControl(PDEVICE_OBJECT deviceObject,
          * to be processed later, so we mark the IRP as pending and complete
          * it in another thread when the request is processed. */
         IoMarkIrpPending(irp);
+        return status;
     }
     return OvsCompleteIrpRequest(irp, (ULONG_PTR)replyLen, status);
 }

From e100e453108bfd3d29f2ba0050f80e4e5eeeb2aa Mon Sep 17 00:00:00 2001
From: Justin Pettit <jpettit@nicira.com>
Date: Wed, 17 Jun 2015 16:09:48 -0700
Subject: [PATCH 17/19] Prepare for 2.4.0.

Signed-off-by: Justin Pettit <jpettit@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>
---
 NEWS             | 2 +-
 configure.ac     | 2 +-
 debian/changelog | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/NEWS b/NEWS
index 43461b27c79..f615a58ee94 100644
--- a/NEWS
+++ b/NEWS
@@ -1,4 +1,4 @@
-Post-v2.3.0
+v2.4.0 - xx xxx xxxx
 ---------------------
    - Flow table modifications are now atomic, meaning that each packet
      now sees a coherent version of the OpenFlow pipeline.  For
diff --git a/configure.ac b/configure.ac
index be5cf086c8c..e6a23a69210 100644
--- a/configure.ac
+++ b/configure.ac
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 AC_PREREQ(2.63)
-AC_INIT(openvswitch, 2.3.90, bugs@openvswitch.org)
+AC_INIT(openvswitch, 2.4.0, bugs@openvswitch.org)
 AC_CONFIG_SRCDIR([datapath/datapath.c])
 AC_CONFIG_MACRO_DIR([m4])
 AC_CONFIG_AUX_DIR([build-aux])
diff --git a/debian/changelog b/debian/changelog
index ff2e0bcd00e..5697624a5c6 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,4 +1,4 @@
-openvswitch (2.3.90-1) unstable; urgency=low
+openvswitch (2.4.0-1) unstable; urgency=low
    [ Open vSwitch team ]
    * The openvswitch-testcontroller package is new.  It reintroduces the
      simple OpenFlow controller that was packaged with Open vSwitch prior to
@@ -8,7 +8,7 @@ openvswitch (2.3.90-1) unstable; urgency=low
    * New upstream version
     - Nothing yet!  Try NEWS...
 
- -- Open vSwitch team <dev@openvswitch.org>  Thu, 15 May 2014 17:08:39 -0700
+ -- Open vSwitch team <dev@openvswitch.org>  Wed, 17 Jun 2015 16:09:32 -0700
 
 openvswitch (2.3.0-1) unstable; urgency=low
    [ Open vSwitch team ]

From c4c7e593f5800e189f323de1bd92f9e29a097bee Mon Sep 17 00:00:00 2001
From: Justin Pettit <jpettit@nicira.com>
Date: Wed, 17 Jun 2015 16:12:46 -0700
Subject: [PATCH 18/19] Prepare for post-2.4.0 (2.4.90).

Signed-off-by: Justin Pettit <jpettit@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>
---
 NEWS             | 4 ++++
 configure.ac     | 2 +-
 debian/changelog | 7 +++++++
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/NEWS b/NEWS
index f615a58ee94..831bf5f5591 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,7 @@
+Post-v2.4.0
+---------------------
+
+
 v2.4.0 - xx xxx xxxx
 ---------------------
    - Flow table modifications are now atomic, meaning that each packet
diff --git a/configure.ac b/configure.ac
index e6a23a69210..673ad36d81e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 AC_PREREQ(2.63)
-AC_INIT(openvswitch, 2.4.0, bugs@openvswitch.org)
+AC_INIT(openvswitch, 2.4.90, bugs@openvswitch.org)
 AC_CONFIG_SRCDIR([datapath/datapath.c])
 AC_CONFIG_MACRO_DIR([m4])
 AC_CONFIG_AUX_DIR([build-aux])
diff --git a/debian/changelog b/debian/changelog
index 5697624a5c6..4afb53523cb 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,10 @@
+openvswitch (2.4.90-1) unstable; urgency=low
+   [ Open vSwitch team ]
+   * New upstream version
+    - Nothing yet!  Try NEWS...
+
+ -- Open vSwitch team <dev@openvswitch.org>  Wed, 17 Jun 2015 16:09:32 -0700
+
 openvswitch (2.4.0-1) unstable; urgency=low
    [ Open vSwitch team ]
    * The openvswitch-testcontroller package is new.  It reintroduces the

From 5262eea1b88b99b71decfe944aea85ce01166a09 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@nicira.com>
Date: Tue, 16 Jun 2015 11:15:28 -0700
Subject: [PATCH 19/19] odp-util: Convert flow serialization parameters to a
 struct.

Serializing between userspace flows and netlink attributes currently
requires several additional parameters besides the flows themselves.
This will continue to grow in the future as well. This converts
the function arguments to a parameters struct, which makes the code
easier to read and allowing irrelevant arguments to be omitted.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Andy Zhou <azhou@nicira.com>
---
 lib/dpif-netdev.c             | 24 +++++++++++-----
 lib/odp-util.c                | 53 +++++++++++++----------------------
 lib/odp-util.h                | 31 ++++++++++++++++----
 lib/tnl-ports.c               | 16 +++++++----
 ofproto/ofproto-dpif-upcall.c | 17 +++++++----
 ofproto/ofproto-dpif.c        | 16 +++++++++--
 tests/test-odp.c              |  9 ++++--
 7 files changed, 103 insertions(+), 63 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index f13169c14eb..e8ffcd7105a 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -1815,22 +1815,27 @@ dp_netdev_flow_to_dpif_flow(const struct dp_netdev_flow *netdev_flow,
         struct flow_wildcards wc;
         struct dp_netdev_actions *actions;
         size_t offset;
+        struct odp_flow_key_parms odp_parms = {
+            .flow = &netdev_flow->flow,
+            .mask = &wc.masks,
+            .recirc = true,
+            .max_mpls_depth = SIZE_MAX,
+        };
 
         miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
 
         /* Key */
         offset = key_buf->size;
         flow->key = ofpbuf_tail(key_buf);
-        odp_flow_key_from_flow(key_buf, &netdev_flow->flow, &wc.masks,
-                               netdev_flow->flow.in_port.odp_port, true);
+        odp_parms.odp_in_port = netdev_flow->flow.in_port.odp_port;
+        odp_flow_key_from_flow(&odp_parms, key_buf);
         flow->key_len = key_buf->size - offset;
 
         /* Mask */
         offset = mask_buf->size;
         flow->mask = ofpbuf_tail(mask_buf);
-        odp_flow_key_from_mask(mask_buf, &wc.masks, &netdev_flow->flow,
-                               odp_to_u32(wc.masks.in_port.odp_port),
-                               SIZE_MAX, true);
+        odp_parms.odp_in_port = wc.masks.in_port.odp_port;
+        odp_flow_key_from_mask(&odp_parms, mask_buf);
         flow->mask_len = mask_buf->size - offset;
 
         /* Actions */
@@ -3008,10 +3013,15 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
         struct ds ds = DS_EMPTY_INITIALIZER;
         char *packet_str;
         struct ofpbuf key;
+        struct odp_flow_key_parms odp_parms = {
+            .flow = flow,
+            .mask = &wc->masks,
+            .odp_in_port = flow->in_port.odp_port,
+            .recirc = true,
+        };
 
         ofpbuf_init(&key, 0);
-        odp_flow_key_from_flow(&key, flow, &wc->masks, flow->in_port.odp_port,
-                               true);
+        odp_flow_key_from_flow(&odp_parms, &key);
         packet_str = ofp_packet_to_string(dp_packet_data(packet_),
                                           dp_packet_size(packet_));
 
diff --git a/lib/odp-util.c b/lib/odp-util.c
index 76dc44bae8b..56979ac3409 100644
--- a/lib/odp-util.c
+++ b/lib/odp-util.c
@@ -3417,13 +3417,13 @@ static void get_tp_key(const struct flow *, union ovs_key_tp *);
 static void put_tp_key(const union ovs_key_tp *, struct flow *);
 
 static void
-odp_flow_key_from_flow__(struct ofpbuf *buf, const struct flow *flow,
-                         const struct flow *mask, odp_port_t odp_in_port,
-                         size_t max_mpls_depth, bool recirc, bool export_mask)
+odp_flow_key_from_flow__(const struct odp_flow_key_parms *parms,
+                         bool export_mask, struct ofpbuf *buf)
 {
     struct ovs_key_ethernet *eth_key;
     size_t encap;
-    const struct flow *data = export_mask ? mask : flow;
+    const struct flow *flow = parms->flow;
+    const struct flow *data = export_mask ? parms->mask : parms->flow;
 
     nl_msg_put_u32(buf, OVS_KEY_ATTR_PRIORITY, data->skb_priority);
 
@@ -3433,15 +3433,15 @@ odp_flow_key_from_flow__(struct ofpbuf *buf, const struct flow *flow,
 
     nl_msg_put_u32(buf, OVS_KEY_ATTR_SKB_MARK, data->pkt_mark);
 
-    if (recirc) {
+    if (parms->recirc) {
         nl_msg_put_u32(buf, OVS_KEY_ATTR_RECIRC_ID, data->recirc_id);
         nl_msg_put_u32(buf, OVS_KEY_ATTR_DP_HASH, data->dp_hash);
     }
 
     /* Add an ingress port attribute if this is a mask or 'odp_in_port'
      * is not the magical value "ODPP_NONE". */
-    if (export_mask || odp_in_port != ODPP_NONE) {
-        nl_msg_put_odp_port(buf, OVS_KEY_ATTR_IN_PORT, odp_in_port);
+    if (export_mask || parms->odp_in_port != ODPP_NONE) {
+        nl_msg_put_odp_port(buf, OVS_KEY_ATTR_IN_PORT, parms->odp_in_port);
     }
 
     eth_key = nl_msg_put_unspec_uninit(buf, OVS_KEY_ATTR_ETHERNET,
@@ -3507,7 +3507,9 @@ odp_flow_key_from_flow__(struct ofpbuf *buf, const struct flow *flow,
         int i, n;
 
         n = flow_count_mpls_labels(flow, NULL);
-        n = MIN(n, max_mpls_depth);
+        if (export_mask) {
+            n = MIN(n, parms->max_mpls_depth);
+        }
         mpls_key = nl_msg_put_unspec_uninit(buf, OVS_KEY_ATTR_MPLS,
                                             n * sizeof *mpls_key);
         for (i = 0; i < n; i++) {
@@ -3579,43 +3581,26 @@ odp_flow_key_from_flow__(struct ofpbuf *buf, const struct flow *flow,
 }
 
 /* Appends a representation of 'flow' as OVS_KEY_ATTR_* attributes to 'buf'.
- * 'flow->in_port' is ignored (since it is likely to be an OpenFlow port
- * number rather than a datapath port number).  Instead, if 'odp_in_port'
- * is anything other than ODPP_NONE, it is included in 'buf' as the input
- * port.
  *
  * 'buf' must have at least ODPUTIL_FLOW_KEY_BYTES bytes of space, or be
- * capable of being expanded to allow for that much space.
- *
- * 'recirc' indicates support for recirculation fields. If this is true, then
- * these fields will always be serialised. */
+ * capable of being expanded to allow for that much space. */
 void
-odp_flow_key_from_flow(struct ofpbuf *buf, const struct flow *flow,
-                       const struct flow *mask, odp_port_t odp_in_port,
-                       bool recirc)
+odp_flow_key_from_flow(const struct odp_flow_key_parms *parms,
+                       struct ofpbuf *buf)
 {
-    odp_flow_key_from_flow__(buf, flow, mask, odp_in_port, SIZE_MAX, recirc,
-                             false);
+    odp_flow_key_from_flow__(parms, false, buf);
 }
 
 /* Appends a representation of 'mask' as OVS_KEY_ATTR_* attributes to
- * 'buf'.  'flow' is used as a template to determine how to interpret
- * 'mask'.  For example, the 'dl_type' of 'mask' describes the mask, but
- * it doesn't indicate whether the other fields should be interpreted as
- * ARP, IPv4, IPv6, etc.
+ * 'buf'.
  *
  * 'buf' must have at least ODPUTIL_FLOW_KEY_BYTES bytes of space, or be
- * capable of being expanded to allow for that much space.
- *
- * 'recirc' indicates support for recirculation fields. If this is true, then
- * these fields will always be serialised. */
+ * capable of being expanded to allow for that much space. */
 void
-odp_flow_key_from_mask(struct ofpbuf *buf, const struct flow *mask,
-                       const struct flow *flow, uint32_t odp_in_port_mask,
-                       size_t max_mpls_depth, bool recirc)
+odp_flow_key_from_mask(const struct odp_flow_key_parms *parms,
+                       struct ofpbuf *buf)
 {
-    odp_flow_key_from_flow__(buf, flow, mask, u32_to_odp(odp_in_port_mask),
-                             max_mpls_depth, recirc, true);
+    odp_flow_key_from_flow__(parms, true, buf);
 }
 
 /* Generate ODP flow key from the given packet metadata */
diff --git a/lib/odp-util.h b/lib/odp-util.h
index 4f0e794539b..59d29f3eece 100644
--- a/lib/odp-util.h
+++ b/lib/odp-util.h
@@ -158,12 +158,31 @@ int odp_flow_from_string(const char *s,
                          const struct simap *port_names,
                          struct ofpbuf *, struct ofpbuf *);
 
-void odp_flow_key_from_flow(struct ofpbuf *, const struct flow * flow,
-                            const struct flow *mask, odp_port_t odp_in_port,
-                            bool recirc);
-void odp_flow_key_from_mask(struct ofpbuf *, const struct flow *mask,
-                            const struct flow *flow, uint32_t odp_in_port,
-                            size_t max_mpls_depth, bool recirc);
+struct odp_flow_key_parms {
+    /* The flow and mask to be serialized. In the case of masks, 'flow'
+     * is used as a template to determine how to interpret 'mask'.  For
+     * example, the 'dl_type' of 'mask' describes the mask, but it doesn't
+     * indicate whether the other fields should be interpreted as ARP, IPv4,
+     * IPv6, etc. */
+    const struct flow *flow;
+    const struct flow *mask;
+
+   /* 'flow->in_port' is ignored (since it is likely to be an OpenFlow port
+    * number rather than a datapath port number).  Instead, if 'odp_in_port'
+    * is anything other than ODPP_NONE, it is included in 'buf' as the input
+    * port. */
+    odp_port_t odp_in_port;
+
+    /* Indicates support for recirculation fields. If this is true, then
+     * these fields will always be serialised. */
+    bool recirc;
+
+    /* Only used for mask translation: */
+    size_t max_mpls_depth;
+};
+
+void odp_flow_key_from_flow(const struct odp_flow_key_parms *, struct ofpbuf *);
+void odp_flow_key_from_mask(const struct odp_flow_key_parms *, struct ofpbuf *);
 
 uint32_t odp_flow_key_hash(const struct nlattr *, size_t);
 
diff --git a/lib/tnl-ports.c b/lib/tnl-ports.c
index 2602db54321..a0a73c85dba 100644
--- a/lib/tnl-ports.c
+++ b/lib/tnl-ports.c
@@ -161,22 +161,28 @@ tnl_port_show(struct unixctl_conn *conn, int argc OVS_UNUSED,
         size_t key_len, mask_len;
         struct flow_wildcards wc;
         struct ofpbuf buf;
+        struct odp_flow_key_parms odp_parms = {
+            .flow = &flow,
+            .mask = &wc.masks,
+        };
 
         ds_put_format(&ds, "%s (%"PRIu32") : ", p->dev_name, p->portno);
         minimask_expand(&p->cr.match.mask, &wc);
         miniflow_expand(&p->cr.match.flow, &flow);
 
         /* Key. */
+        odp_parms.odp_in_port = flow.in_port.odp_port;
+        odp_parms.recirc = true;
         ofpbuf_use_stack(&buf, &keybuf, sizeof keybuf);
-        odp_flow_key_from_flow(&buf, &flow, &wc.masks,
-                               flow.in_port.odp_port, true);
+        odp_flow_key_from_flow(&odp_parms, &buf);
         key = buf.data;
         key_len = buf.size;
+
         /* mask*/
+        odp_parms.odp_in_port = wc.masks.in_port.odp_port;
+        odp_parms.recirc = false;
         ofpbuf_use_stack(&buf, &maskbuf, sizeof maskbuf);
-        odp_flow_key_from_mask(&buf, &wc.masks, &flow,
-                               odp_to_u32(wc.masks.in_port.odp_port),
-                               SIZE_MAX, false);
+        odp_flow_key_from_mask(&odp_parms, &buf);
         mask = buf.data;
         mask_len = buf.size;
 
diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
index c39b5712e30..f75bc69fb72 100644
--- a/ofproto/ofproto-dpif-upcall.c
+++ b/ofproto/ofproto-dpif-upcall.c
@@ -1363,6 +1363,10 @@ ukey_create_from_upcall(struct upcall *upcall)
     struct odputil_keybuf keystub, maskstub;
     struct ofpbuf keybuf, maskbuf;
     bool recirc, megaflow;
+    struct odp_flow_key_parms odp_parms = {
+        .flow = upcall->flow,
+        .mask = &upcall->xout.wc.masks,
+    };
 
     if (upcall->key_len) {
         ofpbuf_use_const(&keybuf, upcall->key, upcall->key_len);
@@ -1370,19 +1374,20 @@ ukey_create_from_upcall(struct upcall *upcall)
         /* dpif-netdev doesn't provide a netlink-formatted flow key in the
          * upcall, so convert the upcall's flow here. */
         ofpbuf_use_stack(&keybuf, &keystub, sizeof keystub);
-        odp_flow_key_from_flow(&keybuf, upcall->flow, &upcall->xout.wc.masks,
-                               upcall->flow->in_port.odp_port, true);
+        odp_parms.odp_in_port = upcall->flow->in_port.odp_port;
+        odp_parms.recirc = true;
+        odp_flow_key_from_flow(&odp_parms, &keybuf);
     }
 
     atomic_read_relaxed(&enable_megaflows, &megaflow);
     recirc = ofproto_dpif_get_enable_recirc(upcall->ofproto);
     ofpbuf_use_stack(&maskbuf, &maskstub, sizeof maskstub);
     if (megaflow) {
-        size_t max_mpls;
+        odp_parms.odp_in_port = ODPP_NONE;
+        odp_parms.max_mpls_depth = ofproto_dpif_get_max_mpls_depth(upcall->ofproto);
+        odp_parms.recirc = recirc;
 
-        max_mpls = ofproto_dpif_get_max_mpls_depth(upcall->ofproto);
-        odp_flow_key_from_mask(&maskbuf, &upcall->xout.wc.masks, upcall->flow,
-                               UINT32_MAX, max_mpls, recirc);
+        odp_flow_key_from_mask(&odp_parms, &maskbuf);
     }
 
     return ukey_create__(keybuf.data, keybuf.size, maskbuf.data, maskbuf.size,
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index 369e0b9755b..0de86869301 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -1003,13 +1003,17 @@ check_recirc(struct dpif_backer *backer)
     struct odputil_keybuf keybuf;
     struct ofpbuf key;
     bool enable_recirc;
+    struct odp_flow_key_parms odp_parms = {
+        .flow = &flow,
+        .recirc = true,
+    };
 
     memset(&flow, 0, sizeof flow);
     flow.recirc_id = 1;
     flow.dp_hash = 1;
 
     ofpbuf_use_stack(&key, &keybuf, sizeof keybuf);
-    odp_flow_key_from_flow(&key, &flow, NULL, 0, true);
+    odp_flow_key_from_flow(&odp_parms, &key);
     enable_recirc = dpif_probe_feature(backer->dpif, "recirculation", &key,
                                        NULL);
 
@@ -1037,12 +1041,15 @@ check_ufid(struct dpif_backer *backer)
     struct ofpbuf key;
     ovs_u128 ufid;
     bool enable_ufid;
+    struct odp_flow_key_parms odp_parms = {
+        .flow = &flow,
+    };
 
     memset(&flow, 0, sizeof flow);
     flow.dl_type = htons(0x1234);
 
     ofpbuf_use_stack(&key, &keybuf, sizeof keybuf);
-    odp_flow_key_from_flow(&key, &flow, NULL, 0, true);
+    odp_flow_key_from_flow(&odp_parms, &key);
     dpif_flow_hash(backer->dpif, key.data, key.size, &ufid);
 
     enable_ufid = dpif_probe_feature(backer->dpif, "UFID", &key, &ufid);
@@ -1144,13 +1151,16 @@ check_max_mpls_depth(struct dpif_backer *backer)
     for (n = 0; n < FLOW_MAX_MPLS_LABELS; n++) {
         struct odputil_keybuf keybuf;
         struct ofpbuf key;
+        struct odp_flow_key_parms odp_parms = {
+            .flow = &flow,
+        };
 
         memset(&flow, 0, sizeof flow);
         flow.dl_type = htons(ETH_TYPE_MPLS);
         flow_set_mpls_bos(&flow, n, 1);
 
         ofpbuf_use_stack(&key, &keybuf, sizeof keybuf);
-        odp_flow_key_from_flow(&key, &flow, NULL, 0, false);
+        odp_flow_key_from_flow(&odp_parms, &key);
         if (!dpif_probe_feature(backer->dpif, "MPLS", &key, NULL)) {
             break;
         }
diff --git a/tests/test-odp.c b/tests/test-odp.c
index 4daf4728729..faa60d4727e 100644
--- a/tests/test-odp.c
+++ b/tests/test-odp.c
@@ -54,6 +54,11 @@ parse_keys(bool wc_keys)
         }
 
         if (!wc_keys) {
+            struct odp_flow_key_parms odp_parms = {
+                .flow = &flow,
+                .recirc = true,
+            };
+
             /* Convert odp_key to flow. */
             fitness = odp_flow_key_to_flow(odp_key.data, odp_key.size, &flow);
             switch (fitness) {
@@ -75,8 +80,8 @@ parse_keys(bool wc_keys)
             /* Convert cls_rule back to odp_key. */
             ofpbuf_uninit(&odp_key);
             ofpbuf_init(&odp_key, 0);
-            odp_flow_key_from_flow(&odp_key, &flow, NULL,
-                                   flow.in_port.odp_port, true);
+            odp_parms.odp_in_port = flow.in_port.odp_port;
+            odp_flow_key_from_flow(&odp_parms, &odp_key);
 
             if (odp_key.size > ODPUTIL_FLOW_KEY_BYTES) {
                 printf ("too long: %"PRIu32" > %d\n",