Skip to content

Commit

Permalink
net/mlx5: Support multiport eswitch mode
Browse files Browse the repository at this point in the history
Multiport eswitch mode is a LAG mode that allows adding rules that
forward traffic to a specific physical port without being affected by the
LAG affinity configuration.

This mode of operation is mutually exclusive with the other LAG modes used
by multipath and bonding.

To make the transition between the modes, we maintain a counter on the
number of rules specifying one of the uplink representors as the target
of mirred egress redirect action.

An example of such rule would be:

$ tc filter add dev enp8s0f0_0 prot all root flower dst_mac \
  00:11:22:33:44:55 action mirred egress redirect dev enp8s0f0

When the reference count grows from zero to one and LAG is not in use, we
create the LAG in multiport eswitch mode. Other mode changes are not
allowed while in this mode. When the reference count drops back to zero, we
destroy the LAG and let other modes be used if needed.

The logic has also changed so that, if forwarding to some uplink
destination cannot be guaranteed, we fail the operation; the rule then
ends up being handled in software rather than offloaded to hardware.

Signed-off-by: Eli Cohen <[email protected]>
Reviewed-by: Mark Bloch <[email protected]>
Signed-off-by: Saeed Mahameed <[email protected]>
  • Loading branch information
elic307i authored and Saeed Mahameed committed May 18, 2022
1 parent a4a9c87 commit 94db331
Show file tree
Hide file tree
Showing 11 changed files with 259 additions and 40 deletions.
2 changes: 1 addition & 1 deletion drivers/net/ethernet/mellanox/mlx5/core/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o en/port_buffer.o
mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += en/hv_vhca_stats.o
mlx5_core-$(CONFIG_MLX5_ESWITCH) += lag/mp.o lag/port_sel.o lib/geneve.o lib/port_tun.o \
en_rep.o en/rep/bond.o en/mod_hdr.o \
en/mapping.o
en/mapping.o lag/mpesw.o
mlx5_core-$(CONFIG_MLX5_CLS_ACT) += en_tc.o en/rep/tc.o en/rep/neigh.o \
lib/fs_chains.o en/tc_tun.o \
esw/indir_table.o en/tc_tun_encap.o \
Expand Down
14 changes: 14 additions & 0 deletions drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "en/tc_tun_encap.h"
#include "en/tc_priv.h"
#include "en_rep.h"
#include "lag/lag.h"

static bool
same_vf_reps(struct mlx5e_priv *priv, struct net_device *out_dev)
Expand Down Expand Up @@ -215,6 +216,7 @@ parse_mirred(struct mlx5e_tc_act_parse_state *parse_state,
struct net_device *uplink_dev;
struct mlx5e_priv *out_priv;
struct mlx5_eswitch *esw;
bool is_uplink_rep;
int *ifindexes;
int if_count;
int err;
Expand All @@ -229,6 +231,10 @@ parse_mirred(struct mlx5e_tc_act_parse_state *parse_state,

parse_state->ifindexes[if_count] = out_dev->ifindex;
parse_state->if_count++;
is_uplink_rep = mlx5e_eswitch_uplink_rep(out_dev);
err = mlx5_lag_do_mirred(priv->mdev, out_dev);
if (err)
return err;

out_dev = get_fdb_out_dev(uplink_dev, out_dev);
if (!out_dev)
Expand Down Expand Up @@ -268,6 +274,14 @@ parse_mirred(struct mlx5e_tc_act_parse_state *parse_state,
rpriv = out_priv->ppriv;
esw_attr->dests[esw_attr->out_count].rep = rpriv->rep;
esw_attr->dests[esw_attr->out_count].mdev = out_priv->mdev;

/* If output device is bond master then rules are not explicit
* so we don't attempt to count them.
*/
if (is_uplink_rep && MLX5_CAP_PORT_SELECTION(priv->mdev, port_select_flow_table) &&
MLX5_CAP_GEN(priv->mdev, create_lag_when_not_master_up))
attr->lag.count = true;

esw_attr->out_count++;

return 0;
Expand Down
28 changes: 27 additions & 1 deletion drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
Original file line number Diff line number Diff line change
Expand Up @@ -1740,6 +1740,9 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,

free_flow_post_acts(flow);

if (flow->attr->lag.count)
mlx5_lag_del_mpesw_rule(esw->dev);

kvfree(attr->esw_attr->rx_tun_attr);
kvfree(attr->parse_attr);
kfree(flow->attr);
Expand Down Expand Up @@ -3788,12 +3791,25 @@ static bool is_lag_dev(struct mlx5e_priv *priv,
same_hw_reps(priv, peer_netdev));
}

/* Tell whether a rule forwarding to @out_dev qualifies for multiport
 * eswitch handling.
 *
 * True only when @out_dev passes mlx5e_eswitch_uplink_rep() and the device
 * behind @priv reports both the port_select_flow_table port-selection
 * capability and the create_lag_when_not_master_up general capability.
 */
static bool is_multiport_eligible(struct mlx5e_priv *priv, struct net_device *out_dev)
{
	if (!mlx5e_eswitch_uplink_rep(out_dev))
		return false;

	if (!MLX5_CAP_PORT_SELECTION(priv->mdev, port_select_flow_table))
		return false;

	if (!MLX5_CAP_GEN(priv->mdev, create_lag_when_not_master_up))
		return false;

	return true;
}

bool mlx5e_is_valid_eswitch_fwd_dev(struct mlx5e_priv *priv,
struct net_device *out_dev)
{
if (is_merged_eswitch_vfs(priv, out_dev))
return true;

if (is_multiport_eligible(priv, out_dev))
return true;

if (is_lag_dev(priv, out_dev))
return true;

Expand Down Expand Up @@ -4050,6 +4066,7 @@ __mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
struct mlx5_core_dev *in_mdev)
{
struct flow_rule *rule = flow_cls_offload_flow_rule(f);
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
struct netlink_ext_ack *extack = f->common.extack;
struct mlx5e_tc_flow_parse_attr *parse_attr;
struct mlx5e_tc_flow *flow;
Expand Down Expand Up @@ -4085,17 +4102,26 @@ __mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
if (err)
goto err_free;

if (flow->attr->lag.count) {
err = mlx5_lag_add_mpesw_rule(esw->dev);
if (err)
goto err_free;
}

err = mlx5e_tc_add_fdb_flow(priv, flow, extack);
complete_all(&flow->init_done);
if (err) {
if (!(err == -ENETUNREACH && mlx5_lag_is_multipath(in_mdev)))
goto err_free;
goto err_lag;

add_unready_flow(flow);
}

return flow;

err_lag:
if (flow->attr->lag.count)
mlx5_lag_del_mpesw_rule(esw->dev);
err_free:
mlx5e_flow_put(priv, flow);
out:
Expand Down
7 changes: 7 additions & 0 deletions drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,13 @@ struct mlx5_flow_attr {
u32 flags;
struct list_head list;
struct mlx5e_post_act_handle *post_act_handle;
struct {
/* Indicate whether the parsed flow should be counted for lag mode decision
* making
*/
bool count;
} lag;
/* keep this union last */
union {
struct mlx5_esw_flow_attr esw_attr[0];
struct mlx5_nic_flow_attr nic_attr[0];
Expand Down
3 changes: 3 additions & 0 deletions drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
#include "en_tc.h"
#include "en/mapping.h"
#include "devlink.h"
#include "lag/lag.h"

#define mlx5_esw_for_each_rep(esw, i, rep) \
xa_for_each(&((esw)->offloads.vport_reps), i, rep)
Expand Down Expand Up @@ -418,6 +419,8 @@ esw_setup_vport_dest(struct mlx5_flow_destination *dest, struct mlx5_flow_act *f
dest[dest_idx].vport.vhca_id =
MLX5_CAP_GEN(esw_attr->dests[attr_idx].mdev, vhca_id);
dest[dest_idx].vport.flags |= MLX5_FLOW_DEST_VPORT_VHCA_ID;
if (mlx5_lag_mpesw_is_activated(esw->dev))
dest[dest_idx].type = MLX5_FLOW_DESTINATION_TYPE_UPLINK;
}
if (esw_attr->dests[attr_idx].flags & MLX5_ESW_DEST_ENCAP) {
if (pkt_reformat) {
Expand Down
17 changes: 9 additions & 8 deletions drivers/net/ethernet/mellanox/mlx5/core/lag/debugfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@

static char *get_str_mode_type(struct mlx5_lag *ldev)
{
if (ldev->mode == MLX5_LAG_MODE_ROCE)
return "roce";
if (ldev->mode == MLX5_LAG_MODE_SRIOV)
return "switchdev";
if (ldev->mode == MLX5_LAG_MODE_MULTIPATH)
return "multipath";
switch (ldev->mode) {
case MLX5_LAG_MODE_ROCE: return "roce";
case MLX5_LAG_MODE_SRIOV: return "switchdev";
case MLX5_LAG_MODE_MULTIPATH: return "multipath";
case MLX5_LAG_MODE_MPESW: return "multiport_eswitch";
default: return "invalid";
}

return NULL;
}
Expand Down Expand Up @@ -43,11 +44,11 @@ static int port_sel_mode_show(struct seq_file *file, void *priv)
ldev = dev->priv.lag;
mutex_lock(&ldev->lock);
if (__mlx5_lag_is_active(ldev))
mode = get_str_port_sel_mode(ldev->mode_flags);
mode = mlx5_get_str_port_sel_mode(ldev);
else
ret = -EINVAL;
mutex_unlock(&ldev->lock);
if (ret || !mode)
if (ret)
return ret;

seq_printf(file, "%s\n", mode);
Expand Down
86 changes: 59 additions & 27 deletions drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
#include "esw/acl/ofld.h"
#include "lag.h"
#include "mp.h"
#include "mpesw.h"

enum {
MLX5_LAG_EGRESS_PORT_1 = 1,
Expand All @@ -58,6 +59,9 @@ static int get_port_sel_mode(enum mlx5_lag_mode mode, unsigned long flags)
if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags))
return MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT;

if (mode == MLX5_LAG_MODE_MPESW)
return MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_MPESW;

return MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY;
}

Expand Down Expand Up @@ -196,7 +200,8 @@ static void mlx5_ldev_free(struct kref *ref)
if (ldev->nb.notifier_call)
unregister_netdevice_notifier_net(&init_net, &ldev->nb);
mlx5_lag_mp_cleanup(ldev);
cancel_delayed_work_sync(&ldev->bond_work);
mlx5_lag_mpesw_cleanup(ldev);
cancel_work_sync(&ldev->mpesw_work);
destroy_workqueue(ldev->wq);
mutex_destroy(&ldev->lock);
kfree(ldev);
Expand Down Expand Up @@ -242,6 +247,8 @@ static struct mlx5_lag *mlx5_lag_dev_alloc(struct mlx5_core_dev *dev)
if (err)
mlx5_core_err(dev, "Failed to init multipath lag err=%d\n",
err);

mlx5_lag_mpesw_init(ldev);
ldev->ports = MLX5_CAP_GEN(dev, num_lag_ports);
ldev->buckets = 1;

Expand Down Expand Up @@ -442,16 +449,19 @@ static int mlx5_lag_set_port_sel_mode_roce(struct mlx5_lag *ldev,
return 0;
}

static int mlx5_lag_set_port_sel_mode_offloads(struct mlx5_lag *ldev,
struct lag_tracker *tracker, unsigned long *flags)
static void mlx5_lag_set_port_sel_mode_offloads(struct mlx5_lag *ldev,
struct lag_tracker *tracker,
enum mlx5_lag_mode mode,
unsigned long *flags)
{
struct lag_func *dev0 = &ldev->pf[MLX5_LAG_P1];

if (mode == MLX5_LAG_MODE_MPESW)
return;

if (MLX5_CAP_PORT_SELECTION(dev0->dev, port_select_flow_table) &&
tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH)
set_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, flags);

return 0;
}

static int mlx5_lag_set_flags(struct mlx5_lag *ldev, enum mlx5_lag_mode mode,
Expand All @@ -467,14 +477,20 @@ static int mlx5_lag_set_flags(struct mlx5_lag *ldev, enum mlx5_lag_mode mode,
if (roce_lag)
return mlx5_lag_set_port_sel_mode_roce(ldev, flags);

return mlx5_lag_set_port_sel_mode_offloads(ldev, tracker, flags);
mlx5_lag_set_port_sel_mode_offloads(ldev, tracker, mode, flags);
return 0;
}

char *get_str_port_sel_mode(unsigned long flags)
char *mlx5_get_str_port_sel_mode(struct mlx5_lag *ldev)
{
if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags))
return "hash";
return "queue_affinity";
int port_sel_mode = get_port_sel_mode(ldev->mode, ldev->mode_flags);

switch (port_sel_mode) {
case MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY: return "queue_affinity";
case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT: return "hash";
case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_MPESW: return "mpesw";
default: return "invalid";
}
}

static int mlx5_create_lag(struct mlx5_lag *ldev,
Expand All @@ -488,9 +504,10 @@ static int mlx5_create_lag(struct mlx5_lag *ldev,
u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {};
int err;

mlx5_lag_print_mapping(dev0, ldev, tracker, flags);
if (tracker)
mlx5_lag_print_mapping(dev0, ldev, tracker, flags);
mlx5_core_info(dev0, "shared_fdb:%d mode:%s\n",
shared_fdb, get_str_port_sel_mode(flags));
shared_fdb, mlx5_get_str_port_sel_mode(ldev));

err = mlx5_cmd_create_lag(dev0, ldev->v2p_map, mode, flags);
if (err) {
Expand Down Expand Up @@ -526,22 +543,24 @@ int mlx5_activate_lag(struct mlx5_lag *ldev,
{
bool roce_lag = mode == MLX5_LAG_MODE_ROCE;
struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
unsigned long flags;
unsigned long flags = 0;
int err;

err = mlx5_lag_set_flags(ldev, mode, tracker, shared_fdb, &flags);
if (err)
return err;

mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ldev->v2p_map);
if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) {
err = mlx5_lag_port_sel_create(ldev, tracker->hash_type,
ldev->v2p_map);
if (err) {
mlx5_core_err(dev0,
"Failed to create LAG port selection(%d)\n",
err);
return err;
if (mode != MLX5_LAG_MODE_MPESW) {
mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ldev->v2p_map);
if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) {
err = mlx5_lag_port_sel_create(ldev, tracker->hash_type,
ldev->v2p_map);
if (err) {
mlx5_core_err(dev0,
"Failed to create LAG port selection(%d)\n",
err);
return err;
}
}
}

Expand All @@ -559,7 +578,7 @@ int mlx5_activate_lag(struct mlx5_lag *ldev,
return err;
}

if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
if (tracker && tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
!roce_lag)
mlx5_lag_drop_rule_setup(ldev, tracker);

Expand Down Expand Up @@ -675,7 +694,7 @@ static void mlx5_lag_remove_devices(struct mlx5_lag *ldev)
}
}

static void mlx5_disable_lag(struct mlx5_lag *ldev)
void mlx5_disable_lag(struct mlx5_lag *ldev)
{
bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags);
struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
Expand Down Expand Up @@ -712,7 +731,7 @@ static void mlx5_disable_lag(struct mlx5_lag *ldev)
}
}

static bool mlx5_shared_fdb_supported(struct mlx5_lag *ldev)
bool mlx5_shared_fdb_supported(struct mlx5_lag *ldev)
{
struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
Expand Down Expand Up @@ -748,6 +767,18 @@ static bool mlx5_lag_is_roce_lag(struct mlx5_lag *ldev)
return roce_lag;
}

/* Decide whether an already-active LAG should be modified to follow the
 * current bond state.
 *
 * True only when bonding is requested (@do_bond), __mlx5_lag_is_active()
 * reports the LAG as active, and the LAG is not in multiport eswitch mode.
 */
static bool mlx5_lag_should_modify_lag(struct mlx5_lag *ldev, bool do_bond)
{
	if (!do_bond)
		return false;

	if (!__mlx5_lag_is_active(ldev))
		return false;

	/* A LAG in multiport eswitch mode is never modified from here. */
	return ldev->mode != MLX5_LAG_MODE_MPESW;
}

/* Decide whether an active LAG should be torn down because bonding is no
 * longer requested.
 *
 * True only when @do_bond is false, __mlx5_lag_is_active() reports the LAG
 * as active, and the LAG is not in multiport eswitch mode.
 */
static bool mlx5_lag_should_disable_lag(struct mlx5_lag *ldev, bool do_bond)
{
	if (do_bond)
		return false;

	if (!__mlx5_lag_is_active(ldev))
		return false;

	/* A LAG in multiport eswitch mode is never disabled from here. */
	return ldev->mode != MLX5_LAG_MODE_MPESW;
}

static void mlx5_do_bond(struct mlx5_lag *ldev)
{
struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
Expand Down Expand Up @@ -810,9 +841,9 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
return;
}
}
} else if (do_bond && __mlx5_lag_is_active(ldev)) {
} else if (mlx5_lag_should_modify_lag(ldev, do_bond)) {
mlx5_modify_lag(ldev, &tracker);
} else if (!do_bond && __mlx5_lag_is_active(ldev)) {
} else if (mlx5_lag_should_disable_lag(ldev, do_bond)) {
mlx5_disable_lag(ldev);
}
}
Expand Down Expand Up @@ -986,6 +1017,7 @@ static int mlx5_handle_changeinfodata_event(struct mlx5_lag *ldev,
return 1;
}

/* this handler is always registered to netdev events */
static int mlx5_lag_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
Expand Down
Loading

0 comments on commit 94db331

Please sign in to comment.