diff --git a/Documentation/infiniband/sysfs.txt b/Documentation/infiniband/sysfs.txt index 3ecf0c3a133fb6..45bcafe6ff8af9 100644 --- a/Documentation/infiniband/sysfs.txt +++ b/Documentation/infiniband/sysfs.txt @@ -56,6 +56,18 @@ SYSFS FILES ports/1/pkeys/10 contains the value at index 10 in port 1's P_Key table. + There is an optional "hw_counters" subdirectory that may be under either + the parent device or the port subdirectories or both. If present, + there are a list of counters provided by the hardware. They may match + some of the counters in the counters directory, but they often include + many other counters. In addition to the various counters, there will + be a file named "lifespan" that configures how frequently the core + should update the counters when they are being accessed (counters are + not updated if they are not being accessed). The lifespan is in milli- + seconds and defaults to 10 unless set to something else by the driver. + Users may echo a value between 0 - 10000 to the lifespan file to set + the length of time between updates in milliseconds. + MTHCA The Mellanox HCA driver also creates the files: diff --git a/MAINTAINERS b/MAINTAINERS index f466673f86ff39..216165a1384de4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5308,6 +5308,13 @@ F: drivers/block/cciss* F: include/linux/cciss_ioctl.h F: include/uapi/linux/cciss_ioctl.h +HFI1 DRIVER +M: Mike Marciniszyn +M: Dennis Dalessandro +L: linux-rdma@vger.kernel.org +S: Supported +F: drivers/infiniband/hw/hfi1 + HFS FILESYSTEM L: linux-fsdevel@vger.kernel.org S: Orphan @@ -5837,7 +5844,6 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma.git S: Supported F: Documentation/infiniband/ F: drivers/infiniband/ -F: drivers/staging/rdma/ F: include/uapi/linux/if_infiniband.h F: include/uapi/rdma/ F: include/rdma/ @@ -10920,12 +10926,6 @@ M: Arnaud Patard S: Odd Fixes F: drivers/staging/xgifb/ -HFI1 DRIVER -M: Mike Marciniszyn -L: linux-rdma@vger.kernel.org -S: Supported -F: drivers/staging/rdma/hfi1 - STARFIRE/DURALAN NETWORK DRIVER M: Ion Badulescu S: Odd Fixes diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 6425c0e5d18a65..2137adfbd8c349 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -85,4 +85,6 @@ source "drivers/infiniband/ulp/isert/Kconfig" source "drivers/infiniband/sw/rdmavt/Kconfig" +source "drivers/infiniband/hw/hfi1/Kconfig" + endif # INFINIBAND diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 26987d9d7e1cdc..edaae9f9853c73 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -1,8 +1,7 @@ infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_cm.o user_access-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_ucm.o -obj-$(CONFIG_INFINIBAND) += ib_core.o ib_mad.o ib_sa.o \ - ib_cm.o iw_cm.o ib_addr.o \ +obj-$(CONFIG_INFINIBAND) += ib_core.o ib_cm.o iw_cm.o \ $(infiniband-y) obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \ @@ -10,14 +9,11 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ device.o fmr_pool.o cache.o netlink.o \ - roce_gid_mgmt.o mr_pool.o + roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ + multicast.o mad.o smi.o agent.o mad_rmpp.o ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o -ib_mad-y := mad.o smi.o agent.o mad_rmpp.o - -ib_sa-y := sa_query.o multicast.o - ib_cm-y := cm.o iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o @@ -28,8 +24,6 @@ rdma_cm-$(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) += cma_configfs.o rdma_ucm-y := ucma.o -ib_addr-y := addr.o - ib_umad-y := user_mad.o ib_ucm-y := ucm.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 337353d86cfade..1374541a45287f 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -46,10 +46,10 @@ #include #include #include +#include +#include -MODULE_AUTHOR("Sean Hefty"); -MODULE_DESCRIPTION("IB Address Translation"); -MODULE_LICENSE("Dual BSD/GPL"); +#include "core_priv.h" struct addr_req { struct list_head list; @@ -62,8 +62,11 @@ struct addr_req { struct rdma_dev_addr *addr, void *context); unsigned long timeout; int status; + u32 seq; }; +static atomic_t ib_nl_addr_request_seq = ATOMIC_INIT(0); + static void process_req(struct work_struct *work); static DEFINE_MUTEX(lock); @@ -71,6 +74,126 @@ static LIST_HEAD(req_list); static DECLARE_DELAYED_WORK(work, process_req); static struct workqueue_struct *addr_wq; +static const struct nla_policy ib_nl_addr_policy[LS_NLA_TYPE_MAX] = { + [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY, + .len = sizeof(struct rdma_nla_ls_gid)}, +}; + +static inline bool ib_nl_is_good_ip_resp(const struct nlmsghdr *nlh) +{ + struct nlattr *tb[LS_NLA_TYPE_MAX] = {}; + int ret; + + if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR) + return false; + + ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), + nlmsg_len(nlh), ib_nl_addr_policy); + if (ret) + return false; + + return true; +} + +static void ib_nl_process_good_ip_rsep(const struct nlmsghdr *nlh) +{ + const struct nlattr *head, *curr; + union ib_gid gid; + struct addr_req *req; + int len, rem; + int found = 0; + + head = (const struct nlattr *)nlmsg_data(nlh); + len = nlmsg_len(nlh); + + nla_for_each_attr(curr, head, len, rem) { + if (curr->nla_type == LS_NLA_TYPE_DGID) + memcpy(&gid, nla_data(curr), nla_len(curr)); + } + + mutex_lock(&lock); + list_for_each_entry(req, &req_list, list) { + if (nlh->nlmsg_seq != req->seq) + continue; + /* We set the DGID part, the rest was set earlier */ + rdma_addr_set_dgid(req->addr, &gid); + req->status = 0; + found = 1; + break; + } + mutex_unlock(&lock); + + if (!found) + pr_info("Couldn't find request waiting for DGID: %pI6\n", + &gid); +} + +int ib_nl_handle_ip_res_resp(struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; + + if ((nlh->nlmsg_flags & NLM_F_REQUEST) || + !(NETLINK_CB(skb).sk) || + !netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; + + if (ib_nl_is_good_ip_resp(nlh)) + ib_nl_process_good_ip_rsep(nlh); + + return skb->len; +} + +static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr, + const void *daddr, + u32 seq, u16 family) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + struct rdma_ls_ip_resolve_header *header; + void *data; + size_t size; + int attrtype; + int len; + + if (family == AF_INET) { + size = sizeof(struct in_addr); + attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV4; + } else { + size = sizeof(struct in6_addr); + attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV6; + } + + len = nla_total_size(sizeof(size)); + len += NLMSG_ALIGN(sizeof(*header)); + + skb = nlmsg_new(len, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + data = ibnl_put_msg(skb, &nlh, seq, 0, RDMA_NL_LS, + RDMA_NL_LS_OP_IP_RESOLVE, NLM_F_REQUEST); + if (!data) { + nlmsg_free(skb); + return -ENODATA; + } + + /* Construct the family header first */ + header = (struct rdma_ls_ip_resolve_header *) + skb_put(skb, NLMSG_ALIGN(sizeof(*header))); + header->ifindex = dev_addr->bound_dev_if; + nla_put(skb, attrtype, size, daddr); + + /* Repair the nlmsg header length */ + nlmsg_end(skb, nlh); + ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL); + + /* Make the request retry, so when we get the response from userspace + * we will have something. + */ + return -ENODATA; +} + int rdma_addr_size(struct sockaddr *addr) { switch (addr->sa_family) { @@ -199,6 +322,17 @@ static void queue_req(struct addr_req *req) mutex_unlock(&lock); } +static int ib_nl_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, + const void *daddr, u32 seq, u16 family) +{ + if (ibnl_chk_listeners(RDMA_NL_GROUP_LS)) + return -EADDRNOTAVAIL; + + /* We fill in what we can, the response will fill the rest */ + rdma_copy_addr(dev_addr, dst->dev, NULL); + return ib_nl_ip_send_msg(dev_addr, daddr, seq, family); +} + static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, const void *daddr) { @@ -223,6 +357,39 @@ static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, return ret; } +static bool has_gateway(struct dst_entry *dst, sa_family_t family) +{ + struct rtable *rt; + struct rt6_info *rt6; + + if (family == AF_INET) { + rt = container_of(dst, struct rtable, dst); + return rt->rt_uses_gateway; + } + + rt6 = container_of(dst, struct rt6_info, dst); + return rt6->rt6i_flags & RTF_GATEWAY; +} + +static int fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, + const struct sockaddr *dst_in, u32 seq) +{ + const struct sockaddr_in *dst_in4 = + (const struct sockaddr_in *)dst_in; + const struct sockaddr_in6 *dst_in6 = + (const struct sockaddr_in6 *)dst_in; + const void *daddr = (dst_in->sa_family == AF_INET) ? + (const void *)&dst_in4->sin_addr.s_addr : + (const void *)&dst_in6->sin6_addr; + sa_family_t family = dst_in->sa_family; + + /* Gateway + ARPHRD_INFINIBAND -> IB router */ + if (has_gateway(dst, family) && dst->dev->type == ARPHRD_INFINIBAND) + return ib_nl_fetch_ha(dst, dev_addr, daddr, seq, family); + else + return dst_fetch_ha(dst, dev_addr, daddr); +} + static int addr4_resolve(struct sockaddr_in *src_in, const struct sockaddr_in *dst_in, struct rdma_dev_addr *addr, @@ -246,10 +413,11 @@ static int addr4_resolve(struct sockaddr_in *src_in, src_in->sin_family = AF_INET; src_in->sin_addr.s_addr = fl4.saddr; - /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't - * routable) and we could set the network type accordingly. + /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're + * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network + * type accordingly. */ - if (rt->rt_uses_gateway) + if (rt->rt_uses_gateway && rt->dst.dev->type != ARPHRD_INFINIBAND) addr->network = RDMA_NETWORK_IPV4; addr->hoplimit = ip4_dst_hoplimit(&rt->dst); @@ -291,10 +459,12 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, src_in->sin6_addr = fl6.saddr; } - /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't - * routable) and we could set the network type accordingly. + /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're + * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network + * type accordingly. */ - if (rt->rt6i_flags & RTF_GATEWAY) + if (rt->rt6i_flags & RTF_GATEWAY && + ip6_dst_idev(dst)->dev->type != ARPHRD_INFINIBAND) addr->network = RDMA_NETWORK_IPV6; addr->hoplimit = ip6_dst_hoplimit(dst); @@ -317,7 +487,8 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, static int addr_resolve_neigh(struct dst_entry *dst, const struct sockaddr *dst_in, - struct rdma_dev_addr *addr) + struct rdma_dev_addr *addr, + u32 seq) { if (dst->dev->flags & IFF_LOOPBACK) { int ret; @@ -331,17 +502,8 @@ static int addr_resolve_neigh(struct dst_entry *dst, } /* If the device doesn't do ARP internally */ - if (!(dst->dev->flags & IFF_NOARP)) { - const struct sockaddr_in *dst_in4 = - (const struct sockaddr_in *)dst_in; - const struct sockaddr_in6 *dst_in6 = - (const struct sockaddr_in6 *)dst_in; - - return dst_fetch_ha(dst, addr, - dst_in->sa_family == AF_INET ? - (const void *)&dst_in4->sin_addr.s_addr : - (const void *)&dst_in6->sin6_addr); - } + if (!(dst->dev->flags & IFF_NOARP)) + return fetch_ha(dst, addr, dst_in, seq); return rdma_copy_addr(addr, dst->dev, NULL); } @@ -349,7 +511,8 @@ static int addr_resolve_neigh(struct dst_entry *dst, static int addr_resolve(struct sockaddr *src_in, const struct sockaddr *dst_in, struct rdma_dev_addr *addr, - bool resolve_neigh) + bool resolve_neigh, + u32 seq) { struct net_device *ndev; struct dst_entry *dst; @@ -366,7 +529,7 @@ static int addr_resolve(struct sockaddr *src_in, return ret; if (resolve_neigh) - ret = addr_resolve_neigh(&rt->dst, dst_in, addr); + ret = addr_resolve_neigh(&rt->dst, dst_in, addr, seq); ndev = rt->dst.dev; dev_hold(ndev); @@ -383,7 +546,7 @@ static int addr_resolve(struct sockaddr *src_in, return ret; if (resolve_neigh) - ret = addr_resolve_neigh(dst, dst_in, addr); + ret = addr_resolve_neigh(dst, dst_in, addr, seq); ndev = dst->dev; dev_hold(ndev); @@ -412,7 +575,7 @@ static void process_req(struct work_struct *work) src_in = (struct sockaddr *) &req->src_addr; dst_in = (struct sockaddr *) &req->dst_addr; req->status = addr_resolve(src_in, dst_in, req->addr, - true); + true, req->seq); if (req->status && time_after_eq(jiffies, req->timeout)) req->status = -ETIMEDOUT; else if (req->status == -ENODATA) @@ -471,8 +634,9 @@ int rdma_resolve_ip(struct rdma_addr_client *client, req->context = context; req->client = client; atomic_inc(&client->refcount); + req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq); - req->status = addr_resolve(src_in, dst_in, addr, true); + req->status = addr_resolve(src_in, dst_in, addr, true, req->seq); switch (req->status) { case 0: req->timeout = jiffies; @@ -510,7 +674,7 @@ int rdma_resolve_ip_route(struct sockaddr *src_addr, src_in->sa_family = dst_addr->sa_family; } - return addr_resolve(src_in, dst_addr, addr, false); + return addr_resolve(src_in, dst_addr, addr, false, 0); } EXPORT_SYMBOL(rdma_resolve_ip_route); @@ -634,7 +798,7 @@ static struct notifier_block nb = { .notifier_call = netevent_callback }; -static int __init addr_init(void) +int addr_init(void) { addr_wq = create_singlethread_workqueue("ib_addr"); if (!addr_wq) @@ -642,15 +806,13 @@ static int __init addr_init(void) register_netevent_notifier(&nb); rdma_addr_register_client(&self); + return 0; } -static void __exit addr_cleanup(void) +void addr_cleanup(void) { rdma_addr_unregister_client(&self); unregister_netevent_notifier(&nb); destroy_workqueue(addr_wq); } - -module_init(addr_init); -module_exit(addr_cleanup); diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index eab32215756b93..19d499dcab764b 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -137,4 +137,20 @@ static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, return _upper == upper; } +int addr_init(void); +void addr_cleanup(void); + +int ib_mad_init(void); +void ib_mad_cleanup(void); + +int ib_sa_init(void); +void ib_sa_cleanup(void); + +int ib_nl_handle_resolve_resp(struct sk_buff *skb, + struct netlink_callback *cb); +int ib_nl_handle_set_timeout(struct sk_buff *skb, + struct netlink_callback *cb); +int ib_nl_handle_ip_res_resp(struct sk_buff *skb, + struct netlink_callback *cb); + #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 10979844026a01..5516fb0703442c 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -955,6 +955,29 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, } EXPORT_SYMBOL(ib_get_net_dev_by_params); +static struct ibnl_client_cbs ibnl_ls_cb_table[] = { + [RDMA_NL_LS_OP_RESOLVE] = { + .dump = ib_nl_handle_resolve_resp, + .module = THIS_MODULE }, + [RDMA_NL_LS_OP_SET_TIMEOUT] = { + .dump = ib_nl_handle_set_timeout, + .module = THIS_MODULE }, + [RDMA_NL_LS_OP_IP_RESOLVE] = { + .dump = ib_nl_handle_ip_res_resp, + .module = THIS_MODULE }, +}; + +static int ib_add_ibnl_clients(void) +{ + return ibnl_add_client(RDMA_NL_LS, ARRAY_SIZE(ibnl_ls_cb_table), + ibnl_ls_cb_table); +} + +static void ib_remove_ibnl_clients(void) +{ + ibnl_remove_client(RDMA_NL_LS); +} + static int __init ib_core_init(void) { int ret; @@ -983,10 +1006,41 @@ static int __init ib_core_init(void) goto err_sysfs; } + ret = addr_init(); + if (ret) { + pr_warn("Could't init IB address resolution\n"); + goto err_ibnl; + } + + ret = ib_mad_init(); + if (ret) { + pr_warn("Couldn't init IB MAD\n"); + goto err_addr; + } + + ret = ib_sa_init(); + if (ret) { + pr_warn("Couldn't init SA\n"); + goto err_mad; + } + + if (ib_add_ibnl_clients()) { + pr_warn("Couldn't register ibnl clients\n"); + goto err_sa; + } + ib_cache_setup(); return 0; +err_sa: + ib_sa_cleanup(); +err_mad: + ib_mad_cleanup(); +err_addr: + addr_cleanup(); +err_ibnl: + ibnl_cleanup(); err_sysfs: class_unregister(&ib_class); err_comp: @@ -999,6 +1053,10 @@ static int __init ib_core_init(void) static void __exit ib_core_cleanup(void) { ib_cache_cleanup(); + ib_remove_ibnl_clients(); + ib_sa_cleanup(); + ib_mad_cleanup(); + addr_cleanup(); ibnl_cleanup(); class_unregister(&ib_class); destroy_workqueue(ib_comp_wq); diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 9fa5bf33f5a342..82fb511112da74 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -47,11 +47,7 @@ #include "smi.h" #include "opa_smi.h" #include "agent.h" - -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_DESCRIPTION("kernel IB MAD API"); -MODULE_AUTHOR("Hal Rosenstock"); -MODULE_AUTHOR("Sean Hefty"); +#include "core_priv.h" static int mad_sendq_size = IB_MAD_QP_SEND_SIZE; static int mad_recvq_size = IB_MAD_QP_RECV_SIZE; @@ -3316,7 +3312,7 @@ static struct ib_client mad_client = { .remove = ib_mad_remove_device }; -static int __init ib_mad_init_module(void) +int ib_mad_init(void) { mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE); mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE); @@ -3334,10 +3330,7 @@ static int __init ib_mad_init_module(void) return 0; } -static void __exit ib_mad_cleanup_module(void) +void ib_mad_cleanup(void) { ib_unregister_client(&mad_client); } - -module_init(ib_mad_init_module); -module_exit(ib_mad_cleanup_module); diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index 250937cb9a1a50..a83ec28a147b48 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -93,6 +93,18 @@ enum { struct mcast_member; +/* +* There are 4 types of join states: +* FullMember, NonMember, SendOnlyNonMember, SendOnlyFullMember. +*/ +enum { + FULLMEMBER_JOIN, + NONMEMBER_JOIN, + SENDONLY_NONMEBER_JOIN, + SENDONLY_FULLMEMBER_JOIN, + NUM_JOIN_MEMBERSHIP_TYPES, +}; + struct mcast_group { struct ib_sa_mcmember_rec rec; struct rb_node node; @@ -102,7 +114,7 @@ struct mcast_group { struct list_head pending_list; struct list_head active_list; struct mcast_member *last_join; - int members[3]; + int members[NUM_JOIN_MEMBERSHIP_TYPES]; atomic_t refcount; enum mcast_group_state state; struct ib_sa_query *query; @@ -220,8 +232,9 @@ static void queue_join(struct mcast_member *member) } /* - * A multicast group has three types of members: full member, non member, and - * send only member. We need to keep track of the number of members of each + * A multicast group has four types of members: full member, non member, + * sendonly non member and sendonly full member. + * We need to keep track of the number of members of each * type based on their join state. Adjust the number of members the belong to * the specified join states. */ @@ -229,7 +242,7 @@ static void adjust_membership(struct mcast_group *group, u8 join_state, int inc) { int i; - for (i = 0; i < 3; i++, join_state >>= 1) + for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++, join_state >>= 1) if (join_state & 0x1) group->members[i] += inc; } @@ -245,7 +258,7 @@ static u8 get_leave_state(struct mcast_group *group) u8 leave_state = 0; int i; - for (i = 0; i < 3; i++) + for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++) if (!group->members[i]) leave_state |= (0x1 << i); diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 3ebd108bcc5f27..e95538650dc6fd 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -53,10 +53,6 @@ #include "sa.h" #include "core_priv.h" -MODULE_AUTHOR("Roland Dreier"); -MODULE_DESCRIPTION("InfiniBand subnet administration query support"); -MODULE_LICENSE("Dual BSD/GPL"); - #define IB_SA_LOCAL_SVC_TIMEOUT_MIN 100 #define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT 2000 #define IB_SA_LOCAL_SVC_TIMEOUT_MAX 200000 @@ -119,6 +115,12 @@ struct ib_sa_guidinfo_query { struct ib_sa_query sa_query; }; +struct ib_sa_classport_info_query { + void (*callback)(int, struct ib_class_port_info *, void *); + void *context; + struct ib_sa_query sa_query; +}; + struct ib_sa_mcmember_query { void (*callback)(int, struct ib_sa_mcmember_rec *, void *); void *context; @@ -392,6 +394,82 @@ static const struct ib_field service_rec_table[] = { .size_bits = 2*64 }, }; +#define CLASSPORTINFO_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct ib_class_port_info, field), \ + .struct_size_bytes = sizeof((struct ib_class_port_info *)0)->field, \ + .field_name = "ib_class_port_info:" #field + +static const struct ib_field classport_info_rec_table[] = { + { CLASSPORTINFO_REC_FIELD(base_version), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 8 }, + { CLASSPORTINFO_REC_FIELD(class_version), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 8 }, + { CLASSPORTINFO_REC_FIELD(capability_mask), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { CLASSPORTINFO_REC_FIELD(cap_mask2_resp_time), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(redirect_gid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { CLASSPORTINFO_REC_FIELD(redirect_tcslfl), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(redirect_lid), + .offset_words = 7, + .offset_bits = 0, + .size_bits = 16 }, + { CLASSPORTINFO_REC_FIELD(redirect_pkey), + .offset_words = 7, + .offset_bits = 16, + .size_bits = 16 }, + + { CLASSPORTINFO_REC_FIELD(redirect_qp), + .offset_words = 8, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(redirect_qkey), + .offset_words = 9, + .offset_bits = 0, + .size_bits = 32 }, + + { CLASSPORTINFO_REC_FIELD(trap_gid), + .offset_words = 10, + .offset_bits = 0, + .size_bits = 128 }, + { CLASSPORTINFO_REC_FIELD(trap_tcslfl), + .offset_words = 14, + .offset_bits = 0, + .size_bits = 32 }, + + { CLASSPORTINFO_REC_FIELD(trap_lid), + .offset_words = 15, + .offset_bits = 0, + .size_bits = 16 }, + { CLASSPORTINFO_REC_FIELD(trap_pkey), + .offset_words = 15, + .offset_bits = 16, + .size_bits = 16 }, + + { CLASSPORTINFO_REC_FIELD(trap_hlqp), + .offset_words = 16, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(trap_qkey), + .offset_words = 17, + .offset_bits = 0, + .size_bits = 32 }, +}; + #define GUIDINFO_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \ .struct_size_bytes = sizeof((struct ib_sa_guidinfo_rec *) 0)->field, \ @@ -705,8 +783,8 @@ static void ib_nl_request_timeout(struct work_struct *work) spin_unlock_irqrestore(&ib_nl_request_lock, flags); } -static int ib_nl_handle_set_timeout(struct sk_buff *skb, - struct netlink_callback *cb) +int ib_nl_handle_set_timeout(struct sk_buff *skb, + struct netlink_callback *cb) { const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; int timeout, delta, abs_delta; @@ -782,8 +860,8 @@ static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh) return 1; } -static int ib_nl_handle_resolve_resp(struct sk_buff *skb, - struct netlink_callback *cb) +int ib_nl_handle_resolve_resp(struct sk_buff *skb, + struct netlink_callback *cb) { const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; unsigned long flags; @@ -838,15 +916,6 @@ static int ib_nl_handle_resolve_resp(struct sk_buff *skb, return skb->len; } -static struct ibnl_client_cbs ib_sa_cb_table[] = { - [RDMA_NL_LS_OP_RESOLVE] = { - .dump = ib_nl_handle_resolve_resp, - .module = THIS_MODULE }, - [RDMA_NL_LS_OP_SET_TIMEOUT] = { - .dump = ib_nl_handle_set_timeout, - .module = THIS_MODULE }, -}; - static void free_sm_ah(struct kref *kref) { struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref); @@ -1645,6 +1714,97 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, } EXPORT_SYMBOL(ib_sa_guid_info_rec_query); +/* Support get SA ClassPortInfo */ +static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query, + int status, + struct ib_sa_mad *mad) +{ + struct ib_sa_classport_info_query *query = + container_of(sa_query, struct ib_sa_classport_info_query, sa_query); + + if (mad) { + struct ib_class_port_info rec; + + ib_unpack(classport_info_rec_table, + ARRAY_SIZE(classport_info_rec_table), + mad->data, &rec); + query->callback(status, &rec, query->context); + } else { + query->callback(status, NULL, query->context); + } +} + +static void ib_sa_portclass_info_rec_release(struct ib_sa_query *sa_query) +{ + kfree(container_of(sa_query, struct ib_sa_classport_info_query, + sa_query)); +} + +int ib_sa_classport_info_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_class_port_info *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) +{ + struct ib_sa_classport_info_query *query; + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + struct ib_mad_agent *agent; + struct ib_sa_mad *mad; + int ret; + + if (!sa_dev) + return -ENODEV; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + agent = port->agent; + + query = kzalloc(sizeof(*query), gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err1; + + ib_sa_client_get(client); + query->sa_query.client = client; + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(mad, agent); + + query->sa_query.callback = callback ? ib_sa_classport_info_rec_callback : NULL; + + query->sa_query.release = ib_sa_portclass_info_rec_release; + /* support GET only */ + mad->mad_hdr.method = IB_MGMT_METHOD_GET; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_CLASS_PORTINFO); + mad->sa_hdr.comp_mask = 0; + *sa_query = &query->sa_query; + + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + if (ret < 0) + goto err2; + + return ret; + +err2: + *sa_query = NULL; + ib_sa_client_put(query->sa_query.client); + free_mad(&query->sa_query); + +err1: + kfree(query); + return ret; +} +EXPORT_SYMBOL(ib_sa_classport_info_rec_query); + static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *mad_send_wc) { @@ -1794,7 +1954,7 @@ static void ib_sa_remove_one(struct ib_device *device, void *client_data) kfree(sa_dev); } -static int __init ib_sa_init(void) +int ib_sa_init(void) { int ret; @@ -1820,17 +1980,10 @@ static int __init ib_sa_init(void) goto err3; } - if (ibnl_add_client(RDMA_NL_LS, ARRAY_SIZE(ib_sa_cb_table), - ib_sa_cb_table)) { - pr_err("Failed to add netlink callback\n"); - ret = -EINVAL; - goto err4; - } INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout); return 0; -err4: - destroy_workqueue(ib_nl_wq); + err3: mcast_cleanup(); err2: @@ -1839,9 +1992,8 @@ static int __init ib_sa_init(void) return ret; } -static void __exit ib_sa_cleanup(void) +void ib_sa_cleanup(void) { - ibnl_remove_client(RDMA_NL_LS); cancel_delayed_work(&ib_nl_timed_work); flush_workqueue(ib_nl_wq); destroy_workqueue(ib_nl_wq); @@ -1849,6 +2001,3 @@ static void __exit ib_sa_cleanup(void) ib_unregister_client(&sa_client); idr_destroy(&query_idr); } - -module_init(ib_sa_init); -module_exit(ib_sa_cleanup); diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 14606afbfaa8d6..5e573bb18660d6 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -56,8 +56,10 @@ struct ib_port { struct gid_attr_group *gid_attr_group; struct attribute_group gid_group; struct attribute_group pkey_group; - u8 port_num; struct attribute_group *pma_table; + struct attribute_group *hw_stats_ag; + struct rdma_hw_stats *hw_stats; + u8 port_num; }; struct port_attribute { @@ -80,6 +82,18 @@ struct port_table_attribute { __be16 attr_id; }; +struct hw_stats_attribute { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, + struct attribute *attr, char *buf); + ssize_t (*store)(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t count); + int index; + u8 port_num; +}; + static ssize_t port_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -733,6 +747,212 @@ static struct attribute_group *get_counter_table(struct ib_device *dev, return &pma_group; } +static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats, + u8 port_num, int index) +{ + int ret; + + if (time_is_after_eq_jiffies(stats->timestamp + stats->lifespan)) + return 0; + ret = dev->get_hw_stats(dev, stats, port_num, index); + if (ret < 0) + return ret; + if (ret == stats->num_counters) + stats->timestamp = jiffies; + + return 0; +} + +static ssize_t print_hw_stat(struct rdma_hw_stats *stats, int index, char *buf) +{ + return sprintf(buf, "%llu\n", stats->value[index]); +} + +static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ib_device *dev; + struct ib_port *port; + struct hw_stats_attribute *hsa; + struct rdma_hw_stats *stats; + int ret; + + hsa = container_of(attr, struct hw_stats_attribute, attr); + if (!hsa->port_num) { + dev = container_of((struct device *)kobj, + struct ib_device, dev); + stats = dev->hw_stats; + } else { + port = container_of(kobj, struct ib_port, kobj); + dev = port->ibdev; + stats = port->hw_stats; + } + ret = update_hw_stats(dev, stats, hsa->port_num, hsa->index); + if (ret) + return ret; + return print_hw_stat(stats, hsa->index, buf); +} + +static ssize_t show_stats_lifespan(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct hw_stats_attribute *hsa; + int msecs; + + hsa = container_of(attr, struct hw_stats_attribute, attr); + if (!hsa->port_num) { + struct ib_device *dev = container_of((struct device *)kobj, + struct ib_device, dev); + msecs = jiffies_to_msecs(dev->hw_stats->lifespan); + } else { + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + msecs = jiffies_to_msecs(p->hw_stats->lifespan); + } + return sprintf(buf, "%d\n", msecs); +} + +static ssize_t set_stats_lifespan(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t count) +{ + struct hw_stats_attribute *hsa; + int msecs; + int jiffies; + int ret; + + ret = kstrtoint(buf, 10, &msecs); + if (ret) + return ret; + if (msecs < 0 || msecs > 10000) + return -EINVAL; + jiffies = msecs_to_jiffies(msecs); + hsa = container_of(attr, struct hw_stats_attribute, attr); + if (!hsa->port_num) { + struct ib_device *dev = container_of((struct device *)kobj, + struct ib_device, dev); + dev->hw_stats->lifespan = jiffies; + } else { + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + p->hw_stats->lifespan = jiffies; + } + return count; +} + +static void free_hsag(struct kobject *kobj, struct attribute_group *attr_group) +{ + struct attribute **attr; + + sysfs_remove_group(kobj, attr_group); + + for (attr = attr_group->attrs; *attr; attr++) + kfree(*attr); + kfree(attr_group); +} + +static struct attribute *alloc_hsa(int index, u8 port_num, const char *name) +{ + struct hw_stats_attribute *hsa; + + hsa = kmalloc(sizeof(*hsa), GFP_KERNEL); + if (!hsa) + return NULL; + + hsa->attr.name = (char *)name; + hsa->attr.mode = S_IRUGO; + hsa->show = show_hw_stats; + hsa->store = NULL; + hsa->index = index; + hsa->port_num = port_num; + + return &hsa->attr; +} + +static struct attribute *alloc_hsa_lifespan(char *name, u8 port_num) +{ + struct hw_stats_attribute *hsa; + + hsa = kmalloc(sizeof(*hsa), GFP_KERNEL); + if (!hsa) + return NULL; + + hsa->attr.name = name; + hsa->attr.mode = S_IWUSR | S_IRUGO; + hsa->show = show_stats_lifespan; + hsa->store = set_stats_lifespan; + hsa->index = 0; + hsa->port_num = port_num; + + return &hsa->attr; +} + +static void setup_hw_stats(struct ib_device *device, struct ib_port *port, + u8 port_num) +{ + struct attribute_group *hsag = NULL; + struct rdma_hw_stats *stats; + int i = 0, ret; + + stats = device->alloc_hw_stats(device, port_num); + + if (!stats) + return; + + if (!stats->names || stats->num_counters <= 0) + goto err; + + hsag = kzalloc(sizeof(*hsag) + + // 1 extra for the lifespan config entry + sizeof(void *) * (stats->num_counters + 1), + GFP_KERNEL); + if (!hsag) + return; + + ret = device->get_hw_stats(device, stats, port_num, + stats->num_counters); + if (ret != stats->num_counters) + goto err; + + stats->timestamp = jiffies; + + hsag->name = "hw_counters"; + hsag->attrs = (void *)hsag + sizeof(*hsag); + + for (i = 0; i < stats->num_counters; i++) { + hsag->attrs[i] = alloc_hsa(i, port_num, stats->names[i]); + if (!hsag->attrs[i]) + goto err; + } + + /* treat an error here as non-fatal */ + hsag->attrs[i] = alloc_hsa_lifespan("lifespan", port_num); + + if (port) { + struct kobject *kobj = &port->kobj; + ret = sysfs_create_group(kobj, hsag); + if (ret) + goto err; + port->hw_stats_ag = hsag; + port->hw_stats = stats; + } else { + struct kobject *kobj = &device->dev.kobj; + ret = sysfs_create_group(kobj, hsag); + if (ret) + goto err; + device->hw_stats_ag = hsag; + device->hw_stats = stats; + } + + return; + +err: + kfree(stats); + for (; i >= 0; i--) + kfree(hsag->attrs[i]); + kfree(hsag); + return; +} + static int add_port(struct ib_device *device, int port_num, int (*port_callback)(struct ib_device *, u8, struct kobject *)) @@ -835,6 +1055,14 @@ static int add_port(struct ib_device *device, int port_num, goto err_remove_pkey; } + /* + * If port == 0, it means we have only one port and the parent + * device, not this port device, should be the holder of the + * hw_counters + */ + if (device->alloc_hw_stats && port_num) + setup_hw_stats(device, p, port_num); + list_add_tail(&p->kobj.entry, &device->port_list); kobject_uevent(&p->kobj, KOBJ_ADD); @@ -972,120 +1200,6 @@ static struct device_attribute *ib_class_attributes[] = { &dev_attr_node_desc }; -/* Show a given an attribute in the statistics group */ -static ssize_t show_protocol_stat(const struct device *device, - struct device_attribute *attr, char *buf, - unsigned offset) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - union rdma_protocol_stats stats; - ssize_t ret; - - ret = dev->get_protocol_stats(dev, &stats); - if (ret) - return ret; - - return sprintf(buf, "%llu\n", - (unsigned long long) ((u64 *) &stats)[offset]); -} - -/* generate a read-only iwarp statistics attribute */ -#define IW_STATS_ENTRY(name) \ -static ssize_t show_##name(struct device *device, \ - struct device_attribute *attr, char *buf) \ -{ \ - return show_protocol_stat(device, attr, buf, \ - offsetof(struct iw_protocol_stats, name) / \ - sizeof (u64)); \ -} \ -static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) - -IW_STATS_ENTRY(ipInReceives); -IW_STATS_ENTRY(ipInHdrErrors); -IW_STATS_ENTRY(ipInTooBigErrors); -IW_STATS_ENTRY(ipInNoRoutes); -IW_STATS_ENTRY(ipInAddrErrors); -IW_STATS_ENTRY(ipInUnknownProtos); -IW_STATS_ENTRY(ipInTruncatedPkts); -IW_STATS_ENTRY(ipInDiscards); -IW_STATS_ENTRY(ipInDelivers); -IW_STATS_ENTRY(ipOutForwDatagrams); -IW_STATS_ENTRY(ipOutRequests); -IW_STATS_ENTRY(ipOutDiscards); -IW_STATS_ENTRY(ipOutNoRoutes); -IW_STATS_ENTRY(ipReasmTimeout); -IW_STATS_ENTRY(ipReasmReqds); -IW_STATS_ENTRY(ipReasmOKs); -IW_STATS_ENTRY(ipReasmFails); -IW_STATS_ENTRY(ipFragOKs); -IW_STATS_ENTRY(ipFragFails); -IW_STATS_ENTRY(ipFragCreates); -IW_STATS_ENTRY(ipInMcastPkts); -IW_STATS_ENTRY(ipOutMcastPkts); -IW_STATS_ENTRY(ipInBcastPkts); -IW_STATS_ENTRY(ipOutBcastPkts); -IW_STATS_ENTRY(tcpRtoAlgorithm); -IW_STATS_ENTRY(tcpRtoMin); -IW_STATS_ENTRY(tcpRtoMax); -IW_STATS_ENTRY(tcpMaxConn); -IW_STATS_ENTRY(tcpActiveOpens); -IW_STATS_ENTRY(tcpPassiveOpens); -IW_STATS_ENTRY(tcpAttemptFails); -IW_STATS_ENTRY(tcpEstabResets); -IW_STATS_ENTRY(tcpCurrEstab); -IW_STATS_ENTRY(tcpInSegs); -IW_STATS_ENTRY(tcpOutSegs); -IW_STATS_ENTRY(tcpRetransSegs); -IW_STATS_ENTRY(tcpInErrs); -IW_STATS_ENTRY(tcpOutRsts); - -static struct attribute *iw_proto_stats_attrs[] = { - &dev_attr_ipInReceives.attr, - &dev_attr_ipInHdrErrors.attr, - &dev_attr_ipInTooBigErrors.attr, - &dev_attr_ipInNoRoutes.attr, - &dev_attr_ipInAddrErrors.attr, - &dev_attr_ipInUnknownProtos.attr, - &dev_attr_ipInTruncatedPkts.attr, - &dev_attr_ipInDiscards.attr, - &dev_attr_ipInDelivers.attr, - &dev_attr_ipOutForwDatagrams.attr, - &dev_attr_ipOutRequests.attr, - &dev_attr_ipOutDiscards.attr, - &dev_attr_ipOutNoRoutes.attr, - &dev_attr_ipReasmTimeout.attr, - &dev_attr_ipReasmReqds.attr, - &dev_attr_ipReasmOKs.attr, - &dev_attr_ipReasmFails.attr, - &dev_attr_ipFragOKs.attr, - &dev_attr_ipFragFails.attr, - &dev_attr_ipFragCreates.attr, - &dev_attr_ipInMcastPkts.attr, - &dev_attr_ipOutMcastPkts.attr, - &dev_attr_ipInBcastPkts.attr, - &dev_attr_ipOutBcastPkts.attr, - &dev_attr_tcpRtoAlgorithm.attr, - &dev_attr_tcpRtoMin.attr, - &dev_attr_tcpRtoMax.attr, - &dev_attr_tcpMaxConn.attr, - &dev_attr_tcpActiveOpens.attr, - &dev_attr_tcpPassiveOpens.attr, - &dev_attr_tcpAttemptFails.attr, - &dev_attr_tcpEstabResets.attr, - &dev_attr_tcpCurrEstab.attr, - &dev_attr_tcpInSegs.attr, - &dev_attr_tcpOutSegs.attr, - &dev_attr_tcpRetransSegs.attr, - &dev_attr_tcpInErrs.attr, - &dev_attr_tcpOutRsts.attr, - NULL -}; - -static struct attribute_group iw_stats_group = { - .name = "proto_stats", - .attrs = iw_proto_stats_attrs, -}; - static void free_port_list_attributes(struct ib_device *device) { struct kobject *p, *t; @@ -1093,6 +1207,10 @@ static void free_port_list_attributes(struct ib_device *device) list_for_each_entry_safe(p, t, &device->port_list, entry) { struct ib_port *port = container_of(p, struct ib_port, kobj); list_del(&p->entry); + if (port->hw_stats) { + kfree(port->hw_stats); + free_hsag(&port->kobj, port->hw_stats_ag); + } sysfs_remove_group(p, port->pma_table); sysfs_remove_group(p, &port->pkey_group); sysfs_remove_group(p, &port->gid_group); @@ -1149,11 +1267,8 @@ int ib_device_register_sysfs(struct ib_device *device, } } - if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) { - ret = sysfs_create_group(&class_dev->kobj, &iw_stats_group); - if (ret) - goto err_put; - } + if (device->alloc_hw_stats) + setup_hw_stats(device, NULL, 0); return 0; @@ -1169,15 +1284,18 @@ int ib_device_register_sysfs(struct ib_device *device, void ib_device_unregister_sysfs(struct ib_device *device) { - /* Hold kobject until ib_dealloc_device() */ - struct kobject *kobj_dev = kobject_get(&device->dev.kobj); int i; - if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) - sysfs_remove_group(kobj_dev, &iw_stats_group); + /* Hold kobject until ib_dealloc_device() */ + kobject_get(&device->dev.kobj); free_port_list_attributes(device); + if (device->hw_stats) { + kfree(device->hw_stats); + free_hsag(&device->dev.kobj, device->hw_stats_ag); + } + for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) device_remove_file(&device->dev, ib_class_attributes[i]); diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile index c7ad0a4c8b1504..c0c7cf8af3f4cc 100644 --- a/drivers/infiniband/hw/Makefile +++ b/drivers/infiniband/hw/Makefile @@ -8,3 +8,4 @@ obj-$(CONFIG_MLX5_INFINIBAND) += mlx5/ obj-$(CONFIG_INFINIBAND_NES) += nes/ obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/ obj-$(CONFIG_INFINIBAND_USNIC) += usnic/ +obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/ diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c index de1c61b417d6ae..ada2e5009c8699 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.c +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -327,7 +327,7 @@ int cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) kfree(cq->sw_queue); dma_free_coherent(&(rdev_p->rnic_info.pdev->dev), (1UL << (cq->size_log2)) - * sizeof(struct t3_cqe), cq->queue, + * sizeof(struct t3_cqe) + 1, cq->queue, dma_unmap_addr(cq, mapping)); cxio_hal_put_cqid(rdev_p->rscp, cq->cqid); return err; diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 47cb927a0dd665..bb1a839d4d6d43 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1218,59 +1218,119 @@ static ssize_t show_board(struct device *dev, struct device_attribute *attr, iwch_dev->rdev.rnic_info.pdev->device); } -static int iwch_get_mib(struct ib_device *ibdev, - union rdma_protocol_stats *stats) +enum counters { + IPINRECEIVES, + IPINHDRERRORS, + IPINADDRERRORS, + IPINUNKNOWNPROTOS, + IPINDISCARDS, + IPINDELIVERS, + IPOUTREQUESTS, + IPOUTDISCARDS, + IPOUTNOROUTES, + IPREASMTIMEOUT, + IPREASMREQDS, + IPREASMOKS, + IPREASMFAILS, + TCPACTIVEOPENS, + TCPPASSIVEOPENS, + TCPATTEMPTFAILS, + TCPESTABRESETS, + TCPCURRESTAB, + TCPINSEGS, + TCPOUTSEGS, + TCPRETRANSSEGS, + TCPINERRS, + TCPOUTRSTS, + TCPRTOMIN, + TCPRTOMAX, + NR_COUNTERS +}; + +static const char * const names[] = { + [IPINRECEIVES] = "ipInReceives", + [IPINHDRERRORS] = "ipInHdrErrors", + [IPINADDRERRORS] = "ipInAddrErrors", + [IPINUNKNOWNPROTOS] = "ipInUnknownProtos", + [IPINDISCARDS] = "ipInDiscards", + [IPINDELIVERS] = "ipInDelivers", + [IPOUTREQUESTS] = "ipOutRequests", + [IPOUTDISCARDS] = "ipOutDiscards", + [IPOUTNOROUTES] = "ipOutNoRoutes", + [IPREASMTIMEOUT] = "ipReasmTimeout", + [IPREASMREQDS] = "ipReasmReqds", + [IPREASMOKS] = "ipReasmOKs", + [IPREASMFAILS] = "ipReasmFails", + [TCPACTIVEOPENS] = "tcpActiveOpens", + [TCPPASSIVEOPENS] = "tcpPassiveOpens", + [TCPATTEMPTFAILS] = "tcpAttemptFails", + [TCPESTABRESETS] = "tcpEstabResets", + [TCPCURRESTAB] = "tcpCurrEstab", + [TCPINSEGS] = "tcpInSegs", + [TCPOUTSEGS] = "tcpOutSegs", + [TCPRETRANSSEGS] = "tcpRetransSegs", + [TCPINERRS] = "tcpInErrs", + [TCPOUTRSTS] = "tcpOutRsts", + [TCPRTOMIN] = "tcpRtoMin", + [TCPRTOMAX] = "tcpRtoMax", +}; + +static struct rdma_hw_stats *iwch_alloc_stats(struct ib_device *ibdev, + u8 port_num) +{ + BUILD_BUG_ON(ARRAY_SIZE(names) != NR_COUNTERS); + + /* Our driver only supports device level stats */ + if (port_num != 0) + return NULL; + + return rdma_alloc_hw_stats_struct(names, NR_COUNTERS, + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +static int iwch_get_mib(struct ib_device *ibdev, struct rdma_hw_stats *stats, + u8 port, int index) { struct iwch_dev *dev; struct tp_mib_stats m; int ret; + if (port != 0 || !stats) + return -ENOSYS; + PDBG("%s ibdev %p\n", __func__, ibdev); dev = to_iwch_dev(ibdev); ret = dev->rdev.t3cdev_p->ctl(dev->rdev.t3cdev_p, RDMA_GET_MIB, &m); if (ret) return -ENOSYS; - memset(stats, 0, sizeof *stats); - stats->iw.ipInReceives = ((u64) m.ipInReceive_hi << 32) + - m.ipInReceive_lo; - stats->iw.ipInHdrErrors = ((u64) m.ipInHdrErrors_hi << 32) + - m.ipInHdrErrors_lo; - stats->iw.ipInAddrErrors = ((u64) m.ipInAddrErrors_hi << 32) + - m.ipInAddrErrors_lo; - stats->iw.ipInUnknownProtos = ((u64) m.ipInUnknownProtos_hi << 32) + - m.ipInUnknownProtos_lo; - stats->iw.ipInDiscards = ((u64) m.ipInDiscards_hi << 32) + - m.ipInDiscards_lo; - stats->iw.ipInDelivers = ((u64) m.ipInDelivers_hi << 32) + - m.ipInDelivers_lo; - stats->iw.ipOutRequests = ((u64) m.ipOutRequests_hi << 32) + - m.ipOutRequests_lo; - stats->iw.ipOutDiscards = ((u64) m.ipOutDiscards_hi << 32) + - m.ipOutDiscards_lo; - stats->iw.ipOutNoRoutes = ((u64) m.ipOutNoRoutes_hi << 32) + - m.ipOutNoRoutes_lo; - stats->iw.ipReasmTimeout = (u64) m.ipReasmTimeout; - stats->iw.ipReasmReqds = (u64) m.ipReasmReqds; - stats->iw.ipReasmOKs = (u64) m.ipReasmOKs; - stats->iw.ipReasmFails = (u64) m.ipReasmFails; - stats->iw.tcpActiveOpens = (u64) m.tcpActiveOpens; - stats->iw.tcpPassiveOpens = (u64) m.tcpPassiveOpens; - stats->iw.tcpAttemptFails = (u64) m.tcpAttemptFails; - stats->iw.tcpEstabResets = (u64) m.tcpEstabResets; - stats->iw.tcpOutRsts = (u64) m.tcpOutRsts; - stats->iw.tcpCurrEstab = (u64) m.tcpCurrEstab; - stats->iw.tcpInSegs = ((u64) m.tcpInSegs_hi << 32) + - m.tcpInSegs_lo; - stats->iw.tcpOutSegs = ((u64) m.tcpOutSegs_hi << 32) + - m.tcpOutSegs_lo; - stats->iw.tcpRetransSegs = ((u64) m.tcpRetransSeg_hi << 32) + - m.tcpRetransSeg_lo; - stats->iw.tcpInErrs = ((u64) m.tcpInErrs_hi << 32) + - m.tcpInErrs_lo; - stats->iw.tcpRtoMin = (u64) m.tcpRtoMin; - stats->iw.tcpRtoMax = (u64) m.tcpRtoMax; - return 0; + stats->value[IPINRECEIVES] = ((u64)m.ipInReceive_hi << 32) + m.ipInReceive_lo; + stats->value[IPINHDRERRORS] = ((u64)m.ipInHdrErrors_hi << 32) + m.ipInHdrErrors_lo; + stats->value[IPINADDRERRORS] = ((u64)m.ipInAddrErrors_hi << 32) + m.ipInAddrErrors_lo; + stats->value[IPINUNKNOWNPROTOS] = ((u64)m.ipInUnknownProtos_hi << 32) + m.ipInUnknownProtos_lo; + stats->value[IPINDISCARDS] = ((u64)m.ipInDiscards_hi << 32) + m.ipInDiscards_lo; + stats->value[IPINDELIVERS] = ((u64)m.ipInDelivers_hi << 32) + m.ipInDelivers_lo; + stats->value[IPOUTREQUESTS] = ((u64)m.ipOutRequests_hi << 32) + m.ipOutRequests_lo; + stats->value[IPOUTDISCARDS] = ((u64)m.ipOutDiscards_hi << 32) + m.ipOutDiscards_lo; + stats->value[IPOUTNOROUTES] = ((u64)m.ipOutNoRoutes_hi << 32) + m.ipOutNoRoutes_lo; + stats->value[IPREASMTIMEOUT] = m.ipReasmTimeout; + stats->value[IPREASMREQDS] = m.ipReasmReqds; + stats->value[IPREASMOKS] = m.ipReasmOKs; + stats->value[IPREASMFAILS] = m.ipReasmFails; + stats->value[TCPACTIVEOPENS] = m.tcpActiveOpens; + stats->value[TCPPASSIVEOPENS] = m.tcpPassiveOpens; + stats->value[TCPATTEMPTFAILS] = m.tcpAttemptFails; + stats->value[TCPESTABRESETS] = m.tcpEstabResets; + stats->value[TCPCURRESTAB] = m.tcpOutRsts; + stats->value[TCPINSEGS] = m.tcpCurrEstab; + stats->value[TCPOUTSEGS] = ((u64)m.tcpInSegs_hi << 32) + m.tcpInSegs_lo; + stats->value[TCPRETRANSSEGS] = ((u64)m.tcpOutSegs_hi << 32) + m.tcpOutSegs_lo; + stats->value[TCPINERRS] = ((u64)m.tcpRetransSeg_hi << 32) + m.tcpRetransSeg_lo, + stats->value[TCPOUTRSTS] = ((u64)m.tcpInErrs_hi << 32) + m.tcpInErrs_lo; + stats->value[TCPRTOMIN] = m.tcpRtoMin; + stats->value[TCPRTOMAX] = m.tcpRtoMax; + + return stats->num_counters; } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); @@ -1373,7 +1433,8 @@ int iwch_register_device(struct iwch_dev *dev) dev->ibdev.req_notify_cq = iwch_arm_cq; dev->ibdev.post_send = iwch_post_send; dev->ibdev.post_recv = iwch_post_receive; - dev->ibdev.get_protocol_stats = iwch_get_mib; + dev->ibdev.alloc_hw_stats = iwch_alloc_stats; + dev->ibdev.get_hw_stats = iwch_get_mib; dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION; dev->ibdev.get_port_immutable = iwch_port_immutable; diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 7574f394fdac89..dd8a86b726d2a7 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -446,20 +446,59 @@ static ssize_t show_board(struct device *dev, struct device_attribute *attr, c4iw_dev->rdev.lldi.pdev->device); } +enum counters { + IP4INSEGS, + IP4OUTSEGS, + IP4RETRANSSEGS, + IP4OUTRSTS, + IP6INSEGS, + IP6OUTSEGS, + IP6RETRANSSEGS, + IP6OUTRSTS, + NR_COUNTERS +}; + +static const char * const names[] = { + [IP4INSEGS] = "ip4InSegs", + [IP4OUTSEGS] = "ip4OutSegs", + [IP4RETRANSSEGS] = "ip4RetransSegs", + [IP4OUTRSTS] = "ip4OutRsts", + [IP6INSEGS] = "ip6InSegs", + [IP6OUTSEGS] = "ip6OutSegs", + [IP6RETRANSSEGS] = "ip6RetransSegs", + [IP6OUTRSTS] = "ip6OutRsts" +}; + +static struct rdma_hw_stats *c4iw_alloc_stats(struct ib_device *ibdev, + u8 port_num) +{ + BUILD_BUG_ON(ARRAY_SIZE(names) != NR_COUNTERS); + + if (port_num != 0) + return NULL; + + return rdma_alloc_hw_stats_struct(names, NR_COUNTERS, + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + static int c4iw_get_mib(struct ib_device *ibdev, - union rdma_protocol_stats *stats) + struct rdma_hw_stats *stats, + u8 port, int index) { struct tp_tcp_stats v4, v6; struct c4iw_dev *c4iw_dev = to_c4iw_dev(ibdev); cxgb4_get_tcp_stats(c4iw_dev->rdev.lldi.pdev, &v4, &v6); - memset(stats, 0, sizeof *stats); - stats->iw.tcpInSegs = v4.tcp_in_segs + v6.tcp_in_segs; - stats->iw.tcpOutSegs = v4.tcp_out_segs + v6.tcp_out_segs; - stats->iw.tcpRetransSegs = v4.tcp_retrans_segs + v6.tcp_retrans_segs; - stats->iw.tcpOutRsts = v4.tcp_out_rsts + v6.tcp_out_rsts; - - return 0; + stats->value[IP4INSEGS] = v4.tcp_in_segs; + stats->value[IP4OUTSEGS] = v4.tcp_out_segs; + stats->value[IP4RETRANSSEGS] = v4.tcp_retrans_segs; + stats->value[IP4OUTRSTS] = v4.tcp_out_rsts; + stats->value[IP6INSEGS] = v6.tcp_in_segs; + stats->value[IP6OUTSEGS] = v6.tcp_out_segs; + stats->value[IP6RETRANSSEGS] = v6.tcp_retrans_segs; + stats->value[IP6OUTRSTS] = v6.tcp_out_rsts; + + return stats->num_counters; } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); @@ -562,7 +601,8 @@ int c4iw_register_device(struct c4iw_dev *dev) dev->ibdev.req_notify_cq = c4iw_arm_cq; dev->ibdev.post_send = c4iw_post_send; dev->ibdev.post_recv = c4iw_post_receive; - dev->ibdev.get_protocol_stats = c4iw_get_mib; + dev->ibdev.alloc_hw_stats = c4iw_alloc_stats; + dev->ibdev.get_hw_stats = c4iw_get_mib; dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION; dev->ibdev.get_port_immutable = c4iw_port_immutable; dev->ibdev.drain_sq = c4iw_drain_sq; diff --git a/drivers/staging/rdma/hfi1/Kconfig b/drivers/infiniband/hw/hfi1/Kconfig similarity index 100% rename from drivers/staging/rdma/hfi1/Kconfig rename to drivers/infiniband/hw/hfi1/Kconfig diff --git a/drivers/staging/rdma/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile similarity index 88% rename from drivers/staging/rdma/hfi1/Makefile rename to drivers/infiniband/hw/hfi1/Makefile index 8dc59382ee96cd..9b5382c94b0c86 100644 --- a/drivers/staging/rdma/hfi1/Makefile +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -7,7 +7,7 @@ # obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o -hfi1-y := affinity.o chip.o device.o diag.o driver.o efivar.o \ +hfi1-y := affinity.o chip.o device.o driver.o efivar.o \ eprom.o file_ops.o firmware.o \ init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \ qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o twsi.o \ diff --git a/drivers/staging/rdma/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c similarity index 100% rename from drivers/staging/rdma/hfi1/affinity.c rename to drivers/infiniband/hw/hfi1/affinity.c diff --git a/drivers/staging/rdma/hfi1/affinity.h b/drivers/infiniband/hw/hfi1/affinity.h similarity index 100% rename from drivers/staging/rdma/hfi1/affinity.h rename to drivers/infiniband/hw/hfi1/affinity.h diff --git a/drivers/staging/rdma/hfi1/aspm.h b/drivers/infiniband/hw/hfi1/aspm.h similarity index 100% rename from drivers/staging/rdma/hfi1/aspm.h rename to drivers/infiniband/hw/hfi1/aspm.h diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c similarity index 99% rename from drivers/staging/rdma/hfi1/chip.c rename to drivers/infiniband/hw/hfi1/chip.c index dcae8e723f9892..3b876da745a1dc 100644 --- a/drivers/staging/rdma/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -1037,6 +1037,7 @@ static void dc_shutdown(struct hfi1_devdata *); static void dc_start(struct hfi1_devdata *); static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp, unsigned int *np); +static void remove_full_mgmt_pkey(struct hfi1_pportdata *ppd); /* * Error interrupt table entry. This is used as input to the interrupt @@ -6105,7 +6106,7 @@ int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok) } /* this access is valid only when the link is up */ - if ((ppd->host_link_state & HLS_UP) == 0) { + if (ppd->host_link_state & HLS_DOWN) { dd_dev_info(dd, "%s: link state %s not up\n", __func__, link_state_name(ppd->host_link_state)); ret = -EBUSY; @@ -6961,6 +6962,8 @@ void handle_link_down(struct work_struct *work) } reset_neighbor_info(ppd); + if (ppd->mgmt_allowed) + remove_full_mgmt_pkey(ppd); /* disable the port */ clear_rcvctrl(ppd->dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); @@ -7069,6 +7072,12 @@ static void add_full_mgmt_pkey(struct hfi1_pportdata *ppd) (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0); } +static void remove_full_mgmt_pkey(struct hfi1_pportdata *ppd) +{ + ppd->pkeys[2] = 0; + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0); +} + /* * Convert the given link width to the OPA link width bitmask. */ @@ -7429,7 +7438,7 @@ void apply_link_downgrade_policy(struct hfi1_pportdata *ppd, int refresh_widths) retry: mutex_lock(&ppd->hls_lock); /* only apply if the link is up */ - if (!(ppd->host_link_state & HLS_UP)) { + if (ppd->host_link_state & HLS_DOWN) { /* still going up..wait and retry */ if (ppd->host_link_state & HLS_GOING_UP) { if (++tries < 1000) { @@ -9212,9 +9221,6 @@ void reset_qsfp(struct hfi1_pportdata *ppd) /* Reset the QSFP */ mask = (u64)QSFP_HFI0_RESET_N; - qsfp_mask = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE); - qsfp_mask |= mask; - write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE, qsfp_mask); qsfp_mask = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT); @@ -9252,6 +9258,12 @@ static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd, dd_dev_info(dd, "%s: QSFP cable temperature too low\n", __func__); + /* + * The remaining alarms/warnings don't matter if the link is down. + */ + if (ppd->host_link_state & HLS_DOWN) + return 0; + if ((qsfp_interrupt_status[1] & QSFP_HIGH_VCC_ALARM) || (qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING)) dd_dev_info(dd, "%s: QSFP supply voltage too high\n", @@ -9346,9 +9358,8 @@ void qsfp_event(struct work_struct *work) return; /* - * Turn DC back on after cables has been - * re-inserted. Up until now, the DC has been in - * reset to save power. + * Turn DC back on after cable has been re-inserted. Up until + * now, the DC has been in reset to save power. */ dc_start(dd); @@ -9480,7 +9491,15 @@ int bringup_serdes(struct hfi1_pportdata *ppd) return ret; } - /* tune the SERDES to a ballpark setting for + get_port_type(ppd); + if (ppd->port_type == PORT_TYPE_QSFP) { + set_qsfp_int_n(ppd, 0); + wait_for_qsfp_init(ppd); + set_qsfp_int_n(ppd, 1); + } + + /* + * Tune the SerDes to a ballpark setting for * optimal signal and bit error rate * Needs to be done before starting the link */ @@ -10074,7 +10093,7 @@ u32 driver_physical_state(struct hfi1_pportdata *ppd) */ u32 driver_logical_state(struct hfi1_pportdata *ppd) { - if (ppd->host_link_state && !(ppd->host_link_state & HLS_UP)) + if (ppd->host_link_state && (ppd->host_link_state & HLS_DOWN)) return IB_PORT_DOWN; switch (ppd->host_link_state & HLS_UP) { @@ -14578,7 +14597,7 @@ u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl, (reason), (ret)) /* - * Initialize the Avago Thermal sensor. + * Initialize the thermal sensor. * * After initialization, enable polling of thermal sensor through * SBus interface. In order for this to work, the SBus Master diff --git a/drivers/staging/rdma/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h similarity index 99% rename from drivers/staging/rdma/hfi1/chip.h rename to drivers/infiniband/hw/hfi1/chip.h index 1948706fff1a95..66a327978739dd 100644 --- a/drivers/staging/rdma/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -398,6 +398,12 @@ /* Lane ID for general configuration registers */ #define GENERAL_CONFIG 4 +/* LINK_TUNING_PARAMETERS fields */ +#define TUNING_METHOD_SHIFT 24 + +/* LINK_OPTIMIZATION_SETTINGS fields */ +#define ENABLE_EXT_DEV_CONFIG_SHIFT 24 + /* LOAD_DATA 8051 command shifts and fields */ #define LOAD_DATA_FIELD_ID_SHIFT 40 #define LOAD_DATA_FIELD_ID_MASK 0xfull diff --git a/drivers/staging/rdma/hfi1/chip_registers.h b/drivers/infiniband/hw/hfi1/chip_registers.h similarity index 100% rename from drivers/staging/rdma/hfi1/chip_registers.h rename to drivers/infiniband/hw/hfi1/chip_registers.h diff --git a/drivers/staging/rdma/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h similarity index 98% rename from drivers/staging/rdma/hfi1/common.h rename to drivers/infiniband/hw/hfi1/common.h index e9b6bb32202544..fcc9c217a97a0f 100644 --- a/drivers/staging/rdma/hfi1/common.h +++ b/drivers/infiniband/hw/hfi1/common.h @@ -178,7 +178,8 @@ HFI1_CAP_PKEY_CHECK | \ HFI1_CAP_NO_INTEGRITY) -#define HFI1_USER_SWVERSION ((HFI1_USER_SWMAJOR << 16) | HFI1_USER_SWMINOR) +#define HFI1_USER_SWVERSION ((HFI1_USER_SWMAJOR << HFI1_SWMAJOR_SHIFT) | \ + HFI1_USER_SWMINOR) #ifndef HFI1_KERN_TYPE #define HFI1_KERN_TYPE 0 @@ -349,6 +350,8 @@ struct hfi1_message_header { #define HFI1_BECN_MASK 1 #define HFI1_BECN_SMASK BIT(HFI1_BECN_SHIFT) +#define HFI1_PSM_IOC_BASE_SEQ 0x0 + static inline __u64 rhf_to_cpu(const __le32 *rbuf) { return __le64_to_cpu(*((__le64 *)rbuf)); diff --git a/drivers/staging/rdma/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c similarity index 100% rename from drivers/staging/rdma/hfi1/debugfs.c rename to drivers/infiniband/hw/hfi1/debugfs.c diff --git a/drivers/staging/rdma/hfi1/debugfs.h b/drivers/infiniband/hw/hfi1/debugfs.h similarity index 100% rename from drivers/staging/rdma/hfi1/debugfs.h rename to drivers/infiniband/hw/hfi1/debugfs.h diff --git a/drivers/staging/rdma/hfi1/device.c b/drivers/infiniband/hw/hfi1/device.c similarity index 94% rename from drivers/staging/rdma/hfi1/device.c rename to drivers/infiniband/hw/hfi1/device.c index c05c39da83b108..bf64b5a7bfd79f 100644 --- a/drivers/staging/rdma/hfi1/device.c +++ b/drivers/infiniband/hw/hfi1/device.c @@ -60,7 +60,8 @@ static dev_t hfi1_dev; int hfi1_cdev_init(int minor, const char *name, const struct file_operations *fops, struct cdev *cdev, struct device **devp, - bool user_accessible) + bool user_accessible, + struct kobject *parent) { const dev_t dev = MKDEV(MAJOR(hfi1_dev), minor); struct device *device = NULL; @@ -68,6 +69,7 @@ int hfi1_cdev_init(int minor, const char *name, cdev_init(cdev, fops); cdev->owner = THIS_MODULE; + cdev->kobj.parent = parent; kobject_set_name(&cdev->kobj, name); ret = cdev_add(cdev, dev, 1); @@ -82,13 +84,13 @@ int hfi1_cdev_init(int minor, const char *name, else device = device_create(class, NULL, dev, NULL, "%s", name); - if (!IS_ERR(device)) - goto done; - ret = PTR_ERR(device); - device = NULL; - pr_err("Could not create device for minor %d, %s (err %d)\n", - minor, name, -ret); - cdev_del(cdev); + if (IS_ERR(device)) { + ret = PTR_ERR(device); + device = NULL; + pr_err("Could not create device for minor %d, %s (err %d)\n", + minor, name, -ret); + cdev_del(cdev); + } done: *devp = device; return ret; diff --git a/drivers/staging/rdma/hfi1/device.h b/drivers/infiniband/hw/hfi1/device.h similarity index 97% rename from drivers/staging/rdma/hfi1/device.h rename to drivers/infiniband/hw/hfi1/device.h index 5bb3e83cf2da25..c3ec19cb0ac94d 100644 --- a/drivers/staging/rdma/hfi1/device.h +++ b/drivers/infiniband/hw/hfi1/device.h @@ -50,7 +50,8 @@ int hfi1_cdev_init(int minor, const char *name, const struct file_operations *fops, struct cdev *cdev, struct device **devp, - bool user_accessible); + bool user_accessible, + struct kobject *parent); void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp); const char *class_name(void); int __init dev_init(void); diff --git a/drivers/staging/rdma/hfi1/dma.c b/drivers/infiniband/hw/hfi1/dma.c similarity index 100% rename from drivers/staging/rdma/hfi1/dma.c rename to drivers/infiniband/hw/hfi1/dma.c diff --git a/drivers/staging/rdma/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c similarity index 99% rename from drivers/staging/rdma/hfi1/driver.c rename to drivers/infiniband/hw/hfi1/driver.c index 700c6fa3a6330c..c75b0ae688f877 100644 --- a/drivers/staging/rdma/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -1161,7 +1161,7 @@ int hfi1_set_lid(struct hfi1_pportdata *ppd, u32 lid, u8 lmc) ppd->lmc = lmc; hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LIDLMC, 0); - dd_dev_info(dd, "IB%u:%u got a lid: 0x%x\n", dd->unit, ppd->port, lid); + dd_dev_info(dd, "port %u: got a lid: 0x%x\n", ppd->port, lid); return 0; } diff --git a/drivers/staging/rdma/hfi1/efivar.c b/drivers/infiniband/hw/hfi1/efivar.c similarity index 100% rename from drivers/staging/rdma/hfi1/efivar.c rename to drivers/infiniband/hw/hfi1/efivar.c diff --git a/drivers/staging/rdma/hfi1/efivar.h b/drivers/infiniband/hw/hfi1/efivar.h similarity index 100% rename from drivers/staging/rdma/hfi1/efivar.h rename to drivers/infiniband/hw/hfi1/efivar.h diff --git a/drivers/infiniband/hw/hfi1/eprom.c b/drivers/infiniband/hw/hfi1/eprom.c new file mode 100644 index 00000000000000..36b77943cbfd6c --- /dev/null +++ b/drivers/infiniband/hw/hfi1/eprom.c @@ -0,0 +1,102 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include +#include "hfi.h" +#include "common.h" +#include "eprom.h" + +#define CMD_SHIFT 24 +#define CMD_RELEASE_POWERDOWN_NOID ((0xab << CMD_SHIFT)) + +/* controller interface speeds */ +#define EP_SPEED_FULL 0x2 /* full speed */ + +/* + * How long to wait for the EPROM to become available, in ms. + * The spec 32 Mb EPROM takes around 40s to erase then write. + * Double it for safety. + */ +#define EPROM_TIMEOUT 80000 /* ms */ +/* + * Initialize the EPROM handler. + */ +int eprom_init(struct hfi1_devdata *dd) +{ + int ret = 0; + + /* only the discrete chip has an EPROM */ + if (dd->pcidev->device != PCI_DEVICE_ID_INTEL0) + return 0; + + /* + * It is OK if both HFIs reset the EPROM as long as they don't + * do it at the same time. + */ + ret = acquire_chip_resource(dd, CR_EPROM, EPROM_TIMEOUT); + if (ret) { + dd_dev_err(dd, + "%s: unable to acquire EPROM resource, no EPROM support\n", + __func__); + goto done_asic; + } + + /* reset EPROM to be sure it is in a good state */ + + /* set reset */ + write_csr(dd, ASIC_EEP_CTL_STAT, ASIC_EEP_CTL_STAT_EP_RESET_SMASK); + /* clear reset, set speed */ + write_csr(dd, ASIC_EEP_CTL_STAT, + EP_SPEED_FULL << ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT); + + /* wake the device with command "release powerdown NoID" */ + write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_RELEASE_POWERDOWN_NOID); + + dd->eprom_available = true; + release_chip_resource(dd, CR_EPROM); +done_asic: + return ret; +} diff --git a/drivers/staging/rdma/hfi1/eprom.h b/drivers/infiniband/hw/hfi1/eprom.h similarity index 100% rename from drivers/staging/rdma/hfi1/eprom.h rename to drivers/infiniband/hw/hfi1/eprom.h diff --git a/drivers/staging/rdma/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c similarity index 78% rename from drivers/staging/rdma/hfi1/file_ops.c rename to drivers/infiniband/hw/hfi1/file_ops.c index c1c5bf82addb07..7a5b0e676cc7a4 100644 --- a/drivers/staging/rdma/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -72,8 +72,6 @@ */ static int hfi1_file_open(struct inode *, struct file *); static int hfi1_file_close(struct inode *, struct file *); -static ssize_t hfi1_file_write(struct file *, const char __user *, - size_t, loff_t *); static ssize_t hfi1_write_iter(struct kiocb *, struct iov_iter *); static unsigned int hfi1_poll(struct file *, struct poll_table_struct *); static int hfi1_file_mmap(struct file *, struct vm_area_struct *); @@ -86,8 +84,7 @@ static int get_ctxt_info(struct file *, void __user *, __u32); static int get_base_info(struct file *, void __user *, __u32); static int setup_ctxt(struct file *); static int setup_subctxt(struct hfi1_ctxtdata *); -static int get_user_context(struct file *, struct hfi1_user_info *, - int, unsigned); +static int get_user_context(struct file *, struct hfi1_user_info *, int); static int find_shared_ctxt(struct file *, const struct hfi1_user_info *); static int allocate_ctxt(struct file *, struct hfi1_devdata *, struct hfi1_user_info *); @@ -97,13 +94,15 @@ static int user_event_ack(struct hfi1_ctxtdata *, int, unsigned long); static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16); static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int); static int vma_fault(struct vm_area_struct *, struct vm_fault *); +static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, + unsigned long arg); static const struct file_operations hfi1_file_ops = { .owner = THIS_MODULE, - .write = hfi1_file_write, .write_iter = hfi1_write_iter, .open = hfi1_file_open, .release = hfi1_file_close, + .unlocked_ioctl = hfi1_file_ioctl, .poll = hfi1_poll, .mmap = hfi1_file_mmap, .llseek = noop_llseek, @@ -169,6 +168,13 @@ static inline int is_valid_mmap(u64 token) static int hfi1_file_open(struct inode *inode, struct file *fp) { + struct hfi1_devdata *dd = container_of(inode->i_cdev, + struct hfi1_devdata, + user_cdev); + + /* Just take a ref now. Not all opens result in a context assign */ + kobject_get(&dd->kobj); + /* The real work is performed later in assign_ctxt() */ fp->private_data = kzalloc(sizeof(struct hfi1_filedata), GFP_KERNEL); if (fp->private_data) /* no cpu affinity by default */ @@ -176,127 +182,59 @@ static int hfi1_file_open(struct inode *inode, struct file *fp) return fp->private_data ? 0 : -ENOMEM; } -static ssize_t hfi1_file_write(struct file *fp, const char __user *data, - size_t count, loff_t *offset) +static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, + unsigned long arg) { - const struct hfi1_cmd __user *ucmd; struct hfi1_filedata *fd = fp->private_data; struct hfi1_ctxtdata *uctxt = fd->uctxt; - struct hfi1_cmd cmd; struct hfi1_user_info uinfo; struct hfi1_tid_info tinfo; + int ret = 0; unsigned long addr; - ssize_t consumed = 0, copy = 0, ret = 0; - void *dest = NULL; - __u64 user_val = 0; - int uctxt_required = 1; - int must_be_root = 0; - - /* FIXME: This interface cannot continue out of staging */ - if (WARN_ON_ONCE(!ib_safe_file_access(fp))) - return -EACCES; - - if (count < sizeof(cmd)) { - ret = -EINVAL; - goto bail; - } - - ucmd = (const struct hfi1_cmd __user *)data; - if (copy_from_user(&cmd, ucmd, sizeof(cmd))) { - ret = -EFAULT; - goto bail; - } - - consumed = sizeof(cmd); - - switch (cmd.type) { - case HFI1_CMD_ASSIGN_CTXT: - uctxt_required = 0; /* assigned user context not required */ - copy = sizeof(uinfo); - dest = &uinfo; - break; - case HFI1_CMD_SDMA_STATUS_UPD: - case HFI1_CMD_CREDIT_UPD: - copy = 0; - break; - case HFI1_CMD_TID_UPDATE: - case HFI1_CMD_TID_FREE: - case HFI1_CMD_TID_INVAL_READ: - copy = sizeof(tinfo); - dest = &tinfo; - break; - case HFI1_CMD_USER_INFO: - case HFI1_CMD_RECV_CTRL: - case HFI1_CMD_POLL_TYPE: - case HFI1_CMD_ACK_EVENT: - case HFI1_CMD_CTXT_INFO: - case HFI1_CMD_SET_PKEY: - case HFI1_CMD_CTXT_RESET: - copy = 0; - user_val = cmd.addr; - break; - case HFI1_CMD_EP_INFO: - case HFI1_CMD_EP_ERASE_CHIP: - case HFI1_CMD_EP_ERASE_RANGE: - case HFI1_CMD_EP_READ_RANGE: - case HFI1_CMD_EP_WRITE_RANGE: - uctxt_required = 0; /* assigned user context not required */ - must_be_root = 1; /* validate user */ - copy = 0; - break; - default: - ret = -EINVAL; - goto bail; - } - - /* If the command comes with user data, copy it. */ - if (copy) { - if (copy_from_user(dest, (void __user *)cmd.addr, copy)) { - ret = -EFAULT; - goto bail; - } - consumed += copy; - } - - /* - * Make sure there is a uctxt when needed. - */ - if (uctxt_required && !uctxt) { - ret = -EINVAL; - goto bail; - } + int uval = 0; + unsigned long ul_uval = 0; + u16 uval16 = 0; + + hfi1_cdbg(IOCTL, "IOCTL recv: 0x%x", cmd); + if (cmd != HFI1_IOCTL_ASSIGN_CTXT && + cmd != HFI1_IOCTL_GET_VERS && + !uctxt) + return -EINVAL; - /* only root can do these operations */ - if (must_be_root && !capable(CAP_SYS_ADMIN)) { - ret = -EPERM; - goto bail; - } + switch (cmd) { + case HFI1_IOCTL_ASSIGN_CTXT: + if (copy_from_user(&uinfo, + (struct hfi1_user_info __user *)arg, + sizeof(uinfo))) + return -EFAULT; - switch (cmd.type) { - case HFI1_CMD_ASSIGN_CTXT: ret = assign_ctxt(fp, &uinfo); if (ret < 0) - goto bail; - ret = setup_ctxt(fp); + return ret; + setup_ctxt(fp); if (ret) - goto bail; + return ret; ret = user_init(fp); break; - case HFI1_CMD_CTXT_INFO: - ret = get_ctxt_info(fp, (void __user *)(unsigned long) - user_val, cmd.len); - break; - case HFI1_CMD_USER_INFO: - ret = get_base_info(fp, (void __user *)(unsigned long) - user_val, cmd.len); + case HFI1_IOCTL_CTXT_INFO: + ret = get_ctxt_info(fp, (void __user *)(unsigned long)arg, + sizeof(struct hfi1_ctxt_info)); break; - case HFI1_CMD_SDMA_STATUS_UPD: + case HFI1_IOCTL_USER_INFO: + ret = get_base_info(fp, (void __user *)(unsigned long)arg, + sizeof(struct hfi1_base_info)); break; - case HFI1_CMD_CREDIT_UPD: + case HFI1_IOCTL_CREDIT_UPD: if (uctxt && uctxt->sc) sc_return_credits(uctxt->sc); break; - case HFI1_CMD_TID_UPDATE: + + case HFI1_IOCTL_TID_UPDATE: + if (copy_from_user(&tinfo, + (struct hfi11_tid_info __user *)arg, + sizeof(tinfo))) + return -EFAULT; + ret = hfi1_user_exp_rcv_setup(fp, &tinfo); if (!ret) { /* @@ -305,57 +243,82 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data, * These fields are adjacent in the structure so * we can copy them at the same time. */ - addr = (unsigned long)cmd.addr + - offsetof(struct hfi1_tid_info, tidcnt); + addr = arg + offsetof(struct hfi1_tid_info, tidcnt); if (copy_to_user((void __user *)addr, &tinfo.tidcnt, sizeof(tinfo.tidcnt) + sizeof(tinfo.length))) ret = -EFAULT; } break; - case HFI1_CMD_TID_INVAL_READ: - ret = hfi1_user_exp_rcv_invalid(fp, &tinfo); + + case HFI1_IOCTL_TID_FREE: + if (copy_from_user(&tinfo, + (struct hfi11_tid_info __user *)arg, + sizeof(tinfo))) + return -EFAULT; + + ret = hfi1_user_exp_rcv_clear(fp, &tinfo); if (ret) break; - addr = (unsigned long)cmd.addr + - offsetof(struct hfi1_tid_info, tidcnt); + addr = arg + offsetof(struct hfi1_tid_info, tidcnt); if (copy_to_user((void __user *)addr, &tinfo.tidcnt, sizeof(tinfo.tidcnt))) ret = -EFAULT; break; - case HFI1_CMD_TID_FREE: - ret = hfi1_user_exp_rcv_clear(fp, &tinfo); + + case HFI1_IOCTL_TID_INVAL_READ: + if (copy_from_user(&tinfo, + (struct hfi11_tid_info __user *)arg, + sizeof(tinfo))) + return -EFAULT; + + ret = hfi1_user_exp_rcv_invalid(fp, &tinfo); if (ret) break; - addr = (unsigned long)cmd.addr + - offsetof(struct hfi1_tid_info, tidcnt); + addr = arg + offsetof(struct hfi1_tid_info, tidcnt); if (copy_to_user((void __user *)addr, &tinfo.tidcnt, sizeof(tinfo.tidcnt))) ret = -EFAULT; break; - case HFI1_CMD_RECV_CTRL: - ret = manage_rcvq(uctxt, fd->subctxt, (int)user_val); + + case HFI1_IOCTL_RECV_CTRL: + ret = get_user(uval, (int __user *)arg); + if (ret != 0) + return -EFAULT; + ret = manage_rcvq(uctxt, fd->subctxt, uval); break; - case HFI1_CMD_POLL_TYPE: - uctxt->poll_type = (typeof(uctxt->poll_type))user_val; + + case HFI1_IOCTL_POLL_TYPE: + ret = get_user(uval, (int __user *)arg); + if (ret != 0) + return -EFAULT; + uctxt->poll_type = (typeof(uctxt->poll_type))uval; break; - case HFI1_CMD_ACK_EVENT: - ret = user_event_ack(uctxt, fd->subctxt, user_val); + + case HFI1_IOCTL_ACK_EVENT: + ret = get_user(ul_uval, (unsigned long __user *)arg); + if (ret != 0) + return -EFAULT; + ret = user_event_ack(uctxt, fd->subctxt, ul_uval); break; - case HFI1_CMD_SET_PKEY: + + case HFI1_IOCTL_SET_PKEY: + ret = get_user(uval16, (u16 __user *)arg); + if (ret != 0) + return -EFAULT; if (HFI1_CAP_IS_USET(PKEY_CHECK)) - ret = set_ctxt_pkey(uctxt, fd->subctxt, user_val); + ret = set_ctxt_pkey(uctxt, fd->subctxt, uval16); else - ret = -EPERM; + return -EPERM; break; - case HFI1_CMD_CTXT_RESET: { + + case HFI1_IOCTL_CTXT_RESET: { struct send_context *sc; struct hfi1_devdata *dd; - if (!uctxt || !uctxt->dd || !uctxt->sc) { - ret = -EINVAL; - break; - } + if (!uctxt || !uctxt->dd || !uctxt->sc) + return -EINVAL; + /* * There is no protection here. User level has to * guarantee that no one will be writing to the send @@ -373,10 +336,9 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data, wait_event_interruptible_timeout( sc->halt_wait, (sc->flags & SCF_HALTED), msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT)); - if (!(sc->flags & SCF_HALTED)) { - ret = -ENOLCK; - break; - } + if (!(sc->flags & SCF_HALTED)) + return -ENOLCK; + /* * If the send context was halted due to a Freeze, * wait until the device has been "unfrozen" before @@ -387,18 +349,16 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data, dd->event_queue, !(ACCESS_ONCE(dd->flags) & HFI1_FROZEN), msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT)); - if (dd->flags & HFI1_FROZEN) { - ret = -ENOLCK; - break; - } - if (dd->flags & HFI1_FORCED_FREEZE) { + if (dd->flags & HFI1_FROZEN) + return -ENOLCK; + + if (dd->flags & HFI1_FORCED_FREEZE) /* * Don't allow context reset if we are into * forced freeze */ - ret = -ENODEV; - break; - } + return -ENODEV; + sc_disable(sc); ret = sc_enable(sc); hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB, @@ -410,18 +370,17 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data, sc_return_credits(sc); break; } - case HFI1_CMD_EP_INFO: - case HFI1_CMD_EP_ERASE_CHIP: - case HFI1_CMD_EP_ERASE_RANGE: - case HFI1_CMD_EP_READ_RANGE: - case HFI1_CMD_EP_WRITE_RANGE: - ret = handle_eprom_command(fp, &cmd); + + case HFI1_IOCTL_GET_VERS: + uval = HFI1_USER_SWVERSION; + if (put_user(uval, (int __user *)arg)) + return -EFAULT; break; + + default: + return -EINVAL; } - if (ret >= 0) - ret = consumed; -bail: return ret; } @@ -738,7 +697,9 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) { struct hfi1_filedata *fdata = fp->private_data; struct hfi1_ctxtdata *uctxt = fdata->uctxt; - struct hfi1_devdata *dd; + struct hfi1_devdata *dd = container_of(inode->i_cdev, + struct hfi1_devdata, + user_cdev); unsigned long flags, *ev; fp->private_data = NULL; @@ -747,7 +708,6 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) goto done; hfi1_cdbg(PROC, "freeing ctxt %u:%u", uctxt->ctxt, fdata->subctxt); - dd = uctxt->dd; mutex_lock(&hfi1_mutex); flush_wc(); @@ -813,6 +773,7 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) mutex_unlock(&hfi1_mutex); hfi1_free_ctxtdata(dd, uctxt); done: + kobject_put(&dd->kobj); kfree(fdata); return 0; } @@ -836,7 +797,7 @@ static u64 kvirt_to_phys(void *addr) static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo) { int i_minor, ret = 0; - unsigned swmajor, swminor, alg = HFI1_ALG_ACROSS; + unsigned int swmajor, swminor; swmajor = uinfo->userversion >> 16; if (swmajor != HFI1_USER_SWMAJOR) { @@ -846,9 +807,6 @@ static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo) swminor = uinfo->userversion & 0xffff; - if (uinfo->hfi1_alg < HFI1_ALG_COUNT) - alg = uinfo->hfi1_alg; - mutex_lock(&hfi1_mutex); /* First, lets check if we need to setup a shared context? */ if (uinfo->subctxt_cnt) { @@ -868,7 +826,7 @@ static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo) */ if (!ret) { i_minor = iminor(file_inode(fp)) - HFI1_USER_MINOR_BASE; - ret = get_user_context(fp, uinfo, i_minor - 1, alg); + ret = get_user_context(fp, uinfo, i_minor); } done_unlock: mutex_unlock(&hfi1_mutex); @@ -876,71 +834,26 @@ static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo) return ret; } -/* return true if the device available for general use */ -static int usable_device(struct hfi1_devdata *dd) -{ - struct hfi1_pportdata *ppd = dd->pport; - - return driver_lstate(ppd) == IB_PORT_ACTIVE; -} - static int get_user_context(struct file *fp, struct hfi1_user_info *uinfo, - int devno, unsigned alg) + int devno) { struct hfi1_devdata *dd = NULL; - int ret = 0, devmax, npresent, nup, dev; + int devmax, npresent, nup; devmax = hfi1_count_units(&npresent, &nup); - if (!npresent) { - ret = -ENXIO; - goto done; - } - if (!nup) { - ret = -ENETDOWN; - goto done; - } - if (devno >= 0) { - dd = hfi1_lookup(devno); - if (!dd) - ret = -ENODEV; - else if (!dd->freectxts) - ret = -EBUSY; - } else { - struct hfi1_devdata *pdd; - - if (alg == HFI1_ALG_ACROSS) { - unsigned free = 0U; - - for (dev = 0; dev < devmax; dev++) { - pdd = hfi1_lookup(dev); - if (!pdd) - continue; - if (!usable_device(pdd)) - continue; - if (pdd->freectxts && - pdd->freectxts > free) { - dd = pdd; - free = pdd->freectxts; - } - } - } else { - for (dev = 0; dev < devmax; dev++) { - pdd = hfi1_lookup(dev); - if (!pdd) - continue; - if (!usable_device(pdd)) - continue; - if (pdd->freectxts) { - dd = pdd; - break; - } - } - } - if (!dd) - ret = -EBUSY; - } -done: - return ret ? ret : allocate_ctxt(fp, dd, uinfo); + if (!npresent) + return -ENXIO; + + if (!nup) + return -ENETDOWN; + + dd = hfi1_lookup(devno); + if (!dd) + return -ENODEV; + else if (!dd->freectxts) + return -EBUSY; + + return allocate_ctxt(fp, dd, uinfo); } static int find_shared_ctxt(struct file *fp, @@ -1546,170 +1459,10 @@ static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned subctxt, return ret; } -static int ui_open(struct inode *inode, struct file *filp) -{ - struct hfi1_devdata *dd; - - dd = container_of(inode->i_cdev, struct hfi1_devdata, ui_cdev); - filp->private_data = dd; /* for other methods */ - return 0; -} - -static int ui_release(struct inode *inode, struct file *filp) -{ - /* nothing to do */ - return 0; -} - -static loff_t ui_lseek(struct file *filp, loff_t offset, int whence) -{ - struct hfi1_devdata *dd = filp->private_data; - - return fixed_size_llseek(filp, offset, whence, - (dd->kregend - dd->kregbase) + DC8051_DATA_MEM_SIZE); -} - -/* NOTE: assumes unsigned long is 8 bytes */ -static ssize_t ui_read(struct file *filp, char __user *buf, size_t count, - loff_t *f_pos) -{ - struct hfi1_devdata *dd = filp->private_data; - void __iomem *base = dd->kregbase; - unsigned long total, csr_off, - barlen = (dd->kregend - dd->kregbase); - u64 data; - - /* only read 8 byte quantities */ - if ((count % 8) != 0) - return -EINVAL; - /* offset must be 8-byte aligned */ - if ((*f_pos % 8) != 0) - return -EINVAL; - /* destination buffer must be 8-byte aligned */ - if ((unsigned long)buf % 8 != 0) - return -EINVAL; - /* must be in range */ - if (*f_pos + count > (barlen + DC8051_DATA_MEM_SIZE)) - return -EINVAL; - /* only set the base if we are not starting past the BAR */ - if (*f_pos < barlen) - base += *f_pos; - csr_off = *f_pos; - for (total = 0; total < count; total += 8, csr_off += 8) { - /* accessing LCB CSRs requires more checks */ - if (is_lcb_offset(csr_off)) { - if (read_lcb_csr(dd, csr_off, (u64 *)&data)) - break; /* failed */ - } - /* - * Cannot read ASIC GPIO/QSFP* clear and force CSRs without a - * false parity error. Avoid the whole issue by not reading - * them. These registers are defined as having a read value - * of 0. - */ - else if (csr_off == ASIC_GPIO_CLEAR || - csr_off == ASIC_GPIO_FORCE || - csr_off == ASIC_QSFP1_CLEAR || - csr_off == ASIC_QSFP1_FORCE || - csr_off == ASIC_QSFP2_CLEAR || - csr_off == ASIC_QSFP2_FORCE) - data = 0; - else if (csr_off >= barlen) { - /* - * read_8051_data can read more than just 8 bytes at - * a time. However, folding this into the loop and - * handling the reads in 8 byte increments allows us - * to smoothly transition from chip memory to 8051 - * memory. - */ - if (read_8051_data(dd, - (u32)(csr_off - barlen), - sizeof(data), &data)) - break; /* failed */ - } else - data = readq(base + total); - if (put_user(data, (unsigned long __user *)(buf + total))) - break; - } - *f_pos += total; - return total; -} - -/* NOTE: assumes unsigned long is 8 bytes */ -static ssize_t ui_write(struct file *filp, const char __user *buf, - size_t count, loff_t *f_pos) -{ - struct hfi1_devdata *dd = filp->private_data; - void __iomem *base; - unsigned long total, data, csr_off; - int in_lcb; - - /* only write 8 byte quantities */ - if ((count % 8) != 0) - return -EINVAL; - /* offset must be 8-byte aligned */ - if ((*f_pos % 8) != 0) - return -EINVAL; - /* source buffer must be 8-byte aligned */ - if ((unsigned long)buf % 8 != 0) - return -EINVAL; - /* must be in range */ - if (*f_pos + count > dd->kregend - dd->kregbase) - return -EINVAL; - - base = (void __iomem *)dd->kregbase + *f_pos; - csr_off = *f_pos; - in_lcb = 0; - for (total = 0; total < count; total += 8, csr_off += 8) { - if (get_user(data, (unsigned long __user *)(buf + total))) - break; - /* accessing LCB CSRs requires a special procedure */ - if (is_lcb_offset(csr_off)) { - if (!in_lcb) { - int ret = acquire_lcb_access(dd, 1); - - if (ret) - break; - in_lcb = 1; - } - } else { - if (in_lcb) { - release_lcb_access(dd, 1); - in_lcb = 0; - } - } - writeq(data, base + total); - } - if (in_lcb) - release_lcb_access(dd, 1); - *f_pos += total; - return total; -} - -static const struct file_operations ui_file_ops = { - .owner = THIS_MODULE, - .llseek = ui_lseek, - .read = ui_read, - .write = ui_write, - .open = ui_open, - .release = ui_release, -}; - -#define UI_OFFSET 192 /* device minor offset for UI devices */ -static int create_ui = 1; - -static struct cdev wildcard_cdev; -static struct device *wildcard_device; - -static atomic_t user_count = ATOMIC_INIT(0); - static void user_remove(struct hfi1_devdata *dd) { - if (atomic_dec_return(&user_count) == 0) - hfi1_cdev_cleanup(&wildcard_cdev, &wildcard_device); hfi1_cdev_cleanup(&dd->user_cdev, &dd->user_device); - hfi1_cdev_cleanup(&dd->ui_cdev, &dd->ui_device); } static int user_add(struct hfi1_devdata *dd) @@ -1717,34 +1470,13 @@ static int user_add(struct hfi1_devdata *dd) char name[10]; int ret; - if (atomic_inc_return(&user_count) == 1) { - ret = hfi1_cdev_init(0, class_name(), &hfi1_file_ops, - &wildcard_cdev, &wildcard_device, - true); - if (ret) - goto done; - } - snprintf(name, sizeof(name), "%s_%d", class_name(), dd->unit); - ret = hfi1_cdev_init(dd->unit + 1, name, &hfi1_file_ops, + ret = hfi1_cdev_init(dd->unit, name, &hfi1_file_ops, &dd->user_cdev, &dd->user_device, - true); + true, &dd->kobj); if (ret) - goto done; + user_remove(dd); - if (create_ui) { - snprintf(name, sizeof(name), - "%s_ui%d", class_name(), dd->unit); - ret = hfi1_cdev_init(dd->unit + UI_OFFSET, name, &ui_file_ops, - &dd->ui_cdev, &dd->ui_device, - false); - if (ret) - goto done; - } - - return 0; -done: - user_remove(dd); return ret; } @@ -1753,13 +1485,7 @@ static int user_add(struct hfi1_devdata *dd) */ int hfi1_device_create(struct hfi1_devdata *dd) { - int r, ret; - - r = user_add(dd); - ret = hfi1_diag_add(dd); - if (r && !ret) - ret = r; - return ret; + return user_add(dd); } /* @@ -1769,5 +1495,4 @@ int hfi1_device_create(struct hfi1_devdata *dd) void hfi1_device_remove(struct hfi1_devdata *dd) { user_remove(dd); - hfi1_diag_remove(dd); } diff --git a/drivers/staging/rdma/hfi1/firmware.c b/drivers/infiniband/hw/hfi1/firmware.c similarity index 100% rename from drivers/staging/rdma/hfi1/firmware.c rename to drivers/infiniband/hw/hfi1/firmware.c diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h similarity index 99% rename from drivers/staging/rdma/hfi1/hfi.h rename to drivers/infiniband/hw/hfi1/hfi.h index 7b78d56de7f56b..4417a0fd3ef903 100644 --- a/drivers/staging/rdma/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -453,6 +453,7 @@ struct rvt_sge_state; #define HLS_LINK_COOLDOWN BIT(__HLS_LINK_COOLDOWN_BP) #define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE) +#define HLS_DOWN ~(HLS_UP) /* use this MTU size if none other is given */ #define HFI1_DEFAULT_ACTIVE_MTU 10240 @@ -1168,6 +1169,7 @@ struct hfi1_devdata { atomic_t aspm_disabled_cnt; struct hfi1_affinity *affinity; + struct kobject kobj; }; /* 8051 firmware version helper */ @@ -1882,9 +1884,8 @@ static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd) get_unit_name((dd)->unit), ##__VA_ARGS__) #define hfi1_dev_porterr(dd, port, fmt, ...) \ - dev_err(&(dd)->pcidev->dev, "%s: IB%u:%u " fmt, \ - get_unit_name((dd)->unit), (dd)->unit, (port), \ - ##__VA_ARGS__) + dev_err(&(dd)->pcidev->dev, "%s: port %u: " fmt, \ + get_unit_name((dd)->unit), (port), ##__VA_ARGS__) /* * this is used for formatting hw error messages... diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c similarity index 99% rename from drivers/staging/rdma/hfi1/init.c rename to drivers/infiniband/hw/hfi1/init.c index 502b7cf4647de3..5cc492e5776d8d 100644 --- a/drivers/staging/rdma/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -732,12 +732,12 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit) lastfail = hfi1_create_rcvhdrq(dd, rcd); if (!lastfail) lastfail = hfi1_setup_eagerbufs(rcd); - if (lastfail) + if (lastfail) { dd_dev_err(dd, "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); + ret = lastfail; + } } - if (lastfail) - ret = lastfail; /* Allocate enough memory for user event notification. */ len = PAGE_ALIGN(dd->chip_rcv_contexts * HFI1_MAX_SHARED_CTXTS * @@ -989,8 +989,10 @@ static void release_asic_data(struct hfi1_devdata *dd) dd->asic_data = NULL; } -void hfi1_free_devdata(struct hfi1_devdata *dd) +static void __hfi1_free_devdata(struct kobject *kobj) { + struct hfi1_devdata *dd = + container_of(kobj, struct hfi1_devdata, kobj); unsigned long flags; spin_lock_irqsave(&hfi1_devs_lock, flags); @@ -1007,6 +1009,15 @@ void hfi1_free_devdata(struct hfi1_devdata *dd) rvt_dealloc_device(&dd->verbs_dev.rdi); } +static struct kobj_type hfi1_devdata_type = { + .release = __hfi1_free_devdata, +}; + +void hfi1_free_devdata(struct hfi1_devdata *dd) +{ + kobject_put(&dd->kobj); +} + /* * Allocate our primary per-unit data structure. Must be done via verbs * allocator, because the verbs cleanup process both does cleanup and @@ -1102,6 +1113,7 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) &pdev->dev, "Could not alloc cpulist info, cpu affinity might be wrong\n"); } + kobject_init(&dd->kobj, &hfi1_devdata_type); return dd; bail: @@ -1300,7 +1312,7 @@ static void cleanup_device_data(struct hfi1_devdata *dd) spin_lock(&ppd->cc_state_lock); cc_state = get_cc_state(ppd); - rcu_assign_pointer(ppd->cc_state, NULL); + RCU_INIT_POINTER(ppd->cc_state, NULL); spin_unlock(&ppd->cc_state_lock); if (cc_state) diff --git a/drivers/staging/rdma/hfi1/intr.c b/drivers/infiniband/hw/hfi1/intr.c similarity index 100% rename from drivers/staging/rdma/hfi1/intr.c rename to drivers/infiniband/hw/hfi1/intr.c diff --git a/drivers/staging/rdma/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h similarity index 100% rename from drivers/staging/rdma/hfi1/iowait.h rename to drivers/infiniband/hw/hfi1/iowait.h diff --git a/drivers/staging/rdma/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c similarity index 99% rename from drivers/staging/rdma/hfi1/mad.c rename to drivers/infiniband/hw/hfi1/mad.c index ed58cf21e790e2..219029576ba0b7 100644 --- a/drivers/staging/rdma/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -1402,6 +1402,12 @@ static int set_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys) if (key == okey) continue; + /* + * Don't update pkeys[2], if an HFI port without MgmtAllowed + * by neighbor is a switch. + */ + if (i == 2 && !ppd->mgmt_allowed && ppd->neighbor_type == 1) + continue; /* * The SM gives us the complete PKey table. We have * to ensure that we put the PKeys in the matching @@ -3363,6 +3369,50 @@ static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am, return reply((struct ib_mad_hdr *)smp); } +/* + * Apply congestion control information stored in the ppd to the + * active structure. + */ +static void apply_cc_state(struct hfi1_pportdata *ppd) +{ + struct cc_state *old_cc_state, *new_cc_state; + + new_cc_state = kzalloc(sizeof(*new_cc_state), GFP_KERNEL); + if (!new_cc_state) + return; + + /* + * Hold the lock for updating *and* to prevent ppd information + * from changing during the update. + */ + spin_lock(&ppd->cc_state_lock); + + old_cc_state = get_cc_state(ppd); + if (!old_cc_state) { + /* never active, or shutting down */ + spin_unlock(&ppd->cc_state_lock); + kfree(new_cc_state); + return; + } + + *new_cc_state = *old_cc_state; + + new_cc_state->cct.ccti_limit = ppd->total_cct_entry - 1; + memcpy(new_cc_state->cct.entries, ppd->ccti_entries, + ppd->total_cct_entry * sizeof(struct ib_cc_table_entry)); + + new_cc_state->cong_setting.port_control = IB_CC_CCS_PC_SL_BASED; + new_cc_state->cong_setting.control_map = ppd->cc_sl_control_map; + memcpy(new_cc_state->cong_setting.entries, ppd->congestion_entries, + OPA_MAX_SLS * sizeof(struct opa_congestion_setting_entry)); + + rcu_assign_pointer(ppd->cc_state, new_cc_state); + + spin_unlock(&ppd->cc_state_lock); + + call_rcu(&old_cc_state->rcu, cc_state_reclaim); +} + static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, u32 *resp_len) @@ -3374,6 +3424,11 @@ static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data, struct opa_congestion_setting_entry_shadow *entries; int i; + /* + * Save details from packet into the ppd. Hold the cc_state_lock so + * our information is consistent with anyone trying to apply the state. + */ + spin_lock(&ppd->cc_state_lock); ppd->cc_sl_control_map = be32_to_cpu(p->control_map); entries = ppd->congestion_entries; @@ -3384,6 +3439,10 @@ static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data, p->entries[i].trigger_threshold; entries[i].ccti_min = p->entries[i].ccti_min; } + spin_unlock(&ppd->cc_state_lock); + + /* now apply the information */ + apply_cc_state(ppd); return __subn_get_opa_cong_setting(smp, am, data, ibdev, port, resp_len); @@ -3526,7 +3585,6 @@ static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data, int i, j; u32 sentry, eentry; u16 ccti_limit; - struct cc_state *old_cc_state, *new_cc_state; /* sanity check n_blocks, start_block */ if (n_blocks == 0 || @@ -3546,45 +3604,20 @@ static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data, return reply((struct ib_mad_hdr *)smp); } - new_cc_state = kzalloc(sizeof(*new_cc_state), GFP_KERNEL); - if (!new_cc_state) - goto getit; - + /* + * Save details from packet into the ppd. Hold the cc_state_lock so + * our information is consistent with anyone trying to apply the state. + */ spin_lock(&ppd->cc_state_lock); - - old_cc_state = get_cc_state(ppd); - - if (!old_cc_state) { - spin_unlock(&ppd->cc_state_lock); - kfree(new_cc_state); - return reply((struct ib_mad_hdr *)smp); - } - - *new_cc_state = *old_cc_state; - - new_cc_state->cct.ccti_limit = ccti_limit; - - entries = ppd->ccti_entries; ppd->total_cct_entry = ccti_limit + 1; - + entries = ppd->ccti_entries; for (j = 0, i = sentry; i < eentry; j++, i++) entries[i].entry = be16_to_cpu(p->ccti_entries[j].entry); - - memcpy(new_cc_state->cct.entries, entries, - eentry * sizeof(struct ib_cc_table_entry)); - - new_cc_state->cong_setting.port_control = IB_CC_CCS_PC_SL_BASED; - new_cc_state->cong_setting.control_map = ppd->cc_sl_control_map; - memcpy(new_cc_state->cong_setting.entries, ppd->congestion_entries, - OPA_MAX_SLS * sizeof(struct opa_congestion_setting_entry)); - - rcu_assign_pointer(ppd->cc_state, new_cc_state); - spin_unlock(&ppd->cc_state_lock); - call_rcu(&old_cc_state->rcu, cc_state_reclaim); + /* now apply the information */ + apply_cc_state(ppd); -getit: return __subn_get_opa_cc_table(smp, am, data, ibdev, port, resp_len); } diff --git a/drivers/staging/rdma/hfi1/mad.h b/drivers/infiniband/hw/hfi1/mad.h similarity index 100% rename from drivers/staging/rdma/hfi1/mad.h rename to drivers/infiniband/hw/hfi1/mad.h diff --git a/drivers/staging/rdma/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c similarity index 95% rename from drivers/staging/rdma/hfi1/mmu_rb.c rename to drivers/infiniband/hw/hfi1/mmu_rb.c index 2b0e91d3093dfe..b7a80aa1ae3060 100644 --- a/drivers/staging/rdma/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -45,6 +45,7 @@ * */ #include +#include #include #include @@ -97,7 +98,6 @@ static unsigned long mmu_node_last(struct mmu_rb_node *node) int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops) { struct mmu_rb_handler *handlr; - unsigned long flags; if (!ops->invalidate) return -EINVAL; @@ -111,9 +111,9 @@ int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops) INIT_HLIST_NODE(&handlr->mn.hlist); spin_lock_init(&handlr->lock); handlr->mn.ops = &mn_opts; - spin_lock_irqsave(&mmu_rb_lock, flags); - list_add_tail(&handlr->list, &mmu_rb_handlers); - spin_unlock_irqrestore(&mmu_rb_lock, flags); + spin_lock(&mmu_rb_lock); + list_add_tail_rcu(&handlr->list, &mmu_rb_handlers); + spin_unlock(&mmu_rb_lock); return mmu_notifier_register(&handlr->mn, current->mm); } @@ -130,9 +130,10 @@ void hfi1_mmu_rb_unregister(struct rb_root *root) if (current->mm) mmu_notifier_unregister(&handler->mn, current->mm); - spin_lock_irqsave(&mmu_rb_lock, flags); - list_del(&handler->list); - spin_unlock_irqrestore(&mmu_rb_lock, flags); + spin_lock(&mmu_rb_lock); + list_del_rcu(&handler->list); + spin_unlock(&mmu_rb_lock); + synchronize_rcu(); spin_lock_irqsave(&handler->lock, flags); if (!RB_EMPTY_ROOT(root)) { @@ -271,16 +272,15 @@ void hfi1_mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node) static struct mmu_rb_handler *find_mmu_handler(struct rb_root *root) { struct mmu_rb_handler *handler; - unsigned long flags; - spin_lock_irqsave(&mmu_rb_lock, flags); - list_for_each_entry(handler, &mmu_rb_handlers, list) { + rcu_read_lock(); + list_for_each_entry_rcu(handler, &mmu_rb_handlers, list) { if (handler->root == root) goto unlock; } handler = NULL; unlock: - spin_unlock_irqrestore(&mmu_rb_lock, flags); + rcu_read_unlock(); return handler; } diff --git a/drivers/staging/rdma/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h similarity index 100% rename from drivers/staging/rdma/hfi1/mmu_rb.h rename to drivers/infiniband/hw/hfi1/mmu_rb.h diff --git a/drivers/staging/rdma/hfi1/opa_compat.h b/drivers/infiniband/hw/hfi1/opa_compat.h similarity index 100% rename from drivers/staging/rdma/hfi1/opa_compat.h rename to drivers/infiniband/hw/hfi1/opa_compat.h diff --git a/drivers/staging/rdma/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c similarity index 100% rename from drivers/staging/rdma/hfi1/pcie.c rename to drivers/infiniband/hw/hfi1/pcie.c diff --git a/drivers/staging/rdma/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c similarity index 99% rename from drivers/staging/rdma/hfi1/pio.c rename to drivers/infiniband/hw/hfi1/pio.c index c67b9ad3fcf4c1..d5edb1afbb8f8b 100644 --- a/drivers/staging/rdma/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -1835,8 +1835,7 @@ int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_scontexts) struct pio_vl_map *oldmap, *newmap; if (!vl_scontexts) { - /* send context 0 reserved for VL15 */ - for (i = 1; i < dd->num_send_contexts; i++) + for (i = 0; i < dd->num_send_contexts; i++) if (dd->send_contexts[i].type == SC_KERNEL) num_kernel_send_contexts++; /* truncate divide */ diff --git a/drivers/staging/rdma/hfi1/pio.h b/drivers/infiniband/hw/hfi1/pio.h similarity index 98% rename from drivers/staging/rdma/hfi1/pio.h rename to drivers/infiniband/hw/hfi1/pio.h index 53a08edb7f642d..464cbd27b9752c 100644 --- a/drivers/staging/rdma/hfi1/pio.h +++ b/drivers/infiniband/hw/hfi1/pio.h @@ -49,10 +49,10 @@ /* send context types */ #define SC_KERNEL 0 -#define SC_ACK 1 -#define SC_USER 2 -#define SC_VL15 3 -#define SC_MAX 4 +#define SC_VL15 1 +#define SC_ACK 2 +#define SC_USER 3 /* must be the last one: it may take all left */ +#define SC_MAX 4 /* count of send context types */ /* invalid send context index */ #define INVALID_SCI 0xff diff --git a/drivers/staging/rdma/hfi1/pio_copy.c b/drivers/infiniband/hw/hfi1/pio_copy.c similarity index 100% rename from drivers/staging/rdma/hfi1/pio_copy.c rename to drivers/infiniband/hw/hfi1/pio_copy.c diff --git a/drivers/staging/rdma/hfi1/platform.c b/drivers/infiniband/hw/hfi1/platform.c similarity index 98% rename from drivers/staging/rdma/hfi1/platform.c rename to drivers/infiniband/hw/hfi1/platform.c index 8fe8a205b5bbb9..03df9322f86298 100644 --- a/drivers/staging/rdma/hfi1/platform.c +++ b/drivers/infiniband/hw/hfi1/platform.c @@ -87,6 +87,17 @@ void free_platform_config(struct hfi1_devdata *dd) */ } +void get_port_type(struct hfi1_pportdata *ppd) +{ + int ret; + + ret = get_platform_config_field(ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_PORT_TYPE, &ppd->port_type, + 4); + if (ret) + ppd->port_type = PORT_TYPE_UNKNOWN; +} + int set_qsfp_tx(struct hfi1_pportdata *ppd, int on) { u8 tx_ctrl_byte = on ? 0x0 : 0xF; @@ -529,7 +540,8 @@ static void apply_tunings( /* Enable external device config if channel is limiting active */ read_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS, GENERAL_CONFIG, &config_data); - config_data |= limiting_active; + config_data &= ~(0xff << ENABLE_EXT_DEV_CONFIG_SHIFT); + config_data |= ((u32)limiting_active << ENABLE_EXT_DEV_CONFIG_SHIFT); ret = load_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS, GENERAL_CONFIG, config_data); if (ret != HCMD_SUCCESS) @@ -542,7 +554,8 @@ static void apply_tunings( /* Pass tuning method to 8051 */ read_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG, &config_data); - config_data |= tuning_method; + config_data &= ~(0xff << TUNING_METHOD_SHIFT); + config_data |= ((u32)tuning_method << TUNING_METHOD_SHIFT); ret = load_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG, config_data); if (ret != HCMD_SUCCESS) @@ -564,8 +577,8 @@ static void apply_tunings( ret = read_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS, GENERAL_CONFIG, &config_data); /* Clear, then set the external device config field */ - config_data &= ~(0xFF << 24); - config_data |= (external_device_config << 24); + config_data &= ~(u32)0xFF; + config_data |= external_device_config; ret = load_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS, GENERAL_CONFIG, config_data); if (ret != HCMD_SUCCESS) @@ -784,12 +797,6 @@ void tune_serdes(struct hfi1_pportdata *ppd) return; } - ret = get_platform_config_field(ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, - PORT_TABLE_PORT_TYPE, &ppd->port_type, - 4); - if (ret) - ppd->port_type = PORT_TYPE_UNKNOWN; - switch (ppd->port_type) { case PORT_TYPE_DISCONNECTED: ppd->offline_disabled_reason = diff --git a/drivers/staging/rdma/hfi1/platform.h b/drivers/infiniband/hw/hfi1/platform.h similarity index 99% rename from drivers/staging/rdma/hfi1/platform.h rename to drivers/infiniband/hw/hfi1/platform.h index 19620cf546d5fb..e2c21613c326fb 100644 --- a/drivers/staging/rdma/hfi1/platform.h +++ b/drivers/infiniband/hw/hfi1/platform.h @@ -298,6 +298,7 @@ enum link_tuning_encoding { /* platform.c */ void get_platform_config(struct hfi1_devdata *dd); void free_platform_config(struct hfi1_devdata *dd); +void get_port_type(struct hfi1_pportdata *ppd); int set_qsfp_tx(struct hfi1_pportdata *ppd, int on); void tune_serdes(struct hfi1_pportdata *ppd); diff --git a/drivers/staging/rdma/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c similarity index 99% rename from drivers/staging/rdma/hfi1/qp.c rename to drivers/infiniband/hw/hfi1/qp.c index 91eb42316df922..1a942ffba4cb08 100644 --- a/drivers/staging/rdma/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -49,7 +49,6 @@ #include #include #include -#include #include #include #include @@ -161,9 +160,6 @@ static inline int opa_mtu_enum_to_int(int mtu) * This function is what we would push to the core layer if we wanted to be a * "first class citizen". Instead we hide this here and rely on Verbs ULPs * to blindly pass the MTU enum value from the PathRecord to us. - * - * The actual flag used to determine "8k MTU" will change and is currently - * unknown. */ static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu) { @@ -516,6 +512,7 @@ static void iowait_wakeup(struct iowait *wait, int reason) static void iowait_sdma_drained(struct iowait *wait) { struct rvt_qp *qp = iowait_to_qp(wait); + unsigned long flags; /* * This happens when the send engine notes @@ -523,12 +520,12 @@ static void iowait_sdma_drained(struct iowait *wait) * do the flush work until that QP's * sdma work has finished. */ - spin_lock(&qp->s_lock); + spin_lock_irqsave(&qp->s_lock, flags); if (qp->s_flags & RVT_S_WAIT_DMA) { qp->s_flags &= ~RVT_S_WAIT_DMA; hfi1_schedule_send(qp); } - spin_unlock(&qp->s_lock); + spin_unlock_irqrestore(&qp->s_lock, flags); } /** diff --git a/drivers/staging/rdma/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h similarity index 100% rename from drivers/staging/rdma/hfi1/qp.h rename to drivers/infiniband/hw/hfi1/qp.h diff --git a/drivers/staging/rdma/hfi1/qsfp.c b/drivers/infiniband/hw/hfi1/qsfp.c similarity index 100% rename from drivers/staging/rdma/hfi1/qsfp.c rename to drivers/infiniband/hw/hfi1/qsfp.c diff --git a/drivers/staging/rdma/hfi1/qsfp.h b/drivers/infiniband/hw/hfi1/qsfp.h similarity index 100% rename from drivers/staging/rdma/hfi1/qsfp.h rename to drivers/infiniband/hw/hfi1/qsfp.h diff --git a/drivers/staging/rdma/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c similarity index 100% rename from drivers/staging/rdma/hfi1/rc.c rename to drivers/infiniband/hw/hfi1/rc.c diff --git a/drivers/staging/rdma/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c similarity index 100% rename from drivers/staging/rdma/hfi1/ruc.c rename to drivers/infiniband/hw/hfi1/ruc.c diff --git a/drivers/staging/rdma/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c similarity index 99% rename from drivers/staging/rdma/hfi1/sdma.c rename to drivers/infiniband/hw/hfi1/sdma.c index abb8ebc1fcac90..f9befc05b3494c 100644 --- a/drivers/staging/rdma/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -134,6 +134,7 @@ static const char * const sdma_state_names[] = { [sdma_state_s99_running] = "s99_Running", }; +#ifdef CONFIG_SDMA_VERBOSITY static const char * const sdma_event_names[] = { [sdma_event_e00_go_hw_down] = "e00_GoHwDown", [sdma_event_e10_go_hw_start] = "e10_GoHwStart", @@ -150,6 +151,7 @@ static const char * const sdma_event_names[] = { [sdma_event_e85_link_down] = "e85_LinkDown", [sdma_event_e90_sw_halted] = "e90_SwHalted", }; +#endif static const struct sdma_set_state_action sdma_action_table[] = { [sdma_state_s00_hw_down] = { @@ -376,7 +378,7 @@ static inline void complete_tx(struct sdma_engine *sde, sdma_txclean(sde->dd, tx); if (complete) (*complete)(tx, res); - if (iowait_sdma_dec(wait) && wait) + if (wait && iowait_sdma_dec(wait)) iowait_drain_wakeup(wait); } diff --git a/drivers/staging/rdma/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h similarity index 100% rename from drivers/staging/rdma/hfi1/sdma.h rename to drivers/infiniband/hw/hfi1/sdma.h diff --git a/drivers/staging/rdma/hfi1/sdma_txreq.h b/drivers/infiniband/hw/hfi1/sdma_txreq.h similarity index 100% rename from drivers/staging/rdma/hfi1/sdma_txreq.h rename to drivers/infiniband/hw/hfi1/sdma_txreq.h diff --git a/drivers/staging/rdma/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c similarity index 99% rename from drivers/staging/rdma/hfi1/sysfs.c rename to drivers/infiniband/hw/hfi1/sysfs.c index 8cd6df8634ad2c..91fc2aed6aed93 100644 --- a/drivers/staging/rdma/hfi1/sysfs.c +++ b/drivers/infiniband/hw/hfi1/sysfs.c @@ -721,8 +721,8 @@ int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num, } dd_dev_info(dd, - "IB%u: Congestion Control Agent enabled for port %d\n", - dd->unit, port_num); + "Congestion Control Agent enabled for port %d\n", + port_num); return 0; diff --git a/drivers/staging/rdma/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c similarity index 97% rename from drivers/staging/rdma/hfi1/trace.c rename to drivers/infiniband/hw/hfi1/trace.c index 8b62fefcf90336..79b2952c0dfb31 100644 --- a/drivers/staging/rdma/hfi1/trace.c +++ b/drivers/infiniband/hw/hfi1/trace.c @@ -66,6 +66,7 @@ u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr) #define RETH_PRN "reth vaddr 0x%.16llx rkey 0x%.8x dlen 0x%.8x" #define AETH_PRN "aeth syn 0x%.2x %s msn 0x%.8x" #define DETH_PRN "deth qkey 0x%.8x sqpn 0x%.6x" +#define IETH_PRN "ieth rkey 0x%.8x" #define ATOMICACKETH_PRN "origdata %lld" #define ATOMICETH_PRN "vaddr 0x%llx rkey 0x%.8x sdata %lld cdata %lld" @@ -166,6 +167,12 @@ const char *parse_everbs_hdrs( be32_to_cpu(eh->ud.deth[0]), be32_to_cpu(eh->ud.deth[1]) & RVT_QPN_MASK); break; + /* ieth */ + case OP(RC, SEND_LAST_WITH_INVALIDATE): + case OP(RC, SEND_ONLY_WITH_INVALIDATE): + trace_seq_printf(p, IETH_PRN, + be32_to_cpu(eh->ieth)); + break; } trace_seq_putc(p, 0); return ret; @@ -233,3 +240,4 @@ __hfi1_trace_fn(FIRMWARE); __hfi1_trace_fn(RCVCTRL); __hfi1_trace_fn(TID); __hfi1_trace_fn(MMU); +__hfi1_trace_fn(IOCTL); diff --git a/drivers/staging/rdma/hfi1/trace.h b/drivers/infiniband/hw/hfi1/trace.h similarity index 99% rename from drivers/staging/rdma/hfi1/trace.h rename to drivers/infiniband/hw/hfi1/trace.h index 963dc948c38a4f..28c1d083288632 100644 --- a/drivers/staging/rdma/hfi1/trace.h +++ b/drivers/infiniband/hw/hfi1/trace.h @@ -74,8 +74,8 @@ __print_symbolic(etype, \ TRACE_EVENT(hfi1_rcvhdr, TP_PROTO(struct hfi1_devdata *dd, - u64 eflags, u32 ctxt, + u64 eflags, u32 etype, u32 hlen, u32 tlen, @@ -392,6 +392,8 @@ __print_symbolic(opcode, \ ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \ ib_opcode_name(RC_COMPARE_SWAP), \ ib_opcode_name(RC_FETCH_ADD), \ + ib_opcode_name(RC_SEND_LAST_WITH_INVALIDATE), \ + ib_opcode_name(RC_SEND_ONLY_WITH_INVALIDATE), \ ib_opcode_name(UC_SEND_FIRST), \ ib_opcode_name(UC_SEND_MIDDLE), \ ib_opcode_name(UC_SEND_LAST), \ @@ -1341,6 +1343,7 @@ __hfi1_trace_def(FIRMWARE); __hfi1_trace_def(RCVCTRL); __hfi1_trace_def(TID); __hfi1_trace_def(MMU); +__hfi1_trace_def(IOCTL); #define hfi1_cdbg(which, fmt, ...) \ __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__) diff --git a/drivers/staging/rdma/hfi1/twsi.c b/drivers/infiniband/hw/hfi1/twsi.c similarity index 100% rename from drivers/staging/rdma/hfi1/twsi.c rename to drivers/infiniband/hw/hfi1/twsi.c diff --git a/drivers/staging/rdma/hfi1/twsi.h b/drivers/infiniband/hw/hfi1/twsi.h similarity index 100% rename from drivers/staging/rdma/hfi1/twsi.h rename to drivers/infiniband/hw/hfi1/twsi.h diff --git a/drivers/staging/rdma/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c similarity index 100% rename from drivers/staging/rdma/hfi1/uc.c rename to drivers/infiniband/hw/hfi1/uc.c diff --git a/drivers/staging/rdma/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c similarity index 100% rename from drivers/staging/rdma/hfi1/ud.c rename to drivers/infiniband/hw/hfi1/ud.c diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c similarity index 100% rename from drivers/staging/rdma/hfi1/user_exp_rcv.c rename to drivers/infiniband/hw/hfi1/user_exp_rcv.c diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h similarity index 100% rename from drivers/staging/rdma/hfi1/user_exp_rcv.h rename to drivers/infiniband/hw/hfi1/user_exp_rcv.h diff --git a/drivers/staging/rdma/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c similarity index 100% rename from drivers/staging/rdma/hfi1/user_pages.c rename to drivers/infiniband/hw/hfi1/user_pages.c diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c similarity index 99% rename from drivers/staging/rdma/hfi1/user_sdma.c rename to drivers/infiniband/hw/hfi1/user_sdma.c index 0014c9c0e967a5..29f4795f866cd1 100644 --- a/drivers/staging/rdma/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -166,6 +166,8 @@ static unsigned initial_pkt_count = 8; #define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */ +struct sdma_mmu_node; + struct user_sdma_iovec { struct list_head list; struct iovec iov; @@ -178,6 +180,7 @@ struct user_sdma_iovec { * which we last left off. */ u64 offset; + struct sdma_mmu_node *node; }; #define SDMA_CACHE_NODE_EVICT BIT(0) @@ -507,6 +510,7 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, struct sdma_req_info info; struct user_sdma_request *req; u8 opcode, sc, vl; + int req_queued = 0; if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) { hfi1_cdbg( @@ -703,6 +707,7 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, set_comp_state(pq, cq, info.comp_idx, QUEUED, 0); atomic_inc(&pq->n_reqs); + req_queued = 1; /* Send the first N packets in the request to buy us some time */ ret = user_sdma_send_pkts(req, pcount); if (unlikely(ret < 0 && ret != -EBUSY)) { @@ -747,7 +752,8 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, return 0; free_req: user_sdma_free_request(req, true); - pq_update(pq); + if (req_queued) + pq_update(pq); set_comp_state(pq, cq, info.comp_idx, ERROR, req->status); return ret; } @@ -1153,6 +1159,7 @@ static int pin_vector_pages(struct user_sdma_request *req, } iovec->pages = node->pages; iovec->npages = npages; + iovec->node = node; ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb); if (ret) { @@ -1519,18 +1526,13 @@ static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) } if (req->data_iovs) { struct sdma_mmu_node *node; - struct mmu_rb_node *mnode; int i; for (i = 0; i < req->data_iovs; i++) { - mnode = hfi1_mmu_rb_search( - &req->pq->sdma_rb_root, - (unsigned long)req->iovs[i].iov.iov_base, - req->iovs[i].iov.iov_len); - if (!mnode || IS_ERR(mnode)) + node = req->iovs[i].node; + if (!node) continue; - node = container_of(mnode, struct sdma_mmu_node, rb); if (unpin) hfi1_mmu_rb_remove(&req->pq->sdma_rb_root, &node->rb); diff --git a/drivers/staging/rdma/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h similarity index 100% rename from drivers/staging/rdma/hfi1/user_sdma.h rename to drivers/infiniband/hw/hfi1/user_sdma.h diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c similarity index 99% rename from drivers/staging/rdma/hfi1/verbs.c rename to drivers/infiniband/hw/hfi1/verbs.c index 9cdc85fa366f19..849c4b9399d428 100644 --- a/drivers/staging/rdma/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -52,7 +52,6 @@ #include #include #include -#include #include #include "hfi.h" @@ -336,6 +335,8 @@ const u8 hdr_len_by_opcode[256] = { [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = 12 + 8 + 4, [IB_OPCODE_RC_COMPARE_SWAP] = 12 + 8 + 28, [IB_OPCODE_RC_FETCH_ADD] = 12 + 8 + 28, + [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = 12 + 8 + 4, + [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = 12 + 8 + 4, /* UC */ [IB_OPCODE_UC_SEND_FIRST] = 12 + 8, [IB_OPCODE_UC_SEND_MIDDLE] = 12 + 8, @@ -946,7 +947,6 @@ static int pio_wait(struct rvt_qp *qp, dev->n_piowait += !!(flag & RVT_S_WAIT_PIO); dev->n_piodrain += !!(flag & RVT_S_WAIT_PIO_DRAIN); - dev->n_piowait++; qp->s_flags |= flag; was_empty = list_empty(&sc->piowait); list_add_tail(&priv->s_iowait.list, &sc->piowait); diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h similarity index 99% rename from drivers/staging/rdma/hfi1/verbs.h rename to drivers/infiniband/hw/hfi1/verbs.h index 3ee223983b20cf..48835677562780 100644 --- a/drivers/staging/rdma/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -152,6 +152,7 @@ union ib_ehdrs { } at; __be32 imm_data; __be32 aeth; + __be32 ieth; struct ib_atomic_eth atomic_eth; } __packed; diff --git a/drivers/staging/rdma/hfi1/verbs_txreq.c b/drivers/infiniband/hw/hfi1/verbs_txreq.c similarity index 100% rename from drivers/staging/rdma/hfi1/verbs_txreq.c rename to drivers/infiniband/hw/hfi1/verbs_txreq.c diff --git a/drivers/staging/rdma/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h similarity index 100% rename from drivers/staging/rdma/hfi1/verbs_txreq.h rename to drivers/infiniband/hw/hfi1/verbs_txreq.h diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 4a740f7a0519bc..02a735b64208c8 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2361,58 +2361,130 @@ static int i40iw_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } +static const char * const i40iw_hw_stat_names[] = { + // 32bit names + [I40IW_HW_STAT_INDEX_IP4RXDISCARD] = "ip4InDiscards", + [I40IW_HW_STAT_INDEX_IP4RXTRUNC] = "ip4InTruncatedPkts", + [I40IW_HW_STAT_INDEX_IP4TXNOROUTE] = "ip4OutNoRoutes", + [I40IW_HW_STAT_INDEX_IP6RXDISCARD] = "ip6InDiscards", + [I40IW_HW_STAT_INDEX_IP6RXTRUNC] = "ip6InTruncatedPkts", + [I40IW_HW_STAT_INDEX_IP6TXNOROUTE] = "ip6OutNoRoutes", + [I40IW_HW_STAT_INDEX_TCPRTXSEG] = "tcpRetransSegs", + [I40IW_HW_STAT_INDEX_TCPRXOPTERR] = "tcpInOptErrors", + [I40IW_HW_STAT_INDEX_TCPRXPROTOERR] = "tcpInProtoErrors", + // 64bit names + [I40IW_HW_STAT_INDEX_IP4RXOCTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4InOctets", + [I40IW_HW_STAT_INDEX_IP4RXPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4InPkts", + [I40IW_HW_STAT_INDEX_IP4RXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4InReasmRqd", + [I40IW_HW_STAT_INDEX_IP4RXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4InMcastPkts", + [I40IW_HW_STAT_INDEX_IP4TXOCTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4OutOctets", + [I40IW_HW_STAT_INDEX_IP4TXPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4OutPkts", + [I40IW_HW_STAT_INDEX_IP4TXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4OutSegRqd", + [I40IW_HW_STAT_INDEX_IP4TXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4OutMcastPkts", + [I40IW_HW_STAT_INDEX_IP6RXOCTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6InOctets", + [I40IW_HW_STAT_INDEX_IP6RXPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6InPkts", + [I40IW_HW_STAT_INDEX_IP6RXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6InReasmRqd", + [I40IW_HW_STAT_INDEX_IP6RXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6InMcastPkts", + [I40IW_HW_STAT_INDEX_IP6TXOCTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6OutOctets", + [I40IW_HW_STAT_INDEX_IP6TXPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6OutPkts", + [I40IW_HW_STAT_INDEX_IP6TXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6OutSegRqd", + [I40IW_HW_STAT_INDEX_IP6TXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6OutMcastPkts", + [I40IW_HW_STAT_INDEX_TCPRXSEGS + I40IW_HW_STAT_INDEX_MAX_32] = + "tcpInSegs", + [I40IW_HW_STAT_INDEX_TCPTXSEG + I40IW_HW_STAT_INDEX_MAX_32] = + "tcpOutSegs", + [I40IW_HW_STAT_INDEX_RDMARXRDS + I40IW_HW_STAT_INDEX_MAX_32] = + "iwInRdmaReads", + [I40IW_HW_STAT_INDEX_RDMARXSNDS + I40IW_HW_STAT_INDEX_MAX_32] = + "iwInRdmaSends", + [I40IW_HW_STAT_INDEX_RDMARXWRS + I40IW_HW_STAT_INDEX_MAX_32] = + "iwInRdmaWrites", + [I40IW_HW_STAT_INDEX_RDMATXRDS + I40IW_HW_STAT_INDEX_MAX_32] = + "iwOutRdmaReads", + [I40IW_HW_STAT_INDEX_RDMATXSNDS + I40IW_HW_STAT_INDEX_MAX_32] = + "iwOutRdmaSends", + [I40IW_HW_STAT_INDEX_RDMATXWRS + I40IW_HW_STAT_INDEX_MAX_32] = + "iwOutRdmaWrites", + [I40IW_HW_STAT_INDEX_RDMAVBND + I40IW_HW_STAT_INDEX_MAX_32] = + "iwRdmaBnd", + [I40IW_HW_STAT_INDEX_RDMAVINV + I40IW_HW_STAT_INDEX_MAX_32] = + "iwRdmaInv" +}; + /** - * i40iw_get_protocol_stats - Populates the rdma_stats structure - * @ibdev: ib dev struct - * @stats: iw protocol stats struct + * i40iw_alloc_hw_stats - Allocate a hw stats structure + * @ibdev: device pointer from stack + * @port_num: port number */ -static int i40iw_get_protocol_stats(struct ib_device *ibdev, - union rdma_protocol_stats *stats) +static struct rdma_hw_stats *i40iw_alloc_hw_stats(struct ib_device *ibdev, + u8 port_num) +{ + struct i40iw_device *iwdev = to_iwdev(ibdev); + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + int num_counters = I40IW_HW_STAT_INDEX_MAX_32 + + I40IW_HW_STAT_INDEX_MAX_64; + unsigned long lifespan = RDMA_HW_STATS_DEFAULT_LIFESPAN; + + BUILD_BUG_ON(ARRAY_SIZE(i40iw_hw_stat_names) != + (I40IW_HW_STAT_INDEX_MAX_32 + + I40IW_HW_STAT_INDEX_MAX_64)); + + /* + * PFs get the default update lifespan, but VFs only update once + * per second + */ + if (!dev->is_pf) + lifespan = 1000; + return rdma_alloc_hw_stats_struct(i40iw_hw_stat_names, num_counters, + lifespan); +} + +/** + * i40iw_get_hw_stats - Populates the rdma_hw_stats structure + * @ibdev: device pointer from stack + * @stats: stats pointer from stack + * @port_num: port number + * @index: which hw counter the stack is requesting we update + */ +static int i40iw_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u8 port_num, int index) { struct i40iw_device *iwdev = to_iwdev(ibdev); struct i40iw_sc_dev *dev = &iwdev->sc_dev; struct i40iw_dev_pestat *devstat = &dev->dev_pestat; struct i40iw_dev_hw_stats *hw_stats = &devstat->hw_stats; - struct timespec curr_time; - static struct timespec last_rd_time = {0, 0}; unsigned long flags; - curr_time = current_kernel_time(); - memset(stats, 0, sizeof(*stats)); - if (dev->is_pf) { spin_lock_irqsave(&devstat->stats_lock, flags); devstat->ops.iw_hw_stat_read_all(devstat, &devstat->hw_stats); spin_unlock_irqrestore(&devstat->stats_lock, flags); } else { - if (((u64)curr_time.tv_sec - (u64)last_rd_time.tv_sec) > 1) - if (i40iw_vchnl_vf_get_pe_stats(dev, &devstat->hw_stats)) - return -ENOSYS; + if (i40iw_vchnl_vf_get_pe_stats(dev, &devstat->hw_stats)) + return -ENOSYS; } - stats->iw.ipInReceives = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4RXPKTS] + - hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6RXPKTS]; - stats->iw.ipInTruncatedPkts = hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP4RXTRUNC] + - hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP6RXTRUNC]; - stats->iw.ipInDiscards = hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP4RXDISCARD] + - hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP6RXDISCARD]; - stats->iw.ipOutNoRoutes = hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP4TXNOROUTE] + - hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP6TXNOROUTE]; - stats->iw.ipReasmReqds = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4RXFRAGS] + - hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6RXFRAGS]; - stats->iw.ipFragCreates = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4TXFRAGS] + - hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6TXFRAGS]; - stats->iw.ipInMcastPkts = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4RXMCPKTS] + - hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6RXMCPKTS]; - stats->iw.ipOutMcastPkts = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4TXMCPKTS] + - hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6TXMCPKTS]; - stats->iw.tcpOutSegs = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_TCPTXSEG]; - stats->iw.tcpInSegs = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_TCPRXSEGS]; - stats->iw.tcpRetransSegs = hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_TCPRTXSEG]; - - last_rd_time = curr_time; - return 0; + memcpy(&stats->value[0], &hw_stats, sizeof(*hw_stats)); + + return stats->num_counters; } /** @@ -2551,7 +2623,8 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev iwibdev->ibdev.get_dma_mr = i40iw_get_dma_mr; iwibdev->ibdev.reg_user_mr = i40iw_reg_user_mr; iwibdev->ibdev.dereg_mr = i40iw_dereg_mr; - iwibdev->ibdev.get_protocol_stats = i40iw_get_protocol_stats; + iwibdev->ibdev.alloc_hw_stats = i40iw_alloc_hw_stats; + iwibdev->ibdev.get_hw_stats = i40iw_get_hw_stats; iwibdev->ibdev.query_device = i40iw_query_device; iwibdev->ibdev.create_ah = i40iw_create_ah; iwibdev->ibdev.destroy_ah = i40iw_destroy_ah; diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 82d7c4bf5970c1..ce4034071f9c4d 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -1308,21 +1308,6 @@ static const struct qib_hwerror_msgs qib_7322p_error_msgs[] = { SYM_LSB(IntMask, fldname##17IntMask)), \ .msg = #fldname "_C", .sz = sizeof(#fldname "_C") } -static const struct qib_hwerror_msgs qib_7322_intr_msgs[] = { - INTR_AUTO_P(SDmaInt), - INTR_AUTO_P(SDmaProgressInt), - INTR_AUTO_P(SDmaIdleInt), - INTR_AUTO_P(SDmaCleanupDone), - INTR_AUTO_C(RcvUrg), - INTR_AUTO_P(ErrInt), - INTR_AUTO(ErrInt), /* non-port-specific errs */ - INTR_AUTO(AssertGPIOInt), - INTR_AUTO_P(SendDoneInt), - INTR_AUTO(SendBufAvailInt), - INTR_AUTO_C(RcvAvail), - { .mask = 0, .sz = 0 } -}; - #define TXSYMPTOM_AUTO_P(fldname) \ { .mask = SYM_MASK(SendHdrErrSymptom_0, fldname), \ .msg = #fldname, .sz = sizeof(#fldname) } diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c index 0bd18375d7df97..d2ac29861af5b6 100644 --- a/drivers/infiniband/hw/qib/qib_mad.c +++ b/drivers/infiniband/hw/qib/qib_mad.c @@ -1172,11 +1172,13 @@ static int pma_get_classportinfo(struct ib_pma_mad *pmp, * Set the most significant bit of CM2 to indicate support for * congestion statistics */ - p->reserved[0] = dd->psxmitwait_supported << 7; + ib_set_cpi_capmask2(p, + dd->psxmitwait_supported << + (31 - IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE)); /* * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec. */ - p->resp_time_value = 18; + ib_set_cpi_resp_time(p, 18); return reply((struct ib_smp *) pmp); } diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 6888f03c6d6100..4f878151f81ff4 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -159,6 +159,7 @@ struct qib_other_headers { } at; __be32 imm_data; __be32 aeth; + __be32 ieth; struct ib_atomic_eth atomic_eth; } u; } __packed; diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index b1ffc8b4a6c0a5..6ca6fa80dd6e78 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -525,6 +525,7 @@ int rvt_driver_cq_init(struct rvt_dev_info *rdi) return PTR_ERR(task); } + set_user_nice(task, MIN_NICE); cpu = cpumask_first(cpumask_of_node(rdi->dparms.node)); kthread_bind(task, cpu); wake_up_process(task); diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 0ff765bfd61907..0f4d4500f45e25 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -124,11 +124,13 @@ static int rvt_init_mregion(struct rvt_mregion *mr, struct ib_pd *pd, int count) { int m, i = 0; + struct rvt_dev_info *dev = ib_to_rvt(pd->device); mr->mapsz = 0; m = (count + RVT_SEGSZ - 1) / RVT_SEGSZ; for (; i < m; i++) { - mr->map[i] = kzalloc(sizeof(*mr->map[0]), GFP_KERNEL); + mr->map[i] = kzalloc_node(sizeof(*mr->map[0]), GFP_KERNEL, + dev->dparms.node); if (!mr->map[i]) { rvt_deinit_mregion(mr); return -ENOMEM; diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 0f12c211c3853f..5fa4d4d81ee0cf 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -397,6 +397,7 @@ static void free_qpn(struct rvt_qpn_table *qpt, u32 qpn) static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends) { unsigned n; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) rvt_put_ss(&qp->s_rdma_read_sge); @@ -431,7 +432,7 @@ static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends) if (qp->ibqp.qp_type != IB_QPT_RC) return; - for (n = 0; n < ARRAY_SIZE(qp->s_ack_queue); n++) { + for (n = 0; n < rvt_max_atomic(rdi); n++) { struct rvt_ack_entry *e = &qp->s_ack_queue[n]; if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST && @@ -569,7 +570,12 @@ static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, qp->s_ssn = 1; qp->s_lsn = 0; qp->s_mig_state = IB_MIG_MIGRATED; - memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue)); + if (qp->s_ack_queue) + memset( + qp->s_ack_queue, + 0, + rvt_max_atomic(rdi) * + sizeof(*qp->s_ack_queue)); qp->r_head_ack_queue = 0; qp->s_tail_ack_queue = 0; qp->s_num_rd_atomic = 0; @@ -653,9 +659,9 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, if (gfp == GFP_NOIO) swq = __vmalloc( (init_attr->cap.max_send_wr + 1) * sz, - gfp, PAGE_KERNEL); + gfp | __GFP_ZERO, PAGE_KERNEL); else - swq = vmalloc_node( + swq = vzalloc_node( (init_attr->cap.max_send_wr + 1) * sz, rdi->dparms.node); if (!swq) @@ -677,6 +683,16 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, goto bail_swq; RCU_INIT_POINTER(qp->next, NULL); + if (init_attr->qp_type == IB_QPT_RC) { + qp->s_ack_queue = + kzalloc_node( + sizeof(*qp->s_ack_queue) * + rvt_max_atomic(rdi), + gfp, + rdi->dparms.node); + if (!qp->s_ack_queue) + goto bail_qp; + } /* * Driver needs to set up it's private QP structure and do any @@ -704,9 +720,9 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, qp->r_rq.wq = __vmalloc( sizeof(struct rvt_rwq) + qp->r_rq.size * sz, - gfp, PAGE_KERNEL); + gfp | __GFP_ZERO, PAGE_KERNEL); else - qp->r_rq.wq = vmalloc_node( + qp->r_rq.wq = vzalloc_node( sizeof(struct rvt_rwq) + qp->r_rq.size * sz, rdi->dparms.node); @@ -857,6 +873,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, rdi->driver_f.qp_priv_free(rdi, qp); bail_qp: + kfree(qp->s_ack_queue); kfree(qp); bail_swq: @@ -1284,6 +1301,7 @@ int rvt_destroy_qp(struct ib_qp *ibqp) vfree(qp->r_rq.wq); vfree(qp->s_wq); rdi->driver_f.qp_priv_free(rdi, qp); + kfree(qp->s_ack_queue); kfree(qp); return 0; } diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index caec8e9c46669b..bab7db6fa9abf8 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -92,6 +92,8 @@ enum { IPOIB_FLAG_UMCAST = 10, IPOIB_STOP_NEIGH_GC = 11, IPOIB_NEIGH_TBL_FLUSH = 12, + IPOIB_FLAG_DEV_ADDR_SET = 13, + IPOIB_FLAG_DEV_ADDR_CTRL = 14, IPOIB_MAX_BACKOFF_SECONDS = 16, @@ -392,6 +394,7 @@ struct ipoib_dev_priv { struct ipoib_ethtool_st ethtool; struct timer_list poll_timer; unsigned max_send_sge; + bool sm_fullmember_sendonly_support; }; struct ipoib_ah { @@ -476,6 +479,7 @@ void ipoib_reap_ah(struct work_struct *work); void ipoib_mark_paths_invalid(struct net_device *dev); void ipoib_flush_paths(struct net_device *dev); +int ipoib_check_sm_sendonly_fullmember_support(struct ipoib_dev_priv *priv); struct ipoib_dev_priv *ipoib_intf_alloc(const char *format); int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 418e5a1c8744d5..45c40a17d6a6ff 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -997,6 +997,106 @@ static inline int update_child_pkey(struct ipoib_dev_priv *priv) return 0; } +/* + * returns true if the device address of the ipoib interface has changed and the + * new address is a valid one (i.e in the gid table), return false otherwise. + */ +static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv) +{ + union ib_gid search_gid; + union ib_gid gid0; + union ib_gid *netdev_gid; + int err; + u16 index; + u8 port; + bool ret = false; + + netdev_gid = (union ib_gid *)(priv->dev->dev_addr + 4); + if (ib_query_gid(priv->ca, priv->port, 0, &gid0, NULL)) + return false; + + netif_addr_lock(priv->dev); + + /* The subnet prefix may have changed, update it now so we won't have + * to do it later + */ + priv->local_gid.global.subnet_prefix = gid0.global.subnet_prefix; + netdev_gid->global.subnet_prefix = gid0.global.subnet_prefix; + search_gid.global.subnet_prefix = gid0.global.subnet_prefix; + + search_gid.global.interface_id = priv->local_gid.global.interface_id; + + netif_addr_unlock(priv->dev); + + err = ib_find_gid(priv->ca, &search_gid, IB_GID_TYPE_IB, + priv->dev, &port, &index); + + netif_addr_lock(priv->dev); + + if (search_gid.global.interface_id != + priv->local_gid.global.interface_id) + /* There was a change while we were looking up the gid, bail + * here and let the next work sort this out + */ + goto out; + + /* The next section of code needs some background: + * Per IB spec the port GUID can't change if the HCA is powered on. + * port GUID is the basis for GID at index 0 which is the basis for + * the default device address of a ipoib interface. + * + * so it seems the flow should be: + * if user_changed_dev_addr && gid in gid tbl + * set bit dev_addr_set + * return true + * else + * return false + * + * The issue is that there are devices that don't follow the spec, + * they change the port GUID when the HCA is powered, so in order + * not to break userspace applications, We need to check if the + * user wanted to control the device address and we assume that + * if he sets the device address back to be based on GID index 0, + * he no longer wishs to control it. + * + * If the user doesn't control the the device address, + * IPOIB_FLAG_DEV_ADDR_SET is set and ib_find_gid failed it means + * the port GUID has changed and GID at index 0 has changed + * so we need to change priv->local_gid and priv->dev->dev_addr + * to reflect the new GID. + */ + if (!test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) { + if (!err && port == priv->port) { + set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); + if (index == 0) + clear_bit(IPOIB_FLAG_DEV_ADDR_CTRL, + &priv->flags); + else + set_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags); + ret = true; + } else { + ret = false; + } + } else { + if (!err && port == priv->port) { + ret = true; + } else { + if (!test_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags)) { + memcpy(&priv->local_gid, &gid0, + sizeof(priv->local_gid)); + memcpy(priv->dev->dev_addr + 4, &gid0, + sizeof(priv->local_gid)); + ret = true; + } + } + } + +out: + netif_addr_unlock(priv->dev); + + return ret; +} + static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, enum ipoib_flush_level level, int nesting) @@ -1018,6 +1118,9 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) && level != IPOIB_FLUSH_HEAVY) { + /* Make sure the dev_addr is set even if not flushing */ + if (level == IPOIB_FLUSH_LIGHT) + ipoib_dev_addr_changed_valid(priv); ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n"); return; } @@ -1029,7 +1132,8 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, update_parent_pkey(priv); else update_child_pkey(priv); - } + } else if (level == IPOIB_FLUSH_LIGHT) + ipoib_dev_addr_changed_valid(priv); ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n"); return; } @@ -1081,7 +1185,8 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { if (level >= IPOIB_FLUSH_NORMAL) ipoib_ib_dev_up(dev); - ipoib_mcast_restart_task(&priv->restart_task); + if (ipoib_dev_addr_changed_valid(priv)) + ipoib_mcast_restart_task(&priv->restart_task); } } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index b940ef1c19c70e..2d7c1634664826 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -99,6 +99,7 @@ static struct net_device *ipoib_get_net_dev_by_params( struct ib_device *dev, u8 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data); +static int ipoib_set_mac(struct net_device *dev, void *addr); static struct ib_client ipoib_client = { .name = "ipoib", @@ -117,6 +118,8 @@ int ipoib_open(struct net_device *dev) set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); + priv->sm_fullmember_sendonly_support = false; + if (ipoib_ib_dev_open(dev)) { if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) return 0; @@ -629,6 +632,77 @@ void ipoib_mark_paths_invalid(struct net_device *dev) spin_unlock_irq(&priv->lock); } +struct classport_info_context { + struct ipoib_dev_priv *priv; + struct completion done; + struct ib_sa_query *sa_query; +}; + +static void classport_info_query_cb(int status, struct ib_class_port_info *rec, + void *context) +{ + struct classport_info_context *cb_ctx = context; + struct ipoib_dev_priv *priv; + + WARN_ON(!context); + + priv = cb_ctx->priv; + + if (status || !rec) { + pr_debug("device: %s failed query classport_info status: %d\n", + priv->dev->name, status); + /* keeps the default, will try next mcast_restart */ + priv->sm_fullmember_sendonly_support = false; + goto out; + } + + if (ib_get_cpi_capmask2(rec) & + IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT) { + pr_debug("device: %s enabled fullmember-sendonly for sendonly MCG\n", + priv->dev->name); + priv->sm_fullmember_sendonly_support = true; + } else { + pr_debug("device: %s disabled fullmember-sendonly for sendonly MCG\n", + priv->dev->name); + priv->sm_fullmember_sendonly_support = false; + } + +out: + complete(&cb_ctx->done); +} + +int ipoib_check_sm_sendonly_fullmember_support(struct ipoib_dev_priv *priv) +{ + struct classport_info_context *callback_context; + int ret; + + callback_context = kmalloc(sizeof(*callback_context), GFP_KERNEL); + if (!callback_context) + return -ENOMEM; + + callback_context->priv = priv; + init_completion(&callback_context->done); + + ret = ib_sa_classport_info_rec_query(&ipoib_sa_client, + priv->ca, priv->port, 3000, + GFP_KERNEL, + classport_info_query_cb, + callback_context, + &callback_context->sa_query); + if (ret < 0) { + pr_info("%s failed to send ib_sa_classport_info query, ret: %d\n", + priv->dev->name, ret); + kfree(callback_context); + return ret; + } + + /* waiting for the callback to finish before returnning */ + wait_for_completion(&callback_context->done); + kfree(callback_context); + + return ret; +} + void ipoib_flush_paths(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -1649,6 +1723,7 @@ static const struct net_device_ops ipoib_netdev_ops_pf = { .ndo_get_vf_config = ipoib_get_vf_config, .ndo_get_vf_stats = ipoib_get_vf_stats, .ndo_set_vf_guid = ipoib_set_vf_guid, + .ndo_set_mac_address = ipoib_set_mac, }; static const struct net_device_ops ipoib_netdev_ops_vf = { @@ -1771,6 +1846,70 @@ int ipoib_add_umcast_attr(struct net_device *dev) return device_create_file(&dev->dev, &dev_attr_umcast); } +static void set_base_guid(struct ipoib_dev_priv *priv, union ib_gid *gid) +{ + struct ipoib_dev_priv *child_priv; + struct net_device *netdev = priv->dev; + + netif_addr_lock(netdev); + + memcpy(&priv->local_gid.global.interface_id, + &gid->global.interface_id, + sizeof(gid->global.interface_id)); + memcpy(netdev->dev_addr + 4, &priv->local_gid, sizeof(priv->local_gid)); + clear_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); + + netif_addr_unlock(netdev); + + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + down_read(&priv->vlan_rwsem); + list_for_each_entry(child_priv, &priv->child_intfs, list) + set_base_guid(child_priv, gid); + up_read(&priv->vlan_rwsem); + } +} + +static int ipoib_check_lladdr(struct net_device *dev, + struct sockaddr_storage *ss) +{ + union ib_gid *gid = (union ib_gid *)(ss->__data + 4); + int ret = 0; + + netif_addr_lock(dev); + + /* Make sure the QPN, reserved and subnet prefix match the current + * lladdr, it also makes sure the lladdr is unicast. + */ + if (memcmp(dev->dev_addr, ss->__data, + 4 + sizeof(gid->global.subnet_prefix)) || + gid->global.interface_id == 0) + ret = -EINVAL; + + netif_addr_unlock(dev); + + return ret; +} + +static int ipoib_set_mac(struct net_device *dev, void *addr) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct sockaddr_storage *ss = addr; + int ret; + + if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev)) + return -EBUSY; + + ret = ipoib_check_lladdr(dev, ss); + if (ret) + return ret; + + set_base_guid(priv, (union ib_gid *)(ss->__data + 4)); + + queue_work(ipoib_workqueue, &priv->flush_light); + + return 0; +} + static ssize_t create_child(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -1894,6 +2033,7 @@ static struct net_device *ipoib_add_port(const char *format, goto device_init_failed; } else memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); + set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); result = ipoib_dev_init(priv->dev, hca, port); if (result < 0) { diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 25889311b1e9c8..82fbc9442608f6 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -64,6 +64,9 @@ struct ipoib_mcast_iter { unsigned int send_only; }; +/* join state that allows creating mcg with sendonly member request */ +#define SENDONLY_FULLMEMBER_JOIN 8 + /* * This should be called with the priv->lock held */ @@ -326,12 +329,23 @@ void ipoib_mcast_carrier_on_task(struct work_struct *work) struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, carrier_on_task); struct ib_port_attr attr; + int ret; if (ib_query_port(priv->ca, priv->port, &attr) || attr.state != IB_PORT_ACTIVE) { ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); return; } + /* + * Check if can send sendonly MCG's with sendonly-fullmember join state. + * It done here after the successfully join to the broadcast group, + * because the broadcast group must always be joined first and is always + * re-joined if the SM changes substantially. + */ + ret = ipoib_check_sm_sendonly_fullmember_support(priv); + if (ret < 0) + pr_debug("%s failed query sm support for sendonly-fullmember (ret: %d)\n", + priv->dev->name, ret); /* * Take rtnl_lock to avoid racing with ipoib_stop() and @@ -515,22 +529,20 @@ static int ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast) rec.hop_limit = priv->broadcast->mcmember.hop_limit; /* - * Send-only IB Multicast joins do not work at the core - * IB layer yet, so we can't use them here. However, - * we are emulating an Ethernet multicast send, which - * does not require a multicast subscription and will - * still send properly. The most appropriate thing to + * Send-only IB Multicast joins work at the core IB layer but + * require specific SM support. + * We can use such joins here only if the current SM supports that feature. + * However, if not, we emulate an Ethernet multicast send, + * which does not require a multicast subscription and will + * still send properly. The most appropriate thing to * do is to create the group if it doesn't exist as that * most closely emulates the behavior, from a user space - * application perspecitive, of Ethernet multicast - * operation. For now, we do a full join, maybe later - * when the core IB layers support send only joins we - * will use them. + * application perspective, of Ethernet multicast operation. */ -#if 0 - if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) - rec.join_state = 4; -#endif + if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) && + priv->sm_fullmember_sendonly_support) + /* SM supports sendonly-fullmember, otherwise fallback to full-member */ + rec.join_state = SENDONLY_FULLMEMBER_JOIN; } spin_unlock_irq(&priv->lock); @@ -570,11 +582,13 @@ void ipoib_mcast_join_task(struct work_struct *work) return; } priv->local_lid = port_attr.lid; + netif_addr_lock(dev); - if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid, NULL)) - ipoib_warn(priv, "ib_query_gid() failed\n"); - else - memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); + if (!test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) { + netif_addr_unlock(dev); + return; + } + netif_addr_unlock(dev); spin_lock_irq(&priv->lock); if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index b809c373e40e54..1e7cbbaa15bd0c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -307,5 +307,8 @@ void ipoib_event(struct ib_event_handler *handler, queue_work(ipoib_workqueue, &priv->flush_normal); } else if (record->event == IB_EVENT_PKEY_CHANGE) { queue_work(ipoib_workqueue, &priv->flush_heavy); + } else if (record->event == IB_EVENT_GID_CHANGE && + !test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) { + queue_work(ipoib_workqueue, &priv->flush_light); } } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index fca1a882de27d1..64a35595eab837 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -68,6 +68,8 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, priv->pkey = pkey; memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN); + memcpy(&priv->local_gid, &ppriv->local_gid, sizeof(priv->local_gid)); + set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); priv->dev->broadcast[8] = pkey >> 8; priv->dev->broadcast[9] = pkey & 0xff; diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 2843f1ae75bdf5..887ebadd47745b 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -254,8 +254,8 @@ static void srpt_get_class_port_info(struct ib_dm_mad *mad) memset(cif, 0, sizeof(*cif)); cif->base_version = 1; cif->class_version = 1; - cif->resp_time_value = 20; + ib_set_cpi_resp_time(cif, 20); mad->mad_hdr.status = 0; } diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index 5bac28a3944e2d..7c197d1a123169 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -66,8 +66,6 @@ source "drivers/staging/nvec/Kconfig" source "drivers/staging/media/Kconfig" -source "drivers/staging/rdma/Kconfig" - source "drivers/staging/android/Kconfig" source "drivers/staging/board/Kconfig" diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index a954242b0f2c20..a470c72761422b 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -23,7 +23,6 @@ obj-$(CONFIG_FB_XGI) += xgifb/ obj-$(CONFIG_USB_EMXX) += emxx_udc/ obj-$(CONFIG_SPEAKUP) += speakup/ obj-$(CONFIG_MFD_NVEC) += nvec/ -obj-$(CONFIG_STAGING_RDMA) += rdma/ obj-$(CONFIG_ANDROID) += android/ obj-$(CONFIG_STAGING_BOARD) += board/ obj-$(CONFIG_LTE_GDM724X) += gdm724x/ diff --git a/drivers/staging/rdma/Kconfig b/drivers/staging/rdma/Kconfig deleted file mode 100644 index f1f3ecadf0fb0d..00000000000000 --- a/drivers/staging/rdma/Kconfig +++ /dev/null @@ -1,27 +0,0 @@ -menuconfig STAGING_RDMA - tristate "RDMA staging drivers" - depends on INFINIBAND - depends on PCI || BROKEN - depends on HAS_IOMEM - depends on NET - depends on INET - default n - ---help--- - This option allows you to select a number of RDMA drivers that - fall into one of two categories: deprecated drivers being held - here before finally being removed or new drivers that still need - some work before being moved to the normal RDMA driver area. - - If you wish to work on these drivers, to help improve them, or - to report problems you have with them, please use the - linux-rdma@vger.kernel.org mailing list. - - If in doubt, say N here. - - -# Please keep entries in alphabetic order -if STAGING_RDMA - -source "drivers/staging/rdma/hfi1/Kconfig" - -endif diff --git a/drivers/staging/rdma/Makefile b/drivers/staging/rdma/Makefile deleted file mode 100644 index 8c7fc1de48a7bf..00000000000000 --- a/drivers/staging/rdma/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -# Entries for RDMA_STAGING tree -obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/ diff --git a/drivers/staging/rdma/hfi1/TODO b/drivers/staging/rdma/hfi1/TODO deleted file mode 100644 index 4c6f1d7d2eaf21..00000000000000 --- a/drivers/staging/rdma/hfi1/TODO +++ /dev/null @@ -1,6 +0,0 @@ -July, 2015 - -- Remove unneeded file entries in sysfs -- Remove software processing of IB protocol and place in library for use - by qib, ipath (if still present), hfi1, and eventually soft-roce -- Replace incorrect uAPI diff --git a/drivers/staging/rdma/hfi1/diag.c b/drivers/staging/rdma/hfi1/diag.c deleted file mode 100644 index bb2409ad891a75..00000000000000 --- a/drivers/staging/rdma/hfi1/diag.c +++ /dev/null @@ -1,1925 +0,0 @@ -/* - * Copyright(c) 2015, 2016 Intel Corporation. - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * BSD LICENSE - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -/* - * This file contains support for diagnostic functions. It is accessed by - * opening the hfi1_diag device, normally minor number 129. Diagnostic use - * of the chip may render the chip or board unusable until the driver - * is unloaded, or in some cases, until the system is rebooted. - * - * Accesses to the chip through this interface are not similar to going - * through the /sys/bus/pci resource mmap interface. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "hfi.h" -#include "device.h" -#include "common.h" -#include "verbs_txreq.h" -#include "trace.h" - -#undef pr_fmt -#define pr_fmt(fmt) DRIVER_NAME ": " fmt -#define snoop_dbg(fmt, ...) \ - hfi1_cdbg(SNOOP, fmt, ##__VA_ARGS__) - -/* Snoop option mask */ -#define SNOOP_DROP_SEND BIT(0) -#define SNOOP_USE_METADATA BIT(1) -#define SNOOP_SET_VL0TOVL15 BIT(2) - -static u8 snoop_flags; - -/* - * Extract packet length from LRH header. - * This is in Dwords so multiply by 4 to get size in bytes - */ -#define HFI1_GET_PKT_LEN(x) (((be16_to_cpu((x)->lrh[2]) & 0xFFF)) << 2) - -enum hfi1_filter_status { - HFI1_FILTER_HIT, - HFI1_FILTER_ERR, - HFI1_FILTER_MISS -}; - -/* snoop processing functions */ -rhf_rcv_function_ptr snoop_rhf_rcv_functions[8] = { - [RHF_RCV_TYPE_EXPECTED] = snoop_recv_handler, - [RHF_RCV_TYPE_EAGER] = snoop_recv_handler, - [RHF_RCV_TYPE_IB] = snoop_recv_handler, - [RHF_RCV_TYPE_ERROR] = snoop_recv_handler, - [RHF_RCV_TYPE_BYPASS] = snoop_recv_handler, - [RHF_RCV_TYPE_INVALID5] = process_receive_invalid, - [RHF_RCV_TYPE_INVALID6] = process_receive_invalid, - [RHF_RCV_TYPE_INVALID7] = process_receive_invalid -}; - -/* Snoop packet structure */ -struct snoop_packet { - struct list_head list; - u32 total_len; - u8 data[]; -}; - -/* Do not make these an enum or it will blow up the capture_md */ -#define PKT_DIR_EGRESS 0x0 -#define PKT_DIR_INGRESS 0x1 - -/* Packet capture metadata returned to the user with the packet. */ -struct capture_md { - u8 port; - u8 dir; - u8 reserved[6]; - union { - u64 pbc; - u64 rhf; - } u; -}; - -static atomic_t diagpkt_count = ATOMIC_INIT(0); -static struct cdev diagpkt_cdev; -static struct device *diagpkt_device; - -static ssize_t diagpkt_write(struct file *fp, const char __user *data, - size_t count, loff_t *off); - -static const struct file_operations diagpkt_file_ops = { - .owner = THIS_MODULE, - .write = diagpkt_write, - .llseek = noop_llseek, -}; - -/* - * This is used for communication with user space for snoop extended IOCTLs - */ -struct hfi1_link_info { - __be64 node_guid; - u8 port_mode; - u8 port_state; - u16 link_speed_active; - u16 link_width_active; - u16 vl15_init; - u8 port_number; - /* - * Add padding to make this a full IB SMP payload. Note: changing the - * size of this structure will make the IOCTLs created with _IOWR - * change. - * Be sure to run tests on all IOCTLs when making changes to this - * structure. - */ - u8 res[47]; -}; - -/* - * This starts our ioctl sequence numbers *way* off from the ones - * defined in ib_core. - */ -#define SNOOP_CAPTURE_VERSION 0x1 - -#define IB_IOCTL_MAGIC 0x1b /* See Documentation/ioctl-number.txt */ -#define HFI1_SNOOP_IOC_MAGIC IB_IOCTL_MAGIC -#define HFI1_SNOOP_IOC_BASE_SEQ 0x80 - -#define HFI1_SNOOP_IOCGETLINKSTATE \ - _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ) -#define HFI1_SNOOP_IOCSETLINKSTATE \ - _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 1) -#define HFI1_SNOOP_IOCCLEARQUEUE \ - _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 2) -#define HFI1_SNOOP_IOCCLEARFILTER \ - _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 3) -#define HFI1_SNOOP_IOCSETFILTER \ - _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 4) -#define HFI1_SNOOP_IOCGETVERSION \ - _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 5) -#define HFI1_SNOOP_IOCSET_OPTS \ - _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 6) - -/* - * These offsets +6/+7 could change, but these are already known and used - * IOCTL numbers so don't change them without a good reason. - */ -#define HFI1_SNOOP_IOCGETLINKSTATE_EXTRA \ - _IOWR(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 6, \ - struct hfi1_link_info) -#define HFI1_SNOOP_IOCSETLINKSTATE_EXTRA \ - _IOWR(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ + 7, \ - struct hfi1_link_info) - -static int hfi1_snoop_open(struct inode *in, struct file *fp); -static ssize_t hfi1_snoop_read(struct file *fp, char __user *data, - size_t pkt_len, loff_t *off); -static ssize_t hfi1_snoop_write(struct file *fp, const char __user *data, - size_t count, loff_t *off); -static long hfi1_ioctl(struct file *fp, unsigned int cmd, unsigned long arg); -static unsigned int hfi1_snoop_poll(struct file *fp, - struct poll_table_struct *wait); -static int hfi1_snoop_release(struct inode *in, struct file *fp); - -struct hfi1_packet_filter_command { - int opcode; - int length; - void *value_ptr; -}; - -/* Can't re-use PKT_DIR_*GRESS here because 0 means no packets for this */ -#define HFI1_SNOOP_INGRESS 0x1 -#define HFI1_SNOOP_EGRESS 0x2 - -enum hfi1_packet_filter_opcodes { - FILTER_BY_LID, - FILTER_BY_DLID, - FILTER_BY_MAD_MGMT_CLASS, - FILTER_BY_QP_NUMBER, - FILTER_BY_PKT_TYPE, - FILTER_BY_SERVICE_LEVEL, - FILTER_BY_PKEY, - FILTER_BY_DIRECTION, -}; - -static const struct file_operations snoop_file_ops = { - .owner = THIS_MODULE, - .open = hfi1_snoop_open, - .read = hfi1_snoop_read, - .unlocked_ioctl = hfi1_ioctl, - .poll = hfi1_snoop_poll, - .write = hfi1_snoop_write, - .release = hfi1_snoop_release -}; - -struct hfi1_filter_array { - int (*filter)(void *, void *, void *); -}; - -static int hfi1_filter_lid(void *ibhdr, void *packet_data, void *value); -static int hfi1_filter_dlid(void *ibhdr, void *packet_data, void *value); -static int hfi1_filter_mad_mgmt_class(void *ibhdr, void *packet_data, - void *value); -static int hfi1_filter_qp_number(void *ibhdr, void *packet_data, void *value); -static int hfi1_filter_ibpacket_type(void *ibhdr, void *packet_data, - void *value); -static int hfi1_filter_ib_service_level(void *ibhdr, void *packet_data, - void *value); -static int hfi1_filter_ib_pkey(void *ibhdr, void *packet_data, void *value); -static int hfi1_filter_direction(void *ibhdr, void *packet_data, void *value); - -static const struct hfi1_filter_array hfi1_filters[] = { - { hfi1_filter_lid }, - { hfi1_filter_dlid }, - { hfi1_filter_mad_mgmt_class }, - { hfi1_filter_qp_number }, - { hfi1_filter_ibpacket_type }, - { hfi1_filter_ib_service_level }, - { hfi1_filter_ib_pkey }, - { hfi1_filter_direction }, -}; - -#define HFI1_MAX_FILTERS ARRAY_SIZE(hfi1_filters) -#define HFI1_DIAG_MINOR_BASE 129 - -static int hfi1_snoop_add(struct hfi1_devdata *dd, const char *name); - -int hfi1_diag_add(struct hfi1_devdata *dd) -{ - char name[16]; - int ret = 0; - - snprintf(name, sizeof(name), "%s_diagpkt%d", class_name(), - dd->unit); - /* - * Do this for each device as opposed to the normal diagpkt - * interface which is one per host - */ - ret = hfi1_snoop_add(dd, name); - if (ret) - dd_dev_err(dd, "Unable to init snoop/capture device"); - - snprintf(name, sizeof(name), "%s_diagpkt", class_name()); - if (atomic_inc_return(&diagpkt_count) == 1) { - ret = hfi1_cdev_init(HFI1_DIAGPKT_MINOR, name, - &diagpkt_file_ops, &diagpkt_cdev, - &diagpkt_device, false); - } - - return ret; -} - -/* this must be called w/ dd->snoop_in_lock held */ -static void drain_snoop_list(struct list_head *queue) -{ - struct list_head *pos, *q; - struct snoop_packet *packet; - - list_for_each_safe(pos, q, queue) { - packet = list_entry(pos, struct snoop_packet, list); - list_del(pos); - kfree(packet); - } -} - -static void hfi1_snoop_remove(struct hfi1_devdata *dd) -{ - unsigned long flags = 0; - - spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags); - drain_snoop_list(&dd->hfi1_snoop.queue); - hfi1_cdev_cleanup(&dd->hfi1_snoop.cdev, &dd->hfi1_snoop.class_dev); - spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags); -} - -void hfi1_diag_remove(struct hfi1_devdata *dd) -{ - hfi1_snoop_remove(dd); - if (atomic_dec_and_test(&diagpkt_count)) - hfi1_cdev_cleanup(&diagpkt_cdev, &diagpkt_device); - hfi1_cdev_cleanup(&dd->diag_cdev, &dd->diag_device); -} - -/* - * Allocated structure shared between the credit return mechanism and - * diagpkt_send(). - */ -struct diagpkt_wait { - struct completion credits_returned; - int code; - atomic_t count; -}; - -/* - * When each side is finished with the structure, they call this. - * The last user frees the structure. - */ -static void put_diagpkt_wait(struct diagpkt_wait *wait) -{ - if (atomic_dec_and_test(&wait->count)) - kfree(wait); -} - -/* - * Callback from the credit return code. Set the complete, which - * will let diapkt_send() continue. - */ -static void diagpkt_complete(void *arg, int code) -{ - struct diagpkt_wait *wait = (struct diagpkt_wait *)arg; - - wait->code = code; - complete(&wait->credits_returned); - put_diagpkt_wait(wait); /* finished with the structure */ -} - -/** - * diagpkt_send - send a packet - * @dp: diag packet descriptor - */ -static ssize_t diagpkt_send(struct diag_pkt *dp) -{ - struct hfi1_devdata *dd; - struct send_context *sc; - struct pio_buf *pbuf; - u32 *tmpbuf = NULL; - ssize_t ret = 0; - u32 pkt_len, total_len; - pio_release_cb credit_cb = NULL; - void *credit_arg = NULL; - struct diagpkt_wait *wait = NULL; - int trycount = 0; - - dd = hfi1_lookup(dp->unit); - if (!dd || !(dd->flags & HFI1_PRESENT) || !dd->kregbase) { - ret = -ENODEV; - goto bail; - } - if (!(dd->flags & HFI1_INITTED)) { - /* no hardware, freeze, etc. */ - ret = -ENODEV; - goto bail; - } - - if (dp->version != _DIAG_PKT_VERS) { - dd_dev_err(dd, "Invalid version %u for diagpkt_write\n", - dp->version); - ret = -EINVAL; - goto bail; - } - - /* send count must be an exact number of dwords */ - if (dp->len & 3) { - ret = -EINVAL; - goto bail; - } - - /* there is only port 1 */ - if (dp->port != 1) { - ret = -EINVAL; - goto bail; - } - - /* need a valid context */ - if (dp->sw_index >= dd->num_send_contexts) { - ret = -EINVAL; - goto bail; - } - /* can only use kernel contexts */ - if (dd->send_contexts[dp->sw_index].type != SC_KERNEL && - dd->send_contexts[dp->sw_index].type != SC_VL15) { - ret = -EINVAL; - goto bail; - } - /* must be allocated */ - sc = dd->send_contexts[dp->sw_index].sc; - if (!sc) { - ret = -EINVAL; - goto bail; - } - /* must be enabled */ - if (!(sc->flags & SCF_ENABLED)) { - ret = -EINVAL; - goto bail; - } - - /* allocate a buffer and copy the data in */ - tmpbuf = vmalloc(dp->len); - if (!tmpbuf) { - ret = -ENOMEM; - goto bail; - } - - if (copy_from_user(tmpbuf, - (const void __user *)(unsigned long)dp->data, - dp->len)) { - ret = -EFAULT; - goto bail; - } - - /* - * pkt_len is how much data we have to write, includes header and data. - * total_len is length of the packet in Dwords plus the PBC should not - * include the CRC. - */ - pkt_len = dp->len >> 2; - total_len = pkt_len + 2; /* PBC + packet */ - - /* if 0, fill in a default */ - if (dp->pbc == 0) { - struct hfi1_pportdata *ppd = dd->pport; - - hfi1_cdbg(PKT, "Generating PBC"); - dp->pbc = create_pbc(ppd, 0, 0, 0, total_len); - } else { - hfi1_cdbg(PKT, "Using passed in PBC"); - } - - hfi1_cdbg(PKT, "Egress PBC content is 0x%llx", dp->pbc); - - /* - * The caller wants to wait until the packet is sent and to - * check for errors. The best we can do is wait until - * the buffer credits are returned and check if any packet - * error has occurred. If there are any late errors, this - * could miss it. If there are other senders who generate - * an error, this may find it. However, in general, it - * should catch most. - */ - if (dp->flags & F_DIAGPKT_WAIT) { - /* always force a credit return */ - dp->pbc |= PBC_CREDIT_RETURN; - /* turn on credit return interrupts */ - sc_add_credit_return_intr(sc); - wait = kmalloc(sizeof(*wait), GFP_KERNEL); - if (!wait) { - ret = -ENOMEM; - goto bail; - } - init_completion(&wait->credits_returned); - atomic_set(&wait->count, 2); - wait->code = PRC_OK; - - credit_cb = diagpkt_complete; - credit_arg = wait; - } - -retry: - pbuf = sc_buffer_alloc(sc, total_len, credit_cb, credit_arg); - if (!pbuf) { - if (trycount == 0) { - /* force a credit return and try again */ - sc_return_credits(sc); - trycount = 1; - goto retry; - } - /* - * No send buffer means no credit callback. Undo - * the wait set-up that was done above. We free wait - * because the callback will never be called. - */ - if (dp->flags & F_DIAGPKT_WAIT) { - sc_del_credit_return_intr(sc); - kfree(wait); - wait = NULL; - } - ret = -ENOSPC; - goto bail; - } - - pio_copy(dd, pbuf, dp->pbc, tmpbuf, pkt_len); - /* no flush needed as the HW knows the packet size */ - - ret = sizeof(*dp); - - if (dp->flags & F_DIAGPKT_WAIT) { - /* wait for credit return */ - ret = wait_for_completion_interruptible( - &wait->credits_returned); - /* - * If the wait returns an error, the wait was interrupted, - * e.g. with a ^C in the user program. The callback is - * still pending. This is OK as the wait structure is - * kmalloc'ed and the structure will free itself when - * all users are done with it. - * - * A context disable occurs on a send context restart, so - * include that in the list of errors below to check for. - * NOTE: PRC_FILL_ERR is at best informational and cannot - * be depended on. - */ - if (!ret && (((wait->code & PRC_STATUS_ERR) || - (wait->code & PRC_FILL_ERR) || - (wait->code & PRC_SC_DISABLE)))) - ret = -EIO; - - put_diagpkt_wait(wait); /* finished with the structure */ - sc_del_credit_return_intr(sc); - } - -bail: - vfree(tmpbuf); - return ret; -} - -static ssize_t diagpkt_write(struct file *fp, const char __user *data, - size_t count, loff_t *off) -{ - struct hfi1_devdata *dd; - struct send_context *sc; - u8 vl; - - struct diag_pkt dp; - - if (count != sizeof(dp)) - return -EINVAL; - - if (copy_from_user(&dp, data, sizeof(dp))) - return -EFAULT; - - /* - * The Send Context is derived from the PbcVL value - * if PBC is populated - */ - if (dp.pbc) { - dd = hfi1_lookup(dp.unit); - if (!dd) - return -ENODEV; - vl = (dp.pbc >> PBC_VL_SHIFT) & PBC_VL_MASK; - sc = dd->vld[vl].sc; - if (sc) { - dp.sw_index = sc->sw_index; - hfi1_cdbg( - PKT, - "Packet sent over VL %d via Send Context %u(%u)", - vl, sc->sw_index, sc->hw_context); - } - } - - return diagpkt_send(&dp); -} - -static int hfi1_snoop_add(struct hfi1_devdata *dd, const char *name) -{ - int ret = 0; - - dd->hfi1_snoop.mode_flag = 0; - spin_lock_init(&dd->hfi1_snoop.snoop_lock); - INIT_LIST_HEAD(&dd->hfi1_snoop.queue); - init_waitqueue_head(&dd->hfi1_snoop.waitq); - - ret = hfi1_cdev_init(HFI1_SNOOP_CAPTURE_BASE + dd->unit, name, - &snoop_file_ops, - &dd->hfi1_snoop.cdev, &dd->hfi1_snoop.class_dev, - false); - - if (ret) { - dd_dev_err(dd, "Couldn't create %s device: %d", name, ret); - hfi1_cdev_cleanup(&dd->hfi1_snoop.cdev, - &dd->hfi1_snoop.class_dev); - } - - return ret; -} - -static struct hfi1_devdata *hfi1_dd_from_sc_inode(struct inode *in) -{ - int unit = iminor(in) - HFI1_SNOOP_CAPTURE_BASE; - struct hfi1_devdata *dd; - - dd = hfi1_lookup(unit); - return dd; -} - -/* clear or restore send context integrity checks */ -static void adjust_integrity_checks(struct hfi1_devdata *dd) -{ - struct send_context *sc; - unsigned long sc_flags; - int i; - - spin_lock_irqsave(&dd->sc_lock, sc_flags); - for (i = 0; i < dd->num_send_contexts; i++) { - int enable; - - sc = dd->send_contexts[i].sc; - - if (!sc) - continue; /* not allocated */ - - enable = likely(!HFI1_CAP_IS_KSET(NO_INTEGRITY)) && - dd->hfi1_snoop.mode_flag != HFI1_PORT_SNOOP_MODE; - - set_pio_integrity(sc); - - if (enable) /* take HFI_CAP_* flags into account */ - hfi1_init_ctxt(sc); - } - spin_unlock_irqrestore(&dd->sc_lock, sc_flags); -} - -static int hfi1_snoop_open(struct inode *in, struct file *fp) -{ - int ret; - int mode_flag = 0; - unsigned long flags = 0; - struct hfi1_devdata *dd; - struct list_head *queue; - - mutex_lock(&hfi1_mutex); - - dd = hfi1_dd_from_sc_inode(in); - if (!dd) { - ret = -ENODEV; - goto bail; - } - - /* - * File mode determines snoop or capture. Some existing user - * applications expect the capture device to be able to be opened RDWR - * because they expect a dedicated capture device. For this reason we - * support a module param to force capture mode even if the file open - * mode matches snoop. - */ - if ((fp->f_flags & O_ACCMODE) == O_RDONLY) { - snoop_dbg("Capture Enabled"); - mode_flag = HFI1_PORT_CAPTURE_MODE; - } else if ((fp->f_flags & O_ACCMODE) == O_RDWR) { - snoop_dbg("Snoop Enabled"); - mode_flag = HFI1_PORT_SNOOP_MODE; - } else { - snoop_dbg("Invalid"); - ret = -EINVAL; - goto bail; - } - queue = &dd->hfi1_snoop.queue; - - /* - * We are not supporting snoop and capture at the same time. - */ - spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags); - if (dd->hfi1_snoop.mode_flag) { - ret = -EBUSY; - spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags); - goto bail; - } - - dd->hfi1_snoop.mode_flag = mode_flag; - drain_snoop_list(queue); - - dd->hfi1_snoop.filter_callback = NULL; - dd->hfi1_snoop.filter_value = NULL; - - /* - * Send side packet integrity checks are not helpful when snooping so - * disable and re-enable when we stop snooping. - */ - if (mode_flag == HFI1_PORT_SNOOP_MODE) { - /* clear after snoop mode is on */ - adjust_integrity_checks(dd); /* clear */ - - /* - * We also do not want to be doing the DLID LMC check for - * ingressed packets. - */ - dd->hfi1_snoop.dcc_cfg = read_csr(dd, DCC_CFG_PORT_CONFIG1); - write_csr(dd, DCC_CFG_PORT_CONFIG1, - (dd->hfi1_snoop.dcc_cfg >> 32) << 32); - } - - /* - * As soon as we set these function pointers the recv and send handlers - * are active. This is a race condition so we must make sure to drain - * the queue and init filter values above. Technically we should add - * locking here but all that will happen is on recv a packet will get - * allocated and get stuck on the snoop_lock before getting added to the - * queue. Same goes for send. - */ - dd->rhf_rcv_function_map = snoop_rhf_rcv_functions; - dd->process_pio_send = snoop_send_pio_handler; - dd->process_dma_send = snoop_send_pio_handler; - dd->pio_inline_send = snoop_inline_pio_send; - - spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags); - ret = 0; - -bail: - mutex_unlock(&hfi1_mutex); - - return ret; -} - -static int hfi1_snoop_release(struct inode *in, struct file *fp) -{ - unsigned long flags = 0; - struct hfi1_devdata *dd; - int mode_flag; - - dd = hfi1_dd_from_sc_inode(in); - if (!dd) - return -ENODEV; - - spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags); - - /* clear the snoop mode before re-adjusting send context CSRs */ - mode_flag = dd->hfi1_snoop.mode_flag; - dd->hfi1_snoop.mode_flag = 0; - - /* - * Drain the queue and clear the filters we are done with it. Don't - * forget to restore the packet integrity checks - */ - drain_snoop_list(&dd->hfi1_snoop.queue); - if (mode_flag == HFI1_PORT_SNOOP_MODE) { - /* restore after snoop mode is clear */ - adjust_integrity_checks(dd); /* restore */ - - /* - * Also should probably reset the DCC_CONFIG1 register for DLID - * checking on incoming packets again. Use the value saved when - * opening the snoop device. - */ - write_csr(dd, DCC_CFG_PORT_CONFIG1, dd->hfi1_snoop.dcc_cfg); - } - - dd->hfi1_snoop.filter_callback = NULL; - kfree(dd->hfi1_snoop.filter_value); - dd->hfi1_snoop.filter_value = NULL; - - /* - * User is done snooping and capturing, return control to the normal - * handler. Re-enable SDMA handling. - */ - dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions; - dd->process_pio_send = hfi1_verbs_send_pio; - dd->process_dma_send = hfi1_verbs_send_dma; - dd->pio_inline_send = pio_copy; - - spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags); - - snoop_dbg("snoop/capture device released"); - - return 0; -} - -static unsigned int hfi1_snoop_poll(struct file *fp, - struct poll_table_struct *wait) -{ - int ret = 0; - unsigned long flags = 0; - - struct hfi1_devdata *dd; - - dd = hfi1_dd_from_sc_inode(fp->f_inode); - if (!dd) - return -ENODEV; - - spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags); - - poll_wait(fp, &dd->hfi1_snoop.waitq, wait); - if (!list_empty(&dd->hfi1_snoop.queue)) - ret |= POLLIN | POLLRDNORM; - - spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags); - return ret; -} - -static ssize_t hfi1_snoop_write(struct file *fp, const char __user *data, - size_t count, loff_t *off) -{ - struct diag_pkt dpkt; - struct hfi1_devdata *dd; - size_t ret; - u8 byte_two, sl, sc5, sc4, vl, byte_one; - struct send_context *sc; - u32 len; - u64 pbc; - struct hfi1_ibport *ibp; - struct hfi1_pportdata *ppd; - - dd = hfi1_dd_from_sc_inode(fp->f_inode); - if (!dd) - return -ENODEV; - - ppd = dd->pport; - snoop_dbg("received %lu bytes from user", count); - - memset(&dpkt, 0, sizeof(struct diag_pkt)); - dpkt.version = _DIAG_PKT_VERS; - dpkt.unit = dd->unit; - dpkt.port = 1; - - if (likely(!(snoop_flags & SNOOP_USE_METADATA))) { - /* - * We need to generate the PBC and not let diagpkt_send do it, - * to do this we need the VL and the length in dwords. - * The VL can be determined by using the SL and looking up the - * SC. Then the SC can be converted into VL. The exception to - * this is those packets which are from an SMI queue pair. - * Since we can't detect anything about the QP here we have to - * rely on the SC. If its 0xF then we assume its SMI and - * do not look at the SL. - */ - if (copy_from_user(&byte_one, data, 1)) - return -EINVAL; - - if (copy_from_user(&byte_two, data + 1, 1)) - return -EINVAL; - - sc4 = (byte_one >> 4) & 0xf; - if (sc4 == 0xF) { - snoop_dbg("Detected VL15 packet ignoring SL in packet"); - vl = sc4; - } else { - sl = (byte_two >> 4) & 0xf; - ibp = to_iport(&dd->verbs_dev.rdi.ibdev, 1); - sc5 = ibp->sl_to_sc[sl]; - vl = sc_to_vlt(dd, sc5); - if (vl != sc4) { - snoop_dbg("VL %d does not match SC %d of packet", - vl, sc4); - return -EINVAL; - } - } - - sc = dd->vld[vl].sc; /* Look up the context based on VL */ - if (sc) { - dpkt.sw_index = sc->sw_index; - snoop_dbg("Sending on context %u(%u)", sc->sw_index, - sc->hw_context); - } else { - snoop_dbg("Could not find context for vl %d", vl); - return -EINVAL; - } - - len = (count >> 2) + 2; /* Add in PBC */ - pbc = create_pbc(ppd, 0, 0, vl, len); - } else { - if (copy_from_user(&pbc, data, sizeof(pbc))) - return -EINVAL; - vl = (pbc >> PBC_VL_SHIFT) & PBC_VL_MASK; - sc = dd->vld[vl].sc; /* Look up the context based on VL */ - if (sc) { - dpkt.sw_index = sc->sw_index; - } else { - snoop_dbg("Could not find context for vl %d", vl); - return -EINVAL; - } - data += sizeof(pbc); - count -= sizeof(pbc); - } - dpkt.len = count; - dpkt.data = (unsigned long)data; - - snoop_dbg("PBC: vl=0x%llx Length=0x%llx", - (pbc >> 12) & 0xf, - (pbc & 0xfff)); - - dpkt.pbc = pbc; - ret = diagpkt_send(&dpkt); - /* - * diagpkt_send only returns number of bytes in the diagpkt so patch - * that up here before returning. - */ - if (ret == sizeof(dpkt)) - return count; - - return ret; -} - -static ssize_t hfi1_snoop_read(struct file *fp, char __user *data, - size_t pkt_len, loff_t *off) -{ - ssize_t ret = 0; - unsigned long flags = 0; - struct snoop_packet *packet = NULL; - struct hfi1_devdata *dd; - - dd = hfi1_dd_from_sc_inode(fp->f_inode); - if (!dd) - return -ENODEV; - - spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags); - - while (list_empty(&dd->hfi1_snoop.queue)) { - spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags); - - if (fp->f_flags & O_NONBLOCK) - return -EAGAIN; - - if (wait_event_interruptible( - dd->hfi1_snoop.waitq, - !list_empty(&dd->hfi1_snoop.queue))) - return -EINTR; - - spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags); - } - - if (!list_empty(&dd->hfi1_snoop.queue)) { - packet = list_entry(dd->hfi1_snoop.queue.next, - struct snoop_packet, list); - list_del(&packet->list); - spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags); - if (pkt_len >= packet->total_len) { - if (copy_to_user(data, packet->data, - packet->total_len)) - ret = -EFAULT; - else - ret = packet->total_len; - } else { - ret = -EINVAL; - } - - kfree(packet); - } else { - spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags); - } - - return ret; -} - -/** - * hfi1_assign_snoop_link_credits -- Set up credits for VL15 and others - * @ppd : ptr to hfi1 port data - * @value : options from user space - * - * Assumes the rest of the CM credit registers are zero from a - * previous global or credit reset. - * Leave shared count at zero for both global and all vls. - * In snoop mode ideally we don't use shared credits - * Reserve 8.5k for VL15 - * If total credits less than 8.5kbytes return error. - * Divide the rest of the credits across VL0 to VL7 and if - * each of these levels has less than 34 credits (at least 2048 + 128 bytes) - * return with an error. - * The credit registers will be reset to zero on link negotiation or link up - * so this function should be activated from user space only if the port has - * gone past link negotiation and link up. - * - * Return -- 0 if successful else error condition - * - */ -static long hfi1_assign_snoop_link_credits(struct hfi1_pportdata *ppd, - int value) -{ -#define OPA_MIN_PER_VL_CREDITS 34 /* 2048 + 128 bytes */ - struct buffer_control t; - int i; - struct hfi1_devdata *dd = ppd->dd; - u16 total_credits = (value >> 16) & 0xffff; - u16 vl15_credits = dd->vl15_init / 2; - u16 per_vl_credits; - __be16 be_per_vl_credits; - - if (!(ppd->host_link_state & HLS_UP)) - goto err_exit; - if (total_credits < vl15_credits) - goto err_exit; - - per_vl_credits = (total_credits - vl15_credits) / TXE_NUM_DATA_VL; - - if (per_vl_credits < OPA_MIN_PER_VL_CREDITS) - goto err_exit; - - memset(&t, 0, sizeof(t)); - be_per_vl_credits = cpu_to_be16(per_vl_credits); - - for (i = 0; i < TXE_NUM_DATA_VL; i++) - t.vl[i].dedicated = be_per_vl_credits; - - t.vl[15].dedicated = cpu_to_be16(vl15_credits); - return set_buffer_control(ppd, &t); - -err_exit: - snoop_dbg("port_state = 0x%x, total_credits = %d, vl15_credits = %d", - ppd->host_link_state, total_credits, vl15_credits); - - return -EINVAL; -} - -static long hfi1_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) -{ - struct hfi1_devdata *dd; - void *filter_value = NULL; - long ret = 0; - int value = 0; - u8 phys_state = 0; - u8 link_state = 0; - u16 dev_state = 0; - unsigned long flags = 0; - unsigned long *argp = NULL; - struct hfi1_packet_filter_command filter_cmd = {0}; - int mode_flag = 0; - struct hfi1_pportdata *ppd = NULL; - unsigned int index; - struct hfi1_link_info link_info; - int read_cmd, write_cmd, read_ok, write_ok; - - dd = hfi1_dd_from_sc_inode(fp->f_inode); - if (!dd) - return -ENODEV; - - mode_flag = dd->hfi1_snoop.mode_flag; - read_cmd = _IOC_DIR(cmd) & _IOC_READ; - write_cmd = _IOC_DIR(cmd) & _IOC_WRITE; - write_ok = access_ok(VERIFY_WRITE, (void __user *)arg, _IOC_SIZE(cmd)); - read_ok = access_ok(VERIFY_READ, (void __user *)arg, _IOC_SIZE(cmd)); - - if ((read_cmd && !write_ok) || (write_cmd && !read_ok)) - return -EFAULT; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if ((mode_flag & HFI1_PORT_CAPTURE_MODE) && - (cmd != HFI1_SNOOP_IOCCLEARQUEUE) && - (cmd != HFI1_SNOOP_IOCCLEARFILTER) && - (cmd != HFI1_SNOOP_IOCSETFILTER)) - /* Capture devices are allowed only 3 operations - * 1.Clear capture queue - * 2.Clear capture filter - * 3.Set capture filter - * Other are invalid. - */ - return -EINVAL; - - switch (cmd) { - case HFI1_SNOOP_IOCSETLINKSTATE_EXTRA: - memset(&link_info, 0, sizeof(link_info)); - - if (copy_from_user(&link_info, - (struct hfi1_link_info __user *)arg, - sizeof(link_info))) - return -EFAULT; - - value = link_info.port_state; - index = link_info.port_number; - if (index > dd->num_pports - 1) - return -EINVAL; - - ppd = &dd->pport[index]; - if (!ppd) - return -EINVAL; - - /* What we want to transition to */ - phys_state = (value >> 4) & 0xF; - link_state = value & 0xF; - snoop_dbg("Setting link state 0x%x", value); - - switch (link_state) { - case IB_PORT_NOP: - if (phys_state == 0) - break; - /* fall through */ - case IB_PORT_DOWN: - switch (phys_state) { - case 0: - dev_state = HLS_DN_DOWNDEF; - break; - case 2: - dev_state = HLS_DN_POLL; - break; - case 3: - dev_state = HLS_DN_DISABLE; - break; - default: - return -EINVAL; - } - ret = set_link_state(ppd, dev_state); - break; - case IB_PORT_ARMED: - ret = set_link_state(ppd, HLS_UP_ARMED); - if (!ret) - send_idle_sma(dd, SMA_IDLE_ARM); - break; - case IB_PORT_ACTIVE: - ret = set_link_state(ppd, HLS_UP_ACTIVE); - if (!ret) - send_idle_sma(dd, SMA_IDLE_ACTIVE); - break; - default: - return -EINVAL; - } - - if (ret) - break; - /* fall through */ - case HFI1_SNOOP_IOCGETLINKSTATE: - case HFI1_SNOOP_IOCGETLINKSTATE_EXTRA: - if (cmd == HFI1_SNOOP_IOCGETLINKSTATE_EXTRA) { - memset(&link_info, 0, sizeof(link_info)); - if (copy_from_user(&link_info, - (struct hfi1_link_info __user *)arg, - sizeof(link_info))) - return -EFAULT; - index = link_info.port_number; - } else { - ret = __get_user(index, (int __user *)arg); - if (ret != 0) - break; - } - - if (index > dd->num_pports - 1) - return -EINVAL; - - ppd = &dd->pport[index]; - if (!ppd) - return -EINVAL; - - value = hfi1_ibphys_portstate(ppd); - value <<= 4; - value |= driver_lstate(ppd); - - snoop_dbg("Link port | Link State: %d", value); - - if ((cmd == HFI1_SNOOP_IOCGETLINKSTATE_EXTRA) || - (cmd == HFI1_SNOOP_IOCSETLINKSTATE_EXTRA)) { - link_info.port_state = value; - link_info.node_guid = cpu_to_be64(ppd->guid); - link_info.link_speed_active = - ppd->link_speed_active; - link_info.link_width_active = - ppd->link_width_active; - if (copy_to_user((struct hfi1_link_info __user *)arg, - &link_info, sizeof(link_info))) - return -EFAULT; - } else { - ret = __put_user(value, (int __user *)arg); - } - break; - - case HFI1_SNOOP_IOCCLEARQUEUE: - snoop_dbg("Clearing snoop queue"); - spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags); - drain_snoop_list(&dd->hfi1_snoop.queue); - spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags); - break; - - case HFI1_SNOOP_IOCCLEARFILTER: - snoop_dbg("Clearing filter"); - spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags); - if (dd->hfi1_snoop.filter_callback) { - /* Drain packets first */ - drain_snoop_list(&dd->hfi1_snoop.queue); - dd->hfi1_snoop.filter_callback = NULL; - } - kfree(dd->hfi1_snoop.filter_value); - dd->hfi1_snoop.filter_value = NULL; - spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags); - break; - - case HFI1_SNOOP_IOCSETFILTER: - snoop_dbg("Setting filter"); - /* just copy command structure */ - argp = (unsigned long *)arg; - if (copy_from_user(&filter_cmd, (void __user *)argp, - sizeof(filter_cmd))) - return -EFAULT; - - if (filter_cmd.opcode >= HFI1_MAX_FILTERS) { - pr_alert("Invalid opcode in request\n"); - return -EINVAL; - } - - snoop_dbg("Opcode %d Len %d Ptr %p", - filter_cmd.opcode, filter_cmd.length, - filter_cmd.value_ptr); - - filter_value = kcalloc(filter_cmd.length, sizeof(u8), - GFP_KERNEL); - if (!filter_value) - return -ENOMEM; - - /* copy remaining data from userspace */ - if (copy_from_user((u8 *)filter_value, - (void __user *)filter_cmd.value_ptr, - filter_cmd.length)) { - kfree(filter_value); - return -EFAULT; - } - /* Drain packets first */ - spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags); - drain_snoop_list(&dd->hfi1_snoop.queue); - dd->hfi1_snoop.filter_callback = - hfi1_filters[filter_cmd.opcode].filter; - /* just in case we see back to back sets */ - kfree(dd->hfi1_snoop.filter_value); - dd->hfi1_snoop.filter_value = filter_value; - spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags); - break; - case HFI1_SNOOP_IOCGETVERSION: - value = SNOOP_CAPTURE_VERSION; - snoop_dbg("Getting version: %d", value); - ret = __put_user(value, (int __user *)arg); - break; - case HFI1_SNOOP_IOCSET_OPTS: - snoop_flags = 0; - ret = __get_user(value, (int __user *)arg); - if (ret != 0) - break; - - snoop_dbg("Setting snoop option %d", value); - if (value & SNOOP_DROP_SEND) - snoop_flags |= SNOOP_DROP_SEND; - if (value & SNOOP_USE_METADATA) - snoop_flags |= SNOOP_USE_METADATA; - if (value & (SNOOP_SET_VL0TOVL15)) { - ppd = &dd->pport[0]; /* first port will do */ - ret = hfi1_assign_snoop_link_credits(ppd, value); - } - break; - default: - return -ENOTTY; - } - - return ret; -} - -static void snoop_list_add_tail(struct snoop_packet *packet, - struct hfi1_devdata *dd) -{ - unsigned long flags = 0; - - spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags); - if (likely((dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) || - (dd->hfi1_snoop.mode_flag & HFI1_PORT_CAPTURE_MODE))) { - list_add_tail(&packet->list, &dd->hfi1_snoop.queue); - snoop_dbg("Added packet to list"); - } - - /* - * Technically we can could have closed the snoop device while waiting - * on the above lock and it is gone now. The snoop mode_flag will - * prevent us from adding the packet to the queue though. - */ - - spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags); - wake_up_interruptible(&dd->hfi1_snoop.waitq); -} - -static inline int hfi1_filter_check(void *val, const char *msg) -{ - if (!val) { - snoop_dbg("Error invalid %s value for filter", msg); - return HFI1_FILTER_ERR; - } - return 0; -} - -static int hfi1_filter_lid(void *ibhdr, void *packet_data, void *value) -{ - struct hfi1_ib_header *hdr; - int ret; - - ret = hfi1_filter_check(ibhdr, "header"); - if (ret) - return ret; - ret = hfi1_filter_check(value, "user"); - if (ret) - return ret; - hdr = (struct hfi1_ib_header *)ibhdr; - - if (*((u16 *)value) == be16_to_cpu(hdr->lrh[3])) /* matches slid */ - return HFI1_FILTER_HIT; /* matched */ - - return HFI1_FILTER_MISS; /* Not matched */ -} - -static int hfi1_filter_dlid(void *ibhdr, void *packet_data, void *value) -{ - struct hfi1_ib_header *hdr; - int ret; - - ret = hfi1_filter_check(ibhdr, "header"); - if (ret) - return ret; - ret = hfi1_filter_check(value, "user"); - if (ret) - return ret; - - hdr = (struct hfi1_ib_header *)ibhdr; - - if (*((u16 *)value) == be16_to_cpu(hdr->lrh[1])) - return HFI1_FILTER_HIT; - - return HFI1_FILTER_MISS; -} - -/* Not valid for outgoing packets, send handler passes null for data*/ -static int hfi1_filter_mad_mgmt_class(void *ibhdr, void *packet_data, - void *value) -{ - struct hfi1_ib_header *hdr; - struct hfi1_other_headers *ohdr = NULL; - struct ib_smp *smp = NULL; - u32 qpn = 0; - int ret; - - ret = hfi1_filter_check(ibhdr, "header"); - if (ret) - return ret; - ret = hfi1_filter_check(packet_data, "packet_data"); - if (ret) - return ret; - ret = hfi1_filter_check(value, "user"); - if (ret) - return ret; - - hdr = (struct hfi1_ib_header *)ibhdr; - - /* Check for GRH */ - if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH) - ohdr = &hdr->u.oth; /* LRH + BTH + DETH */ - else - ohdr = &hdr->u.l.oth; /* LRH + GRH + BTH + DETH */ - - qpn = be32_to_cpu(ohdr->bth[1]) & 0x00FFFFFF; - if (qpn <= 1) { - smp = (struct ib_smp *)packet_data; - if (*((u8 *)value) == smp->mgmt_class) - return HFI1_FILTER_HIT; - else - return HFI1_FILTER_MISS; - } - return HFI1_FILTER_ERR; -} - -static int hfi1_filter_qp_number(void *ibhdr, void *packet_data, void *value) -{ - struct hfi1_ib_header *hdr; - struct hfi1_other_headers *ohdr = NULL; - int ret; - - ret = hfi1_filter_check(ibhdr, "header"); - if (ret) - return ret; - ret = hfi1_filter_check(value, "user"); - if (ret) - return ret; - - hdr = (struct hfi1_ib_header *)ibhdr; - - /* Check for GRH */ - if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH) - ohdr = &hdr->u.oth; /* LRH + BTH + DETH */ - else - ohdr = &hdr->u.l.oth; /* LRH + GRH + BTH + DETH */ - if (*((u32 *)value) == (be32_to_cpu(ohdr->bth[1]) & 0x00FFFFFF)) - return HFI1_FILTER_HIT; - - return HFI1_FILTER_MISS; -} - -static int hfi1_filter_ibpacket_type(void *ibhdr, void *packet_data, - void *value) -{ - u32 lnh = 0; - u8 opcode = 0; - struct hfi1_ib_header *hdr; - struct hfi1_other_headers *ohdr = NULL; - int ret; - - ret = hfi1_filter_check(ibhdr, "header"); - if (ret) - return ret; - ret = hfi1_filter_check(value, "user"); - if (ret) - return ret; - - hdr = (struct hfi1_ib_header *)ibhdr; - - lnh = (be16_to_cpu(hdr->lrh[0]) & 3); - - if (lnh == HFI1_LRH_BTH) - ohdr = &hdr->u.oth; - else if (lnh == HFI1_LRH_GRH) - ohdr = &hdr->u.l.oth; - else - return HFI1_FILTER_ERR; - - opcode = be32_to_cpu(ohdr->bth[0]) >> 24; - - if (*((u8 *)value) == ((opcode >> 5) & 0x7)) - return HFI1_FILTER_HIT; - - return HFI1_FILTER_MISS; -} - -static int hfi1_filter_ib_service_level(void *ibhdr, void *packet_data, - void *value) -{ - struct hfi1_ib_header *hdr; - int ret; - - ret = hfi1_filter_check(ibhdr, "header"); - if (ret) - return ret; - ret = hfi1_filter_check(value, "user"); - if (ret) - return ret; - - hdr = (struct hfi1_ib_header *)ibhdr; - - if ((*((u8 *)value)) == ((be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF)) - return HFI1_FILTER_HIT; - - return HFI1_FILTER_MISS; -} - -static int hfi1_filter_ib_pkey(void *ibhdr, void *packet_data, void *value) -{ - u32 lnh = 0; - struct hfi1_ib_header *hdr; - struct hfi1_other_headers *ohdr = NULL; - int ret; - - ret = hfi1_filter_check(ibhdr, "header"); - if (ret) - return ret; - ret = hfi1_filter_check(value, "user"); - if (ret) - return ret; - - hdr = (struct hfi1_ib_header *)ibhdr; - - lnh = (be16_to_cpu(hdr->lrh[0]) & 3); - if (lnh == HFI1_LRH_BTH) - ohdr = &hdr->u.oth; - else if (lnh == HFI1_LRH_GRH) - ohdr = &hdr->u.l.oth; - else - return HFI1_FILTER_ERR; - - /* P_key is 16-bit entity, however top most bit indicates - * type of membership. 0 for limited and 1 for Full. - * Limited members cannot accept information from other - * Limited members, but communication is allowed between - * every other combination of membership. - * Hence we'll omit comparing top-most bit while filtering - */ - - if ((*(u16 *)value & 0x7FFF) == - ((be32_to_cpu(ohdr->bth[0])) & 0x7FFF)) - return HFI1_FILTER_HIT; - - return HFI1_FILTER_MISS; -} - -/* - * If packet_data is NULL then this is coming from one of the send functions. - * Thus we know if its an ingressed or egressed packet. - */ -static int hfi1_filter_direction(void *ibhdr, void *packet_data, void *value) -{ - u8 user_dir = *(u8 *)value; - int ret; - - ret = hfi1_filter_check(value, "user"); - if (ret) - return ret; - - if (packet_data) { - /* Incoming packet */ - if (user_dir & HFI1_SNOOP_INGRESS) - return HFI1_FILTER_HIT; - } else { - /* Outgoing packet */ - if (user_dir & HFI1_SNOOP_EGRESS) - return HFI1_FILTER_HIT; - } - - return HFI1_FILTER_MISS; -} - -/* - * Allocate a snoop packet. The structure that is stored in the ring buffer, not - * to be confused with an hfi packet type. - */ -static struct snoop_packet *allocate_snoop_packet(u32 hdr_len, - u32 data_len, - u32 md_len) -{ - struct snoop_packet *packet; - - packet = kzalloc(sizeof(*packet) + hdr_len + data_len - + md_len, - GFP_ATOMIC | __GFP_NOWARN); - if (likely(packet)) - INIT_LIST_HEAD(&packet->list); - - return packet; -} - -/* - * Instead of having snoop and capture code intermixed with the recv functions, - * both the interrupt handler and hfi1_ib_rcv() we are going to hijack the call - * and land in here for snoop/capture but if not enabled the call will go - * through as before. This gives us a single point to constrain all of the snoop - * snoop recv logic. There is nothing special that needs to happen for bypass - * packets. This routine should not try to look into the packet. It just copied - * it. There is no guarantee for filters when it comes to bypass packets as - * there is no specific support. Bottom line is this routine does now even know - * what a bypass packet is. - */ -int snoop_recv_handler(struct hfi1_packet *packet) -{ - struct hfi1_pportdata *ppd = packet->rcd->ppd; - struct hfi1_ib_header *hdr = packet->hdr; - int header_size = packet->hlen; - void *data = packet->ebuf; - u32 tlen = packet->tlen; - struct snoop_packet *s_packet = NULL; - int ret; - int snoop_mode = 0; - u32 md_len = 0; - struct capture_md md; - - snoop_dbg("PACKET IN: hdr size %d tlen %d data %p", header_size, tlen, - data); - - trace_snoop_capture(ppd->dd, header_size, hdr, tlen - header_size, - data); - - if (!ppd->dd->hfi1_snoop.filter_callback) { - snoop_dbg("filter not set"); - ret = HFI1_FILTER_HIT; - } else { - ret = ppd->dd->hfi1_snoop.filter_callback(hdr, data, - ppd->dd->hfi1_snoop.filter_value); - } - - switch (ret) { - case HFI1_FILTER_ERR: - snoop_dbg("Error in filter call"); - break; - case HFI1_FILTER_MISS: - snoop_dbg("Filter Miss"); - break; - case HFI1_FILTER_HIT: - - if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) - snoop_mode = 1; - if ((snoop_mode == 0) || - unlikely(snoop_flags & SNOOP_USE_METADATA)) - md_len = sizeof(struct capture_md); - - s_packet = allocate_snoop_packet(header_size, - tlen - header_size, - md_len); - - if (unlikely(!s_packet)) { - dd_dev_warn_ratelimited(ppd->dd, "Unable to allocate snoop/capture packet\n"); - break; - } - - if (md_len > 0) { - memset(&md, 0, sizeof(struct capture_md)); - md.port = 1; - md.dir = PKT_DIR_INGRESS; - md.u.rhf = packet->rhf; - memcpy(s_packet->data, &md, md_len); - } - - /* We should always have a header */ - if (hdr) { - memcpy(s_packet->data + md_len, hdr, header_size); - } else { - dd_dev_err(ppd->dd, "Unable to copy header to snoop/capture packet\n"); - kfree(s_packet); - break; - } - - /* - * Packets with no data are possible. If there is no data needed - * to take care of the last 4 bytes which are normally included - * with data buffers and are included in tlen. Since we kzalloc - * the buffer we do not need to set any values but if we decide - * not to use kzalloc we should zero them. - */ - if (data) - memcpy(s_packet->data + header_size + md_len, data, - tlen - header_size); - - s_packet->total_len = tlen + md_len; - snoop_list_add_tail(s_packet, ppd->dd); - - /* - * If we are snooping the packet not capturing then throw away - * after adding to the list. - */ - snoop_dbg("Capturing packet"); - if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) { - snoop_dbg("Throwing packet away"); - /* - * If we are dropping the packet we still may need to - * handle the case where error flags are set, this is - * normally done by the type specific handler but that - * won't be called in this case. - */ - if (unlikely(rhf_err_flags(packet->rhf))) - handle_eflags(packet); - - /* throw the packet on the floor */ - return RHF_RCV_CONTINUE; - } - break; - default: - break; - } - - /* - * We do not care what type of packet came in here - just pass it off - * to the normal handler. - */ - return ppd->dd->normal_rhf_rcv_functions[rhf_rcv_type(packet->rhf)] - (packet); -} - -/* - * Handle snooping and capturing packets when sdma is being used. - */ -int snoop_send_dma_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps, - u64 pbc) -{ - pr_alert("Snooping/Capture of Send DMA Packets Is Not Supported!\n"); - snoop_dbg("Unsupported Operation"); - return hfi1_verbs_send_dma(qp, ps, 0); -} - -/* - * Handle snooping and capturing packets when pio is being used. Does not handle - * bypass packets. The only way to send a bypass packet currently is to use the - * diagpkt interface. When that interface is enable snoop/capture is not. - */ -int snoop_send_pio_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps, - u64 pbc) -{ - u32 hdrwords = qp->s_hdrwords; - struct rvt_sge_state *ss = qp->s_cur_sge; - u32 len = qp->s_cur_size; - u32 dwords = (len + 3) >> 2; - u32 plen = hdrwords + dwords + 2; /* includes pbc */ - struct hfi1_pportdata *ppd = ps->ppd; - struct snoop_packet *s_packet = NULL; - u32 *hdr = (u32 *)&ps->s_txreq->phdr.hdr; - u32 length = 0; - struct rvt_sge_state temp_ss; - void *data = NULL; - void *data_start = NULL; - int ret; - int snoop_mode = 0; - int md_len = 0; - struct capture_md md; - u32 vl; - u32 hdr_len = hdrwords << 2; - u32 tlen = HFI1_GET_PKT_LEN(&ps->s_txreq->phdr.hdr); - - md.u.pbc = 0; - - snoop_dbg("PACKET OUT: hdrword %u len %u plen %u dwords %u tlen %u", - hdrwords, len, plen, dwords, tlen); - if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) - snoop_mode = 1; - if ((snoop_mode == 0) || - unlikely(snoop_flags & SNOOP_USE_METADATA)) - md_len = sizeof(struct capture_md); - - /* not using ss->total_len as arg 2 b/c that does not count CRC */ - s_packet = allocate_snoop_packet(hdr_len, tlen - hdr_len, md_len); - - if (unlikely(!s_packet)) { - dd_dev_warn_ratelimited(ppd->dd, "Unable to allocate snoop/capture packet\n"); - goto out; - } - - s_packet->total_len = tlen + md_len; - - if (md_len > 0) { - memset(&md, 0, sizeof(struct capture_md)); - md.port = 1; - md.dir = PKT_DIR_EGRESS; - if (likely(pbc == 0)) { - vl = be16_to_cpu(ps->s_txreq->phdr.hdr.lrh[0]) >> 12; - md.u.pbc = create_pbc(ppd, 0, qp->s_srate, vl, plen); - } else { - md.u.pbc = 0; - } - memcpy(s_packet->data, &md, md_len); - } else { - md.u.pbc = pbc; - } - - /* Copy header */ - if (likely(hdr)) { - memcpy(s_packet->data + md_len, hdr, hdr_len); - } else { - dd_dev_err(ppd->dd, - "Unable to copy header to snoop/capture packet\n"); - kfree(s_packet); - goto out; - } - - if (ss) { - data = s_packet->data + hdr_len + md_len; - data_start = data; - - /* - * Copy SGE State - * The update_sge() function below will not modify the - * individual SGEs in the array. It will make a copy each time - * and operate on that. So we only need to copy this instance - * and it won't impact PIO. - */ - temp_ss = *ss; - length = len; - - snoop_dbg("Need to copy %d bytes", length); - while (length) { - void *addr = temp_ss.sge.vaddr; - u32 slen = temp_ss.sge.length; - - if (slen > length) { - slen = length; - snoop_dbg("slen %d > len %d", slen, length); - } - snoop_dbg("copy %d to %p", slen, addr); - memcpy(data, addr, slen); - update_sge(&temp_ss, slen); - length -= slen; - data += slen; - snoop_dbg("data is now %p bytes left %d", data, length); - } - snoop_dbg("Completed SGE copy"); - } - - /* - * Why do the filter check down here? Because the event tracing has its - * own filtering and we need to have the walked the SGE list. - */ - if (!ppd->dd->hfi1_snoop.filter_callback) { - snoop_dbg("filter not set\n"); - ret = HFI1_FILTER_HIT; - } else { - ret = ppd->dd->hfi1_snoop.filter_callback( - &ps->s_txreq->phdr.hdr, - NULL, - ppd->dd->hfi1_snoop.filter_value); - } - - switch (ret) { - case HFI1_FILTER_ERR: - snoop_dbg("Error in filter call"); - /* fall through */ - case HFI1_FILTER_MISS: - snoop_dbg("Filter Miss"); - kfree(s_packet); - break; - case HFI1_FILTER_HIT: - snoop_dbg("Capturing packet"); - snoop_list_add_tail(s_packet, ppd->dd); - - if (unlikely((snoop_flags & SNOOP_DROP_SEND) && - (ppd->dd->hfi1_snoop.mode_flag & - HFI1_PORT_SNOOP_MODE))) { - unsigned long flags; - - snoop_dbg("Dropping packet"); - if (qp->s_wqe) { - spin_lock_irqsave(&qp->s_lock, flags); - hfi1_send_complete( - qp, - qp->s_wqe, - IB_WC_SUCCESS); - spin_unlock_irqrestore(&qp->s_lock, flags); - } else if (qp->ibqp.qp_type == IB_QPT_RC) { - spin_lock_irqsave(&qp->s_lock, flags); - hfi1_rc_send_complete(qp, - &ps->s_txreq->phdr.hdr); - spin_unlock_irqrestore(&qp->s_lock, flags); - } - - /* - * If snoop is dropping the packet we need to put the - * txreq back because no one else will. - */ - hfi1_put_txreq(ps->s_txreq); - return 0; - } - break; - default: - kfree(s_packet); - break; - } -out: - return hfi1_verbs_send_pio(qp, ps, md.u.pbc); -} - -/* - * Callers of this must pass a hfi1_ib_header type for the from ptr. Currently - * this can be used anywhere, but the intention is for inline ACKs for RC and - * CCA packets. We don't restrict this usage though. - */ -void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf, - u64 pbc, const void *from, size_t count) -{ - int snoop_mode = 0; - int md_len = 0; - struct capture_md md; - struct snoop_packet *s_packet = NULL; - - /* - * count is in dwords so we need to convert to bytes. - * We also need to account for CRC which would be tacked on by hardware. - */ - int packet_len = (count << 2) + 4; - int ret; - - snoop_dbg("ACK OUT: len %d", packet_len); - - if (!dd->hfi1_snoop.filter_callback) { - snoop_dbg("filter not set"); - ret = HFI1_FILTER_HIT; - } else { - ret = dd->hfi1_snoop.filter_callback( - (struct hfi1_ib_header *)from, - NULL, - dd->hfi1_snoop.filter_value); - } - - switch (ret) { - case HFI1_FILTER_ERR: - snoop_dbg("Error in filter call"); - /* fall through */ - case HFI1_FILTER_MISS: - snoop_dbg("Filter Miss"); - break; - case HFI1_FILTER_HIT: - snoop_dbg("Capturing packet"); - if (dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) - snoop_mode = 1; - if ((snoop_mode == 0) || - unlikely(snoop_flags & SNOOP_USE_METADATA)) - md_len = sizeof(struct capture_md); - - s_packet = allocate_snoop_packet(packet_len, 0, md_len); - - if (unlikely(!s_packet)) { - dd_dev_warn_ratelimited(dd, "Unable to allocate snoop/capture packet\n"); - goto inline_pio_out; - } - - s_packet->total_len = packet_len + md_len; - - /* Fill in the metadata for the packet */ - if (md_len > 0) { - memset(&md, 0, sizeof(struct capture_md)); - md.port = 1; - md.dir = PKT_DIR_EGRESS; - md.u.pbc = pbc; - memcpy(s_packet->data, &md, md_len); - } - - /* Add the packet data which is a single buffer */ - memcpy(s_packet->data + md_len, from, packet_len); - - snoop_list_add_tail(s_packet, dd); - - if (unlikely((snoop_flags & SNOOP_DROP_SEND) && snoop_mode)) { - snoop_dbg("Dropping packet"); - return; - } - break; - default: - break; - } - -inline_pio_out: - pio_copy(dd, pbuf, pbc, from, count); -} diff --git a/drivers/staging/rdma/hfi1/eprom.c b/drivers/staging/rdma/hfi1/eprom.c deleted file mode 100644 index bd8771570f81ac..00000000000000 --- a/drivers/staging/rdma/hfi1/eprom.c +++ /dev/null @@ -1,471 +0,0 @@ -/* - * Copyright(c) 2015, 2016 Intel Corporation. - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * BSD LICENSE - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ -#include -#include "hfi.h" -#include "common.h" -#include "eprom.h" - -/* - * The EPROM is logically divided into three partitions: - * partition 0: the first 128K, visible from PCI ROM BAR - * partition 1: 4K config file (sector size) - * partition 2: the rest - */ -#define P0_SIZE (128 * 1024) -#define P1_SIZE (4 * 1024) -#define P1_START P0_SIZE -#define P2_START (P0_SIZE + P1_SIZE) - -/* erase sizes supported by the controller */ -#define SIZE_4KB (4 * 1024) -#define MASK_4KB (SIZE_4KB - 1) - -#define SIZE_32KB (32 * 1024) -#define MASK_32KB (SIZE_32KB - 1) - -#define SIZE_64KB (64 * 1024) -#define MASK_64KB (SIZE_64KB - 1) - -/* controller page size, in bytes */ -#define EP_PAGE_SIZE 256 -#define EEP_PAGE_MASK (EP_PAGE_SIZE - 1) - -/* controller commands */ -#define CMD_SHIFT 24 -#define CMD_NOP (0) -#define CMD_PAGE_PROGRAM(addr) ((0x02 << CMD_SHIFT) | addr) -#define CMD_READ_DATA(addr) ((0x03 << CMD_SHIFT) | addr) -#define CMD_READ_SR1 ((0x05 << CMD_SHIFT)) -#define CMD_WRITE_ENABLE ((0x06 << CMD_SHIFT)) -#define CMD_SECTOR_ERASE_4KB(addr) ((0x20 << CMD_SHIFT) | addr) -#define CMD_SECTOR_ERASE_32KB(addr) ((0x52 << CMD_SHIFT) | addr) -#define CMD_CHIP_ERASE ((0x60 << CMD_SHIFT)) -#define CMD_READ_MANUF_DEV_ID ((0x90 << CMD_SHIFT)) -#define CMD_RELEASE_POWERDOWN_NOID ((0xab << CMD_SHIFT)) -#define CMD_SECTOR_ERASE_64KB(addr) ((0xd8 << CMD_SHIFT) | addr) - -/* controller interface speeds */ -#define EP_SPEED_FULL 0x2 /* full speed */ - -/* controller status register 1 bits */ -#define SR1_BUSY 0x1ull /* the BUSY bit in SR1 */ - -/* sleep length while waiting for controller */ -#define WAIT_SLEEP_US 100 /* must be larger than 5 (see usage) */ -#define COUNT_DELAY_SEC(n) ((n) * (1000000 / WAIT_SLEEP_US)) - -/* GPIO pins */ -#define EPROM_WP_N BIT_ULL(14) /* EPROM write line */ - -/* - * How long to wait for the EPROM to become available, in ms. - * The spec 32 Mb EPROM takes around 40s to erase then write. - * Double it for safety. - */ -#define EPROM_TIMEOUT 80000 /* ms */ - -/* - * Turn on external enable line that allows writing on the flash. - */ -static void write_enable(struct hfi1_devdata *dd) -{ - /* raise signal */ - write_csr(dd, ASIC_GPIO_OUT, read_csr(dd, ASIC_GPIO_OUT) | EPROM_WP_N); - /* raise enable */ - write_csr(dd, ASIC_GPIO_OE, read_csr(dd, ASIC_GPIO_OE) | EPROM_WP_N); -} - -/* - * Turn off external enable line that allows writing on the flash. - */ -static void write_disable(struct hfi1_devdata *dd) -{ - /* lower signal */ - write_csr(dd, ASIC_GPIO_OUT, read_csr(dd, ASIC_GPIO_OUT) & ~EPROM_WP_N); - /* lower enable */ - write_csr(dd, ASIC_GPIO_OE, read_csr(dd, ASIC_GPIO_OE) & ~EPROM_WP_N); -} - -/* - * Wait for the device to become not busy. Must be called after all - * write or erase operations. - */ -static int wait_for_not_busy(struct hfi1_devdata *dd) -{ - unsigned long count = 0; - u64 reg; - int ret = 0; - - /* starts page mode */ - write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_SR1); - while (1) { - udelay(WAIT_SLEEP_US); - usleep_range(WAIT_SLEEP_US - 5, WAIT_SLEEP_US + 5); - count++; - reg = read_csr(dd, ASIC_EEP_DATA); - if ((reg & SR1_BUSY) == 0) - break; - /* 200s is the largest time for a 128Mb device */ - if (count > COUNT_DELAY_SEC(200)) { - dd_dev_err(dd, "waited too long for SPI FLASH busy to clear - failing\n"); - ret = -ETIMEDOUT; - break; /* break, not goto - must stop page mode */ - } - } - - /* stop page mode with a NOP */ - write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_NOP); - - return ret; -} - -/* - * Read the device ID from the SPI controller. - */ -static u32 read_device_id(struct hfi1_devdata *dd) -{ - /* read the Manufacture Device ID */ - write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_MANUF_DEV_ID); - return (u32)read_csr(dd, ASIC_EEP_DATA); -} - -/* - * Erase the whole flash. - */ -static int erase_chip(struct hfi1_devdata *dd) -{ - int ret; - - write_enable(dd); - - write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE); - write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_CHIP_ERASE); - ret = wait_for_not_busy(dd); - - write_disable(dd); - - return ret; -} - -/* - * Erase a range. - */ -static int erase_range(struct hfi1_devdata *dd, u32 start, u32 len) -{ - u32 end = start + len; - int ret = 0; - - if (end < start) - return -EINVAL; - - /* check the end points for the minimum erase */ - if ((start & MASK_4KB) || (end & MASK_4KB)) { - dd_dev_err(dd, - "%s: non-aligned range (0x%x,0x%x) for a 4KB erase\n", - __func__, start, end); - return -EINVAL; - } - - write_enable(dd); - - while (start < end) { - write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE); - /* check in order of largest to smallest */ - if (((start & MASK_64KB) == 0) && (start + SIZE_64KB <= end)) { - write_csr(dd, ASIC_EEP_ADDR_CMD, - CMD_SECTOR_ERASE_64KB(start)); - start += SIZE_64KB; - } else if (((start & MASK_32KB) == 0) && - (start + SIZE_32KB <= end)) { - write_csr(dd, ASIC_EEP_ADDR_CMD, - CMD_SECTOR_ERASE_32KB(start)); - start += SIZE_32KB; - } else { /* 4KB will work */ - write_csr(dd, ASIC_EEP_ADDR_CMD, - CMD_SECTOR_ERASE_4KB(start)); - start += SIZE_4KB; - } - ret = wait_for_not_busy(dd); - if (ret) - goto done; - } - -done: - write_disable(dd); - - return ret; -} - -/* - * Read a 256 byte (64 dword) EPROM page. - * All callers have verified the offset is at a page boundary. - */ -static void read_page(struct hfi1_devdata *dd, u32 offset, u32 *result) -{ - int i; - - write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_DATA(offset)); - for (i = 0; i < EP_PAGE_SIZE / sizeof(u32); i++) - result[i] = (u32)read_csr(dd, ASIC_EEP_DATA); - write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_NOP); /* close open page */ -} - -/* - * Read length bytes starting at offset. Copy to user address addr. - */ -static int read_length(struct hfi1_devdata *dd, u32 start, u32 len, u64 addr) -{ - u32 offset; - u32 buffer[EP_PAGE_SIZE / sizeof(u32)]; - int ret = 0; - - /* reject anything not on an EPROM page boundary */ - if ((start & EEP_PAGE_MASK) || (len & EEP_PAGE_MASK)) - return -EINVAL; - - for (offset = 0; offset < len; offset += EP_PAGE_SIZE) { - read_page(dd, start + offset, buffer); - if (copy_to_user((void __user *)(addr + offset), - buffer, EP_PAGE_SIZE)) { - ret = -EFAULT; - goto done; - } - } - -done: - return ret; -} - -/* - * Write a 256 byte (64 dword) EPROM page. - * All callers have verified the offset is at a page boundary. - */ -static int write_page(struct hfi1_devdata *dd, u32 offset, u32 *data) -{ - int i; - - write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE); - write_csr(dd, ASIC_EEP_DATA, data[0]); - write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_PAGE_PROGRAM(offset)); - for (i = 1; i < EP_PAGE_SIZE / sizeof(u32); i++) - write_csr(dd, ASIC_EEP_DATA, data[i]); - /* will close the open page */ - return wait_for_not_busy(dd); -} - -/* - * Write length bytes starting at offset. Read from user address addr. - */ -static int write_length(struct hfi1_devdata *dd, u32 start, u32 len, u64 addr) -{ - u32 offset; - u32 buffer[EP_PAGE_SIZE / sizeof(u32)]; - int ret = 0; - - /* reject anything not on an EPROM page boundary */ - if ((start & EEP_PAGE_MASK) || (len & EEP_PAGE_MASK)) - return -EINVAL; - - write_enable(dd); - - for (offset = 0; offset < len; offset += EP_PAGE_SIZE) { - if (copy_from_user(buffer, (void __user *)(addr + offset), - EP_PAGE_SIZE)) { - ret = -EFAULT; - goto done; - } - ret = write_page(dd, start + offset, buffer); - if (ret) - goto done; - } - -done: - write_disable(dd); - return ret; -} - -/* convert an range composite to a length, in bytes */ -static inline u32 extract_rlen(u32 composite) -{ - return (composite & 0xffff) * EP_PAGE_SIZE; -} - -/* convert an range composite to a start, in bytes */ -static inline u32 extract_rstart(u32 composite) -{ - return (composite >> 16) * EP_PAGE_SIZE; -} - -/* - * Perform the given operation on the EPROM. Called from user space. The - * user credentials have already been checked. - * - * Return 0 on success, -ERRNO on error - */ -int handle_eprom_command(struct file *fp, const struct hfi1_cmd *cmd) -{ - struct hfi1_devdata *dd; - u32 dev_id; - u32 rlen; /* range length */ - u32 rstart; /* range start */ - int i_minor; - int ret = 0; - - /* - * Map the device file to device data using the relative minor. - * The device file minor number is the unit number + 1. 0 is - * the generic device file - reject it. - */ - i_minor = iminor(file_inode(fp)) - HFI1_USER_MINOR_BASE; - if (i_minor <= 0) - return -EINVAL; - dd = hfi1_lookup(i_minor - 1); - if (!dd) { - pr_err("%s: cannot find unit %d!\n", __func__, i_minor); - return -EINVAL; - } - - /* some devices do not have an EPROM */ - if (!dd->eprom_available) - return -EOPNOTSUPP; - - ret = acquire_chip_resource(dd, CR_EPROM, EPROM_TIMEOUT); - if (ret) { - dd_dev_err(dd, "%s: unable to acquire EPROM resource\n", - __func__); - goto done_asic; - } - - dd_dev_info(dd, "%s: cmd: type %d, len 0x%x, addr 0x%016llx\n", - __func__, cmd->type, cmd->len, cmd->addr); - - switch (cmd->type) { - case HFI1_CMD_EP_INFO: - if (cmd->len != sizeof(u32)) { - ret = -ERANGE; - break; - } - dev_id = read_device_id(dd); - /* addr points to a u32 user buffer */ - if (copy_to_user((void __user *)cmd->addr, &dev_id, - sizeof(u32))) - ret = -EFAULT; - break; - - case HFI1_CMD_EP_ERASE_CHIP: - ret = erase_chip(dd); - break; - - case HFI1_CMD_EP_ERASE_RANGE: - rlen = extract_rlen(cmd->len); - rstart = extract_rstart(cmd->len); - ret = erase_range(dd, rstart, rlen); - break; - - case HFI1_CMD_EP_READ_RANGE: - rlen = extract_rlen(cmd->len); - rstart = extract_rstart(cmd->len); - ret = read_length(dd, rstart, rlen, cmd->addr); - break; - - case HFI1_CMD_EP_WRITE_RANGE: - rlen = extract_rlen(cmd->len); - rstart = extract_rstart(cmd->len); - ret = write_length(dd, rstart, rlen, cmd->addr); - break; - - default: - dd_dev_err(dd, "%s: unexpected command %d\n", - __func__, cmd->type); - ret = -EINVAL; - break; - } - - release_chip_resource(dd, CR_EPROM); -done_asic: - return ret; -} - -/* - * Initialize the EPROM handler. - */ -int eprom_init(struct hfi1_devdata *dd) -{ - int ret = 0; - - /* only the discrete chip has an EPROM */ - if (dd->pcidev->device != PCI_DEVICE_ID_INTEL0) - return 0; - - /* - * It is OK if both HFIs reset the EPROM as long as they don't - * do it at the same time. - */ - ret = acquire_chip_resource(dd, CR_EPROM, EPROM_TIMEOUT); - if (ret) { - dd_dev_err(dd, - "%s: unable to acquire EPROM resource, no EPROM support\n", - __func__); - goto done_asic; - } - - /* reset EPROM to be sure it is in a good state */ - - /* set reset */ - write_csr(dd, ASIC_EEP_CTL_STAT, ASIC_EEP_CTL_STAT_EP_RESET_SMASK); - /* clear reset, set speed */ - write_csr(dd, ASIC_EEP_CTL_STAT, - EP_SPEED_FULL << ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT); - - /* wake the device with command "release powerdown NoID" */ - write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_RELEASE_POWERDOWN_NOID); - - dd->eprom_available = true; - release_chip_resource(dd, CR_EPROM); -done_asic: - return ret; -} diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index 37dd534cbeab89..c8a773ffe23b7f 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -239,12 +239,15 @@ struct ib_vendor_mad { #define IB_MGMT_CLASSPORTINFO_ATTR_ID cpu_to_be16(0x0001) +#define IB_CLASS_PORT_INFO_RESP_TIME_MASK 0x1F +#define IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE 5 + struct ib_class_port_info { u8 base_version; u8 class_version; __be16 capability_mask; - u8 reserved[3]; - u8 resp_time_value; + /* 27 bits for cap_mask2, 5 bits for resp_time */ + __be32 cap_mask2_resp_time; u8 redirect_gid[16]; __be32 redirect_tcslfl; __be16 redirect_lid; @@ -259,6 +262,59 @@ struct ib_class_port_info { __be32 trap_qkey; }; +/** + * ib_get_cpi_resp_time - Returns the resp_time value from + * cap_mask2_resp_time in ib_class_port_info. + * @cpi: A struct ib_class_port_info mad. + */ +static inline u8 ib_get_cpi_resp_time(struct ib_class_port_info *cpi) +{ + return (u8)(be32_to_cpu(cpi->cap_mask2_resp_time) & + IB_CLASS_PORT_INFO_RESP_TIME_MASK); +} + +/** + * ib_set_cpi_resptime - Sets the response time in an + * ib_class_port_info mad. + * @cpi: A struct ib_class_port_info. + * @rtime: The response time to set. + */ +static inline void ib_set_cpi_resp_time(struct ib_class_port_info *cpi, + u8 rtime) +{ + cpi->cap_mask2_resp_time = + (cpi->cap_mask2_resp_time & + cpu_to_be32(~IB_CLASS_PORT_INFO_RESP_TIME_MASK)) | + cpu_to_be32(rtime & IB_CLASS_PORT_INFO_RESP_TIME_MASK); +} + +/** + * ib_get_cpi_capmask2 - Returns the capmask2 value from + * cap_mask2_resp_time in ib_class_port_info. + * @cpi: A struct ib_class_port_info mad. + */ +static inline u32 ib_get_cpi_capmask2(struct ib_class_port_info *cpi) +{ + return (be32_to_cpu(cpi->cap_mask2_resp_time) >> + IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE); +} + +/** + * ib_set_cpi_capmask2 - Sets the capmask2 in an + * ib_class_port_info mad. + * @cpi: A struct ib_class_port_info. + * @capmask2: The capmask2 to set. + */ +static inline void ib_set_cpi_capmask2(struct ib_class_port_info *cpi, + u32 capmask2) +{ + cpi->cap_mask2_resp_time = + (cpi->cap_mask2_resp_time & + cpu_to_be32(IB_CLASS_PORT_INFO_RESP_TIME_MASK)) | + cpu_to_be32(capmask2 << + IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE); +} + struct ib_mad_notice_attr { u8 generic_type; u8 prod_type_msb; diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h index 0f3daae44bf9bf..b13419ce99ff5b 100644 --- a/include/rdma/ib_pack.h +++ b/include/rdma/ib_pack.h @@ -103,6 +103,9 @@ enum { IB_OPCODE_ATOMIC_ACKNOWLEDGE = 0x12, IB_OPCODE_COMPARE_SWAP = 0x13, IB_OPCODE_FETCH_ADD = 0x14, + /* opcode 0x15 is reserved */ + IB_OPCODE_SEND_LAST_WITH_INVALIDATE = 0x16, + IB_OPCODE_SEND_ONLY_WITH_INVALIDATE = 0x17, /* real constants follow -- see comment about above IB_OPCODE() macro for more details */ @@ -129,6 +132,8 @@ enum { IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE), IB_OPCODE(RC, COMPARE_SWAP), IB_OPCODE(RC, FETCH_ADD), + IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE), + IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE), /* UC */ IB_OPCODE(UC, SEND_FIRST), diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index cdc1c81aa275bd..384041669489e1 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -94,6 +94,8 @@ enum ib_sa_selector { IB_SA_BEST = 3 }; +#define IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT BIT(12) + /* * Structures for SA records are named "struct ib_sa_xxx_rec." No * attempt is made to pack structures to match the physical layout of @@ -439,4 +441,14 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, void *context, struct ib_sa_query **sa_query); +/* Support get SA ClassPortInfo */ +int ib_sa_classport_info_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_class_port_info *resp, + void *context), + void *context, + struct ib_sa_query **sa_query); + #endif /* IB_SA_H */ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index fc0320c004a306..432bed510369e7 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -403,56 +403,55 @@ enum ib_port_speed { IB_SPEED_EDR = 32 }; -struct ib_protocol_stats { - /* TBD... */ -}; - -struct iw_protocol_stats { - u64 ipInReceives; - u64 ipInHdrErrors; - u64 ipInTooBigErrors; - u64 ipInNoRoutes; - u64 ipInAddrErrors; - u64 ipInUnknownProtos; - u64 ipInTruncatedPkts; - u64 ipInDiscards; - u64 ipInDelivers; - u64 ipOutForwDatagrams; - u64 ipOutRequests; - u64 ipOutDiscards; - u64 ipOutNoRoutes; - u64 ipReasmTimeout; - u64 ipReasmReqds; - u64 ipReasmOKs; - u64 ipReasmFails; - u64 ipFragOKs; - u64 ipFragFails; - u64 ipFragCreates; - u64 ipInMcastPkts; - u64 ipOutMcastPkts; - u64 ipInBcastPkts; - u64 ipOutBcastPkts; - - u64 tcpRtoAlgorithm; - u64 tcpRtoMin; - u64 tcpRtoMax; - u64 tcpMaxConn; - u64 tcpActiveOpens; - u64 tcpPassiveOpens; - u64 tcpAttemptFails; - u64 tcpEstabResets; - u64 tcpCurrEstab; - u64 tcpInSegs; - u64 tcpOutSegs; - u64 tcpRetransSegs; - u64 tcpInErrs; - u64 tcpOutRsts; -}; - -union rdma_protocol_stats { - struct ib_protocol_stats ib; - struct iw_protocol_stats iw; -}; +/** + * struct rdma_hw_stats + * @timestamp - Used by the core code to track when the last update was + * @lifespan - Used by the core code to determine how old the counters + * should be before being updated again. Stored in jiffies, defaults + * to 10 milliseconds, drivers can override the default be specifying + * their own value during their allocation routine. + * @name - Array of pointers to static names used for the counters in + * directory. + * @num_counters - How many hardware counters there are. If name is + * shorter than this number, a kernel oops will result. Driver authors + * are encouraged to leave BUILD_BUG_ON(ARRAY_SIZE(@name) < num_counters) + * in their code to prevent this. + * @value - Array of u64 counters that are accessed by the sysfs code and + * filled in by the drivers get_stats routine + */ +struct rdma_hw_stats { + unsigned long timestamp; + unsigned long lifespan; + const char * const *names; + int num_counters; + u64 value[]; +}; + +#define RDMA_HW_STATS_DEFAULT_LIFESPAN 10 +/** + * rdma_alloc_hw_stats_struct - Helper function to allocate dynamic struct + * for drivers. + * @names - Array of static const char * + * @num_counters - How many elements in array + * @lifespan - How many milliseconds between updates + */ +static inline struct rdma_hw_stats *rdma_alloc_hw_stats_struct( + const char * const *names, int num_counters, + unsigned long lifespan) +{ + struct rdma_hw_stats *stats; + + stats = kzalloc(sizeof(*stats) + num_counters * sizeof(u64), + GFP_KERNEL); + if (!stats) + return NULL; + stats->names = names; + stats->num_counters = num_counters; + stats->lifespan = msecs_to_jiffies(lifespan); + + return stats; +} + /* Define bits for the various functionality this port needs to be supported by * the core. @@ -1707,8 +1706,29 @@ struct ib_device { struct iw_cm_verbs *iwcm; - int (*get_protocol_stats)(struct ib_device *device, - union rdma_protocol_stats *stats); + /** + * alloc_hw_stats - Allocate a struct rdma_hw_stats and fill in the + * driver initialized data. The struct is kfree()'ed by the sysfs + * core when the device is removed. A lifespan of -1 in the return + * struct tells the core to set a default lifespan. + */ + struct rdma_hw_stats *(*alloc_hw_stats)(struct ib_device *device, + u8 port_num); + /** + * get_hw_stats - Fill in the counter value(s) in the stats struct. + * @index - The index in the value array we wish to have updated, or + * num_counters if we want all stats updated + * Return codes - + * < 0 - Error, no counters updated + * index - Updated the single counter pointed to by index + * num_counters - Updated all counters (will reset the timestamp + * and prevent further calls for lifespan milliseconds) + * Drivers are allowed to update all counters in leiu of just the + * one given in index at their option + */ + int (*get_hw_stats)(struct ib_device *device, + struct rdma_hw_stats *stats, + u8 port, int index); int (*query_device)(struct ib_device *device, struct ib_device_attr *device_attr, struct ib_udata *udata); @@ -1926,6 +1946,8 @@ struct ib_device { u8 node_type; u8 phys_port_cnt; struct ib_device_attr attrs; + struct attribute_group *hw_stats_ag; + struct rdma_hw_stats *hw_stats; /** * The following mandatory functions are used only at device diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index d57ceee90d26fe..16274e2133cdcc 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -149,15 +149,15 @@ struct rvt_driver_params { int qpn_res_end; int nports; int npkeys; - u8 qos_shift; char cq_name[RVT_CQN_MAX]; int node; - int max_rdma_atomic; int psn_mask; int psn_shift; int psn_modify_mask; u32 core_cap_flags; u32 max_mad_size; + u8 qos_shift; + u8 max_rdma_atomic; }; /* Protection domain */ @@ -425,6 +425,15 @@ static inline unsigned rvt_get_npkeys(struct rvt_dev_info *rdi) return rdi->dparms.npkeys; } +/* + * Return the max atomic suitable for determining + * the size of the ack ring buffer in a QP. + */ +static inline unsigned int rvt_max_atomic(struct rvt_dev_info *rdi) +{ + return rdi->dparms.max_rdma_atomic + 1; +} + /* * Return the indexed PKEY from the port PKEY table. */ diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 0e1ff2abfe9272..6d23b879416ac0 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -211,8 +211,6 @@ struct rvt_mmap_info { unsigned size; }; -#define RVT_MAX_RDMA_ATOMIC 16 - /* * This structure holds the information that the send tasklet needs * to send a RDMA read response or atomic operation. @@ -282,8 +280,7 @@ struct rvt_qp { atomic_t refcount ____cacheline_aligned_in_smp; wait_queue_head_t wait; - struct rvt_ack_entry s_ack_queue[RVT_MAX_RDMA_ATOMIC + 1] - ____cacheline_aligned_in_smp; + struct rvt_ack_entry *s_ack_queue; struct rvt_sge_state s_rdma_read_sge; spinlock_t r_lock ____cacheline_aligned_in_smp; /* used for APM */ diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h index a533cecab14f60..98bebf8bef55e4 100644 --- a/include/uapi/rdma/hfi/hfi1_user.h +++ b/include/uapi/rdma/hfi/hfi1_user.h @@ -66,7 +66,7 @@ * The major version changes when data structures change in an incompatible * way. The driver must be the same for initialization to succeed. */ -#define HFI1_USER_SWMAJOR 5 +#define HFI1_USER_SWMAJOR 6 /* * Minor version differences are always compatible @@ -75,7 +75,12 @@ * may not be implemented; the user code must deal with this if it * cares, or it must abort after initialization reports the difference. */ -#define HFI1_USER_SWMINOR 0 +#define HFI1_USER_SWMINOR 1 + +/* + * We will encode the major/minor inside a single 32bit version number. + */ +#define HFI1_SWMAJOR_SHIFT 16 /* * Set of HW and driver capability/feature bits. @@ -107,19 +112,6 @@ #define HFI1_RCVHDR_ENTSIZE_16 (1UL << 1) #define HFI1_RCVDHR_ENTSIZE_32 (1UL << 2) -/* - * If the unit is specified via open, HFI choice is fixed. If port is - * specified, it's also fixed. Otherwise we try to spread contexts - * across ports and HFIs, using different algorithms. WITHIN is - * the old default, prior to this mechanism. - */ -#define HFI1_ALG_ACROSS 0 /* round robin contexts across HFIs, then - * ports; this is the default */ -#define HFI1_ALG_WITHIN 1 /* use all contexts on an HFI (round robin - * active ports within), then next HFI */ -#define HFI1_ALG_COUNT 2 /* number of algorithm choices */ - - /* User commands. */ #define HFI1_CMD_ASSIGN_CTXT 1 /* allocate HFI and context */ #define HFI1_CMD_CTXT_INFO 2 /* find out what resources we got */ @@ -127,7 +119,6 @@ #define HFI1_CMD_TID_UPDATE 4 /* update expected TID entries */ #define HFI1_CMD_TID_FREE 5 /* free expected TID entries */ #define HFI1_CMD_CREDIT_UPD 6 /* force an update of PIO credit */ -#define HFI1_CMD_SDMA_STATUS_UPD 7 /* force update of SDMA status ring */ #define HFI1_CMD_RECV_CTRL 8 /* control receipt of packets */ #define HFI1_CMD_POLL_TYPE 9 /* set the kind of polling we want */ @@ -135,13 +126,46 @@ #define HFI1_CMD_SET_PKEY 11 /* set context's pkey */ #define HFI1_CMD_CTXT_RESET 12 /* reset context's HW send context */ #define HFI1_CMD_TID_INVAL_READ 13 /* read TID cache invalidations */ -/* separate EPROM commands from normal PSM commands */ -#define HFI1_CMD_EP_INFO 64 /* read EPROM device ID */ -#define HFI1_CMD_EP_ERASE_CHIP 65 /* erase whole EPROM */ -/* range 66-74 no longer used */ -#define HFI1_CMD_EP_ERASE_RANGE 75 /* erase EPROM range */ -#define HFI1_CMD_EP_READ_RANGE 76 /* read EPROM range */ -#define HFI1_CMD_EP_WRITE_RANGE 77 /* write EPROM range */ +#define HFI1_CMD_GET_VERS 14 /* get the version of the user cdev */ + +/* + * User IOCTLs can not go above 128 if they do then see common.h and change the + * base for the snoop ioctl + */ +#define IB_IOCTL_MAGIC 0x1b /* See Documentation/ioctl/ioctl-number.txt */ + +/* + * Make the ioctls occupy the last 0xf0-0xff portion of the IB range + */ +#define __NUM(cmd) (HFI1_CMD_##cmd + 0xe0) + +struct hfi1_cmd; +#define HFI1_IOCTL_ASSIGN_CTXT \ + _IOWR(IB_IOCTL_MAGIC, __NUM(ASSIGN_CTXT), struct hfi1_user_info) +#define HFI1_IOCTL_CTXT_INFO \ + _IOW(IB_IOCTL_MAGIC, __NUM(CTXT_INFO), struct hfi1_ctxt_info) +#define HFI1_IOCTL_USER_INFO \ + _IOW(IB_IOCTL_MAGIC, __NUM(USER_INFO), struct hfi1_base_info) +#define HFI1_IOCTL_TID_UPDATE \ + _IOWR(IB_IOCTL_MAGIC, __NUM(TID_UPDATE), struct hfi1_tid_info) +#define HFI1_IOCTL_TID_FREE \ + _IOWR(IB_IOCTL_MAGIC, __NUM(TID_FREE), struct hfi1_tid_info) +#define HFI1_IOCTL_CREDIT_UPD \ + _IO(IB_IOCTL_MAGIC, __NUM(CREDIT_UPD)) +#define HFI1_IOCTL_RECV_CTRL \ + _IOW(IB_IOCTL_MAGIC, __NUM(RECV_CTRL), int) +#define HFI1_IOCTL_POLL_TYPE \ + _IOW(IB_IOCTL_MAGIC, __NUM(POLL_TYPE), int) +#define HFI1_IOCTL_ACK_EVENT \ + _IOW(IB_IOCTL_MAGIC, __NUM(ACK_EVENT), unsigned long) +#define HFI1_IOCTL_SET_PKEY \ + _IOW(IB_IOCTL_MAGIC, __NUM(SET_PKEY), __u16) +#define HFI1_IOCTL_CTXT_RESET \ + _IO(IB_IOCTL_MAGIC, __NUM(CTXT_RESET)) +#define HFI1_IOCTL_TID_INVAL_READ \ + _IOWR(IB_IOCTL_MAGIC, __NUM(TID_INVAL_READ), struct hfi1_tid_info) +#define HFI1_IOCTL_GET_VERS \ + _IOR(IB_IOCTL_MAGIC, __NUM(GET_VERS), int) #define _HFI1_EVENT_FROZEN_BIT 0 #define _HFI1_EVENT_LINKDOWN_BIT 1 @@ -199,9 +223,7 @@ struct hfi1_user_info { * Should be set to HFI1_USER_SWVERSION. */ __u32 userversion; - __u16 pad; - /* HFI selection algorithm, if unit has not selected */ - __u16 hfi1_alg; + __u32 pad; /* * If two or more processes wish to share a context, each process * must set the subcontext_cnt and subcontext_id to the same @@ -243,12 +265,6 @@ struct hfi1_tid_info { __u32 length; }; -struct hfi1_cmd { - __u32 type; /* command type */ - __u32 len; /* length of struct pointed to by add */ - __u64 addr; /* pointer to user structure */ -}; - enum hfi1_sdma_comp_state { FREE = 0, QUEUED, diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 6e373d151cad75..02fe8390c18f6d 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -135,10 +135,12 @@ enum { * Local service operations: * RESOLVE - The client requests the local service to resolve a path. * SET_TIMEOUT - The local service requests the client to set the timeout. + * IP_RESOLVE - The client requests the local service to resolve an IP to GID. */ enum { RDMA_NL_LS_OP_RESOLVE = 0, RDMA_NL_LS_OP_SET_TIMEOUT, + RDMA_NL_LS_OP_IP_RESOLVE, RDMA_NL_LS_NUM_OPS }; @@ -176,6 +178,10 @@ struct rdma_ls_resolve_header { __u8 path_use; }; +struct rdma_ls_ip_resolve_header { + __u32 ifindex; +}; + /* Local service attribute type */ #define RDMA_NLA_F_MANDATORY (1 << 13) #define RDMA_NLA_TYPE_MASK (~(NLA_F_NESTED | NLA_F_NET_BYTEORDER | \ @@ -193,6 +199,8 @@ struct rdma_ls_resolve_header { * TCLASS u8 * PKEY u16 cpu * QOS_CLASS u16 cpu + * IPV4 u32 BE + * IPV6 u8[16] BE */ enum { LS_NLA_TYPE_UNSPEC = 0, @@ -204,6 +212,8 @@ enum { LS_NLA_TYPE_TCLASS, LS_NLA_TYPE_PKEY, LS_NLA_TYPE_QOS_CLASS, + LS_NLA_TYPE_IPV4, + LS_NLA_TYPE_IPV6, LS_NLA_TYPE_MAX };