diff options
Diffstat (limited to 'drivers/infiniband/ulp/ipoib')
-rw-r--r-- | drivers/infiniband/ulp/ipoib/Kconfig | 49 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/Makefile | 11 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib.h | 728 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_cm.c | 1615 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 92 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_fs.c | 299 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_ib.c | 1086 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_main.c | 1424 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 965 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 294 | ||||
-rw-r--r-- | drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 191 |
11 files changed, 6754 insertions, 0 deletions
diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig new file mode 100644 index 00000000..cda8eac5 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/Kconfig @@ -0,0 +1,49 @@ +config INFINIBAND_IPOIB + tristate "IP-over-InfiniBand" + depends on NETDEVICES && INET && (IPV6 || IPV6=n) + ---help--- + Support for the IP-over-InfiniBand protocol (IPoIB). This + transports IP packets over InfiniBand so you can use your IB + device as a fancy NIC. + + See Documentation/infiniband/ipoib.txt for more information + +config INFINIBAND_IPOIB_CM + bool "IP-over-InfiniBand Connected Mode support" + depends on INFINIBAND_IPOIB + default n + ---help--- + This option enables support for IPoIB connected mode. After + enabling this option, you need to switch to connected mode + through /sys/class/net/ibXXX/mode to actually create + connections, and then increase the interface MTU with + e.g. ifconfig ib0 mtu 65520. + + WARNING: Enabling connected mode will trigger some packet + drops for multicast and UD mode traffic from this interface, + unless you limit mtu for these destinations to 2044. + +config INFINIBAND_IPOIB_DEBUG + bool "IP-over-InfiniBand debugging" if EXPERT + depends on INFINIBAND_IPOIB + default y + ---help--- + This option causes debugging code to be compiled into the + IPoIB driver. The output can be turned on via the + debug_level and mcast_debug_level module parameters (which + can also be set after the driver is loaded through sysfs). + + This option also creates a directory tree under ipoib/ in + debugfs, which contains files that expose debugging + information about IB multicast groups used by the IPoIB + driver. + +config INFINIBAND_IPOIB_DEBUG_DATA + bool "IP-over-InfiniBand data path debugging" + depends on INFINIBAND_IPOIB_DEBUG + ---help--- + This option compiles debugging code into the data path + of the IPoIB driver. The output can be turned on via the + data_debug_level module parameter; however, even with output + turned off, this debugging code will have some performance + impact. diff --git a/drivers/infiniband/ulp/ipoib/Makefile b/drivers/infiniband/ulp/ipoib/Makefile new file mode 100644 index 00000000..3090100f --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/Makefile @@ -0,0 +1,11 @@ +obj-$(CONFIG_INFINIBAND_IPOIB) += ib_ipoib.o + +ib_ipoib-y := ipoib_main.o \ + ipoib_ib.o \ + ipoib_multicast.o \ + ipoib_verbs.o \ + ipoib_vlan.o \ + ipoib_ethtool.o +ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM) += ipoib_cm.o +ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG) += ipoib_fs.o + diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h new file mode 100644 index 00000000..86df632e --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -0,0 +1,728 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPOIB_H +#define _IPOIB_H + +#include <linux/list.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/workqueue.h> +#include <linux/kref.h> +#include <linux/if_infiniband.h> +#include <linux/mutex.h> + +#include <net/neighbour.h> +#include <net/sch_generic.h> + +#include <linux/atomic.h> + +#include <rdma/ib_verbs.h> +#include <rdma/ib_pack.h> +#include <rdma/ib_sa.h> +#include <linux/sched.h> + +/* constants */ + +enum ipoib_flush_level { + IPOIB_FLUSH_LIGHT, + IPOIB_FLUSH_NORMAL, + IPOIB_FLUSH_HEAVY +}; + +enum { + IPOIB_ENCAP_LEN = 4, + + IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + IPOIB_ENCAP_LEN, + IPOIB_UD_RX_SG = 2, /* max buffer needed for 4K mtu */ + + IPOIB_CM_MTU = 0x10000 - 0x10, /* padding to align header to 16 */ + IPOIB_CM_BUF_SIZE = IPOIB_CM_MTU + IPOIB_ENCAP_LEN, + IPOIB_CM_HEAD_SIZE = IPOIB_CM_BUF_SIZE % PAGE_SIZE, + IPOIB_CM_RX_SG = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE, + IPOIB_RX_RING_SIZE = 256, + IPOIB_TX_RING_SIZE = 128, + IPOIB_MAX_QUEUE_SIZE = 8192, + IPOIB_MIN_QUEUE_SIZE = 2, + IPOIB_CM_MAX_CONN_QP = 4096, + + IPOIB_NUM_WC = 4, + + IPOIB_MAX_PATH_REC_QUEUE = 3, + IPOIB_MAX_MCAST_QUEUE = 3, + + IPOIB_FLAG_OPER_UP = 0, + IPOIB_FLAG_INITIALIZED = 1, + IPOIB_FLAG_ADMIN_UP = 2, + IPOIB_PKEY_ASSIGNED = 3, + IPOIB_PKEY_STOP = 4, + IPOIB_FLAG_SUBINTERFACE = 5, + IPOIB_MCAST_RUN = 6, + IPOIB_STOP_REAPER = 7, + IPOIB_FLAG_ADMIN_CM = 9, + IPOIB_FLAG_UMCAST = 10, + + IPOIB_MAX_BACKOFF_SECONDS = 16, + + IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */ + IPOIB_MCAST_FLAG_SENDONLY = 1, + IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ + IPOIB_MCAST_FLAG_ATTACHED = 3, + + MAX_SEND_CQE = 16, + IPOIB_CM_COPYBREAK = 256, +}; + +#define IPOIB_OP_RECV (1ul << 31) +#ifdef CONFIG_INFINIBAND_IPOIB_CM +#define IPOIB_OP_CM (1ul << 30) +#else +#define IPOIB_OP_CM (0) +#endif + +/* structs */ + +struct ipoib_header { + __be16 proto; + u16 reserved; +}; + +struct ipoib_cb { + struct qdisc_skb_cb qdisc_cb; + u8 hwaddr[INFINIBAND_ALEN]; +}; + +/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */ +struct ipoib_mcast { + struct ib_sa_mcmember_rec mcmember; + struct ib_sa_multicast *mc; + struct ipoib_ah *ah; + + struct rb_node rb_node; + struct list_head list; + + unsigned long created; + unsigned long backoff; + + unsigned long flags; + unsigned char logcount; + + struct list_head neigh_list; + + struct sk_buff_head pkt_queue; + + struct net_device *dev; +}; + +struct ipoib_rx_buf { + struct sk_buff *skb; + u64 mapping[IPOIB_UD_RX_SG]; +}; + +struct ipoib_tx_buf { + struct sk_buff *skb; + u64 mapping[MAX_SKB_FRAGS + 1]; +}; + +struct ipoib_cm_tx_buf { + struct sk_buff *skb; + u64 mapping; +}; + +struct ib_cm_id; + +struct ipoib_cm_data { + __be32 qpn; /* High byte MUST be ignored on receive */ + __be32 mtu; +}; + +/* + * Quoting 10.3.1 Queue Pair and EE Context States: + * + * Note, for QPs that are associated with an SRQ, the Consumer should take the + * QP through the Error State before invoking a Destroy QP or a Modify QP to the + * Reset State. The Consumer may invoke the Destroy QP without first performing + * a Modify QP to the Error State and waiting for the Affiliated Asynchronous + * Last WQE Reached Event. However, if the Consumer does not wait for the + * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment + * leakage may occur. Therefore, it is good programming practice to tear down a + * QP that is associated with an SRQ by using the following process: + * + * - Put the QP in the Error State + * - Wait for the Affiliated Asynchronous Last WQE Reached Event; + * - either: + * drain the CQ by invoking the Poll CQ verb and either wait for CQ + * to be empty or the number of Poll CQ operations has exceeded + * CQ capacity size; + * - or + * post another WR that completes on the same CQ and wait for this + * WR to return as a WC; + * - and then invoke a Destroy QP or Reset QP. + * + * We use the second option and wait for a completion on the + * same CQ before destroying QPs attached to our SRQ. + */ + +enum ipoib_cm_state { + IPOIB_CM_RX_LIVE, + IPOIB_CM_RX_ERROR, /* Ignored by stale task */ + IPOIB_CM_RX_FLUSH /* Last WQE Reached event observed */ +}; + +struct ipoib_cm_rx { + struct ib_cm_id *id; + struct ib_qp *qp; + struct ipoib_cm_rx_buf *rx_ring; + struct list_head list; + struct net_device *dev; + unsigned long jiffies; + enum ipoib_cm_state state; + int recv_count; +}; + +struct ipoib_cm_tx { + struct ib_cm_id *id; + struct ib_qp *qp; + struct list_head list; + struct net_device *dev; + struct ipoib_neigh *neigh; + struct ipoib_path *path; + struct ipoib_cm_tx_buf *tx_ring; + unsigned tx_head; + unsigned tx_tail; + unsigned long flags; + u32 mtu; +}; + +struct ipoib_cm_rx_buf { + struct sk_buff *skb; + u64 mapping[IPOIB_CM_RX_SG]; +}; + +struct ipoib_cm_dev_priv { + struct ib_srq *srq; + struct ipoib_cm_rx_buf *srq_ring; + struct ib_cm_id *id; + struct list_head passive_ids; /* state: LIVE */ + struct list_head rx_error_list; /* state: ERROR */ + struct list_head rx_flush_list; /* state: FLUSH, drain not started */ + struct list_head rx_drain_list; /* state: FLUSH, drain started */ + struct list_head rx_reap_list; /* state: FLUSH, drain done */ + struct work_struct start_task; + struct work_struct reap_task; + struct work_struct skb_task; + struct work_struct rx_reap_task; + struct delayed_work stale_task; + struct sk_buff_head skb_queue; + struct list_head start_list; + struct list_head reap_list; + struct ib_wc ibwc[IPOIB_NUM_WC]; + struct ib_sge rx_sge[IPOIB_CM_RX_SG]; + struct ib_recv_wr rx_wr; + int nonsrq_conn_qp; + int max_cm_mtu; + int num_frags; +}; + +struct ipoib_ethtool_st { + u16 coalesce_usecs; + u16 max_coalesced_frames; +}; + +/* + * Device private locking: network stack tx_lock protects members used + * in TX fast path, lock protects everything else. lock nests inside + * of tx_lock (ie tx_lock must be acquired first if needed). + */ +struct ipoib_dev_priv { + spinlock_t lock; + + struct net_device *dev; + + struct napi_struct napi; + + unsigned long flags; + + struct mutex vlan_mutex; + + struct rb_root path_tree; + struct list_head path_list; + + struct ipoib_mcast *broadcast; + struct list_head multicast_list; + struct rb_root multicast_tree; + + struct delayed_work pkey_poll_task; + struct delayed_work mcast_task; + struct work_struct carrier_on_task; + struct work_struct flush_light; + struct work_struct flush_normal; + struct work_struct flush_heavy; + struct work_struct restart_task; + struct delayed_work ah_reap_task; + + struct ib_device *ca; + u8 port; + u16 pkey; + u16 pkey_index; + struct ib_pd *pd; + struct ib_mr *mr; + struct ib_cq *recv_cq; + struct ib_cq *send_cq; + struct ib_qp *qp; + u32 qkey; + + union ib_gid local_gid; + u16 local_lid; + + unsigned int admin_mtu; + unsigned int mcast_mtu; + unsigned int max_ib_mtu; + + struct ipoib_rx_buf *rx_ring; + + struct ipoib_tx_buf *tx_ring; + unsigned tx_head; + unsigned tx_tail; + struct ib_sge tx_sge[MAX_SKB_FRAGS + 1]; + struct ib_send_wr tx_wr; + unsigned tx_outstanding; + struct ib_wc send_wc[MAX_SEND_CQE]; + + struct ib_recv_wr rx_wr; + struct ib_sge rx_sge[IPOIB_UD_RX_SG]; + + struct ib_wc ibwc[IPOIB_NUM_WC]; + + struct list_head dead_ahs; + + struct ib_event_handler event_handler; + + struct net_device *parent; + struct list_head child_intfs; + struct list_head list; + +#ifdef CONFIG_INFINIBAND_IPOIB_CM + struct ipoib_cm_dev_priv cm; +#endif + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + struct list_head fs_list; + struct dentry *mcg_dentry; + struct dentry *path_dentry; +#endif + int hca_caps; + struct ipoib_ethtool_st ethtool; + struct timer_list poll_timer; +}; + +struct ipoib_ah { + struct net_device *dev; + struct ib_ah *ah; + struct list_head list; + struct kref ref; + unsigned last_send; +}; + +struct ipoib_path { + struct net_device *dev; + struct ib_sa_path_rec pathrec; + struct ipoib_ah *ah; + struct sk_buff_head queue; + + struct list_head neigh_list; + + int query_id; + struct ib_sa_query *query; + struct completion done; + + struct rb_node rb_node; + struct list_head list; + int valid; +}; + +struct ipoib_neigh { + struct ipoib_ah *ah; +#ifdef CONFIG_INFINIBAND_IPOIB_CM + struct ipoib_cm_tx *cm; +#endif + union ib_gid dgid; + struct sk_buff_head queue; + + struct neighbour *neighbour; + struct net_device *dev; + + struct list_head list; +}; + +#define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN) +#define IPOIB_UD_BUF_SIZE(ib_mtu) (ib_mtu + IB_GRH_BYTES) + +static inline int ipoib_ud_need_sg(unsigned int ib_mtu) +{ + return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE; +} + +/* + * We stash a pointer to our private neighbour information after our + * hardware address in neigh->ha. The ALIGN() expression here makes + * sure that this pointer is stored aligned so that an unaligned + * load is not needed to dereference it. + */ +static inline struct ipoib_neigh **to_ipoib_neigh(struct neighbour *neigh) +{ + return (void*) neigh + ALIGN(offsetof(struct neighbour, ha) + + INFINIBAND_ALEN, sizeof(void *)); +} + +struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neigh, + struct net_device *dev); +void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh); + +extern struct workqueue_struct *ipoib_workqueue; + +/* functions */ + +int ipoib_poll(struct napi_struct *napi, int budget); +void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr); +void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr); + +struct ipoib_ah *ipoib_create_ah(struct net_device *dev, + struct ib_pd *pd, struct ib_ah_attr *attr); +void ipoib_free_ah(struct kref *kref); +static inline void ipoib_put_ah(struct ipoib_ah *ah) +{ + kref_put(&ah->ref, ipoib_free_ah); +} + +int ipoib_open(struct net_device *dev); +int ipoib_add_pkey_attr(struct net_device *dev); +int ipoib_add_umcast_attr(struct net_device *dev); + +void ipoib_send(struct net_device *dev, struct sk_buff *skb, + struct ipoib_ah *address, u32 qpn); +void ipoib_reap_ah(struct work_struct *work); + +void ipoib_mark_paths_invalid(struct net_device *dev); +void ipoib_flush_paths(struct net_device *dev); +struct ipoib_dev_priv *ipoib_intf_alloc(const char *format); + +int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port); +void ipoib_ib_dev_flush_light(struct work_struct *work); +void ipoib_ib_dev_flush_normal(struct work_struct *work); +void ipoib_ib_dev_flush_heavy(struct work_struct *work); +void ipoib_pkey_event(struct work_struct *work); +void ipoib_ib_dev_cleanup(struct net_device *dev); + +int ipoib_ib_dev_open(struct net_device *dev); +int ipoib_ib_dev_up(struct net_device *dev); +int ipoib_ib_dev_down(struct net_device *dev, int flush); +int ipoib_ib_dev_stop(struct net_device *dev, int flush); + +int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); +void ipoib_dev_cleanup(struct net_device *dev); + +void ipoib_mcast_join_task(struct work_struct *work); +void ipoib_mcast_carrier_on_task(struct work_struct *work); +void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb); + +void ipoib_mcast_restart_task(struct work_struct *work); +int ipoib_mcast_start_thread(struct net_device *dev); +int ipoib_mcast_stop_thread(struct net_device *dev, int flush); + +void ipoib_mcast_dev_down(struct net_device *dev); +void ipoib_mcast_dev_flush(struct net_device *dev); + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev); +int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter); +void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter, + union ib_gid *gid, + unsigned long *created, + unsigned int *queuelen, + unsigned int *complete, + unsigned int *send_only); + +struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev); +int ipoib_path_iter_next(struct ipoib_path_iter *iter); +void ipoib_path_iter_read(struct ipoib_path_iter *iter, + struct ipoib_path *path); +#endif + +int ipoib_mcast_attach(struct net_device *dev, u16 mlid, + union ib_gid *mgid, int set_qkey); + +int ipoib_init_qp(struct net_device *dev); +int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca); +void ipoib_transport_dev_cleanup(struct net_device *dev); + +void ipoib_event(struct ib_event_handler *handler, + struct ib_event *record); + +int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey); +int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey); + +void ipoib_pkey_poll(struct work_struct *work); +int ipoib_pkey_dev_delay_open(struct net_device *dev); +void ipoib_drain_cq(struct net_device *dev); + +void ipoib_set_ethtool_ops(struct net_device *dev); +int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca); + +#ifdef CONFIG_INFINIBAND_IPOIB_CM + +#define IPOIB_FLAGS_RC 0x80 +#define IPOIB_FLAGS_UC 0x40 + +/* We don't support UC connections at the moment */ +#define IPOIB_CM_SUPPORTED(ha) (ha[0] & (IPOIB_FLAGS_RC)) + +extern int ipoib_max_conn_qp; + +static inline int ipoib_cm_admin_enabled(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + return IPOIB_CM_SUPPORTED(dev->dev_addr) && + test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); +} + +static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + return IPOIB_CM_SUPPORTED(n->ha) && + test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); +} + +static inline int ipoib_cm_up(struct ipoib_neigh *neigh) + +{ + return test_bit(IPOIB_FLAG_OPER_UP, &neigh->cm->flags); +} + +static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh) +{ + return neigh->cm; +} + +static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx) +{ + neigh->cm = tx; +} + +static inline int ipoib_cm_has_srq(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + return !!priv->cm.srq; +} + +static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + return priv->cm.max_cm_mtu; +} + +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx); +int ipoib_cm_dev_open(struct net_device *dev); +void ipoib_cm_dev_stop(struct net_device *dev); +int ipoib_cm_dev_init(struct net_device *dev); +int ipoib_cm_add_mode_attr(struct net_device *dev); +void ipoib_cm_dev_cleanup(struct net_device *dev); +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh); +void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx); +void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, + unsigned int mtu); +void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc); +void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc); +#else + +struct ipoib_cm_tx; + +#define ipoib_max_conn_qp 0 + +static inline int ipoib_cm_admin_enabled(struct net_device *dev) +{ + return 0; +} +static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n) + +{ + return 0; +} + +static inline int ipoib_cm_up(struct ipoib_neigh *neigh) + +{ + return 0; +} + +static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh) +{ + return NULL; +} + +static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx) +{ +} + +static inline int ipoib_cm_has_srq(struct net_device *dev) +{ + return 0; +} + +static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev) +{ + return 0; +} + +static inline +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) +{ + return; +} + +static inline +int ipoib_cm_dev_open(struct net_device *dev) +{ + return 0; +} + +static inline +void ipoib_cm_dev_stop(struct net_device *dev) +{ + return; +} + +static inline +int ipoib_cm_dev_init(struct net_device *dev) +{ + return -ENOSYS; +} + +static inline +void ipoib_cm_dev_cleanup(struct net_device *dev) +{ + return; +} + +static inline +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh) +{ + return NULL; +} + +static inline +void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) +{ + return; +} + +static inline +int ipoib_cm_add_mode_attr(struct net_device *dev) +{ + return 0; +} + +static inline void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, + unsigned int mtu) +{ + dev_kfree_skb_any(skb); +} + +static inline void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +{ +} + +static inline void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +{ +} +#endif + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +void ipoib_create_debug_files(struct net_device *dev); +void ipoib_delete_debug_files(struct net_device *dev); +int ipoib_register_debugfs(void); +void ipoib_unregister_debugfs(void); +#else +static inline void ipoib_create_debug_files(struct net_device *dev) { } +static inline void ipoib_delete_debug_files(struct net_device *dev) { } +static inline int ipoib_register_debugfs(void) { return 0; } +static inline void ipoib_unregister_debugfs(void) { } +#endif + +#define ipoib_printk(level, priv, format, arg...) \ + printk(level "%s: " format, ((struct ipoib_dev_priv *) priv)->dev->name , ## arg) +#define ipoib_warn(priv, format, arg...) \ + ipoib_printk(KERN_WARNING, priv, format , ## arg) + +extern int ipoib_sendq_size; +extern int ipoib_recvq_size; + +extern struct ib_sa_client ipoib_sa_client; + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +extern int ipoib_debug_level; + +#define ipoib_dbg(priv, format, arg...) \ + do { \ + if (ipoib_debug_level > 0) \ + ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ + } while (0) +#define ipoib_dbg_mcast(priv, format, arg...) \ + do { \ + if (mcast_debug_level > 0) \ + ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ + } while (0) +#else /* CONFIG_INFINIBAND_IPOIB_DEBUG */ +#define ipoib_dbg(priv, format, arg...) \ + do { (void) (priv); } while (0) +#define ipoib_dbg_mcast(priv, format, arg...) \ + do { (void) (priv); } while (0) +#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA +#define ipoib_dbg_data(priv, format, arg...) \ + do { \ + if (data_debug_level > 0) \ + ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ + } while (0) +#else /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */ +#define ipoib_dbg_data(priv, format, arg...) \ + do { (void) (priv); } while (0) +#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */ + +#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff) + +#endif /* _IPOIB_H */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c new file mode 100644 index 00000000..014504d8 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -0,0 +1,1615 @@ +/* + * Copyright (c) 2006 Mellanox Technologies. All rights reserved + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/ib_cm.h> +#include <net/dst.h> +#include <net/icmp.h> +#include <linux/icmpv6.h> +#include <linux/delay.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/moduleparam.h> + +#include "ipoib.h" + +int ipoib_max_conn_qp = 128; + +module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444); +MODULE_PARM_DESC(max_nonsrq_conn_qp, + "Max number of connected-mode QPs per interface " + "(applied only if shared receive queue is not available)"); + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA +static int data_debug_level; + +module_param_named(cm_data_debug_level, data_debug_level, int, 0644); +MODULE_PARM_DESC(cm_data_debug_level, + "Enable data path debug tracing for connected mode if > 0"); +#endif + +#define IPOIB_CM_IETF_ID 0x1000000000000000ULL + +#define IPOIB_CM_RX_UPDATE_TIME (256 * HZ) +#define IPOIB_CM_RX_TIMEOUT (2 * 256 * HZ) +#define IPOIB_CM_RX_DELAY (3 * 256 * HZ) +#define IPOIB_CM_RX_UPDATE_MASK (0x3) + +static struct ib_qp_attr ipoib_cm_err_attr = { + .qp_state = IB_QPS_ERR +}; + +#define IPOIB_CM_RX_DRAIN_WRID 0xffffffff + +static struct ib_send_wr ipoib_cm_rx_drain_wr = { + .wr_id = IPOIB_CM_RX_DRAIN_WRID, + .opcode = IB_WR_SEND, +}; + +static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event); + +static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags, + u64 mapping[IPOIB_CM_RX_SG]) +{ + int i; + + ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); + + for (i = 0; i < frags; ++i) + ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE); +} + +static int ipoib_cm_post_receive_srq(struct net_device *dev, int id) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_recv_wr *bad_wr; + int i, ret; + + priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; + + for (i = 0; i < priv->cm.num_frags; ++i) + priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i]; + + ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr); + if (unlikely(ret)) { + ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret); + ipoib_cm_dma_unmap_rx(priv, priv->cm.num_frags - 1, + priv->cm.srq_ring[id].mapping); + dev_kfree_skb_any(priv->cm.srq_ring[id].skb); + priv->cm.srq_ring[id].skb = NULL; + } + + return ret; +} + +static int ipoib_cm_post_receive_nonsrq(struct net_device *dev, + struct ipoib_cm_rx *rx, + struct ib_recv_wr *wr, + struct ib_sge *sge, int id) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_recv_wr *bad_wr; + int i, ret; + + wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; + + for (i = 0; i < IPOIB_CM_RX_SG; ++i) + sge[i].addr = rx->rx_ring[id].mapping[i]; + + ret = ib_post_recv(rx->qp, wr, &bad_wr); + if (unlikely(ret)) { + ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret); + ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1, + rx->rx_ring[id].mapping); + dev_kfree_skb_any(rx->rx_ring[id].skb); + rx->rx_ring[id].skb = NULL; + } + + return ret; +} + +static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, + struct ipoib_cm_rx_buf *rx_ring, + int id, int frags, + u64 mapping[IPOIB_CM_RX_SG]) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct sk_buff *skb; + int i; + + skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12); + if (unlikely(!skb)) + return NULL; + + /* + * IPoIB adds a 4 byte header. So we need 12 more bytes to align the + * IP header to a multiple of 16. + */ + skb_reserve(skb, 12); + + mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE, + DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) { + dev_kfree_skb_any(skb); + return NULL; + } + + for (i = 0; i < frags; i++) { + struct page *page = alloc_page(GFP_ATOMIC); + + if (!page) + goto partial_error; + skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE); + + mapping[i + 1] = ib_dma_map_page(priv->ca, page, + 0, PAGE_SIZE, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1]))) + goto partial_error; + } + + rx_ring[id].skb = skb; + return skb; + +partial_error: + + ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); + + for (; i > 0; --i) + ib_dma_unmap_page(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE); + + dev_kfree_skb_any(skb); + return NULL; +} + +static void ipoib_cm_free_rx_ring(struct net_device *dev, + struct ipoib_cm_rx_buf *rx_ring) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i; + + for (i = 0; i < ipoib_recvq_size; ++i) + if (rx_ring[i].skb) { + ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1, + rx_ring[i].mapping); + dev_kfree_skb_any(rx_ring[i].skb); + } + + vfree(rx_ring); +} + +static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv) +{ + struct ib_send_wr *bad_wr; + struct ipoib_cm_rx *p; + + /* We only reserved 1 extra slot in CQ for drain WRs, so + * make sure we have at most 1 outstanding WR. */ + if (list_empty(&priv->cm.rx_flush_list) || + !list_empty(&priv->cm.rx_drain_list)) + return; + + /* + * QPs on flush list are error state. This way, a "flush + * error" WC will be immediately generated for each WR we post. + */ + p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list); + if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr)) + ipoib_warn(priv, "failed to post drain wr\n"); + + list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list); +} + +static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx) +{ + struct ipoib_cm_rx *p = ctx; + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + unsigned long flags; + + if (event->event != IB_EVENT_QP_LAST_WQE_REACHED) + return; + + spin_lock_irqsave(&priv->lock, flags); + list_move(&p->list, &priv->cm.rx_flush_list); + p->state = IPOIB_CM_RX_FLUSH; + ipoib_cm_start_rx_drain(priv); + spin_unlock_irqrestore(&priv->lock, flags); +} + +static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev, + struct ipoib_cm_rx *p) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_init_attr attr = { + .event_handler = ipoib_cm_rx_event_handler, + .send_cq = priv->recv_cq, /* For drain WR */ + .recv_cq = priv->recv_cq, + .srq = priv->cm.srq, + .cap.max_send_wr = 1, /* For drain WR */ + .cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */ + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_RC, + .qp_context = p, + }; + + if (!ipoib_cm_has_srq(dev)) { + attr.cap.max_recv_wr = ipoib_recvq_size; + attr.cap.max_recv_sge = IPOIB_CM_RX_SG; + } + + return ib_create_qp(priv->pd, &attr); +} + +static int ipoib_cm_modify_rx_qp(struct net_device *dev, + struct ib_cm_id *cm_id, struct ib_qp *qp, + unsigned psn) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IB_QPS_INIT; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret); + return ret; + } + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret); + return ret; + } + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret); + return ret; + } + qp_attr.rq_psn = psn; + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret); + return ret; + } + + /* + * Current Mellanox HCA firmware won't generate completions + * with error for drain WRs unless the QP has been moved to + * RTS first. This work-around leaves a window where a QP has + * moved to error asynchronously, but this will eventually get + * fixed in firmware, so let's not error out if modify QP + * fails. + */ + qp_attr.qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret); + return 0; + } + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret); + return 0; + } + + return 0; +} + +static void ipoib_cm_init_rx_wr(struct net_device *dev, + struct ib_recv_wr *wr, + struct ib_sge *sge) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i; + + for (i = 0; i < priv->cm.num_frags; ++i) + sge[i].lkey = priv->mr->lkey; + + sge[0].length = IPOIB_CM_HEAD_SIZE; + for (i = 1; i < priv->cm.num_frags; ++i) + sge[i].length = PAGE_SIZE; + + wr->next = NULL; + wr->sg_list = sge; + wr->num_sge = priv->cm.num_frags; +} + +static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id, + struct ipoib_cm_rx *rx) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct { + struct ib_recv_wr wr; + struct ib_sge sge[IPOIB_CM_RX_SG]; + } *t; + int ret; + int i; + + rx->rx_ring = vzalloc(ipoib_recvq_size * sizeof *rx->rx_ring); + if (!rx->rx_ring) { + printk(KERN_WARNING "%s: failed to allocate CM non-SRQ ring (%d entries)\n", + priv->ca->name, ipoib_recvq_size); + return -ENOMEM; + } + + t = kmalloc(sizeof *t, GFP_KERNEL); + if (!t) { + ret = -ENOMEM; + goto err_free; + } + + ipoib_cm_init_rx_wr(dev, &t->wr, t->sge); + + spin_lock_irq(&priv->lock); + + if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) { + spin_unlock_irq(&priv->lock); + ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0); + ret = -EINVAL; + goto err_free; + } else + ++priv->cm.nonsrq_conn_qp; + + spin_unlock_irq(&priv->lock); + + for (i = 0; i < ipoib_recvq_size; ++i) { + if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1, + rx->rx_ring[i].mapping)) { + ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); + ret = -ENOMEM; + goto err_count; + } + ret = ipoib_cm_post_receive_nonsrq(dev, rx, &t->wr, t->sge, i); + if (ret) { + ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq " + "failed for buf %d\n", i); + ret = -EIO; + goto err_count; + } + } + + rx->recv_count = ipoib_recvq_size; + + kfree(t); + + return 0; + +err_count: + spin_lock_irq(&priv->lock); + --priv->cm.nonsrq_conn_qp; + spin_unlock_irq(&priv->lock); + +err_free: + kfree(t); + ipoib_cm_free_rx_ring(dev, rx->rx_ring); + + return ret; +} + +static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id, + struct ib_qp *qp, struct ib_cm_req_event_param *req, + unsigned psn) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_data data = {}; + struct ib_cm_rep_param rep = {}; + + data.qpn = cpu_to_be32(priv->qp->qp_num); + data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE); + + rep.private_data = &data; + rep.private_data_len = sizeof data; + rep.flow_control = 0; + rep.rnr_retry_count = req->rnr_retry_count; + rep.srq = ipoib_cm_has_srq(dev); + rep.qp_num = qp->qp_num; + rep.starting_psn = psn; + return ib_send_cm_rep(cm_id, &rep); +} + +static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +{ + struct net_device *dev = cm_id->context; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_rx *p; + unsigned psn; + int ret; + + ipoib_dbg(priv, "REQ arrived\n"); + p = kzalloc(sizeof *p, GFP_KERNEL); + if (!p) + return -ENOMEM; + p->dev = dev; + p->id = cm_id; + cm_id->context = p; + p->state = IPOIB_CM_RX_LIVE; + p->jiffies = jiffies; + INIT_LIST_HEAD(&p->list); + + p->qp = ipoib_cm_create_rx_qp(dev, p); + if (IS_ERR(p->qp)) { + ret = PTR_ERR(p->qp); + goto err_qp; + } + + psn = random32() & 0xffffff; + ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn); + if (ret) + goto err_modify; + + if (!ipoib_cm_has_srq(dev)) { + ret = ipoib_cm_nonsrq_init_rx(dev, cm_id, p); + if (ret) + goto err_modify; + } + + spin_lock_irq(&priv->lock); + queue_delayed_work(ipoib_workqueue, + &priv->cm.stale_task, IPOIB_CM_RX_DELAY); + /* Add this entry to passive ids list head, but do not re-add it + * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */ + p->jiffies = jiffies; + if (p->state == IPOIB_CM_RX_LIVE) + list_move(&p->list, &priv->cm.passive_ids); + spin_unlock_irq(&priv->lock); + + ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn); + if (ret) { + ipoib_warn(priv, "failed to send REP: %d\n", ret); + if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) + ipoib_warn(priv, "unable to move qp to error state\n"); + } + return 0; + +err_modify: + ib_destroy_qp(p->qp); +err_qp: + kfree(p); + return ret; +} + +static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event) +{ + struct ipoib_cm_rx *p; + struct ipoib_dev_priv *priv; + + switch (event->event) { + case IB_CM_REQ_RECEIVED: + return ipoib_cm_req_handler(cm_id, event); + case IB_CM_DREQ_RECEIVED: + p = cm_id->context; + ib_send_cm_drep(cm_id, NULL, 0); + /* Fall through */ + case IB_CM_REJ_RECEIVED: + p = cm_id->context; + priv = netdev_priv(p->dev); + if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) + ipoib_warn(priv, "unable to move qp to error state\n"); + /* Fall through */ + default: + return 0; + } +} +/* Adjust length of skb with fragments to match received data */ +static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space, + unsigned int length, struct sk_buff *toskb) +{ + int i, num_frags; + unsigned int size; + + /* put header into skb */ + size = min(length, hdr_space); + skb->tail += size; + skb->len += size; + length -= size; + + num_frags = skb_shinfo(skb)->nr_frags; + for (i = 0; i < num_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (length == 0) { + /* don't need this page */ + skb_fill_page_desc(toskb, i, skb_frag_page(frag), + 0, PAGE_SIZE); + --skb_shinfo(skb)->nr_frags; + } else { + size = min(length, (unsigned) PAGE_SIZE); + + skb_frag_size_set(frag, size); + skb->data_len += size; + skb->truesize += size; + skb->len += size; + length -= size; + } + } +} + +void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_rx_buf *rx_ring; + unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV); + struct sk_buff *skb, *newskb; + struct ipoib_cm_rx *p; + unsigned long flags; + u64 mapping[IPOIB_CM_RX_SG]; + int frags; + int has_srq; + struct sk_buff *small_skb; + + ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= ipoib_recvq_size)) { + if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) { + spin_lock_irqsave(&priv->lock, flags); + list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); + ipoib_cm_start_rx_drain(priv); + queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); + spin_unlock_irqrestore(&priv->lock, flags); + } else + ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n", + wr_id, ipoib_recvq_size); + return; + } + + p = wc->qp->qp_context; + + has_srq = ipoib_cm_has_srq(dev); + rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring; + + skb = rx_ring[wr_id].skb; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + ipoib_dbg(priv, "cm recv error " + "(status=%d, wrid=%d vend_err %x)\n", + wc->status, wr_id, wc->vendor_err); + ++dev->stats.rx_dropped; + if (has_srq) + goto repost; + else { + if (!--p->recv_count) { + spin_lock_irqsave(&priv->lock, flags); + list_move(&p->list, &priv->cm.rx_reap_list); + spin_unlock_irqrestore(&priv->lock, flags); + queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); + } + return; + } + } + + if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) { + if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) { + spin_lock_irqsave(&priv->lock, flags); + p->jiffies = jiffies; + /* Move this entry to list head, but do not re-add it + * if it has been moved out of list. */ + if (p->state == IPOIB_CM_RX_LIVE) + list_move(&p->list, &priv->cm.passive_ids); + spin_unlock_irqrestore(&priv->lock, flags); + } + } + + if (wc->byte_len < IPOIB_CM_COPYBREAK) { + int dlen = wc->byte_len; + + small_skb = dev_alloc_skb(dlen + 12); + if (small_skb) { + skb_reserve(small_skb, 12); + ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0], + dlen, DMA_FROM_DEVICE); + skb_copy_from_linear_data(skb, small_skb->data, dlen); + ib_dma_sync_single_for_device(priv->ca, rx_ring[wr_id].mapping[0], + dlen, DMA_FROM_DEVICE); + skb_put(small_skb, dlen); + skb = small_skb; + goto copied; + } + } + + frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len, + (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE; + + newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, mapping); + if (unlikely(!newskb)) { + /* + * If we can't allocate a new RX buffer, dump + * this packet and reuse the old buffer. + */ + ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id); + ++dev->stats.rx_dropped; + goto repost; + } + + ipoib_cm_dma_unmap_rx(priv, frags, rx_ring[wr_id].mapping); + memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping); + + ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", + wc->byte_len, wc->slid); + + skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb); + +copied: + skb->protocol = ((struct ipoib_header *) skb->data)->proto; + skb_reset_mac_header(skb); + skb_pull(skb, IPOIB_ENCAP_LEN); + + ++dev->stats.rx_packets; + dev->stats.rx_bytes += skb->len; + + skb->dev = dev; + /* XXX get correct PACKET_ type here */ + skb->pkt_type = PACKET_HOST; + netif_receive_skb(skb); + +repost: + if (has_srq) { + if (unlikely(ipoib_cm_post_receive_srq(dev, wr_id))) + ipoib_warn(priv, "ipoib_cm_post_receive_srq failed " + "for buf %d\n", wr_id); + } else { + if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p, + &priv->cm.rx_wr, + priv->cm.rx_sge, + wr_id))) { + --p->recv_count; + ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed " + "for buf %d\n", wr_id); + } + } +} + +static inline int post_send(struct ipoib_dev_priv *priv, + struct ipoib_cm_tx *tx, + unsigned int wr_id, + u64 addr, int len) +{ + struct ib_send_wr *bad_wr; + + priv->tx_sge[0].addr = addr; + priv->tx_sge[0].length = len; + + priv->tx_wr.num_sge = 1; + priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM; + + return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr); +} + +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_tx_buf *tx_req; + u64 addr; + int rc; + + if (unlikely(skb->len > tx->mtu)) { + ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", + skb->len, tx->mtu); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + ipoib_cm_skb_too_long(dev, skb, tx->mtu - IPOIB_ENCAP_LEN); + return; + } + + ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n", + tx->tx_head, skb->len, tx->qp->qp_num); + + /* + * We put the skb into the tx_ring _before_ we call post_send() + * because it's entirely possible that the completion handler will + * run before we execute anything after the post_send(). That + * means we have to make sure everything is properly recorded and + * our state is consistent before we call post_send(). + */ + tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)]; + tx_req->skb = skb; + addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, addr))) { + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + + tx_req->mapping = addr; + + rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), + addr, skb->len); + if (unlikely(rc)) { + ipoib_warn(priv, "post_send failed, error %d\n", rc); + ++dev->stats.tx_errors; + ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE); + dev_kfree_skb_any(skb); + } else { + dev->trans_start = jiffies; + ++tx->tx_head; + + if (++priv->tx_outstanding == ipoib_sendq_size) { + ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", + tx->qp->qp_num); + if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP)) + ipoib_warn(priv, "request notify on send CQ failed\n"); + netif_stop_queue(dev); + } + } +} + +void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_tx *tx = wc->qp->qp_context; + unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM; + struct ipoib_cm_tx_buf *tx_req; + unsigned long flags; + + ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= ipoib_sendq_size)) { + ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n", + wr_id, ipoib_sendq_size); + return; + } + + tx_req = &tx->tx_ring[wr_id]; + + ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE); + + /* FIXME: is this right? Shouldn't we only increment on success? */ + ++dev->stats.tx_packets; + dev->stats.tx_bytes += tx_req->skb->len; + + dev_kfree_skb_any(tx_req->skb); + + netif_tx_lock(dev); + + ++tx->tx_tail; + if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && + netif_queue_stopped(dev) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + netif_wake_queue(dev); + + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) { + struct ipoib_neigh *neigh; + + ipoib_dbg(priv, "failed cm send event " + "(status=%d, wrid=%d vend_err %x)\n", + wc->status, wr_id, wc->vendor_err); + + spin_lock_irqsave(&priv->lock, flags); + neigh = tx->neigh; + + if (neigh) { + neigh->cm = NULL; + list_del(&neigh->list); + if (neigh->ah) + ipoib_put_ah(neigh->ah); + ipoib_neigh_free(dev, neigh); + + tx->neigh = NULL; + } + + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + list_move(&tx->list, &priv->cm.reap_list); + queue_work(ipoib_workqueue, &priv->cm.reap_task); + } + + clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags); + + spin_unlock_irqrestore(&priv->lock, flags); + } + + netif_tx_unlock(dev); +} + +int ipoib_cm_dev_open(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret; + + if (!IPOIB_CM_SUPPORTED(dev->dev_addr)) + return 0; + + priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev); + if (IS_ERR(priv->cm.id)) { + printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name); + ret = PTR_ERR(priv->cm.id); + goto err_cm; + } + + ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num), + 0, NULL); + if (ret) { + printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name, + IPOIB_CM_IETF_ID | priv->qp->qp_num); + goto err_listen; + } + + return 0; + +err_listen: + ib_destroy_cm_id(priv->cm.id); +err_cm: + priv->cm.id = NULL; + return ret; +} + +static void ipoib_cm_free_rx_reap_list(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_rx *rx, *n; + LIST_HEAD(list); + + spin_lock_irq(&priv->lock); + list_splice_init(&priv->cm.rx_reap_list, &list); + spin_unlock_irq(&priv->lock); + + list_for_each_entry_safe(rx, n, &list, list) { + ib_destroy_cm_id(rx->id); + ib_destroy_qp(rx->qp); + if (!ipoib_cm_has_srq(dev)) { + ipoib_cm_free_rx_ring(priv->dev, rx->rx_ring); + spin_lock_irq(&priv->lock); + --priv->cm.nonsrq_conn_qp; + spin_unlock_irq(&priv->lock); + } + kfree(rx); + } +} + +void ipoib_cm_dev_stop(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_rx *p; + unsigned long begin; + int ret; + + if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id) + return; + + ib_destroy_cm_id(priv->cm.id); + priv->cm.id = NULL; + + spin_lock_irq(&priv->lock); + while (!list_empty(&priv->cm.passive_ids)) { + p = list_entry(priv->cm.passive_ids.next, typeof(*p), list); + list_move(&p->list, &priv->cm.rx_error_list); + p->state = IPOIB_CM_RX_ERROR; + spin_unlock_irq(&priv->lock); + ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE); + if (ret) + ipoib_warn(priv, "unable to move qp to error state: %d\n", ret); + spin_lock_irq(&priv->lock); + } + + /* Wait for all RX to be drained */ + begin = jiffies; + + while (!list_empty(&priv->cm.rx_error_list) || + !list_empty(&priv->cm.rx_flush_list) || + !list_empty(&priv->cm.rx_drain_list)) { + if (time_after(jiffies, begin + 5 * HZ)) { + ipoib_warn(priv, "RX drain timing out\n"); + + /* + * assume the HW is wedged and just free up everything. + */ + list_splice_init(&priv->cm.rx_flush_list, + &priv->cm.rx_reap_list); + list_splice_init(&priv->cm.rx_error_list, + &priv->cm.rx_reap_list); + list_splice_init(&priv->cm.rx_drain_list, + &priv->cm.rx_reap_list); + break; + } + spin_unlock_irq(&priv->lock); + msleep(1); + ipoib_drain_cq(dev); + spin_lock_irq(&priv->lock); + } + + spin_unlock_irq(&priv->lock); + + ipoib_cm_free_rx_reap_list(dev); + + cancel_delayed_work(&priv->cm.stale_task); +} + +static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +{ + struct ipoib_cm_tx *p = cm_id->context; + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + struct ipoib_cm_data *data = event->private_data; + struct sk_buff_head skqueue; + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + struct sk_buff *skb; + + p->mtu = be32_to_cpu(data->mtu); + + if (p->mtu <= IPOIB_ENCAP_LEN) { + ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n", + p->mtu, IPOIB_ENCAP_LEN); + return -EINVAL; + } + + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret); + return ret; + } + + qp_attr.rq_psn = 0 /* FIXME */; + ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret); + return ret; + } + + qp_attr.qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret); + return ret; + } + ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret); + return ret; + } + + skb_queue_head_init(&skqueue); + + spin_lock_irq(&priv->lock); + set_bit(IPOIB_FLAG_OPER_UP, &p->flags); + if (p->neigh) + while ((skb = __skb_dequeue(&p->neigh->queue))) + __skb_queue_tail(&skqueue, skb); + spin_unlock_irq(&priv->lock); + + while ((skb = __skb_dequeue(&skqueue))) { + skb->dev = p->dev; + if (dev_queue_xmit(skb)) + ipoib_warn(priv, "dev_queue_xmit failed " + "to requeue packet\n"); + } + + ret = ib_send_cm_rtu(cm_id, NULL, 0); + if (ret) { + ipoib_warn(priv, "failed to send RTU: %d\n", ret); + return ret; + } + return 0; +} + +static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_init_attr attr = { + .send_cq = priv->recv_cq, + .recv_cq = priv->recv_cq, + .srq = priv->cm.srq, + .cap.max_send_wr = ipoib_sendq_size, + .cap.max_send_sge = 1, + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_RC, + .qp_context = tx + }; + + return ib_create_qp(priv->pd, &attr); +} + +static int ipoib_cm_send_req(struct net_device *dev, + struct ib_cm_id *id, struct ib_qp *qp, + u32 qpn, + struct ib_sa_path_rec *pathrec) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_data data = {}; + struct ib_cm_req_param req = {}; + + data.qpn = cpu_to_be32(priv->qp->qp_num); + data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE); + + req.primary_path = pathrec; + req.alternate_path = NULL; + req.service_id = cpu_to_be64(IPOIB_CM_IETF_ID | qpn); + req.qp_num = qp->qp_num; + req.qp_type = qp->qp_type; + req.private_data = &data; + req.private_data_len = sizeof data; + req.flow_control = 0; + + req.starting_psn = 0; /* FIXME */ + + /* + * Pick some arbitrary defaults here; we could make these + * module parameters if anyone cared about setting them. + */ + req.responder_resources = 4; + req.remote_cm_response_timeout = 20; + req.local_cm_response_timeout = 20; + req.retry_count = 0; /* RFC draft warns against retries */ + req.rnr_retry_count = 0; /* RFC draft warns against retries */ + req.max_cm_retries = 15; + req.srq = ipoib_cm_has_srq(dev); + return ib_send_cm_req(id, &req); +} + +static int ipoib_cm_modify_tx_init(struct net_device *dev, + struct ib_cm_id *cm_id, struct ib_qp *qp) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + ret = ib_find_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index); + if (ret) { + ipoib_warn(priv, "pkey 0x%x not found: %d\n", priv->pkey, ret); + return ret; + } + + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE; + qp_attr.port_num = priv->port; + qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT; + + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret); + return ret; + } + return 0; +} + +static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, + struct ib_sa_path_rec *pathrec) +{ + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + int ret; + + p->tx_ring = vzalloc(ipoib_sendq_size * sizeof *p->tx_ring); + if (!p->tx_ring) { + ipoib_warn(priv, "failed to allocate tx ring\n"); + ret = -ENOMEM; + goto err_tx; + } + + p->qp = ipoib_cm_create_tx_qp(p->dev, p); + if (IS_ERR(p->qp)) { + ret = PTR_ERR(p->qp); + ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret); + goto err_qp; + } + + p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p); + if (IS_ERR(p->id)) { + ret = PTR_ERR(p->id); + ipoib_warn(priv, "failed to create tx cm id: %d\n", ret); + goto err_id; + } + + ret = ipoib_cm_modify_tx_init(p->dev, p->id, p->qp); + if (ret) { + ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret); + goto err_modify; + } + + ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec); + if (ret) { + ipoib_warn(priv, "failed to send cm req: %d\n", ret); + goto err_send_cm; + } + + ipoib_dbg(priv, "Request connection 0x%x for gid %pI6 qpn 0x%x\n", + p->qp->qp_num, pathrec->dgid.raw, qpn); + + return 0; + +err_send_cm: +err_modify: + ib_destroy_cm_id(p->id); +err_id: + p->id = NULL; + ib_destroy_qp(p->qp); +err_qp: + p->qp = NULL; + vfree(p->tx_ring); +err_tx: + return ret; +} + +static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) +{ + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + struct ipoib_cm_tx_buf *tx_req; + unsigned long begin; + + ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n", + p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail); + + if (p->id) + ib_destroy_cm_id(p->id); + + if (p->tx_ring) { + /* Wait for all sends to complete */ + begin = jiffies; + while ((int) p->tx_tail - (int) p->tx_head < 0) { + if (time_after(jiffies, begin + 5 * HZ)) { + ipoib_warn(priv, "timing out; %d sends not completed\n", + p->tx_head - p->tx_tail); + goto timeout; + } + + msleep(1); + } + } + +timeout: + + while ((int) p->tx_tail - (int) p->tx_head < 0) { + tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)]; + ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, + DMA_TO_DEVICE); + dev_kfree_skb_any(tx_req->skb); + ++p->tx_tail; + netif_tx_lock_bh(p->dev); + if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && + netif_queue_stopped(p->dev) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + netif_wake_queue(p->dev); + netif_tx_unlock_bh(p->dev); + } + + if (p->qp) + ib_destroy_qp(p->qp); + + vfree(p->tx_ring); + kfree(p); +} + +static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event) +{ + struct ipoib_cm_tx *tx = cm_id->context; + struct ipoib_dev_priv *priv = netdev_priv(tx->dev); + struct net_device *dev = priv->dev; + struct ipoib_neigh *neigh; + unsigned long flags; + int ret; + + switch (event->event) { + case IB_CM_DREQ_RECEIVED: + ipoib_dbg(priv, "DREQ received.\n"); + ib_send_cm_drep(cm_id, NULL, 0); + break; + case IB_CM_REP_RECEIVED: + ipoib_dbg(priv, "REP received.\n"); + ret = ipoib_cm_rep_handler(cm_id, event); + if (ret) + ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, + NULL, 0, NULL, 0); + break; + case IB_CM_REQ_ERROR: + case IB_CM_REJ_RECEIVED: + case IB_CM_TIMEWAIT_EXIT: + ipoib_dbg(priv, "CM error %d.\n", event->event); + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + neigh = tx->neigh; + + if (neigh) { + neigh->cm = NULL; + list_del(&neigh->list); + if (neigh->ah) + ipoib_put_ah(neigh->ah); + ipoib_neigh_free(dev, neigh); + + tx->neigh = NULL; + } + + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + list_move(&tx->list, &priv->cm.reap_list); + queue_work(ipoib_workqueue, &priv->cm.reap_task); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + break; + default: + break; + } + + return 0; +} + +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_tx *tx; + + tx = kzalloc(sizeof *tx, GFP_ATOMIC); + if (!tx) + return NULL; + + neigh->cm = tx; + tx->neigh = neigh; + tx->path = path; + tx->dev = dev; + list_add(&tx->list, &priv->cm.start_list); + set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags); + queue_work(ipoib_workqueue, &priv->cm.start_task); + return tx; +} + +void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) +{ + struct ipoib_dev_priv *priv = netdev_priv(tx->dev); + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + list_move(&tx->list, &priv->cm.reap_list); + queue_work(ipoib_workqueue, &priv->cm.reap_task); + ipoib_dbg(priv, "Reap connection for gid %pI6\n", + tx->neigh->dgid.raw); + tx->neigh = NULL; + } +} + +static void ipoib_cm_tx_start(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.start_task); + struct net_device *dev = priv->dev; + struct ipoib_neigh *neigh; + struct ipoib_cm_tx *p; + unsigned long flags; + int ret; + + struct ib_sa_path_rec pathrec; + u32 qpn; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + while (!list_empty(&priv->cm.start_list)) { + p = list_entry(priv->cm.start_list.next, typeof(*p), list); + list_del_init(&p->list); + neigh = p->neigh; + qpn = IPOIB_QPN(neigh->neighbour->ha); + memcpy(&pathrec, &p->path->pathrec, sizeof pathrec); + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + + ret = ipoib_cm_tx_init(p, qpn, &pathrec); + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + if (ret) { + neigh = p->neigh; + if (neigh) { + neigh->cm = NULL; + list_del(&neigh->list); + if (neigh->ah) + ipoib_put_ah(neigh->ah); + ipoib_neigh_free(dev, neigh); + } + list_del(&p->list); + kfree(p); + } + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +static void ipoib_cm_tx_reap(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.reap_task); + struct net_device *dev = priv->dev; + struct ipoib_cm_tx *p; + unsigned long flags; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + while (!list_empty(&priv->cm.reap_list)) { + p = list_entry(priv->cm.reap_list.next, typeof(*p), list); + list_del(&p->list); + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + ipoib_cm_tx_destroy(p); + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +static void ipoib_cm_skb_reap(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.skb_task); + struct net_device *dev = priv->dev; + struct sk_buff *skb; + unsigned long flags; + unsigned mtu = priv->mcast_mtu; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + while ((skb = skb_dequeue(&priv->cm.skb_queue))) { + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + + if (skb->protocol == htons(ETH_P_IP)) + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (skb->protocol == htons(ETH_P_IPV6)) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); +#endif + dev_kfree_skb_any(skb); + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, + unsigned int mtu) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int e = skb_queue_empty(&priv->cm.skb_queue); + + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + + skb_queue_tail(&priv->cm.skb_queue, skb); + if (e) + queue_work(ipoib_workqueue, &priv->cm.skb_task); +} + +static void ipoib_cm_rx_reap(struct work_struct *work) +{ + ipoib_cm_free_rx_reap_list(container_of(work, struct ipoib_dev_priv, + cm.rx_reap_task)->dev); +} + +static void ipoib_cm_stale_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.stale_task.work); + struct ipoib_cm_rx *p; + int ret; + + spin_lock_irq(&priv->lock); + while (!list_empty(&priv->cm.passive_ids)) { + /* List is sorted by LRU, start from tail, + * stop when we see a recently used entry */ + p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list); + if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT)) + break; + list_move(&p->list, &priv->cm.rx_error_list); + p->state = IPOIB_CM_RX_ERROR; + spin_unlock_irq(&priv->lock); + ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE); + if (ret) + ipoib_warn(priv, "unable to move qp to error state: %d\n", ret); + spin_lock_irq(&priv->lock); + } + + if (!list_empty(&priv->cm.passive_ids)) + queue_delayed_work(ipoib_workqueue, + &priv->cm.stale_task, IPOIB_CM_RX_DELAY); + spin_unlock_irq(&priv->lock); +} + + +static ssize_t show_mode(struct device *d, struct device_attribute *attr, + char *buf) +{ + struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(d)); + + if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) + return sprintf(buf, "connected\n"); + else + return sprintf(buf, "datagram\n"); +} + +static ssize_t set_mode(struct device *d, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct net_device *dev = to_net_dev(d); + struct ipoib_dev_priv *priv = netdev_priv(dev); + + if (!rtnl_trylock()) + return restart_syscall(); + + /* flush paths if we switch modes so that connections are restarted */ + if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) { + set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + ipoib_warn(priv, "enabling connected mode " + "will cause multicast packet drops\n"); + netdev_update_features(dev); + rtnl_unlock(); + priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; + + ipoib_flush_paths(dev); + return count; + } + + if (!strcmp(buf, "datagram\n")) { + clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + netdev_update_features(dev); + dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); + rtnl_unlock(); + ipoib_flush_paths(dev); + + return count; + } + rtnl_unlock(); + + return -EINVAL; +} + +static DEVICE_ATTR(mode, S_IWUSR | S_IRUGO, show_mode, set_mode); + +int ipoib_cm_add_mode_attr(struct net_device *dev) +{ + return device_create_file(&dev->dev, &dev_attr_mode); +} + +static void ipoib_cm_create_srq(struct net_device *dev, int max_sge) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_srq_init_attr srq_init_attr = { + .srq_type = IB_SRQT_BASIC, + .attr = { + .max_wr = ipoib_recvq_size, + .max_sge = max_sge + } + }; + + priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr); + if (IS_ERR(priv->cm.srq)) { + if (PTR_ERR(priv->cm.srq) != -ENOSYS) + printk(KERN_WARNING "%s: failed to allocate SRQ, error %ld\n", + priv->ca->name, PTR_ERR(priv->cm.srq)); + priv->cm.srq = NULL; + return; + } + + priv->cm.srq_ring = vzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring); + if (!priv->cm.srq_ring) { + printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n", + priv->ca->name, ipoib_recvq_size); + ib_destroy_srq(priv->cm.srq); + priv->cm.srq = NULL; + return; + } + +} + +int ipoib_cm_dev_init(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i, ret; + struct ib_device_attr attr; + + INIT_LIST_HEAD(&priv->cm.passive_ids); + INIT_LIST_HEAD(&priv->cm.reap_list); + INIT_LIST_HEAD(&priv->cm.start_list); + INIT_LIST_HEAD(&priv->cm.rx_error_list); + INIT_LIST_HEAD(&priv->cm.rx_flush_list); + INIT_LIST_HEAD(&priv->cm.rx_drain_list); + INIT_LIST_HEAD(&priv->cm.rx_reap_list); + INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start); + INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap); + INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap); + INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap); + INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task); + + skb_queue_head_init(&priv->cm.skb_queue); + + ret = ib_query_device(priv->ca, &attr); + if (ret) { + printk(KERN_WARNING "ib_query_device() failed with %d\n", ret); + return ret; + } + + ipoib_dbg(priv, "max_srq_sge=%d\n", attr.max_srq_sge); + + attr.max_srq_sge = min_t(int, IPOIB_CM_RX_SG, attr.max_srq_sge); + ipoib_cm_create_srq(dev, attr.max_srq_sge); + if (ipoib_cm_has_srq(dev)) { + priv->cm.max_cm_mtu = attr.max_srq_sge * PAGE_SIZE - 0x10; + priv->cm.num_frags = attr.max_srq_sge; + ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n", + priv->cm.max_cm_mtu, priv->cm.num_frags); + } else { + priv->cm.max_cm_mtu = IPOIB_CM_MTU; + priv->cm.num_frags = IPOIB_CM_RX_SG; + } + + ipoib_cm_init_rx_wr(dev, &priv->cm.rx_wr, priv->cm.rx_sge); + + if (ipoib_cm_has_srq(dev)) { + for (i = 0; i < ipoib_recvq_size; ++i) { + if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i, + priv->cm.num_frags - 1, + priv->cm.srq_ring[i].mapping)) { + ipoib_warn(priv, "failed to allocate " + "receive buffer %d\n", i); + ipoib_cm_dev_cleanup(dev); + return -ENOMEM; + } + + if (ipoib_cm_post_receive_srq(dev, i)) { + ipoib_warn(priv, "ipoib_cm_post_receive_srq " + "failed for buf %d\n", i); + ipoib_cm_dev_cleanup(dev); + return -EIO; + } + } + } + + priv->dev->dev_addr[0] = IPOIB_FLAGS_RC; + return 0; +} + +void ipoib_cm_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret; + + if (!priv->cm.srq) + return; + + ipoib_dbg(priv, "Cleanup ipoib connected mode.\n"); + + ret = ib_destroy_srq(priv->cm.srq); + if (ret) + ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret); + + priv->cm.srq = NULL; + if (!priv->cm.srq_ring) + return; + + ipoib_cm_free_rx_ring(dev, priv->cm.srq_ring); + priv->cm.srq_ring = NULL; +} diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c new file mode 100644 index 00000000..29bc7b57 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/kernel.h> +#include <linux/ethtool.h> +#include <linux/netdevice.h> + +#include "ipoib.h" + +static void ipoib_get_drvinfo(struct net_device *netdev, + struct ethtool_drvinfo *drvinfo) +{ + strncpy(drvinfo->driver, "ipoib", sizeof(drvinfo->driver) - 1); +} + +static int ipoib_get_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + coal->rx_coalesce_usecs = priv->ethtool.coalesce_usecs; + coal->rx_max_coalesced_frames = priv->ethtool.max_coalesced_frames; + + return 0; +} + +static int ipoib_set_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret; + + /* + * These values are saved in the private data and returned + * when ipoib_get_coalesce() is called + */ + if (coal->rx_coalesce_usecs > 0xffff || + coal->rx_max_coalesced_frames > 0xffff) + return -EINVAL; + + ret = ib_modify_cq(priv->recv_cq, coal->rx_max_coalesced_frames, + coal->rx_coalesce_usecs); + if (ret && ret != -ENOSYS) { + ipoib_warn(priv, "failed modifying CQ (%d)\n", ret); + return ret; + } + + priv->ethtool.coalesce_usecs = coal->rx_coalesce_usecs; + priv->ethtool.max_coalesced_frames = coal->rx_max_coalesced_frames; + + return 0; +} + +static const struct ethtool_ops ipoib_ethtool_ops = { + .get_drvinfo = ipoib_get_drvinfo, + .get_coalesce = ipoib_get_coalesce, + .set_coalesce = ipoib_set_coalesce, +}; + +void ipoib_set_ethtool_ops(struct net_device *dev) +{ + SET_ETHTOOL_OPS(dev, &ipoib_ethtool_ops); +} diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c new file mode 100644 index 00000000..50061854 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/err.h> +#include <linux/seq_file.h> +#include <linux/slab.h> + +struct file_operations; + +#include <linux/debugfs.h> +#include <linux/export.h> + +#include "ipoib.h" + +static struct dentry *ipoib_root; + +static void format_gid(union ib_gid *gid, char *buf) +{ + int i, n; + + for (n = 0, i = 0; i < 8; ++i) { + n += sprintf(buf + n, "%x", + be16_to_cpu(((__be16 *) gid->raw)[i])); + if (i < 7) + buf[n++] = ':'; + } +} + +static void *ipoib_mcg_seq_start(struct seq_file *file, loff_t *pos) +{ + struct ipoib_mcast_iter *iter; + loff_t n = *pos; + + iter = ipoib_mcast_iter_init(file->private); + if (!iter) + return NULL; + + while (n--) { + if (ipoib_mcast_iter_next(iter)) { + kfree(iter); + return NULL; + } + } + + return iter; +} + +static void *ipoib_mcg_seq_next(struct seq_file *file, void *iter_ptr, + loff_t *pos) +{ + struct ipoib_mcast_iter *iter = iter_ptr; + + (*pos)++; + + if (ipoib_mcast_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +static void ipoib_mcg_seq_stop(struct seq_file *file, void *iter_ptr) +{ + /* nothing for now */ +} + +static int ipoib_mcg_seq_show(struct seq_file *file, void *iter_ptr) +{ + struct ipoib_mcast_iter *iter = iter_ptr; + char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"]; + union ib_gid mgid; + unsigned long created; + unsigned int queuelen, complete, send_only; + + if (!iter) + return 0; + + ipoib_mcast_iter_read(iter, &mgid, &created, &queuelen, + &complete, &send_only); + + format_gid(&mgid, gid_buf); + + seq_printf(file, + "GID: %s\n" + " created: %10ld\n" + " queuelen: %9d\n" + " complete: %9s\n" + " send_only: %8s\n" + "\n", + gid_buf, created, queuelen, + complete ? "yes" : "no", + send_only ? "yes" : "no"); + + return 0; +} + +static const struct seq_operations ipoib_mcg_seq_ops = { + .start = ipoib_mcg_seq_start, + .next = ipoib_mcg_seq_next, + .stop = ipoib_mcg_seq_stop, + .show = ipoib_mcg_seq_show, +}; + +static int ipoib_mcg_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &ipoib_mcg_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; + + return 0; +} + +static const struct file_operations ipoib_mcg_fops = { + .owner = THIS_MODULE, + .open = ipoib_mcg_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static void *ipoib_path_seq_start(struct seq_file *file, loff_t *pos) +{ + struct ipoib_path_iter *iter; + loff_t n = *pos; + + iter = ipoib_path_iter_init(file->private); + if (!iter) + return NULL; + + while (n--) { + if (ipoib_path_iter_next(iter)) { + kfree(iter); + return NULL; + } + } + + return iter; +} + +static void *ipoib_path_seq_next(struct seq_file *file, void *iter_ptr, + loff_t *pos) +{ + struct ipoib_path_iter *iter = iter_ptr; + + (*pos)++; + + if (ipoib_path_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +static void ipoib_path_seq_stop(struct seq_file *file, void *iter_ptr) +{ + /* nothing for now */ +} + +static int ipoib_path_seq_show(struct seq_file *file, void *iter_ptr) +{ + struct ipoib_path_iter *iter = iter_ptr; + char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"]; + struct ipoib_path path; + int rate; + + if (!iter) + return 0; + + ipoib_path_iter_read(iter, &path); + + format_gid(&path.pathrec.dgid, gid_buf); + + seq_printf(file, + "GID: %s\n" + " complete: %6s\n", + gid_buf, path.pathrec.dlid ? "yes" : "no"); + + if (path.pathrec.dlid) { + rate = ib_rate_to_mbps(path.pathrec.rate); + + seq_printf(file, + " DLID: 0x%04x\n" + " SL: %12d\n" + " rate: %8d.%d Gb/sec\n", + be16_to_cpu(path.pathrec.dlid), + path.pathrec.sl, + rate / 1000, rate % 1000); + } + + seq_putc(file, '\n'); + + return 0; +} + +static const struct seq_operations ipoib_path_seq_ops = { + .start = ipoib_path_seq_start, + .next = ipoib_path_seq_next, + .stop = ipoib_path_seq_stop, + .show = ipoib_path_seq_show, +}; + +static int ipoib_path_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &ipoib_path_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; + + return 0; +} + +static const struct file_operations ipoib_path_fops = { + .owner = THIS_MODULE, + .open = ipoib_path_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +void ipoib_create_debug_files(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + char name[IFNAMSIZ + sizeof "_path"]; + + snprintf(name, sizeof name, "%s_mcg", dev->name); + priv->mcg_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, + ipoib_root, dev, &ipoib_mcg_fops); + if (!priv->mcg_dentry) + ipoib_warn(priv, "failed to create mcg debug file\n"); + + snprintf(name, sizeof name, "%s_path", dev->name); + priv->path_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, + ipoib_root, dev, &ipoib_path_fops); + if (!priv->path_dentry) + ipoib_warn(priv, "failed to create path debug file\n"); +} + +void ipoib_delete_debug_files(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + if (priv->mcg_dentry) + debugfs_remove(priv->mcg_dentry); + if (priv->path_dentry) + debugfs_remove(priv->path_dentry); +} + +int ipoib_register_debugfs(void) +{ + ipoib_root = debugfs_create_dir("ipoib", NULL); + return ipoib_root ? 0 : -ENOMEM; +} + +void ipoib_unregister_debugfs(void) +{ + debugfs_remove(ipoib_root); +} diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c new file mode 100644 index 00000000..5c1bc995 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -0,0 +1,1086 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/delay.h> +#include <linux/moduleparam.h> +#include <linux/dma-mapping.h> +#include <linux/slab.h> + +#include <linux/ip.h> +#include <linux/tcp.h> + +#include "ipoib.h" + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA +static int data_debug_level; + +module_param(data_debug_level, int, 0644); +MODULE_PARM_DESC(data_debug_level, + "Enable data path debug tracing if > 0"); +#endif + +static DEFINE_MUTEX(pkey_mutex); + +struct ipoib_ah *ipoib_create_ah(struct net_device *dev, + struct ib_pd *pd, struct ib_ah_attr *attr) +{ + struct ipoib_ah *ah; + struct ib_ah *vah; + + ah = kmalloc(sizeof *ah, GFP_KERNEL); + if (!ah) + return ERR_PTR(-ENOMEM); + + ah->dev = dev; + ah->last_send = 0; + kref_init(&ah->ref); + + vah = ib_create_ah(pd, attr); + if (IS_ERR(vah)) { + kfree(ah); + ah = (struct ipoib_ah *)vah; + } else { + ah->ah = vah; + ipoib_dbg(netdev_priv(dev), "Created ah %p\n", ah->ah); + } + + return ah; +} + +void ipoib_free_ah(struct kref *kref) +{ + struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref); + struct ipoib_dev_priv *priv = netdev_priv(ah->dev); + + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + list_add_tail(&ah->list, &priv->dead_ahs); + spin_unlock_irqrestore(&priv->lock, flags); +} + +static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv, + u64 mapping[IPOIB_UD_RX_SG]) +{ + if (ipoib_ud_need_sg(priv->max_ib_mtu)) { + ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_UD_HEAD_SIZE, + DMA_FROM_DEVICE); + ib_dma_unmap_page(priv->ca, mapping[1], PAGE_SIZE, + DMA_FROM_DEVICE); + } else + ib_dma_unmap_single(priv->ca, mapping[0], + IPOIB_UD_BUF_SIZE(priv->max_ib_mtu), + DMA_FROM_DEVICE); +} + +static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv, + struct sk_buff *skb, + unsigned int length) +{ + if (ipoib_ud_need_sg(priv->max_ib_mtu)) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[0]; + unsigned int size; + /* + * There is only two buffers needed for max_payload = 4K, + * first buf size is IPOIB_UD_HEAD_SIZE + */ + skb->tail += IPOIB_UD_HEAD_SIZE; + skb->len += length; + + size = length - IPOIB_UD_HEAD_SIZE; + + skb_frag_size_set(frag, size); + skb->data_len += size; + skb->truesize += size; + } else + skb_put(skb, length); + +} + +static int ipoib_ib_post_receive(struct net_device *dev, int id) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_recv_wr *bad_wr; + int ret; + + priv->rx_wr.wr_id = id | IPOIB_OP_RECV; + priv->rx_sge[0].addr = priv->rx_ring[id].mapping[0]; + priv->rx_sge[1].addr = priv->rx_ring[id].mapping[1]; + + + ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr); + if (unlikely(ret)) { + ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret); + ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[id].mapping); + dev_kfree_skb_any(priv->rx_ring[id].skb); + priv->rx_ring[id].skb = NULL; + } + + return ret; +} + +static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct sk_buff *skb; + int buf_size; + u64 *mapping; + + if (ipoib_ud_need_sg(priv->max_ib_mtu)) + buf_size = IPOIB_UD_HEAD_SIZE; + else + buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); + + skb = dev_alloc_skb(buf_size + 4); + if (unlikely(!skb)) + return NULL; + + /* + * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte + * header. So we need 4 more bytes to get to 48 and align the + * IP header to a multiple of 16. + */ + skb_reserve(skb, 4); + + mapping = priv->rx_ring[id].mapping; + mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size, + DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) + goto error; + + if (ipoib_ud_need_sg(priv->max_ib_mtu)) { + struct page *page = alloc_page(GFP_ATOMIC); + if (!page) + goto partial_error; + skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE); + mapping[1] = + ib_dma_map_page(priv->ca, page, + 0, PAGE_SIZE, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[1]))) + goto partial_error; + } + + priv->rx_ring[id].skb = skb; + return skb; + +partial_error: + ib_dma_unmap_single(priv->ca, mapping[0], buf_size, DMA_FROM_DEVICE); +error: + dev_kfree_skb_any(skb); + return NULL; +} + +static int ipoib_ib_post_receives(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i; + + for (i = 0; i < ipoib_recvq_size; ++i) { + if (!ipoib_alloc_rx_skb(dev, i)) { + ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); + return -ENOMEM; + } + if (ipoib_ib_post_receive(dev, i)) { + ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i); + return -EIO; + } + } + + return 0; +} + +static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV; + struct sk_buff *skb; + u64 mapping[IPOIB_UD_RX_SG]; + union ib_gid *dgid; + + ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= ipoib_recvq_size)) { + ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n", + wr_id, ipoib_recvq_size); + return; + } + + skb = priv->rx_ring[wr_id].skb; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + if (wc->status != IB_WC_WR_FLUSH_ERR) + ipoib_warn(priv, "failed recv event " + "(status=%d, wrid=%d vend_err %x)\n", + wc->status, wr_id, wc->vendor_err); + ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping); + dev_kfree_skb_any(skb); + priv->rx_ring[wr_id].skb = NULL; + return; + } + + /* + * Drop packets that this interface sent, ie multicast packets + * that the HCA has replicated. + */ + if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num) + goto repost; + + memcpy(mapping, priv->rx_ring[wr_id].mapping, + IPOIB_UD_RX_SG * sizeof *mapping); + + /* + * If we can't allocate a new RX buffer, dump + * this packet and reuse the old buffer. + */ + if (unlikely(!ipoib_alloc_rx_skb(dev, wr_id))) { + ++dev->stats.rx_dropped; + goto repost; + } + + ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", + wc->byte_len, wc->slid); + + ipoib_ud_dma_unmap_rx(priv, mapping); + ipoib_ud_skb_put_frags(priv, skb, wc->byte_len); + + /* First byte of dgid signals multicast when 0xff */ + dgid = &((struct ib_grh *)skb->data)->dgid; + + if (!(wc->wc_flags & IB_WC_GRH) || dgid->raw[0] != 0xff) + skb->pkt_type = PACKET_HOST; + else if (memcmp(dgid, dev->broadcast + 4, sizeof(union ib_gid)) == 0) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + + skb_pull(skb, IB_GRH_BYTES); + + skb->protocol = ((struct ipoib_header *) skb->data)->proto; + skb_reset_mac_header(skb); + skb_pull(skb, IPOIB_ENCAP_LEN); + + ++dev->stats.rx_packets; + dev->stats.rx_bytes += skb->len; + + skb->dev = dev; + if ((dev->features & NETIF_F_RXCSUM) && + likely(wc->wc_flags & IB_WC_IP_CSUM_OK)) + skb->ip_summed = CHECKSUM_UNNECESSARY; + + napi_gro_receive(&priv->napi, skb); + +repost: + if (unlikely(ipoib_ib_post_receive(dev, wr_id))) + ipoib_warn(priv, "ipoib_ib_post_receive failed " + "for buf %d\n", wr_id); +} + +static int ipoib_dma_map_tx(struct ib_device *ca, + struct ipoib_tx_buf *tx_req) +{ + struct sk_buff *skb = tx_req->skb; + u64 *mapping = tx_req->mapping; + int i; + int off; + + if (skb_headlen(skb)) { + mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb), + DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[0]))) + return -EIO; + + off = 1; + } else + off = 0; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + mapping[i + off] = ib_dma_map_page(ca, + skb_frag_page(frag), + frag->page_offset, skb_frag_size(frag), + DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[i + off]))) + goto partial_error; + } + return 0; + +partial_error: + for (; i > 0; --i) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1]; + + ib_dma_unmap_page(ca, mapping[i - !off], skb_frag_size(frag), DMA_TO_DEVICE); + } + + if (off) + ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE); + + return -EIO; +} + +static void ipoib_dma_unmap_tx(struct ib_device *ca, + struct ipoib_tx_buf *tx_req) +{ + struct sk_buff *skb = tx_req->skb; + u64 *mapping = tx_req->mapping; + int i; + int off; + + if (skb_headlen(skb)) { + ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE); + off = 1; + } else + off = 0; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + ib_dma_unmap_page(ca, mapping[i + off], skb_frag_size(frag), + DMA_TO_DEVICE); + } +} + +static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + unsigned int wr_id = wc->wr_id; + struct ipoib_tx_buf *tx_req; + + ipoib_dbg_data(priv, "send completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= ipoib_sendq_size)) { + ipoib_warn(priv, "send completion event with wrid %d (> %d)\n", + wr_id, ipoib_sendq_size); + return; + } + + tx_req = &priv->tx_ring[wr_id]; + + ipoib_dma_unmap_tx(priv->ca, tx_req); + + ++dev->stats.tx_packets; + dev->stats.tx_bytes += tx_req->skb->len; + + dev_kfree_skb_any(tx_req->skb); + + ++priv->tx_tail; + if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && + netif_queue_stopped(dev) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + netif_wake_queue(dev); + + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) + ipoib_warn(priv, "failed send event " + "(status=%d, wrid=%d vend_err %x)\n", + wc->status, wr_id, wc->vendor_err); +} + +static int poll_tx(struct ipoib_dev_priv *priv) +{ + int n, i; + + n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc); + for (i = 0; i < n; ++i) + ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i); + + return n == MAX_SEND_CQE; +} + +int ipoib_poll(struct napi_struct *napi, int budget) +{ + struct ipoib_dev_priv *priv = container_of(napi, struct ipoib_dev_priv, napi); + struct net_device *dev = priv->dev; + int done; + int t; + int n, i; + + done = 0; + +poll_more: + while (done < budget) { + int max = (budget - done); + + t = min(IPOIB_NUM_WC, max); + n = ib_poll_cq(priv->recv_cq, t, priv->ibwc); + + for (i = 0; i < n; i++) { + struct ib_wc *wc = priv->ibwc + i; + + if (wc->wr_id & IPOIB_OP_RECV) { + ++done; + if (wc->wr_id & IPOIB_OP_CM) + ipoib_cm_handle_rx_wc(dev, wc); + else + ipoib_ib_handle_rx_wc(dev, wc); + } else + ipoib_cm_handle_tx_wc(priv->dev, wc); + } + + if (n != t) + break; + } + + if (done < budget) { + napi_complete(napi); + if (unlikely(ib_req_notify_cq(priv->recv_cq, + IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS)) && + napi_reschedule(napi)) + goto poll_more; + } + + return done; +} + +void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr) +{ + struct net_device *dev = dev_ptr; + struct ipoib_dev_priv *priv = netdev_priv(dev); + + napi_schedule(&priv->napi); +} + +static void drain_tx_cq(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + netif_tx_lock(dev); + while (poll_tx(priv)) + ; /* nothing */ + + if (netif_queue_stopped(dev)) + mod_timer(&priv->poll_timer, jiffies + 1); + + netif_tx_unlock(dev); +} + +void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev_ptr); + + mod_timer(&priv->poll_timer, jiffies); +} + +static inline int post_send(struct ipoib_dev_priv *priv, + unsigned int wr_id, + struct ib_ah *address, u32 qpn, + struct ipoib_tx_buf *tx_req, + void *head, int hlen) +{ + struct ib_send_wr *bad_wr; + int i, off; + struct sk_buff *skb = tx_req->skb; + skb_frag_t *frags = skb_shinfo(skb)->frags; + int nr_frags = skb_shinfo(skb)->nr_frags; + u64 *mapping = tx_req->mapping; + + if (skb_headlen(skb)) { + priv->tx_sge[0].addr = mapping[0]; + priv->tx_sge[0].length = skb_headlen(skb); + off = 1; + } else + off = 0; + + for (i = 0; i < nr_frags; ++i) { + priv->tx_sge[i + off].addr = mapping[i + off]; + priv->tx_sge[i + off].length = skb_frag_size(&frags[i]); + } + priv->tx_wr.num_sge = nr_frags + off; + priv->tx_wr.wr_id = wr_id; + priv->tx_wr.wr.ud.remote_qpn = qpn; + priv->tx_wr.wr.ud.ah = address; + + if (head) { + priv->tx_wr.wr.ud.mss = skb_shinfo(skb)->gso_size; + priv->tx_wr.wr.ud.header = head; + priv->tx_wr.wr.ud.hlen = hlen; + priv->tx_wr.opcode = IB_WR_LSO; + } else + priv->tx_wr.opcode = IB_WR_SEND; + + return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr); +} + +void ipoib_send(struct net_device *dev, struct sk_buff *skb, + struct ipoib_ah *address, u32 qpn) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_tx_buf *tx_req; + int hlen, rc; + void *phead; + + if (skb_is_gso(skb)) { + hlen = skb_transport_offset(skb) + tcp_hdrlen(skb); + phead = skb->data; + if (unlikely(!skb_pull(skb, hlen))) { + ipoib_warn(priv, "linear data too small\n"); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + } else { + if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) { + ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", + skb->len, priv->mcast_mtu + IPOIB_ENCAP_LEN); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu); + return; + } + phead = NULL; + hlen = 0; + } + + ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n", + skb->len, address, qpn); + + /* + * We put the skb into the tx_ring _before_ we call post_send() + * because it's entirely possible that the completion handler will + * run before we execute anything after the post_send(). That + * means we have to make sure everything is properly recorded and + * our state is consistent before we call post_send(). + */ + tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)]; + tx_req->skb = skb; + if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) { + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + + if (skb->ip_summed == CHECKSUM_PARTIAL) + priv->tx_wr.send_flags |= IB_SEND_IP_CSUM; + else + priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; + + if (++priv->tx_outstanding == ipoib_sendq_size) { + ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); + if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP)) + ipoib_warn(priv, "request notify on send CQ failed\n"); + netif_stop_queue(dev); + } + + rc = post_send(priv, priv->tx_head & (ipoib_sendq_size - 1), + address->ah, qpn, tx_req, phead, hlen); + if (unlikely(rc)) { + ipoib_warn(priv, "post_send failed, error %d\n", rc); + ++dev->stats.tx_errors; + --priv->tx_outstanding; + ipoib_dma_unmap_tx(priv->ca, tx_req); + dev_kfree_skb_any(skb); + if (netif_queue_stopped(dev)) + netif_wake_queue(dev); + } else { + dev->trans_start = jiffies; + + address->last_send = priv->tx_head; + ++priv->tx_head; + skb_orphan(skb); + + } + + if (unlikely(priv->tx_outstanding > MAX_SEND_CQE)) + while (poll_tx(priv)) + ; /* nothing */ +} + +static void __ipoib_reap_ah(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_ah *ah, *tah; + LIST_HEAD(remove_list); + unsigned long flags; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list) + if ((int) priv->tx_tail - (int) ah->last_send >= 0) { + list_del(&ah->list); + ib_destroy_ah(ah->ah); + kfree(ah); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +void ipoib_reap_ah(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, ah_reap_task.work); + struct net_device *dev = priv->dev; + + __ipoib_reap_ah(dev); + + if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) + queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, + round_jiffies_relative(HZ)); +} + +static void ipoib_ib_tx_timer_func(unsigned long ctx) +{ + drain_tx_cq((struct net_device *)ctx); +} + +int ipoib_ib_dev_open(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret; + + if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &priv->pkey_index)) { + ipoib_warn(priv, "P_Key 0x%04x not found\n", priv->pkey); + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + return -1; + } + set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + + ret = ipoib_init_qp(dev); + if (ret) { + ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret); + return -1; + } + + ret = ipoib_ib_post_receives(dev); + if (ret) { + ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); + ipoib_ib_dev_stop(dev, 1); + return -1; + } + + ret = ipoib_cm_dev_open(dev); + if (ret) { + ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret); + ipoib_ib_dev_stop(dev, 1); + return -1; + } + + clear_bit(IPOIB_STOP_REAPER, &priv->flags); + queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, + round_jiffies_relative(HZ)); + + if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + napi_enable(&priv->napi); + + return 0; +} + +static void ipoib_pkey_dev_check_presence(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + u16 pkey_index = 0; + + if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + else + set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); +} + +int ipoib_ib_dev_up(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_pkey_dev_check_presence(dev); + + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { + ipoib_dbg(priv, "PKEY is not assigned.\n"); + return 0; + } + + set_bit(IPOIB_FLAG_OPER_UP, &priv->flags); + + return ipoib_mcast_start_thread(dev); +} + +int ipoib_ib_dev_down(struct net_device *dev, int flush) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg(priv, "downing ib_dev\n"); + + clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags); + netif_carrier_off(dev); + + /* Shutdown the P_Key thread if still active */ + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { + mutex_lock(&pkey_mutex); + set_bit(IPOIB_PKEY_STOP, &priv->flags); + cancel_delayed_work(&priv->pkey_poll_task); + mutex_unlock(&pkey_mutex); + if (flush) + flush_workqueue(ipoib_workqueue); + } + + ipoib_mcast_stop_thread(dev, flush); + ipoib_mcast_dev_flush(dev); + + ipoib_flush_paths(dev); + + return 0; +} + +static int recvs_pending(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int pending = 0; + int i; + + for (i = 0; i < ipoib_recvq_size; ++i) + if (priv->rx_ring[i].skb) + ++pending; + + return pending; +} + +void ipoib_drain_cq(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i, n; + + /* + * We call completion handling routines that expect to be + * called from the BH-disabled NAPI poll context, so disable + * BHs here too. + */ + local_bh_disable(); + + do { + n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc); + for (i = 0; i < n; ++i) { + /* + * Convert any successful completions to flush + * errors to avoid passing packets up the + * stack after bringing the device down. + */ + if (priv->ibwc[i].status == IB_WC_SUCCESS) + priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR; + + if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) { + if (priv->ibwc[i].wr_id & IPOIB_OP_CM) + ipoib_cm_handle_rx_wc(dev, priv->ibwc + i); + else + ipoib_ib_handle_rx_wc(dev, priv->ibwc + i); + } else + ipoib_cm_handle_tx_wc(dev, priv->ibwc + i); + } + } while (n == IPOIB_NUM_WC); + + while (poll_tx(priv)) + ; /* nothing */ + + local_bh_enable(); +} + +int ipoib_ib_dev_stop(struct net_device *dev, int flush) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_attr qp_attr; + unsigned long begin; + struct ipoib_tx_buf *tx_req; + int i; + + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + napi_disable(&priv->napi); + + ipoib_cm_dev_stop(dev); + + /* + * Move our QP to the error state and then reinitialize in + * when all work requests have completed or have been flushed. + */ + qp_attr.qp_state = IB_QPS_ERR; + if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) + ipoib_warn(priv, "Failed to modify QP to ERROR state\n"); + + /* Wait for all sends and receives to complete */ + begin = jiffies; + + while (priv->tx_head != priv->tx_tail || recvs_pending(dev)) { + if (time_after(jiffies, begin + 5 * HZ)) { + ipoib_warn(priv, "timing out; %d sends %d receives not completed\n", + priv->tx_head - priv->tx_tail, recvs_pending(dev)); + + /* + * assume the HW is wedged and just free up + * all our pending work requests. + */ + while ((int) priv->tx_tail - (int) priv->tx_head < 0) { + tx_req = &priv->tx_ring[priv->tx_tail & + (ipoib_sendq_size - 1)]; + ipoib_dma_unmap_tx(priv->ca, tx_req); + dev_kfree_skb_any(tx_req->skb); + ++priv->tx_tail; + --priv->tx_outstanding; + } + + for (i = 0; i < ipoib_recvq_size; ++i) { + struct ipoib_rx_buf *rx_req; + + rx_req = &priv->rx_ring[i]; + if (!rx_req->skb) + continue; + ipoib_ud_dma_unmap_rx(priv, + priv->rx_ring[i].mapping); + dev_kfree_skb_any(rx_req->skb); + rx_req->skb = NULL; + } + + goto timeout; + } + + ipoib_drain_cq(dev); + + msleep(1); + } + + ipoib_dbg(priv, "All sends and receives done.\n"); + +timeout: + del_timer_sync(&priv->poll_timer); + qp_attr.qp_state = IB_QPS_RESET; + if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) + ipoib_warn(priv, "Failed to modify QP to RESET state\n"); + + /* Wait for all AHs to be reaped */ + set_bit(IPOIB_STOP_REAPER, &priv->flags); + cancel_delayed_work(&priv->ah_reap_task); + if (flush) + flush_workqueue(ipoib_workqueue); + + begin = jiffies; + + while (!list_empty(&priv->dead_ahs)) { + __ipoib_reap_ah(dev); + + if (time_after(jiffies, begin + HZ)) { + ipoib_warn(priv, "timing out; will leak address handles\n"); + break; + } + + msleep(1); + } + + ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP); + + return 0; +} + +int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + priv->ca = ca; + priv->port = port; + priv->qp = NULL; + + if (ipoib_transport_dev_init(dev, ca)) { + printk(KERN_WARNING "%s: ipoib_transport_dev_init failed\n", ca->name); + return -ENODEV; + } + + setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func, + (unsigned long) dev); + + if (dev->flags & IFF_UP) { + if (ipoib_ib_dev_open(dev)) { + ipoib_transport_dev_cleanup(dev); + return -ENODEV; + } + } + + return 0; +} + +static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, + enum ipoib_flush_level level) +{ + struct ipoib_dev_priv *cpriv; + struct net_device *dev = priv->dev; + u16 new_index; + + mutex_lock(&priv->vlan_mutex); + + /* + * Flush any child interfaces too -- they might be up even if + * the parent is down. + */ + list_for_each_entry(cpriv, &priv->child_intfs, list) + __ipoib_ib_dev_flush(cpriv, level); + + mutex_unlock(&priv->vlan_mutex); + + if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) { + ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n"); + return; + } + + if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { + ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n"); + return; + } + + if (level == IPOIB_FLUSH_HEAVY) { + if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) { + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + ipoib_ib_dev_down(dev, 0); + ipoib_ib_dev_stop(dev, 0); + if (ipoib_pkey_dev_delay_open(dev)) + return; + } + + /* restart QP only if P_Key index is changed */ + if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) && + new_index == priv->pkey_index) { + ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n"); + return; + } + priv->pkey_index = new_index; + } + + if (level == IPOIB_FLUSH_LIGHT) { + ipoib_mark_paths_invalid(dev); + ipoib_mcast_dev_flush(dev); + } + + if (level >= IPOIB_FLUSH_NORMAL) + ipoib_ib_dev_down(dev, 0); + + if (level == IPOIB_FLUSH_HEAVY) { + ipoib_ib_dev_stop(dev, 0); + ipoib_ib_dev_open(dev); + } + + /* + * The device could have been brought down between the start and when + * we get here, don't bring it back up if it's not configured up + */ + if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { + if (level >= IPOIB_FLUSH_NORMAL) + ipoib_ib_dev_up(dev); + ipoib_mcast_restart_task(&priv->restart_task); + } +} + +void ipoib_ib_dev_flush_light(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, flush_light); + + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT); +} + +void ipoib_ib_dev_flush_normal(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, flush_normal); + + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL); +} + +void ipoib_ib_dev_flush_heavy(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, flush_heavy); + + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY); +} + +void ipoib_ib_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg(priv, "cleaning up ib_dev\n"); + + ipoib_mcast_stop_thread(dev, 1); + ipoib_mcast_dev_flush(dev); + + ipoib_transport_dev_cleanup(dev); +} + +/* + * Delayed P_Key Assigment Interim Support + * + * The following is initial implementation of delayed P_Key assigment + * mechanism. It is using the same approach implemented for the multicast + * group join. The single goal of this implementation is to quickly address + * Bug #2507. This implementation will probably be removed when the P_Key + * change async notification is available. + */ + +void ipoib_pkey_poll(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, pkey_poll_task.work); + struct net_device *dev = priv->dev; + + ipoib_pkey_dev_check_presence(dev); + + if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) + ipoib_open(dev); + else { + mutex_lock(&pkey_mutex); + if (!test_bit(IPOIB_PKEY_STOP, &priv->flags)) + queue_delayed_work(ipoib_workqueue, + &priv->pkey_poll_task, + HZ); + mutex_unlock(&pkey_mutex); + } +} + +int ipoib_pkey_dev_delay_open(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + /* Look for the interface pkey value in the IB Port P_Key table and */ + /* set the interface pkey assigment flag */ + ipoib_pkey_dev_check_presence(dev); + + /* P_Key value not assigned yet - start polling */ + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { + mutex_lock(&pkey_mutex); + clear_bit(IPOIB_PKEY_STOP, &priv->flags); + queue_delayed_work(ipoib_workqueue, + &priv->pkey_poll_task, + HZ); + mutex_unlock(&pkey_mutex); + return 1; + } + + return 0; +} diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c new file mode 100644 index 00000000..3974c290 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -0,0 +1,1424 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ipoib.h" + +#include <linux/module.h> + +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/vmalloc.h> + +#include <linux/if_arp.h> /* For ARPHRD_xxx */ + +#include <linux/ip.h> +#include <linux/in.h> + +#include <net/dst.h> + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); +MODULE_LICENSE("Dual BSD/GPL"); + +int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE; +int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE; + +module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); +MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); +module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); +MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +int ipoib_debug_level; + +module_param_named(debug_level, ipoib_debug_level, int, 0644); +MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); +#endif + +struct ipoib_path_iter { + struct net_device *dev; + struct ipoib_path path; +}; + +static const u8 ipv4_bcast_addr[] = { + 0x00, 0xff, 0xff, 0xff, + 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff +}; + +struct workqueue_struct *ipoib_workqueue; + +struct ib_sa_client ipoib_sa_client; + +static void ipoib_add_one(struct ib_device *device); +static void ipoib_remove_one(struct ib_device *device); + +static struct ib_client ipoib_client = { + .name = "ipoib", + .add = ipoib_add_one, + .remove = ipoib_remove_one +}; + +int ipoib_open(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg(priv, "bringing up interface\n"); + + set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); + + if (ipoib_pkey_dev_delay_open(dev)) + return 0; + + if (ipoib_ib_dev_open(dev)) + goto err_disable; + + if (ipoib_ib_dev_up(dev)) + goto err_stop; + + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + struct ipoib_dev_priv *cpriv; + + /* Bring up any child interfaces too */ + mutex_lock(&priv->vlan_mutex); + list_for_each_entry(cpriv, &priv->child_intfs, list) { + int flags; + + flags = cpriv->dev->flags; + if (flags & IFF_UP) + continue; + + dev_change_flags(cpriv->dev, flags | IFF_UP); + } + mutex_unlock(&priv->vlan_mutex); + } + + netif_start_queue(dev); + + return 0; + +err_stop: + ipoib_ib_dev_stop(dev, 1); + +err_disable: + clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); + + return -EINVAL; +} + +static int ipoib_stop(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg(priv, "stopping interface\n"); + + clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); + + netif_stop_queue(dev); + + ipoib_ib_dev_down(dev, 0); + ipoib_ib_dev_stop(dev, 0); + + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + struct ipoib_dev_priv *cpriv; + + /* Bring down any child interfaces too */ + mutex_lock(&priv->vlan_mutex); + list_for_each_entry(cpriv, &priv->child_intfs, list) { + int flags; + + flags = cpriv->dev->flags; + if (!(flags & IFF_UP)) + continue; + + dev_change_flags(cpriv->dev, flags & ~IFF_UP); + } + mutex_unlock(&priv->vlan_mutex); + } + + return 0; +} + +static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) + features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO); + + return features; +} + +static int ipoib_change_mtu(struct net_device *dev, int new_mtu) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + /* dev->mtu > 2K ==> connected mode */ + if (ipoib_cm_admin_enabled(dev)) { + if (new_mtu > ipoib_cm_max_mtu(dev)) + return -EINVAL; + + if (new_mtu > priv->mcast_mtu) + ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", + priv->mcast_mtu); + + dev->mtu = new_mtu; + return 0; + } + + if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) + return -EINVAL; + + priv->admin_mtu = new_mtu; + + dev->mtu = min(priv->mcast_mtu, priv->admin_mtu); + + return 0; +} + +static struct ipoib_path *__path_find(struct net_device *dev, void *gid) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct rb_node *n = priv->path_tree.rb_node; + struct ipoib_path *path; + int ret; + + while (n) { + path = rb_entry(n, struct ipoib_path, rb_node); + + ret = memcmp(gid, path->pathrec.dgid.raw, + sizeof (union ib_gid)); + + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else + return path; + } + + return NULL; +} + +static int __path_add(struct net_device *dev, struct ipoib_path *path) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct rb_node **n = &priv->path_tree.rb_node; + struct rb_node *pn = NULL; + struct ipoib_path *tpath; + int ret; + + while (*n) { + pn = *n; + tpath = rb_entry(pn, struct ipoib_path, rb_node); + + ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw, + sizeof (union ib_gid)); + if (ret < 0) + n = &pn->rb_left; + else if (ret > 0) + n = &pn->rb_right; + else + return -EEXIST; + } + + rb_link_node(&path->rb_node, pn, n); + rb_insert_color(&path->rb_node, &priv->path_tree); + + list_add_tail(&path->list, &priv->path_list); + + return 0; +} + +static void path_free(struct net_device *dev, struct ipoib_path *path) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh *neigh, *tn; + struct sk_buff *skb; + unsigned long flags; + + while ((skb = __skb_dequeue(&path->queue))) + dev_kfree_skb_irq(skb); + + spin_lock_irqsave(&priv->lock, flags); + + list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { + /* + * It's safe to call ipoib_put_ah() inside priv->lock + * here, because we know that path->ah will always + * hold one more reference, so ipoib_put_ah() will + * never do more than decrement the ref count. + */ + if (neigh->ah) + ipoib_put_ah(neigh->ah); + + ipoib_neigh_free(dev, neigh); + } + + spin_unlock_irqrestore(&priv->lock, flags); + + if (path->ah) + ipoib_put_ah(path->ah); + + kfree(path); +} + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + +struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev) +{ + struct ipoib_path_iter *iter; + + iter = kmalloc(sizeof *iter, GFP_KERNEL); + if (!iter) + return NULL; + + iter->dev = dev; + memset(iter->path.pathrec.dgid.raw, 0, 16); + + if (ipoib_path_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +int ipoib_path_iter_next(struct ipoib_path_iter *iter) +{ + struct ipoib_dev_priv *priv = netdev_priv(iter->dev); + struct rb_node *n; + struct ipoib_path *path; + int ret = 1; + + spin_lock_irq(&priv->lock); + + n = rb_first(&priv->path_tree); + + while (n) { + path = rb_entry(n, struct ipoib_path, rb_node); + + if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw, + sizeof (union ib_gid)) < 0) { + iter->path = *path; + ret = 0; + break; + } + + n = rb_next(n); + } + + spin_unlock_irq(&priv->lock); + + return ret; +} + +void ipoib_path_iter_read(struct ipoib_path_iter *iter, + struct ipoib_path *path) +{ + *path = iter->path; +} + +#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ + +void ipoib_mark_paths_invalid(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_path *path, *tp; + + spin_lock_irq(&priv->lock); + + list_for_each_entry_safe(path, tp, &priv->path_list, list) { + ipoib_dbg(priv, "mark path LID 0x%04x GID %pI6 invalid\n", + be16_to_cpu(path->pathrec.dlid), + path->pathrec.dgid.raw); + path->valid = 0; + } + + spin_unlock_irq(&priv->lock); +} + +void ipoib_flush_paths(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_path *path, *tp; + LIST_HEAD(remove_list); + unsigned long flags; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + list_splice_init(&priv->path_list, &remove_list); + + list_for_each_entry(path, &remove_list, list) + rb_erase(&path->rb_node, &priv->path_tree); + + list_for_each_entry_safe(path, tp, &remove_list, list) { + if (path->query) + ib_sa_cancel_query(path->query_id, path->query); + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + wait_for_completion(&path->done); + path_free(dev, path); + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +static void path_rec_completion(int status, + struct ib_sa_path_rec *pathrec, + void *path_ptr) +{ + struct ipoib_path *path = path_ptr; + struct net_device *dev = path->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_ah *ah = NULL; + struct ipoib_ah *old_ah = NULL; + struct ipoib_neigh *neigh, *tn; + struct sk_buff_head skqueue; + struct sk_buff *skb; + unsigned long flags; + + if (!status) + ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n", + be16_to_cpu(pathrec->dlid), pathrec->dgid.raw); + else + ipoib_dbg(priv, "PathRec status %d for GID %pI6\n", + status, path->pathrec.dgid.raw); + + skb_queue_head_init(&skqueue); + + if (!status) { + struct ib_ah_attr av; + + if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av)) + ah = ipoib_create_ah(dev, priv->pd, &av); + } + + spin_lock_irqsave(&priv->lock, flags); + + if (!IS_ERR_OR_NULL(ah)) { + path->pathrec = *pathrec; + + old_ah = path->ah; + path->ah = ah; + + ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", + ah, be16_to_cpu(pathrec->dlid), pathrec->sl); + + while ((skb = __skb_dequeue(&path->queue))) + __skb_queue_tail(&skqueue, skb); + + list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { + if (neigh->ah) { + WARN_ON(neigh->ah != old_ah); + /* + * Dropping the ah reference inside + * priv->lock is safe here, because we + * will hold one more reference from + * the original value of path->ah (ie + * old_ah). + */ + ipoib_put_ah(neigh->ah); + } + kref_get(&path->ah->ref); + neigh->ah = path->ah; + memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, + sizeof(union ib_gid)); + + if (ipoib_cm_enabled(dev, neigh->neighbour)) { + if (!ipoib_cm_get(neigh)) + ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, + path, + neigh)); + if (!ipoib_cm_get(neigh)) { + list_del(&neigh->list); + if (neigh->ah) + ipoib_put_ah(neigh->ah); + ipoib_neigh_free(dev, neigh); + continue; + } + } + + while ((skb = __skb_dequeue(&neigh->queue))) + __skb_queue_tail(&skqueue, skb); + } + path->valid = 1; + } + + path->query = NULL; + complete(&path->done); + + spin_unlock_irqrestore(&priv->lock, flags); + + if (old_ah) + ipoib_put_ah(old_ah); + + while ((skb = __skb_dequeue(&skqueue))) { + skb->dev = dev; + if (dev_queue_xmit(skb)) + ipoib_warn(priv, "dev_queue_xmit failed " + "to requeue packet\n"); + } +} + +static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_path *path; + + if (!priv->broadcast) + return NULL; + + path = kzalloc(sizeof *path, GFP_ATOMIC); + if (!path) + return NULL; + + path->dev = dev; + + skb_queue_head_init(&path->queue); + + INIT_LIST_HEAD(&path->neigh_list); + + memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid)); + path->pathrec.sgid = priv->local_gid; + path->pathrec.pkey = cpu_to_be16(priv->pkey); + path->pathrec.numb_path = 1; + path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class; + + return path; +} + +static int path_rec_start(struct net_device *dev, + struct ipoib_path *path) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg(priv, "Start path record lookup for %pI6\n", + path->pathrec.dgid.raw); + + init_completion(&path->done); + + path->query_id = + ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port, + &path->pathrec, + IB_SA_PATH_REC_DGID | + IB_SA_PATH_REC_SGID | + IB_SA_PATH_REC_NUMB_PATH | + IB_SA_PATH_REC_TRAFFIC_CLASS | + IB_SA_PATH_REC_PKEY, + 1000, GFP_ATOMIC, + path_rec_completion, + path, &path->query); + if (path->query_id < 0) { + ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id); + path->query = NULL; + complete(&path->done); + return path->query_id; + } + + return 0; +} + +/* called with rcu_read_lock */ +static void neigh_add_path(struct sk_buff *skb, struct neighbour *n, struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_path *path; + struct ipoib_neigh *neigh; + unsigned long flags; + + neigh = ipoib_neigh_alloc(n, skb->dev); + if (!neigh) { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + return; + } + + spin_lock_irqsave(&priv->lock, flags); + + path = __path_find(dev, n->ha + 4); + if (!path) { + path = path_rec_create(dev, n->ha + 4); + if (!path) + goto err_path; + + __path_add(dev, path); + } + + list_add_tail(&neigh->list, &path->neigh_list); + + if (path->ah) { + kref_get(&path->ah->ref); + neigh->ah = path->ah; + memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, + sizeof(union ib_gid)); + + if (ipoib_cm_enabled(dev, neigh->neighbour)) { + if (!ipoib_cm_get(neigh)) + ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); + if (!ipoib_cm_get(neigh)) { + list_del(&neigh->list); + if (neigh->ah) + ipoib_put_ah(neigh->ah); + ipoib_neigh_free(dev, neigh); + goto err_drop; + } + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) + __skb_queue_tail(&neigh->queue, skb); + else { + ipoib_warn(priv, "queue length limit %d. Packet drop.\n", + skb_queue_len(&neigh->queue)); + goto err_drop; + } + } else { + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_send(dev, skb, path->ah, IPOIB_QPN(n->ha)); + return; + } + } else { + neigh->ah = NULL; + + if (!path->query && path_rec_start(dev, path)) + goto err_list; + + __skb_queue_tail(&neigh->queue, skb); + } + + spin_unlock_irqrestore(&priv->lock, flags); + return; + +err_list: + list_del(&neigh->list); + +err_path: + ipoib_neigh_free(dev, neigh); +err_drop: + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + + spin_unlock_irqrestore(&priv->lock, flags); +} + +/* called with rcu_read_lock */ +static void ipoib_path_lookup(struct sk_buff *skb, struct neighbour *n, struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(skb->dev); + + /* Look up path record for unicasts */ + if (n->ha[4] != 0xff) { + neigh_add_path(skb, n, dev); + return; + } + + /* Add in the P_Key for multicasts */ + n->ha[8] = (priv->pkey >> 8) & 0xff; + n->ha[9] = priv->pkey & 0xff; + ipoib_mcast_send(dev, n->ha + 4, skb); +} + +static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, + struct ipoib_cb *cb) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_path *path; + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + + path = __path_find(dev, cb->hwaddr + 4); + if (!path || !path->valid) { + int new_path = 0; + + if (!path) { + path = path_rec_create(dev, cb->hwaddr + 4); + new_path = 1; + } + if (path) { + __skb_queue_tail(&path->queue, skb); + + if (!path->query && path_rec_start(dev, path)) { + spin_unlock_irqrestore(&priv->lock, flags); + if (new_path) + path_free(dev, path); + return; + } else + __path_add(dev, path); + } else { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + + spin_unlock_irqrestore(&priv->lock, flags); + return; + } + + if (path->ah) { + ipoib_dbg(priv, "Send unicast ARP to %04x\n", + be16_to_cpu(path->pathrec.dlid)); + + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr)); + return; + } else if ((path->query || !path_rec_start(dev, path)) && + skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + __skb_queue_tail(&path->queue, skb); + } else { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + + spin_unlock_irqrestore(&priv->lock, flags); +} + +static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh *neigh; + struct neighbour *n = NULL; + unsigned long flags; + + rcu_read_lock(); + if (likely(skb_dst(skb))) { + n = dst_get_neighbour_noref(skb_dst(skb)); + if (!n) { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + goto unlock; + } + } + if (likely(n)) { + if (unlikely(!*to_ipoib_neigh(n))) { + ipoib_path_lookup(skb, n, dev); + goto unlock; + } + + neigh = *to_ipoib_neigh(n); + + if (unlikely((memcmp(&neigh->dgid.raw, + n->ha + 4, + sizeof(union ib_gid))) || + (neigh->dev != dev))) { + spin_lock_irqsave(&priv->lock, flags); + /* + * It's safe to call ipoib_put_ah() inside + * priv->lock here, because we know that + * path->ah will always hold one more reference, + * so ipoib_put_ah() will never do more than + * decrement the ref count. + */ + if (neigh->ah) + ipoib_put_ah(neigh->ah); + list_del(&neigh->list); + ipoib_neigh_free(dev, neigh); + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_path_lookup(skb, n, dev); + goto unlock; + } + + if (ipoib_cm_get(neigh)) { + if (ipoib_cm_up(neigh)) { + ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); + goto unlock; + } + } else if (neigh->ah) { + ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(n->ha)); + goto unlock; + } + + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + spin_lock_irqsave(&priv->lock, flags); + __skb_queue_tail(&neigh->queue, skb); + spin_unlock_irqrestore(&priv->lock, flags); + } else { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + } else { + struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb; + + if (cb->hwaddr[4] == 0xff) { + /* Add in the P_Key for multicast*/ + cb->hwaddr[8] = (priv->pkey >> 8) & 0xff; + cb->hwaddr[9] = priv->pkey & 0xff; + + ipoib_mcast_send(dev, cb->hwaddr + 4, skb); + } else { + /* unicast GID -- should be ARP or RARP reply */ + + if ((be16_to_cpup((__be16 *) skb->data) != ETH_P_ARP) && + (be16_to_cpup((__be16 *) skb->data) != ETH_P_RARP)) { + ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x %pI6\n", + skb_dst(skb) ? "neigh" : "dst", + be16_to_cpup((__be16 *) skb->data), + IPOIB_QPN(cb->hwaddr), + cb->hwaddr + 4); + dev_kfree_skb_any(skb); + ++dev->stats.tx_dropped; + goto unlock; + } + + unicast_arp_send(skb, dev, cb); + } + } +unlock: + rcu_read_unlock(); + return NETDEV_TX_OK; +} + +static void ipoib_timeout(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_warn(priv, "transmit timeout: latency %d msecs\n", + jiffies_to_msecs(jiffies - dev->trans_start)); + ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n", + netif_queue_stopped(dev), + priv->tx_head, priv->tx_tail); + /* XXX reset QP, etc. */ +} + +static int ipoib_hard_header(struct sk_buff *skb, + struct net_device *dev, + unsigned short type, + const void *daddr, const void *saddr, unsigned len) +{ + struct ipoib_header *header; + + header = (struct ipoib_header *) skb_push(skb, sizeof *header); + + header->proto = htons(type); + header->reserved = 0; + + /* + * If we don't have a dst_entry structure, stuff the + * destination address into skb->cb so we can figure out where + * to send the packet later. + */ + if (!skb_dst(skb)) { + struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb; + memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN); + } + + return 0; +} + +static void ipoib_set_mcast_list(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { + ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set"); + return; + } + + queue_work(ipoib_workqueue, &priv->restart_task); +} + +static void ipoib_neigh_cleanup(struct neighbour *n) +{ + struct ipoib_neigh *neigh; + struct ipoib_dev_priv *priv = netdev_priv(n->dev); + unsigned long flags; + struct ipoib_ah *ah = NULL; + + neigh = *to_ipoib_neigh(n); + if (neigh) + priv = netdev_priv(neigh->dev); + else + return; + ipoib_dbg(priv, + "neigh_cleanup for %06x %pI6\n", + IPOIB_QPN(n->ha), + n->ha + 4); + + spin_lock_irqsave(&priv->lock, flags); + + if (neigh->ah) + ah = neigh->ah; + list_del(&neigh->list); + ipoib_neigh_free(n->dev, neigh); + + spin_unlock_irqrestore(&priv->lock, flags); + + if (ah) + ipoib_put_ah(ah); +} + +struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour, + struct net_device *dev) +{ + struct ipoib_neigh *neigh; + + neigh = kmalloc(sizeof *neigh, GFP_ATOMIC); + if (!neigh) + return NULL; + + neigh->neighbour = neighbour; + neigh->dev = dev; + memset(&neigh->dgid.raw, 0, sizeof (union ib_gid)); + *to_ipoib_neigh(neighbour) = neigh; + skb_queue_head_init(&neigh->queue); + ipoib_cm_set(neigh, NULL); + + return neigh; +} + +void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh) +{ + struct sk_buff *skb; + *to_ipoib_neigh(neigh->neighbour) = NULL; + while ((skb = __skb_dequeue(&neigh->queue))) { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + if (ipoib_cm_get(neigh)) + ipoib_cm_destroy_tx(ipoib_cm_get(neigh)); + kfree(neigh); +} + +static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms) +{ + parms->neigh_cleanup = ipoib_neigh_cleanup; + + return 0; +} + +int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + /* Allocate RX/TX "rings" to hold queued skbs */ + priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, + GFP_KERNEL); + if (!priv->rx_ring) { + printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", + ca->name, ipoib_recvq_size); + goto out; + } + + priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring); + if (!priv->tx_ring) { + printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n", + ca->name, ipoib_sendq_size); + goto out_rx_ring_cleanup; + } + + /* priv->tx_head, tx_tail & tx_outstanding are already 0 */ + + if (ipoib_ib_dev_init(dev, ca, port)) + goto out_tx_ring_cleanup; + + return 0; + +out_tx_ring_cleanup: + vfree(priv->tx_ring); + +out_rx_ring_cleanup: + kfree(priv->rx_ring); + +out: + return -ENOMEM; +} + +void ipoib_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv; + + ipoib_delete_debug_files(dev); + + /* Delete any child interfaces first */ + list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { + unregister_netdev(cpriv->dev); + ipoib_dev_cleanup(cpriv->dev); + free_netdev(cpriv->dev); + } + + ipoib_ib_dev_cleanup(dev); + + kfree(priv->rx_ring); + vfree(priv->tx_ring); + + priv->rx_ring = NULL; + priv->tx_ring = NULL; +} + +static const struct header_ops ipoib_header_ops = { + .create = ipoib_hard_header, +}; + +static const struct net_device_ops ipoib_netdev_ops = { + .ndo_open = ipoib_open, + .ndo_stop = ipoib_stop, + .ndo_change_mtu = ipoib_change_mtu, + .ndo_fix_features = ipoib_fix_features, + .ndo_start_xmit = ipoib_start_xmit, + .ndo_tx_timeout = ipoib_timeout, + .ndo_set_rx_mode = ipoib_set_mcast_list, + .ndo_neigh_setup = ipoib_neigh_setup_dev, +}; + +static void ipoib_setup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + dev->netdev_ops = &ipoib_netdev_ops; + dev->header_ops = &ipoib_header_ops; + + ipoib_set_ethtool_ops(dev); + + netif_napi_add(dev, &priv->napi, ipoib_poll, 100); + + dev->watchdog_timeo = HZ; + + dev->flags |= IFF_BROADCAST | IFF_MULTICAST; + + dev->hard_header_len = IPOIB_ENCAP_LEN; + dev->addr_len = INFINIBAND_ALEN; + dev->type = ARPHRD_INFINIBAND; + dev->tx_queue_len = ipoib_sendq_size * 2; + dev->features = (NETIF_F_VLAN_CHALLENGED | + NETIF_F_HIGHDMA); + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + + memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); + + netif_carrier_off(dev); + + priv->dev = dev; + + spin_lock_init(&priv->lock); + + mutex_init(&priv->vlan_mutex); + + INIT_LIST_HEAD(&priv->path_list); + INIT_LIST_HEAD(&priv->child_intfs); + INIT_LIST_HEAD(&priv->dead_ahs); + INIT_LIST_HEAD(&priv->multicast_list); + + INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll); + INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); + INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); + INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); + INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); + INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); + INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); + INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); +} + +struct ipoib_dev_priv *ipoib_intf_alloc(const char *name) +{ + struct net_device *dev; + + dev = alloc_netdev((int) sizeof (struct ipoib_dev_priv), name, + ipoib_setup); + if (!dev) + return NULL; + + return netdev_priv(dev); +} + +static ssize_t show_pkey(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev)); + + return sprintf(buf, "0x%04x\n", priv->pkey); +} +static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL); + +static ssize_t show_umcast(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev)); + + return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags)); +} + +static ssize_t set_umcast(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev)); + unsigned long umcast_val = simple_strtoul(buf, NULL, 0); + + if (umcast_val > 0) { + set_bit(IPOIB_FLAG_UMCAST, &priv->flags); + ipoib_warn(priv, "ignoring multicast groups joined directly " + "by userspace\n"); + } else + clear_bit(IPOIB_FLAG_UMCAST, &priv->flags); + + return count; +} +static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast); + +int ipoib_add_umcast_attr(struct net_device *dev) +{ + return device_create_file(&dev->dev, &dev_attr_umcast); +} + +static ssize_t create_child(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int pkey; + int ret; + + if (sscanf(buf, "%i", &pkey) != 1) + return -EINVAL; + + if (pkey < 0 || pkey > 0xffff) + return -EINVAL; + + /* + * Set the full membership bit, so that we join the right + * broadcast group, etc. + */ + pkey |= 0x8000; + + ret = ipoib_vlan_add(to_net_dev(dev), pkey); + + return ret ? ret : count; +} +static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child); + +static ssize_t delete_child(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int pkey; + int ret; + + if (sscanf(buf, "%i", &pkey) != 1) + return -EINVAL; + + if (pkey < 0 || pkey > 0xffff) + return -EINVAL; + + ret = ipoib_vlan_delete(to_net_dev(dev), pkey); + + return ret ? ret : count; + +} +static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child); + +int ipoib_add_pkey_attr(struct net_device *dev) +{ + return device_create_file(&dev->dev, &dev_attr_pkey); +} + +int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) +{ + struct ib_device_attr *device_attr; + int result = -ENOMEM; + + device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL); + if (!device_attr) { + printk(KERN_WARNING "%s: allocation of %zu bytes failed\n", + hca->name, sizeof *device_attr); + return result; + } + + result = ib_query_device(hca, device_attr); + if (result) { + printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n", + hca->name, result); + kfree(device_attr); + return result; + } + priv->hca_caps = device_attr->device_cap_flags; + + kfree(device_attr); + + if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { + priv->dev->hw_features = NETIF_F_SG | + NETIF_F_IP_CSUM | NETIF_F_RXCSUM; + + if (priv->hca_caps & IB_DEVICE_UD_TSO) + priv->dev->hw_features |= NETIF_F_TSO; + + priv->dev->features |= priv->dev->hw_features; + } + + return 0; +} + +static struct net_device *ipoib_add_port(const char *format, + struct ib_device *hca, u8 port) +{ + struct ipoib_dev_priv *priv; + struct ib_port_attr attr; + int result = -ENOMEM; + + priv = ipoib_intf_alloc(format); + if (!priv) + goto alloc_mem_failed; + + SET_NETDEV_DEV(priv->dev, hca->dma_device); + priv->dev->dev_id = port - 1; + + if (!ib_query_port(hca, port, &attr)) + priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); + else { + printk(KERN_WARNING "%s: ib_query_port %d failed\n", + hca->name, port); + goto device_init_failed; + } + + /* MTU will be reset when mcast join happens */ + priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); + priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu; + + priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh); + + result = ib_query_pkey(hca, port, 0, &priv->pkey); + if (result) { + printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n", + hca->name, port, result); + goto device_init_failed; + } + + if (ipoib_set_dev_features(priv, hca)) + goto device_init_failed; + + /* + * Set the full membership bit, so that we join the right + * broadcast group, etc. + */ + priv->pkey |= 0x8000; + + priv->dev->broadcast[8] = priv->pkey >> 8; + priv->dev->broadcast[9] = priv->pkey & 0xff; + + result = ib_query_gid(hca, port, 0, &priv->local_gid); + if (result) { + printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n", + hca->name, port, result); + goto device_init_failed; + } else + memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); + + result = ipoib_dev_init(priv->dev, hca, port); + if (result < 0) { + printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n", + hca->name, port, result); + goto device_init_failed; + } + + INIT_IB_EVENT_HANDLER(&priv->event_handler, + priv->ca, ipoib_event); + result = ib_register_event_handler(&priv->event_handler); + if (result < 0) { + printk(KERN_WARNING "%s: ib_register_event_handler failed for " + "port %d (ret = %d)\n", + hca->name, port, result); + goto event_failed; + } + + result = register_netdev(priv->dev); + if (result) { + printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n", + hca->name, port, result); + goto register_failed; + } + + ipoib_create_debug_files(priv->dev); + + if (ipoib_cm_add_mode_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_pkey_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_umcast_attr(priv->dev)) + goto sysfs_failed; + if (device_create_file(&priv->dev->dev, &dev_attr_create_child)) + goto sysfs_failed; + if (device_create_file(&priv->dev->dev, &dev_attr_delete_child)) + goto sysfs_failed; + + return priv->dev; + +sysfs_failed: + ipoib_delete_debug_files(priv->dev); + unregister_netdev(priv->dev); + +register_failed: + ib_unregister_event_handler(&priv->event_handler); + flush_workqueue(ipoib_workqueue); + +event_failed: + ipoib_dev_cleanup(priv->dev); + +device_init_failed: + free_netdev(priv->dev); + +alloc_mem_failed: + return ERR_PTR(result); +} + +static void ipoib_add_one(struct ib_device *device) +{ + struct list_head *dev_list; + struct net_device *dev; + struct ipoib_dev_priv *priv; + int s, e, p; + + if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + return; + + dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL); + if (!dev_list) + return; + + INIT_LIST_HEAD(dev_list); + + if (device->node_type == RDMA_NODE_IB_SWITCH) { + s = 0; + e = 0; + } else { + s = 1; + e = device->phys_port_cnt; + } + + for (p = s; p <= e; ++p) { + if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND) + continue; + dev = ipoib_add_port("ib%d", device, p); + if (!IS_ERR(dev)) { + priv = netdev_priv(dev); + list_add_tail(&priv->list, dev_list); + } + } + + ib_set_client_data(device, &ipoib_client, dev_list); +} + +static void ipoib_remove_one(struct ib_device *device) +{ + struct ipoib_dev_priv *priv, *tmp; + struct list_head *dev_list; + + if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + return; + + dev_list = ib_get_client_data(device, &ipoib_client); + + list_for_each_entry_safe(priv, tmp, dev_list, list) { + ib_unregister_event_handler(&priv->event_handler); + + rtnl_lock(); + dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); + rtnl_unlock(); + + flush_workqueue(ipoib_workqueue); + + unregister_netdev(priv->dev); + ipoib_dev_cleanup(priv->dev); + free_netdev(priv->dev); + } + + kfree(dev_list); +} + +static int __init ipoib_init_module(void) +{ + int ret; + + ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size); + ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE); + ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE); + + ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); + ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); + ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE); +#ifdef CONFIG_INFINIBAND_IPOIB_CM + ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); +#endif + + /* + * When copying small received packets, we only copy from the + * linear data part of the SKB, so we rely on this condition. + */ + BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE); + + ret = ipoib_register_debugfs(); + if (ret) + return ret; + + /* + * We create our own workqueue mainly because we want to be + * able to flush it when devices are being removed. We can't + * use schedule_work()/flush_scheduled_work() because both + * unregister_netdev() and linkwatch_event take the rtnl lock, + * so flush_scheduled_work() can deadlock during device + * removal. + */ + ipoib_workqueue = create_singlethread_workqueue("ipoib"); + if (!ipoib_workqueue) { + ret = -ENOMEM; + goto err_fs; + } + + ib_sa_register_client(&ipoib_sa_client); + + ret = ib_register_client(&ipoib_client); + if (ret) + goto err_sa; + + return 0; + +err_sa: + ib_sa_unregister_client(&ipoib_sa_client); + destroy_workqueue(ipoib_workqueue); + +err_fs: + ipoib_unregister_debugfs(); + + return ret; +} + +static void __exit ipoib_cleanup_module(void) +{ + ib_unregister_client(&ipoib_client); + ib_sa_unregister_client(&ipoib_sa_client); + ipoib_unregister_debugfs(); + destroy_workqueue(ipoib_workqueue); +} + +module_init(ipoib_init_module); +module_exit(ipoib_cleanup_module); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c new file mode 100644 index 00000000..20ebc6fd --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -0,0 +1,965 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/moduleparam.h> +#include <linux/ip.h> +#include <linux/in.h> +#include <linux/igmp.h> +#include <linux/inetdevice.h> +#include <linux/delay.h> +#include <linux/completion.h> +#include <linux/slab.h> + +#include <net/dst.h> + +#include "ipoib.h" + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +static int mcast_debug_level; + +module_param(mcast_debug_level, int, 0644); +MODULE_PARM_DESC(mcast_debug_level, + "Enable multicast debug tracing if > 0"); +#endif + +static DEFINE_MUTEX(mcast_mutex); + +struct ipoib_mcast_iter { + struct net_device *dev; + union ib_gid mgid; + unsigned long created; + unsigned int queuelen; + unsigned int complete; + unsigned int send_only; +}; + +static void ipoib_mcast_free(struct ipoib_mcast *mcast) +{ + struct net_device *dev = mcast->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh *neigh, *tmp; + int tx_dropped = 0; + + ipoib_dbg_mcast(netdev_priv(dev), "deleting multicast group %pI6\n", + mcast->mcmember.mgid.raw); + + spin_lock_irq(&priv->lock); + + list_for_each_entry_safe(neigh, tmp, &mcast->neigh_list, list) { + /* + * It's safe to call ipoib_put_ah() inside priv->lock + * here, because we know that mcast->ah will always + * hold one more reference, so ipoib_put_ah() will + * never do more than decrement the ref count. + */ + if (neigh->ah) + ipoib_put_ah(neigh->ah); + ipoib_neigh_free(dev, neigh); + } + + spin_unlock_irq(&priv->lock); + + if (mcast->ah) + ipoib_put_ah(mcast->ah); + + while (!skb_queue_empty(&mcast->pkt_queue)) { + ++tx_dropped; + dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); + } + + netif_tx_lock_bh(dev); + dev->stats.tx_dropped += tx_dropped; + netif_tx_unlock_bh(dev); + + kfree(mcast); +} + +static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev, + int can_sleep) +{ + struct ipoib_mcast *mcast; + + mcast = kzalloc(sizeof *mcast, can_sleep ? GFP_KERNEL : GFP_ATOMIC); + if (!mcast) + return NULL; + + mcast->dev = dev; + mcast->created = jiffies; + mcast->backoff = 1; + + INIT_LIST_HEAD(&mcast->list); + INIT_LIST_HEAD(&mcast->neigh_list); + skb_queue_head_init(&mcast->pkt_queue); + + return mcast; +} + +static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct rb_node *n = priv->multicast_tree.rb_node; + + while (n) { + struct ipoib_mcast *mcast; + int ret; + + mcast = rb_entry(n, struct ipoib_mcast, rb_node); + + ret = memcmp(mgid, mcast->mcmember.mgid.raw, + sizeof (union ib_gid)); + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else + return mcast; + } + + return NULL; +} + +static int __ipoib_mcast_add(struct net_device *dev, struct ipoib_mcast *mcast) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct rb_node **n = &priv->multicast_tree.rb_node, *pn = NULL; + + while (*n) { + struct ipoib_mcast *tmcast; + int ret; + + pn = *n; + tmcast = rb_entry(pn, struct ipoib_mcast, rb_node); + + ret = memcmp(mcast->mcmember.mgid.raw, tmcast->mcmember.mgid.raw, + sizeof (union ib_gid)); + if (ret < 0) + n = &pn->rb_left; + else if (ret > 0) + n = &pn->rb_right; + else + return -EEXIST; + } + + rb_link_node(&mcast->rb_node, pn, n); + rb_insert_color(&mcast->rb_node, &priv->multicast_tree); + + return 0; +} + +static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, + struct ib_sa_mcmember_rec *mcmember) +{ + struct net_device *dev = mcast->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_ah *ah; + int ret; + int set_qkey = 0; + + mcast->mcmember = *mcmember; + + /* Set the cached Q_Key before we attach if it's the broadcast group */ + if (!memcmp(mcast->mcmember.mgid.raw, priv->dev->broadcast + 4, + sizeof (union ib_gid))) { + spin_lock_irq(&priv->lock); + if (!priv->broadcast) { + spin_unlock_irq(&priv->lock); + return -EAGAIN; + } + priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey); + spin_unlock_irq(&priv->lock); + priv->tx_wr.wr.ud.remote_qkey = priv->qkey; + set_qkey = 1; + } + + if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { + if (test_and_set_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { + ipoib_warn(priv, "multicast group %pI6 already attached\n", + mcast->mcmember.mgid.raw); + + return 0; + } + + ret = ipoib_mcast_attach(dev, be16_to_cpu(mcast->mcmember.mlid), + &mcast->mcmember.mgid, set_qkey); + if (ret < 0) { + ipoib_warn(priv, "couldn't attach QP to multicast group %pI6\n", + mcast->mcmember.mgid.raw); + + clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags); + return ret; + } + } + + { + struct ib_ah_attr av = { + .dlid = be16_to_cpu(mcast->mcmember.mlid), + .port_num = priv->port, + .sl = mcast->mcmember.sl, + .ah_flags = IB_AH_GRH, + .static_rate = mcast->mcmember.rate, + .grh = { + .flow_label = be32_to_cpu(mcast->mcmember.flow_label), + .hop_limit = mcast->mcmember.hop_limit, + .sgid_index = 0, + .traffic_class = mcast->mcmember.traffic_class + } + }; + av.grh.dgid = mcast->mcmember.mgid; + + ah = ipoib_create_ah(dev, priv->pd, &av); + if (IS_ERR(ah)) { + ipoib_warn(priv, "ib_address_create failed %ld\n", + -PTR_ERR(ah)); + /* use original error */ + return PTR_ERR(ah); + } else { + spin_lock_irq(&priv->lock); + mcast->ah = ah; + spin_unlock_irq(&priv->lock); + + ipoib_dbg_mcast(priv, "MGID %pI6 AV %p, LID 0x%04x, SL %d\n", + mcast->mcmember.mgid.raw, + mcast->ah->ah, + be16_to_cpu(mcast->mcmember.mlid), + mcast->mcmember.sl); + } + } + + /* actually send any queued packets */ + netif_tx_lock_bh(dev); + while (!skb_queue_empty(&mcast->pkt_queue)) { + struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue); + + netif_tx_unlock_bh(dev); + + skb->dev = dev; + if (dev_queue_xmit(skb)) + ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n"); + + netif_tx_lock_bh(dev); + } + netif_tx_unlock_bh(dev); + + return 0; +} + +static int +ipoib_mcast_sendonly_join_complete(int status, + struct ib_sa_multicast *multicast) +{ + struct ipoib_mcast *mcast = multicast->context; + struct net_device *dev = mcast->dev; + + /* We trap for port events ourselves. */ + if (status == -ENETRESET) + return 0; + + if (!status) + status = ipoib_mcast_join_finish(mcast, &multicast->rec); + + if (status) { + if (mcast->logcount++ < 20) + ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for %pI6, status %d\n", + mcast->mcmember.mgid.raw, status); + + /* Flush out any queued packets */ + netif_tx_lock_bh(dev); + while (!skb_queue_empty(&mcast->pkt_queue)) { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); + } + netif_tx_unlock_bh(dev); + + /* Clear the busy flag so we try again */ + status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, + &mcast->flags); + } + return status; +} + +static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) +{ + struct net_device *dev = mcast->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_sa_mcmember_rec rec = { +#if 0 /* Some SMs don't support send-only yet */ + .join_state = 4 +#else + .join_state = 1 +#endif + }; + int ret = 0; + + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { + ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n"); + return -ENODEV; + } + + if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) { + ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n"); + return -EBUSY; + } + + rec.mgid = mcast->mcmember.mgid; + rec.port_gid = priv->local_gid; + rec.pkey = cpu_to_be16(priv->pkey); + + mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, + priv->port, &rec, + IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | + IB_SA_MCMEMBER_REC_PKEY | + IB_SA_MCMEMBER_REC_JOIN_STATE, + GFP_ATOMIC, + ipoib_mcast_sendonly_join_complete, + mcast); + if (IS_ERR(mcast->mc)) { + ret = PTR_ERR(mcast->mc); + clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n", + ret); + } else { + ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting join\n", + mcast->mcmember.mgid.raw); + } + + return ret; +} + +void ipoib_mcast_carrier_on_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + carrier_on_task); + struct ib_port_attr attr; + + /* + * Take rtnl_lock to avoid racing with ipoib_stop() and + * turning the carrier back on while a device is being + * removed. + */ + if (ib_query_port(priv->ca, priv->port, &attr) || + attr.state != IB_PORT_ACTIVE) { + ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); + return; + } + + rtnl_lock(); + netif_carrier_on(priv->dev); + rtnl_unlock(); +} + +static int ipoib_mcast_join_complete(int status, + struct ib_sa_multicast *multicast) +{ + struct ipoib_mcast *mcast = multicast->context; + struct net_device *dev = mcast->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n", + mcast->mcmember.mgid.raw, status); + + /* We trap for port events ourselves. */ + if (status == -ENETRESET) + return 0; + + if (!status) + status = ipoib_mcast_join_finish(mcast, &multicast->rec); + + if (!status) { + mcast->backoff = 1; + mutex_lock(&mcast_mutex); + if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) + queue_delayed_work(ipoib_workqueue, + &priv->mcast_task, 0); + mutex_unlock(&mcast_mutex); + + /* + * Defer carrier on work to ipoib_workqueue to avoid a + * deadlock on rtnl_lock here. + */ + if (mcast == priv->broadcast) + queue_work(ipoib_workqueue, &priv->carrier_on_task); + + return 0; + } + + if (mcast->logcount++ < 20) { + if (status == -ETIMEDOUT || status == -EAGAIN) { + ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n", + mcast->mcmember.mgid.raw, status); + } else { + ipoib_warn(priv, "multicast join failed for %pI6, status %d\n", + mcast->mcmember.mgid.raw, status); + } + } + + mcast->backoff *= 2; + if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) + mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; + + /* Clear the busy flag so we try again */ + status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + + mutex_lock(&mcast_mutex); + spin_lock_irq(&priv->lock); + if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) + queue_delayed_work(ipoib_workqueue, &priv->mcast_task, + mcast->backoff * HZ); + spin_unlock_irq(&priv->lock); + mutex_unlock(&mcast_mutex); + + return status; +} + +static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, + int create) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_sa_mcmember_rec rec = { + .join_state = 1 + }; + ib_sa_comp_mask comp_mask; + int ret = 0; + + ipoib_dbg_mcast(priv, "joining MGID %pI6\n", mcast->mcmember.mgid.raw); + + rec.mgid = mcast->mcmember.mgid; + rec.port_gid = priv->local_gid; + rec.pkey = cpu_to_be16(priv->pkey); + + comp_mask = + IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | + IB_SA_MCMEMBER_REC_PKEY | + IB_SA_MCMEMBER_REC_JOIN_STATE; + + if (create) { + comp_mask |= + IB_SA_MCMEMBER_REC_QKEY | + IB_SA_MCMEMBER_REC_MTU_SELECTOR | + IB_SA_MCMEMBER_REC_MTU | + IB_SA_MCMEMBER_REC_TRAFFIC_CLASS | + IB_SA_MCMEMBER_REC_RATE_SELECTOR | + IB_SA_MCMEMBER_REC_RATE | + IB_SA_MCMEMBER_REC_SL | + IB_SA_MCMEMBER_REC_FLOW_LABEL | + IB_SA_MCMEMBER_REC_HOP_LIMIT; + + rec.qkey = priv->broadcast->mcmember.qkey; + rec.mtu_selector = IB_SA_EQ; + rec.mtu = priv->broadcast->mcmember.mtu; + rec.traffic_class = priv->broadcast->mcmember.traffic_class; + rec.rate_selector = IB_SA_EQ; + rec.rate = priv->broadcast->mcmember.rate; + rec.sl = priv->broadcast->mcmember.sl; + rec.flow_label = priv->broadcast->mcmember.flow_label; + rec.hop_limit = priv->broadcast->mcmember.hop_limit; + } + + set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, + &rec, comp_mask, GFP_KERNEL, + ipoib_mcast_join_complete, mcast); + if (IS_ERR(mcast->mc)) { + clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + ret = PTR_ERR(mcast->mc); + ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); + + mcast->backoff *= 2; + if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) + mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; + + mutex_lock(&mcast_mutex); + if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) + queue_delayed_work(ipoib_workqueue, + &priv->mcast_task, + mcast->backoff * HZ); + mutex_unlock(&mcast_mutex); + } +} + +void ipoib_mcast_join_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, mcast_task.work); + struct net_device *dev = priv->dev; + + if (!test_bit(IPOIB_MCAST_RUN, &priv->flags)) + return; + + if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid)) + ipoib_warn(priv, "ib_query_gid() failed\n"); + else + memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); + + { + struct ib_port_attr attr; + + if (!ib_query_port(priv->ca, priv->port, &attr)) + priv->local_lid = attr.lid; + else + ipoib_warn(priv, "ib_query_port failed\n"); + } + + if (!priv->broadcast) { + struct ipoib_mcast *broadcast; + + if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + return; + + broadcast = ipoib_mcast_alloc(dev, 1); + if (!broadcast) { + ipoib_warn(priv, "failed to allocate broadcast group\n"); + mutex_lock(&mcast_mutex); + if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) + queue_delayed_work(ipoib_workqueue, + &priv->mcast_task, HZ); + mutex_unlock(&mcast_mutex); + return; + } + + spin_lock_irq(&priv->lock); + memcpy(broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4, + sizeof (union ib_gid)); + priv->broadcast = broadcast; + + __ipoib_mcast_add(dev, priv->broadcast); + spin_unlock_irq(&priv->lock); + } + + if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { + if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) + ipoib_mcast_join(dev, priv->broadcast, 0); + return; + } + + while (1) { + struct ipoib_mcast *mcast = NULL; + + spin_lock_irq(&priv->lock); + list_for_each_entry(mcast, &priv->multicast_list, list) { + if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) + && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) + && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { + /* Found the next unjoined group */ + break; + } + } + spin_unlock_irq(&priv->lock); + + if (&mcast->list == &priv->multicast_list) { + /* All done */ + break; + } + + ipoib_mcast_join(dev, mcast, 1); + return; + } + + priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); + + if (!ipoib_cm_admin_enabled(dev)) { + rtnl_lock(); + dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu)); + rtnl_unlock(); + } + + ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n"); + + clear_bit(IPOIB_MCAST_RUN, &priv->flags); +} + +int ipoib_mcast_start_thread(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg_mcast(priv, "starting multicast thread\n"); + + mutex_lock(&mcast_mutex); + if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) + queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0); + mutex_unlock(&mcast_mutex); + + return 0; +} + +int ipoib_mcast_stop_thread(struct net_device *dev, int flush) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg_mcast(priv, "stopping multicast thread\n"); + + mutex_lock(&mcast_mutex); + clear_bit(IPOIB_MCAST_RUN, &priv->flags); + cancel_delayed_work(&priv->mcast_task); + mutex_unlock(&mcast_mutex); + + if (flush) + flush_workqueue(ipoib_workqueue); + + return 0; +} + +static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret = 0; + + if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) + ib_sa_free_multicast(mcast->mc); + + if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { + ipoib_dbg_mcast(priv, "leaving MGID %pI6\n", + mcast->mcmember.mgid.raw); + + /* Remove ourselves from the multicast group */ + ret = ib_detach_mcast(priv->qp, &mcast->mcmember.mgid, + be16_to_cpu(mcast->mcmember.mlid)); + if (ret) + ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret); + } + + return 0; +} + +void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_mcast *mcast; + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags) || + !priv->broadcast || + !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + goto unlock; + } + + mcast = __ipoib_mcast_find(dev, mgid); + if (!mcast) { + /* Let's create a new send only group now */ + ipoib_dbg_mcast(priv, "setting up send only multicast group for %pI6\n", + mgid); + + mcast = ipoib_mcast_alloc(dev, 0); + if (!mcast) { + ipoib_warn(priv, "unable to allocate memory for " + "multicast structure\n"); + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + goto out; + } + + set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags); + memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid)); + __ipoib_mcast_add(dev, mcast); + list_add_tail(&mcast->list, &priv->multicast_list); + } + + if (!mcast->ah) { + if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) + skb_queue_tail(&mcast->pkt_queue, skb); + else { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + + if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) + ipoib_dbg_mcast(priv, "no address vector, " + "but multicast join already started\n"); + else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) + ipoib_mcast_sendonly_join(mcast); + + /* + * If lookup completes between here and out:, don't + * want to send packet twice. + */ + mcast = NULL; + } + +out: + if (mcast && mcast->ah) { + struct dst_entry *dst = skb_dst(skb); + struct neighbour *n = NULL; + + rcu_read_lock(); + if (dst) + n = dst_get_neighbour_noref(dst); + if (n && !*to_ipoib_neigh(n)) { + struct ipoib_neigh *neigh = ipoib_neigh_alloc(n, + skb->dev); + + if (neigh) { + kref_get(&mcast->ah->ref); + neigh->ah = mcast->ah; + list_add_tail(&neigh->list, &mcast->neigh_list); + } + } + rcu_read_unlock(); + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN); + return; + } + +unlock: + spin_unlock_irqrestore(&priv->lock, flags); +} + +void ipoib_mcast_dev_flush(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + LIST_HEAD(remove_list); + struct ipoib_mcast *mcast, *tmcast; + unsigned long flags; + + ipoib_dbg_mcast(priv, "flushing multicast list\n"); + + spin_lock_irqsave(&priv->lock, flags); + + list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) { + list_del(&mcast->list); + rb_erase(&mcast->rb_node, &priv->multicast_tree); + list_add_tail(&mcast->list, &remove_list); + } + + if (priv->broadcast) { + rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree); + list_add_tail(&priv->broadcast->list, &remove_list); + priv->broadcast = NULL; + } + + spin_unlock_irqrestore(&priv->lock, flags); + + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { + ipoib_mcast_leave(dev, mcast); + ipoib_mcast_free(mcast); + } +} + +static int ipoib_mcast_addr_is_valid(const u8 *addr, const u8 *broadcast) +{ + /* reserved QPN, prefix, scope */ + if (memcmp(addr, broadcast, 6)) + return 0; + /* signature lower, pkey */ + if (memcmp(addr + 7, broadcast + 7, 3)) + return 0; + return 1; +} + +void ipoib_mcast_restart_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, restart_task); + struct net_device *dev = priv->dev; + struct netdev_hw_addr *ha; + struct ipoib_mcast *mcast, *tmcast; + LIST_HEAD(remove_list); + unsigned long flags; + struct ib_sa_mcmember_rec rec; + + ipoib_dbg_mcast(priv, "restarting multicast task\n"); + + ipoib_mcast_stop_thread(dev, 0); + + local_irq_save(flags); + netif_addr_lock(dev); + spin_lock(&priv->lock); + + /* + * Unfortunately, the networking core only gives us a list of all of + * the multicast hardware addresses. We need to figure out which ones + * are new and which ones have been removed + */ + + /* Clear out the found flag */ + list_for_each_entry(mcast, &priv->multicast_list, list) + clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags); + + /* Mark all of the entries that are found or don't exist */ + netdev_for_each_mc_addr(ha, dev) { + union ib_gid mgid; + + if (!ipoib_mcast_addr_is_valid(ha->addr, dev->broadcast)) + continue; + + memcpy(mgid.raw, ha->addr + 4, sizeof mgid); + + mcast = __ipoib_mcast_find(dev, &mgid); + if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { + struct ipoib_mcast *nmcast; + + /* ignore group which is directly joined by userspace */ + if (test_bit(IPOIB_FLAG_UMCAST, &priv->flags) && + !ib_sa_get_mcmember_rec(priv->ca, priv->port, &mgid, &rec)) { + ipoib_dbg_mcast(priv, "ignoring multicast entry for mgid %pI6\n", + mgid.raw); + continue; + } + + /* Not found or send-only group, let's add a new entry */ + ipoib_dbg_mcast(priv, "adding multicast entry for mgid %pI6\n", + mgid.raw); + + nmcast = ipoib_mcast_alloc(dev, 0); + if (!nmcast) { + ipoib_warn(priv, "unable to allocate memory for multicast structure\n"); + continue; + } + + set_bit(IPOIB_MCAST_FLAG_FOUND, &nmcast->flags); + + nmcast->mcmember.mgid = mgid; + + if (mcast) { + /* Destroy the send only entry */ + list_move_tail(&mcast->list, &remove_list); + + rb_replace_node(&mcast->rb_node, + &nmcast->rb_node, + &priv->multicast_tree); + } else + __ipoib_mcast_add(dev, nmcast); + + list_add_tail(&nmcast->list, &priv->multicast_list); + } + + if (mcast) + set_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags); + } + + /* Remove all of the entries don't exist anymore */ + list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) { + if (!test_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags) && + !test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { + ipoib_dbg_mcast(priv, "deleting multicast group %pI6\n", + mcast->mcmember.mgid.raw); + + rb_erase(&mcast->rb_node, &priv->multicast_tree); + + /* Move to the remove list */ + list_move_tail(&mcast->list, &remove_list); + } + } + + spin_unlock(&priv->lock); + netif_addr_unlock(dev); + local_irq_restore(flags); + + /* We have to cancel outside of the spinlock */ + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { + ipoib_mcast_leave(mcast->dev, mcast); + ipoib_mcast_free(mcast); + } + + if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + ipoib_mcast_start_thread(dev); +} + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + +struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev) +{ + struct ipoib_mcast_iter *iter; + + iter = kmalloc(sizeof *iter, GFP_KERNEL); + if (!iter) + return NULL; + + iter->dev = dev; + memset(iter->mgid.raw, 0, 16); + + if (ipoib_mcast_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter) +{ + struct ipoib_dev_priv *priv = netdev_priv(iter->dev); + struct rb_node *n; + struct ipoib_mcast *mcast; + int ret = 1; + + spin_lock_irq(&priv->lock); + + n = rb_first(&priv->multicast_tree); + + while (n) { + mcast = rb_entry(n, struct ipoib_mcast, rb_node); + + if (memcmp(iter->mgid.raw, mcast->mcmember.mgid.raw, + sizeof (union ib_gid)) < 0) { + iter->mgid = mcast->mcmember.mgid; + iter->created = mcast->created; + iter->queuelen = skb_queue_len(&mcast->pkt_queue); + iter->complete = !!mcast->ah; + iter->send_only = !!(mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY)); + + ret = 0; + + break; + } + + n = rb_next(n); + } + + spin_unlock_irq(&priv->lock); + + return ret; +} + +void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter, + union ib_gid *mgid, + unsigned long *created, + unsigned int *queuelen, + unsigned int *complete, + unsigned int *send_only) +{ + *mgid = iter->mgid; + *created = iter->created; + *queuelen = iter->queuelen; + *complete = iter->complete; + *send_only = iter->send_only; +} + +#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c new file mode 100644 index 00000000..049a997c --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/slab.h> + +#include "ipoib.h" + +int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid, int set_qkey) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_attr *qp_attr = NULL; + int ret; + u16 pkey_index; + + if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) { + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + ret = -ENXIO; + goto out; + } + set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + + if (set_qkey) { + ret = -ENOMEM; + qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); + if (!qp_attr) + goto out; + + /* set correct QKey for QP */ + qp_attr->qkey = priv->qkey; + ret = ib_modify_qp(priv->qp, qp_attr, IB_QP_QKEY); + if (ret) { + ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret); + goto out; + } + } + + /* attach QP to multicast group */ + ret = ib_attach_mcast(priv->qp, mgid, mlid); + if (ret) + ipoib_warn(priv, "failed to attach to multicast group, ret = %d\n", ret); + +out: + kfree(qp_attr); + return ret; +} + +int ipoib_init_qp(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret; + struct ib_qp_attr qp_attr; + int attr_mask; + + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) + return -1; + + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qkey = 0; + qp_attr.port_num = priv->port; + qp_attr.pkey_index = priv->pkey_index; + attr_mask = + IB_QP_QKEY | + IB_QP_PORT | + IB_QP_PKEY_INDEX | + IB_QP_STATE; + ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to init, ret = %d\n", ret); + goto out_fail; + } + + qp_attr.qp_state = IB_QPS_RTR; + /* Can't set this in a INIT->RTR transition */ + attr_mask &= ~IB_QP_PORT; + ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTR, ret = %d\n", ret); + goto out_fail; + } + + qp_attr.qp_state = IB_QPS_RTS; + qp_attr.sq_psn = 0; + attr_mask |= IB_QP_SQ_PSN; + attr_mask &= ~IB_QP_PKEY_INDEX; + ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTS, ret = %d\n", ret); + goto out_fail; + } + + return 0; + +out_fail: + qp_attr.qp_state = IB_QPS_RESET; + if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) + ipoib_warn(priv, "Failed to modify QP to RESET state\n"); + + return ret; +} + +int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_init_attr init_attr = { + .cap = { + .max_send_wr = ipoib_sendq_size, + .max_recv_wr = ipoib_recvq_size, + .max_send_sge = 1, + .max_recv_sge = IPOIB_UD_RX_SG + }, + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_UD + }; + + int ret, size; + int i; + + priv->pd = ib_alloc_pd(priv->ca); + if (IS_ERR(priv->pd)) { + printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name); + return -ENODEV; + } + + priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(priv->mr)) { + printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name); + goto out_free_pd; + } + + size = ipoib_recvq_size + 1; + ret = ipoib_cm_dev_init(dev); + if (!ret) { + size += ipoib_sendq_size; + if (ipoib_cm_has_srq(dev)) + size += ipoib_recvq_size + 1; /* 1 extra for rx_drain_qp */ + else + size += ipoib_recvq_size * ipoib_max_conn_qp; + } + + priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0); + if (IS_ERR(priv->recv_cq)) { + printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name); + goto out_free_mr; + } + + priv->send_cq = ib_create_cq(priv->ca, ipoib_send_comp_handler, NULL, + dev, ipoib_sendq_size, 0); + if (IS_ERR(priv->send_cq)) { + printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name); + goto out_free_recv_cq; + } + + if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP)) + goto out_free_send_cq; + + init_attr.send_cq = priv->send_cq; + init_attr.recv_cq = priv->recv_cq; + + if (priv->hca_caps & IB_DEVICE_UD_TSO) + init_attr.create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; + + if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK) + init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + + if (dev->features & NETIF_F_SG) + init_attr.cap.max_send_sge = MAX_SKB_FRAGS + 1; + + priv->qp = ib_create_qp(priv->pd, &init_attr); + if (IS_ERR(priv->qp)) { + printk(KERN_WARNING "%s: failed to create QP\n", ca->name); + goto out_free_send_cq; + } + + priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff; + priv->dev->dev_addr[2] = (priv->qp->qp_num >> 8) & 0xff; + priv->dev->dev_addr[3] = (priv->qp->qp_num ) & 0xff; + + for (i = 0; i < MAX_SKB_FRAGS + 1; ++i) + priv->tx_sge[i].lkey = priv->mr->lkey; + + priv->tx_wr.opcode = IB_WR_SEND; + priv->tx_wr.sg_list = priv->tx_sge; + priv->tx_wr.send_flags = IB_SEND_SIGNALED; + + priv->rx_sge[0].lkey = priv->mr->lkey; + if (ipoib_ud_need_sg(priv->max_ib_mtu)) { + priv->rx_sge[0].length = IPOIB_UD_HEAD_SIZE; + priv->rx_sge[1].length = PAGE_SIZE; + priv->rx_sge[1].lkey = priv->mr->lkey; + priv->rx_wr.num_sge = IPOIB_UD_RX_SG; + } else { + priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); + priv->rx_wr.num_sge = 1; + } + priv->rx_wr.next = NULL; + priv->rx_wr.sg_list = priv->rx_sge; + + return 0; + +out_free_send_cq: + ib_destroy_cq(priv->send_cq); + +out_free_recv_cq: + ib_destroy_cq(priv->recv_cq); + +out_free_mr: + ib_dereg_mr(priv->mr); + ipoib_cm_dev_cleanup(dev); + +out_free_pd: + ib_dealloc_pd(priv->pd); + return -ENODEV; +} + +void ipoib_transport_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + if (priv->qp) { + if (ib_destroy_qp(priv->qp)) + ipoib_warn(priv, "ib_qp_destroy failed\n"); + + priv->qp = NULL; + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + } + + if (ib_destroy_cq(priv->send_cq)) + ipoib_warn(priv, "ib_cq_destroy (send) failed\n"); + + if (ib_destroy_cq(priv->recv_cq)) + ipoib_warn(priv, "ib_cq_destroy (recv) failed\n"); + + ipoib_cm_dev_cleanup(dev); + + if (ib_dereg_mr(priv->mr)) + ipoib_warn(priv, "ib_dereg_mr failed\n"); + + if (ib_dealloc_pd(priv->pd)) + ipoib_warn(priv, "ib_dealloc_pd failed\n"); +} + +void ipoib_event(struct ib_event_handler *handler, + struct ib_event *record) +{ + struct ipoib_dev_priv *priv = + container_of(handler, struct ipoib_dev_priv, event_handler); + + if (record->element.port_num != priv->port) + return; + + ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event, + record->device->name, record->element.port_num); + + if (record->event == IB_EVENT_SM_CHANGE || + record->event == IB_EVENT_CLIENT_REREGISTER) { + queue_work(ipoib_workqueue, &priv->flush_light); + } else if (record->event == IB_EVENT_PORT_ERR || + record->event == IB_EVENT_PORT_ACTIVE || + record->event == IB_EVENT_LID_CHANGE) { + queue_work(ipoib_workqueue, &priv->flush_normal); + } else if (record->event == IB_EVENT_PKEY_CHANGE) { + queue_work(ipoib_workqueue, &priv->flush_heavy); + } +} diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c new file mode 100644 index 00000000..d7e9740c --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/module.h> + +#include <linux/init.h> +#include <linux/seq_file.h> + +#include <asm/uaccess.h> + +#include "ipoib.h" + +static ssize_t show_parent(struct device *d, struct device_attribute *attr, + char *buf) +{ + struct net_device *dev = to_net_dev(d); + struct ipoib_dev_priv *priv = netdev_priv(dev); + + return sprintf(buf, "%s\n", priv->parent->name); +} +static DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL); + +int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) +{ + struct ipoib_dev_priv *ppriv, *priv; + char intf_name[IFNAMSIZ]; + int result; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + ppriv = netdev_priv(pdev); + + if (!rtnl_trylock()) + return restart_syscall(); + mutex_lock(&ppriv->vlan_mutex); + + /* + * First ensure this isn't a duplicate. We check the parent device and + * then all of the child interfaces to make sure the Pkey doesn't match. + */ + if (ppriv->pkey == pkey) { + result = -ENOTUNIQ; + priv = NULL; + goto err; + } + + list_for_each_entry(priv, &ppriv->child_intfs, list) { + if (priv->pkey == pkey) { + result = -ENOTUNIQ; + priv = NULL; + goto err; + } + } + + snprintf(intf_name, sizeof intf_name, "%s.%04x", + ppriv->dev->name, pkey); + priv = ipoib_intf_alloc(intf_name); + if (!priv) { + result = -ENOMEM; + goto err; + } + + priv->max_ib_mtu = ppriv->max_ib_mtu; + /* MTU will be reset when mcast join happens */ + priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); + priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu; + set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); + + result = ipoib_set_dev_features(priv, ppriv->ca); + if (result) + goto err; + + priv->pkey = pkey; + + memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN); + priv->dev->broadcast[8] = pkey >> 8; + priv->dev->broadcast[9] = pkey & 0xff; + + result = ipoib_dev_init(priv->dev, ppriv->ca, ppriv->port); + if (result < 0) { + ipoib_warn(ppriv, "failed to initialize subinterface: " + "device %s, port %d", + ppriv->ca->name, ppriv->port); + goto err; + } + + result = register_netdevice(priv->dev); + if (result) { + ipoib_warn(priv, "failed to initialize; error %i", result); + goto register_failed; + } + + priv->parent = ppriv->dev; + + ipoib_create_debug_files(priv->dev); + + if (ipoib_cm_add_mode_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_pkey_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_umcast_attr(priv->dev)) + goto sysfs_failed; + + if (device_create_file(&priv->dev->dev, &dev_attr_parent)) + goto sysfs_failed; + + list_add_tail(&priv->list, &ppriv->child_intfs); + + mutex_unlock(&ppriv->vlan_mutex); + rtnl_unlock(); + + return 0; + +sysfs_failed: + ipoib_delete_debug_files(priv->dev); + unregister_netdevice(priv->dev); + +register_failed: + ipoib_dev_cleanup(priv->dev); + +err: + mutex_unlock(&ppriv->vlan_mutex); + rtnl_unlock(); + if (priv) + free_netdev(priv->dev); + + return result; +} + +int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) +{ + struct ipoib_dev_priv *ppriv, *priv, *tpriv; + struct net_device *dev = NULL; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + ppriv = netdev_priv(pdev); + + if (!rtnl_trylock()) + return restart_syscall(); + mutex_lock(&ppriv->vlan_mutex); + list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { + if (priv->pkey == pkey) { + unregister_netdevice(priv->dev); + ipoib_dev_cleanup(priv->dev); + list_del(&priv->list); + dev = priv->dev; + break; + } + } + mutex_unlock(&ppriv->vlan_mutex); + rtnl_unlock(); + + if (dev) { + free_netdev(dev); + return 0; + } + + return -ENODEV; +} |