Sean Hefty wrote:
The sysadmin creates "for iwarp use only" alias interfaces of the form"devname:iw*" where devname is the native interface name (eg eth0) for theiwarp netdev device. The alias label can be anything starting with "iw".The "iw" immediately after the ':' is the key used by the iw_cxgb3 driver.I'm still not sure about this, but haven't come up with anything better myself. And if there's a good chance of other rnic's needing the same support, I'd rather see the common code separated out, even if just encapsulated within this module for easy re-use.As for the code, I have a couple of questions about whether deadlock and a race condition are possible, plus a few minor comments.
Thanks for reviewing! Responses are in-line below.
+static void insert_ifa(struct iwch_dev *rnicp, struct in_ifaddr *ifa) +{ + struct iwch_addrlist *addr; + + addr = kmalloc(sizeof *addr, GFP_KERNEL); + if (!addr) { + printk(KERN_ERR MOD "%s - failed to alloc memory!\n", + __FUNCTION__); + return; + } + addr->ifa = ifa; + mutex_lock(&rnicp->mutex); + list_add_tail(&addr->entry, &rnicp->addrlist); + mutex_unlock(&rnicp->mutex); +}Should this return success/failure?
I think so. See below...
+static int nb_callback(struct notifier_block *self, unsigned long event, + void *ctx) +{ + struct in_ifaddr *ifa = ctx; + struct iwch_dev *rnicp = container_of(self, struct iwch_dev, nb); + + PDBG("%s rnicp %p event %lx\n", __FUNCTION__, rnicp, event); + + switch (event) { + case NETDEV_UP: + if (netdev_is_ours(rnicp, ifa->ifa_dev->dev) && + is_iwarp_label(ifa->ifa_label)) { + PDBG("%s label %s addr 0x%x added\n", + __FUNCTION__, ifa->ifa_label, ifa->ifa_address); + insert_ifa(rnicp, ifa); + iwch_listeners_add_addr(rnicp, ifa->ifa_address);If insert_ifa() fails, what will iwch_listeners_add_addr() do? (I'm not easily seeing the relationship between the address list and the listen list at this point.)
I guess insert_ifa() needs to return success/failure. Then if we failed to add the ifa to the list we won't update the listeners.
The relationship is this:- when a listen is done on addr 0.0.0.0, the code walks the list of addresses to do specific listens on each address.
- when an address is added or deleted, then the list of current listeners is walked and updated accordingly.
+ } + break; + case NETDEV_DOWN: + if (netdev_is_ours(rnicp, ifa->ifa_dev->dev) && + is_iwarp_label(ifa->ifa_label)) { + PDBG("%s label %s addr 0x%x deleted\n", + __FUNCTION__, ifa->ifa_label, ifa->ifa_address); + iwch_listeners_del_addr(rnicp, ifa->ifa_address); + remove_ifa(rnicp, ifa); + } + break; + default: + break; + } + return 0; +} + +static void delete_addrlist(struct iwch_dev *rnicp) +{ + struct iwch_addrlist *addr, *tmp; + + mutex_lock(&rnicp->mutex); + list_for_each_entry_safe(addr, tmp, &rnicp->addrlist, entry) { + list_del(&addr->entry); + kfree(addr); + } + mutex_unlock(&rnicp->mutex); +} + +static void populate_addrlist(struct iwch_dev *rnicp) +{ + int i; + struct in_device *indev; + + for (i = 0; i < rnicp->rdev.port_info.nports; i++) { + indev = in_dev_get(rnicp->rdev.port_info.lldevs[i]); + if (!indev) + continue; + for_ifa(indev) + if (is_iwarp_label(ifa->ifa_label)) { + PDBG("%s label %s addr 0x%x added\n", + __FUNCTION__, ifa->ifa_label, + ifa->ifa_address); + insert_ifa(rnicp, ifa); + } + endfor_ifa(indev); + } +} + static void rnic_init(struct iwch_dev *rnicp) { PDBG("%s iwch_dev %p\n", __FUNCTION__, rnicp); @@ -70,6 +187,12 @@ static void rnic_init(struct iwch_dev *r idr_init(&rnicp->qpidr); idr_init(&rnicp->mmidr); spin_lock_init(&rnicp->lock); + INIT_LIST_HEAD(&rnicp->addrlist); + INIT_LIST_HEAD(&rnicp->listen_eps); + mutex_init(&rnicp->mutex); + rnicp->nb.notifier_call = nb_callback; + populate_addrlist(rnicp); + register_inetaddr_notifier(&rnicp->nb);rnicp->attr.vendor_id = 0x168;rnicp->attr.vendor_part_id = 7; @@ -148,6 +271,8 @@ static void close_rnic_dev(struct t3cdev mutex_lock(&dev_mutex); list_for_each_entry_safe(dev, tmp, &dev_list, entry) { if (dev->rdev.t3cdev_p == tdev) { + unregister_inetaddr_notifier(&dev->nb); + delete_addrlist(dev); list_del(&dev->entry); iwch_unregister_device(dev); cxio_rdev_close(&dev->rdev);diff --git a/drivers/infiniband/hw/cxgb3/iwch.h b/drivers/infiniband/hw/cxgb3/iwch.hindex caf4e60..7fa0a47 100644 --- a/drivers/infiniband/hw/cxgb3/iwch.h +++ b/drivers/infiniband/hw/cxgb3/iwch.h @@ -36,6 +36,8 @@ #include <linux/mutex.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/idr.h> +#include <linux/notifier.h> +#include <linux/inetdevice.h>#include <rdma/ib_verbs.h> @@ -101,6 +103,11 @@ struct iwch_rnic_attributes {u32 cq_overflow_detection; };+struct iwch_addrlist {+ struct list_head entry; + struct in_ifaddr *ifa; +}; + struct iwch_dev { struct ib_device ibdev; struct cxio_rdev rdev; @@ -111,6 +118,10 @@ struct iwch_dev { struct idr mmidr; spinlock_t lock; struct list_head entry; + struct notifier_block nb; + struct list_head addrlist; + struct list_head listen_eps;The behavior of the listen lists is similar to what's done in the rdma_cm: Wildcard listens are stored in a listen_any_list. When new devices are added, associated listens are added to each device. This is similar, except we're dealing with devices and addresses. I'm wondering if we shouldn't mimic the same behavior and track listens in iwch_addrlist directly. (I don't see anything wrong with this approach though.)What happens if an address changes between iwarp only and non-iwarp?
That results in a NETDEV_DOWN event indicating the iwarp only address is getting deleted. All the affected listening endpoints are updated to stop listening on that address. A NETDEV_UP event would happen when the ipaddress is switched over to the TCP interface, but our callback function ignores this since the interface name is not ethX:iw.
How are listens on specific addresses handled from an rdma_cm level? Does the rdma_cm map the address to the device, call the iw_cm to listen, which in turn calls the device listen function?
Yes.
The device then checks that the address has been marked as iwarp only? (I'm being too lazy to trace this through the code, but if you don't know off the top of your head, I will do that.)
Actually, I don't enforce this. If the app explicitly binds/listens to a non-iwarp address, then the code happily allows it. I could fail this case though. That would be best I guess.
+ struct mutex mutex; };static inline struct iwch_dev *to_iwch_dev(struct ib_device *ibdev) diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.cindex 1cdfcd4..afc8a48 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -1127,23 +1127,149 @@ static int act_open_rpl(struct t3cdev *t return CPL_RET_BUF_DONE; }-static int listen_start(struct iwch_listen_ep *ep)+static int wait_for_reply(struct iwch_ep_common *epc) +{ + PDBG("%s ep %p waiting\n", __FUNCTION__, epc); + wait_event(epc->waitq, epc->rpl_done);+ PDBG("%s ep %p done waiting err %d\n", __FUNCTION__, epc, epc->rpl_err);+ return epc->rpl_err; +}What thread is being blocked here, and what sets the event?
The thread calling rdma_listen() gets blocked here until the rnic posts a response to the listen request. Ditto for rdma_destroy_id() on a listening endpoint. The event is set and the wakeup don in pass_open_rpl() and close_listsrv_rpl().
++static struct iwch_listen_entry *alloc_listener(struct iwch_listen_ep *ep,+ __be32 addr) +{ + struct iwch_dev *h = to_iwch_dev(ep->com.cm_id->device); + struct iwch_listen_entry *le; + + le = kmalloc(sizeof *le, GFP_KERNEL); + if (!le) { + printk(KERN_ERR MOD "%s - failed to alloc memory!\n", + __FUNCTION__); + return NULL; + } + le->stid = cxgb3_alloc_stid(h->rdev.t3cdev_p, + &t3c_client, ep); + if (le->stid == -1) { + printk(KERN_ERR MOD "%s - cannot alloc stid.\n", + __FUNCTION__); + kfree(le); + return NULL; + } + le->addr = addr; + PDBG("%s stid %u addr %x port %x\n", __FUNCTION__, le->stid, + ntohl(le->addr), ntohs(ep->com.local_addr.sin_port)); + return le; +} + +static void dealloc_listener(struct iwch_listen_ep *ep, + struct iwch_listen_entry *le) +{ + PDBG("%s stid %u addr %x port %x\n", __FUNCTION__, le->stid, + ntohl(le->addr), ntohs(ep->com.local_addr.sin_port)); + cxgb3_free_stid(ep->com.tdev, le->stid); + kfree(le); +} + +static void dealloc_listener_list(struct iwch_listen_ep *ep) +{ + struct iwch_listen_entry *le, *tmp; + struct iwch_dev *h = to_iwch_dev(ep->com.cm_id->device); + + mutex_lock(&h->mutex); + list_for_each_entry_safe(le, tmp, &ep->listeners, entry) { + list_del(&le->entry); + dealloc_listener(ep, le); + } + mutex_unlock(&h->mutex); +} + +static int alloc_listener_list(struct iwch_listen_ep *ep) +{ + struct iwch_dev *h = to_iwch_dev(ep->com.cm_id->device);nit: in place of 'h' in several places, how about 'rnicp', which is also used?+ struct iwch_addrlist *addr; + struct iwch_listen_entry *le; + int err = 0; + int added=0; + mutex_lock(&h->mutex); + list_for_each_entry(addr, &h->addrlist, entry) { + if (ep->com.local_addr.sin_addr.s_addr == 0 || + ep->com.local_addr.sin_addr.s_addr == + addr->ifa->ifa_address) { + le = alloc_listener(ep, addr->ifa->ifa_address); + if (!le) + break; + list_add_tail(&le->entry, &ep->listeners); + added++; + } + } + mutex_unlock(&h->mutex); + if (ep->com.local_addr.sin_addr.s_addr != 0 && !added) + err = -EADDRNOTAVAIL; + if (!err && !added) + printk(KERN_WARNING MOD + "No RDMA interface found for device %s\n", + pci_name(h->rdev.rnic_info.pdev)); + return err; +}Adding some white space would improve readability.++static int listen_stop_one(struct iwch_listen_ep *ep, unsigned int stid){ struct sk_buff *skb; - struct cpl_pass_open_req *req; + struct cpl_close_listserv_req *req; + + PDBG("%s stid %u\n", __FUNCTION__, stid); + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb\n", __FUNCTION__); + return -ENOMEM; + } + req = (struct cpl_close_listserv_req *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->cpu_idx = 0; + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, stid)); + skb->priority = 1; + ep->com.rpl_err = 0; + ep->com.rpl_done = 0; + cxgb3_ofld_send(ep->com.tdev, skb); + return wait_for_reply(&ep->com); +} + +static int listen_stop(struct iwch_listen_ep *ep) +{ + struct iwch_listen_entry *le; + struct iwch_dev *h = to_iwch_dev(ep->com.cm_id->device); + int err = 0;PDBG("%s ep %p\n", __FUNCTION__, ep);+ mutex_lock(&h->mutex); + list_for_each_entry(le, &ep->listeners, entry) { + err = listen_stop_one(ep, le->stid);This ends up blocking while holding a mutex, which looks like deadlock potential.
I don't think there are any deadlocks. I don't know how to avoid blocking while holding the mutex. But its ok, I think.
+ if (err) + break; + } + mutex_unlock(&h->mutex); + return err; +} ++static int listen_start_one(struct iwch_listen_ep *ep, unsigned int stid,+ __be32 addr, __be16 port) +{ + struct sk_buff *skb; + struct cpl_pass_open_req *req; ++ PDBG("%s stid %u addr %x port %x\n", __FUNCTION__, stid, ntohl(addr),+ ntohs(port)); skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); if (!skb) { - printk(KERN_ERR MOD "t3c_listen_start failed to alloc skb!\n"); + printk(KERN_ERR MOD "%s - failed to alloc skb\n", __FUNCTION__); return -ENOMEM; }req = (struct cpl_pass_open_req *) skb_put(skb, sizeof(*req));req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); - OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, ep->stid)); - req->local_port = ep->com.local_addr.sin_port; - req->local_ip = ep->com.local_addr.sin_addr.s_addr; + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, stid)); + req->local_port = port; + req->local_ip = addr; req->peer_port = 0; req->peer_ip = 0; req->peer_netmask = 0; @@ -1152,8 +1278,32 @@ static int listen_start(struct iwch_list req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK));skb->priority = 1;+ ep->com.rpl_err = 0; + ep->com.rpl_done = 0; cxgb3_ofld_send(ep->com.tdev, skb); - return 0; + return wait_for_reply(&ep->com); +} + +static int listen_start(struct iwch_listen_ep *ep) +{ + struct iwch_listen_entry *le; + struct iwch_dev *h = to_iwch_dev(ep->com.cm_id->device); + int err = 0; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + mutex_lock(&h->mutex); + list_for_each_entry(le, &ep->listeners, entry) { + err = listen_start_one(ep, le->stid, le->addr, + ep->com.local_addr.sin_port);Similar to above - blocking while holding a mutex. There are a couple of other places where this also occurs.
It is ok to block while holding a mutex, yes?
+ if (err) + goto fail; + } + mutex_unlock(&h->mutex); + return err; +fail: + mutex_unlock(&h->mutex); + listen_stop(ep); + return err; }static int pass_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)@@ -1170,39 +1320,59 @@ static int pass_open_rpl(struct t3cdev * return CPL_RET_BUF_DONE; }-static int listen_stop(struct iwch_listen_ep *ep)-{ - struct sk_buff *skb; - struct cpl_close_listserv_req *req; - - PDBG("%s ep %p\n", __FUNCTION__, ep); - skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); - if (!skb) { - printk(KERN_ERR MOD "%s - failed to alloc skb\n", __FUNCTION__); - return -ENOMEM; - } - req = (struct cpl_close_listserv_req *) skb_put(skb, sizeof(*req)); - req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); - req->cpu_idx = 0;- OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, ep->stid));- skb->priority = 1; - cxgb3_ofld_send(ep->com.tdev, skb); - return 0; -} - static int close_listsrv_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) { struct iwch_listen_ep *ep = ctx; struct cpl_close_listserv_rpl *rpl = cplhdr(skb);- PDBG("%s ep %p\n", __FUNCTION__, ep);+ PDBG("%s ep %p stid %u\n", __FUNCTION__, ep, GET_TID(rpl)); + ep->com.rpl_err = status2errno(rpl->status); ep->com.rpl_done = 1; wake_up(&ep->com.waitq); return CPL_RET_BUF_DONE; }+void iwch_listeners_add_addr(struct iwch_dev *rnicp, __be32 addr)+{ + struct iwch_listen_ep *listen_ep; + struct iwch_listen_entry *le; + + mutex_lock(&rnicp->mutex); + list_for_each_entry(listen_ep, &rnicp->listen_eps, entry) { + if (listen_ep->com.local_addr.sin_addr.s_addr) + continue; + le = alloc_listener(listen_ep, addr); + if (le) { + list_add_tail(&le->entry, &listen_ep->listeners); + listen_start_one(listen_ep, le->stid, addr, + listen_ep->com.local_addr.sin_port); + } + } + mutex_unlock(&rnicp->mutex); +} + +void iwch_listeners_del_addr(struct iwch_dev *rnicp, __be32 addr) +{ + struct iwch_listen_ep *listen_ep; + struct iwch_listen_entry *le, *tmp; + + mutex_lock(&rnicp->mutex); + list_for_each_entry(listen_ep, &rnicp->listen_eps, entry) { + if (listen_ep->com.local_addr.sin_addr.s_addr) + continue; + list_for_each_entry_safe(le, tmp, &listen_ep->listeners, + entry) + if (le->addr == addr) { + listen_stop_one(listen_ep, le->stid); + list_del(&le->entry); + dealloc_listener(listen_ep, le); + } + } + mutex_unlock(&rnicp->mutex); +} +static void accept_cr(struct iwch_ep *ep, __be32 peer_ip, struct sk_buff *skb){ struct cpl_pass_accept_rpl *rpl; @@ -1767,8 +1937,7 @@ int iwch_accept_cr(struct iw_cm_id *cm_i goto err;/* wait for wr_ack */- wait_event(ep->com.waitq, ep->com.rpl_done); - err = ep->com.rpl_err; + err = wait_for_reply(&ep->com); if (err) goto err;@@ -1887,31 +2056,23 @@ int iwch_create_listen(struct iw_cm_id *ep->com.cm_id = cm_id; ep->backlog = backlog; ep->com.local_addr = cm_id->local_addr; + INIT_LIST_HEAD(&ep->listeners);- /*- * Allocate a server TID. - */ - ep->stid = cxgb3_alloc_stid(h->rdev.t3cdev_p, &t3c_client, ep); - if (ep->stid == -1) { - printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __FUNCTION__); - err = -ENOMEM; + err = alloc_listener_list(ep); + if (err) goto fail2; - }state_set(&ep->com, LISTEN);err = listen_start(ep); - if (err) - goto fail3;- /* wait for pass_open_rpl */- wait_event(ep->com.waitq, ep->com.rpl_done); - err = ep->com.rpl_err; if (!err) { cm_id->provider_data = ep; + mutex_lock(&h->mutex); + list_add_tail(&ep->entry, &h->listen_eps); + mutex_unlock(&h->mutex);Is there a race between listen_start() being called and inserting the ep into the list? Could anything try to find the ep on the list after listen_start returns?
I guess if the iwarp address was removed between after the listen_start() and before we add it to the list, then we would not stop the listen for this address. Perhaps I need to hold the mutex around the listen_start() -and- the insert...
goto out; } -fail3: - cxgb3_free_stid(ep->com.tdev, ep->stid); + dealloc_listener_list(ep); fail2: cm_id->rem_ref(cm_id); put_ep(&ep->com); @@ -1923,18 +2084,20 @@ out: int iwch_destroy_listen(struct iw_cm_id *cm_id) { int err; + struct iwch_dev *h = to_iwch_dev(cm_id->device); struct iwch_listen_ep *ep = to_listen_ep(cm_id);PDBG("%s ep %p\n", __FUNCTION__, ep); might_sleep();+ mutex_lock(&h->mutex); + list_del(&ep->entry); + mutex_unlock(&h->mutex); state_set(&ep->com, DEAD); ep->com.rpl_done = 0; ep->com.rpl_err = 0; err = listen_stop(ep); - wait_event(ep->com.waitq, ep->com.rpl_done); - cxgb3_free_stid(ep->com.tdev, ep->stid); - err = ep->com.rpl_err; + dealloc_listener_list(ep); cm_id->rem_ref(cm_id); put_ep(&ep->com); return err;diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.h b/drivers/infiniband/hw/cxgb3/iwch_cm.hindex 6107e7c..23e5a22 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.h +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.h @@ -162,10 +162,19 @@ struct iwch_ep_common { int rpl_err; };-struct iwch_listen_ep {- struct iwch_ep_common com; +struct iwch_listen_entry { + struct list_head entry; unsigned int stid; + __be32 addr; +}; + +struct iwch_listen_ep { + struct iwch_ep_common com; /* Must be first entry! */ + struct list_head entry; + struct list_head listeners; int backlog; + int listen_count;I didn't notice where this was used.+ int listen_rpls;or this.
Yea, I think this is dead code. I'll remove these.
};struct iwch_ep {@@ -222,6 +231,8 @@ int iwch_resume_tid(struct iwch_ep *ep); void __free_ep(struct kref *kref); void iwch_rearp(struct iwch_ep *ep);int iwch_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new, struct l2t_entry *l2t);+void iwch_listeners_add_addr(struct iwch_dev *rnicp, __be32 addr); +void iwch_listeners_del_addr(struct iwch_dev *rnicp, __be32 addr);int __init iwch_cm_init(void);void __exit iwch_cm_term(void); _______________________________________________ general mailing list [email protected] http://lists.openfabrics.org/cgi-bin/mailman/listinfo/generalTo unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [email protected] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
- Follow-Ups:
- References:
- Prev by Date: Re: 2.6.23-rc8-mm2: BUG near reiserfs_xattr_set
- Next by Date: Re: [PATCHSET 3/4] sysfs: divorce sysfs from kobject and driver model
- Previous by thread: Re: [ofa-general] [PATCH v3] iw_cxgb3: Support"iwarp-only"interfacesto avoid 4-tuple conflicts.
- Next by thread: RE: [ofa-general] [PATCH v3] iw_cxgb3: Support "iwarp-only" interfaces to avoid 4-tuple conflicts.
- Index(es):