author     Stephen Rothwell <sfr@canb.auug.org.au>  2010-05-18 15:20:24 +1000
committer  Stephen Rothwell <sfr@canb.auug.org.au>  2010-05-18 15:20:30 +1000
commit     b1030e48176d8aea860779963d4ece43ee5bc18b (patch)
tree       c7a7bdc9c2bc9e17319428cfd4e29c505ac01f81 /drivers
parent     b8010dfa3fe2759c13fe7b8b050b641deeaad0b6 (diff)
parent     e1077ef3b2751766c4437e2f974e3d7372742d0d (diff)
Merge remote branch 'alacrity/linux-next'
Conflicts:
	include/linux/Kbuild
	lib/Kconfig
Diffstat (limited to 'drivers')
-rw-r--r--  drivers/Makefile           |    1
-rw-r--r--  drivers/net/Kconfig        |   14
-rw-r--r--  drivers/net/Makefile       |    1
-rw-r--r--  drivers/net/vbus-enet.c    | 1560
-rw-r--r--  drivers/vbus/Kconfig       |   25
-rw-r--r--  drivers/vbus/Makefile      |    6
-rw-r--r--  drivers/vbus/bus-proxy.c   |  248
-rw-r--r--  drivers/vbus/pci-bridge.c  | 1016
8 files changed, 2871 insertions, 0 deletions
diff --git a/drivers/Makefile b/drivers/Makefile
index f42a03029b7c..08e542fb2daf 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -113,3 +113,4 @@ obj-$(CONFIG_VLYNQ) += vlynq/
 obj-$(CONFIG_STAGING) += staging/
 obj-y += platform/
 obj-y += ieee802154/
+obj-y += vbus/
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index b9e7618a1473..ab75ccb2fb37 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -3302,4 +3302,18 @@ config VMXNET3
 To compile this driver as a module, choose M here: the
 module will be called vmxnet3.
 
+config VBUS_ENET
+ tristate "VBUS Ethernet Driver"
+ default n
+ depends on VBUS_PROXY
+ help
+ A virtualized 802.x network device based on the VBUS
+ "virtual-ethernet" interface. It can be used with any
+ hypervisor/kernel that supports the vbus+venet protocol.
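+
+ To compile this driver as a module, choose M here: the
+ module will be called vbus-enet.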
+
+config VBUS_ENET_DEBUG
+ bool "Enable Debugging"
+ depends on VBUS_ENET
+ default n
+
 endif # NETDEVICES
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 0a0512ae77da..8beb1e27191f 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -289,6 +289,7 @@ obj-$(CONFIG_FS_ENET) += fs_enet/
 obj-$(CONFIG_NETXEN_NIC) += netxen/
 obj-$(CONFIG_NIU) += niu.o
 obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
+obj-$(CONFIG_VBUS_ENET) += vbus-enet.o
 obj-$(CONFIG_SFC) += sfc/
 obj-$(CONFIG_WIMAX) += wimax/
diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
new file mode 100644
index 000000000000..94b86d482cee
--- /dev/null
+++ b/drivers/net/vbus-enet.c
@@ -0,0 +1,1560 @@
+/*
+ * vbus_enet - A virtualized 802.x network device based on the VBUS interface
+ *
+ * Copyright (C) 2009 Novell, Gregory Haskins <ghaskins@novell.com>
+ *
+ * Derived from the SNULL example from the book "Linux Device Drivers" by
+ * Alessandro Rubini, Jonathan Corbet, and Greg Kroah-Hartman, published
+ * by O'Reilly & Associates.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/interrupt.h>
+
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/skbuff.h>
+#include <linux/ioq.h>
+#include <linux/vbus_driver.h>
+
+#include <linux/in6.h>
+#include <asm/checksum.h>
+
+#include <linux/venet.h>
+
+MODULE_AUTHOR("Gregory Haskins");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("virtual-ethernet");
+MODULE_VERSION("1");
+
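+/*
+ * Module parameters: rx/tx ring depths, and whether to offer
+ * scatter-gather support during capability negotiation.  All are
+ * read-only once the module is loaded (perm 0444).
+ */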
+static int rx_ringlen = 256;
+module_param(rx_ringlen, int, 0444);
+static int tx_ringlen = 256;
+module_param(tx_ringlen, int, 0444);
+static int sg_enabled = 1;
+module_param(sg_enabled, int, 0444);
+
+#define PDEBUG(_dev, fmt, args...) dev_dbg(&(_dev)->dev, fmt, ## args)
+
+#define SG_DESC_SIZE VSG_DESC_SIZE(MAX_SKB_FRAGS)
+
+struct vbus_enet_queue {
+ struct ioq *queue;
+ struct ioq_notifier notifier;
+ unsigned long count;
+};
+
+struct vbus_enet_priv {
+ spinlock_t lock;
+ struct net_device *dev;
+ struct vbus_device_proxy *vdev;
+ struct napi_struct napi;
+ struct vbus_enet_queue rxq;
+ struct {
+ struct vbus_enet_queue veq;
+ struct tasklet_struct task;
+ struct sk_buff_head outstanding;
+ } tx;
+ bool sg;
+ struct {
+ bool enabled;
+ char *pool;
+ } pmtd; /* pre-mapped transmit descriptors */
+ struct {
+ bool enabled;
+ bool linkstate;
+ bool txc;
+ unsigned long evsize;
+ struct vbus_enet_queue veq;
+ struct tasklet_struct task;
+ char *pool;
+ } evq;
+ struct {
+ bool available;
+ char *pool;
+ struct vbus_enet_queue pageq;
+ } l4ro;
+
+ struct sk_buff *(*import)(struct vbus_enet_priv *priv,
+ struct ioq_ring_desc *desc);
+};
+
+static void vbus_enet_tx_reap(struct vbus_enet_priv *priv);
+
+static struct vbus_enet_priv *
+napi_to_priv(struct napi_struct *napi)
+{
+ return container_of(napi, struct vbus_enet_priv, napi);
+}
+
+static int
+queue_init(struct vbus_enet_priv *priv,
+ struct vbus_enet_queue *q,
+ const char *name,
+ int qid,
+ size_t ringsize,
+ void (*func)(struct ioq_notifier *))
+{
+ struct vbus_device_proxy *dev = priv->vdev;
+ int ret;
+ char _name[64];
+
+ if (name)
+ snprintf(_name, sizeof(_name), "%s-%s", priv->dev->name, name);
+
+ ret = vbus_driver_ioq_alloc(dev, name ? _name : NULL, qid, 0,
+ ringsize, &q->queue);
+ if (ret < 0)
+ panic("ioq_alloc failed: %d\n", ret);
+
+ if (func) {
+ q->notifier.signal = func;
+ q->queue->notifier = &q->notifier;
+ }
+
+ q->count = ringsize;
+
+ return 0;
+}
+
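+/*
+ * Issue a synchronous VENET_FUNC_* control call to the host-side
+ * device via the proxy's ->call() vtable hook.
+ */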
+static int
+devcall(struct vbus_enet_priv *priv, u32 func, void *data, size_t len)
+{
+ struct vbus_device_proxy *dev = priv->vdev;
+
+ return dev->ops->call(dev, func, data, len, 0);
+}
+
+/*
+ * ---------------
+ * rx descriptors
+ * ---------------
+ */
+
+static void
+rxdesc_alloc(struct vbus_enet_priv *priv, struct ioq_ring_desc *desc, size_t len)
+{
+ struct net_device *dev = priv->dev;
+ struct sk_buff *skb;
+
+ len += ETH_HLEN;
+
+ skb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);
+ BUG_ON(!skb);
+
+ skb_reserve(skb, NET_IP_ALIGN); /* align IP on 16B boundary */
+
+ if (priv->l4ro.available) {
+ /*
+ * We will populate an SG descriptor initially with one
+ * IOV filled with an MTU SKB. If the packet needs to be
+ * larger than MTU, the host will grab pages out of the
+ * page-queue and populate additional IOVs
+ */
+ struct venet_sg *vsg = (struct venet_sg *)(unsigned long)desc->cookie;
+ struct venet_iov *iov = &vsg->iov[0];
+
+ memset(vsg, 0, SG_DESC_SIZE);
+
+ vsg->cookie = (u64)(unsigned long)skb;
+ vsg->count = 1;
+
+ iov->ptr = (u64)__pa(skb->data);
+ iov->len = len;
+ } else {
+ desc->cookie = (u64)(unsigned long)skb;
+ desc->ptr = cpu_to_le64(__pa(skb->data));
+ desc->len = cpu_to_le64(len); /* total length */
+ }
+
+ desc->valid = 1;
+}
+
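+/*
+ * Keep the L4RO page-queue stocked with free pages.  The host pulls
+ * pages from this ring whenever an incoming frame needs more than the
+ * single MTU-sized IOV that rxdesc_alloc() seeds each SG descriptor
+ * with.
+ */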
+static void
+rx_pageq_refill(struct vbus_enet_priv *priv, gfp_t gfp_mask)
+{
+ struct ioq *ioq = priv->l4ro.pageq.queue;
+ struct ioq_iterator iter;
+ int ret, added = 0;
+
+ if (ioq_full(ioq, ioq_idxtype_inuse))
+ /* nothing to do if the pageq is already fully populated */
+ return;
+
+ ret = ioq_iter_init(ioq, &iter, ioq_idxtype_inuse, 0);
+ BUG_ON(ret < 0); /* will never fail unless seriously broken */
+
+ ret = ioq_iter_seek(&iter, ioq_seek_tail, 0, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * Now populate each descriptor with an empty page
+ */
+ while (!iter.desc->sown) {
+ struct page *page = NULL;
+
+ page = alloc_page(gfp_mask);
+
+ if (!page)
+ break;
+
+ added = 1;
+ iter.desc->cookie = (u64)(unsigned long)page;
+ iter.desc->ptr = cpu_to_le64(__pa(page_address(page)));
+ iter.desc->len = cpu_to_le64(PAGE_SIZE);
+
+ ret = ioq_iter_push(&iter, 0);
+ BUG_ON(ret < 0);
+ }
+
+ if (added)
+ ioq_signal(ioq, 0);
+}
+
+static void
+rx_setup(struct vbus_enet_priv *priv)
+{
+ struct ioq *ioq = priv->rxq.queue;
+ struct ioq_iterator iter;
+ int ret;
+ int i = 0;
+
+ /*
+ * We want to iterate on the "valid" index. By default the iterator
+ * will not "autoupdate" which means it will not hypercall the host
+ * with our changes. This is good, because we are really just
+ * initializing stuff here anyway. Note that you can always manually
+ * signal the host with ioq_signal() if the autoupdate feature is not
+ * used.
+ */
+ ret = ioq_iter_init(ioq, &iter, ioq_idxtype_valid, 0);
+ BUG_ON(ret < 0); /* will never fail unless seriously broken */
+
+ /*
+ * Seek to the tail of the valid index (which should be our first
+ * item, since the queue is brand-new)
+ */
+ ret = ioq_iter_seek(&iter, ioq_seek_tail, 0, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * Now populate each descriptor with an empty buffer and mark it valid
+ */
+ while (!iter.desc->valid) {
+ if (priv->l4ro.available) {
+ size_t offset = (i * SG_DESC_SIZE);
+ void *addr = &priv->l4ro.pool[offset];
+
+ iter.desc->ptr = cpu_to_le64(offset);
+ iter.desc->cookie = (u64)(unsigned long)addr;
+ iter.desc->len = cpu_to_le64(SG_DESC_SIZE);
+ }
+
+ rxdesc_alloc(priv, iter.desc, priv->dev->mtu);
+
+ /*
+ * This push operation will simultaneously advance the
+ * valid-head index and increment our position in the queue
+ * by one.
+ */
+ ret = ioq_iter_push(&iter, 0);
+ BUG_ON(ret < 0);
+
+ i++;
+ }
+
+ if (priv->l4ro.available)
+ rx_pageq_refill(priv, GFP_KERNEL);
+}
+
+static void
+rx_rxq_teardown(struct vbus_enet_priv *priv)
+{
+ struct ioq *ioq = priv->rxq.queue;
+ struct ioq_iterator iter;
+ int ret;
+
+ ret = ioq_iter_init(ioq, &iter, ioq_idxtype_valid, 0);
+ BUG_ON(ret < 0);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_head, 0, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * free each valid descriptor
+ */
+ while (iter.desc->valid) {
+ struct sk_buff *skb;
+
+ if (priv->l4ro.available) {
+ struct venet_sg *vsg;
+ int i;
+
+ vsg = (struct venet_sg *)(unsigned long)iter.desc->cookie;
+
+ /* skip i=0, since that is the skb->data IOV */
+ for (i = 1; i < vsg->count; i++) {
+ struct venet_iov *iov = &vsg->iov[i];
+ struct page *page = (struct page *)(unsigned long)iov->ptr;
+
+ put_page(page);
+ }
+
+ skb = (struct sk_buff *)(unsigned long)vsg->cookie;
+ } else
+ skb = (struct sk_buff *)(unsigned long)iter.desc->cookie;
+
+ iter.desc->valid = 0;
+ wmb();
+
+ iter.desc->ptr = 0;
+ iter.desc->cookie = 0;
+
+ ret = ioq_iter_pop(&iter, 0);
+ BUG_ON(ret < 0);
+
+ dev_kfree_skb(skb);
+ }
+}
+
+static void
+rx_l4ro_teardown(struct vbus_enet_priv *priv)
+{
+ struct ioq *ioq = priv->l4ro.pageq.queue;
+ struct ioq_iterator iter;
+ int ret;
+
+ ret = ioq_iter_init(ioq, &iter, ioq_idxtype_inuse, 0);
+ BUG_ON(ret < 0);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_head, 0, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * free each valid descriptor
+ */
+ while (iter.desc->sown) {
+ struct page *page = (struct page *)(unsigned long)iter.desc->cookie;
+
+ iter.desc->valid = 0;
+ wmb();
+
+ iter.desc->ptr = 0;
+ iter.desc->cookie = 0;
+
+ ret = ioq_iter_pop(&iter, 0);
+ BUG_ON(ret < 0);
+
+ put_page(page);
+ }
+
+ ioq_put(ioq);
+ kfree(priv->l4ro.pool);
+}
+
+static void
+rx_teardown(struct vbus_enet_priv *priv)
+{
+ rx_rxq_teardown(priv);
+
+ if (priv->l4ro.available)
+ rx_l4ro_teardown(priv);
+}
+
+static int
+tx_setup(struct vbus_enet_priv *priv)
+{
+ struct ioq *ioq = priv->tx.veq.queue;
+ struct ioq_iterator iter;
+ int i;
+ int ret;
+
+ if (!priv->sg)
+ /*
+ * There is nothing to do for a ring that is not using
+ * scatter-gather
+ */
+ return 0;
+
+ /* pre-allocate our descriptor pool if pmtd is enabled */
+ if (priv->pmtd.enabled) {
+ struct vbus_device_proxy *dev = priv->vdev;
+ size_t poollen = SG_DESC_SIZE * priv->tx.veq.count;
+ char *pool;
+ int shmid;
+
+ /* pmtdquery will return the shm-id to use for the pool */
+ ret = devcall(priv, VENET_FUNC_PMTDQUERY, NULL, 0);
+ BUG_ON(ret < 0);
+
+ shmid = ret;
+
+ pool = kzalloc(poollen, GFP_KERNEL | GFP_DMA);
+ if (!pool)
+ return -ENOMEM;
+
+ priv->pmtd.pool = pool;
+
+ ret = dev->ops->shm(dev, NULL, shmid, 0, pool, poollen,
+ NULL, NULL, 0);
+ BUG_ON(ret < 0);
+ }
+
+ ret = ioq_iter_init(ioq, &iter, ioq_idxtype_valid, 0);
+ BUG_ON(ret < 0);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_set, 0, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * Now populate each descriptor with an empty SG descriptor
+ */
+ for (i = 0; i < priv->tx.veq.count; i++) {
+ struct venet_sg *vsg;
+
+ if (priv->pmtd.enabled) {
+ size_t offset = (i * SG_DESC_SIZE);
+
+ vsg = (struct venet_sg *)&priv->pmtd.pool[offset];
+ iter.desc->ptr = cpu_to_le64(offset);
+ } else {
+ vsg = kzalloc(SG_DESC_SIZE, GFP_KERNEL);
+ if (!vsg)
+ return -ENOMEM;
+
+ iter.desc->ptr = cpu_to_le64(__pa(vsg));
+ }
+
+ iter.desc->cookie = (u64)(unsigned long)vsg;
+ iter.desc->len = cpu_to_le64(SG_DESC_SIZE);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_next, 0, 0);
+ BUG_ON(ret < 0);
+ }
+
+ return 0;
+}
+
+static void
+tx_teardown(struct vbus_enet_priv *priv)
+{
+ struct ioq *ioq = priv->tx.veq.queue;
+ struct ioq_iterator iter;
+ struct sk_buff *skb;
+ int ret;
+
+ /* forcefully free all outstanding transmissions */
+ while ((skb = __skb_dequeue(&priv->tx.outstanding)))
+ dev_kfree_skb(skb);
+
+ if (!priv->sg)
+ /*
+ * There is nothing else to do for a ring that is not using
+ * scatter-gather
+ */
+ return;
+
+ if (priv->pmtd.enabled) {
+ /*
+ * PMTD mode means we only need to free the pool
+ */
+ kfree(priv->pmtd.pool);
+ return;
+ }
+
+ ret = ioq_iter_init(ioq, &iter, ioq_idxtype_valid, 0);
+ BUG_ON(ret < 0);
+
+ /* seek to position 0 */
+ ret = ioq_iter_seek(&iter, ioq_seek_set, 0, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * free each valid descriptor
+ */
+ while (iter.desc->cookie) {
+ struct venet_sg *vsg = (struct venet_sg *)(unsigned long)iter.desc->cookie;
+
+ iter.desc->valid = 0;
+ wmb();
+
+ iter.desc->ptr = 0;
+ iter.desc->cookie = 0;
+
+ ret = ioq_iter_seek(&iter, ioq_seek_next, 0, 0);
+ BUG_ON(ret < 0);
+
+ kfree(vsg);
+ }
+}
+
+static void
+evq_teardown(struct vbus_enet_priv *priv)
+{
+ if (!priv->evq.enabled)
+ return;
+
+ ioq_put(priv->evq.veq.queue);
+ kfree(priv->evq.pool);
+}
+
+/*
+ * Open and close
+ */
+
+static int
+vbus_enet_open(struct net_device *dev)
+{
+ struct vbus_enet_priv *priv = netdev_priv(dev);
+ int ret;
+
+ ret = devcall(priv, VENET_FUNC_LINKUP, NULL, 0);
+ BUG_ON(ret < 0);
+
+ napi_enable(&priv->napi);
+
+ return 0;
+}
+
+static int
+vbus_enet_stop(struct net_device *dev)
+{
+ struct vbus_enet_priv *priv = netdev_priv(dev);
+ int ret;
+
+ napi_disable(&priv->napi);
+
+ ret = devcall(priv, VENET_FUNC_LINKDOWN, NULL, 0);
+ BUG_ON(ret < 0);
+
+ return 0;
+}
+
+/*
+ * Configuration changes (passed on by ifconfig)
+ */
+static int
+vbus_enet_config(struct net_device *dev, struct ifmap *map)
+{
+ if (dev->flags & IFF_UP) /* can't act on a running interface */
+ return -EBUSY;
+
+ /* Don't allow changing the I/O address */
+ if (map->base_addr != dev->base_addr) {
+ dev_warn(&dev->dev, "Can't change I/O address\n");
+ return -EOPNOTSUPP;
+ }
+
+ /* ignore other fields */
+ return 0;
+}
+
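+/*
+ * Mask further rx notifications and hand the queue to NAPI; the poll
+ * routine re-enables notification once the ring has been drained.
+ */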
+static void
+vbus_enet_schedule_rx(struct vbus_enet_priv *priv)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ if (napi_schedule_prep(&priv->napi)) {
+ /* Disable further interrupts */
+ ioq_notify_disable(priv->rxq.queue, 0);
+ __napi_schedule(&priv->napi);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static int
+vbus_enet_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct vbus_enet_priv *priv = netdev_priv(dev);
+ int ret;
+
+ dev->mtu = new_mtu;
+
+ /*
+ * FLUSHRX will cause the device to flush any outstanding
+ * RX buffers. They will appear to come in as 0 length
+ * packets which we can simply discard and replace with new_mtu
+ * buffers for the future.
+ */
+ ret = devcall(priv, VENET_FUNC_FLUSHRX, NULL, 0);
+ BUG_ON(ret < 0);
+
+ vbus_enet_schedule_rx(priv);
+
+ return 0;
+}
+
+static struct sk_buff *
+vbus_enet_l4ro_import(struct vbus_enet_priv *priv, struct ioq_ring_desc *desc)
+{
+ struct venet_sg *vsg = (struct venet_sg *)(unsigned long)desc->cookie;
+ struct sk_buff *skb = (struct sk_buff *)(unsigned long)vsg->cookie;
+ struct skb_shared_info *sinfo = skb_shinfo(skb);
+ int i;
+
+ rx_pageq_refill(priv, GFP_ATOMIC);
+
+ if (!vsg->len)
+ /*
+ * the device may send a zero-length packet when it is
+ * flushing references on the ring. We can just drop
+ * these on the floor
+ */
+ goto fail;
+
+ /* advance only by the linear portion in IOV[0] */
+ skb_put(skb, vsg->iov[0].len);
+
+ /* skip i=0, since that is the skb->data IOV */
+ for (i = 1; i < vsg->count; i++) {
+ struct venet_iov *iov = &vsg->iov[i];
+ struct page *page = (struct page *)(unsigned long)iov->ptr;
+ skb_frag_t *f = &sinfo->frags[i-1];
+
+ f->page = page;
+ f->page_offset = 0;
+ f->size = iov->len;
+
+ PDEBUG(priv->dev, "SG: Importing %d byte page[%i]\n",
+ f->size, i);
+
+ skb->data_len += f->size;
+ skb->len += f->size;
+ skb->truesize += f->size;
+ sinfo->nr_frags++;
+ }
+
+ if (vsg->flags & VENET_SG_FLAG_NEEDS_CSUM
+ && !skb_partial_csum_set(skb, vsg->csum.start,
+ vsg->csum.offset)) {
+ priv->dev->stats.rx_frame_errors++;
+ goto fail;
+ }
+
+ if (vsg->flags & VENET_SG_FLAG_GSO) {
+ PDEBUG(priv->dev, "L4RO packet detected\n");
+
+ switch (vsg->gso.type) {
+ case VENET_GSO_TYPE_TCPV4:
+ sinfo->gso_type = SKB_GSO_TCPV4;
+ break;
+ case VENET_GSO_TYPE_TCPV6:
+ sinfo->gso_type = SKB_GSO_TCPV6;
+ break;
+ case VENET_GSO_TYPE_UDP:
+ sinfo->gso_type = SKB_GSO_UDP;
+ break;
+ default:
+ PDEBUG(priv->dev, "Illegal L4RO type: %d\n",
+ vsg->gso.type);
+ priv->dev->stats.rx_frame_errors++;
+ goto fail;
+ }
+
+ if (vsg->flags & VENET_SG_FLAG_ECN)
+ sinfo->gso_type |= SKB_GSO_TCP_ECN;
+
+ sinfo->gso_size = vsg->gso.size;
+ if (sinfo->gso_size == 0) {
+ PDEBUG(priv->dev, "Illegal L4RO size: %d\n",
+ vsg->gso.size);
+ priv->dev->stats.rx_frame_errors++;
+ goto fail;
+ }
+
+ /*
+ * Header must be checked, and gso_segs
+ * computed.
+ */
+ sinfo->gso_type |= SKB_GSO_DODGY;
+ sinfo->gso_segs = 0;
+ }
+
+ return skb;
+
+fail:
+ dev_kfree_skb(skb);
+
+ return NULL;
+}
+
+static struct sk_buff *
+vbus_enet_flat_import(struct vbus_enet_priv *priv, struct ioq_ring_desc *desc)
+{
+ struct sk_buff *skb = (struct sk_buff *)(unsigned long)desc->cookie;
+
+ if (!desc->len) {
+ /*
+ * the device may send a zero-length packet when it is
+ * flushing references on the ring. We can just drop
+ * these on the floor
+ */
+ dev_kfree_skb(skb);
+ return NULL;
+ }
+
+ skb_put(skb, le64_to_cpu(desc->len));
+
+ return skb;
+}
+
+/*
+ * The poll implementation.
+ */
+static int
+vbus_enet_poll(struct napi_struct *napi, int budget)
+{
+ struct vbus_enet_priv *priv = napi_to_priv(napi);
+ int npackets = 0;
+ struct ioq_iterator iter;
+ int ret;
+
+ PDEBUG(priv->dev, "polling...\n");
+
+ /* We want to iterate on the head of the in-use index */
+ ret = ioq_iter_init(priv->rxq.queue, &iter, ioq_idxtype_inuse,
+ IOQ_ITER_AUTOUPDATE);
+ BUG_ON(ret < 0);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_head, 0, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * We stop if we have met the quota or there are no more packets.
+ * The EOM is indicated by finding a packet that is still owned by
+ * the south side
+ */
+ while ((npackets < budget) && (!iter.desc->sown)) {
+ struct sk_buff *skb;
+
+ skb = priv->import(priv, iter.desc);
+ if (skb) {
+ /* Maintain stats */
+ npackets++;
+ priv->dev->stats.rx_packets++;
+ priv->dev->stats.rx_bytes += skb->len;
+
+ /* Pass the buffer up to the stack */
+ skb->dev = priv->dev;
+ skb->protocol = eth_type_trans(skb, priv->dev);
+ netif_receive_skb(skb);
+
+ mb();
+ }
+
+ /* Grab a new buffer to put in the ring */
+ rxdesc_alloc(priv, iter.desc, priv->dev->mtu);
+
+ /* Advance the in-use tail */
+ ret = ioq_iter_pop(&iter, 0);
+ BUG_ON(ret < 0);
+ }
+
+ PDEBUG(priv->dev, "%d packets received\n", npackets);
+
+ /*
+ * If we processed all packets, we're done; tell the kernel and
+ * reenable ints
+ */
+ if (ioq_empty(priv->rxq.queue, ioq_idxtype_inuse)) {
+ napi_complete(napi);
+ ioq_notify_enable(priv->rxq.queue, 0);
+ ret = 0;
+ } else
+ /* We couldn't process everything. */
+ ret = 1;
+
+ return ret;
+}
+
+/*
+ * Transmit a packet (called by the kernel)
+ */
+static int
+vbus_enet_tx_start(struct sk_buff *skb, struct net_device *dev)
+{
+ struct vbus_enet_priv *priv = netdev_priv(dev);
+ struct ioq_iterator iter;
+ int ret;
+ unsigned long flags;
+
+ PDEBUG(priv->dev, "sending %d bytes\n", skb->len);
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ if (ioq_full(priv->tx.veq.queue, ioq_idxtype_valid)) {
+ /*
+ * We must flow-control the kernel by disabling the
+ * queue
+ */
+ spin_unlock_irqrestore(&priv->lock, flags);
+ netif_stop_queue(dev);
+ dev_err(&priv->dev->dev, "tx on full queue bug\n");
+ return 1;
+ }
+
+ /*
+ * We want to iterate on the tail of both the "inuse" and "valid" index
+ * so we specify the "both" index
+ */
+ ret = ioq_iter_init(priv->tx.veq.queue, &iter, ioq_idxtype_both,
+ IOQ_ITER_AUTOUPDATE);
+ BUG_ON(ret < 0);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_tail, 0, 0);
+ BUG_ON(ret < 0);
+ BUG_ON(iter.desc->sown);
+
+ if (priv->sg) {
+ struct venet_sg *vsg = (struct venet_sg *)(unsigned long)iter.desc->cookie;
+ struct scatterlist sgl[MAX_SKB_FRAGS+1];
+ struct scatterlist *sg;
+ int count, maxcount = ARRAY_SIZE(sgl);
+
+ sg_init_table(sgl, maxcount);
+
+ memset(vsg, 0, sizeof(*vsg));
+
+ vsg->cookie = (u64)(unsigned long)skb;
+ vsg->len = skb->len;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ vsg->flags |= VENET_SG_FLAG_NEEDS_CSUM;
+ vsg->csum.start = skb->csum_start - skb_headroom(skb);
+ vsg->csum.offset = skb->csum_offset;
+ }
+
+ if (skb_is_gso(skb)) {
+ struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+ vsg->flags |= VENET_SG_FLAG_GSO;
+
+ vsg->gso.hdrlen = skb_headlen(skb);
+ vsg->gso.size = sinfo->gso_size;
+ if (sinfo->gso_type & SKB_GSO_TCPV4)
+ vsg->gso.type = VENET_GSO_TYPE_TCPV4;
+ else if (sinfo->gso_type & SKB_GSO_TCPV6)
+ vsg->gso.type = VENET_GSO_TYPE_TCPV6;
+ else if (sinfo->gso_type & SKB_GSO_UDP)
+ vsg->gso.type = VENET_GSO_TYPE_UDP;
+ else
+ panic("Virtual-Ethernet: unknown GSO type " \
+ "0x%x\n", sinfo->gso_type);
+
+ if (sinfo->gso_type & SKB_GSO_TCP_ECN)
+ vsg->flags |= VENET_SG_FLAG_ECN;
+ }
+
+ count = skb_to_sgvec(skb, sgl, 0, skb->len);
+
+ BUG_ON(count > maxcount);
+
+ for (sg = &sgl[0]; sg; sg = sg_next(sg)) {
+ struct venet_iov *iov = &vsg->iov[vsg->count++];
+
+ iov->len = sg->length;
+ iov->ptr = (u64)sg_phys(sg);
+ }
+
+ iter.desc->len = cpu_to_le64(VSG_DESC_SIZE(vsg->count));
+
+ } else {
+ /*
+ * non scatter-gather mode: simply put the skb right onto the
+ * ring.
+ */
+ iter.desc->cookie = (u64)(unsigned long)skb;
+ iter.desc->len = cpu_to_le64(skb->len);
+ iter.desc->ptr = cpu_to_le64(__pa(skb->data));
+ }
+
+ iter.desc->valid = 1;
+
+ priv->dev->stats.tx_packets++;
+ priv->dev->stats.tx_bytes += skb->len;
+
+ skb_queue_tail(&priv->tx.outstanding, skb);
+
+ /*
+ * This advances both indexes together implicitly, and then
+ * signals the south side to consume the packet
+ */
+ ret = ioq_iter_push(&iter, 0);
+ BUG_ON(ret < 0);
+
+ dev->trans_start = jiffies; /* save the timestamp */
+
+ if (ioq_full(priv->tx.veq.queue, ioq_idxtype_valid)) {
+ /*
+ * If the queue is congested, we must flow-control the kernel
+ */
+ PDEBUG(priv->dev, "backpressure tx queue\n");
+ netif_stop_queue(dev);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ return 0;
+}
+
+/* assumes priv->lock held */
+static void
+vbus_enet_skb_complete(struct vbus_enet_priv *priv, struct sk_buff *skb)
+{
+ PDEBUG(priv->dev, "completed sending %d bytes\n",
+ skb->len);
+
+ skb_unlink(skb, &priv->tx.outstanding);
+ dev_kfree_skb(skb);
+}
+
+/*
+ * reclaim any outstanding completed tx packets
+ *
+ * assumes priv->lock held
+ */
+static struct sk_buff *
+vbus_enet_tx_reap_one(struct vbus_enet_priv *priv)
+{
+ struct sk_buff *skb = NULL;
+ struct ioq_iterator iter;
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ /*
+ * We want to iterate on the head of the valid index, but we
+ * do not want the iter_pop (below) to flip the ownership, so
+ * we set the NOFLIPOWNER option
+ */
+ ret = ioq_iter_init(priv->tx.veq.queue, &iter, ioq_idxtype_valid,
+ IOQ_ITER_NOFLIPOWNER);
+ BUG_ON(ret < 0);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_head, 0, 0);
+ BUG_ON(ret < 0);
+
+ if (iter.desc->valid && !iter.desc->sown) {
+
+ if (priv->sg) {
+ struct venet_sg *vsg;
+
+ vsg = (struct venet_sg *)(unsigned long)iter.desc->cookie;
+ skb = (struct sk_buff *)(unsigned long)vsg->cookie;
+ } else
+ skb = (struct sk_buff *)(unsigned long)iter.desc->cookie;
+
+ /* Reset the descriptor */
+ iter.desc->valid = 0;
+
+ /* Advance the valid-index head */
+ ret = ioq_iter_pop(&iter, 0);
+ BUG_ON(ret < 0);
+ }
+
+ /*
+ * If we were previously stopped due to flow control, restart the
+ * processing
+ */
+ if (netif_queue_stopped(priv->dev)
+ && !ioq_full(priv->tx.veq.queue, ioq_idxtype_valid)) {
+ PDEBUG(priv->dev, "re-enabling tx queue\n");
+ netif_wake_queue(priv->dev);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ return skb;
+}
+
+static void
+vbus_enet_tx_reap(struct vbus_enet_priv *priv)
+{
+ struct sk_buff *skb;
+
+ while ((skb = vbus_enet_tx_reap_one(priv))) {
+ if (!priv->evq.txc)
+ /*
+ * We are responsible for freeing the packet upon
+ * reap if TXC is not enabled
+ */
+ vbus_enet_skb_complete(priv, skb);
+ }
+}
+
+static void
+vbus_enet_timeout(struct net_device *dev)
+{
+ struct vbus_enet_priv *priv = netdev_priv(dev);
+
+ dev_dbg(&dev->dev, "Transmit timeout\n");
+
+ vbus_enet_tx_reap(priv);
+}
+
+static void
+rx_isr(struct ioq_notifier *notifier)
+{
+ struct vbus_enet_priv *priv;
+ struct net_device *dev;
+
+ priv = container_of(notifier, struct vbus_enet_priv, rxq.notifier);
+ dev = priv->dev;
+
+ if (!ioq_empty(priv->rxq.queue, ioq_idxtype_inuse))
+ vbus_enet_schedule_rx(priv);
+}
+
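+/*
+ * tx-completion reaping runs in tasklet context: tx_isr() masks the
+ * queue notification and schedules this deferred handler.
+ */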
+static void
+deferred_tx_isr(unsigned long data)
+{
+ struct vbus_enet_priv *priv = (struct vbus_enet_priv *)data;
+
+ PDEBUG(priv->dev, "deferred_tx_isr\n");
+
+ vbus_enet_tx_reap(priv);
+
+ ioq_notify_enable(priv->tx.veq.queue, 0);
+}
+
+static void
+tx_isr(struct ioq_notifier *notifier)
+{
+ struct vbus_enet_priv *priv;
+
+ priv = container_of(notifier, struct vbus_enet_priv, tx.veq.notifier);
+
+ PDEBUG(priv->dev, "tx_isr\n");
+
+ ioq_notify_disable(priv->tx.veq.queue, 0);
+ tasklet_schedule(&priv->tx.task);
+}
+
+static void
+evq_linkstate_event(struct vbus_enet_priv *priv,
+ struct venet_event_header *header)
+{
+ struct venet_event_linkstate *event =
+ (struct venet_event_linkstate *)header;
+
+ switch (event->state) {
+ case 0:
+ netif_carrier_off(priv->dev);
+ break;
+ case 1:
+ netif_carrier_on(priv->dev);
+ break;
+ default:
+ break;
+ }
+}
+
+static void
+evq_txc_event(struct vbus_enet_priv *priv,
+ struct venet_event_header *header)
+{
+ struct venet_event_txc *event =
+ (struct venet_event_txc *)header;
+
+ vbus_enet_tx_reap(priv);
+
+ vbus_enet_skb_complete(priv, (struct sk_buff *)(unsigned long)event->cookie);
+}
+
+static void
+deferred_evq_isr(unsigned long data)
+{
+ struct vbus_enet_priv *priv = (struct vbus_enet_priv *)data;
+ int nevents = 0;
+ struct ioq_iterator iter;
+ int ret;
+
+ PDEBUG(priv->dev, "evq: polling...\n");
+
+ /* We want to iterate on the head of the in-use index */
+ ret = ioq_iter_init(priv->evq.veq.queue, &iter, ioq_idxtype_inuse,
+ IOQ_ITER_AUTOUPDATE);
+ BUG_ON(ret < 0);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_head, 0, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * The EOM is indicated by finding a packet that is still owned by
+ * the south side
+ */
+ while (!iter.desc->sown) {
+ struct venet_event_header *header;
+
+ header = (struct venet_event_header *)(unsigned long)iter.desc->cookie;
+
+ switch (header->id) {
+ case VENET_EVENT_LINKSTATE:
+ evq_linkstate_event(priv, header);
+ break;
+ case VENET_EVENT_TXC:
+ evq_txc_event(priv, header);
+ break;
+ default:
+ panic("venet: unexpected event id:%d of size %d\n",
+ header->id, header->size);
+ break;
+ }
+
+ memset((void *)(unsigned long)iter.desc->cookie, 0, priv->evq.evsize);
+
+ /* Advance the in-use tail */
+ ret = ioq_iter_pop(&iter, 0);
+ BUG_ON(ret < 0);
+
+ nevents++;
+ }
+
+ PDEBUG(priv->dev, "%d events received\n", nevents);
+
+ ioq_notify_enable(priv->evq.veq.queue, 0);
+}
+
+static void
+evq_isr(struct ioq_notifier *notifier)
+{
+ struct vbus_enet_priv *priv;
+
+ priv = container_of(notifier, struct vbus_enet_priv, evq.veq.notifier);
+
+ PDEBUG(priv->dev, "evq_isr\n");
+
+ ioq_notify_disable(priv->evq.veq.queue, 0);
+ tasklet_schedule(&priv->evq.task);
+}
+
+static int
+vbus_enet_sg_negcap(struct vbus_enet_priv *priv)
+{
+ struct net_device *dev = priv->dev;
+ struct venet_capabilities caps;
+ int ret;
+
+ memset(&caps, 0, sizeof(caps));
+
+ if (sg_enabled) {
+ caps.gid = VENET_CAP_GROUP_SG;
+ caps.bits |= (VENET_CAP_SG|VENET_CAP_TSO4|VENET_CAP_TSO6
+ |VENET_CAP_ECN|VENET_CAP_PMTD);
+ /* note: exclude UFO for now due to stack bug */
+ }
+
+ ret = devcall(priv, VENET_FUNC_NEGCAP, &caps, sizeof(caps));
+ if (ret < 0)
+ return ret;
+
+ if (caps.bits & VENET_CAP_SG) {
+ priv->sg = true;
+
+ dev->features |= NETIF_F_SG|NETIF_F_HW_CSUM|NETIF_F_FRAGLIST;
+
+ if (caps.bits & VENET_CAP_TSO4)
+ dev->features |= NETIF_F_TSO;
+ if (caps.bits & VENET_CAP_UFO)
+ dev->features |= NETIF_F_UFO;
+ if (caps.bits & VENET_CAP_TSO6)
+ dev->features |= NETIF_F_TSO6;
+ if (caps.bits & VENET_CAP_ECN)
+ dev->features |= NETIF_F_TSO_ECN;
+
+ if (caps.bits & VENET_CAP_PMTD)
+ priv->pmtd.enabled = true;
+ }
+
+ return 0;
+}
+
+static int
+vbus_enet_evq_negcap(struct vbus_enet_priv *priv, unsigned long count)
+{
+ struct venet_capabilities caps;
+ int ret;
+
+ memset(&caps, 0, sizeof(caps));
+
+ caps.gid = VENET_CAP_GROUP_EVENTQ;
+ caps.bits |= VENET_CAP_EVQ_LINKSTATE;
+ caps.bits |= VENET_CAP_EVQ_TXC;
+
+ ret = devcall(priv, VENET_FUNC_NEGCAP, &caps, sizeof(caps));
+ if (ret < 0)
+ return ret;
+
+ if (caps.bits) {
+ struct vbus_device_proxy *dev = priv->vdev;
+ struct venet_eventq_query query;
+ size_t poollen;
+ struct ioq_iterator iter;
+ char *pool;
+ int i;
+
+ priv->evq.enabled = true;
+
+ if (caps.bits & VENET_CAP_EVQ_LINKSTATE) {
+ /*
+ * We will assume there is no carrier until we get
+ * an event telling us otherwise
+ */
+ netif_carrier_off(priv->dev);
+ priv->evq.linkstate = true;
+ }
+
+ if (caps.bits & VENET_CAP_EVQ_TXC)
+ priv->evq.txc = true;
+
+ memset(&query, 0, sizeof(query));
+
+ ret = devcall(priv, VENET_FUNC_EVQQUERY, &query, sizeof(query));
+ if (ret < 0)
+ return ret;
+
+ priv->evq.evsize = query.evsize;
+ poollen = query.evsize * count;
+
+ pool = kzalloc(poollen, GFP_KERNEL | GFP_DMA);
+ if (!pool)
+ return -ENOMEM;
+
+ priv->evq.pool = pool;
+
+ ret = dev->ops->shm(dev, NULL, query.dpid, 0,
+ pool, poollen, NULL, NULL, 0);
+ if (ret < 0)
+ return ret;
+
+ queue_init(priv, &priv->evq.veq, "evq",
+ query.qid, count, evq_isr);
+
+ ret = ioq_iter_init(priv->evq.veq.queue,
+ &iter, ioq_idxtype_valid, 0);
+ BUG_ON(ret < 0);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_set, 0, 0);
+ BUG_ON(ret < 0);
+
+ /* Now populate each descriptor with an empty event */
+ for (i = 0; i < count; i++) {
+ size_t offset = (i * query.evsize);
+ void *addr = &priv->evq.pool[offset];
+
+ iter.desc->ptr = cpu_to_le64(offset);
+ iter.desc->cookie = (u64)(unsigned long)addr;
+ iter.desc->len = cpu_to_le64(query.evsize);
+
+ ret = ioq_iter_push(&iter, 0);
+ BUG_ON(ret < 0);
+ }
+
+ /* Finally, enable interrupts */
+ tasklet_init(&priv->evq.task, deferred_evq_isr,
+ (unsigned long)priv);
+ ioq_notify_enable(priv->evq.veq.queue, 0);
+ }
+
+ return 0;
+}
+
+static int
+vbus_enet_l4ro_negcap(struct vbus_enet_priv *priv, unsigned long count)
+{
+ struct venet_capabilities caps;
+ int ret;
+
+ memset(&caps, 0, sizeof(caps));
+
+ caps.gid = VENET_CAP_GROUP_L4RO;
+ caps.bits |= (VENET_CAP_SG|VENET_CAP_TSO4|VENET_CAP_TSO6
+ |VENET_CAP_ECN);
+
+ ret = devcall(priv, VENET_FUNC_NEGCAP, &caps, sizeof(caps));
+ if (ret < 0) {
+ printk(KERN_ERR "Error negotiating L4RO: %d\n", ret);
+ return ret;
+ }
+
+ if (caps.bits & VENET_CAP_SG) {
+ struct vbus_device_proxy *dev = priv->vdev;
+ size_t poollen = SG_DESC_SIZE * count;
+ struct venet_l4ro_query query;
+ char *pool;
+
+ memset(&query, 0, sizeof(query));
+
+ ret = devcall(priv, VENET_FUNC_L4ROQUERY, &query, sizeof(query));
+ if (ret < 0) {
+ printk(KERN_ERR "Error querying L4RO: %d\n", ret);
+ return ret;
+ }
+
+ pool = kzalloc(poollen, GFP_KERNEL | GFP_DMA);
+ if (!pool)
+ return -ENOMEM;
+
+ /*
+ * pre-mapped descriptor pool
+ */
+ ret = dev->ops->shm(dev, NULL, query.dpid, 0,
+ pool, poollen, NULL, NULL, 0);
+ if (ret < 0) {
+ printk(KERN_ERR "Error registering L4RO pool: %d\n",
+ ret);
+ kfree(pool);
+ return ret;
+ }
+
+ /*
+ * page-queue: contains a ring of arbitrary pages for
+ * consumption by the host for when the SG::IOV count exceeds
+ * one MTU frame. All we need to do is keep it populated
+ * with free pages.
+ */
+ queue_init(priv, &priv->l4ro.pageq, "pageq", query.pqid,
+ count, NULL);
+
+ priv->l4ro.pool = pool;
+ priv->l4ro.available = true;
+ }
+
+ return 0;
+}
+
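+/*
+ * Negotiate all optional capability groups (SG/GSO, eventq, L4RO)
+ * with the host before the rings are brought up.
+ */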
+static int
+vbus_enet_negcap(struct vbus_enet_priv *priv)
+{
+ int ret;
+
+ ret = vbus_enet_sg_negcap(priv);
+ if (ret < 0)
+ return ret;
+
+ ret = vbus_enet_evq_negcap(priv, tx_ringlen);
+ if (ret < 0)
+ return ret;
+
+ ret = vbus_enet_l4ro_negcap(priv, rx_ringlen);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+static int vbus_enet_set_tx_csum(struct net_device *dev, u32 data)
+{
+ struct vbus_enet_priv *priv = netdev_priv(dev);
+
+ if (data && !priv->sg)
+ return -ENOSYS;
+
+ return ethtool_op_set_tx_hw_csum(dev, data);
+}
+
+static struct ethtool_ops vbus_enet_ethtool_ops = {
+ .set_tx_csum = vbus_enet_set_tx_csum,
+ .set_sg = ethtool_op_set_sg,
+ .set_tso = ethtool_op_set_tso,
+ .get_link = ethtool_op_get_link,
+};
+
+static const struct net_device_ops vbus_enet_netdev_ops = {
+ .ndo_open = vbus_enet_open,
+ .ndo_stop = vbus_enet_stop,
+ .ndo_set_config = vbus_enet_config,
+ .ndo_start_xmit = vbus_enet_tx_start,
+ .ndo_change_mtu = vbus_enet_change_mtu,
+ .ndo_tx_timeout = vbus_enet_timeout,
+ .ndo_set_mac_address = eth_mac_addr,
+ .ndo_validate_addr = eth_validate_addr,
+};
+
+/*
+ * This is called whenever a new vbus_device_proxy is added to the vbus
+ * with the matching VENET_ID
+ */
+static int
+vbus_enet_probe(struct vbus_device_proxy *vdev)
+{
+ struct net_device *dev;
+ struct vbus_enet_priv *priv;
+ int ret;
+
+ printk(KERN_INFO "VENET: Found new device at %lld\n", vdev->id);
+
+ ret = vdev->ops->open(vdev, VENET_VERSION, 0);
+ if (ret < 0)
+ return ret;
+
+ dev = alloc_etherdev(sizeof(struct vbus_enet_priv));
+ if (!dev)
+ return -ENOMEM;
+
+ /*
+ * establish our device-name early so we can incorporate it into
+ * the signal-path names, etc
+ */
+ rtnl_lock();
+
+ ret = dev_alloc_name(dev, dev->name);
+ if (ret < 0)
+ goto out_free;
+
+ priv = netdev_priv(dev);
+
+ spin_lock_init(&priv->lock);
+ priv->dev = dev;
+ priv->vdev = vdev;
+
+ ret = vbus_enet_negcap(priv);
+ if (ret < 0) {
+ printk(KERN_INFO "VENET: Error negotiating capabilities for " \
+ "%lld\n",
+ priv->vdev->id);
+ goto out_free;
+ }
+
+ if (priv->l4ro.available)
+ priv->import = &vbus_enet_l4ro_import;
+ else
+ priv->import = &vbus_enet_flat_import;
+
+ skb_queue_head_init(&priv->tx.outstanding);
+
+ queue_init(priv, &priv->rxq, "rx", VENET_QUEUE_RX, rx_ringlen,
+ rx_isr);
+ queue_init(priv, &priv->tx.veq, "tx", VENET_QUEUE_TX, tx_ringlen,
+ tx_isr);
+
+ rx_setup(priv);
+ tx_setup(priv);
+
+ ioq_notify_enable(priv->rxq.queue, 0); /* enable rx interrupts */
+
+ if (!priv->evq.txc) {
+ /*
+ * If the TXC feature is present, we will receive our
+ * tx-complete notification via the event-channel. Therefore,
+ * we only enable txq interrupts if the TXC feature is not
+ * present.
+ */
+ tasklet_init(&priv->tx.task, deferred_tx_isr,
+ (unsigned long)priv);
+ ioq_notify_enable(priv->tx.veq.queue, 0);
+ }
+
+ dev->netdev_ops = &vbus_enet_netdev_ops;
+ dev->watchdog_timeo = 5 * HZ;
+ SET_ETHTOOL_OPS(dev, &vbus_enet_ethtool_ops);
+ SET_NETDEV_DEV(dev, &vdev->dev);
+
+ netif_napi_add(dev, &priv->napi, vbus_enet_poll, 128);
+
+ ret = devcall(priv, VENET_FUNC_MACQUERY, priv->dev->dev_addr, ETH_ALEN);
+ if (ret < 0) {
+ printk(KERN_INFO "VENET: Error obtaining MAC address for " \
+ "%lld\n",
+ priv->vdev->id);
+ goto out_free;
+ }
+
+ dev->features |= NETIF_F_HIGHDMA;
+
+ ret = register_netdevice(dev);
+ if (ret < 0) {
+ printk(KERN_INFO "VENET: error %i registering device \"%s\"\n",
+ ret, dev->name);
+ goto out_free;
+ }
+
+ rtnl_unlock();
+
+ vdev->priv = priv;
+
+ return 0;
+
+ out_free:
+ rtnl_unlock();
+
+ free_netdev(dev);
+
+ return ret;
+}
+
+static int
+vbus_enet_remove(struct vbus_device_proxy *vdev)
+{
+ struct vbus_enet_priv *priv = (struct vbus_enet_priv *)vdev->priv;
+ struct vbus_device_proxy *dev = priv->vdev;
+
+ unregister_netdev(priv->dev);
+ napi_disable(&priv->napi);
+
+ rx_teardown(priv);
+ ioq_put(priv->rxq.queue);
+
+ tx_teardown(priv);
+ ioq_put(priv->tx.veq.queue);
+
+ if (priv->evq.enabled)
+ evq_teardown(priv);
+
+ dev->ops->close(dev, 0);
+
+ free_netdev(priv->dev);
+
+ return 0;
+}
+
+/*
+ * Finally, the module stuff
+ */
+
+static struct vbus_driver_ops vbus_enet_driver_ops = {
+ .probe = vbus_enet_probe,
+ .remove = vbus_enet_remove,
+};
+
+static struct vbus_driver vbus_enet_driver = {
+ .type = VENET_TYPE,
+ .owner = THIS_MODULE,
+ .ops = &vbus_enet_driver_ops,
+};
+
+static __init int
+vbus_enet_init_module(void)
+{
+ printk(KERN_INFO "Virtual Ethernet: Copyright (C) 2009 Novell, Gregory Haskins\n");
+ printk(KERN_DEBUG "VENET: Using %d/%d queue depth\n",
+ rx_ringlen, tx_ringlen);
+ return vbus_driver_register(&vbus_enet_driver);
+}
+
+static __exit void
+vbus_enet_cleanup(void)
+{
+ vbus_driver_unregister(&vbus_enet_driver);
+}
+
+module_init(vbus_enet_init_module);
+module_exit(vbus_enet_cleanup);
+
+VBUS_DRIVER_AUTOPROBE(VENET_TYPE);
diff --git a/drivers/vbus/Kconfig b/drivers/vbus/Kconfig
new file mode 100644
index 000000000000..f51cba10913e
--- /dev/null
+++ b/drivers/vbus/Kconfig
@@ -0,0 +1,25 @@
+#
+# Virtual-Bus (VBus) driver configuration
+#
+
+config VBUS_PROXY
+ bool "Virtual-Bus support"
+ select SHM_SIGNAL
+ select IOQ
+ default n
+ help
+ Adds support for virtual-bus model drivers in a guest to connect
+ to host-side virtual-bus resources. If you are using this kernel
+ in a virtualization solution which implements virtual-bus devices
+ on the backend, say Y. If unsure, say N.
+
+config VBUS_PCIBRIDGE
+ bool "PCI to Virtual-Bus bridge"
+ depends on PCI
+ depends on VBUS_PROXY
+ select IOQ
+ default n
+ help
+ Provides a way to bridge host-side vbus devices via a PCI-BRIDGE
+ object. If you are running virtualization with vbus devices on the
+ host, and the vbus is exposed via PCI, say Y. Otherwise, say N.
diff --git a/drivers/vbus/Makefile b/drivers/vbus/Makefile
new file mode 100644
index 000000000000..944b7f1fec90
--- /dev/null
+++ b/drivers/vbus/Makefile
@@ -0,0 +1,6 @@
+
+vbus-proxy-objs += bus-proxy.o
+obj-$(CONFIG_VBUS_PROXY) += vbus-proxy.o
+
+vbus-pcibridge-objs += pci-bridge.o
+obj-$(CONFIG_VBUS_PCIBRIDGE) += vbus-pcibridge.o
diff --git a/drivers/vbus/bus-proxy.c b/drivers/vbus/bus-proxy.c
new file mode 100644
index 000000000000..ae11f679d34e
--- /dev/null
+++ b/drivers/vbus/bus-proxy.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright 2009 Novell. All Rights Reserved.
+ *
+ * Author:
+ * Gregory Haskins <ghaskins@novell.com>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vbus_driver.h>
+
+MODULE_AUTHOR("Gregory Haskins");
+MODULE_LICENSE("GPL");
+
+#define VBUS_PROXY_NAME "vbus-proxy"
+
+static struct vbus_device_proxy *to_dev(struct device *_dev)
+{
+ return _dev ? container_of(_dev, struct vbus_device_proxy, dev) : NULL;
+}
+
+static struct vbus_driver *to_drv(struct device_driver *_drv)
+{
+ return container_of(_drv, struct vbus_driver, drv);
+}
+
+/*
+ * This function is invoked whenever a new driver and/or device is added
+ * to check if there is a match
+ */
+static int vbus_dev_proxy_match(struct device *_dev, struct device_driver *_drv)
+{
+ struct vbus_device_proxy *dev = to_dev(_dev);
+ struct vbus_driver *drv = to_drv(_drv);
+
+ return !strcmp(dev->type, drv->type);
+}
+
+static int vbus_dev_proxy_uevent(struct device *_dev, struct kobj_uevent_env *env)
+{
+ struct vbus_device_proxy *dev = to_dev(_dev);
+
+ if (add_uevent_var(env, "MODALIAS=vbus-proxy:%s", dev->type))
+ return -ENOMEM;
+
+ return 0;
+}
+
+/*
+ * This function is invoked after the bus infrastructure has already made a
+ * match. The device will contain a reference to the paired driver which
+ * we will extract.
+ */
+static int vbus_dev_proxy_probe(struct device *_dev)
+{
+ int ret = 0;
+ struct vbus_device_proxy *dev = to_dev(_dev);
+ struct vbus_driver *drv = to_drv(_dev->driver);
+
+ if (drv->ops->probe)
+ ret = drv->ops->probe(dev);
+
+ return ret;
+}
+
+static struct bus_type vbus_proxy = {
+ .name = VBUS_PROXY_NAME,
+ .match = vbus_dev_proxy_match,
+ .uevent = vbus_dev_proxy_uevent,
+};
+
+static struct device vbus_proxy_rootdev = {
+ .parent = NULL,
+ .init_name = VBUS_PROXY_NAME,
+};
+
+static int __init vbus_init(void)
+{
+ int ret;
+
+ ret = bus_register(&vbus_proxy);
+ BUG_ON(ret < 0);
+
+ ret = device_register(&vbus_proxy_rootdev);
+ BUG_ON(ret < 0);
+
+ return 0;
+}
+
+postcore_initcall(vbus_init);
+
+static void device_release(struct device *dev)
+{
+ struct vbus_device_proxy *_dev;
+
+ _dev = container_of(dev, struct vbus_device_proxy, dev);
+
+ _dev->ops->release(_dev);
+}
+
+static ssize_t _show_modalias(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "vbus-proxy:%s\n", to_dev(dev)->type);
+}
+static DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, _show_modalias, NULL);
+
+int vbus_device_proxy_register(struct vbus_device_proxy *new)
+{
+ int ret;
+
+ new->dev.parent = &vbus_proxy_rootdev;
+ new->dev.bus = &vbus_proxy;
+ new->dev.release = &device_release;
+
+ ret = device_register(&new->dev);
+ if (ret < 0)
+ return ret;
+
+ ret = device_create_file(&new->dev, &dev_attr_modalias);
+ if (ret < 0) {
+ device_unregister(&new->dev);
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(vbus_device_proxy_register);
+
+void vbus_device_proxy_unregister(struct vbus_device_proxy *dev)
+{
+ device_remove_file(&dev->dev, &dev_attr_modalias);
+ device_unregister(&dev->dev);
+}
+EXPORT_SYMBOL_GPL(vbus_device_proxy_unregister);
+
+static int match_device_id(struct device *_dev, void *data)
+{
+ struct vbus_device_proxy *dev = to_dev(_dev);
+ u64 id = *(u64 *)data;
+
+ return dev->id == id;
+}
+
+struct vbus_device_proxy *vbus_device_proxy_find(u64 id)
+{
+ struct device *dev;
+
+ dev = bus_find_device(&vbus_proxy, NULL, &id, &match_device_id);
+
+ return to_dev(dev);
+}
+EXPORT_SYMBOL_GPL(vbus_device_proxy_find);
+
+int vbus_driver_register(struct vbus_driver *new)
+{
+ new->drv.bus = &vbus_proxy;
+ new->drv.name = new->type;
+ new->drv.owner = new->owner;
+ new->drv.probe = vbus_dev_proxy_probe;
+
+ return driver_register(&new->drv);
+}
+EXPORT_SYMBOL_GPL(vbus_driver_register);
+
+void vbus_driver_unregister(struct vbus_driver *drv)
+{
+ driver_unregister(&drv->drv);
+}
+EXPORT_SYMBOL_GPL(vbus_driver_unregister);
+
+/*
+ *---------------------------------
+ * driver-side IOQ helper
+ *---------------------------------
+ */
+static void
+vbus_driver_ioq_release(struct ioq *ioq)
+{
+ kfree(ioq->head_desc);
+ kfree(ioq);
+}
+
+static struct ioq_ops vbus_driver_ioq_ops = {
+ .release = vbus_driver_ioq_release,
+};
+
+
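+/*
+ * Allocate an ioq of @count descriptors for @dev: the ring head comes
+ * from GFP_DMA kernel memory, is registered with the host as shared
+ * memory (with an embedded shm_signal), and is wrapped in an ioq with
+ * north-side locality.
+ */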
+int vbus_driver_ioq_alloc(struct vbus_device_proxy *dev, const char *name,
+ int id, int prio, size_t count, struct ioq **ioq)
+{
+ struct ioq *_ioq;
+ struct ioq_ring_head *head = NULL;
+ struct shm_signal *signal = NULL;
+ size_t len = IOQ_HEAD_DESC_SIZE(count);
+ int ret = -ENOMEM;
+
+ _ioq = kzalloc(sizeof(*_ioq), GFP_KERNEL);
+ if (!_ioq)
+ goto error;
+
+ head = kzalloc(len, GFP_KERNEL | GFP_DMA);
+ if (!head)
+ goto error;
+
+ head->magic = IOQ_RING_MAGIC;
+ head->ver = IOQ_RING_VER;
+ head->count = cpu_to_le32(count);
+
+ ret = dev->ops->shm(dev, name, id, prio, head, len,
+ &head->signal, &signal, 0);
+ if (ret < 0)
+ goto error;
+
+ ioq_init(_ioq,
+ &vbus_driver_ioq_ops,
+ ioq_locality_north,
+ head,
+ signal,
+ count);
+
+ *ioq = _ioq;
+
+ return 0;
+
+ error:
+ kfree(_ioq);
+ kfree(head);
+
+ if (signal)
+ shm_signal_put(signal);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(vbus_driver_ioq_alloc);
diff --git a/drivers/vbus/pci-bridge.c b/drivers/vbus/pci-bridge.c
new file mode 100644
index 000000000000..36de7c48891c
--- /dev/null
+++ b/drivers/vbus/pci-bridge.c
@@ -0,0 +1,1016 @@
+/*
+ * Copyright (C) 2009 Novell. All Rights Reserved.
+ *
+ * Author:
+ * Gregory Haskins <ghaskins@novell.com>
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/mm.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/ioq.h>
+#include <linux/interrupt.h>
+#include <linux/vbus_driver.h>
+#include <linux/vbus_pci.h>
+
+MODULE_AUTHOR("Gregory Haskins");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1");
+
+#define VBUS_PCI_NAME "pci-to-vbus-bridge"
+
+struct vbus_pci {
+ spinlock_t lock;
+ struct pci_dev *dev;
+ struct ioq eventq;
+ struct vbus_pci_event *ring;
+ struct vbus_pci_regs *regs;
+ struct vbus_pci_signals *signals;
+ int irq;
+ bool enabled;
+ struct {
+ struct dentry *fs;
+ int events;
+ int qnotify;
+ int qinject;
+ int notify;
+ int inject;
+ int bridgecalls;
+ int buscalls;
+ } stats;
+};
+
+static struct vbus_pci vbus_pci;
+
+struct vbus_pci_device {
+ char type[VBUS_MAX_DEVTYPE_LEN];
+ u64 handle;
+ struct list_head shms;
+ struct vbus_device_proxy vdev;
+ struct work_struct drop;
+};
+
+static DEFINE_PER_CPU(struct vbus_pci_fastcall_desc, vbus_pci_percpu_fastcall)
+____cacheline_aligned;
+
+/*
+ * -------------------
+ * common routines
+ * -------------------
+ */
+
+static int
+vbus_pci_bridgecall(unsigned long nr, void *data, unsigned long len)
+{
+ struct vbus_pci_call_desc params = {
+ .vector = nr,
+ .len = len,
+ .datap = __pa(data),
+ };
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&vbus_pci.lock, flags);
+
+ memcpy_toio(&vbus_pci.regs->bridgecall, &params, sizeof(params));
+ ret = ioread32(&vbus_pci.regs->bridgecall);
+
+ spin_unlock_irqrestore(&vbus_pci.lock, flags);
+
+ vbus_pci.stats.bridgecalls++;
+
+ return ret;
+}
+
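+/*
+ * Fast hypercall path: parameters are staged in this cpu's
+ * vbus_pci_percpu_fastcall descriptor and the host is kicked by
+ * writing the cpu id to the ->fastcall register.  Preemption is
+ * disabled so the per-cpu descriptor cannot be reused mid-call.
+ */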
+static int
+vbus_pci_buscall(unsigned long nr, void *data, unsigned long len)
+{
+ struct vbus_pci_fastcall_desc *params;
+ int ret;
+
+ preempt_disable();
+
+ params = &get_cpu_var(vbus_pci_percpu_fastcall);
+
+ params->call.vector = nr;
+ params->call.len = len;
+ params->call.datap = __pa(data);
+
+ iowrite32(smp_processor_id(), &vbus_pci.signals->fastcall);
+
+ ret = params->result;
+
+ preempt_enable();
+
+ vbus_pci.stats.buscalls++;
+
+ return ret;
+}
+
+static struct vbus_pci_device *
+to_dev(struct vbus_device_proxy *vdev)
+{
+ return container_of(vdev, struct vbus_pci_device, vdev);
+}
+
+static void
+_signal_init(struct shm_signal *signal, struct shm_signal_desc *desc,
+ struct shm_signal_ops *ops)
+{
+ desc->magic = SHM_SIGNAL_MAGIC;
+ desc->ver = SHM_SIGNAL_VER;
+
+ shm_signal_init(signal, shm_locality_north, ops, desc);
+}
+
+/*
+ * -------------------
+ * _signal
+ * -------------------
+ */
+
+struct _signal {
+ char name[64];
+ struct vbus_pci *pcivbus;
+ struct shm_signal signal;
+ u32 handle;
+ struct rb_node node;
+ struct list_head list;
+ int irq;
+ struct irq_desc *desc;
+};
+
+static struct _signal *
+to_signal(struct shm_signal *signal)
+{
+ return container_of(signal, struct _signal, signal);
+}
+
+static int
+_signal_inject(struct shm_signal *signal)
+{
+ struct _signal *_signal = to_signal(signal);
+
+ vbus_pci.stats.inject++;
+ iowrite32(_signal->handle, &vbus_pci.signals->shmsignal);
+
+ return 0;
+}
+
+static void
+_signal_release(struct shm_signal *signal)
+{
+ struct _signal *_signal = to_signal(signal);
+
+ kfree(_signal);
+}
+
+static struct shm_signal_ops _signal_ops = {
+ .inject = _signal_inject,
+ .release = _signal_release,
+};
+
+static void shmsignal_disconnect(struct _signal *_signal);
+
+/*
+ * -------------------
+ * vbus_device_proxy routines
+ * -------------------
+ */
+
+static int
+vbus_pci_device_open(struct vbus_device_proxy *vdev, int version, int flags)
+{
+ struct vbus_pci_device *dev = to_dev(vdev);
+ struct vbus_pci_deviceopen params;
+ int ret;
+
+ if (dev->handle)
+ return -EINVAL;
+
+ params.devid = vdev->id;
+ params.version = version;
+
+ ret = vbus_pci_buscall(VBUS_PCI_HC_DEVOPEN,
+ &params, sizeof(params));
+ if (ret < 0)
+ return ret;
+
+ dev->handle = params.handle;
+
+ return 0;
+}
+
+static int
+vbus_pci_device_close(struct vbus_device_proxy *vdev, int flags)
+{
+ struct vbus_pci_device *dev = to_dev(vdev);
+ unsigned long iflags;
+ int ret;
+
+ if (!dev->handle)
+ return -EINVAL;
+
+ spin_lock_irqsave(&vbus_pci.lock, iflags);
+
+ while (!list_empty(&dev->shms)) {
+ struct _signal *_signal;
+
+ _signal = list_first_entry(&dev->shms, struct _signal, list);
+
+ list_del(&_signal->list);
+ shmsignal_disconnect(_signal);
+
+ spin_unlock_irqrestore(&vbus_pci.lock, iflags);
+ shm_signal_put(&_signal->signal);
+ spin_lock_irqsave(&vbus_pci.lock, iflags);
+ }
+
+ spin_unlock_irqrestore(&vbus_pci.lock, iflags);
+
+ /*
+ * The DEVICECLOSE will implicitly close all of the shm on the
+ * host-side, so there is no need to do an explicit per-shm
+ * hypercall
+ */
+ ret = vbus_pci_buscall(VBUS_PCI_HC_DEVCLOSE,
+ &dev->handle, sizeof(dev->handle));
+
+ if (ret < 0)
+ printk(KERN_ERR "VBUS-PCI: Error closing device %s/%lld: %d\n",
+ vdev->type, vdev->id, ret);
+
+ dev->handle = 0;
+
+ return 0;
+}
+
+/*
+ * -------------------
+ * shmsignal interrupt routines
+ * -------------------
+ */
+
+/* We abstract these routines so that we can drop in irqchip later */
+
+static void
+shmsignal_wakeup(struct _signal *_signal)
+{
+ _shm_signal_wakeup(&_signal->signal);
+}
+
+static int
+shmsignal_connect(struct _signal *_signal)
+{
+ return 0;
+}
+
+static void
+shmsignal_disconnect(struct _signal *_signal)
+{
+
+}
+
+static int
+vbus_pci_device_shm(struct vbus_device_proxy *vdev, const char *name,
+ int id, int prio,
+ void *ptr, size_t len,
+ struct shm_signal_desc *sdesc, struct shm_signal **signal,
+ int flags)
+{
+ struct vbus_pci_device *dev = to_dev(vdev);
+ struct _signal *_signal = NULL;
+ struct vbus_pci_deviceshm params;
+ unsigned long iflags;
+ int ret;
+
+ if (!dev->handle)
+ return -EINVAL;
+
+ params.devh = dev->handle;
+ params.id = id;
+ params.flags = flags;
+ params.datap = (u64)__pa(ptr);
+ params.len = len;
+
+ if (signal) {
+ /*
+ * The signal descriptor must be embedded within the
+ * provided ptr
+ */
+ if (!sdesc
+ || (len < sizeof(*sdesc))
+ || ((void *)sdesc < ptr)
+ || ((void *)sdesc > (ptr + len - sizeof(*sdesc))))
+ return -EINVAL;
+
+ _signal = kzalloc(sizeof(*_signal), GFP_KERNEL);
+ if (!_signal)
+ return -ENOMEM;
+
+ _signal_init(&_signal->signal, sdesc, &_signal_ops);
+
+ /*
+ * take another reference for the host. This is dropped
+ * by a SHMCLOSE event
+ */
+ shm_signal_get(&_signal->signal);
+
+ params.signal.offset = (u64)(unsigned long)sdesc -
+ (u64)(unsigned long)ptr;
+ params.signal.prio = prio;
+ params.signal.cookie = (u64)(unsigned long)_signal;
+
+ } else
+ params.signal.offset = -1; /* yes, this is a u32, but it's ok */
+
+ ret = vbus_pci_buscall(VBUS_PCI_HC_DEVSHM,
+ &params, sizeof(params));
+ if (ret < 0)
+ goto fail;
+
+ if (signal) {
+
+ BUG_ON(ret < 0);
+
+ _signal->handle = ret;
+
+ if (!name)
+ snprintf(_signal->name, sizeof(_signal->name),
+ "dev%lld-id%d", vdev->id, id);
+ else
+ snprintf(_signal->name, sizeof(_signal->name),
+ "%s", name);
+
+ shmsignal_connect(_signal);
+
+ spin_lock_irqsave(&vbus_pci.lock, iflags);
+ list_add_tail(&_signal->list, &dev->shms);
+ spin_unlock_irqrestore(&vbus_pci.lock, iflags);
+
+ shm_signal_get(&_signal->signal);
+ *signal = &_signal->signal;
+ }
+
+ return 0;
+
+fail:
+ if (_signal) {
+ /*
+ * We held two references above, so we need to drop
+ * both of them
+ */
+ shm_signal_put(&_signal->signal);
+ shm_signal_put(&_signal->signal);
+ }
+
+ return ret;
+}
+
+static int
+vbus_pci_device_call(struct vbus_device_proxy *vdev, u32 func, void *data,
+ size_t len, int flags)
+{
+ struct vbus_pci_device *dev = to_dev(vdev);
+ struct vbus_pci_devicecall params = {
+ .devh = dev->handle,
+ .func = func,
+ .datap = (u64)__pa(data),
+ .len = len,
+ .flags = flags,
+ };
+
+ if (!dev->handle)
+ return -EINVAL;
+
+ return vbus_pci_buscall(VBUS_PCI_HC_DEVCALL, &params, sizeof(params));
+}
+
+static void
+vbus_pci_device_release(struct vbus_device_proxy *vdev)
+{
+ struct vbus_pci_device *_dev = to_dev(vdev);
+
+ vbus_pci_device_close(vdev, 0);
+
+ kfree(_dev);
+}
+
+static struct vbus_device_proxy_ops vbus_pci_device_ops = {
+ .open = vbus_pci_device_open,
+ .close = vbus_pci_device_close,
+ .shm = vbus_pci_device_shm,
+ .call = vbus_pci_device_call,
+ .release = vbus_pci_device_release,
+};
+
+/*
+ * -------------------
+ * vbus events
+ * -------------------
+ */
+
+struct deferred_devadd_event {
+ struct work_struct work;
+ struct vbus_pci_add_event event;
+};
+
+static void deferred_devdrop(struct work_struct *work);
+
+static void
+deferred_devadd(struct work_struct *work)
+{
+ struct deferred_devadd_event *_event;
+ struct vbus_pci_device *new;
+ int ret;
+
+ _event = container_of(work, struct deferred_devadd_event, work);
+
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (!new) {
+ printk(KERN_ERR "VBUS_PCI: Out of memory on add_event\n");
+ return;
+ }
+
+ INIT_LIST_HEAD(&new->shms);
+
+ memcpy(new->type, _event->event.type, VBUS_MAX_DEVTYPE_LEN);
+ new->vdev.type = new->type;
+ new->vdev.id = _event->event.id;
+ new->vdev.ops = &vbus_pci_device_ops;
+
+ dev_set_name(&new->vdev.dev, "%lld", _event->event.id);
+
+ INIT_WORK(&new->drop, deferred_devdrop);
+
+ ret = vbus_device_proxy_register(&new->vdev);
+ if (ret < 0)
+ panic("failed to register device %lld(%s): %d\n",
+ new->vdev.id, new->type, ret);
+
+ kfree(_event);
+}
+
+static void
+deferred_devdrop(struct work_struct *work)
+{
+ struct vbus_pci_device *dev;
+
+ dev = container_of(work, struct vbus_pci_device, drop);
+ vbus_device_proxy_unregister(&dev->vdev);
+}
+
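+/*
+ * DEVADD events arrive in interrupt context, so the actual device
+ * registration (which can sleep) is deferred to a workqueue.
+ */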
+static void
+event_devadd(struct vbus_pci_add_event *event)
+{
+ struct deferred_devadd_event *_event;
+
+ _event = kzalloc(sizeof(*_event), GFP_ATOMIC);
+ if (!_event) {
+ printk(KERN_ERR \
+ "VBUS_PCI: Out of ATOMIC memory on add_event\n");
+ return;
+ }
+
+ INIT_WORK(&_event->work, deferred_devadd);
+ memcpy(&_event->event, event, sizeof(*event));
+
+ schedule_work(&_event->work);
+}
+
+static void
+event_devdrop(struct vbus_pci_handle_event *event)
+{
+ struct vbus_device_proxy *dev = vbus_device_proxy_find(event->handle);
+
+ if (!dev) {
+ printk(KERN_WARNING "VBUS-PCI: devdrop failed: %lld\n",
+ event->handle);
+ return;
+ }
+
+ schedule_work(&to_dev(dev)->drop);
+}
+
+static void
+event_shmsignal(struct vbus_pci_handle_event *event)
+{
+ struct _signal *_signal = (struct _signal *)(unsigned long)event->handle;
+
+ vbus_pci.stats.notify++;
+
+ shmsignal_wakeup(_signal);
+}
+
+static void
+event_shmclose(struct vbus_pci_handle_event *event)
+{
+ struct _signal *_signal = (struct _signal *)(unsigned long)event->handle;
+
+ /*
+ * This reference was taken during the DEVICESHM call
+ */
+ shm_signal_put(&_signal->signal);
+}
+
+/*
+ * -------------------
+ * eventq routines
+ * -------------------
+ */
+
+static struct ioq_notifier eventq_notifier;
+
+static int __devinit
+eventq_init(int qlen)
+{
+ struct ioq_iterator iter;
+ int ret;
+ int i;
+
+ vbus_pci.ring = kzalloc(sizeof(struct vbus_pci_event) * qlen,
+ GFP_KERNEL);
+ if (!vbus_pci.ring)
+ return -ENOMEM;
+
+ /*
+ * We want to iterate on the "valid" index. By default the iterator
+ * will not "autoupdate" which means it will not hypercall the host
+ * with our changes. This is good, because we are really just
+ * initializing stuff here anyway. Note that you can always manually
+ * signal the host with ioq_signal() if the autoupdate feature is not
+ * used.
+ */
+ ret = ioq_iter_init(&vbus_pci.eventq, &iter, ioq_idxtype_valid, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * Seek to the tail of the valid index (which should be our first
+ * item since the queue is brand-new)
+ */
+ ret = ioq_iter_seek(&iter, ioq_seek_tail, 0, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * Now populate each descriptor with an empty vbus_event and mark it
+ * valid
+ */
+ for (i = 0; i < qlen; i++) {
+ struct vbus_pci_event *event = &vbus_pci.ring[i];
+ size_t len = sizeof(*event);
+ struct ioq_ring_desc *desc = iter.desc;
+
+		BUG_ON(desc->valid);
+
+ desc->cookie = (u64)(unsigned long)event;
+ desc->ptr = cpu_to_le64(__pa(event));
+ desc->len = cpu_to_le64(len); /* total length */
+ desc->valid = 1;
+
+ /*
+ * This push operation will simultaneously advance the
+ * valid-tail index and increment our position in the queue
+ * by one.
+ */
+ ret = ioq_iter_push(&iter, 0);
+ BUG_ON(ret < 0);
+ }
+
+ vbus_pci.eventq.notifier = &eventq_notifier;
+
+ /*
+ * And finally, ensure that we can receive notification
+ */
+ ioq_notify_enable(&vbus_pci.eventq, 0);
+
+ return 0;
+}
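+
+/*
+ * Descriptor lifecycle, for reference: eventq_init() marks each slot
+ * valid and pushes it to the valid-tail, publishing the empty buffer to
+ * the host.  The host fills in a vbus_pci_event, releases its ownership
+ * bit (->sown) and signals us; eventq_wakeup() then pops the descriptor
+ * from the in-use head, dispatches the event, and recycles the slot.
+ */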
+
+/* Invoked whenever the hypervisor ioq_signal()s our eventq */
+static void
+eventq_wakeup(struct ioq_notifier *notifier)
+{
+ struct ioq_iterator iter;
+ int ret;
+
+ /* We want to iterate on the head of the in-use index */
+ ret = ioq_iter_init(&vbus_pci.eventq, &iter, ioq_idxtype_inuse, 0);
+ BUG_ON(ret < 0);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_head, 0, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * The EOM is indicated by finding a packet that is still owned by
+ * the south side.
+ *
+ * FIXME: This in theory could run indefinitely if the host keeps
+ * feeding us events since there is nothing like a NAPI budget. We
+	 * might need to address that.
+ */
+ while (!iter.desc->sown) {
+ struct ioq_ring_desc *desc = iter.desc;
+ struct vbus_pci_event *event;
+
+ event = (struct vbus_pci_event *)(unsigned long)desc->cookie;
+
+ switch (event->eventid) {
+ case VBUS_PCI_EVENT_DEVADD:
+ event_devadd(&event->data.add);
+ break;
+ case VBUS_PCI_EVENT_DEVDROP:
+ event_devdrop(&event->data.handle);
+ break;
+ case VBUS_PCI_EVENT_SHMSIGNAL:
+ event_shmsignal(&event->data.handle);
+ break;
+ case VBUS_PCI_EVENT_SHMCLOSE:
+ event_shmclose(&event->data.handle);
+ break;
+ default:
+ printk(KERN_WARNING "VBUS_PCI: Unexpected event %d\n",
+ event->eventid);
+ break;
+		}
+
+ memset(event, 0, sizeof(*event));
+
+ /* Advance the in-use head */
+ ret = ioq_iter_pop(&iter, 0);
+ BUG_ON(ret < 0);
+
+ vbus_pci.stats.events++;
+ }
+
+ /* And let the south side know that we changed the queue */
+ ioq_signal(&vbus_pci.eventq, 0);
+}
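+
+/*
+ * One way to address the FIXME above would be a NAPI-style budget.  A
+ * minimal sketch (not wired up here; EVENTQ_BUDGET and the deferral
+ * mechanism are hypothetical):
+ *
+ *	int budget = EVENTQ_BUDGET;
+ *
+ *	while (!iter.desc->sown && budget--) {
+ *		... process one event ...
+ *	}
+ *	if (!iter.desc->sown)
+ *		... defer the remainder, e.g. to a tasklet ...
+ */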
+
+static struct ioq_notifier eventq_notifier = {
+ .signal = &eventq_wakeup,
+};
+
+/* Injected whenever the host issues an ioq_signal() on the eventq */
+static irqreturn_t
+eventq_intr(int irq, void *dev)
+{
+ vbus_pci.stats.qnotify++;
+ _shm_signal_wakeup(vbus_pci.eventq.signal);
+
+ return IRQ_HANDLED;
+}
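+
+/*
+ * The notifier fires from the interrupt handler, so eventq_wakeup()
+ * runs in hardirq context; the event handlers it dispatches therefore
+ * must not sleep, which is why event_devadd() allocates with GFP_ATOMIC
+ * and defers the real work to a workqueue.
+ */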
+
+/*
+ * -------------------
+ */
+
+static int
+eventq_signal_inject(struct shm_signal *signal)
+{
+ vbus_pci.stats.qinject++;
+
+ /* The eventq uses the special-case handle=0 */
+ iowrite32(0, &vbus_pci.signals->eventq);
+
+ return 0;
+}
+
+static void
+eventq_signal_release(struct shm_signal *signal)
+{
+ kfree(signal);
+}
+
+static struct shm_signal_ops eventq_signal_ops = {
+ .inject = eventq_signal_inject,
+ .release = eventq_signal_release,
+};
+
+/*
+ * -------------------
+ */
+
+static void
+eventq_ioq_release(struct ioq *ioq)
+{
+ /* released as part of the vbus_pci object */
+}
+
+static struct ioq_ops eventq_ioq_ops = {
+ .release = eventq_ioq_release,
+};
+
+/*
+ * -------------------
+ */
+
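+/*
+ * Teardown is vbus_pci_probe() run in reverse; the per-resource checks
+ * let this routine double as the error-unwind path for a partially
+ * completed probe (see the out_fail label below).
+ */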
+static void
+vbus_pci_release(void)
+{
+#ifdef CONFIG_DEBUG_FS
+	if (!IS_ERR_OR_NULL(vbus_pci.stats.fs))
+		debugfs_remove(vbus_pci.stats.fs);
+#endif
+
+	if (vbus_pci.irq > 0) {
+		free_irq(vbus_pci.irq, NULL);
+		pci_disable_msi(vbus_pci.dev);
+	}
+
+	if (vbus_pci.signals)
+		pci_iounmap(vbus_pci.dev, (void __iomem *)vbus_pci.signals);
+
+	if (vbus_pci.regs)
+		pci_iounmap(vbus_pci.dev, (void __iomem *)vbus_pci.regs);
+
+ pci_release_regions(vbus_pci.dev);
+ pci_disable_device(vbus_pci.dev);
+
+ kfree(vbus_pci.eventq.head_desc);
+ kfree(vbus_pci.ring);
+
+ vbus_pci.enabled = false;
+}
+
+static int __devinit
+vbus_pci_open(void)
+{
+ struct vbus_pci_bridge_negotiate params = {
+ .magic = VBUS_PCI_ABI_MAGIC,
+ .version = VBUS_PCI_HC_VERSION,
+ .capabilities = 0,
+ };
+
+ return vbus_pci_bridgecall(VBUS_PCI_BRIDGE_NEGOTIATE,
+ &params, sizeof(params));
+}
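+
+/*
+ * NEGOTIATE is the first bridgecall we issue: the host checks our ABI
+ * magic and hypercall version, while the (currently empty) capabilities
+ * bitmap leaves room for optional features without another ABI bump.
+ */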
+
+#define QLEN 1024
+
+static int __devinit
+vbus_pci_eventq_register(void)
+{
+ struct vbus_pci_busreg params = {
+ .count = 1,
+ .eventq = {
+ {
+ .count = QLEN,
+ .ring = (u64)__pa(vbus_pci.eventq.head_desc),
+ .data = (u64)__pa(vbus_pci.ring),
+ },
+ },
+ };
+
+ return vbus_pci_bridgecall(VBUS_PCI_BRIDGE_QREG,
+ &params, sizeof(params));
+}
+
+static int __devinit
+_ioq_init(size_t ringsize, struct ioq *ioq, struct ioq_ops *ops)
+{
+ struct shm_signal *signal = NULL;
+ struct ioq_ring_head *head = NULL;
+ size_t len = IOQ_HEAD_DESC_SIZE(ringsize);
+
+ head = kzalloc(len, GFP_KERNEL | GFP_DMA);
+ if (!head)
+ return -ENOMEM;
+
+ signal = kzalloc(sizeof(*signal), GFP_KERNEL);
+ if (!signal) {
+ kfree(head);
+ return -ENOMEM;
+ }
+
+ head->magic = IOQ_RING_MAGIC;
+ head->ver = IOQ_RING_VER;
+ head->count = cpu_to_le32(ringsize);
+
+ _signal_init(signal, &head->signal, &eventq_signal_ops);
+
+ ioq_init(ioq, ops, ioq_locality_north, head, signal, ringsize);
+
+ return 0;
+}
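+
+/*
+ * ioq_locality_north marks us as the guest ("north") endpoint: the ring
+ * head and its shm_signal live in the guest memory allocated above, and
+ * the host ("south") learns their physical addresses later via the QREG
+ * bridgecall in vbus_pci_eventq_register().
+ */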
+
+#ifdef CONFIG_DEBUG_FS
+static int _debugfs_seq_show(struct seq_file *m, void *p)
+{
+#define P(F) \
+ seq_printf(m, " .%-30s: %d\n", #F, (int)vbus_pci.stats.F)
+
+ P(events);
+ P(qnotify);
+ P(qinject);
+ P(notify);
+ P(inject);
+ P(bridgecalls);
+ P(buscalls);
+
+#undef P
+
+ return 0;
+}
+
+static int _debugfs_fops_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, _debugfs_seq_show, inode->i_private);
+}
+
+static const struct file_operations stat_fops = {
+ .open = _debugfs_fops_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .owner = THIS_MODULE,
+};
+#endif
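+
+/*
+ * With debugfs mounted in the usual place, the counters can be read with
+ * something like (path assumes VBUS_PCI_NAME expands to "vbus-pci"; the
+ * values are illustrative):
+ *
+ *	# cat /sys/kernel/debug/vbus-pci
+ *	  .events                        : 1024
+ *	  .qnotify                       : 512
+ *	  ...
+ */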
+
+static int __devinit
+vbus_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+ int ret;
+ int cpu;
+
+ if (vbus_pci.enabled)
+ return -EEXIST; /* we only support one bridge per kernel */
+
+ if (pdev->revision != VBUS_PCI_ABI_VERSION) {
+ printk(KERN_DEBUG "VBUS_PCI: expected ABI version %d, got %d\n",
+ VBUS_PCI_ABI_VERSION,
+ pdev->revision);
+ return -ENODEV;
+ }
+
+ vbus_pci.dev = pdev;
+
+ ret = pci_enable_device(pdev);
+ if (ret < 0)
+ return ret;
+
+ pci_set_master(pdev);
+
+ ret = pci_request_regions(pdev, VBUS_PCI_NAME);
+ if (ret < 0) {
+ printk(KERN_ERR "VBUS_PCI: Could not init BARs: %d\n", ret);
+ goto out_fail;
+ }
+
+	vbus_pci.regs = pci_iomap(pdev, 0, sizeof(struct vbus_pci_regs));
+	if (!vbus_pci.regs) {
+		printk(KERN_ERR "VBUS_PCI: Could not map BAR 0 (regs)\n");
+		ret = -ENOMEM;
+		goto out_fail;
+	}
+
+	vbus_pci.signals = pci_iomap(pdev, 1, sizeof(struct vbus_pci_signals));
+	if (!vbus_pci.signals) {
+		printk(KERN_ERR "VBUS_PCI: Could not map BAR 1 (signals)\n");
+		ret = -ENOMEM;
+		goto out_fail;
+	}
+
+ ret = vbus_pci_open();
+ if (ret < 0) {
+ printk(KERN_DEBUG "VBUS_PCI: Could not register with host: %d\n",
+ ret);
+ goto out_fail;
+ }
+
+ /*
+ * Allocate an IOQ to use for host-2-guest event notification
+ */
+ ret = _ioq_init(QLEN, &vbus_pci.eventq, &eventq_ioq_ops);
+ if (ret < 0) {
+		printk(KERN_ERR "VBUS_PCI: Could not init eventq: %d\n", ret);
+ goto out_fail;
+ }
+
+ ret = eventq_init(QLEN);
+ if (ret < 0) {
+		printk(KERN_ERR "VBUS_PCI: Could not set up ring: %d\n", ret);
+ goto out_fail;
+ }
+
+ ret = pci_enable_msi(pdev);
+ if (ret < 0) {
+		printk(KERN_ERR "VBUS_PCI: Could not enable MSI: %d\n", ret);
+ goto out_fail;
+ }
+
+ vbus_pci.irq = pdev->irq;
+
+ ret = request_irq(pdev->irq, eventq_intr, 0, "vbus", NULL);
+ if (ret < 0) {
+		printk(KERN_ERR "VBUS_PCI: Failed to register IRQ %d: %d\n",
+		       pdev->irq, ret);
+ goto out_fail;
+ }
+
+ /*
+ * Add one fastcall vector per cpu so that we can do lockless
+ * hypercalls
+ */
+ for_each_possible_cpu(cpu) {
+ struct vbus_pci_fastcall_desc *desc =
+ &per_cpu(vbus_pci_percpu_fastcall, cpu);
+ struct vbus_pci_call_desc params = {
+ .vector = cpu,
+ .len = sizeof(*desc),
+ .datap = __pa(desc),
+ };
+
+ ret = vbus_pci_bridgecall(VBUS_PCI_BRIDGE_FASTCALL_ADD,
+ &params, sizeof(params));
+ if (ret < 0) {
+			printk(KERN_ERR
+			       "VBUS_PCI: Failed to register cpu %d: %d\n",
+			       cpu, ret);
+ goto out_fail;
+ }
+ }
+
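+	/*
+	 * With a private descriptor per cpu registered above, a fastcall
+	 * only needs preemption disabled, e.g. (sketch; the fill/kick
+	 * step is elided):
+	 *
+	 *	desc = &get_cpu_var(vbus_pci_percpu_fastcall);
+	 *	... fill desc and kick the host ...
+	 *	put_cpu_var(vbus_pci_percpu_fastcall);
+	 */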
+ /*
+ * Finally register our queue on the host to start receiving events
+ */
+ ret = vbus_pci_eventq_register();
+ if (ret < 0) {
+ printk(KERN_ERR "VBUS_PCI: Could not register with host: %d\n",
+ ret);
+ goto out_fail;
+ }
+
+#ifdef CONFIG_DEBUG_FS
+ vbus_pci.stats.fs = debugfs_create_file(VBUS_PCI_NAME, S_IRUGO,
+ NULL, NULL, &stat_fops);
+	if (IS_ERR_OR_NULL(vbus_pci.stats.fs)) {
+		ret = vbus_pci.stats.fs ? PTR_ERR(vbus_pci.stats.fs) : -ENOMEM;
+		printk(KERN_ERR "VBUS_PCI: error creating stats-fs: %d\n", ret);
+		goto out_fail;
+	}
+#endif
+
+ vbus_pci.enabled = true;
+
+	printk(KERN_INFO "Virtual-Bus: Copyright (c) 2009, "
+	       "Gregory Haskins <ghaskins@novell.com>\n");
+
+ return 0;
+
+ out_fail:
+ vbus_pci_release();
+
+ return ret;
+}
+
+static void __devexit
+vbus_pci_remove(struct pci_dev *pdev)
+{
+ vbus_pci_release();
+}
+
+static DEFINE_PCI_DEVICE_TABLE(vbus_pci_tbl) = {
+ { PCI_DEVICE(0x11da, 0x2000) },
+ { 0 },
+};
+
+MODULE_DEVICE_TABLE(pci, vbus_pci_tbl);
+
+static struct pci_driver vbus_pci_driver = {
+ .name = VBUS_PCI_NAME,
+ .id_table = vbus_pci_tbl,
+ .probe = vbus_pci_probe,
+	.remove = __devexit_p(vbus_pci_remove),
+};
+
+static int __init
+vbus_pci_init(void)
+{
+ memset(&vbus_pci, 0, sizeof(vbus_pci));
+ spin_lock_init(&vbus_pci.lock);
+
+ return pci_register_driver(&vbus_pci_driver);
+}
+
+static void __exit
+vbus_pci_exit(void)
+{
+ pci_unregister_driver(&vbus_pci_driver);
+}
+
+module_init(vbus_pci_init);
+module_exit(vbus_pci_exit);
+