author | Stephen Rothwell <sfr@canb.auug.org.au> | 2008-06-25 16:27:25 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2008-06-25 16:27:25 +1000 |
commit | b78ffa0535e3749e4a9d621f3627a97b603de77a (patch) | |
tree | 5546c941b7b7e7d5fc09f4fadeaef11f7eb7f3eb | |
parent | c9ce4244158a7a7959e7b734d7968b840db5497c (diff) | |
parent | febc36efc07ed8c3288523655ab7594f552e496b (diff) |
Merge branch 'quilt/rr'
Conflicts:
drivers/char/hvc_console.h
kernel/stop_machine.c
62 files changed, 1349 insertions, 633 deletions
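The largest documentation hunk below rewrites kernel-locking.tmpl's user-context examples from semaphores to mutexes: down_interruptible()/up() become mutex_lock_interruptible()/mutex_unlock(), and the "DI" table entry becomes "MLI". Several drivers in the same merge move from down_trylock() to a down_try() helper whose sense, judging from the conversions (`if (down_trylock(&sem))` becoming `if (!down_try(&sem))`), is inverted: down_trylock() returns non-zero on failure, while down_try() returns true on success. Below is a minimal, illustrative sketch of the mutex pattern the updated guide describes; the registry_lock/registry_count names are made up and are not part of the patch:

```c
#include <linux/errno.h>
#include <linux/mutex.h>

/* Hypothetical data only ever touched from user context, protected by a
 * mutex as in the updated kernel-locking examples. */
static DEFINE_MUTEX(registry_lock);
static int registry_count;

int registry_add(void)
{
	/* mutex_lock_interruptible() sleeps until the mutex is free and
	 * returns non-zero if a signal arrives first; plain mutex_lock()
	 * would keep sleeping, which is why the document tells user-context
	 * code to prefer the interruptible variant. */
	if (mutex_lock_interruptible(&registry_lock))
		return -ERESTARTSYS;
	registry_count++;
	mutex_unlock(&registry_lock);
	return 0;
}
```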
diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl index 2510763295d0..084f6ad7b7a0 100644 --- a/Documentation/DocBook/kernel-locking.tmpl +++ b/Documentation/DocBook/kernel-locking.tmpl @@ -219,10 +219,10 @@ </para> <sect1 id="lock-intro"> - <title>Three Main Types of Kernel Locks: Spinlocks, Mutexes and Semaphores</title> + <title>Two Main Types of Kernel Locks: Spinlocks and Mutexes</title> <para> - There are three main types of kernel locks. The fundamental type + There are two main types of kernel locks. The fundamental type is the spinlock (<filename class="headerfile">include/asm/spinlock.h</filename>), which is a very simple single-holder lock: if you can't get the @@ -240,14 +240,6 @@ use a spinlock instead. </para> <para> - The third type is a semaphore - (<filename class="headerfile">include/linux/semaphore.h</filename>): it - can have more than one holder at any time (the number decided at - initialization time), although it is most commonly used as a - single-holder lock (a mutex). If you can't get a semaphore, your - task will be suspended and later on woken up - just like for mutexes. - </para> - <para> Neither type of lock is recursive: see <xref linkend="deadlock"/>. </para> @@ -278,7 +270,7 @@ </para> <para> - Semaphores still exist, because they are required for + Mutexes still exist, because they are required for synchronization between <firstterm linkend="gloss-usercontext">user contexts</firstterm>, as we will see below. </para> @@ -289,18 +281,17 @@ <para> If you have a data structure which is only ever accessed from - user context, then you can use a simple semaphore - (<filename>linux/linux/semaphore.h</filename>) to protect it. This - is the most trivial case: you initialize the semaphore to the number - of resources available (usually 1), and call - <function>down_interruptible()</function> to grab the semaphore, and - <function>up()</function> to release it. There is also a - <function>down()</function>, which should be avoided, because it + user context, then you can use a simple mutex + (<filename>include/linux/mutex.h</filename>) to protect it. This + is the most trivial case: you initialize the mutex. Then you can + call <function>mutex_lock_interruptible()</function> to grab the mutex, + and <function>mutex_unlock()</function> to release it. There is also a + <function>mutex_lock()</function>, which should be avoided, because it will not return if a signal is received. </para> <para> - Example: <filename>linux/net/core/netfilter.c</filename> allows + Example: <filename>net/netfilter/nf_sockopt.c</filename> allows registration of new <function>setsockopt()</function> and <function>getsockopt()</function> calls, with <function>nf_register_sockopt()</function>. Registration and @@ -515,7 +506,7 @@ <listitem> <para> If you are in a process context (any syscall) and want to - lock other process out, use a semaphore. You can take a semaphore + lock other process out, use a mutex. You can take a mutex and sleep (<function>copy_from_user*(</function> or <function>kmalloc(x,GFP_KERNEL)</function>). </para> @@ -662,7 +653,7 @@ <entry>SLBH</entry> <entry>SLBH</entry> <entry>SLBH</entry> -<entry>DI</entry> +<entry>MLI</entry> <entry>None</entry> </row> @@ -692,8 +683,8 @@ <entry>spin_lock_bh</entry> </row> <row> -<entry>DI</entry> -<entry>down_interruptible</entry> +<entry>MLI</entry> +<entry>mutex_lock_interruptible</entry> </row> </tbody> @@ -1310,7 +1301,7 @@ as Alan Cox says, <quote>Lock data, not code</quote>. 
<para> There is a coding bug where a piece of code tries to grab a spinlock twice: it will spin forever, waiting for the lock to - be released (spinlocks, rwlocks and semaphores are not + be released (spinlocks, rwlocks and mutexes are not recursive in Linux). This is trivial to diagnose: not a stay-up-five-nights-talk-to-fluffy-code-bunnies kind of problem. @@ -1335,7 +1326,7 @@ as Alan Cox says, <quote>Lock data, not code</quote>. <para> This complete lockup is easy to diagnose: on SMP boxes the - watchdog timer or compiling with <symbol>DEBUG_SPINLOCKS</symbol> set + watchdog timer or compiling with <symbol>DEBUG_SPINLOCK</symbol> set (<filename>include/linux/spinlock.h</filename>) will show this up immediately when it happens. </para> @@ -1558,7 +1549,7 @@ the amount of locking which needs to be done. <title>Read/Write Lock Variants</title> <para> - Both spinlocks and semaphores have read/write variants: + Both spinlocks and mutexes have read/write variants: <type>rwlock_t</type> and <structname>struct rw_semaphore</structname>. These divide users into two classes: the readers and the writers. If you are only reading the data, you can get a read lock, but to write to @@ -1681,7 +1672,7 @@ the amount of locking which needs to be done. #include <linux/slab.h> #include <linux/string.h> +#include <linux/rcupdate.h> - #include <linux/semaphore.h> + #include <linux/mutex.h> #include <asm/errno.h> struct object @@ -1913,7 +1904,7 @@ machines due to caching. </listitem> <listitem> <para> - <function> put_user()</function> + <function>put_user()</function> </para> </listitem> </itemizedlist> @@ -1927,13 +1918,13 @@ machines due to caching. <listitem> <para> - <function>down_interruptible()</function> and - <function>down()</function> + <function>mutex_lock_interruptible()</function> and + <function>mutex_lock()</function> </para> <para> - There is a <function>down_trylock()</function> which can be + There is a <function>mutex_trylock()</function> which can be used inside interrupt context, as it will not sleep. - <function>up()</function> will also never sleep. + <function>mutex_unlock()</function> will also never sleep. </para> </listitem> </itemizedlist> @@ -2023,7 +2014,7 @@ machines due to caching. <para> Prior to 2.5, or when <symbol>CONFIG_PREEMPT</symbol> is unset, processes in user context inside the kernel would not - preempt each other (ie. you had that CPU until you have it up, + preempt each other (ie. you had that CPU until you gave it up, except for interrupts). With the addition of <symbol>CONFIG_PREEMPT</symbol> in 2.5.4, this changed: when in user context, higher priority tasks can "cut in": spinlocks diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 82fafe0429fe..35c8c69ceb72 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -36,11 +36,13 @@ #include <sched.h> #include <limits.h> #include <stddef.h> +#include <signal.h> #include "linux/lguest_launcher.h" #include "linux/virtio_config.h" #include "linux/virtio_net.h" #include "linux/virtio_blk.h" #include "linux/virtio_console.h" +#include "linux/virtio_rng.h" #include "linux/virtio_ring.h" #include "asm-x86/bootparam.h" /*L:110 We can ignore the 39 include files we need for this program, but I do @@ -64,8 +66,8 @@ typedef uint8_t u8; #endif /* We can have up to 256 pages for devices. */ #define DEVICE_PAGES 256 -/* This will occupy 2 pages: it must be a power of 2. */ -#define VIRTQUEUE_NUM 128 +/* This will occupy 4 pages: it must be a power of 2. 
*/ +#define VIRTQUEUE_NUM 512 /*L:120 verbose is both a global flag and a macro. The C preprocessor allows * this, and although I wouldn't recommend it, it works quite nicely here. */ @@ -80,6 +82,8 @@ static int waker_fd; static void *guest_base; /* The maximum guest physical address allowed, and maximum possible. */ static unsigned long guest_limit, guest_max; +/* The pipe for signal hander to write to. */ +static int timeoutpipe[2]; /* a per-cpu variable indicating whose vcpu is currently running */ static unsigned int __thread cpu_id; @@ -152,16 +156,18 @@ struct virtqueue /* The actual ring of buffers. */ struct vring vring; - /* Last available index we saw. */ - u16 last_avail_idx; - /* The routine to call when the Guest pings us. */ void (*handle_output)(int fd, struct virtqueue *me); /* Outstanding buffers */ unsigned int inflight; + + /* Is this blocked awaiting a timer? */ + bool blocked; }; +static unsigned int net_xmit_notify, net_recv_notify, net_timeout; + /* Remember the arguments to the program so we can "reboot" */ static char **main_args; @@ -199,6 +205,33 @@ static void *_convert(struct iovec *iov, size_t size, size_t align, #define le32_to_cpu(v32) (v32) #define le64_to_cpu(v64) (v64) +/* Is this iovec empty? */ +static bool iov_empty(const struct iovec iov[], unsigned int num_iov) +{ + unsigned int i; + + for (i = 0; i < num_iov; i++) + if (iov[i].iov_len) + return false; + return true; +} + +/* Take len bytes from the front of this iovec. */ +static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len) +{ + unsigned int i; + + for (i = 0; i < num_iov; i++) { + unsigned int used; + + used = iov[i].iov_len < len ? iov[i].iov_len : len; + iov[i].iov_base += used; + iov[i].iov_len -= used; + len -= used; + } + assert(len == 0); +} + /* The device virtqueue descriptors are followed by feature bitmasks. */ static u8 *get_feature_bits(struct device *dev) { @@ -254,6 +287,7 @@ static void *map_zeroed_pages(unsigned int num) PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); if (addr == MAP_FAILED) err(1, "Mmaping %u pages of /dev/zero", num); + close(fd); return addr; } @@ -661,19 +695,22 @@ static unsigned get_vq_desc(struct virtqueue *vq, unsigned int *out_num, unsigned int *in_num) { unsigned int i, head; + u16 last_avail; /* Check it isn't doing very strange things with descriptor numbers. */ - if ((u16)(vq->vring.avail->idx - vq->last_avail_idx) > vq->vring.num) + last_avail = vring_last_avail(&vq->vring); + if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) errx(1, "Guest moved used index from %u to %u", - vq->last_avail_idx, vq->vring.avail->idx); + last_avail, vq->vring.avail->idx); /* If there's nothing new since last we looked, return invalid. */ - if (vq->vring.avail->idx == vq->last_avail_idx) + if (vq->vring.avail->idx == last_avail) return vq->vring.num; /* Grab the next descriptor number they're advertising, and increment * the index we've seen. */ - head = vq->vring.avail->ring[vq->last_avail_idx++ % vq->vring.num]; + head = vq->vring.avail->ring[last_avail % vq->vring.num]; + vring_last_avail(&vq->vring)++; /* If their number is silly, that's a fatal mistake. */ if (head >= vq->vring.num) @@ -825,6 +862,9 @@ static bool handle_console_input(int fd, struct device *dev) /* Just in case waker is blocked in BREAK, send * unbreak now. 
*/ write(fd, args, sizeof(args)); + printf("network xmit %u recv %u timeout %u\n", + net_xmit_notify, net_recv_notify, + net_timeout); exit(2); } abort->count = 0; @@ -854,6 +894,21 @@ static void handle_console_output(int fd, struct virtqueue *vq) } } +static void block_vq(struct virtqueue *vq) +{ + struct itimerval itm; + + vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; + vq->blocked = true; + + itm.it_interval.tv_sec = 0; + itm.it_interval.tv_usec = 0; + itm.it_value.tv_sec = 0; + itm.it_value.tv_usec = 250; + + setitimer(ITIMER_REAL, &itm, NULL); +} + /* * The Network * @@ -863,21 +918,26 @@ static void handle_console_output(int fd, struct virtqueue *vq) */ static void handle_net_output(int fd, struct virtqueue *vq) { - unsigned int head, out, in; + unsigned int head, out, in, num = 0; int len; struct iovec iov[vq->vring.num]; + net_xmit_notify++; + /* Keep getting output buffers from the Guest until we run out. */ while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { if (in) errx(1, "Input buffers in output queue?"); - /* Check header, but otherwise ignore it (we told the Guest we - * supported no features, so it shouldn't have anything - * interesting). */ - (void)convert(&iov[0], struct virtio_net_hdr); - len = writev(vq->dev->fd, iov+1, out-1); + len = writev(vq->dev->fd, iov, out); + if (len < 0) + err(1, "Writing network packet to tun"); add_used_and_trigger(fd, vq, head, len); + num++; } + + /* Block further kicks, and set up a timer if we saw anything. */ + if (num) + block_vq(vq); } /* This is where we handle a packet coming in from the tun device to our @@ -887,7 +947,6 @@ static bool handle_tun_input(int fd, struct device *dev) unsigned int head, in_num, out_num; int len; struct iovec iov[dev->vq->vring.num]; - struct virtio_net_hdr *hdr; /* First we need a network buffer from the Guests's recv virtqueue. */ head = get_vq_desc(dev->vq, iov, &out_num, &in_num); @@ -896,25 +955,23 @@ static bool handle_tun_input(int fd, struct device *dev) * early, the Guest won't be ready yet. Wait until the device * status says it's ready. */ /* FIXME: Actually want DRIVER_ACTIVE here. */ - if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) - warn("network: no dma buffer!"); + + /* Now tell it we want to know if new things appear. */ + dev->vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; + wmb(); + /* We'll turn this back on if input buffers are registered. */ return false; } else if (out_num) errx(1, "Output buffers in network recv queue?"); - /* First element is the header: we set it to 0 (no features). */ - hdr = convert(&iov[0], struct virtio_net_hdr); - hdr->flags = 0; - hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; - /* Read the packet from the device directly into the Guest's buffer. */ - len = readv(dev->fd, iov+1, in_num-1); + len = readv(dev->fd, iov, in_num); if (len <= 0) err(1, "reading network"); /* Tell the Guest about the new packet. */ - add_used_and_trigger(fd, dev->vq, head, sizeof(*hdr) + len); + add_used_and_trigger(fd, dev->vq, head, len); verbose("tun input packet len %i [%02x %02x] (%s)\n", len, ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1], @@ -934,6 +991,16 @@ static void enable_fd(int fd, struct virtqueue *vq) write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); } +static void net_enable_fd(int fd, struct virtqueue *vq) +{ + /* We don't need to know again when Guest refills receive buffer. 
*/ + vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; + net_recv_notify++; + add_device_fd(vq->dev->fd); + /* Tell waker to listen to it again */ + write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); +} + /* When the Guest tells us they updated the status field, we handle it. */ static void update_device_status(struct device *dev) { @@ -951,7 +1018,7 @@ static void update_device_status(struct device *dev) for (vq = dev->vq; vq; vq = vq->next) { memset(vq->vring.desc, 0, vring_size(vq->config.num, getpagesize())); - vq->last_avail_idx = 0; + vring_last_avail(&vq->vring) = 0; } } else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { warnx("Device %s configuration FAILED", dev->name); @@ -1014,6 +1081,29 @@ static void handle_output(int fd, unsigned long addr) strnlen(from_guest_phys(addr), guest_limit - addr)); } +static void handle_timeout(int fd) +{ + char buf[32]; + struct device *i; + struct virtqueue *vq; + + /* Clear the pipe */ + read(timeoutpipe[0], buf, sizeof(buf)); + + /* Check each device and virtqueue: flush blocked ones. */ + for (i = devices.dev; i; i = i->next) { + for (vq = i->vq; vq; vq = vq->next) { + if (!vq->blocked) + continue; + + vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; + vq->blocked = false; + if (vq->handle_output) + vq->handle_output(fd, vq); + } + } +} + /* This is called when the Waker wakes us up: check for incoming file * descriptors. */ static void handle_input(int fd) @@ -1024,9 +1114,14 @@ static void handle_input(int fd) for (;;) { struct device *i; fd_set fds = devices.infds; + int num; + num = select(devices.max_infd+1, &fds, NULL, NULL, &poll); + /* Could get interrupted */ + if (num < 0) + continue; /* If nothing is ready, we're done. */ - if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0) + if (num == 0) break; /* Otherwise, call the device(s) which have readable file @@ -1050,6 +1145,10 @@ static void handle_input(int fd) write(waker_fd, &dev_fd, sizeof(dev_fd)); } } + + /* Is this the timeout fd? */ + if (FD_ISSET(timeoutpipe[0], &fds)) + handle_timeout(fd); } } @@ -1111,9 +1210,9 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, /* Initialize the virtqueue */ vq->next = NULL; - vq->last_avail_idx = 0; vq->dev = dev; vq->inflight = 0; + vq->blocked = false; /* Initialize the configuration. */ vq->config.num = num_descs; @@ -1168,6 +1267,10 @@ static void add_feature(struct device *dev, unsigned bit) * how we use it. */ static void set_config(struct device *dev, unsigned len, const void *conf) { + /* We always set the VIRTIO_RING_F_PUBLISH_INDICES feature + * bit, so now is a good time to do that. */ + add_feature(dev, VIRTIO_RING_F_PUBLISH_INDICES); + /* Check we haven't overflowed our single page. */ if (device_config(dev) + len > devices.descpage + getpagesize()) errx(1, "Too many devices"); @@ -1242,10 +1345,31 @@ static void setup_console(void) add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output); + /* Every device should set this bit. 
*/ + add_feature(dev, VIRTIO_RING_F_PUBLISH_INDICES); verbose("device %u: console\n", devices.device_num++); } /*:*/ +static void timeout_alarm(int sig) +{ + net_timeout++; + write(timeoutpipe[1], "", 1); +} + +static void setup_timeout(void) +{ + if (pipe(timeoutpipe) != 0) + err(1, "Creating timeout pipe"); + + if (fcntl(timeoutpipe[1], F_SETFL, + fcntl(timeoutpipe[1], F_GETFL) | O_NONBLOCK) != 0) + err(1, "Making timeout pipe nonblocking"); + + add_device_fd(timeoutpipe[0]); + signal(SIGALRM, timeout_alarm); +} + /*M:010 Inter-guest networking is an interesting area. Simplest is to have a * --sharenet=<name> option which opens or creates a named pipe. This can be * used to send packets to another guest in a 1:1 manner. @@ -1264,10 +1388,25 @@ static void setup_console(void) static u32 str2ip(const char *ipaddr) { - unsigned int byte[4]; + unsigned int b[4]; + + if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4) + errx(1, "Failed to parse IP address '%s'", ipaddr); + return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3]; +} - sscanf(ipaddr, "%u.%u.%u.%u", &byte[0], &byte[1], &byte[2], &byte[3]); - return (byte[0] << 24) | (byte[1] << 16) | (byte[2] << 8) | byte[3]; +static void str2mac(const char *macaddr, unsigned char mac[6]) +{ + unsigned int m[6]; + if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x", + &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6) + errx(1, "Failed to parse mac address '%s'", mac); + mac[0] = m[0]; + mac[1] = m[1]; + mac[2] = m[2]; + mac[3] = m[3]; + mac[4] = m[4]; + mac[5] = m[5]; } /* This code is "adapted" from libbridge: it attaches the Host end of the @@ -1288,6 +1427,7 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name) errx(1, "interface %s does not exist!", if_name); strncpy(ifr.ifr_name, br_name, IFNAMSIZ); + ifr.ifr_name[IFNAMSIZ-1] = '\0'; ifr.ifr_ifindex = ifidx; if (ioctl(fd, SIOCBRADDIF, &ifr) < 0) err(1, "can't add %s to bridge %s", if_name, br_name); @@ -1296,64 +1436,90 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name) /* This sets up the Host end of the network device with an IP address, brings * it up so packets will flow, the copies the MAC address into the hwaddr * pointer. */ -static void configure_device(int fd, const char *devname, u32 ipaddr, - unsigned char hwaddr[6]) +static void configure_device(int fd, const char *tapif, u32 ipaddr) { struct ifreq ifr; struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr; - /* Don't read these incantations. Just cut & paste them like I did! */ memset(&ifr, 0, sizeof(ifr)); - strcpy(ifr.ifr_name, devname); + strcpy(ifr.ifr_name, tapif); + + /* Don't read these incantations. Just cut & paste them like I did! */ sin->sin_family = AF_INET; sin->sin_addr.s_addr = htonl(ipaddr); if (ioctl(fd, SIOCSIFADDR, &ifr) != 0) - err(1, "Setting %s interface address", devname); + err(1, "Setting %s interface address", tapif); ifr.ifr_flags = IFF_UP; if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0) - err(1, "Bringing interface %s up", devname); + err(1, "Bringing interface %s up", tapif); +} + +static void get_mac(int fd, const char *tapif, unsigned char hwaddr[6]) +{ + struct ifreq ifr; + + memset(&ifr, 0, sizeof(ifr)); + strcpy(ifr.ifr_name, tapif); /* SIOC stands for Socket I/O Control. G means Get (vs S for Set * above). IF means Interface, and HWADDR is hardware address. * Simple! 
*/ if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0) - err(1, "getting hw address for %s", devname); + err(1, "getting hw address for %s", tapif); memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6); } -/*L:195 Our network is a Host<->Guest network. This can either use bridging or - * routing, but the principle is the same: it uses the "tun" device to inject - * packets into the Host as if they came in from a normal network card. We - * just shunt packets between the Guest and the tun device. */ -static void setup_tun_net(const char *arg) +static int get_tun_device(char tapif[IFNAMSIZ]) { - struct device *dev; struct ifreq ifr; - int netfd, ipfd; - u32 ip; - const char *br_name = NULL; - struct virtio_net_config conf; + int netfd; + + /* Start with this zeroed. Messy but sure. */ + memset(&ifr, 0, sizeof(ifr)); /* We open the /dev/net/tun device and tell it we want a tap device. A * tap device is like a tun device, only somehow different. To tell * the truth, I completely blundered my way through this code, but it * works now! */ netfd = open_or_die("/dev/net/tun", O_RDWR); - memset(&ifr, 0, sizeof(ifr)); - ifr.ifr_flags = IFF_TAP | IFF_NO_PI; + ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; strcpy(ifr.ifr_name, "tap%d"); if (ioctl(netfd, TUNSETIFF, &ifr) != 0) err(1, "configuring /dev/net/tun"); + + if (ioctl(netfd, TUNSETFEATURES, + TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0) + err(1, "Could not set features for tun device"); + /* We don't need checksums calculated for packets coming in this * device: trust us! */ ioctl(netfd, TUNSETNOCSUM, 1); + memcpy(tapif, ifr.ifr_name, IFNAMSIZ); + return netfd; +} + +/*L:195 Our network is a Host<->Guest network. This can either use bridging or + * routing, but the principle is the same: it uses the "tun" device to inject + * packets into the Host as if they came in from a normal network card. We + * just shunt packets between the Guest and the tun device. */ +static void setup_tun_net(char *arg) +{ + struct device *dev; + int netfd, ipfd; + u32 ip = INADDR_ANY; + bool bridging = false; + char tapif[IFNAMSIZ], *p; + struct virtio_net_config conf; + + netfd = get_tun_device(tapif); + /* First we create a new network device. */ dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input); /* Network devices need a receive and a send queue, just like * console. */ - add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); + add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd); add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output); /* We need a socket to perform the magic network ioctls to bring up the @@ -1364,28 +1530,56 @@ static void setup_tun_net(const char *arg) /* If the command line was --tunnet=bridge:<name> do bridging. */ if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) { - ip = INADDR_ANY; - br_name = arg + strlen(BRIDGE_PFX); - add_to_bridge(ipfd, ifr.ifr_name, br_name); - } else /* It is an IP address to set up the device with */ + arg += strlen(BRIDGE_PFX); + bridging = true; + } + + /* A mac address may follow the bridge name or IP address */ + p = strchr(arg, ':'); + if (p) { + str2mac(p+1, conf.mac); + *p = '\0'; + } else { + p = arg + strlen(arg); + /* None supplied; query the randomly assigned mac. */ + get_mac(ipfd, tapif, conf.mac); + } + + /* arg is now either an IP address or a bridge name */ + if (bridging) + add_to_bridge(ipfd, tapif, arg); + else ip = str2ip(arg); - /* Set up the tun device, and get the mac address for the interface. */ - configure_device(ipfd, ifr.ifr_name, ip, conf.mac); + /* Set up the tun device. 
*/ + configure_device(ipfd, tapif, ip); /* Tell Guest what MAC address to use. */ add_feature(dev, VIRTIO_NET_F_MAC); add_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY); + /* Expect Guest to handle everything except UFO */ + add_feature(dev, VIRTIO_NET_F_CSUM); + add_feature(dev, VIRTIO_NET_F_GUEST_CSUM); + add_feature(dev, VIRTIO_NET_F_MAC); + add_feature(dev, VIRTIO_NET_F_GUEST_TSO4); + add_feature(dev, VIRTIO_NET_F_GUEST_TSO6); + add_feature(dev, VIRTIO_NET_F_GUEST_ECN); + add_feature(dev, VIRTIO_NET_F_HOST_TSO4); + add_feature(dev, VIRTIO_NET_F_HOST_TSO6); + add_feature(dev, VIRTIO_NET_F_HOST_ECN); set_config(dev, sizeof(conf), &conf); /* We don't need the socket any more; setup is done. */ close(ipfd); - verbose("device %u: tun net %u.%u.%u.%u\n", - devices.device_num++, - (u8)(ip>>24),(u8)(ip>>16),(u8)(ip>>8),(u8)ip); - if (br_name) - verbose("attached to bridge: %s\n", br_name); + devices.device_num++; + + if (bridging) + verbose("device %u: tun %s attached to bridge: %s\n", + devices.device_num, tapif, arg); + else + verbose("device %u: tun %s: %s\n", + devices.device_num, tapif, arg); } /* Our block (disk) device should be really simple: the Guest asks for a block @@ -1621,6 +1815,64 @@ static void setup_block_file(const char *filename) verbose("device %u: virtblock %llu sectors\n", devices.device_num, le64_to_cpu(conf.capacity)); } + +/* Our random number generator device reads from /dev/random into the Guest's + * input buffers. The usual case is that the Guest doesn't want random numbers + * and so has no buffers although /dev/random is still readable, whereas + * console is the reverse. + * + * The same logic applies, however. */ +static bool handle_rng_input(int fd, struct device *dev) +{ + int len; + unsigned int head, in_num, out_num, totlen = 0; + struct iovec iov[dev->vq->vring.num]; + + /* First we need a buffer from the Guests's virtqueue. */ + head = get_vq_desc(dev->vq, iov, &out_num, &in_num); + + /* If they're not ready for input, stop listening to this file + * descriptor. We'll start again once they add an input buffer. */ + if (head == dev->vq->vring.num) + return false; + + if (out_num) + errx(1, "Output buffers in rng?"); + + /* This is why we convert to iovecs: the readv() call uses them, and so + * it reads straight into the Guest's buffer. We loop to make sure we + * fill it. */ + while (!iov_empty(iov, in_num)) { + len = readv(dev->fd, iov, in_num); + if (len <= 0) + err(1, "Read from /dev/random gave %i", len); + iov_consume(iov, in_num, len); + totlen += len; + } + + /* Tell the Guest about the new input. */ + add_used_and_trigger(fd, dev->vq, head, totlen); + + /* Everything went OK! */ + return true; +} + +/* And this creates a "hardware" random number device for the Guest. */ +static void setup_rng(void) +{ + struct device *dev; + int fd; + + fd = open_or_die("/dev/random", O_RDONLY); + + /* The device responds to return from I/O thread. */ + dev = new_device("rng", VIRTIO_ID_RNG, fd, handle_rng_input); + + /* The device has one virtqueue, where the Guest places inbufs. */ + add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); + + verbose("device %u: rng\n", devices.device_num++); +} /* That's the end of device setup. */ /*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */ @@ -1663,7 +1915,7 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) /* ERESTART means that we need to reboot the guest */ } else if (errno == ERESTART) { restart_guest(); - /* EAGAIN means the Waker wanted us to look at some input. 
+ /* EAGAIN means a signal (timeout). * Anything else means a bug or incompatible change. */ } else if (errno != EAGAIN) err(1, "Running guest failed"); @@ -1691,13 +1943,14 @@ static struct option opts[] = { { "verbose", 0, NULL, 'v' }, { "tunnet", 1, NULL, 't' }, { "block", 1, NULL, 'b' }, + { "rng", 0, NULL, 'r' }, { "initrd", 1, NULL, 'i' }, { NULL }, }; static void usage(void) { errx(1, "Usage: lguest [--verbose] " - "[--tunnet=(<ipaddr>|bridge:<bridgename>)\n" + "[--tunnet=(<ipaddr>:<macaddr>|bridge:<bridgename>:<macaddr>)\n" "|--block=<filename>|--initrd=<filename>]...\n" "<mem-in-mb> vmlinux [args...]"); } @@ -1765,6 +2018,9 @@ int main(int argc, char *argv[]) case 'b': setup_block_file(optarg); break; + case 'r': + setup_rng(); + break; case 'i': initrd_name = optarg; break; @@ -1783,6 +2039,9 @@ int main(int argc, char *argv[]) /* We always have a console device */ setup_console(); + /* We can timeout waiting for Guest network transmit. */ + setup_timeout(); + /* Now we load the kernel */ start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c index ecb9eb78d687..57c10efa161c 100644 --- a/arch/ia64/kernel/salinfo.c +++ b/arch/ia64/kernel/salinfo.c @@ -192,7 +192,7 @@ struct salinfo_platform_oemdata_parms { static void salinfo_work_to_do(struct salinfo_data *data) { - down_trylock(&data->mutex); + down_try(&data->mutex); up(&data->mutex); } @@ -309,7 +309,7 @@ salinfo_event_read(struct file *file, char __user *buffer, size_t count, loff_t int i, n, cpu = -1; retry: - if (cpus_empty(data->cpu_event) && down_trylock(&data->mutex)) { + if (cpus_empty(data->cpu_event) && !down_try(&data->mutex)) { if (file->f_flags & O_NONBLOCK) return -EAGAIN; if (down_interruptible(&data->mutex)) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 6d0d31651f05..3caeaf1905eb 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -561,6 +561,7 @@ bool "s390 guest support (EXPERIMENTAL)" depends on 64BIT && EXPERIMENTAL select VIRTIO select VIRTIO_RING + select VIRTIO_CONSOLE help Select this option if you want to run the kernel under s390 linux endmenu diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 2bc70b6e876a..8acd44293cca 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -54,6 +54,7 @@ #include <asm/sections.h> #include <asm/ebcdic.h> #include <asm/compat.h> +#include <asm/kvm_virtio.h> long psw_kernel_bits = (PSW_BASE_BITS | PSW_MASK_DAT | PSW_ASC_PRIMARY | PSW_MASK_MCHECK | PSW_DEFAULT_KEY); @@ -799,7 +800,8 @@ setup_arch(char **cmdline_p) printk("We are running under VM (64 bit mode)\n"); else if (MACHINE_IS_KVM) { printk("We are running under KVM (64 bit mode)\n"); - add_preferred_console("ttyS", 1, NULL); + add_preferred_console("hvc", 0, NULL); + s390_virtio_console_init(); } else printk("We are running native (64 bit mode)\n"); #endif /* CONFIG_64BIT */ diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index dd7ea203f940..42251095134f 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -196,6 +196,7 @@ static int virtblk_probe(struct virtio_device *vdev) int err; u64 cap; u32 v; + u32 blk_size; if (index_to_minor(index) >= 1 << MINORBITS) return -ENOSPC; @@ -290,6 +291,13 @@ static int virtblk_probe(struct virtio_device *vdev) if (!err) blk_queue_max_hw_segments(vblk->disk->queue, v); + /* Host can optionally specify the block size of the device */ + err = virtio_config_val(vdev, VIRTIO_BLK_F_BLK_SIZE, + offsetof(struct 
virtio_blk_config, blk_size), + &blk_size); + if (!err) + blk_queue_hardsect_size(vblk->disk->queue, blk_size); + add_disk(vblk->disk); return 0; @@ -330,7 +338,7 @@ static struct virtio_device_id id_table[] = { static unsigned int features[] = { VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, - VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, + VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, }; static struct virtio_driver virtio_blk = { diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 2d854bb9373e..d64fd3855a30 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -588,11 +588,14 @@ config HVC_DRIVER It will automatically be selected if one of the back-end console drivers is selected. +config HVC_IRQ + bool config HVC_CONSOLE bool "pSeries Hypervisor Virtual Console support" depends on PPC_PSERIES select HVC_DRIVER + select HVC_IRQ help pSeries machines when partitioned support a hypervisor virtual console. This driver allows each pSeries partition to have a console @@ -603,6 +606,7 @@ config HVC_ISERIES depends on PPC_ISERIES default y select HVC_DRIVER + select HVC_IRQ help iSeries machines support a hypervisor virtual console. @@ -624,13 +628,18 @@ config HVC_XEN bool "Xen Hypervisor Console support" depends on XEN select HVC_DRIVER + select HVC_IRQ default y help Xen virtual console device driver config VIRTIO_CONSOLE - bool + bool "Virtio console" + depends on VIRTIO select HVC_DRIVER + help + Virtio console for use with lguest and other hypervisors. + config HVCS tristate "IBM Hypervisor Virtual Console Server support" diff --git a/drivers/char/Makefile b/drivers/char/Makefile index 4c1c584e9eb6..2bb55d105f57 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_HVC_ISERIES) += hvc_iseries.o obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o obj-$(CONFIG_HVC_BEAT) += hvc_beat.o obj-$(CONFIG_HVC_DRIVER) += hvc_console.o +obj-$(CONFIG_HVC_IRQ) += hvc_irq.o obj-$(CONFIG_HVC_XEN) += hvc_xen.o obj-$(CONFIG_VIRTIO_CONSOLE) += virtio_console.o obj-$(CONFIG_RAW_DRIVER) += raw.o diff --git a/drivers/char/hvc_console.c b/drivers/char/hvc_console.c index 2f9759d625cc..bf82a44ffbb7 100644 --- a/drivers/char/hvc_console.c +++ b/drivers/char/hvc_console.c @@ -75,23 +75,6 @@ static int hvc_init(void); static int sysrq_pressed; #endif -struct hvc_struct { - spinlock_t lock; - int index; - struct tty_struct *tty; - unsigned int count; - int do_wakeup; - char *outbuf; - int outbuf_size; - int n_outbuf; - uint32_t vtermno; - struct hv_ops *ops; - int irq_requested; - int irq; - struct list_head next; - struct kref kref; /* ref count & hvc_struct lifetime */ -}; - /* dynamic list of hvc_struct instances */ static LIST_HEAD(hvc_structs); @@ -300,26 +283,12 @@ int hvc_instantiate(uint32_t vtermno, int index, struct hv_ops *ops) } /* Wake the sleeping khvcd */ -static void hvc_kick(void) +void hvc_kick(void) { hvc_kicked = 1; wake_up_process(hvc_task); } -static int hvc_poll(struct hvc_struct *hp); - -/* - * NOTE: This API isn't used if the console adapter doesn't support interrupts. - * In this case the console is poll driven. 
- */ -static irqreturn_t hvc_handle_interrupt(int irq, void *dev_instance) -{ - /* if hvc_poll request a repoll, then kick the hvcd thread */ - if (hvc_poll(dev_instance)) - hvc_kick(); - return IRQ_HANDLED; -} - static void hvc_unthrottle(struct tty_struct *tty) { hvc_kick(); @@ -333,7 +302,6 @@ static int hvc_open(struct tty_struct *tty, struct file * filp) { struct hvc_struct *hp; unsigned long flags; - int irq = 0; int rc = 0; /* Auto increments kref reference if found. */ @@ -352,18 +320,15 @@ static int hvc_open(struct tty_struct *tty, struct file * filp) tty->low_latency = 1; /* Makes flushes to ldisc synchronous. */ hp->tty = tty; - /* Save for request_irq outside of spin_lock. */ - irq = hp->irq; - if (irq) - hp->irq_requested = 1; + + if (hp->ops->notifier_add) + rc = hp->ops->notifier_add(hp, hp->data); spin_unlock_irqrestore(&hp->lock, flags); - /* check error, fallback to non-irq */ - if (irq) - rc = request_irq(irq, hvc_handle_interrupt, IRQF_DISABLED, "hvc_console", hp); + /* - * If the request_irq() fails and we return an error. The tty layer + * If the notifier fails we return an error. The tty layer * will call hvc_close() after a failed open but we don't want to clean * up there so we'll clean up here and clear out the previously set * tty fields and return the kref reference. @@ -371,7 +336,6 @@ static int hvc_open(struct tty_struct *tty, struct file * filp) if (rc) { spin_lock_irqsave(&hp->lock, flags); hp->tty = NULL; - hp->irq_requested = 0; spin_unlock_irqrestore(&hp->lock, flags); tty->driver_data = NULL; kref_put(&hp->kref, destroy_hvc_struct); @@ -386,7 +350,6 @@ static int hvc_open(struct tty_struct *tty, struct file * filp) static void hvc_close(struct tty_struct *tty, struct file * filp) { struct hvc_struct *hp; - int irq = 0; unsigned long flags; if (tty_hung_up_p(filp)) @@ -404,9 +367,8 @@ static void hvc_close(struct tty_struct *tty, struct file * filp) spin_lock_irqsave(&hp->lock, flags); if (--hp->count == 0) { - if (hp->irq_requested) - irq = hp->irq; - hp->irq_requested = 0; + if (hp->ops->notifier_del) + hp->ops->notifier_del(hp, hp->data); /* We are done with the tty pointer now. */ hp->tty = NULL; @@ -418,10 +380,6 @@ static void hvc_close(struct tty_struct *tty, struct file * filp) * waking periodically to check chars_in_buffer(). */ tty_wait_until_sent(tty, HVC_CLOSE_WAIT); - - if (irq) - free_irq(irq, hp); - } else { if (hp->count < 0) printk(KERN_ERR "hvc_close %X: oops, count is %d\n", @@ -436,7 +394,6 @@ static void hvc_hangup(struct tty_struct *tty) { struct hvc_struct *hp = tty->driver_data; unsigned long flags; - int irq = 0; int temp_open_count; if (!hp) @@ -458,13 +415,12 @@ static void hvc_hangup(struct tty_struct *tty) hp->count = 0; hp->n_outbuf = 0; hp->tty = NULL; - if (hp->irq_requested) - /* Saved for use outside of spin_lock. 
*/ - irq = hp->irq; - hp->irq_requested = 0; + + if (hp->ops->notifier_del) + hp->ops->notifier_del(hp, hp->data); + spin_unlock_irqrestore(&hp->lock, flags); - if (irq) - free_irq(irq, hp); + while(temp_open_count) { --temp_open_count; kref_put(&hp->kref, destroy_hvc_struct); @@ -575,7 +531,7 @@ static u32 timeout = MIN_TIMEOUT; #define HVC_POLL_READ 0x00000001 #define HVC_POLL_WRITE 0x00000002 -static int hvc_poll(struct hvc_struct *hp) +int hvc_poll(struct hvc_struct *hp) { struct tty_struct *tty; int i, n, poll_mask = 0; @@ -602,10 +558,10 @@ static int hvc_poll(struct hvc_struct *hp) if (test_bit(TTY_THROTTLED, &tty->flags)) goto throttled; - /* If we aren't interrupt driven and aren't throttled, we always + /* If we aren't notifier driven and aren't throttled, we always * request a reschedule */ - if (hp->irq == 0) + if (!hp->irq_requested) poll_mask |= HVC_POLL_READ; /* Read data if any */ @@ -733,7 +689,7 @@ static const struct tty_operations hvc_ops = { .chars_in_buffer = hvc_chars_in_buffer, }; -struct hvc_struct __devinit *hvc_alloc(uint32_t vtermno, int irq, +struct hvc_struct __devinit *hvc_alloc(uint32_t vtermno, int data, struct hv_ops *ops, int outbuf_size) { struct hvc_struct *hp; @@ -754,7 +710,7 @@ struct hvc_struct __devinit *hvc_alloc(uint32_t vtermno, int irq, memset(hp, 0x00, sizeof(*hp)); hp->vtermno = vtermno; - hp->irq = irq; + hp->data = data; hp->ops = ops; hp->outbuf_size = outbuf_size; hp->outbuf = &((char *)hp)[ALIGN(sizeof(*hp), sizeof(long))]; diff --git a/drivers/char/hvc_console.h b/drivers/char/hvc_console.h index 42ffb17e15df..3c14cea3f468 100644 --- a/drivers/char/hvc_console.h +++ b/drivers/char/hvc_console.h @@ -42,22 +42,40 @@ */ #define HVC_ALLOC_TTY_ADAPTERS 8 +struct hvc_struct { + spinlock_t lock; + int index; + struct tty_struct *tty; + unsigned int count; + int do_wakeup; + char *outbuf; + int outbuf_size; + int n_outbuf; + uint32_t vtermno; + struct hv_ops *ops; + int irq_requested; + int data; + struct list_head next; + struct kref kref; /* ref count & hvc_struct lifetime */ +}; /* implemented by a low level driver */ struct hv_ops { int (*get_chars)(uint32_t vtermno, char *buf, int count); int (*put_chars)(uint32_t vtermno, const char *buf, int count); -}; -struct hvc_struct; + /* Callbacks for notification. 
Called in open and close */ + int (*notifier_add)(struct hvc_struct *hp, int irq); + void (*notifier_del)(struct hvc_struct *hp, int irq); +}; /* Register a vterm and a slot index for use as a console (console_init) */ extern int hvc_instantiate(uint32_t vtermno, int index, struct hv_ops *ops); /* register a vterm for hvc tty operation (module_init or hotplug add) */ -extern struct hvc_struct * __devinit hvc_alloc(uint32_t vtermno, int irq, +extern struct hvc_struct * __devinit hvc_alloc(uint32_t vtermno, int data, struct hv_ops *ops, int outbuf_size); -/* remove a vterm from hvc tty operation (modele_exit or hotplug remove) */ +/* remove a vterm from hvc tty operation (module_exit or hotplug remove) */ extern int __devexit hvc_remove(struct hvc_struct *hp); @@ -70,4 +88,12 @@ static inline int cpus_are_in_xmon(void) } #endif +/* data available */ +int hvc_poll(struct hvc_struct *hp); +void hvc_kick(void); + +/* default notifier for irq based notification */ +extern int notifier_add_irq(struct hvc_struct *hp, int data); +extern void notifier_del_irq(struct hvc_struct *hp, int data); + #endif // HVC_CONSOLE_H diff --git a/drivers/char/hvc_irq.c b/drivers/char/hvc_irq.c new file mode 100644 index 000000000000..92ab1e6e6e0e --- /dev/null +++ b/drivers/char/hvc_irq.c @@ -0,0 +1,45 @@ +/* + * Copyright IBM Corp. 2001,2008 + * + * This file contains the IRQ specific code for hvc_console + * + */ + +#include <linux/interrupt.h> + +#include "hvc_console.h" + +static irqreturn_t hvc_handle_interrupt(int irq, void *dev_instance) +{ + /* if hvc_poll request a repoll, then kick the hvcd thread */ + if (hvc_poll(dev_instance)) + hvc_kick(); + return IRQ_HANDLED; +} + +/* + * For IRQ based systems these callbacks can be used + */ +int notifier_add_irq(struct hvc_struct *hp, int irq) +{ + int rc; + + if (!irq) { + hp->irq_requested = 0; + return 0; + } + rc = request_irq(irq, hvc_handle_interrupt, IRQF_DISABLED, + "hvc_console", hp); + if (!rc) + hp->irq_requested = 1; + return rc; +} + +void notifier_del_irq(struct hvc_struct *hp, int irq) +{ + if (!irq) + return; + free_irq(irq, hp); + hp->irq_requested = 0; +} + diff --git a/drivers/char/hvc_iseries.c b/drivers/char/hvc_iseries.c index a08f8f981c11..b71c610fe5ae 100644 --- a/drivers/char/hvc_iseries.c +++ b/drivers/char/hvc_iseries.c @@ -200,6 +200,8 @@ done: static struct hv_ops hvc_get_put_ops = { .get_chars = get_chars, .put_chars = put_chars, + .notifier_add = notifier_add_irq, + .notifier_del = notifier_del_irq, }; static int __devinit hvc_vio_probe(struct vio_dev *vdev, diff --git a/drivers/char/hvc_vio.c b/drivers/char/hvc_vio.c index 79711aa4b41d..93f3840c1682 100644 --- a/drivers/char/hvc_vio.c +++ b/drivers/char/hvc_vio.c @@ -80,6 +80,8 @@ static int filtered_get_chars(uint32_t vtermno, char *buf, int count) static struct hv_ops hvc_get_put_ops = { .get_chars = filtered_get_chars, .put_chars = hvc_put_chars, + .notifier_add = notifier_add_irq, + .notifier_del = notifier_del_irq, }; static int __devinit hvc_vio_probe(struct vio_dev *vdev, diff --git a/drivers/char/hvc_xen.c b/drivers/char/hvc_xen.c index db2ae4216279..6b70aa66a587 100644 --- a/drivers/char/hvc_xen.c +++ b/drivers/char/hvc_xen.c @@ -100,6 +100,8 @@ static int read_console(uint32_t vtermno, char *buf, int len) static struct hv_ops hvc_ops = { .get_chars = read_console, .put_chars = write_console, + .notifier_add = notifier_add_irq, + .notifier_del = notifier_del_irq, }; static int __init xen_init(void) diff --git a/drivers/char/snsc.c b/drivers/char/snsc.c index 
864080c257ab..41ccfcb1c898 100644 --- a/drivers/char/snsc.c +++ b/drivers/char/snsc.c @@ -161,7 +161,7 @@ scdrv_read(struct file *file, char __user *buf, size_t count, loff_t *f_pos) struct subch_data_s *sd = (struct subch_data_s *) file->private_data; /* try to get control of the read buffer */ - if (down_trylock(&sd->sd_rbs)) { + if (!down_try(&sd->sd_rbs)) { /* somebody else has it now; * if we're non-blocking, then exit... */ @@ -253,7 +253,7 @@ scdrv_write(struct file *file, const char __user *buf, struct subch_data_s *sd = (struct subch_data_s *) file->private_data; /* try to get control of the write buffer */ - if (down_trylock(&sd->sd_wbs)) { + if (!down_try(&sd->sd_wbs)) { /* somebody else has it now; * if we're non-blocking, then exit... */ diff --git a/drivers/char/viotape.c b/drivers/char/viotape.c index e508ad99d11e..49a346022e93 100644 --- a/drivers/char/viotape.c +++ b/drivers/char/viotape.c @@ -361,7 +361,7 @@ static ssize_t viotap_write(struct file *file, const char *buf, * semaphore */ if (noblock) { - if (down_trylock(&reqSem)) { + if (!down_try(&reqSem)) { ret = -EWOULDBLOCK; goto free_op; } @@ -451,7 +451,7 @@ static ssize_t viotap_read(struct file *file, char *buf, size_t count, * semaphore */ if (noblock) { - if (down_trylock(&reqSem)) { + if (!down_try(&reqSem)) { ret = -EWOULDBLOCK; goto free_op; } diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index dc17fe3a88bc..d0f4eb6fdb7f 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -46,6 +46,9 @@ static char *in, *inbuf; /* The operations for our console. */ static struct hv_ops virtio_cons; +/* The hvc device */ +static struct hvc_struct *hvc; + /*D:310 The put_chars() callback is pretty straightforward. * * We turn the characters into a scatter-gather list, add it to the output @@ -134,6 +137,27 @@ int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int)) return hvc_instantiate(0, 0, &virtio_cons); } +/* + * we support only one console, the hvc struct is a global var + * There is no need to do anything + */ +static int notifier_add_vio(struct hvc_struct *hp, int data) +{ + hp->irq_requested = 1; + return 0; +} + +static void notifier_del_vio(struct hvc_struct *hp, int data) +{ + hp->irq_requested = 0; +} + +static void hvc_handle_input(struct virtqueue *vq) +{ + if (hvc_poll(hvc)) + hvc_kick(); +} + /*D:370 Once we're further in boot, we get probed like any other virtio device. * At this stage we set up the output virtqueue. * @@ -144,7 +168,6 @@ int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int)) static int __devinit virtcons_probe(struct virtio_device *dev) { int err; - struct hvc_struct *hvc; vdev = dev; @@ -158,7 +181,7 @@ static int __devinit virtcons_probe(struct virtio_device *dev) /* Find the input queue. */ /* FIXME: This is why we want to wean off hvc: we do nothing * when input comes in. */ - in_vq = vdev->config->find_vq(vdev, 0, NULL); + in_vq = vdev->config->find_vq(vdev, 0, hvc_handle_input); if (IS_ERR(in_vq)) { err = PTR_ERR(in_vq); goto free; @@ -173,15 +196,18 @@ static int __devinit virtcons_probe(struct virtio_device *dev) /* Start using the new console output. */ virtio_cons.get_chars = get_chars; virtio_cons.put_chars = put_chars; + virtio_cons.notifier_add = notifier_add_vio; + virtio_cons.notifier_del = notifier_del_vio; /* The first argument of hvc_alloc() is the virtual console number, so - * we use zero. 
The second argument is the interrupt number; we - * currently leave this as zero: it would be better not to use the - * hvc mechanism and fix this (FIXME!). + * we use zero. The second argument is the parameter for the + * notification mechanism (like irq number). We currently leave this + * as zero, virtqueues have implicit notifications. * * The third argument is a "struct hv_ops" containing the put_chars() - * and get_chars() pointers. The final argument is the output buffer - * size: we can do any size, so we put PAGE_SIZE here. */ + * get_chars(), notifier_add() and notifier_del() pointers. + * The final argument is the output buffer size: we can do any size, + * so we put PAGE_SIZE here. */ hvc = hvc_alloc(0, 0, &virtio_cons, PAGE_SIZE); if (IS_ERR(hvc)) { err = PTR_ERR(hvc); diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index eb58fcf843ac..10c34b82d2de 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -888,7 +888,7 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp) return -ENXIO; if (filp->f_flags & O_NONBLOCK) { - if (down_trylock(&port->sm_sem)) { + if (!down_try(&port->sm_sem)) { ret = -EAGAIN; goto fail; } diff --git a/drivers/input/serio/hil_mlc.c b/drivers/input/serio/hil_mlc.c index 93a1a6ba216a..d1bb7f19d2a3 100644 --- a/drivers/input/serio/hil_mlc.c +++ b/drivers/input/serio/hil_mlc.c @@ -607,7 +607,7 @@ static inline void hilse_setup_input(hil_mlc *mlc, const struct hilse_node *node do_gettimeofday(&(mlc->instart)); mlc->icount = 15; memset(mlc->ipacket, 0, 16 * sizeof(hil_packet)); - BUG_ON(down_trylock(&mlc->isem)); + BUG_ON(!down_try(&mlc->isem)); } #ifdef HIL_MLC_DEBUG @@ -694,7 +694,7 @@ static int hilse_donode(hil_mlc *mlc) out2: write_unlock_irqrestore(&mlc->lock, flags); - if (down_trylock(&mlc->osem)) { + if (!down_try(&mlc->osem)) { nextidx = HILSEN_DOZE; break; } diff --git a/drivers/input/serio/hp_sdc_mlc.c b/drivers/input/serio/hp_sdc_mlc.c index 587398f5c9df..8f532bb4f3ec 100644 --- a/drivers/input/serio/hp_sdc_mlc.c +++ b/drivers/input/serio/hp_sdc_mlc.c @@ -148,7 +148,7 @@ static int hp_sdc_mlc_in(hil_mlc *mlc, suseconds_t timeout) priv = mlc->priv; /* Try to down the semaphore */ - if (down_trylock(&mlc->isem)) { + if (!down_try(&mlc->isem)) { struct timeval tv; if (priv->emtestmode) { mlc->ipacket[0] = @@ -186,13 +186,13 @@ static int hp_sdc_mlc_cts(hil_mlc *mlc) priv = mlc->priv; /* Try to down the semaphores -- they should be up. */ - BUG_ON(down_trylock(&mlc->isem)); - BUG_ON(down_trylock(&mlc->osem)); + BUG_ON(!down_try(&mlc->isem)); + BUG_ON(!down_try(&mlc->osem)); up(&mlc->isem); up(&mlc->osem); - if (down_trylock(&mlc->csem)) { + if (!down_try(&mlc->csem)) { if (priv->trans.act.semaphore != &mlc->csem) goto poll; else @@ -229,7 +229,7 @@ static void hp_sdc_mlc_out(hil_mlc *mlc) priv = mlc->priv; /* Try to down the semaphore -- it should be up. 
*/ - BUG_ON(down_trylock(&mlc->osem)); + BUG_ON(!down_try(&mlc->osem)); if (mlc->opacket & HIL_DO_ALTER_CTRL) goto do_control; @@ -240,7 +240,7 @@ static void hp_sdc_mlc_out(hil_mlc *mlc) return; } /* Shouldn't be sending commands when loop may be busy */ - BUG_ON(down_trylock(&mlc->csem)); + BUG_ON(!down_try(&mlc->csem)); up(&mlc->csem); priv->trans.actidx = 0; @@ -296,7 +296,7 @@ static void hp_sdc_mlc_out(hil_mlc *mlc) priv->tseq[3] = 0; if (mlc->opacket & HIL_CTRL_APE) { priv->tseq[3] |= HP_SDC_LPC_APE_IPF; - down_trylock(&mlc->csem); + down_try(&mlc->csem); } enqueue: hp_sdc_enqueue_transaction(&priv->trans); diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index 1a8de57289eb..f292de2ad26e 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -95,7 +95,8 @@ static u32 lg_get_features(struct virtio_device *vdev) if (in_features[i / 8] & (1 << (i % 8))) features |= (1 << i); - return features; + /* Vring may want to play with the bits it's offered. */ + return vring_transport_features(features); } static void lg_set_features(struct virtio_device *vdev, u32 features) diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 2e554a4ab337..57ca4004bc6b 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -480,7 +480,7 @@ void __init lguest_arch_host_init(void) * bit on its CPU, depending on the argument (0 == unset). */ on_each_cpu(adjust_pge, (void *)0, 0, 1); /* Turn off the feature in the global feature set. */ - clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); } put_online_cpus(); }; @@ -491,7 +491,7 @@ void __exit lguest_arch_host_fini(void) /* If we had PGE before we started, turn it back on now. */ get_online_cpus(); if (cpu_had_pge) { - set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); + set_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); /* adjust_pge's argument "1" means set PGE. 
*/ on_each_cpu(adjust_pge, (void *)1, 0, 1); } diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index ff05fe893083..137f0241aed0 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -587,7 +587,7 @@ static void rh_recovery_prepare(struct region_hash *rh) /* Extra reference to avoid race with rh_stop_recovery */ atomic_inc(&rh->recovery_in_flight); - while (!down_trylock(&rh->recovery_count)) { + while (down_try(&rh->recovery_count)) { atomic_inc(&rh->recovery_in_flight); if (__rh_recovery_prepare(rh) <= 0) { atomic_dec(&rh->recovery_in_flight); diff --git a/drivers/net/3c527.c b/drivers/net/3c527.c index 6aca0c640f13..9f5786326efa 100644 --- a/drivers/net/3c527.c +++ b/drivers/net/3c527.c @@ -576,7 +576,7 @@ static int mc32_command_nowait(struct net_device *dev, u16 cmd, void *data, int int ioaddr = dev->base_addr; int ret = -1; - if (down_trylock(&lp->cmd_mutex) == 0) + if (down_try(&lp->cmd_mutex)) { lp->cmd_nonblocking=1; lp->exec_box->mbox=0; diff --git a/drivers/net/irda/sir_dev.c b/drivers/net/irda/sir_dev.c index 6078e03de9a8..894aade319b6 100644 --- a/drivers/net/irda/sir_dev.c +++ b/drivers/net/irda/sir_dev.c @@ -286,7 +286,7 @@ int sirdev_schedule_request(struct sir_dev *dev, int initial_state, unsigned par IRDA_DEBUG(2, "%s - state=0x%04x / param=%u\n", __FUNCTION__, initial_state, param); - if (down_trylock(&fsm->sem)) { + if (!down_try(&fsm->sem)) { if (in_interrupt() || in_atomic() || irqs_disabled()) { IRDA_DEBUG(1, "%s(), state machine busy!\n", __FUNCTION__); return -EWOULDBLOCK; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 7ab94c825b57..479ea398f3e5 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -63,6 +63,7 @@ #include <linux/if_tun.h> #include <linux/crc32.h> #include <linux/nsproxy.h> +#include <linux/virtio_net.h> #include <net/net_namespace.h> #include <net/netns/generic.h> @@ -283,6 +284,7 @@ static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) }; struct sk_buff *skb; size_t len = count, align = 0; + struct virtio_net_hdr gso = { 0 }; if (!(tun->flags & TUN_NO_PI)) { if ((len -= sizeof(pi)) > count) @@ -292,6 +294,17 @@ static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, return -EFAULT; } + if (tun->flags & TUN_VNET_HDR) { + if ((len -= sizeof(gso)) > count) + return -EINVAL; + + if (gso.hdr_len > len) + return -EINVAL; + + if (memcpy_fromiovec((void *)&gso, iv, sizeof(gso))) + return -EFAULT; + } + if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) { align = NET_IP_ALIGN; if (unlikely(len < ETH_HLEN)) @@ -337,9 +350,46 @@ static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, break; }; - if (tun->flags & TUN_NOCHECKSUM) + if (gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { + if (!skb_partial_csum_set(skb, gso.csum_start, + gso.csum_offset)) { + tun->dev->stats.rx_dropped++; + kfree_skb(skb); + return -EINVAL; + } + } else if (tun->flags & TUN_NOCHECKSUM) skb->ip_summed = CHECKSUM_UNNECESSARY; + if (gso.gso_type != VIRTIO_NET_HDR_GSO_NONE) { + pr_debug("GSO!\n"); + switch (gso.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { + case VIRTIO_NET_HDR_GSO_TCPV4: + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; + break; + case VIRTIO_NET_HDR_GSO_TCPV6: + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; + break; + default: + tun->dev->stats.rx_dropped++; + kfree_skb(skb); + return -EINVAL; + } + + if (gso.gso_type & VIRTIO_NET_HDR_GSO_ECN) + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + + skb_shinfo(skb)->gso_size = 
gso.gso_size; + if (skb_shinfo(skb)->gso_size == 0) { + tun->dev->stats.rx_dropped++; + kfree_skb(skb); + return -EINVAL; + } + + /* Header must be checked, and gso_segs computed. */ + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; + skb_shinfo(skb)->gso_segs = 0; + } + netif_rx_ni(skb); tun->dev->last_rx = jiffies; @@ -384,6 +434,39 @@ static __inline__ ssize_t tun_put_user(struct tun_struct *tun, total += sizeof(pi); } + if (tun->flags & TUN_VNET_HDR) { + struct virtio_net_hdr gso = { 0 }; /* no info leak */ + if ((len -= sizeof(gso)) < 0) + return -EINVAL; + + if (skb_is_gso(skb)) { + struct skb_shared_info *sinfo = skb_shinfo(skb); + + /* This is a hint as to how much should be linear. */ + gso.hdr_len = skb_headlen(skb); + gso.gso_size = sinfo->gso_size; + if (sinfo->gso_type & SKB_GSO_TCPV4) + gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; + else if (sinfo->gso_type & SKB_GSO_TCPV6) + gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6; + else + BUG(); + if (sinfo->gso_type & SKB_GSO_TCP_ECN) + gso.gso_type |= VIRTIO_NET_HDR_GSO_ECN; + } else + gso.gso_type = VIRTIO_NET_HDR_GSO_NONE; + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + gso.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + gso.csum_start = skb->csum_start - skb_headroom(skb); + gso.csum_offset = skb->csum_offset; + } /* else everything is zero */ + + if (unlikely(memcpy_toiovec(iv, (void *)&gso, sizeof(gso)))) + return -EFAULT; + total += sizeof(gso); + } + len = min_t(int, skb->len, len); skb_copy_datagram_iovec(skb, 0, iv, len); @@ -598,6 +681,11 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) else tun->flags &= ~TUN_ONE_QUEUE; + if (ifr->ifr_flags & IFF_VNET_HDR) + tun->flags |= TUN_VNET_HDR; + else + tun->flags &= ~TUN_VNET_HDR; + file->private_data = tun; tun->attached = 1; get_net(dev_net(tun->dev)); @@ -611,6 +699,46 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) return err; } +/* This is like a cut-down ethtool ops, except done via tun fd so no + * privs required. */ +static int set_features(struct net_device *dev, unsigned long arg) +{ + unsigned int old_features, features; + + old_features = dev->features; + /* Unset features, set them as we chew on the arg. */ + features = (old_features & ~(NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST + |NETIF_F_TSO_ECN|NETIF_F_TSO|NETIF_F_TSO6)); + + if (arg & TUN_F_CSUM) { + features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST; + arg &= ~TUN_F_CSUM; + + if (arg & (TUN_F_TSO4|TUN_F_TSO6)) { + if (arg & TUN_F_TSO_ECN) { + features |= NETIF_F_TSO_ECN; + arg &= ~TUN_F_TSO_ECN; + } + if (arg & TUN_F_TSO4) + features |= NETIF_F_TSO; + if (arg & TUN_F_TSO6) + features |= NETIF_F_TSO6; + arg &= ~(TUN_F_TSO4|TUN_F_TSO6); + } + } + + /* This gives the user a way to test for new features in future by + * trying to set them. 
*/ + if (arg) + return -EINVAL; + + dev->features = features; + if (old_features != dev->features) + netdev_features_change(dev); + + return 0; +} + static int tun_chr_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { @@ -707,6 +835,15 @@ static int tun_chr_ioctl(struct inode *inode, struct file *file, break; #endif + case TUNSETFEATURES: + { + int ret; + rtnl_lock(); + ret = set_features(tun->dev, arg); + rtnl_unlock(); + return ret; + } + case SIOCGIFFLAGS: ifr.ifr_flags = tun->if_flags; if (copy_to_user( argp, &ifr, sizeof ifr)) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 4452306d5328..bd67d915bad7 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -19,6 +19,7 @@ //#define DEBUG #include <linux/netdevice.h> #include <linux/etherdevice.h> +#include <linux/ethtool.h> #include <linux/module.h> #include <linux/virtio.h> #include <linux/virtio_net.h> @@ -57,6 +58,9 @@ struct virtnet_info /* Receive & send queues. */ struct sk_buff_head recv; struct sk_buff_head send; + + /* Chain pages by the private ptr. */ + struct page *pages; }; static inline struct virtio_net_hdr *skb_vnet_hdr(struct sk_buff *skb) @@ -69,6 +73,23 @@ static inline void vnet_hdr_to_sg(struct scatterlist *sg, struct sk_buff *skb) sg_init_one(sg, skb_vnet_hdr(skb), sizeof(struct virtio_net_hdr)); } +static void give_a_page(struct virtnet_info *vi, struct page *page) +{ + page->private = (unsigned long)vi->pages; + vi->pages = page; +} + +static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask) +{ + struct page *p = vi->pages; + + if (p) + vi->pages = (struct page *)p->private; + else + p = alloc_page(gfp_mask); + return p; +} + static void skb_xmit_done(struct virtqueue *svq) { struct virtnet_info *vi = svq->vdev->priv; @@ -88,6 +109,7 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb, unsigned len) { struct virtio_net_hdr *hdr = skb_vnet_hdr(skb); + int err; if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) { pr_debug("%s: short packet %i\n", dev->name, len); @@ -95,10 +117,23 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb, goto drop; } len -= sizeof(struct virtio_net_hdr); - BUG_ON(len > MAX_PACKET_LEN); - skb_trim(skb, len); + if (len <= MAX_PACKET_LEN) { + unsigned int i; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + give_a_page(dev->priv, skb_shinfo(skb)->frags[i].page); + skb->data_len = 0; + skb_shinfo(skb)->nr_frags = 0; + } + err = pskb_trim(skb, len); + if (err) { + pr_debug("%s: pskb_trim failed %i %d\n", dev->name, len, err); + dev->stats.rx_dropped++; + goto drop; + } + skb->truesize += skb->data_len; dev->stats.rx_bytes += skb->len; dev->stats.rx_packets++; @@ -160,7 +195,7 @@ static void try_fill_recv(struct virtnet_info *vi) { struct sk_buff *skb; struct scatterlist sg[2+MAX_SKB_FRAGS]; - int num, err; + int num, err, i; sg_init_table(sg, 2+MAX_SKB_FRAGS); for (;;) { @@ -170,6 +205,24 @@ static void try_fill_recv(struct virtnet_info *vi) skb_put(skb, MAX_PACKET_LEN); vnet_hdr_to_sg(sg, skb); + + if (vi->dev->features & NETIF_F_LRO) { + for (i = 0; i < MAX_SKB_FRAGS; i++) { + skb_frag_t *f = &skb_shinfo(skb)->frags[i]; + f->page = get_a_page(vi, GFP_ATOMIC); + if (!f->page) + break; + + f->page_offset = 0; + f->size = PAGE_SIZE; + + skb->data_len += PAGE_SIZE; + skb->len += PAGE_SIZE; + + skb_shinfo(skb)->nr_frags++; + } + } + num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1; skb_queue_head(&vi->recv, skb); @@ -335,16 +388,11 @@ again: 
free_old_xmit_skbs(vi); /* If we has a buffer left over from last time, send it now. */ - if (unlikely(vi->last_xmit_skb)) { - if (xmit_skb(vi, vi->last_xmit_skb) != 0) { - /* Drop this skb: we only queue one. */ - vi->dev->stats.tx_dropped++; - kfree_skb(skb); - skb = NULL; - goto stop_queue; - } - vi->last_xmit_skb = NULL; - } + if (unlikely(vi->last_xmit_skb) && + xmit_skb(vi, vi->last_xmit_skb) != 0) + goto stop_queue; + + vi->last_xmit_skb = NULL; /* Put new one in send queue and do transmit */ if (likely(skb)) { @@ -370,6 +418,11 @@ stop_queue: netif_start_queue(dev); goto again; } + if (skb) { + /* Drop this skb: we only queue one. */ + vi->dev->stats.tx_dropped++; + kfree_skb(skb); + } goto done; } @@ -408,6 +461,22 @@ static int virtnet_close(struct net_device *dev) return 0; } +static int virtnet_set_tx_csum(struct net_device *dev, u32 data) +{ + struct virtnet_info *vi = netdev_priv(dev); + struct virtio_device *vdev = vi->vdev; + + if (data && !virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) + return -ENOSYS; + + return ethtool_op_set_tx_hw_csum(dev, data); +} + +static struct ethtool_ops virtnet_ethtool_ops = { + .set_tx_csum = virtnet_set_tx_csum, + .set_sg = ethtool_op_set_sg, +}; + static int virtnet_probe(struct virtio_device *vdev) { int err; @@ -427,6 +496,7 @@ static int virtnet_probe(struct virtio_device *vdev) #ifdef CONFIG_NET_POLL_CONTROLLER dev->poll_controller = virtnet_netpoll; #endif + SET_ETHTOOL_OPS(dev, &virtnet_ethtool_ops); SET_NETDEV_DEV(dev, &vdev->dev); /* Do we support "hardware" checksums? */ @@ -448,6 +518,12 @@ static int virtnet_probe(struct virtio_device *vdev) dev->features |= NETIF_F_UFO; } + /* If we can receive ANY GSO packets, we must allocate large ones. */ + if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) + || virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) + || virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN)) + dev->features |= NETIF_F_LRO; + /* Configuration may specify what MAC to use. Otherwise random. */ if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) { vdev->config->get(vdev, @@ -462,6 +538,7 @@ static int virtnet_probe(struct virtio_device *vdev) vi->dev = dev; vi->vdev = vdev; vdev->priv = vi; + vi->pages = NULL; /* If they give us a callback when all buffers are done, we don't need * the timer. */ @@ -541,6 +618,10 @@ static void virtnet_remove(struct virtio_device *vdev) vdev->config->del_vq(vi->svq); vdev->config->del_vq(vi->rvq); unregister_netdev(vi->dev); + + while (vi->pages) + __free_pages(get_a_page(vi, GFP_KERNEL), 0); + free_netdev(vi->dev); } @@ -550,9 +631,12 @@ static struct virtio_device_id id_table[] = { }; static unsigned int features[] = { - VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GSO, VIRTIO_NET_F_MAC, + VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, + VIRTIO_NET_F_GSO, VIRTIO_NET_F_MAC, VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, - VIRTIO_NET_F_HOST_ECN, VIRTIO_F_NOTIFY_ON_EMPTY, + VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, + VIRTIO_NET_F_GUEST_ECN, /* We don't yet handle UFO input. 
*/ + VIRTIO_F_NOTIFY_ON_EMPTY, }; static struct virtio_driver virtio_net = { diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c index e30f8b79ea89..100f98a2e8dd 100644 --- a/drivers/net/wireless/airo.c +++ b/drivers/net/wireless/airo.c @@ -2137,7 +2137,7 @@ static int airo_start_xmit(struct sk_buff *skb, struct net_device *dev) { fids[i] |= (len << 16); priv->xmit.skb = skb; priv->xmit.fid = i; - if (down_trylock(&priv->sem) != 0) { + if (!down_try(&priv->sem)) { set_bit(FLAG_PENDING_XMIT, &priv->flags); netif_stop_queue(dev); set_bit(JOB_XMIT, &priv->jobs); @@ -2208,7 +2208,7 @@ static int airo_start_xmit11(struct sk_buff *skb, struct net_device *dev) { fids[i] |= (len << 16); priv->xmit11.skb = skb; priv->xmit11.fid = i; - if (down_trylock(&priv->sem) != 0) { + if (!down_try(&priv->sem)) { set_bit(FLAG_PENDING_XMIT11, &priv->flags); netif_stop_queue(dev); set_bit(JOB_XMIT11, &priv->jobs); @@ -2258,7 +2258,7 @@ static struct net_device_stats *airo_get_stats(struct net_device *dev) if (!test_bit(JOB_STATS, &local->jobs)) { /* Get stats out of the card if available */ - if (down_trylock(&local->sem) != 0) { + if (!down_try(&local->sem)) { set_bit(JOB_STATS, &local->jobs); wake_up_interruptible(&local->thr_wait); } else @@ -2285,7 +2285,7 @@ static void airo_set_multicast_list(struct net_device *dev) { if ((dev->flags ^ ai->flags) & IFF_PROMISC) { change_bit(FLAG_PROMISC, &ai->flags); - if (down_trylock(&ai->sem) != 0) { + if (!down_try(&ai->sem)) { set_bit(JOB_PROMISC, &ai->jobs); wake_up_interruptible(&ai->thr_wait); } else @@ -3213,7 +3213,7 @@ static irqreturn_t airo_interrupt(int irq, void *dev_id) set_bit(FLAG_UPDATE_UNI, &apriv->flags); set_bit(FLAG_UPDATE_MULTI, &apriv->flags); - if (down_trylock(&apriv->sem) != 0) { + if (!down_try(&apriv->sem)) { set_bit(JOB_EVENT, &apriv->jobs); wake_up_interruptible(&apriv->thr_wait); } else @@ -7651,7 +7651,7 @@ static struct iw_statistics *airo_get_wireless_stats(struct net_device *dev) if (!test_bit(JOB_WSTATS, &local->jobs)) { /* Get stats out of the card if available */ - if (down_trylock(&local->sem) != 0) { + if (!down_try(&local->sem)) { set_bit(JOB_WSTATS, &local->jobs); wake_up_interruptible(&local->thr_wait); } else diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c index 3d1160e9e5a4..f707d95ef3db 100644 --- a/drivers/s390/kvm/kvm_virtio.c +++ b/drivers/s390/kvm/kvm_virtio.c @@ -15,6 +15,7 @@ #include <linux/err.h> #include <linux/virtio.h> #include <linux/virtio_config.h> +#include <linux/virtio_console.h> #include <linux/interrupt.h> #include <linux/virtio_ring.h> #include <linux/pfn.h> @@ -332,6 +333,25 @@ static int __init kvm_devices_init(void) return 0; } +/* code for early console output with virtio_console */ +static __init int early_put_chars(u32 vtermno, const char *buf, int count) +{ + char scratch[17]; + unsigned int len = count; + + if (len > sizeof(scratch) - 1) + len = sizeof(scratch) - 1; + scratch[len] = '\0'; + memcpy(scratch, buf, len); + kvm_hypercall1(KVM_S390_VIRTIO_NOTIFY, __pa(scratch)); + return len; +} + +void s390_virtio_console_init(void) +{ + virtio_cons_early_init(early_put_chars); +} + /* * We do this after core stuff, but before the drivers. */ diff --git a/drivers/scsi/aacraid/commsup.c b/drivers/scsi/aacraid/commsup.c index 289304aab690..c011d05caaa6 100644 --- a/drivers/scsi/aacraid/commsup.c +++ b/drivers/scsi/aacraid/commsup.c @@ -490,7 +490,7 @@ int aac_fib_send(u16 command, struct fib *fibptr, unsigned long size, * hardware failure has occurred. 
*/ unsigned long count = 36000000L; /* 3 minutes */ - while (down_trylock(&fibptr->event_wait)) { + while (!down_try(&fibptr->event_wait)) { int blink; if (--count == 0) { struct aac_queue * q = &dev->queues->queue[AdapNormCmdQueue]; diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index 84fcaa6a21ec..f31f0c5d3725 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -477,7 +477,7 @@ int usb_lock_device_for_reset(struct usb_device *udev, } } - while (usb_trylock_device(udev) != 0) { + while (!usb_trylock_device(udev)) { /* If we can't acquire the lock after waiting one second, * we're probably deadlocked */ diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c index dffb249c2a74..fe96b64462cb 100644 --- a/drivers/usb/gadget/inode.c +++ b/drivers/usb/gadget/inode.c @@ -299,7 +299,7 @@ get_ready_ep (unsigned f_flags, struct ep_data *epdata) int val; if (f_flags & O_NONBLOCK) { - if (down_trylock (&epdata->lock) != 0) + if (!down_try (&epdata->lock)) goto nonblock; if (epdata->state != STATE_EP_ENABLED) { up (&epdata->lock); diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 7084e7e146c0..baf103361e3a 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -71,13 +71,6 @@ static int virtio_uevent(struct device *_dv, struct kobj_uevent_env *env) dev->id.device, dev->id.vendor); } -static struct bus_type virtio_bus = { - .name = "virtio", - .match = virtio_dev_match, - .dev_attrs = virtio_dev_attrs, - .uevent = virtio_uevent, -}; - static void add_status(struct virtio_device *dev, unsigned status) { dev->config->set_status(dev, dev->config->get_status(dev) | status); @@ -120,6 +113,11 @@ static int virtio_dev_probe(struct device *_d) set_bit(f, dev->features); } + /* Transport features are always preserved to pass to set_features. */ + for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) + if (device_features & (1 << i)) + set_bit(i, dev->features); + err = drv->probe(dev); if (err) add_status(dev, VIRTIO_CONFIG_S_FAILED); @@ -147,13 +145,20 @@ static int virtio_dev_remove(struct device *_d) return 0; } +static struct bus_type virtio_bus = { + .name = "virtio", + .match = virtio_dev_match, + .dev_attrs = virtio_dev_attrs, + .uevent = virtio_uevent, + .probe = virtio_dev_probe, + .remove = virtio_dev_remove, +}; + int register_virtio_driver(struct virtio_driver *driver) { /* Catch this early. */ BUG_ON(driver->feature_table_size && !driver->feature_table); driver->driver.bus = &virtio_bus; - driver->driver.probe = virtio_dev_probe; - driver->driver.remove = virtio_dev_remove; return driver_register(&driver->driver); } EXPORT_SYMBOL_GPL(register_virtio_driver); diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index eae7236310e4..e0e81e505658 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -88,10 +88,14 @@ static struct virtio_pci_device *to_vp_device(struct virtio_device *vdev) static u32 vp_get_features(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); + u32 features; /* When someone needs more than 32 feature bits, we'll need to * steal a bit to indicate that the rest are somewhere else. */ - return ioread32(vp_dev->ioaddr + VIRTIO_PCI_HOST_FEATURES); + features = ioread32(vp_dev->ioaddr + VIRTIO_PCI_HOST_FEATURES); + + /* Vring may want to play with the bits it's offered. 
*/ + return vring_transport_features(features); } /* virtio config->set_features() implementation */ diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 72bf8bc09014..daf659dc06e6 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -18,6 +18,7 @@ */ #include <linux/virtio.h> #include <linux/virtio_ring.h> +#include <linux/virtio_config.h> #include <linux/device.h> #ifdef DEBUG @@ -52,9 +53,6 @@ struct vring_virtqueue /* Number we've added since last sync. */ unsigned int num_added; - /* Last used index we've seen. */ - u16 last_used_idx; - /* How to notify other side. FIXME: commonalize hcalls! */ void (*notify)(struct virtqueue *vq); @@ -87,8 +85,11 @@ static int vring_add_buf(struct virtqueue *_vq, if (vq->num_free < out + in) { pr_debug("Can't add buf len %i - avail = %i\n", out + in, vq->num_free); - /* We notify *even if* VRING_USED_F_NO_NOTIFY is set here. */ - vq->notify(&vq->vq); + /* FIXME: for historical reasons, we force a notify here if + * there are outgoing parts to the buffer. Presumably the + * host should service the ring ASAP. */ + if (out) + vq->notify(&vq->vq); END_USE(vq); return -ENOSPC; } @@ -173,12 +174,13 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head) static inline bool more_used(const struct vring_virtqueue *vq) { - return vq->last_used_idx != vq->vring.used->idx; + return vring_last_used(&vq->vring) != vq->vring.used->idx; } static void *vring_get_buf(struct virtqueue *_vq, unsigned int *len) { struct vring_virtqueue *vq = to_vvq(_vq); + struct vring_used_elem *u; void *ret; unsigned int i; @@ -195,8 +197,11 @@ static void *vring_get_buf(struct virtqueue *_vq, unsigned int *len) return NULL; } - i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id; - *len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len; + u = &vq->vring.used->ring[vring_last_used(&vq->vring) % vq->vring.num]; + i = u->id; + *len = u->len; + /* Make sure we don't reload i after doing checks. */ + rmb(); if (unlikely(i >= vq->vring.num)) { BAD_RING(vq, "id %u out of range\n", i); @@ -210,7 +215,7 @@ static void *vring_get_buf(struct virtqueue *_vq, unsigned int *len) /* detach_buf clears data, so grab it now. */ ret = vq->data[i]; detach_buf(vq, i); - vq->last_used_idx++; + vring_last_used(&vq->vring)++; END_USE(vq); return ret; } @@ -294,7 +299,6 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, vq->vq.vq_ops = &vring_vq_ops; vq->notify = notify; vq->broken = false; - vq->last_used_idx = 0; vq->num_added = 0; #ifdef DEBUG vq->in_use = false; @@ -320,4 +324,15 @@ void vring_del_virtqueue(struct virtqueue *vq) } EXPORT_SYMBOL_GPL(vring_del_virtqueue); +/* Manipulates transport-specific feature bits. */ +u32 vring_transport_features(u32 features) +{ + u32 mask = ~VIRTIO_TRANSPORT_F_MASK; + + /* We let through any non-transport bits, and the only one we know. 
*/ + mask &= ~(1 << VIRTIO_RING_F_PUBLISH_INDICES); + return features & mask; +} +EXPORT_SYMBOL_GPL(vring_transport_features); + MODULE_LICENSE("GPL"); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 7e9e4c79aec7..64211fc74266 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1062,10 +1062,6 @@ void ocfs2_clear_inode(struct inode *inode) (unsigned long long)oi->ip_blkno); mutex_unlock(&oi->ip_io_mutex); - /* - * down_trylock() returns 0, down_write_trylock() returns 1 - * kernel 1, world 0 - */ mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem), "Clear inode of %llu, alloc_sem is locked\n", (unsigned long long)oi->ip_blkno); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index e396b2fa4743..9ec7d66c24a5 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1412,7 +1412,7 @@ static int flush_journal_list(struct super_block *s, /* if flushall == 0, the lock is already held */ if (flushall) { down(&journal->j_flush_sem); - } else if (!down_trylock(&journal->j_flush_sem)) { + } else if (down_try(&journal->j_flush_sem)) { BUG(); } diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h index 3abe7e9ceb33..7d20f0422c97 100644 --- a/fs/xfs/linux-2.6/sema.h +++ b/fs/xfs/linux-2.6/sema.h @@ -36,17 +36,15 @@ typedef struct semaphore sema_t; static inline int issemalocked(sema_t *sp) { - return down_trylock(sp) || (up(sp), 0); + return !down_try(sp) || (up(sp), 0); } /* - * Map cpsema (try to get the sema) to down_trylock. We need to switch - * the return values since cpsema returns 1 (acquired) 0 (failed) and - * down_trylock returns the reverse 0 (acquired) 1 (failed). + * Map cpsema (try to get the sema) to down_try. */ static inline int cpsema(sema_t *sp) { - return down_trylock(sp) ? 0 : 1; + return down_try(sp); } #endif /* __XFS_SUPPORT_SEMA_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 9cc8f0213095..09e50cdcc3b3 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -538,7 +538,7 @@ found: * if this does not work then we need to drop the * spinlock and do a hard attempt on the semaphore. */ - if (down_trylock(&bp->b_sema)) { + if (!down_try(&bp->b_sema)) { if (!(flags & XBF_TRYLOCK)) { /* wait for buffer ownership */ XB_TRACE(bp, "get_lock", 0); @@ -882,7 +882,7 @@ xfs_buf_cond_lock( { int locked; - locked = down_trylock(&bp->b_sema) == 0; + locked = down_try(&bp->b_sema); if (locked) { XB_SET_OWNER(bp); } diff --git a/include/asm-s390/kvm_virtio.h b/include/asm-s390/kvm_virtio.h index 5c871a990c29..146100224def 100644 --- a/include/asm-s390/kvm_virtio.h +++ b/include/asm-s390/kvm_virtio.h @@ -50,4 +50,14 @@ struct kvm_vqconfig { #define KVM_S390_VIRTIO_RESET 1 #define KVM_S390_VIRTIO_SET_STATUS 2 +#ifdef __KERNEL__ +/* early virtio console setup */ +#ifdef CONFIG_VIRTIO_CONSOLE +extern void s390_virtio_console_init(void); +#else +static inline void s390_virtio_console_init(void) +{ +} +#endif /* CONFIG_VIRTIO_CONSOLE */ +#endif /* __KERNEL__ */ #endif diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 5c8351b859f0..9bf1cf1563ea 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -61,3 +61,22 @@ #define noinline __attribute__((noinline)) #define __attribute_const__ __attribute__((__const__)) #define __maybe_unused __attribute__((unused)) + +/** + * cast_if_type - allow an alternate type + * @expr: the expression to optionally cast + * @oktype: the type to allow. + * @desttype: the type to cast to. 
+ * + * This is used to accept a particular alternate type for an expression: + * because any other types will not be cast, they will cause a warning as + * normal. + * + * Note that the unnecessary trinary forces functions to devolve into + * function pointers as users expect, but means @expr must be a pointer or + * integer. + */ +#define cast_if_type(expr, oktype, desttype) \ + __builtin_choose_expr(__builtin_types_compatible_p(typeof(1?(expr):0),\ + oktype), \ + (desttype)(expr), (expr)) diff --git a/include/linux/compiler-intel.h b/include/linux/compiler-intel.h index d8e636e5607d..7e704e6c8a5d 100644 --- a/include/linux/compiler-intel.h +++ b/include/linux/compiler-intel.h @@ -29,3 +29,5 @@ #endif #define uninitialized_var(x) x + +#define cast_if_type(expr, oktype, desttype) ((desttype)(expr)) diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index 18f31b6187a3..24a1213537fb 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -31,6 +31,7 @@ #define TUN_NO_PI 0x0040 #define TUN_ONE_QUEUE 0x0080 #define TUN_PERSIST 0x0100 +#define TUN_VNET_HDR 0x0200 /* Ioctl defines */ #define TUNSETNOCSUM _IOW('T', 200, int) @@ -40,12 +41,20 @@ #define TUNSETOWNER _IOW('T', 204, int) #define TUNSETLINK _IOW('T', 205, int) #define TUNSETGROUP _IOW('T', 206, int) +#define TUNSETFEATURES _IOW('T', 207, unsigned int) /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 #define IFF_TAP 0x0002 #define IFF_NO_PI 0x1000 #define IFF_ONE_QUEUE 0x2000 +#define IFF_VNET_HDR 0x4000 + +/* Features for GSO (TUNSETFEATURES). */ +#define TUN_F_CSUM 0x01 /* You can hand me unchecksummed packets. */ +#define TUN_F_TSO4 0x02 /* I can handle TSO for IPv4 packets */ +#define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */ +#define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. */ struct tun_pi { unsigned short flags; diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 4cb8d3df414e..91d375ca15bc 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -500,4 +500,39 @@ struct sysinfo { #define NUMA_BUILD 0 #endif +/* If fn is of type ok1 or ok2, cast to desttype */ +#define __typesafe_cb(desttype, fn, ok1, ok2) \ + cast_if_type(cast_if_type((fn), ok1, desttype), ok2, desttype) + +/** + * typesafe_cb - cast a callback function if it matches the arg + * @rettype: the return type of the callback function + * @fn: the callback function to cast + * @arg: the (pointer) argument to hand to the callback function. + * + * If a callback function takes a single argument, this macro does + * appropriate casts to a function which takes a single void * argument if the + * callback provided matches the @arg (or a const or volatile version). + * + * It is assumed that @arg is of pointer type: usually @arg is passed + * or assigned to a void * elsewhere anyway. + */ +#define typesafe_cb(rettype, fn, arg) \ + __typesafe_cb(rettype (*)(void *), (fn), \ + rettype (*)(const typeof(arg)), \ + rettype (*)(typeof(arg))) + +/** + * typesafe_cb_preargs - cast a callback function if it matches the arg + * @rettype: the return type of the callback function + * @fn: the callback function to cast + * @arg: the (pointer) argument to hand to the callback function. + * + * This is a version of typesafe_cb() for callbacks that take other arguments + * before the @arg. + */ +#define typesafe_cb_preargs(rettype, fn, arg, ...) 
\ + __typesafe_cb(rettype (*)(__VA_ARGS__, void *), (fn), \ + rettype (*)(__VA_ARGS__, const typeof(arg)), \ + rettype (*)(__VA_ARGS__, typeof(arg))) #endif diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 00dd957e245b..3152c1ef1d08 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -4,9 +4,32 @@ #include <linux/err.h> #include <linux/sched.h> -struct task_struct *kthread_create(int (*threadfn)(void *data), - void *data, - const char namefmt[], ...); +/** + * kthread_create - create a kthread. + * @threadfn: the function to run until signal_pending(current). + * @data: data ptr for @threadfn. + * @namefmt: printf-style name for the thread. + * + * Description: This helper function creates and names a kernel + * thread. The thread will be stopped: use wake_up_process() to start + * it. See also kthread_run(), kthread_create_on_cpu(). + * + * When woken, the thread will run @threadfn() with @data as its + * argument. @threadfn() can either call do_exit() directly if it is a + * standalone thread for which noone will call kthread_stop(), or + * return when 'kthread_should_stop()' is true (which means + * kthread_stop() has been called). The return value should be zero + * or a negative error number; it will be passed to kthread_stop(). + * + * Returns a task_struct or ERR_PTR(-ENOMEM). + */ +#define kthread_create(threadfn, data, namefmt...) \ + __kthread_create(typesafe_cb(int,(threadfn),(data)), (data), namefmt) + +struct task_struct *__kthread_create(int (*threadfn)(void *data), + void *data, + const char namefmt[], ...) + __attribute__((format(printf, 3, 4))); /** * kthread_run - create and wake a thread. diff --git a/include/linux/module.h b/include/linux/module.h index 3e03b1acbc94..fce15ebd0e1c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -249,27 +249,30 @@ struct module /* Exported symbols */ const struct kernel_symbol *syms; - unsigned int num_syms; const unsigned long *crcs; + unsigned int num_syms; /* GPL-only exported symbols. */ - const struct kernel_symbol *gpl_syms; unsigned int num_gpl_syms; + const struct kernel_symbol *gpl_syms; const unsigned long *gpl_crcs; +#ifdef CONFIG_UNUSED_SYMBOLS /* unused exported symbols. */ const struct kernel_symbol *unused_syms; - unsigned int num_unused_syms; const unsigned long *unused_crcs; + unsigned int num_unused_syms; + /* GPL-only, unused exported symbols. */ - const struct kernel_symbol *unused_gpl_syms; unsigned int num_unused_gpl_syms; + const struct kernel_symbol *unused_gpl_syms; const unsigned long *unused_gpl_crcs; +#endif /* symbols that will be GPL-only in the near future. */ const struct kernel_symbol *gpl_future_syms; - unsigned int num_gpl_future_syms; const unsigned long *gpl_future_crcs; + unsigned int num_gpl_future_syms; /* Exception table */ unsigned int num_exentries; @@ -285,10 +288,10 @@ struct module void *module_core; /* Here are the sizes of the init and core sections */ - unsigned long init_size, core_size; + unsigned int init_size, core_size; /* The size of the executable code in each section. */ - unsigned long init_text_size, core_text_size; + unsigned int init_text_size, core_text_size; /* The handle returned from unwind_add_table. 
*/ void *unwind_info; @@ -300,29 +303,15 @@ struct module #ifdef CONFIG_GENERIC_BUG /* Support for BUG */ + unsigned num_bugs; struct list_head bug_list; struct bug_entry *bug_table; - unsigned num_bugs; -#endif - -#ifdef CONFIG_MODULE_UNLOAD - /* Reference counts */ - struct module_ref ref[NR_CPUS]; - - /* What modules depend on me? */ - struct list_head modules_which_use_me; - - /* Who is waiting for us to be unloaded */ - struct task_struct *waiter; - - /* Destruction function. */ - void (*exit)(void); #endif #ifdef CONFIG_KALLSYMS /* We keep the symbol and string tables for kallsyms. */ Elf_Sym *symtab; - unsigned long num_symtab; + unsigned int num_symtab; char *strtab; /* Section attributes */ @@ -342,6 +331,21 @@ struct module struct marker *markers; unsigned int num_markers; #endif + +#ifdef CONFIG_MODULE_UNLOAD + /* What modules depend on me? */ + struct list_head modules_which_use_me; + + /* Who is waiting for us to be unloaded */ + struct task_struct *waiter; + + /* Destruction function. */ + void (*exit)(void); + + /* Reference counts */ + struct module_ref ref[NR_CPUS]; +#endif + }; #ifndef MODULE_ARCH_INIT #define MODULE_ARCH_INIT {} diff --git a/include/linux/mutex.h b/include/linux/mutex.h index bc6da10ceee0..c1f5b3f9fe2d 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -141,10 +141,6 @@ extern int __must_check mutex_lock_killable(struct mutex *lock); # define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock) #endif -/* - * NOTE: mutex_trylock() follows the spin_trylock() convention, - * not the down_trylock() convention! - */ extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h index 9cae64b00d6b..d245e6fade84 100644 --- a/include/linux/semaphore.h +++ b/include/linux/semaphore.h @@ -44,8 +44,12 @@ static inline void sema_init(struct semaphore *sem, int val) extern void down(struct semaphore *sem); extern int __must_check down_interruptible(struct semaphore *sem); extern int __must_check down_killable(struct semaphore *sem); -extern int __must_check down_trylock(struct semaphore *sem); +extern int __must_check down_try(struct semaphore *sem); +/* Old down_trylock() returned the opposite of what was expected. */ +static inline int __deprecated down_trylock(struct semaphore *sem) +{ + return !down_try(sem); +} extern int __must_check down_timeout(struct semaphore *sem, long jiffies); extern void up(struct semaphore *sem); - #endif /* __LINUX_SEMAPHORE_H */ diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 5bfc553bdb21..23c1b0d0f020 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -5,14 +5,18 @@ (and more). So the "read" side to such a lock is anything which diables preeempt. */ #include <linux/cpu.h> +#include <linux/compiler.h> #include <asm/system.h> -#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) +#define ALL_CPUS ~0U + /** * stop_machine_run: freeze the machine on all CPUs and run this function * @fn: the function to run * @data: the data ptr for the @fn() - * @cpu: the cpu to run @fn() on (or any, if @cpu == NR_CPUS. + * @cpu: if @cpu == n, run @fn() on cpu n + * if @cpu == NR_CPUS, run @fn() on any cpu + * if @cpu == ALL_CPUS, run @fn() on every online CPU. 
* * Description: This causes a thread to be scheduled on every other cpu, * each of which disables interrupts, and finally interrupts are disabled @@ -21,7 +25,11 @@ * * This can be thought of as a very heavy write lock, equivalent to * grabbing every spinlock in the kernel. */ -int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu); +#define stop_machine_run(fn, data, cpu) \ + stop_machine_run_notype(typesafe_cb(int, (fn), (data)), (data), (cpu)) + +#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) +int stop_machine_run_notype(int (*fn)(void *), void *data, unsigned int cpu); /** * __stop_machine_run: freeze the machine on all CPUs and run this function @@ -29,17 +37,14 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu); * @data: the data ptr for the @fn * @cpu: the cpu to run @fn on (or any, if @cpu == NR_CPUS. * - * Description: This is a special version of the above, which returns the - * thread which has run @fn(): kthread_stop will return the return value - * of @fn(). Used by hotplug cpu. + * Description: This is a special version of the above, which assumes cpus + * won't come or go while it's being called. Used by hotplug cpu. */ -struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, - unsigned int cpu); - +int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu); #else -static inline int stop_machine_run(int (*fn)(void *), void *data, - unsigned int cpu) +static inline int stop_machine_run_notype(int (*fn)(void *), void *data, + unsigned int cpu) { int ret; local_irq_disable(); diff --git a/include/linux/timer.h b/include/linux/timer.h index d4ba79248a27..1baf40162b8c 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -25,12 +25,22 @@ struct timer_list { extern struct tvec_base boot_tvec_bases; -#define TIMER_INITIALIZER(_function, _expires, _data) { \ - .entry = { .prev = TIMER_ENTRY_STATIC }, \ - .function = (_function), \ - .expires = (_expires), \ - .data = (_data), \ - .base = &boot_tvec_bases, \ +/* + * For historic reasons the timer function takes an unsigned long, so + * we use this variant of typesafe_cb. data is converted to an unsigned long + * if it is another integer type, by adding 0UL. 
+ */ +#define typesafe_timerfn(fn, data) \ + __typesafe_cb(void (*)(unsigned long), (fn), \ + void (*)(const typeof((data)+0UL)), \ + void (*)(typeof((data)+0UL))) + +#define TIMER_INITIALIZER(_function, _expires, _data) { \ + .entry = { .prev = TIMER_ENTRY_STATIC }, \ + .function = typesafe_timerfn((_function), (_data)), \ + .expires = (_expires), \ + .data = (unsigned long)(_data), \ + .base = &boot_tvec_bases, \ } #define DEFINE_TIMER(_name, _function, _expires, _data) \ @@ -51,9 +61,13 @@ static inline void init_timer_on_stack(struct timer_list *timer) } #endif -static inline void setup_timer(struct timer_list * timer, - void (*function)(unsigned long), - unsigned long data) +#define setup_timer(timer, function, data) \ + __setup_timer((timer), typesafe_timerfn((function), (data)), \ + (unsigned long)(data)) + +static inline void __setup_timer(struct timer_list *timer, + void (*function)(unsigned long), + unsigned long data) { timer->function = function; timer->data = data; diff --git a/include/linux/usb.h b/include/linux/usb.h index c4e5be9a243f..9d5b6c60ab4d 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -493,7 +493,7 @@ extern void usb_put_dev(struct usb_device *dev); /* USB device locking */ #define usb_lock_device(udev) down(&(udev)->dev.sem) #define usb_unlock_device(udev) up(&(udev)->dev.sem) -#define usb_trylock_device(udev) down_trylock(&(udev)->dev.sem) +#define usb_trylock_device(udev) down_try(&(udev)->dev.sem) extern int usb_lock_device_for_reset(struct usb_device *udev, const struct usb_interface *iface); diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h index 5f79a5f9de79..52b593213f77 100644 --- a/include/linux/virtio_blk.h +++ b/include/linux/virtio_blk.h @@ -11,6 +11,7 @@ #define VIRTIO_BLK_F_SEG_MAX 2 /* Indicates maximum # of segments */ #define VIRTIO_BLK_F_GEOMETRY 4 /* Legacy geometry available */ #define VIRTIO_BLK_F_RO 5 /* Disk is read-only */ +#define VIRTIO_BLK_F_BLK_SIZE 6 /* Block size of disk is available*/ struct virtio_blk_config { @@ -26,6 +27,8 @@ struct virtio_blk_config __u8 heads; __u8 sectors; } geometry; + /* block size of device (if VIRTIO_BLK_F_BLK_SIZE) */ + __u32 blk_size; } __attribute__((packed)); /* These two define direction. */ diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index f364bbf63c34..59a65e33cfa2 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -15,6 +15,13 @@ /* We've given up on this device. */ #define VIRTIO_CONFIG_S_FAILED 0x80 +/* Some virtio feature bits (currently bits 28 through 31) are reserved for the + * transport being used (eg. virtio_ring), the rest are per-device feature + * bits. */ +#define VIRTIO_TRANSPORT_F_START 28 +#define VIRTIO_TRANSPORT_F_END 32 +#define VIRTIO_TRANSPORT_F_MASK 0xF0000000 + /* Do we get callbacks when the ring is completely used, even if we've * suppressed them? */ #define VIRTIO_F_NOTIFY_ON_EMPTY 24 diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index abe481ed990e..65fae97ccc69 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -24,6 +24,9 @@ * optimization. */ #define VRING_AVAIL_F_NO_INTERRUPT 1 +/* We publish our last-seen used index at the end of the avail ring. */ +#define VIRTIO_RING_F_PUBLISH_INDICES 28 + /* Virtio ring descriptors: 16 bytes. These can chain together via "next". 
*/ struct vring_desc { @@ -82,6 +85,7 @@ struct vring { * __u16 avail_flags; * __u16 avail_idx; * __u16 available[num]; + * __u16 last_used_idx; * * // Padding to the next page boundary. * char pad[]; @@ -90,6 +94,7 @@ struct vring { * __u16 used_flags; * __u16 used_idx; * struct vring_used_elem used[num]; + * __u16 last_avail_idx; * }; */ static inline void vring_init(struct vring *vr, unsigned int num, void *p, @@ -106,9 +111,14 @@ static inline unsigned vring_size(unsigned int num, unsigned long pagesize) { return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (2 + num) + pagesize - 1) & ~(pagesize - 1)) - + sizeof(__u16) * 2 + sizeof(struct vring_used_elem) * num; + + sizeof(__u16) * 2 + sizeof(struct vring_used_elem) * num + 2; } +/* We publish the last-seen used index at the end of the available ring, and + * vice-versa. These are at the end for backwards compatibility. */ +#define vring_last_used(vr) ((vr)->avail->ring[(vr)->num]) +#define vring_last_avail(vr) (*(__u16 *)&(vr)->used->ring[(vr)->num]) + #ifdef __KERNEL__ #include <linux/irqreturn.h> struct virtio_device; @@ -121,6 +131,9 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, void (*callback)(struct virtqueue *vq)); void vring_del_virtqueue(struct virtqueue *vq); +/* Filter out unsupported transport-specific feature bits. */ +u32 vring_transport_features(u32 features); + irqreturn_t vring_interrupt(int irq, void *_vq); #endif /* __KERNEL__ */ #endif /* _LINUX_VIRTIO_RING_H */ diff --git a/init/Kconfig b/init/Kconfig index 6199d1120900..c8578f9ee31d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -856,8 +856,8 @@ config MODULE_UNLOAD help Without this option you will not be able to unload any modules (note that some modules may not be unloadable - anyway), which makes your kernel slightly smaller and - simpler. If unsure, say Y. + anyway), which makes your kernel smaller, faster + and simpler. If unsure, say Y. config MODULE_FORCE_UNLOAD bool "Forced module unloading" diff --git a/kernel/cpu.c b/kernel/cpu.c index d26d0b095b3b..472149ca920e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -214,7 +214,6 @@ static int __ref take_cpu_down(void *_param) static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; - struct task_struct *p; cpumask_t old_allowed, tmp; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; @@ -248,19 +247,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpu_clear(cpu, tmp); set_cpus_allowed_ptr(current, &tmp); - p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); + err = __stop_machine_run(take_cpu_down, &tcd_param, cpu); - if (IS_ERR(p) || cpu_online(cpu)) { + if (err || cpu_online(cpu)) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu) == NOTIFY_BAD) BUG(); - if (IS_ERR(p)) { - err = PTR_ERR(p); - goto out_allowed; - } - goto out_thread; + goto out_allowed; } /* Wait for it to sleep (leaving idle task). */ @@ -277,8 +272,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) check_for_tasks(cpu); -out_thread: - err = kthread_stop(p); out_allowed: set_cpus_allowed_ptr(current, &old_allowed); out_release: diff --git a/kernel/kthread.c b/kernel/kthread.c index ac3fb7326641..ae64f808eebb 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -111,29 +111,10 @@ static void create_kthread(struct kthread_create_info *create) complete(&create->done); } -/** - * kthread_create - create a kthread. 
- * @threadfn: the function to run until signal_pending(current). - * @data: data ptr for @threadfn. - * @namefmt: printf-style name for the thread. - * - * Description: This helper function creates and names a kernel - * thread. The thread will be stopped: use wake_up_process() to start - * it. See also kthread_run(), kthread_create_on_cpu(). - * - * When woken, the thread will run @threadfn() with @data as its - * argument. @threadfn() can either call do_exit() directly if it is a - * standalone thread for which noone will call kthread_stop(), or - * return when 'kthread_should_stop()' is true (which means - * kthread_stop() has been called). The return value should be zero - * or a negative error number; it will be passed to kthread_stop(). - * - * Returns a task_struct or ERR_PTR(-ENOMEM). - */ -struct task_struct *kthread_create(int (*threadfn)(void *data), - void *data, - const char namefmt[], - ...) +struct task_struct *__kthread_create(int (*threadfn)(void *data), + void *data, + const char namefmt[], + ...) { struct kthread_create_info create; @@ -158,7 +139,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), } return create.result; } -EXPORT_SYMBOL(kthread_create); +EXPORT_SYMBOL(__kthread_create); /** * kthread_bind - bind a just-created kthread to a cpu. diff --git a/kernel/module.c b/kernel/module.c index f525015dd65d..a4bdd77a3ea9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -135,17 +135,19 @@ extern const struct kernel_symbol __start___ksymtab_gpl[]; extern const struct kernel_symbol __stop___ksymtab_gpl[]; extern const struct kernel_symbol __start___ksymtab_gpl_future[]; extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; -extern const struct kernel_symbol __start___ksymtab_unused[]; -extern const struct kernel_symbol __stop___ksymtab_unused[]; -extern const struct kernel_symbol __start___ksymtab_unused_gpl[]; -extern const struct kernel_symbol __stop___ksymtab_unused_gpl[]; extern const struct kernel_symbol __start___ksymtab_gpl_future[]; extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; extern const unsigned long __start___kcrctab[]; extern const unsigned long __start___kcrctab_gpl[]; extern const unsigned long __start___kcrctab_gpl_future[]; +#ifdef CONFIG_UNUSED_SYMBOLS +extern const struct kernel_symbol __start___ksymtab_unused[]; +extern const struct kernel_symbol __stop___ksymtab_unused[]; +extern const struct kernel_symbol __start___ksymtab_unused_gpl[]; +extern const struct kernel_symbol __stop___ksymtab_unused_gpl[]; extern const unsigned long __start___kcrctab_unused[]; extern const unsigned long __start___kcrctab_unused_gpl[]; +#endif #ifndef CONFIG_MODVERSIONS #define symversion(base, idx) NULL @@ -153,156 +155,184 @@ extern const unsigned long __start___kcrctab_unused_gpl[]; #define symversion(base, idx) ((base != NULL) ? 
((base) + (idx)) : NULL) #endif -/* lookup symbol in given range of kernel_symbols */ -static const struct kernel_symbol *lookup_symbol(const char *name, - const struct kernel_symbol *start, - const struct kernel_symbol *stop) -{ - const struct kernel_symbol *ks = start; - for (; ks < stop; ks++) - if (strcmp(ks->name, name) == 0) - return ks; - return NULL; -} - -static bool always_ok(bool gplok, bool warn, const char *name) -{ - return true; -} - -static bool printk_unused_warning(bool gplok, bool warn, const char *name) -{ - if (warn) { - printk(KERN_WARNING "Symbol %s is marked as UNUSED, " - "however this module is using it.\n", name); - printk(KERN_WARNING - "This symbol will go away in the future.\n"); - printk(KERN_WARNING - "Please evalute if this is the right api to use and if " - "it really is, submit a report the linux kernel " - "mailinglist together with submitting your code for " - "inclusion.\n"); - } - return true; -} - -static bool gpl_only_unused_warning(bool gplok, bool warn, const char *name) -{ - if (!gplok) - return false; - return printk_unused_warning(gplok, warn, name); -} - -static bool gpl_only(bool gplok, bool warn, const char *name) -{ - return gplok; -} - -static bool warn_if_not_gpl(bool gplok, bool warn, const char *name) -{ - if (!gplok && warn) { - printk(KERN_WARNING "Symbol %s is being used " - "by a non-GPL module, which will not " - "be allowed in the future\n", name); - printk(KERN_WARNING "Please see the file " - "Documentation/feature-removal-schedule.txt " - "in the kernel source tree for more details.\n"); - } - return true; -} - struct symsearch { const struct kernel_symbol *start, *stop; const unsigned long *crcs; - bool (*check)(bool gplok, bool warn, const char *name); + enum { + NOT_GPL_ONLY, + GPL_ONLY, + WILL_BE_GPL_ONLY, + } licence; + bool unused; }; -/* Look through this array of symbol tables for a symbol match which - * passes the check function. */ -static const struct kernel_symbol *search_symarrays(const struct symsearch *arr, - unsigned int num, - const char *name, - bool gplok, - bool warn, - const unsigned long **crc) +static bool each_symbol_in_section(const struct symsearch *arr, + unsigned int arrsize, + struct module *owner, + bool (*fn)(const struct symsearch *syms, + struct module *owner, + unsigned int symnum, void *data), + void *data) { - unsigned int i; - const struct kernel_symbol *ks; + unsigned int i, j; - for (i = 0; i < num; i++) { - ks = lookup_symbol(name, arr[i].start, arr[i].stop); - if (!ks || !arr[i].check(gplok, warn, name)) - continue; - - if (crc) - *crc = symversion(arr[i].crcs, ks - arr[i].start); - return ks; + for (j = 0; j < arrsize; j++) { + for (i = 0; i < arr[j].stop - arr[j].start; i++) + if (fn(&arr[j], owner, i, data)) + return true; } - return NULL; + + return false; } -/* Find a symbol, return value, (optional) crc and (optional) module - * which owns it */ -static unsigned long find_symbol(const char *name, - struct module **owner, - const unsigned long **crc, - bool gplok, - bool warn) +/* Returns true as soon as fn returns true, otherwise false. 
*/ +static bool each_symbol(bool (*fn)(const struct symsearch *arr, + struct module *owner, + unsigned int symnum, void *data), + void *data) { struct module *mod; - const struct kernel_symbol *ks; const struct symsearch arr[] = { { __start___ksymtab, __stop___ksymtab, __start___kcrctab, - always_ok }, + NOT_GPL_ONLY, false }, { __start___ksymtab_gpl, __stop___ksymtab_gpl, - __start___kcrctab_gpl, gpl_only }, + __start___kcrctab_gpl, + GPL_ONLY, false }, { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future, - __start___kcrctab_gpl_future, warn_if_not_gpl }, + __start___kcrctab_gpl_future, + WILL_BE_GPL_ONLY, false }, +#ifdef CONFIG_UNUSED_SYMBOLS { __start___ksymtab_unused, __stop___ksymtab_unused, - __start___kcrctab_unused, printk_unused_warning }, + __start___kcrctab_unused, + NOT_GPL_ONLY, true }, { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl, - __start___kcrctab_unused_gpl, gpl_only_unused_warning }, + __start___kcrctab_unused_gpl, + GPL_ONLY, true }, +#endif }; - /* Core kernel first. */ - ks = search_symarrays(arr, ARRAY_SIZE(arr), name, gplok, warn, crc); - if (ks) { - if (owner) - *owner = NULL; - return ks->value; - } + if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data)) + return true; - /* Now try modules. */ list_for_each_entry(mod, &modules, list) { struct symsearch arr[] = { { mod->syms, mod->syms + mod->num_syms, mod->crcs, - always_ok }, + NOT_GPL_ONLY, false }, { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms, - mod->gpl_crcs, gpl_only }, + mod->gpl_crcs, + GPL_ONLY, false }, { mod->gpl_future_syms, mod->gpl_future_syms + mod->num_gpl_future_syms, - mod->gpl_future_crcs, warn_if_not_gpl }, + mod->gpl_future_crcs, + WILL_BE_GPL_ONLY, false }, +#ifdef CONFIG_UNUSED_SYMBOLS { mod->unused_syms, mod->unused_syms + mod->num_unused_syms, - mod->unused_crcs, printk_unused_warning }, + mod->unused_crcs, + NOT_GPL_ONLY, true }, { mod->unused_gpl_syms, mod->unused_gpl_syms + mod->num_unused_gpl_syms, - mod->unused_gpl_crcs, gpl_only_unused_warning }, + mod->unused_gpl_crcs, + GPL_ONLY, true }, +#endif }; - ks = search_symarrays(arr, ARRAY_SIZE(arr), - name, gplok, warn, crc); - if (ks) { - if (owner) - *owner = mod; - return ks->value; + if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data)) + return true; + } + return false; +} + +struct find_symbol_arg { + /* Input */ + const char *name; + bool gplok; + bool warn; + + /* Output */ + struct module *owner; + const unsigned long *crc; + unsigned long value; +}; + +static bool find_symbol_in_section(const struct symsearch *syms, + struct module *owner, + unsigned int symnum, void *data) +{ + struct find_symbol_arg *fsa = data; + + if (strcmp(syms->start[symnum].name, fsa->name) != 0) + return false; + + if (!fsa->gplok) { + if (syms->licence == GPL_ONLY) + return false; + if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) { + printk(KERN_WARNING "Symbol %s is being used " + "by a non-GPL module, which will not " + "be allowed in the future\n", fsa->name); + printk(KERN_WARNING "Please see the file " + "Documentation/feature-removal-schedule.txt " + "in the kernel source tree for more details.\n"); } } +#ifdef CONFIG_UNUSED_SYMBOLS + if (syms->unused && fsa->warn) { + printk(KERN_WARNING "Symbol %s is marked as UNUSED, " + "however this module is using it.\n", fsa->name); + printk(KERN_WARNING + "This symbol will go away in the future.\n"); + printk(KERN_WARNING + "Please evalute if this is the right api to use and if " + "it really is, submit a report the linux kernel " + "mailinglist 
together with submitting your code for " + "inclusion.\n"); + } +#endif + + fsa->owner = owner; + fsa->crc = symversion(syms->crcs, symnum); + fsa->value = syms->start[symnum].value; + return true; +} + +/* Find a symbol, return value, (optional) crc and (optional) module + * which owns it */ +static unsigned long find_symbol(const char *name, + struct module **owner, + const unsigned long **crc, + bool gplok, + bool warn) +{ + struct find_symbol_arg fsa; + + fsa.name = name; + fsa.gplok = gplok; + fsa.warn = warn; + + if (each_symbol(find_symbol_in_section, &fsa)) { + *owner = fsa.owner; + *crc = fsa.crc; + return fsa.value; + } + DEBUGP("Failed to find symbol %s\n", name); return -ENOENT; } +/* lookup symbol in given range of kernel_symbols */ +static const struct kernel_symbol *lookup_symbol(const char *name, + const struct kernel_symbol *start, + const struct kernel_symbol *stop) +{ + const struct kernel_symbol *ks = start; + for (; ks < stop; ks++) + if (strcmp(ks->name, name) == 0) + return ks; + return NULL; +} + /* Search for module by name: must hold module_mutex. */ static struct module *find_module(const char *name) { @@ -1446,8 +1476,10 @@ static int verify_export_symbols(struct module *mod) { mod->syms, mod->num_syms }, { mod->gpl_syms, mod->num_gpl_syms }, { mod->gpl_future_syms, mod->num_gpl_future_syms }, +#ifdef CONFIG_UNUSED_SYMBOLS { mod->unused_syms, mod->num_unused_syms }, { mod->unused_gpl_syms, mod->num_unused_gpl_syms }, +#endif }; for (i = 0; i < ARRAY_SIZE(arr); i++) { @@ -1527,7 +1559,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs, } /* Update size with this section: return offset. */ -static long get_offset(unsigned long *size, Elf_Shdr *sechdr) +static long get_offset(unsigned int *size, Elf_Shdr *sechdr) { long ret; @@ -1765,10 +1797,12 @@ static struct module *load_module(void __user *umod, unsigned int gplfutureindex; unsigned int gplfuturecrcindex; unsigned int unwindex = 0; +#ifdef CONFIG_UNUSED_SYMBOLS unsigned int unusedindex; unsigned int unusedcrcindex; unsigned int unusedgplindex; unsigned int unusedgplcrcindex; +#endif unsigned int markersindex; unsigned int markersstringsindex; struct module *mod; @@ -1851,13 +1885,15 @@ static struct module *load_module(void __user *umod, exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); - unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused"); - unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl"); crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); +#ifdef CONFIG_UNUSED_SYMBOLS + unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused"); + unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl"); unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused"); unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl"); +#endif setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); @@ -2019,14 +2055,15 @@ static struct module *load_module(void __user *umod, mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / 
sizeof(*mod->gpl_future_syms); - mod->num_unused_syms = sechdrs[unusedindex].sh_size / - sizeof(*mod->unused_syms); - mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size / - sizeof(*mod->unused_gpl_syms); mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; if (gplfuturecrcindex) mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; +#ifdef CONFIG_UNUSED_SYMBOLS + mod->num_unused_syms = sechdrs[unusedindex].sh_size / + sizeof(*mod->unused_syms); + mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size / + sizeof(*mod->unused_gpl_syms); mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr; if (unusedcrcindex) mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; @@ -2034,13 +2071,17 @@ static struct module *load_module(void __user *umod, if (unusedgplcrcindex) mod->unused_gpl_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr; +#endif #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !crcindex) || (mod->num_gpl_syms && !gplcrcindex) || (mod->num_gpl_future_syms && !gplfuturecrcindex) || +#ifdef CONFIG_UNUSED_SYMBOLS (mod->num_unused_syms && !unusedcrcindex) || - (mod->num_unused_gpl_syms && !unusedgplcrcindex)) { + (mod->num_unused_gpl_syms && !unusedgplcrcindex) +#endif + ) { printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name); err = try_to_force_load(mod, "nocrc"); if (err) @@ -2513,7 +2554,7 @@ static int m_show(struct seq_file *m, void *p) struct module *mod = list_entry(p, struct module, list); char buf[8]; - seq_printf(m, "%s %lu", + seq_printf(m, "%s %u", mod->name, mod->init_size + mod->core_size); print_unload_info(m, mod); diff --git a/kernel/mutex.c b/kernel/mutex.c index bcdc9ac8ef60..7502055dc6d0 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -370,8 +370,8 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) * Try to acquire the mutex atomically. Returns 1 if the mutex * has been acquired successfully, and 0 on contention. * - * NOTE: this function follows the spin_trylock() convention, so - * it is negated to the down_trylock() return values! Be careful + * NOTE: this function follows the spin_trylock()/down_try() convention, + * so it is negated to the old down_trylock() return values! Be careful * about this when converting semaphore users to mutexes. * * This function must not be used in interrupt context. The diff --git a/kernel/printk.c b/kernel/printk.c index 07ad9e7f7a66..74271278fc5c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -969,7 +969,7 @@ EXPORT_SYMBOL(acquire_console_sem); int try_acquire_console_sem(void) { - if (down_trylock(&console_sem)) + if (!down_try(&console_sem)) return -1; console_locked = 1; console_may_schedule = 0; @@ -1068,7 +1068,7 @@ void console_unblank(void) * oops_in_progress is set to 1.. */ if (oops_in_progress) { - if (down_trylock(&console_sem) != 0) + if (!down_try(&console_sem)) return; } else acquire_console_sem(); diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 1a064adab658..bbab232ee185 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -14,7 +14,7 @@ * Some notes on the implementation: * * The spinlock controls access to the other members of the semaphore. - * down_trylock() and up() can be called from interrupt context, so we + * down_try() and up() can be called from interrupt context, so we * have to disable interrupts when taking the lock. 
It turns out various * parts of the kernel expect to be able to use down() on a semaphore in * interrupt context when they know it will succeed, so we have to use @@ -116,19 +116,18 @@ int down_killable(struct semaphore *sem) EXPORT_SYMBOL(down_killable); /** - * down_trylock - try to acquire the semaphore, without waiting + * down_try - try to acquire the semaphore, without waiting * @sem: the semaphore to be acquired * - * Try to acquire the semaphore atomically. Returns 0 if the mutex has - * been acquired successfully or 1 if it it cannot be acquired. + * Try to acquire the semaphore atomically. Returns true if the mutex has + * been acquired successfully or 0 if it it cannot be acquired. * - * NOTE: This return value is inverted from both spin_trylock and - * mutex_trylock! Be careful about this when converting code. + * NOTE: This replaces down_trylock() which returned the reverse. * * Unlike mutex_trylock, this function can be used from interrupt context, * and the semaphore can be released by any task or interrupt. */ -int down_trylock(struct semaphore *sem) +int down_try(struct semaphore *sem) { unsigned long flags; int count; @@ -139,9 +138,9 @@ int down_trylock(struct semaphore *sem) sem->count = count; spin_unlock_irqrestore(&sem->lock, flags); - return (count < 0); + return (count >= 0); } -EXPORT_SYMBOL(down_trylock); +EXPORT_SYMBOL(down_try); /** * down_timeout - acquire the semaphore within a specified time diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index ba9b2054ecbd..7a785a33e12e 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -13,203 +13,171 @@ #include <asm/atomic.h> #include <asm/uaccess.h> -/* Since we effect priority and affinity (both of which are visible - * to, and settable by outside processes) we do indirection via a - * kthread. */ - -/* Thread to stop each CPU in user context. */ +/* This controls the threads on each CPU. */ enum stopmachine_state { - STOPMACHINE_WAIT, - STOPMACHINE_PREPARE, + /* Dummy starting state for thread. */ + STOPMACHINE_NONE, + /* Disable interrupts. */ STOPMACHINE_DISABLE_IRQ, + /* Run the function */ + STOPMACHINE_RUN, + /* Exit */ STOPMACHINE_EXIT, + /* Everyone exited. */ + STOPMACHINE_COMPLETE, }; +static enum stopmachine_state state; -static enum stopmachine_state stopmachine_state; -static unsigned int stopmachine_num_threads; -static atomic_t stopmachine_thread_ack; +struct stop_machine_data { + int (*fn)(void *); + void *data; + int fnret; +}; + +/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ +static unsigned int num_threads; +static atomic_t thread_ack; +static struct completion finished; -static int stopmachine(void *cpu) +static void set_state(enum stopmachine_state newstate) { - int irqs_disabled = 0; - int prepared = 0; + /* Reset ack counter. */ + atomic_set(&thread_ack, num_threads); + smp_wmb(); + state = newstate; +} - set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu)); +/* Last one to ack a state moves to the next state. */ +static void ack_state(void) +{ + if (atomic_dec_and_test(&thread_ack)) { + set_state(state + 1); + if (state == STOPMACHINE_COMPLETE) + complete(&finished); + } +} - /* Ack: we are alive */ - smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ - atomic_inc(&stopmachine_thread_ack); +/* This is the actual thread which stops the CPU. It exits by itself rather + * than waiting for kthread_stop(), because it's easier for hotplug CPU. 
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index ba9b2054ecbd..7a785a33e12e 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -13,203 +13,171 @@
 #include <asm/atomic.h>
 #include <asm/uaccess.h>
 
-/* Since we effect priority and affinity (both of which are visible
- * to, and settable by outside processes) we do indirection via a
- * kthread. */
-
-/* Thread to stop each CPU in user context. */
+/* This controls the threads on each CPU. */
 enum stopmachine_state {
-	STOPMACHINE_WAIT,
-	STOPMACHINE_PREPARE,
+	/* Dummy starting state for thread. */
+	STOPMACHINE_NONE,
+	/* Disable interrupts. */
 	STOPMACHINE_DISABLE_IRQ,
+	/* Run the function */
+	STOPMACHINE_RUN,
+	/* Exit */
 	STOPMACHINE_EXIT,
+	/* Everyone exited. */
+	STOPMACHINE_COMPLETE,
 };
+static enum stopmachine_state state;
 
-static enum stopmachine_state stopmachine_state;
-static unsigned int stopmachine_num_threads;
-static atomic_t stopmachine_thread_ack;
+struct stop_machine_data {
+	int (*fn)(void *);
+	void *data;
+	int fnret;
+};
+
+/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
+static unsigned int num_threads;
+static atomic_t thread_ack;
+static struct completion finished;
 
-static int stopmachine(void *cpu)
+static void set_state(enum stopmachine_state newstate)
 {
-	int irqs_disabled = 0;
-	int prepared = 0;
+	/* Reset ack counter. */
+	atomic_set(&thread_ack, num_threads);
+	smp_wmb();
+	state = newstate;
+}
 
-	set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu));
+/* Last one to ack a state moves to the next state. */
+static void ack_state(void)
+{
+	if (atomic_dec_and_test(&thread_ack)) {
+		set_state(state + 1);
+		if (state == STOPMACHINE_COMPLETE)
+			complete(&finished);
+	}
+}
 
-	/* Ack: we are alive */
-	smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
-	atomic_inc(&stopmachine_thread_ack);
+/* This is the actual thread which stops the CPU.  It exits by itself rather
+ * than waiting for kthread_stop(), because it's easier for hotplug CPU. */
+static int stop_cpu(struct stop_machine_data *smdata)
+{
+	enum stopmachine_state curstate = STOPMACHINE_NONE;
+	int uninitialized_var(ret);
 
 	/* Simple state machine */
-	while (stopmachine_state != STOPMACHINE_EXIT) {
-		if (stopmachine_state == STOPMACHINE_DISABLE_IRQ
-		    && !irqs_disabled) {
-			local_irq_disable();
-			hard_irq_disable();
-			irqs_disabled = 1;
-			/* Ack: irqs disabled. */
-			smp_mb(); /* Must read state first. */
-			atomic_inc(&stopmachine_thread_ack);
-		} else if (stopmachine_state == STOPMACHINE_PREPARE
-			   && !prepared) {
-			/* Everyone is in place, hold CPU. */
-			preempt_disable();
-			prepared = 1;
-			smp_mb(); /* Must read state first. */
-			atomic_inc(&stopmachine_thread_ack);
-		}
-		/* Yield in first stage: migration threads need to
-		 * help our sisters onto their CPUs. */
-		if (!prepared && !irqs_disabled)
-			yield();
+	do {
+		/* Chill out and ensure we re-read stopmachine_state. */
 		cpu_relax();
-	}
-
-	/* Ack: we are exiting. */
-	smp_mb(); /* Must read state first. */
-	atomic_inc(&stopmachine_thread_ack);
-
-	if (irqs_disabled)
-		local_irq_enable();
-	if (prepared)
-		preempt_enable();
+		if (state != curstate) {
+			curstate = state;
+			switch (curstate) {
+			case STOPMACHINE_DISABLE_IRQ:
+				local_irq_disable();
+				hard_irq_disable();
+				break;
+			case STOPMACHINE_RUN:
+				/* |= allows error detection if functions on
+				 * multiple CPUs. */
+				smdata->fnret |= smdata->fn(smdata->data);
+				break;
+			default:
+				break;
+			}
+			ack_state();
+		}
+	} while (curstate < STOPMACHINE_EXIT);
 
-	return 0;
+	local_irq_enable();
+	do_exit(0);
 }
 
-/* Change the thread state */
-static void stopmachine_set_state(enum stopmachine_state state)
+/* Callback for CPUs which aren't supposed to do anything. */
+static int chill(void *unused)
 {
-	atomic_set(&stopmachine_thread_ack, 0);
-	smp_wmb();
-	stopmachine_state = state;
-	while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
-		cpu_relax();
+	return 0;
 }
 
-static int stop_machine(void)
+int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
 {
-	int i, ret = 0;
-
-	atomic_set(&stopmachine_thread_ack, 0);
-	stopmachine_num_threads = 0;
-	stopmachine_state = STOPMACHINE_WAIT;
-
-	for_each_online_cpu(i) {
-		if (i == raw_smp_processor_id())
-			continue;
-		ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
-		if (ret < 0)
-			break;
-		stopmachine_num_threads++;
-	}
-
-	/* Wait for them all to come to life. */
-	while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) {
-		yield();
-		cpu_relax();
-	}
-
-	/* If some failed, kill them all. */
-	if (ret < 0) {
-		stopmachine_set_state(STOPMACHINE_EXIT);
-		return ret;
-	}
+	int i, err;
+	struct stop_machine_data active, idle;
+	struct task_struct **threads;
 
-	/* Now they are all started, make them hold the CPUs, ready. */
-	preempt_disable();
-	stopmachine_set_state(STOPMACHINE_PREPARE);
+	active.fn = fn;
+	active.data = data;
+	active.fnret = 0;
+	idle.fn = chill;
+	idle.data = NULL;
 
-	/* Make them disable irqs. */
-	local_irq_disable();
-	hard_irq_disable();
-	stopmachine_set_state(STOPMACHINE_DISABLE_IRQ);
+	/* If they don't care which cpu fn runs on, just pick one. */
+	if (cpu == NR_CPUS)
+		cpu = any_online_cpu(cpu_online_map);
 
-	return 0;
-}
+	/* This could be too big for stack on large machines. */
+	threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL);
+	if (!threads)
+		return -ENOMEM;
 
-static void restart_machine(void)
-{
-	stopmachine_set_state(STOPMACHINE_EXIT);
-	local_irq_enable();
-	preempt_enable_no_resched();
-}
+	/* Set up initial state. */
+	init_completion(&finished);
+	num_threads = num_online_cpus();
+	set_state(STOPMACHINE_DISABLE_IRQ);
 
-struct stop_machine_data {
-	int (*fn)(void *);
-	void *data;
-	struct completion done;
-};
+	for_each_online_cpu(i) {
+		struct stop_machine_data *smdata;
+		struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 
-static int do_stop(void *_smdata)
-{
-	struct stop_machine_data *smdata = _smdata;
-	int ret;
+		if (cpu == ALL_CPUS || i == cpu)
+			smdata = &active;
+		else
+			smdata = &idle;
 
-	ret = stop_machine();
-	if (ret == 0) {
-		ret = smdata->fn(smdata->data);
-		restart_machine();
-	}
+		threads[i] = kthread_create(stop_cpu, smdata, "kstop%u", i);
+		if (IS_ERR(threads[i])) {
+			err = PTR_ERR(threads[i]);
+			threads[i] = NULL;
+			goto kill_threads;
+		}
 
-	/* We're done: you can kthread_stop us now */
-	complete(&smdata->done);
+		/* Place it onto correct cpu. */
+		kthread_bind(threads[i], i);
 
-	/* Wait for kthread_stop */
-	set_current_state(TASK_INTERRUPTIBLE);
-	while (!kthread_should_stop()) {
-		schedule();
-		set_current_state(TASK_INTERRUPTIBLE);
+		/* Make it highest prio. */
+		if (sched_setscheduler_nocheck(threads[i], SCHED_FIFO, &param))
+			BUG();
 	}
-	__set_current_state(TASK_RUNNING);
-	return ret;
-}
-
-struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
-				       unsigned int cpu)
-{
-	static DEFINE_MUTEX(stopmachine_mutex);
-	struct stop_machine_data smdata;
-	struct task_struct *p;
-	smdata.fn = fn;
-	smdata.data = data;
-	init_completion(&smdata.done);
+	/* We've created all the threads.  Wake them all: hold this CPU so one
+	 * doesn't hit this CPU until we're ready. */
+	cpu = get_cpu();
+	for_each_online_cpu(i)
+		wake_up_process(threads[i]);
 
-	mutex_lock(&stopmachine_mutex);
+	/* This will release the thread on our CPU. */
+	put_cpu();
+	wait_for_completion(&finished);
 
-	/* If they don't care which CPU fn runs on, bind to any online one. */
-	if (cpu == NR_CPUS)
-		cpu = raw_smp_processor_id();
+	kfree(threads);
 
-	p = kthread_create(do_stop, &smdata, "kstopmachine");
-	if (!IS_ERR(p)) {
-		struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+	return active.fnret;
 
-		/* One high-prio thread per cpu.  We'll do this one. */
-		sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
-		kthread_bind(p, cpu);
-		wake_up_process(p);
-		wait_for_completion(&smdata.done);
-	}
-	mutex_unlock(&stopmachine_mutex);
-	return p;
+kill_threads:
+	for_each_online_cpu(i)
+		if (threads[i])
+			kthread_stop(threads[i]);
+	kfree(threads);
+	return err;
 }
 
-int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
+int stop_machine_run_notype(int (*fn)(void *), void *data, unsigned int cpu)
 {
-	struct task_struct *p;
 	int ret;
 
 	/* No CPUs can come up or down during this. */
 	get_online_cpus();
-	p = __stop_machine_run(fn, data, cpu);
-	if (!IS_ERR(p))
-		ret = kthread_stop(p);
-	else
-		ret = PTR_ERR(p);
+	ret = __stop_machine_run(fn, data, cpu);
 	put_online_cpus();
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(stop_machine_run);
+EXPORT_SYMBOL_GPL(stop_machine_run_notype);
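For callers, the rewritten stop_machine keeps the old contract: fn runs on the requested CPU (or on any online CPU when NR_CPUS is passed) while every other online CPU spins in chill() with interrupts disabled, and fn's return value comes back through active.fnret. A rough usage sketch under that assumption (bump_counter() and example_caller() are made-up names, not part of this patch):

	#include <linux/stop_machine.h>

	/* Runs with the whole machine stopped: no other CPU is executing. */
	static int bump_counter(void *data)
	{
		(*(int *)data)++;
		return 0;
	}

	static int example_caller(void)
	{
		int counter = 0;

		/* NR_CPUS means "don't care which CPU runs bump_counter". */
		return stop_machine_run_notype(bump_counter, &counter, NR_CPUS);
	}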