author    Stephen Rothwell <sfr@canb.auug.org.au>  2008-07-23 15:01:17 +1000
committer Stephen Rothwell <sfr@canb.auug.org.au>  2008-07-23 15:01:17 +1000
commit 6e2dd0866eac9639373e643f6e4f16b0a3834058 (patch)
tree 8df9171cdd3b1a5e015f0c2fe9edcd7411773bba
parent 7ab8a72be33fa10596231c5a7f9b6e2933e3b7c9 (diff)
parent 6239578a723b041bb047e891ac8b22475d163d96 (diff)

Merge branch 'quilt/rr'

Conflicts:
	kernel/stop_machine.c
-rw-r--r--  Documentation/DocBook/kernel-locking.tmpl |  57
-rw-r--r--  Documentation/lguest/lguest.c             | 541
-rw-r--r--  arch/ia64/kernel/salinfo.c                |   4
-rw-r--r--  arch/s390/Kconfig                         |   1
-rw-r--r--  arch/s390/kernel/kprobes.c                |   6
-rw-r--r--  arch/s390/kernel/setup.c                  |   4
-rw-r--r--  arch/x86/lguest/boot.c                    |   3
-rw-r--r--  drivers/block/virtio_blk.c                |  10
-rw-r--r--  drivers/char/Kconfig                      |  11
-rw-r--r--  drivers/char/Makefile                     |   1
-rw-r--r--  drivers/char/hvc_console.c                |  85
-rw-r--r--  drivers/char/hvc_console.h                |  35
-rw-r--r--  drivers/char/hvc_irq.c                    |  45
-rw-r--r--  drivers/char/hvc_iseries.c                |   2
-rw-r--r--  drivers/char/hvc_vio.c                    |   2
-rw-r--r--  drivers/char/hvc_xen.c                    |   2
-rw-r--r--  drivers/char/hw_random/intel-rng.c        |   6
-rw-r--r--  drivers/char/snsc.c                       |   4
-rw-r--r--  drivers/char/viotape.c                    |   4
-rw-r--r--  drivers/char/virtio_console.c             |  40
-rw-r--r--  drivers/infiniband/core/user_mad.c        |   2
-rw-r--r--  drivers/input/serio/hil_mlc.c             |   4
-rw-r--r--  drivers/input/serio/hp_sdc_mlc.c          |  14
-rw-r--r--  drivers/lguest/core.c                     |   1
-rw-r--r--  drivers/lguest/interrupts_and_traps.c     |  24
-rw-r--r--  drivers/lguest/lguest_device.c            |  14
-rw-r--r--  drivers/lguest/x86/core.c                 |   4
-rw-r--r--  drivers/md/dm-raid1.c                     |   2
-rw-r--r--  drivers/net/3c527.c                       |   2
-rw-r--r--  drivers/net/irda/sir_dev.c                |   2
-rw-r--r--  drivers/net/virtio_net.c                  | 114
-rw-r--r--  drivers/net/wireless/airo.c               |  12
-rw-r--r--  drivers/s390/kvm/kvm_virtio.c             |  34
-rw-r--r--  drivers/scsi/aacraid/commsup.c            |   2
-rw-r--r--  drivers/usb/core/usb.c                    |   2
-rw-r--r--  drivers/usb/gadget/inode.c                |   2
-rw-r--r--  drivers/virtio/virtio.c                   |  26
-rw-r--r--  drivers/virtio/virtio_pci.c               |  13
-rw-r--r--  drivers/virtio/virtio_ring.c              |  41
-rw-r--r--  fs/ocfs2/inode.c                          |   4
-rw-r--r--  fs/reiserfs/journal.c                     |   2
-rw-r--r--  fs/xfs/linux-2.6/sema.h                   |   8
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c                |   4
-rw-r--r--  include/asm-s390/kvm_virtio.h             |  10
-rw-r--r--  include/linux/compiler-gcc.h              |  18
-rw-r--r--  include/linux/compiler-intel.h            |   2
-rw-r--r--  include/linux/kernel.h                    |  35
-rw-r--r--  include/linux/kthread.h                   |  29
-rw-r--r--  include/linux/mutex.h                     |   4
-rw-r--r--  include/linux/semaphore.h                 |   8
-rw-r--r--  include/linux/stop_machine.h              |  57
-rw-r--r--  include/linux/timer.h                     |  32
-rw-r--r--  include/linux/usb.h                       |   2
-rw-r--r--  include/linux/virtio_9p.h                 |   2
-rw-r--r--  include/linux/virtio_balloon.h            |   2
-rw-r--r--  include/linux/virtio_blk.h                |   5
-rw-r--r--  include/linux/virtio_config.h             |  16
-rw-r--r--  include/linux/virtio_console.h            |   2
-rw-r--r--  include/linux/virtio_net.h                |   2
-rw-r--r--  include/linux/virtio_pci.h                |   5
-rw-r--r--  include/linux/virtio_ring.h               |  14
-rw-r--r--  include/linux/virtio_rng.h                |   2
-rw-r--r--  kernel/cpu.c                              |  16
-rw-r--r--  kernel/kthread.c                          |  29
-rw-r--r--  kernel/module.c                           |   8
-rw-r--r--  kernel/mutex.c                            |   4
-rw-r--r--  kernel/printk.c                           |   4
-rw-r--r--  kernel/rcuclassic.c                       |   4
-rw-r--r--  kernel/semaphore.c                        |  17
-rw-r--r--  kernel/stop_machine.c                     | 332
-rw-r--r--  kernel/sysctl.c                           |  12
-rw-r--r--  mm/page_alloc.c                           |   4
72 files changed, 1274 insertions(+), 599 deletions(-)
diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl
index 2510763295d0..084f6ad7b7a0 100644
--- a/Documentation/DocBook/kernel-locking.tmpl
+++ b/Documentation/DocBook/kernel-locking.tmpl
@@ -219,10 +219,10 @@
</para>
<sect1 id="lock-intro">
- <title>Three Main Types of Kernel Locks: Spinlocks, Mutexes and Semaphores</title>
+ <title>Two Main Types of Kernel Locks: Spinlocks and Mutexes</title>
<para>
- There are three main types of kernel locks. The fundamental type
+ There are two main types of kernel locks. The fundamental type
is the spinlock
(<filename class="headerfile">include/asm/spinlock.h</filename>),
which is a very simple single-holder lock: if you can't get the
@@ -240,14 +240,6 @@
use a spinlock instead.
</para>
<para>
- The third type is a semaphore
- (<filename class="headerfile">include/linux/semaphore.h</filename>): it
- can have more than one holder at any time (the number decided at
- initialization time), although it is most commonly used as a
- single-holder lock (a mutex). If you can't get a semaphore, your
- task will be suspended and later on woken up - just like for mutexes.
- </para>
- <para>
Neither type of lock is recursive: see
<xref linkend="deadlock"/>.
</para>
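For reference, a minimal sketch of the two primitives the rewritten text contrasts; the lock and data names are hypothetical, not part of this patch:

	static DEFINE_SPINLOCK(counter_lock);	/* holder never sleeps */
	static DEFINE_MUTEX(config_mutex);	/* holder may sleep */
	static int counter;

	void bump_counter(void)
	{
		spin_lock(&counter_lock);
		counter++;			/* short, non-sleeping work */
		spin_unlock(&counter_lock);
	}

	void update_config(void)
	{
		mutex_lock(&config_mutex);
		/* may call functions which sleep while holding the mutex */
		mutex_unlock(&config_mutex);
	}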
@@ -278,7 +270,7 @@
</para>
<para>
- Semaphores still exist, because they are required for
+ Mutexes still exist, because they are required for
synchronization between <firstterm linkend="gloss-usercontext">user
contexts</firstterm>, as we will see below.
</para>
@@ -289,18 +281,17 @@
<para>
If you have a data structure which is only ever accessed from
- user context, then you can use a simple semaphore
- (<filename>linux/linux/semaphore.h</filename>) to protect it. This
- is the most trivial case: you initialize the semaphore to the number
- of resources available (usually 1), and call
- <function>down_interruptible()</function> to grab the semaphore, and
- <function>up()</function> to release it. There is also a
- <function>down()</function>, which should be avoided, because it
+ user context, then you can use a simple mutex
+ (<filename>include/linux/mutex.h</filename>) to protect it. This
+ is the most trivial case: you initialize the mutex. Then you can
+ call <function>mutex_lock_interruptible()</function> to grab the mutex,
+ and <function>mutex_unlock()</function> to release it. There is also a
+ <function>mutex_lock()</function>, which should be avoided, because it
will not return if a signal is received.
</para>
<para>
- Example: <filename>linux/net/core/netfilter.c</filename> allows
+ Example: <filename>net/netfilter/nf_sockopt.c</filename> allows
registration of new <function>setsockopt()</function> and
<function>getsockopt()</function> calls, with
<function>nf_register_sockopt()</function>. Registration and
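The calling pattern the rewritten paragraph describes, sketched with hypothetical names (a registration list guarded by a mutex):

	struct thing {
		struct list_head list;
	};
	static LIST_HEAD(thing_list);
	static DEFINE_MUTEX(reg_mutex);

	int register_thing(struct thing *t)
	{
		/* Returns -EINTR if a signal arrived while we slept. */
		if (mutex_lock_interruptible(&reg_mutex))
			return -EINTR;
		list_add(&t->list, &thing_list);
		mutex_unlock(&reg_mutex);
		return 0;
	}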
@@ -515,7 +506,7 @@
<listitem>
<para>
If you are in a process context (any syscall) and want to
- lock other process out, use a semaphore. You can take a semaphore
+ lock other processes out, use a mutex. You can take a mutex
and sleep (<function>copy_from_user*()</function> or
<function>kmalloc(x,GFP_KERNEL)</function>).
</para>
@@ -662,7 +653,7 @@
<entry>SLBH</entry>
<entry>SLBH</entry>
<entry>SLBH</entry>
-<entry>DI</entry>
+<entry>MLI</entry>
<entry>None</entry>
</row>
@@ -692,8 +683,8 @@
<entry>spin_lock_bh</entry>
</row>
<row>
-<entry>DI</entry>
-<entry>down_interruptible</entry>
+<entry>MLI</entry>
+<entry>mutex_lock_interruptible</entry>
</row>
</tbody>
@@ -1310,7 +1301,7 @@ as Alan Cox says, <quote>Lock data, not code</quote>.
<para>
There is a coding bug where a piece of code tries to grab a
spinlock twice: it will spin forever, waiting for the lock to
- be released (spinlocks, rwlocks and semaphores are not
+ be released (spinlocks, rwlocks and mutexes are not
recursive in Linux). This is trivial to diagnose: not a
stay-up-five-nights-talk-to-fluffy-code-bunnies kind of
problem.
@@ -1335,7 +1326,7 @@ as Alan Cox says, <quote>Lock data, not code</quote>.
<para>
This complete lockup is easy to diagnose: on SMP boxes the
- watchdog timer or compiling with <symbol>DEBUG_SPINLOCKS</symbol> set
+ watchdog timer or compiling with <symbol>DEBUG_SPINLOCK</symbol> set
(<filename>include/linux/spinlock.h</filename>) will show this up
immediately when it happens.
</para>
@@ -1558,7 +1549,7 @@ the amount of locking which needs to be done.
<title>Read/Write Lock Variants</title>
<para>
- Both spinlocks and semaphores have read/write variants:
+ Both spinlocks and mutexes have read/write variants:
<type>rwlock_t</type> and <structname>struct rw_semaphore</structname>.
These divide users into two classes: the readers and the writers. If
you are only reading the data, you can get a read lock, but to write to
@@ -1681,7 +1672,7 @@ the amount of locking which needs to be done.
#include &lt;linux/slab.h&gt;
#include &lt;linux/string.h&gt;
+#include &lt;linux/rcupdate.h&gt;
- #include &lt;linux/semaphore.h&gt;
+ #include &lt;linux/mutex.h&gt;
#include &lt;asm/errno.h&gt;
struct object
@@ -1913,7 +1904,7 @@ machines due to caching.
</listitem>
<listitem>
<para>
- <function> put_user()</function>
+ <function>put_user()</function>
</para>
</listitem>
</itemizedlist>
@@ -1927,13 +1918,13 @@ machines due to caching.
<listitem>
<para>
- <function>down_interruptible()</function> and
- <function>down()</function>
+ <function>mutex_lock_interruptible()</function> and
+ <function>mutex_lock()</function>
</para>
<para>
- There is a <function>down_trylock()</function> which can be
+ There is a <function>mutex_trylock()</function> which can be
used inside interrupt context, as it will not sleep.
- <function>up()</function> will also never sleep.
+ <function>mutex_unlock()</function> will also never sleep.
</para>
</listitem>
</itemizedlist>
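A sketch of the non-blocking attempt the rewritten text mentions (hypothetical lock name; mutex_trylock() returns 1 when it acquires the lock, 0 when someone else holds it):

	if (mutex_trylock(&config_mutex)) {
		/* Got the lock without sleeping; do the work. */
		mutex_unlock(&config_mutex);
	} else {
		/* Lock is contended: defer or retry later. */
	}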
@@ -2023,7 +2014,7 @@ machines due to caching.
<para>
Prior to 2.5, or when <symbol>CONFIG_PREEMPT</symbol> is
unset, processes in user context inside the kernel would not
- preempt each other (ie. you had that CPU until you have it up,
+ preempt each other (ie. you had that CPU until you gave it up,
except for interrupts). With the addition of
<symbol>CONFIG_PREEMPT</symbol> in 2.5.4, this changed: when
in user context, higher priority tasks can "cut in": spinlocks
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 82fafe0429fe..a8937b348ee7 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -36,11 +36,13 @@
#include <sched.h>
#include <limits.h>
#include <stddef.h>
+#include <signal.h>
#include "linux/lguest_launcher.h"
#include "linux/virtio_config.h"
#include "linux/virtio_net.h"
#include "linux/virtio_blk.h"
#include "linux/virtio_console.h"
+#include "linux/virtio_rng.h"
#include "linux/virtio_ring.h"
#include "asm-x86/bootparam.h"
/*L:110 We can ignore the 39 include files we need for this program, but I do
@@ -64,8 +66,8 @@ typedef uint8_t u8;
#endif
/* We can have up to 256 pages for devices. */
#define DEVICE_PAGES 256
-/* This will occupy 2 pages: it must be a power of 2. */
-#define VIRTQUEUE_NUM 128
+/* This will occupy 3 pages: it must be a power of 2. */
+#define VIRTQUEUE_NUM 256
/*L:120 verbose is both a global flag and a macro. The C preprocessor allows
* this, and although I wouldn't recommend it, it works quite nicely here. */
@@ -74,12 +76,19 @@ static bool verbose;
do { if (verbose) printf(args); } while(0)
/*:*/
-/* The pipe to send commands to the waker process */
-static int waker_fd;
+/* File descriptors for the Waker. */
+struct {
+ int pipe[2];
+ int lguest_fd;
+} waker_fds;
+
/* The pointer to the start of guest memory. */
static void *guest_base;
/* The maximum guest physical address allowed, and maximum possible. */
static unsigned long guest_limit, guest_max;
+/* The pipe for the signal handler to write to. */
+static int timeoutpipe[2];
+static unsigned int timeout_usec = 500;
/* a per-cpu variable indicating whose vcpu is currently running */
static unsigned int __thread cpu_id;
@@ -152,16 +161,18 @@ struct virtqueue
/* The actual ring of buffers. */
struct vring vring;
- /* Last available index we saw. */
- u16 last_avail_idx;
-
- /* The routine to call when the Guest pings us. */
- void (*handle_output)(int fd, struct virtqueue *me);
+ /* The routine to call when the Guest pings us, or timeout. */
+ void (*handle_output)(int fd, struct virtqueue *me, bool timeout);
/* Outstanding buffers */
unsigned int inflight;
+
+ /* Is this blocked awaiting a timer? */
+ bool blocked;
};
+static unsigned int net_xmit_notify, net_recv_notify, net_timeout;
+
/* Remember the arguments to the program so we can "reboot" */
static char **main_args;
@@ -190,6 +201,9 @@ static void *_convert(struct iovec *iov, size_t size, size_t align,
return iov->iov_base;
}
+/* Wrapper for the last available index. Makes it easier to change. */
+#define lg_last_avail(vq) vring_last_avail(&(vq)->vring)
+
/* The virtio configuration space is defined to be little-endian. x86 is
* little-endian too, but it's nice to be explicit so we have these helpers. */
#define cpu_to_le16(v16) (v16)
@@ -199,6 +213,33 @@ static void *_convert(struct iovec *iov, size_t size, size_t align,
#define le32_to_cpu(v32) (v32)
#define le64_to_cpu(v64) (v64)
+/* Is this iovec empty? */
+static bool iov_empty(const struct iovec iov[], unsigned int num_iov)
+{
+ unsigned int i;
+
+ for (i = 0; i < num_iov; i++)
+ if (iov[i].iov_len)
+ return false;
+ return true;
+}
+
+/* Take len bytes from the front of this iovec. */
+static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len)
+{
+ unsigned int i;
+
+ for (i = 0; i < num_iov; i++) {
+ unsigned int used;
+
+ used = iov[i].iov_len < len ? iov[i].iov_len : len;
+ iov[i].iov_base += used;
+ iov[i].iov_len -= used;
+ len -= used;
+ }
+ assert(len == 0);
+}
+
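These two helpers are used together later in this patch (see handle_rng_input()); a hypothetical read loop shows the intended pattern:

	/* Keep reading until every byte of the iovec is filled. */
	while (!iov_empty(iov, in_num)) {
		len = readv(fd, iov, in_num);
		if (len <= 0)
			err(1, "reading into iovec");
		iov_consume(iov, in_num, len);
	}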
/* The device virtqueue descriptors are followed by feature bitmasks. */
static u8 *get_feature_bits(struct device *dev)
{
@@ -254,6 +295,7 @@ static void *map_zeroed_pages(unsigned int num)
PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0);
if (addr == MAP_FAILED)
err(1, "Mmaping %u pages of /dev/zero", num);
+ close(fd);
return addr;
}
@@ -540,69 +582,64 @@ static void add_device_fd(int fd)
* watch, but handing a file descriptor mask through to the kernel is fairly
* icky.
*
- * Instead, we fork off a process which watches the file descriptors and writes
+ * Instead, we clone off a thread which watches the file descriptors and writes
* the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host to
* stop running the Guest. This causes the Launcher to return from the
* /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset
* the LHREQ_BREAK and wake us up again.
*
* This, of course, is merely a different *kind* of icky.
+ *
+ * Given my well-known antipathy to threads, I'd prefer to use processes. But
+ * it's easier to share Guest memory with threads, and trivial to share the
+ * devices.infds as the Launcher changes it.
*/
-static void wake_parent(int pipefd, int lguest_fd)
+static int waker(void *unused)
{
- /* Add the pipe from the Launcher to the fdset in the device_list, so
- * we watch it, too. */
- add_device_fd(pipefd);
+ /* Close the write end of the pipe: only the Launcher has it open. */
+ close(waker_fds.pipe[1]);
for (;;) {
fd_set rfds = devices.infds;
unsigned long args[] = { LHREQ_BREAK, 1 };
+ unsigned int maxfd = devices.max_infd;
+
+ /* We also listen to the pipe from the Launcher. */
+ FD_SET(waker_fds.pipe[0], &rfds);
+ if (waker_fds.pipe[0] > maxfd)
+ maxfd = waker_fds.pipe[0];
/* Wait until input is ready from one of the devices. */
- select(devices.max_infd+1, &rfds, NULL, NULL, NULL);
- /* Is it a message from the Launcher? */
- if (FD_ISSET(pipefd, &rfds)) {
- int fd;
- /* If read() returns 0, it means the Launcher has
- * exited. We silently follow. */
- if (read(pipefd, &fd, sizeof(fd)) == 0)
- exit(0);
- /* Otherwise it's telling us to change what file
- * descriptors we're to listen to. Positive means
- * listen to a new one, negative means stop
- * listening. */
- if (fd >= 0)
- FD_SET(fd, &devices.infds);
- else
- FD_CLR(-fd - 1, &devices.infds);
- } else /* Send LHREQ_BREAK command. */
- pwrite(lguest_fd, args, sizeof(args), cpu_id);
+ select(maxfd+1, &rfds, NULL, NULL, NULL);
+
+ /* Message from Launcher? */
+ if (FD_ISSET(waker_fds.pipe[0], &rfds)) {
+ char c;
+ /* If this fails, then assume Launcher has exited.
+ * Don't do anything on exit: we're just a thread! */
+ if (read(waker_fds.pipe[0], &c, 1) != 1)
+ _exit(0);
+ continue;
+ }
+
+ /* Send LHREQ_BREAK command to snap the Launcher out of it. */
+ pwrite(waker_fds.lguest_fd, args, sizeof(args), cpu_id);
}
+ return 0;
}
/* This routine just sets up a pipe to the Waker process. */
-static int setup_waker(int lguest_fd)
-{
- int pipefd[2], child;
-
- /* We create a pipe to talk to the Waker, and also so it knows when the
- * Launcher dies (and closes pipe). */
- pipe(pipefd);
- child = fork();
- if (child == -1)
- err(1, "forking");
-
- if (child == 0) {
- /* We are the Waker: close the "writing" end of our copy of the
- * pipe and start waiting for input. */
- close(pipefd[1]);
- wake_parent(pipefd[0], lguest_fd);
- }
- /* Close the reading end of our copy of the pipe. */
- close(pipefd[0]);
+static void setup_waker(int lguest_fd)
+{
+ /* This pipe is closed when Launcher dies, telling Waker. */
+ if (pipe(waker_fds.pipe) != 0)
+ err(1, "Creating pipe for Waker");
+
+ /* Waker also needs to know the lguest fd */
+ waker_fds.lguest_fd = lguest_fd;
- /* Here is the fd used to talk to the waker. */
- return pipefd[1];
+ if (clone(waker, malloc(4096) + 4096, CLONE_VM | SIGCHLD, NULL) == -1)
+ err(1, "Creating Waker");
}
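One note on the clone() call above: on x86 the child's stack grows down, so the call passes the top of the freshly malloc()ed region. A sketch of the same call with the allocation checked (the patch itself does not check malloc(), which is tolerable for a one-shot launcher):

	char *stack = malloc(4096);
	if (!stack)
		err(1, "allocating Waker stack");
	/* clone() wants the highest usable address: stacks grow down. */
	if (clone(waker, stack + 4096, CLONE_VM | SIGCHLD, NULL) == -1)
		err(1, "Creating Waker");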
/*
@@ -661,19 +698,22 @@ static unsigned get_vq_desc(struct virtqueue *vq,
unsigned int *out_num, unsigned int *in_num)
{
unsigned int i, head;
+ u16 last_avail;
/* Check it isn't doing very strange things with descriptor numbers. */
- if ((u16)(vq->vring.avail->idx - vq->last_avail_idx) > vq->vring.num)
+ last_avail = lg_last_avail(vq);
+ if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
errx(1, "Guest moved used index from %u to %u",
- vq->last_avail_idx, vq->vring.avail->idx);
+ last_avail, vq->vring.avail->idx);
/* If there's nothing new since last we looked, return invalid. */
- if (vq->vring.avail->idx == vq->last_avail_idx)
+ if (vq->vring.avail->idx == last_avail)
return vq->vring.num;
/* Grab the next descriptor number they're advertising, and increment
* the index we've seen. */
- head = vq->vring.avail->ring[vq->last_avail_idx++ % vq->vring.num];
+ head = vq->vring.avail->ring[last_avail % vq->vring.num];
+ lg_last_avail(vq)++;
/* If their number is silly, that's a fatal mistake. */
if (head >= vq->vring.num)
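Note that lg_last_avail(vq) is used both as a value and as an lvalue (lg_last_avail(vq)++ above), so the vring_last_avail() macro it wraps must expand to an expression naming real storage. That macro's definition is not shown in this patch; under the publish-indices ring layout one plausible definition — an assumption, for illustration only — would be:

	/* Hypothetical: the published last-avail index lives just past the
	 * available ring, so the macro denotes actual storage. */
	#define vring_last_avail(vr) ((vr)->avail->ring[(vr)->num])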
@@ -821,10 +861,13 @@ static bool handle_console_input(int fd, struct device *dev)
unsigned long args[] = { LHREQ_BREAK, 0 };
/* Close the fd so Waker will know it has to
* exit. */
- close(waker_fd);
- /* Just in case waker is blocked in BREAK, send
+ close(waker_fds.pipe[1]);
+ /* Just in case Waker is blocked in BREAK, send
* unbreak now. */
write(fd, args, sizeof(args));
+ printf("network xmit %u recv %u timeout %u usec %u\n",
+ net_xmit_notify, net_recv_notify,
+ net_timeout, timeout_usec);
exit(2);
}
abort->count = 0;
@@ -839,7 +882,7 @@ static bool handle_console_input(int fd, struct device *dev)
/* Handling output for console is simple: we just get all the output buffers
* and write them to stdout. */
-static void handle_console_output(int fd, struct virtqueue *vq)
+static void handle_console_output(int fd, struct virtqueue *vq, bool timeout)
{
unsigned int head, out, in;
int len;
@@ -854,6 +897,21 @@ static void handle_console_output(int fd, struct virtqueue *vq)
}
}
+static void block_vq(struct virtqueue *vq)
+{
+ struct itimerval itm;
+
+ vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
+ vq->blocked = true;
+
+ itm.it_interval.tv_sec = 0;
+ itm.it_interval.tv_usec = 0;
+ itm.it_value.tv_sec = 0;
+ itm.it_value.tv_usec = timeout_usec;
+
+ setitimer(ITIMER_REAL, &itm, NULL);
+}
+
/*
* The Network
*
@@ -861,22 +919,37 @@ static void handle_console_output(int fd, struct virtqueue *vq)
* and write them (ignoring the first element) to this device's file descriptor
* (/dev/net/tun).
*/
-static void handle_net_output(int fd, struct virtqueue *vq)
+static void handle_net_output(int fd, struct virtqueue *vq, bool timeout)
{
- unsigned int head, out, in;
+ unsigned int head, out, in, num = 0;
int len;
struct iovec iov[vq->vring.num];
+ static int last_timeout_num;
+
+ if (!timeout)
+ net_xmit_notify++;
/* Keep getting output buffers from the Guest until we run out. */
while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
if (in)
errx(1, "Input buffers in output queue?");
- /* Check header, but otherwise ignore it (we told the Guest we
- * supported no features, so it shouldn't have anything
- * interesting). */
- (void)convert(&iov[0], struct virtio_net_hdr);
- len = writev(vq->dev->fd, iov+1, out-1);
+ len = writev(vq->dev->fd, iov, out);
+ if (len < 0)
+ err(1, "Writing network packet to tun");
add_used_and_trigger(fd, vq, head, len);
+ num++;
+ }
+
+ /* Block further kicks and set up a timer if we saw anything. */
+ if (!timeout && num)
+ block_vq(vq);
+
+ if (timeout) {
+ if (num < last_timeout_num)
+ timeout_usec += 10;
+ else if (timeout_usec > 1)
+ timeout_usec--;
+ last_timeout_num = num;
}
}
@@ -887,7 +960,6 @@ static bool handle_tun_input(int fd, struct device *dev)
unsigned int head, in_num, out_num;
int len;
struct iovec iov[dev->vq->vring.num];
- struct virtio_net_hdr *hdr;
/* First we need a network buffer from the Guest's recv virtqueue. */
head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
@@ -896,25 +968,23 @@ static bool handle_tun_input(int fd, struct device *dev)
* early, the Guest won't be ready yet. Wait until the device
* status says it's ready. */
/* FIXME: Actually want DRIVER_ACTIVE here. */
- if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK)
- warn("network: no dma buffer!");
+
+ /* Now tell it we want to know if new things appear. */
+ dev->vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
+ wmb();
+
/* We'll turn this back on if input buffers are registered. */
return false;
} else if (out_num)
errx(1, "Output buffers in network recv queue?");
- /* First element is the header: we set it to 0 (no features). */
- hdr = convert(&iov[0], struct virtio_net_hdr);
- hdr->flags = 0;
- hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
-
/* Read the packet from the device directly into the Guest's buffer. */
- len = readv(dev->fd, iov+1, in_num-1);
+ len = readv(dev->fd, iov, in_num);
if (len <= 0)
err(1, "reading network");
/* Tell the Guest about the new packet. */
- add_used_and_trigger(fd, dev->vq, head, sizeof(*hdr) + len);
+ add_used_and_trigger(fd, dev->vq, head, len);
verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1],
@@ -927,11 +997,21 @@ static bool handle_tun_input(int fd, struct device *dev)
/*L:215 This is the callback attached to the network and console input
* virtqueues: it ensures we try again, in case we stopped console or net
* delivery because Guest didn't have any buffers. */
-static void enable_fd(int fd, struct virtqueue *vq)
+static void enable_fd(int fd, struct virtqueue *vq, bool timeout)
+{
+ add_device_fd(vq->dev->fd);
+ /* Snap the Waker out of its select loop. */
+ write(waker_fds.pipe[1], "", 1);
+}
+
+static void net_enable_fd(int fd, struct virtqueue *vq, bool timeout)
{
+ /* We don't need to know again when Guest refills receive buffer. */
+ vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
+ net_recv_notify++;
add_device_fd(vq->dev->fd);
- /* Tell waker to listen to it again */
- write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd));
+ /* Snap the Waker out of its select loop. */
+ write(waker_fds.pipe[1], "", 1);
}
/* When the Guest tells us they updated the status field, we handle it. */
@@ -951,7 +1031,7 @@ static void update_device_status(struct device *dev)
for (vq = dev->vq; vq; vq = vq->next) {
memset(vq->vring.desc, 0,
vring_size(vq->config.num, getpagesize()));
- vq->last_avail_idx = 0;
+ lg_last_avail(vq) = 0;
}
} else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) {
warnx("Device %s configuration FAILED", dev->name);
@@ -960,10 +1040,10 @@ static void update_device_status(struct device *dev)
verbose("Device %s OK: offered", dev->name);
for (i = 0; i < dev->desc->feature_len; i++)
- verbose(" %08x", get_feature_bits(dev)[i]);
+ verbose(" %02x", get_feature_bits(dev)[i]);
verbose(", accepted");
for (i = 0; i < dev->desc->feature_len; i++)
- verbose(" %08x", get_feature_bits(dev)
+ verbose(" %02x", get_feature_bits(dev)
[dev->desc->feature_len+i]);
if (dev->ready)
@@ -1000,7 +1080,7 @@ static void handle_output(int fd, unsigned long addr)
if (strcmp(vq->dev->name, "console") != 0)
verbose("Output to %s\n", vq->dev->name);
if (vq->handle_output)
- vq->handle_output(fd, vq);
+ vq->handle_output(fd, vq, false);
return;
}
}
@@ -1014,6 +1094,29 @@ static void handle_output(int fd, unsigned long addr)
strnlen(from_guest_phys(addr), guest_limit - addr));
}
+static void handle_timeout(int fd)
+{
+ char buf[32];
+ struct device *i;
+ struct virtqueue *vq;
+
+ /* Clear the pipe */
+ read(timeoutpipe[0], buf, sizeof(buf));
+
+ /* Check each device and virtqueue: flush blocked ones. */
+ for (i = devices.dev; i; i = i->next) {
+ for (vq = i->vq; vq; vq = vq->next) {
+ if (!vq->blocked)
+ continue;
+
+ vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
+ vq->blocked = false;
+ if (vq->handle_output)
+ vq->handle_output(fd, vq, true);
+ }
+ }
+}
+
/* This is called when the Waker wakes us up: check for incoming file
* descriptors. */
static void handle_input(int fd)
@@ -1024,16 +1127,20 @@ static void handle_input(int fd)
for (;;) {
struct device *i;
fd_set fds = devices.infds;
+ int num;
+ num = select(devices.max_infd+1, &fds, NULL, NULL, &poll);
+ /* Could get interrupted */
+ if (num < 0)
+ continue;
/* If nothing is ready, we're done. */
- if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0)
+ if (num == 0)
break;
/* Otherwise, call the device(s) which have readable file
* descriptors and a method of handling them. */
for (i = devices.dev; i; i = i->next) {
if (i->handle_input && FD_ISSET(i->fd, &fds)) {
- int dev_fd;
if (i->handle_input(fd, i))
continue;
@@ -1043,13 +1150,12 @@ static void handle_input(int fd)
* buffers to deliver into. Console also uses
* it when it discovers that stdin is closed. */
FD_CLR(i->fd, &devices.infds);
- /* Tell waker to ignore it too, by sending a
- * negative fd number (-1, since 0 is a valid
- * FD number). */
- dev_fd = -i->fd - 1;
- write(waker_fd, &dev_fd, sizeof(dev_fd));
}
}
+
+ /* Is this the timeout fd? */
+ if (FD_ISSET(timeoutpipe[0], &fds))
+ handle_timeout(fd);
}
}
@@ -1098,7 +1204,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type)
/* Each device descriptor is followed by the description of its virtqueues. We
* specify how many descriptors the virtqueue is to have. */
static void add_virtqueue(struct device *dev, unsigned int num_descs,
- void (*handle_output)(int fd, struct virtqueue *me))
+ void (*handle_output)(int, struct virtqueue *, bool))
{
unsigned int pages;
struct virtqueue **i, *vq = malloc(sizeof(*vq));
@@ -1111,9 +1217,9 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
/* Initialize the virtqueue */
vq->next = NULL;
- vq->last_avail_idx = 0;
vq->dev = dev;
vq->inflight = 0;
+ vq->blocked = false;
/* Initialize the configuration. */
vq->config.num = num_descs;
@@ -1168,6 +1274,10 @@ static void add_feature(struct device *dev, unsigned bit)
* how we use it. */
static void set_config(struct device *dev, unsigned len, const void *conf)
{
+ /* We always set the VIRTIO_RING_F_PUBLISH_INDICES feature
+ * bit, so now is a good time to do that. */
+ add_feature(dev, VIRTIO_RING_F_PUBLISH_INDICES);
+
/* Check we haven't overflowed our single page. */
if (device_config(dev) + len > devices.descpage + getpagesize())
errx(1, "Too many devices");
@@ -1242,10 +1352,31 @@ static void setup_console(void)
add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output);
+ /* Every device should set this bit. */
+ add_feature(dev, VIRTIO_RING_F_PUBLISH_INDICES);
verbose("device %u: console\n", devices.device_num++);
}
/*:*/
+static void timeout_alarm(int sig)
+{
+ net_timeout++;
+ write(timeoutpipe[1], "", 1);
+}
+
+static void setup_timeout(void)
+{
+ if (pipe(timeoutpipe) != 0)
+ err(1, "Creating timeout pipe");
+
+ if (fcntl(timeoutpipe[1], F_SETFL,
+ fcntl(timeoutpipe[1], F_GETFL) | O_NONBLOCK) != 0)
+ err(1, "Making timeout pipe nonblocking");
+
+ add_device_fd(timeoutpipe[0]);
+ signal(SIGALRM, timeout_alarm);
+}
+
/*M:010 Inter-guest networking is an interesting area. Simplest is to have a
* --sharenet=<name> option which opens or creates a named pipe. This can be
* used to send packets to another guest in a 1:1 manner.
@@ -1264,10 +1395,25 @@ static void setup_console(void)
static u32 str2ip(const char *ipaddr)
{
- unsigned int byte[4];
+ unsigned int b[4];
- sscanf(ipaddr, "%u.%u.%u.%u", &byte[0], &byte[1], &byte[2], &byte[3]);
- return (byte[0] << 24) | (byte[1] << 16) | (byte[2] << 8) | byte[3];
+ if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4)
+ errx(1, "Failed to parse IP address '%s'", ipaddr);
+ return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
+}
+
+static void str2mac(const char *macaddr, unsigned char mac[6])
+{
+ unsigned int m[6];
+ if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x",
+ &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6)
+ errx(1, "Failed to parse mac address '%s'", macaddr);
+ mac[0] = m[0];
+ mac[1] = m[1];
+ mac[2] = m[2];
+ mac[3] = m[3];
+ mac[4] = m[4];
+ mac[5] = m[5];
}
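Hypothetical usage of the two parsers, matching the new --tunnet syntax; both bail out via errx() on malformed input rather than silently mis-parsing as before:

	unsigned char mac[6];
	u32 ip;

	ip = str2ip("192.168.19.1");		/* host-order IPv4 address */
	str2mac("02:00:00:00:00:01", mac);	/* fills all six bytes */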
/* This code is "adapted" from libbridge: it attaches the Host end of the
@@ -1288,6 +1434,7 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name)
errx(1, "interface %s does not exist!", if_name);
strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
+ ifr.ifr_name[IFNAMSIZ-1] = '\0';
ifr.ifr_ifindex = ifidx;
if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
err(1, "can't add %s to bridge %s", if_name, br_name);
@@ -1296,64 +1443,90 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name)
/* This sets up the Host end of the network device with an IP address, brings
* it up so packets will flow, then copies the MAC address into the hwaddr
* pointer. */
-static void configure_device(int fd, const char *devname, u32 ipaddr,
- unsigned char hwaddr[6])
+static void configure_device(int fd, const char *tapif, u32 ipaddr)
{
struct ifreq ifr;
struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
- /* Don't read these incantations. Just cut & paste them like I did! */
memset(&ifr, 0, sizeof(ifr));
- strcpy(ifr.ifr_name, devname);
+ strcpy(ifr.ifr_name, tapif);
+
+ /* Don't read these incantations. Just cut & paste them like I did! */
sin->sin_family = AF_INET;
sin->sin_addr.s_addr = htonl(ipaddr);
if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
- err(1, "Setting %s interface address", devname);
+ err(1, "Setting %s interface address", tapif);
ifr.ifr_flags = IFF_UP;
if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
- err(1, "Bringing interface %s up", devname);
+ err(1, "Bringing interface %s up", tapif);
+}
+
+static void get_mac(int fd, const char *tapif, unsigned char hwaddr[6])
+{
+ struct ifreq ifr;
+
+ memset(&ifr, 0, sizeof(ifr));
+ strcpy(ifr.ifr_name, tapif);
/* SIOC stands for Socket I/O Control. G means Get (vs S for Set
* above). IF means Interface, and HWADDR is hardware address.
* Simple! */
if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)
- err(1, "getting hw address for %s", devname);
+ err(1, "getting hw address for %s", tapif);
memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
}
-/*L:195 Our network is a Host<->Guest network. This can either use bridging or
- * routing, but the principle is the same: it uses the "tun" device to inject
- * packets into the Host as if they came in from a normal network card. We
- * just shunt packets between the Guest and the tun device. */
-static void setup_tun_net(const char *arg)
+static int get_tun_device(char tapif[IFNAMSIZ])
{
- struct device *dev;
struct ifreq ifr;
- int netfd, ipfd;
- u32 ip;
- const char *br_name = NULL;
- struct virtio_net_config conf;
+ int netfd;
+
+ /* Start with this zeroed. Messy but sure. */
+ memset(&ifr, 0, sizeof(ifr));
/* We open the /dev/net/tun device and tell it we want a tap device. A
* tap device is like a tun device, only somehow different. To tell
* the truth, I completely blundered my way through this code, but it
* works now! */
netfd = open_or_die("/dev/net/tun", O_RDWR);
- memset(&ifr, 0, sizeof(ifr));
- ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
+ ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
strcpy(ifr.ifr_name, "tap%d");
if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
err(1, "configuring /dev/net/tun");
+
+ if (ioctl(netfd, TUNSETOFFLOAD,
+ TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0)
+ err(1, "Could not set features for tun device");
+
/* We don't need checksums calculated for packets coming in this
* device: trust us! */
ioctl(netfd, TUNSETNOCSUM, 1);
+ memcpy(tapif, ifr.ifr_name, IFNAMSIZ);
+ return netfd;
+}
+
+/*L:195 Our network is a Host<->Guest network. This can either use bridging or
+ * routing, but the principle is the same: it uses the "tun" device to inject
+ * packets into the Host as if they came in from a normal network card. We
+ * just shunt packets between the Guest and the tun device. */
+static void setup_tun_net(char *arg)
+{
+ struct device *dev;
+ int netfd, ipfd;
+ u32 ip = INADDR_ANY;
+ bool bridging = false;
+ char tapif[IFNAMSIZ], *p;
+ struct virtio_net_config conf;
+
+ netfd = get_tun_device(tapif);
+
/* First we create a new network device. */
dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input);
/* Network devices need a receive and a send queue, just like
* console. */
- add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
+ add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd);
add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);
/* We need a socket to perform the magic network ioctls to bring up the
@@ -1364,28 +1537,56 @@ static void setup_tun_net(const char *arg)
/* If the command line was --tunnet=bridge:<name> do bridging. */
if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
- ip = INADDR_ANY;
- br_name = arg + strlen(BRIDGE_PFX);
- add_to_bridge(ipfd, ifr.ifr_name, br_name);
- } else /* It is an IP address to set up the device with */
+ arg += strlen(BRIDGE_PFX);
+ bridging = true;
+ }
+
+ /* A mac address may follow the bridge name or IP address */
+ p = strchr(arg, ':');
+ if (p) {
+ str2mac(p+1, conf.mac);
+ *p = '\0';
+ } else {
+ p = arg + strlen(arg);
+ /* None supplied; query the randomly assigned mac. */
+ get_mac(ipfd, tapif, conf.mac);
+ }
+
+ /* arg is now either an IP address or a bridge name */
+ if (bridging)
+ add_to_bridge(ipfd, tapif, arg);
+ else
ip = str2ip(arg);
- /* Set up the tun device, and get the mac address for the interface. */
- configure_device(ipfd, ifr.ifr_name, ip, conf.mac);
+ /* Set up the tun device. */
+ configure_device(ipfd, tapif, ip);
/* Tell Guest what MAC address to use. */
add_feature(dev, VIRTIO_NET_F_MAC);
add_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY);
+ /* Expect Guest to handle everything except UFO */
+ add_feature(dev, VIRTIO_NET_F_CSUM);
+ add_feature(dev, VIRTIO_NET_F_GUEST_CSUM);
+ add_feature(dev, VIRTIO_NET_F_MAC);
+ add_feature(dev, VIRTIO_NET_F_GUEST_TSO4);
+ add_feature(dev, VIRTIO_NET_F_GUEST_TSO6);
+ add_feature(dev, VIRTIO_NET_F_GUEST_ECN);
+ add_feature(dev, VIRTIO_NET_F_HOST_TSO4);
+ add_feature(dev, VIRTIO_NET_F_HOST_TSO6);
+ add_feature(dev, VIRTIO_NET_F_HOST_ECN);
set_config(dev, sizeof(conf), &conf);
/* We don't need the socket any more; setup is done. */
close(ipfd);
- verbose("device %u: tun net %u.%u.%u.%u\n",
- devices.device_num++,
- (u8)(ip>>24),(u8)(ip>>16),(u8)(ip>>8),(u8)ip);
- if (br_name)
- verbose("attached to bridge: %s\n", br_name);
+ devices.device_num++;
+
+ if (bridging)
+ verbose("device %u: tun %s attached to bridge: %s\n",
+ devices.device_num, tapif, arg);
+ else
+ verbose("device %u: tun %s: %s\n",
+ devices.device_num, tapif, arg);
}
/* Our block (disk) device should be really simple: the Guest asks for a block
@@ -1550,7 +1751,7 @@ static bool handle_io_finish(int fd, struct device *dev)
}
/* When the Guest submits some I/O, we just need to wake the I/O thread. */
-static void handle_virtblk_output(int fd, struct virtqueue *vq)
+static void handle_virtblk_output(int fd, struct virtqueue *vq, bool timeout)
{
struct vblk_info *vblk = vq->dev->priv;
char c = 0;
@@ -1621,6 +1822,64 @@ static void setup_block_file(const char *filename)
verbose("device %u: virtblock %llu sectors\n",
devices.device_num, le64_to_cpu(conf.capacity));
}
+
+/* Our random number generator device reads from /dev/random into the Guest's
+ * input buffers. The usual case is that the Guest doesn't want random numbers
+ * and so has no buffers although /dev/random is still readable, whereas
+ * console is the reverse.
+ *
+ * The same logic applies, however. */
+static bool handle_rng_input(int fd, struct device *dev)
+{
+ int len;
+ unsigned int head, in_num, out_num, totlen = 0;
+ struct iovec iov[dev->vq->vring.num];
+
+ /* First we need a buffer from the Guest's virtqueue. */
+ head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
+
+ /* If they're not ready for input, stop listening to this file
+ * descriptor. We'll start again once they add an input buffer. */
+ if (head == dev->vq->vring.num)
+ return false;
+
+ if (out_num)
+ errx(1, "Output buffers in rng?");
+
+ /* This is why we convert to iovecs: the readv() call uses them, and so
+ * it reads straight into the Guest's buffer. We loop to make sure we
+ * fill it. */
+ while (!iov_empty(iov, in_num)) {
+ len = readv(dev->fd, iov, in_num);
+ if (len <= 0)
+ err(1, "Read from /dev/random gave %i", len);
+ iov_consume(iov, in_num, len);
+ totlen += len;
+ }
+
+ /* Tell the Guest about the new input. */
+ add_used_and_trigger(fd, dev->vq, head, totlen);
+
+ /* Everything went OK! */
+ return true;
+}
+
+/* And this creates a "hardware" random number device for the Guest. */
+static void setup_rng(void)
+{
+ struct device *dev;
+ int fd;
+
+ fd = open_or_die("/dev/random", O_RDONLY);
+
+ /* The device responds to return from I/O thread. */
+ dev = new_device("rng", VIRTIO_ID_RNG, fd, handle_rng_input);
+
+ /* The device has one virtqueue, where the Guest places inbufs. */
+ add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
+
+ verbose("device %u: rng\n", devices.device_num++);
+}
/* That's the end of device setup. */
/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */
@@ -1628,11 +1887,12 @@ static void __attribute__((noreturn)) restart_guest(void)
{
unsigned int i;
- /* Closing pipes causes the Waker thread and io_threads to die, and
- * closing /dev/lguest cleans up the Guest. Since we don't track all
- * open fds, we simply close everything beyond stderr. */
+ /* Since we don't track all open fds, we simply close everything beyond
+ * stderr. */
for (i = 3; i < FD_SETSIZE; i++)
close(i);
+
+ /* The exec automatically gets rid of the I/O and Waker threads. */
execv(main_args[0], main_args);
err(1, "Could not exec %s", main_args[0]);
}
@@ -1663,7 +1923,7 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd)
/* ERESTART means that we need to reboot the guest */
} else if (errno == ERESTART) {
restart_guest();
- /* EAGAIN means the Waker wanted us to look at some input.
+ /* EAGAIN means a signal (timeout).
* Anything else means a bug or incompatible change. */
} else if (errno != EAGAIN)
err(1, "Running guest failed");
@@ -1691,13 +1951,14 @@ static struct option opts[] = {
{ "verbose", 0, NULL, 'v' },
{ "tunnet", 1, NULL, 't' },
{ "block", 1, NULL, 'b' },
+ { "rng", 0, NULL, 'r' },
{ "initrd", 1, NULL, 'i' },
{ NULL },
};
static void usage(void)
{
errx(1, "Usage: lguest [--verbose] "
- "[--tunnet=(<ipaddr>|bridge:<bridgename>)\n"
+ "[--tunnet=(<ipaddr>:<macaddr>|bridge:<bridgename>:<macaddr>)\n"
"|--block=<filename>|--initrd=<filename>]...\n"
"<mem-in-mb> vmlinux [args...]");
}
@@ -1765,6 +2026,9 @@ int main(int argc, char *argv[])
case 'b':
setup_block_file(optarg);
break;
+ case 'r':
+ setup_rng();
+ break;
case 'i':
initrd_name = optarg;
break;
@@ -1783,6 +2047,9 @@ int main(int argc, char *argv[])
/* We always have a console device */
setup_console();
+ /* We can timeout waiting for Guest network transmit. */
+ setup_timeout();
+
/* Now we load the kernel */
start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
@@ -1826,10 +2093,10 @@ int main(int argc, char *argv[])
* /dev/lguest file descriptor. */
lguest_fd = tell_kernel(pgdir, start);
- /* We fork off a child process, which wakes the Launcher whenever one
- * of the input file descriptors needs attention. We call this the
- * Waker, and we'll cover it in a moment. */
- waker_fd = setup_waker(lguest_fd);
+ /* We clone off a thread, which wakes the Launcher whenever one of the
+ * input file descriptors needs attention. We call this the Waker, and
+ * we'll cover it in a moment. */
+ setup_waker(lguest_fd);
/* Finally, run the Guest. This doesn't return. */
run_guest(lguest_fd);
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c
index ecb9eb78d687..57c10efa161c 100644
--- a/arch/ia64/kernel/salinfo.c
+++ b/arch/ia64/kernel/salinfo.c
@@ -192,7 +192,7 @@ struct salinfo_platform_oemdata_parms {
static void
salinfo_work_to_do(struct salinfo_data *data)
{
- down_trylock(&data->mutex);
+ down_try(&data->mutex);
up(&data->mutex);
}
@@ -309,7 +309,7 @@ salinfo_event_read(struct file *file, char __user *buffer, size_t count, loff_t
int i, n, cpu = -1;
retry:
- if (cpus_empty(data->cpu_event) && down_trylock(&data->mutex)) {
+ if (cpus_empty(data->cpu_event) && !down_try(&data->mutex)) {
if (file->f_flags & O_NONBLOCK)
return -EAGAIN;
if (down_interruptible(&data->mutex))
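These call sites flip their conditions because the series replaces down_trylock(), which returns 0 on success, with down_try(), which — on the evidence of the converted callers here — returns true on success, so tests read naturally:

	/* Old convention: zero means "got it". */
	if (down_trylock(&sem) == 0)
		up(&sem);

	/* New convention: true means "got it". */
	if (down_try(&sem))
		up(&sem);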
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index eb530b4128ba..2ed88122be93 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -565,6 +565,7 @@ bool "s390 guest support (EXPERIMENTAL)"
depends on 64BIT && EXPERIMENTAL
select VIRTIO
select VIRTIO_RING
+ select VIRTIO_CONSOLE
help
Select this option if you want to run the kernel under s390 linux
endmenu
diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c
index 288ad490a6dd..2ecc256c6a5d 100644
--- a/arch/s390/kernel/kprobes.c
+++ b/arch/s390/kernel/kprobes.c
@@ -197,7 +197,7 @@ void __kprobes arch_arm_kprobe(struct kprobe *p)
args.new = BREAKPOINT_INSTRUCTION;
kcb->kprobe_status = KPROBE_SWAP_INST;
- stop_machine_run(swap_instruction, &args, NR_CPUS);
+ stop_machine(swap_instruction, &args, NULL);
kcb->kprobe_status = status;
}
@@ -212,7 +212,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
args.new = p->opcode;
kcb->kprobe_status = KPROBE_SWAP_INST;
- stop_machine_run(swap_instruction, &args, NR_CPUS);
+ stop_machine(swap_instruction, &args, NULL);
kcb->kprobe_status = status;
}
@@ -332,7 +332,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
* No kprobe at this address. The fault has not been
* caused by a kprobe breakpoint. The race of breakpoint
* vs. kprobe remove does not exist because on s390 we
- * use stop_machine_run to arm/disarm the breakpoints.
+ * use stop_machine to arm/disarm the breakpoints.
*/
goto no_kprobe;
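The conversion pattern repeated throughout this merge: stop_machine_run(fn, data, NR_CPUS) becomes stop_machine(fn, data, cpus), where — as the converted call sites suggest — a NULL cpumask means "run fn on any one cpu":

	/* Before: */
	err = stop_machine_run(fn, data, NR_CPUS);
	/* After: NULL = don't care which cpu executes fn. */
	err = stop_machine(fn, data, NULL);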
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index b358e18273b0..62122bad1e33 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -54,6 +54,7 @@
#include <asm/sections.h>
#include <asm/ebcdic.h>
#include <asm/compat.h>
+#include <asm/kvm_virtio.h>
long psw_kernel_bits = (PSW_BASE_BITS | PSW_MASK_DAT | PSW_ASC_PRIMARY |
PSW_MASK_MCHECK | PSW_DEFAULT_KEY);
@@ -766,7 +767,8 @@ setup_arch(char **cmdline_p)
printk("We are running under VM (64 bit mode)\n");
else if (MACHINE_IS_KVM) {
printk("We are running under KVM (64 bit mode)\n");
- add_preferred_console("ttyS", 1, NULL);
+ add_preferred_console("hvc", 0, NULL);
+ s390_virtio_console_init();
} else
printk("We are running native (64 bit mode)\n");
#endif /* CONFIG_64BIT */
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 756fc489652b..65f0b8a47bed 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1044,6 +1044,9 @@ __init void lguest_init(void)
init_pg_tables_start = __pa(pg0);
init_pg_tables_end = __pa(pg0);
+ /* As described in head_32.S, we map the first 128M of memory. */
+ max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
+
/* Load the %fs segment register (the per-cpu segment register) with
* the normal data segment to get through booting. */
asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index dd7ea203f940..42251095134f 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -196,6 +196,7 @@ static int virtblk_probe(struct virtio_device *vdev)
int err;
u64 cap;
u32 v;
+ u32 blk_size;
if (index_to_minor(index) >= 1 << MINORBITS)
return -ENOSPC;
@@ -290,6 +291,13 @@ static int virtblk_probe(struct virtio_device *vdev)
if (!err)
blk_queue_max_hw_segments(vblk->disk->queue, v);
+ /* Host can optionally specify the block size of the device */
+ err = virtio_config_val(vdev, VIRTIO_BLK_F_BLK_SIZE,
+ offsetof(struct virtio_blk_config, blk_size),
+ &blk_size);
+ if (!err)
+ blk_queue_hardsect_size(vblk->disk->queue, blk_size);
+
add_disk(vblk->disk);
return 0;
@@ -330,7 +338,7 @@ static struct virtio_device_id id_table[] = {
static unsigned int features[] = {
VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX,
- VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO,
+ VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
};
static struct virtio_driver virtio_blk = {
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index e0bbbfb6a36b..67b5df72f90b 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -578,11 +578,14 @@ config HVC_DRIVER
It will automatically be selected if one of the back-end console drivers
is selected.
+config HVC_IRQ
+ bool
config HVC_CONSOLE
bool "pSeries Hypervisor Virtual Console support"
depends on PPC_PSERIES
select HVC_DRIVER
+ select HVC_IRQ
help
pSeries machines when partitioned support a hypervisor virtual
console. This driver allows each pSeries partition to have a console
@@ -593,6 +596,7 @@ config HVC_ISERIES
depends on PPC_ISERIES
default y
select HVC_DRIVER
+ select HVC_IRQ
help
iSeries machines support a hypervisor virtual console.
@@ -614,13 +618,18 @@ config HVC_XEN
bool "Xen Hypervisor Console support"
depends on XEN
select HVC_DRIVER
+ select HVC_IRQ
default y
help
Xen virtual console device driver
config VIRTIO_CONSOLE
- bool
+ tristate "Virtio console"
+ depends on VIRTIO
select HVC_DRIVER
+ help
+ Virtio console for use with lguest and other hypervisors.
+
config HVCS
tristate "IBM Hypervisor Virtual Console Server support"
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index dc5a327d72d5..6db3b3b1ed92 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_HVC_ISERIES) += hvc_iseries.o
obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o
obj-$(CONFIG_HVC_BEAT) += hvc_beat.o
obj-$(CONFIG_HVC_DRIVER) += hvc_console.o
+obj-$(CONFIG_HVC_IRQ) += hvc_irq.o
obj-$(CONFIG_HVC_XEN) += hvc_xen.o
obj-$(CONFIG_VIRTIO_CONSOLE) += virtio_console.o
obj-$(CONFIG_RAW_DRIVER) += raw.o
diff --git a/drivers/char/hvc_console.c b/drivers/char/hvc_console.c
index 2f9759d625cc..02aac104842d 100644
--- a/drivers/char/hvc_console.c
+++ b/drivers/char/hvc_console.c
@@ -27,7 +27,6 @@
#include <linux/init.h>
#include <linux/kbd_kern.h>
#include <linux/kernel.h>
-#include <linux/kref.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/module.h>
@@ -75,23 +74,6 @@ static int hvc_init(void);
static int sysrq_pressed;
#endif
-struct hvc_struct {
- spinlock_t lock;
- int index;
- struct tty_struct *tty;
- unsigned int count;
- int do_wakeup;
- char *outbuf;
- int outbuf_size;
- int n_outbuf;
- uint32_t vtermno;
- struct hv_ops *ops;
- int irq_requested;
- int irq;
- struct list_head next;
- struct kref kref; /* ref count & hvc_struct lifetime */
-};
-
/* dynamic list of hvc_struct instances */
static LIST_HEAD(hvc_structs);
@@ -298,27 +280,15 @@ int hvc_instantiate(uint32_t vtermno, int index, struct hv_ops *ops)
return 0;
}
+EXPORT_SYMBOL_GPL(hvc_instantiate);
/* Wake the sleeping khvcd */
-static void hvc_kick(void)
+void hvc_kick(void)
{
hvc_kicked = 1;
wake_up_process(hvc_task);
}
-
-static int hvc_poll(struct hvc_struct *hp);
-
-/*
- * NOTE: This API isn't used if the console adapter doesn't support interrupts.
- * In this case the console is poll driven.
- */
-static irqreturn_t hvc_handle_interrupt(int irq, void *dev_instance)
-{
- /* if hvc_poll request a repoll, then kick the hvcd thread */
- if (hvc_poll(dev_instance))
- hvc_kick();
- return IRQ_HANDLED;
-}
+EXPORT_SYMBOL_GPL(hvc_kick);
static void hvc_unthrottle(struct tty_struct *tty)
{
@@ -333,7 +303,6 @@ static int hvc_open(struct tty_struct *tty, struct file * filp)
{
struct hvc_struct *hp;
unsigned long flags;
- int irq = 0;
int rc = 0;
/* Auto increments kref reference if found. */
@@ -352,18 +321,15 @@ static int hvc_open(struct tty_struct *tty, struct file * filp)
tty->low_latency = 1; /* Makes flushes to ldisc synchronous. */
hp->tty = tty;
- /* Save for request_irq outside of spin_lock. */
- irq = hp->irq;
- if (irq)
- hp->irq_requested = 1;
+
+ if (hp->ops->notifier_add)
+ rc = hp->ops->notifier_add(hp, hp->data);
spin_unlock_irqrestore(&hp->lock, flags);
- /* check error, fallback to non-irq */
- if (irq)
- rc = request_irq(irq, hvc_handle_interrupt, IRQF_DISABLED, "hvc_console", hp);
+
/*
- * If the request_irq() fails and we return an error. The tty layer
+ * If the notifier fails we return an error. The tty layer
* will call hvc_close() after a failed open but we don't want to clean
* up there so we'll clean up here and clear out the previously set
* tty fields and return the kref reference.
@@ -371,7 +337,6 @@ static int hvc_open(struct tty_struct *tty, struct file * filp)
if (rc) {
spin_lock_irqsave(&hp->lock, flags);
hp->tty = NULL;
- hp->irq_requested = 0;
spin_unlock_irqrestore(&hp->lock, flags);
tty->driver_data = NULL;
kref_put(&hp->kref, destroy_hvc_struct);
@@ -386,7 +351,6 @@ static int hvc_open(struct tty_struct *tty, struct file * filp)
static void hvc_close(struct tty_struct *tty, struct file * filp)
{
struct hvc_struct *hp;
- int irq = 0;
unsigned long flags;
if (tty_hung_up_p(filp))
@@ -404,9 +368,8 @@ static void hvc_close(struct tty_struct *tty, struct file * filp)
spin_lock_irqsave(&hp->lock, flags);
if (--hp->count == 0) {
- if (hp->irq_requested)
- irq = hp->irq;
- hp->irq_requested = 0;
+ if (hp->ops->notifier_del)
+ hp->ops->notifier_del(hp, hp->data);
/* We are done with the tty pointer now. */
hp->tty = NULL;
@@ -418,10 +381,6 @@ static void hvc_close(struct tty_struct *tty, struct file * filp)
* waking periodically to check chars_in_buffer().
*/
tty_wait_until_sent(tty, HVC_CLOSE_WAIT);
-
- if (irq)
- free_irq(irq, hp);
-
} else {
if (hp->count < 0)
printk(KERN_ERR "hvc_close %X: oops, count is %d\n",
@@ -436,7 +395,6 @@ static void hvc_hangup(struct tty_struct *tty)
{
struct hvc_struct *hp = tty->driver_data;
unsigned long flags;
- int irq = 0;
int temp_open_count;
if (!hp)
@@ -458,13 +416,12 @@ static void hvc_hangup(struct tty_struct *tty)
hp->count = 0;
hp->n_outbuf = 0;
hp->tty = NULL;
- if (hp->irq_requested)
- /* Saved for use outside of spin_lock. */
- irq = hp->irq;
- hp->irq_requested = 0;
+
+ if (hp->ops->notifier_del)
+ hp->ops->notifier_del(hp, hp->data);
+
spin_unlock_irqrestore(&hp->lock, flags);
- if (irq)
- free_irq(irq, hp);
+
while(temp_open_count) {
--temp_open_count;
kref_put(&hp->kref, destroy_hvc_struct);
@@ -575,7 +532,7 @@ static u32 timeout = MIN_TIMEOUT;
#define HVC_POLL_READ 0x00000001
#define HVC_POLL_WRITE 0x00000002
-static int hvc_poll(struct hvc_struct *hp)
+int hvc_poll(struct hvc_struct *hp)
{
struct tty_struct *tty;
int i, n, poll_mask = 0;
@@ -602,10 +559,10 @@ static int hvc_poll(struct hvc_struct *hp)
if (test_bit(TTY_THROTTLED, &tty->flags))
goto throttled;
- /* If we aren't interrupt driven and aren't throttled, we always
+ /* If we aren't notifier driven and aren't throttled, we always
* request a reschedule
*/
- if (hp->irq == 0)
+ if (!hp->irq_requested)
poll_mask |= HVC_POLL_READ;
/* Read data if any */
@@ -674,6 +631,7 @@ static int hvc_poll(struct hvc_struct *hp)
return poll_mask;
}
+EXPORT_SYMBOL_GPL(hvc_poll);
/*
* This kthread is either polling or interrupt driven. This is determined by
@@ -733,7 +691,7 @@ static const struct tty_operations hvc_ops = {
.chars_in_buffer = hvc_chars_in_buffer,
};
-struct hvc_struct __devinit *hvc_alloc(uint32_t vtermno, int irq,
+struct hvc_struct __devinit *hvc_alloc(uint32_t vtermno, int data,
struct hv_ops *ops, int outbuf_size)
{
struct hvc_struct *hp;
@@ -754,7 +712,7 @@ struct hvc_struct __devinit *hvc_alloc(uint32_t vtermno, int irq,
memset(hp, 0x00, sizeof(*hp));
hp->vtermno = vtermno;
- hp->irq = irq;
+ hp->data = data;
hp->ops = ops;
hp->outbuf_size = outbuf_size;
hp->outbuf = &((char *)hp)[ALIGN(sizeof(*hp), sizeof(long))];
@@ -784,6 +742,7 @@ struct hvc_struct __devinit *hvc_alloc(uint32_t vtermno, int irq,
return hp;
}
+EXPORT_SYMBOL_GPL(hvc_alloc);
int __devexit hvc_remove(struct hvc_struct *hp)
{
diff --git a/drivers/char/hvc_console.h b/drivers/char/hvc_console.h
index 42ffb17e15df..d9ce10915625 100644
--- a/drivers/char/hvc_console.h
+++ b/drivers/char/hvc_console.h
@@ -26,6 +26,7 @@
#ifndef HVC_CONSOLE_H
#define HVC_CONSOLE_H
+#include <linux/kref.h>
/*
* This is the max number of console adapters that can/will be found as
@@ -42,24 +43,50 @@
*/
#define HVC_ALLOC_TTY_ADAPTERS 8
+struct hvc_struct {
+ spinlock_t lock;
+ int index;
+ struct tty_struct *tty;
+ unsigned int count;
+ int do_wakeup;
+ char *outbuf;
+ int outbuf_size;
+ int n_outbuf;
+ uint32_t vtermno;
+ struct hv_ops *ops;
+ int irq_requested;
+ int data;
+ struct list_head next;
+ struct kref kref; /* ref count & hvc_struct lifetime */
+};
/* implemented by a low level driver */
struct hv_ops {
int (*get_chars)(uint32_t vtermno, char *buf, int count);
int (*put_chars)(uint32_t vtermno, const char *buf, int count);
-};
-struct hvc_struct;
+ /* Callbacks for notification. Called in open and close */
+ int (*notifier_add)(struct hvc_struct *hp, int irq);
+ void (*notifier_del)(struct hvc_struct *hp, int irq);
+};
/* Register a vterm and a slot index for use as a console (console_init) */
extern int hvc_instantiate(uint32_t vtermno, int index, struct hv_ops *ops);
/* register a vterm for hvc tty operation (module_init or hotplug add) */
-extern struct hvc_struct * __devinit hvc_alloc(uint32_t vtermno, int irq,
+extern struct hvc_struct * __devinit hvc_alloc(uint32_t vtermno, int data,
struct hv_ops *ops, int outbuf_size);
-/* remove a vterm from hvc tty operation (modele_exit or hotplug remove) */
+/* remove a vterm from hvc tty operation (module_exit or hotplug remove) */
extern int __devexit hvc_remove(struct hvc_struct *hp);
+/* data available */
+int hvc_poll(struct hvc_struct *hp);
+void hvc_kick(void);
+
+/* default notifier for irq based notification */
+extern int notifier_add_irq(struct hvc_struct *hp, int data);
+extern void notifier_del_irq(struct hvc_struct *hp, int data);
+
#if defined(CONFIG_XMON) && defined(CONFIG_SMP)
#include <asm/xmon.h>
diff --git a/drivers/char/hvc_irq.c b/drivers/char/hvc_irq.c
new file mode 100644
index 000000000000..92ab1e6e6e0e
--- /dev/null
+++ b/drivers/char/hvc_irq.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright IBM Corp. 2001,2008
+ *
+ * This file contains the IRQ specific code for hvc_console
+ *
+ */
+
+#include <linux/interrupt.h>
+
+#include "hvc_console.h"
+
+static irqreturn_t hvc_handle_interrupt(int irq, void *dev_instance)
+{
+ /* if hvc_poll request a repoll, then kick the hvcd thread */
+ if (hvc_poll(dev_instance))
+ hvc_kick();
+ return IRQ_HANDLED;
+}
+
+/*
+ * For IRQ based systems these callbacks can be used
+ */
+int notifier_add_irq(struct hvc_struct *hp, int irq)
+{
+ int rc;
+
+ if (!irq) {
+ hp->irq_requested = 0;
+ return 0;
+ }
+ rc = request_irq(irq, hvc_handle_interrupt, IRQF_DISABLED,
+ "hvc_console", hp);
+ if (!rc)
+ hp->irq_requested = 1;
+ return rc;
+}
+
+void notifier_del_irq(struct hvc_struct *hp, int irq)
+{
+ if (!irq)
+ return;
+ free_irq(irq, hp);
+ hp->irq_requested = 0;
+}
+
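With hvc_irq.c in place, an interrupt-driven backend only has to point the new hv_ops callbacks at these helpers and hand its IRQ number to hvc_alloc() as the data argument — exactly what the iSeries, vio and Xen hunks below do. A hypothetical backend, for illustration:

	static struct hv_ops my_ops = {
		.get_chars    = my_get_chars,
		.put_chars    = my_put_chars,
		.notifier_add = notifier_add_irq,
		.notifier_del = notifier_del_irq,
	};

	hp = hvc_alloc(vtermno, my_irq, &my_ops, 256);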
diff --git a/drivers/char/hvc_iseries.c b/drivers/char/hvc_iseries.c
index a08f8f981c11..b71c610fe5ae 100644
--- a/drivers/char/hvc_iseries.c
+++ b/drivers/char/hvc_iseries.c
@@ -200,6 +200,8 @@ done:
static struct hv_ops hvc_get_put_ops = {
.get_chars = get_chars,
.put_chars = put_chars,
+ .notifier_add = notifier_add_irq,
+ .notifier_del = notifier_del_irq,
};
static int __devinit hvc_vio_probe(struct vio_dev *vdev,
diff --git a/drivers/char/hvc_vio.c b/drivers/char/hvc_vio.c
index 79711aa4b41d..93f3840c1682 100644
--- a/drivers/char/hvc_vio.c
+++ b/drivers/char/hvc_vio.c
@@ -80,6 +80,8 @@ static int filtered_get_chars(uint32_t vtermno, char *buf, int count)
static struct hv_ops hvc_get_put_ops = {
.get_chars = filtered_get_chars,
.put_chars = hvc_put_chars,
+ .notifier_add = notifier_add_irq,
+ .notifier_del = notifier_del_irq,
};
static int __devinit hvc_vio_probe(struct vio_dev *vdev,
diff --git a/drivers/char/hvc_xen.c b/drivers/char/hvc_xen.c
index db2ae4216279..6b70aa66a587 100644
--- a/drivers/char/hvc_xen.c
+++ b/drivers/char/hvc_xen.c
@@ -100,6 +100,8 @@ static int read_console(uint32_t vtermno, char *buf, int len)
static struct hv_ops hvc_ops = {
.get_chars = read_console,
.put_chars = write_console,
+ .notifier_add = notifier_add_irq,
+ .notifier_del = notifier_del_irq,
};
static int __init xen_init(void)
diff --git a/drivers/char/hw_random/intel-rng.c b/drivers/char/hw_random/intel-rng.c
index 27fdc0866496..8a2fce0756ec 100644
--- a/drivers/char/hw_random/intel-rng.c
+++ b/drivers/char/hw_random/intel-rng.c
@@ -241,7 +241,7 @@ static int __init intel_rng_hw_init(void *_intel_rng_hw)
struct intel_rng_hw *intel_rng_hw = _intel_rng_hw;
u8 mfc, dvc;
- /* interrupts disabled in stop_machine_run call */
+ /* interrupts disabled in stop_machine call */
if (!(intel_rng_hw->fwh_dec_en1_val & FWH_F8_EN_MASK))
pci_write_config_byte(intel_rng_hw->dev,
@@ -365,10 +365,10 @@ static int __init mod_init(void)
* location with the Read ID command, all activity on the system
* must be stopped until the state is back to normal.
*
- * Use stop_machine_run because IPIs can be blocked by disabling
+ * Use stop_machine because IPIs can be blocked by disabling
* interrupts.
*/
- err = stop_machine_run(intel_rng_hw_init, intel_rng_hw, NR_CPUS);
+ err = stop_machine(intel_rng_hw_init, intel_rng_hw, NULL);
pci_dev_put(dev);
iounmap(intel_rng_hw->mem);
kfree(intel_rng_hw);
diff --git a/drivers/char/snsc.c b/drivers/char/snsc.c
index 32b74de18f5f..7d4532bb52bf 100644
--- a/drivers/char/snsc.c
+++ b/drivers/char/snsc.c
@@ -164,7 +164,7 @@ scdrv_read(struct file *file, char __user *buf, size_t count, loff_t *f_pos)
struct subch_data_s *sd = (struct subch_data_s *) file->private_data;
/* try to get control of the read buffer */
- if (down_trylock(&sd->sd_rbs)) {
+ if (!down_try(&sd->sd_rbs)) {
/* somebody else has it now;
* if we're non-blocking, then exit...
*/
@@ -256,7 +256,7 @@ scdrv_write(struct file *file, const char __user *buf,
struct subch_data_s *sd = (struct subch_data_s *) file->private_data;
/* try to get control of the write buffer */
- if (down_trylock(&sd->sd_wbs)) {
+ if (!down_try(&sd->sd_wbs)) {
/* somebody else has it now;
* if we're non-blocking, then exit...
*/
diff --git a/drivers/char/viotape.c b/drivers/char/viotape.c
index ffc9254f7e02..3a71377f0261 100644
--- a/drivers/char/viotape.c
+++ b/drivers/char/viotape.c
@@ -362,7 +362,7 @@ static ssize_t viotap_write(struct file *file, const char *buf,
* semaphore
*/
if (noblock) {
- if (down_trylock(&reqSem)) {
+ if (!down_try(&reqSem)) {
ret = -EWOULDBLOCK;
goto free_op;
}
@@ -452,7 +452,7 @@ static ssize_t viotap_read(struct file *file, char *buf, size_t count,
* semaphore
*/
if (noblock) {
- if (down_trylock(&reqSem)) {
+ if (!down_try(&reqSem)) {
ret = -EWOULDBLOCK;
goto free_op;
}
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index dc17fe3a88bc..d0f4eb6fdb7f 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -46,6 +46,9 @@ static char *in, *inbuf;
/* The operations for our console. */
static struct hv_ops virtio_cons;
+/* The hvc device */
+static struct hvc_struct *hvc;
+
/*D:310 The put_chars() callback is pretty straightforward.
*
* We turn the characters into a scatter-gather list, add it to the output
@@ -134,6 +137,27 @@ int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int))
return hvc_instantiate(0, 0, &virtio_cons);
}
+/*
+ * We support only one console, so the hvc struct is a global variable.
+ * There is no need to do anything in these notifiers.
+ */
+static int notifier_add_vio(struct hvc_struct *hp, int data)
+{
+ hp->irq_requested = 1;
+ return 0;
+}
+
+static void notifier_del_vio(struct hvc_struct *hp, int data)
+{
+ hp->irq_requested = 0;
+}
+
+static void hvc_handle_input(struct virtqueue *vq)
+{
+ if (hvc_poll(hvc))
+ hvc_kick();
+}
+
/*D:370 Once we're further in boot, we get probed like any other virtio device.
* At this stage we set up the output virtqueue.
*
@@ -144,7 +168,6 @@ int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int))
static int __devinit virtcons_probe(struct virtio_device *dev)
{
int err;
- struct hvc_struct *hvc;
vdev = dev;
@@ -158,7 +181,7 @@ static int __devinit virtcons_probe(struct virtio_device *dev)
/* Find the input queue. */
/* FIXME: This is why we want to wean off hvc: we do nothing
* when input comes in. */
- in_vq = vdev->config->find_vq(vdev, 0, NULL);
+ in_vq = vdev->config->find_vq(vdev, 0, hvc_handle_input);
if (IS_ERR(in_vq)) {
err = PTR_ERR(in_vq);
goto free;
@@ -173,15 +196,18 @@ static int __devinit virtcons_probe(struct virtio_device *dev)
/* Start using the new console output. */
virtio_cons.get_chars = get_chars;
virtio_cons.put_chars = put_chars;
+ virtio_cons.notifier_add = notifier_add_vio;
+ virtio_cons.notifier_del = notifier_del_vio;
/* The first argument of hvc_alloc() is the virtual console number, so
- * we use zero. The second argument is the interrupt number; we
- * currently leave this as zero: it would be better not to use the
- * hvc mechanism and fix this (FIXME!).
+ * we use zero. The second argument is the parameter for the
+ * notification mechanism (like irq number). We currently leave this
+ * as zero, virtqueues have implicit notifications.
*
* The third argument is a "struct hv_ops" containing the put_chars()
- * and get_chars() pointers. The final argument is the output buffer
- * size: we can do any size, so we put PAGE_SIZE here. */
+ * get_chars(), notifier_add() and notifier_del() pointers.
+ * The final argument is the output buffer size: we can do any size,
+ * so we put PAGE_SIZE here. */
hvc = hvc_alloc(0, 0, &virtio_cons, PAGE_SIZE);
if (IS_ERR(hvc)) {
err = PTR_ERR(hvc);
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index 8c46f2257098..7c1f334639ae 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -901,7 +901,7 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp)
return -ENXIO;
if (filp->f_flags & O_NONBLOCK) {
- if (down_trylock(&port->sm_sem)) {
+ if (!down_try(&port->sm_sem)) {
ret = -EAGAIN;
goto fail;
}
diff --git a/drivers/input/serio/hil_mlc.c b/drivers/input/serio/hil_mlc.c
index 37586a68d345..e779f3d80837 100644
--- a/drivers/input/serio/hil_mlc.c
+++ b/drivers/input/serio/hil_mlc.c
@@ -607,7 +607,7 @@ static inline void hilse_setup_input(hil_mlc *mlc, const struct hilse_node *node
do_gettimeofday(&(mlc->instart));
mlc->icount = 15;
memset(mlc->ipacket, 0, 16 * sizeof(hil_packet));
- BUG_ON(down_trylock(&mlc->isem));
+ BUG_ON(!down_try(&mlc->isem));
}
#ifdef HIL_MLC_DEBUG
@@ -694,7 +694,7 @@ static int hilse_donode(hil_mlc *mlc)
out2:
write_unlock_irqrestore(&mlc->lock, flags);
- if (down_trylock(&mlc->osem)) {
+ if (!down_try(&mlc->osem)) {
nextidx = HILSEN_DOZE;
break;
}
diff --git a/drivers/input/serio/hp_sdc_mlc.c b/drivers/input/serio/hp_sdc_mlc.c
index b587e2d576ac..e88a352b0d8e 100644
--- a/drivers/input/serio/hp_sdc_mlc.c
+++ b/drivers/input/serio/hp_sdc_mlc.c
@@ -148,7 +148,7 @@ static int hp_sdc_mlc_in(hil_mlc *mlc, suseconds_t timeout)
priv = mlc->priv;
/* Try to down the semaphore */
- if (down_trylock(&mlc->isem)) {
+ if (!down_try(&mlc->isem)) {
struct timeval tv;
if (priv->emtestmode) {
mlc->ipacket[0] =
@@ -186,13 +186,13 @@ static int hp_sdc_mlc_cts(hil_mlc *mlc)
priv = mlc->priv;
/* Try to down the semaphores -- they should be up. */
- BUG_ON(down_trylock(&mlc->isem));
- BUG_ON(down_trylock(&mlc->osem));
+ BUG_ON(!down_try(&mlc->isem));
+ BUG_ON(!down_try(&mlc->osem));
up(&mlc->isem);
up(&mlc->osem);
- if (down_trylock(&mlc->csem)) {
+ if (!down_try(&mlc->csem)) {
if (priv->trans.act.semaphore != &mlc->csem)
goto poll;
else
@@ -229,7 +229,7 @@ static void hp_sdc_mlc_out(hil_mlc *mlc)
priv = mlc->priv;
/* Try to down the semaphore -- it should be up. */
- BUG_ON(down_trylock(&mlc->osem));
+ BUG_ON(!down_try(&mlc->osem));
if (mlc->opacket & HIL_DO_ALTER_CTRL)
goto do_control;
@@ -240,7 +240,7 @@ static void hp_sdc_mlc_out(hil_mlc *mlc)
return;
}
/* Shouldn't be sending commands when loop may be busy */
- BUG_ON(down_trylock(&mlc->csem));
+ BUG_ON(!down_try(&mlc->csem));
up(&mlc->csem);
priv->trans.actidx = 0;
@@ -296,7 +296,7 @@ static void hp_sdc_mlc_out(hil_mlc *mlc)
priv->tseq[3] = 0;
if (mlc->opacket & HIL_CTRL_APE) {
priv->tseq[3] |= HP_SDC_LPC_APE_IPF;
- down_trylock(&mlc->csem);
+ down_try(&mlc->csem);
}
enqueue:
hp_sdc_enqueue_transaction(&priv->trans);
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 5eea4356d703..90663e01a56e 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -135,6 +135,7 @@ static void unmap_switcher(void)
/* Now we just need to free the pages we copied the switcher into */
for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
__free_pages(switcher_page[i], 0);
+ kfree(switcher_page);
}
/*H:032
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 0414ddf87587..a1039068f95c 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -406,7 +406,8 @@ void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi)
* deliver_trap() to bounce it back into the Guest. */
static void default_idt_entry(struct desc_struct *idt,
int trap,
- const unsigned long handler)
+ const unsigned long handler,
+ const struct desc_struct *base)
{
/* A present interrupt gate. */
u32 flags = 0x8e00;
@@ -415,6 +416,10 @@ static void default_idt_entry(struct desc_struct *idt,
* the Guest to use the "int" instruction to trigger it. */
if (trap == LGUEST_TRAP_ENTRY)
flags |= (GUEST_PL << 13);
+ else if (base)
+ /* Copy priv. level from what Guest asked for. This allows
+ * debug (int 3) traps from Guest userspace, for example. */
+ flags |= (base->b & 0x6000);
/* Now pack it into the IDT entry in its weird format. */
idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF);
@@ -428,7 +433,7 @@ void setup_default_idt_entries(struct lguest_ro_state *state,
unsigned int i;
for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++)
- default_idt_entry(&state->guest_idt[i], i, def[i]);
+ default_idt_entry(&state->guest_idt[i], i, def[i], NULL);
}
/*H:240 We don't use the IDT entries in the "struct lguest" directly, instead
@@ -442,6 +447,8 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
/* We can simply copy the direct traps, otherwise we use the default
* ones in the Switcher: they will return to the Host. */
for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) {
+ const struct desc_struct *gidt = &cpu->arch.idt[i];
+
/* If no Guest can ever override this trap, leave it alone. */
if (!direct_trap(i))
continue;
@@ -449,12 +456,15 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
/* Only trap gates (type 15) can go direct to the Guest.
* Interrupt gates (type 14) disable interrupts as they are
* entered, which we never let the Guest do. Not present
- * entries (type 0x0) also can't go direct, of course. */
- if (idt_type(cpu->arch.idt[i].a, cpu->arch.idt[i].b) == 0xF)
- idt[i] = cpu->arch.idt[i];
+ * entries (type 0x0) also can't go direct, of course.
+ *
+ * If it can't go direct, we still need to copy the priv. level:
+ * they might want to give userspace access to a software
+ * interrupt. */
+ if (idt_type(gidt->a, gidt->b) == 0xF)
+ idt[i] = *gidt;
else
- /* Reset it to the default. */
- default_idt_entry(&idt[i], i, def[i]);
+ default_idt_entry(&idt[i], i, def[i], gidt);
}
}
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index 1a8de57289eb..37344aaee22f 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -98,16 +98,20 @@ static u32 lg_get_features(struct virtio_device *vdev)
return features;
}
-static void lg_set_features(struct virtio_device *vdev, u32 features)
+static void lg_finalize_features(struct virtio_device *vdev)
{
- unsigned int i;
+ unsigned int i, bits;
struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
/* Second half of bitmap is features we accept. */
u8 *out_features = lg_features(desc) + desc->feature_len;
+ /* Give virtio_ring a chance to accept features. */
+ vring_transport_features(vdev);
+
memset(out_features, 0, desc->feature_len);
- for (i = 0; i < min(desc->feature_len * 8, 32); i++) {
- if (features & (1 << i))
+ bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8;
+ for (i = 0; i < bits; i++) {
+ if (test_bit(i, vdev->features))
out_features[i / 8] |= (1 << (i % 8));
}
}
@@ -297,7 +301,7 @@ static void lg_del_vq(struct virtqueue *vq)
/* The ops structure which hooks everything together. */
static struct virtio_config_ops lguest_config_ops = {
.get_features = lg_get_features,
- .set_features = lg_set_features,
+ .finalize_features = lg_finalize_features,
.get = lg_get,
.set = lg_set,
.get_status = lg_get_status,
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 95dfda52b4f9..bf7942327bda 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -480,7 +480,7 @@ void __init lguest_arch_host_init(void)
* bit on its CPU, depending on the argument (0 == unset). */
on_each_cpu(adjust_pge, (void *)0, 1);
/* Turn off the feature in the global feature set. */
- clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+ clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
}
put_online_cpus();
};
@@ -491,7 +491,7 @@ void __exit lguest_arch_host_fini(void)
/* If we had PGE before we started, turn it back on now. */
get_online_cpus();
if (cpu_had_pge) {
- set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+ set_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
/* adjust_pge's argument "1" means set PGE. */
on_each_cpu(adjust_pge, (void *)1, 1);
}
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index ff05fe893083..137f0241aed0 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -587,7 +587,7 @@ static void rh_recovery_prepare(struct region_hash *rh)
/* Extra reference to avoid race with rh_stop_recovery */
atomic_inc(&rh->recovery_in_flight);
- while (!down_trylock(&rh->recovery_count)) {
+ while (down_try(&rh->recovery_count)) {
atomic_inc(&rh->recovery_in_flight);
if (__rh_recovery_prepare(rh) <= 0) {
atomic_dec(&rh->recovery_in_flight);
diff --git a/drivers/net/3c527.c b/drivers/net/3c527.c
index 6aca0c640f13..9f5786326efa 100644
--- a/drivers/net/3c527.c
+++ b/drivers/net/3c527.c
@@ -576,7 +576,7 @@ static int mc32_command_nowait(struct net_device *dev, u16 cmd, void *data, int
int ioaddr = dev->base_addr;
int ret = -1;
- if (down_trylock(&lp->cmd_mutex) == 0)
+ if (down_try(&lp->cmd_mutex))
{
lp->cmd_nonblocking=1;
lp->exec_box->mbox=0;
diff --git a/drivers/net/irda/sir_dev.c b/drivers/net/irda/sir_dev.c
index 6078e03de9a8..894aade319b6 100644
--- a/drivers/net/irda/sir_dev.c
+++ b/drivers/net/irda/sir_dev.c
@@ -286,7 +286,7 @@ int sirdev_schedule_request(struct sir_dev *dev, int initial_state, unsigned par
IRDA_DEBUG(2, "%s - state=0x%04x / param=%u\n", __FUNCTION__, initial_state, param);
- if (down_trylock(&fsm->sem)) {
+ if (!down_try(&fsm->sem)) {
if (in_interrupt() || in_atomic() || irqs_disabled()) {
IRDA_DEBUG(1, "%s(), state machine busy!\n", __FUNCTION__);
return -EWOULDBLOCK;
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index c28d7cb2035b..0196a0df9021 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -19,6 +19,7 @@
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
@@ -54,9 +55,15 @@ struct virtnet_info
struct tasklet_struct tasklet;
bool free_in_tasklet;
+ /* I like... big packets and I cannot lie! */
+ bool big_packets;
+
/* Receive & send queues. */
struct sk_buff_head recv;
struct sk_buff_head send;
+
+ /* Chain pages by the private ptr. */
+ struct page *pages;
};
static inline struct virtio_net_hdr *skb_vnet_hdr(struct sk_buff *skb)
@@ -69,6 +76,23 @@ static inline void vnet_hdr_to_sg(struct scatterlist *sg, struct sk_buff *skb)
sg_init_one(sg, skb_vnet_hdr(skb), sizeof(struct virtio_net_hdr));
}
+static void give_a_page(struct virtnet_info *vi, struct page *page)
+{
+ page->private = (unsigned long)vi->pages;
+ vi->pages = page;
+}
+
+static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
+{
+ struct page *p = vi->pages;
+
+ if (p)
+ vi->pages = (struct page *)p->private;
+ else
+ p = alloc_page(gfp_mask);
+ return p;
+}
+
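The two helpers above keep a LIFO cache of receive pages threaded through page->private, so big-packet refills usually avoid alloc_page(). A hedged usage sketch (the error handling here is illustrative, not from the patch):

/* Refill: prefer a cached page, fall back to the allocator. */
struct page *p = get_a_page(vi, GFP_ATOMIC);
if (!p)
	break;			/* caller retries on the next refill pass */

/* Release: return the page to the cache instead of freeing it;
 * virtnet_remove() later drains the list with __free_pages(). */
give_a_page(vi, p);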
static void skb_xmit_done(struct virtqueue *svq)
{
struct virtnet_info *vi = svq->vdev->priv;
@@ -88,6 +112,7 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb,
unsigned len)
{
struct virtio_net_hdr *hdr = skb_vnet_hdr(skb);
+ int err;
if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
pr_debug("%s: short packet %i\n", dev->name, len);
@@ -95,10 +120,23 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb,
goto drop;
}
len -= sizeof(struct virtio_net_hdr);
- BUG_ON(len > MAX_PACKET_LEN);
- skb_trim(skb, len);
+ if (len <= MAX_PACKET_LEN) {
+ unsigned int i;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ give_a_page(dev->priv, skb_shinfo(skb)->frags[i].page);
+ skb->data_len = 0;
+ skb_shinfo(skb)->nr_frags = 0;
+ }
+
+ err = pskb_trim(skb, len);
+ if (err) {
+ pr_debug("%s: pskb_trim failed %i %d\n", dev->name, len, err);
+ dev->stats.rx_dropped++;
+ goto drop;
+ }
+ skb->truesize += skb->data_len;
dev->stats.rx_bytes += skb->len;
dev->stats.rx_packets++;
@@ -160,7 +198,7 @@ static void try_fill_recv(struct virtnet_info *vi)
{
struct sk_buff *skb;
struct scatterlist sg[2+MAX_SKB_FRAGS];
- int num, err;
+ int num, err, i;
sg_init_table(sg, 2+MAX_SKB_FRAGS);
for (;;) {
@@ -170,6 +208,24 @@ static void try_fill_recv(struct virtnet_info *vi)
skb_put(skb, MAX_PACKET_LEN);
vnet_hdr_to_sg(sg, skb);
+
+ if (vi->big_packets) {
+ for (i = 0; i < MAX_SKB_FRAGS; i++) {
+ skb_frag_t *f = &skb_shinfo(skb)->frags[i];
+ f->page = get_a_page(vi, GFP_ATOMIC);
+ if (!f->page)
+ break;
+
+ f->page_offset = 0;
+ f->size = PAGE_SIZE;
+
+ skb->data_len += PAGE_SIZE;
+ skb->len += PAGE_SIZE;
+
+ skb_shinfo(skb)->nr_frags++;
+ }
+ }
+
num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;
skb_queue_head(&vi->recv, skb);
@@ -335,16 +391,11 @@ again:
free_old_xmit_skbs(vi);
	/* If we have a buffer left over from last time, send it now. */
- if (unlikely(vi->last_xmit_skb)) {
- if (xmit_skb(vi, vi->last_xmit_skb) != 0) {
- /* Drop this skb: we only queue one. */
- vi->dev->stats.tx_dropped++;
- kfree_skb(skb);
- skb = NULL;
- goto stop_queue;
- }
- vi->last_xmit_skb = NULL;
- }
+ if (unlikely(vi->last_xmit_skb) &&
+ xmit_skb(vi, vi->last_xmit_skb) != 0)
+ goto stop_queue;
+
+ vi->last_xmit_skb = NULL;
/* Put new one in send queue and do transmit */
if (likely(skb)) {
@@ -370,6 +421,11 @@ stop_queue:
netif_start_queue(dev);
goto again;
}
+ if (skb) {
+ /* Drop this skb: we only queue one. */
+ vi->dev->stats.tx_dropped++;
+ kfree_skb(skb);
+ }
goto done;
}
@@ -408,6 +464,22 @@ static int virtnet_close(struct net_device *dev)
return 0;
}
+static int virtnet_set_tx_csum(struct net_device *dev, u32 data)
+{
+ struct virtnet_info *vi = netdev_priv(dev);
+ struct virtio_device *vdev = vi->vdev;
+
+ if (data && !virtio_has_feature(vdev, VIRTIO_NET_F_CSUM))
+ return -ENOSYS;
+
+ return ethtool_op_set_tx_hw_csum(dev, data);
+}
+
+static struct ethtool_ops virtnet_ethtool_ops = {
+ .set_tx_csum = virtnet_set_tx_csum,
+ .set_sg = ethtool_op_set_sg,
+};
+
static int virtnet_probe(struct virtio_device *vdev)
{
int err;
@@ -427,6 +499,7 @@ static int virtnet_probe(struct virtio_device *vdev)
#ifdef CONFIG_NET_POLL_CONTROLLER
dev->poll_controller = virtnet_netpoll;
#endif
+ SET_ETHTOOL_OPS(dev, &virtnet_ethtool_ops);
SET_NETDEV_DEV(dev, &vdev->dev);
/* Do we support "hardware" checksums? */
@@ -462,11 +535,18 @@ static int virtnet_probe(struct virtio_device *vdev)
vi->dev = dev;
vi->vdev = vdev;
vdev->priv = vi;
+ vi->pages = NULL;
/* If they give us a callback when all buffers are done, we don't need
* the timer. */
vi->free_in_tasklet = virtio_has_feature(vdev,VIRTIO_F_NOTIFY_ON_EMPTY);
+ /* If we can receive ANY GSO packets, we must allocate large ones. */
+ if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4)
+ || virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6)
+ || virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN))
+ vi->big_packets = true;
+
/* We expect two virtqueues, receive then send. */
vi->rvq = vdev->config->find_vq(vdev, 0, skb_recv_done);
if (IS_ERR(vi->rvq)) {
@@ -541,6 +621,10 @@ static void virtnet_remove(struct virtio_device *vdev)
vdev->config->del_vq(vi->svq);
vdev->config->del_vq(vi->rvq);
unregister_netdev(vi->dev);
+
+ while (vi->pages)
+ __free_pages(get_a_page(vi, GFP_KERNEL), 0);
+
free_netdev(vi->dev);
}
@@ -553,7 +637,9 @@ static unsigned int features[] = {
VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM,
VIRTIO_NET_F_GSO, VIRTIO_NET_F_MAC,
VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6,
- VIRTIO_NET_F_HOST_ECN, VIRTIO_F_NOTIFY_ON_EMPTY,
+ VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6,
+ VIRTIO_NET_F_GUEST_ECN, /* We don't yet handle UFO input. */
+ VIRTIO_F_NOTIFY_ON_EMPTY,
};
static struct virtio_driver virtio_net = {
diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c
index b5cd850a4a59..fc36977a1a4b 100644
--- a/drivers/net/wireless/airo.c
+++ b/drivers/net/wireless/airo.c
@@ -2137,7 +2137,7 @@ static int airo_start_xmit(struct sk_buff *skb, struct net_device *dev) {
fids[i] |= (len << 16);
priv->xmit.skb = skb;
priv->xmit.fid = i;
- if (down_trylock(&priv->sem) != 0) {
+ if (!down_try(&priv->sem)) {
set_bit(FLAG_PENDING_XMIT, &priv->flags);
netif_stop_queue(dev);
set_bit(JOB_XMIT, &priv->jobs);
@@ -2208,7 +2208,7 @@ static int airo_start_xmit11(struct sk_buff *skb, struct net_device *dev) {
fids[i] |= (len << 16);
priv->xmit11.skb = skb;
priv->xmit11.fid = i;
- if (down_trylock(&priv->sem) != 0) {
+ if (!down_try(&priv->sem)) {
set_bit(FLAG_PENDING_XMIT11, &priv->flags);
netif_stop_queue(dev);
set_bit(JOB_XMIT11, &priv->jobs);
@@ -2258,7 +2258,7 @@ static struct net_device_stats *airo_get_stats(struct net_device *dev)
if (!test_bit(JOB_STATS, &local->jobs)) {
/* Get stats out of the card if available */
- if (down_trylock(&local->sem) != 0) {
+ if (!down_try(&local->sem)) {
set_bit(JOB_STATS, &local->jobs);
wake_up_interruptible(&local->thr_wait);
} else
@@ -2285,7 +2285,7 @@ static void airo_set_multicast_list(struct net_device *dev) {
if ((dev->flags ^ ai->flags) & IFF_PROMISC) {
change_bit(FLAG_PROMISC, &ai->flags);
- if (down_trylock(&ai->sem) != 0) {
+ if (!down_try(&ai->sem)) {
set_bit(JOB_PROMISC, &ai->jobs);
wake_up_interruptible(&ai->thr_wait);
} else
@@ -3213,7 +3213,7 @@ static irqreturn_t airo_interrupt(int irq, void *dev_id)
set_bit(FLAG_UPDATE_UNI, &apriv->flags);
set_bit(FLAG_UPDATE_MULTI, &apriv->flags);
- if (down_trylock(&apriv->sem) != 0) {
+ if (!down_try(&apriv->sem)) {
set_bit(JOB_EVENT, &apriv->jobs);
wake_up_interruptible(&apriv->thr_wait);
} else
@@ -7659,7 +7659,7 @@ static struct iw_statistics *airo_get_wireless_stats(struct net_device *dev)
if (!test_bit(JOB_WSTATS, &local->jobs)) {
/* Get stats out of the card if available */
- if (down_trylock(&local->sem) != 0) {
+ if (!down_try(&local->sem)) {
set_bit(JOB_WSTATS, &local->jobs);
wake_up_interruptible(&local->thr_wait);
} else
diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c
index 5ab34340919b..79954bd6bfa5 100644
--- a/drivers/s390/kvm/kvm_virtio.c
+++ b/drivers/s390/kvm/kvm_virtio.c
@@ -15,6 +15,7 @@
#include <linux/err.h>
#include <linux/virtio.h>
#include <linux/virtio_config.h>
+#include <linux/virtio_console.h>
#include <linux/interrupt.h>
#include <linux/virtio_ring.h>
#include <linux/pfn.h>
@@ -87,16 +88,20 @@ static u32 kvm_get_features(struct virtio_device *vdev)
return features;
}
-static void kvm_set_features(struct virtio_device *vdev, u32 features)
+static void kvm_finalize_features(struct virtio_device *vdev)
{
- unsigned int i;
+ unsigned int i, bits;
struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
/* Second half of bitmap is features we accept. */
u8 *out_features = kvm_vq_features(desc) + desc->feature_len;
+ /* Give virtio_ring a chance to accept features. */
+ vring_transport_features(vdev);
+
memset(out_features, 0, desc->feature_len);
- for (i = 0; i < min(desc->feature_len * 8, 32); i++) {
- if (features & (1 << i))
+ bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8;
+ for (i = 0; i < bits; i++) {
+ if (test_bit(i, vdev->features))
out_features[i / 8] |= (1 << (i % 8));
}
}
@@ -222,7 +227,7 @@ static void kvm_del_vq(struct virtqueue *vq)
*/
static struct virtio_config_ops kvm_vq_configspace_ops = {
.get_features = kvm_get_features,
- .set_features = kvm_set_features,
+ .finalize_features = kvm_finalize_features,
.get = kvm_get,
.set = kvm_set,
.get_status = kvm_get_status,
@@ -333,6 +338,25 @@ static int __init kvm_devices_init(void)
return 0;
}
+/* code for early console output with virtio_console */
+static __init int early_put_chars(u32 vtermno, const char *buf, int count)
+{
+ char scratch[17];
+ unsigned int len = count;
+
+ if (len > sizeof(scratch) - 1)
+ len = sizeof(scratch) - 1;
+ scratch[len] = '\0';
+ memcpy(scratch, buf, len);
+ kvm_hypercall1(KVM_S390_VIRTIO_NOTIFY, __pa(scratch));
+ return len;
+}
+
+void s390_virtio_console_init(void)
+{
+ virtio_cons_early_init(early_put_chars);
+}
+
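The scratch buffer caps each hypercall at 16 data bytes; since early_put_chars() returns the length consumed, the caller is expected to resubmit the remainder. A sketch of the intended early call site (per the diffstat this is wired up in arch/s390/kernel/setup.c; the surrounding function name is hypothetical):

/* Early in arch setup, before the virtio bus exists: */
static void __init example_setup_arch(void)
{
	/* Compiles to a no-op when CONFIG_VIRTIO_CONSOLE=n, see kvm_virtio.h. */
	s390_virtio_console_init();
}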
/*
* We do this after core stuff, but before the drivers.
*/
diff --git a/drivers/scsi/aacraid/commsup.c b/drivers/scsi/aacraid/commsup.c
index 289304aab690..c011d05caaa6 100644
--- a/drivers/scsi/aacraid/commsup.c
+++ b/drivers/scsi/aacraid/commsup.c
@@ -490,7 +490,7 @@ int aac_fib_send(u16 command, struct fib *fibptr, unsigned long size,
* hardware failure has occurred.
*/
unsigned long count = 36000000L; /* 3 minutes */
- while (down_trylock(&fibptr->event_wait)) {
+ while (!down_try(&fibptr->event_wait)) {
int blink;
if (--count == 0) {
struct aac_queue * q = &dev->queues->queue[AdapNormCmdQueue];
diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c
index 84fcaa6a21ec..f31f0c5d3725 100644
--- a/drivers/usb/core/usb.c
+++ b/drivers/usb/core/usb.c
@@ -477,7 +477,7 @@ int usb_lock_device_for_reset(struct usb_device *udev,
}
}
- while (usb_trylock_device(udev) != 0) {
+ while (!usb_trylock_device(udev)) {
/* If we can't acquire the lock after waiting one second,
* we're probably deadlocked */
diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c
index 04692d59fc1c..e2cf50c89225 100644
--- a/drivers/usb/gadget/inode.c
+++ b/drivers/usb/gadget/inode.c
@@ -299,7 +299,7 @@ get_ready_ep (unsigned f_flags, struct ep_data *epdata)
int val;
if (f_flags & O_NONBLOCK) {
- if (down_trylock (&epdata->lock) != 0)
+ if (!down_try (&epdata->lock))
goto nonblock;
if (epdata->state != STATE_EP_ENABLED) {
up (&epdata->lock);
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index 7084e7e146c0..5b78fd0aff0a 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -71,13 +71,6 @@ static int virtio_uevent(struct device *_dv, struct kobj_uevent_env *env)
dev->id.device, dev->id.vendor);
}
-static struct bus_type virtio_bus = {
- .name = "virtio",
- .match = virtio_dev_match,
- .dev_attrs = virtio_dev_attrs,
- .uevent = virtio_uevent,
-};
-
static void add_status(struct virtio_device *dev, unsigned status)
{
dev->config->set_status(dev, dev->config->get_status(dev) | status);
@@ -120,12 +113,16 @@ static int virtio_dev_probe(struct device *_d)
set_bit(f, dev->features);
}
+ /* Transport features always preserved to pass to finalize_features. */
+ for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++)
+ if (device_features & (1 << i))
+ set_bit(i, dev->features);
+
err = drv->probe(dev);
if (err)
add_status(dev, VIRTIO_CONFIG_S_FAILED);
else {
- /* They should never have set feature bits beyond 32 */
- dev->config->set_features(dev, dev->features[0]);
+ dev->config->finalize_features(dev);
add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
}
return err;
@@ -147,13 +144,20 @@ static int virtio_dev_remove(struct device *_d)
return 0;
}
+static struct bus_type virtio_bus = {
+ .name = "virtio",
+ .match = virtio_dev_match,
+ .dev_attrs = virtio_dev_attrs,
+ .uevent = virtio_uevent,
+ .probe = virtio_dev_probe,
+ .remove = virtio_dev_remove,
+};
+
int register_virtio_driver(struct virtio_driver *driver)
{
/* Catch this early. */
BUG_ON(driver->feature_table_size && !driver->feature_table);
driver->driver.bus = &virtio_bus;
- driver->driver.probe = virtio_dev_probe;
- driver->driver.remove = virtio_dev_remove;
return driver_register(&driver->driver);
}
EXPORT_SYMBOL_GPL(register_virtio_driver);
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index eae7236310e4..c7dc37c7cce9 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -94,12 +94,17 @@ static u32 vp_get_features(struct virtio_device *vdev)
return ioread32(vp_dev->ioaddr + VIRTIO_PCI_HOST_FEATURES);
}
-/* virtio config->set_features() implementation */
-static void vp_set_features(struct virtio_device *vdev, u32 features)
+/* virtio config->finalize_features() implementation */
+static void vp_finalize_features(struct virtio_device *vdev)
{
struct virtio_pci_device *vp_dev = to_vp_device(vdev);
- iowrite32(features, vp_dev->ioaddr + VIRTIO_PCI_GUEST_FEATURES);
+ /* Give virtio_ring a chance to accept features. */
+ vring_transport_features(vdev);
+
+ /* We only support 32 feature bits. */
+ BUILD_BUG_ON(ARRAY_SIZE(vdev->features) != 1);
+ iowrite32(vdev->features[0], vp_dev->ioaddr+VIRTIO_PCI_GUEST_FEATURES);
}
/* virtio config->get() implementation */
@@ -297,7 +302,7 @@ static struct virtio_config_ops virtio_pci_config_ops = {
.find_vq = vp_find_vq,
.del_vq = vp_del_vq,
.get_features = vp_get_features,
- .set_features = vp_set_features,
+ .finalize_features = vp_finalize_features,
};
/* the PCI probing function */
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 72bf8bc09014..c10c6753533a 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -18,6 +18,7 @@
*/
#include <linux/virtio.h>
#include <linux/virtio_ring.h>
+#include <linux/virtio_config.h>
#include <linux/device.h>
#ifdef DEBUG
@@ -52,9 +53,6 @@ struct vring_virtqueue
/* Number we've added since last sync. */
unsigned int num_added;
- /* Last used index we've seen. */
- u16 last_used_idx;
-
/* How to notify other side. FIXME: commonalize hcalls! */
void (*notify)(struct virtqueue *vq);
@@ -87,8 +85,11 @@ static int vring_add_buf(struct virtqueue *_vq,
if (vq->num_free < out + in) {
pr_debug("Can't add buf len %i - avail = %i\n",
out + in, vq->num_free);
- /* We notify *even if* VRING_USED_F_NO_NOTIFY is set here. */
- vq->notify(&vq->vq);
+ /* FIXME: for historical reasons, we force a notify here if
+ * there are outgoing parts to the buffer. Presumably the
+ * host should service the ring ASAP. */
+ if (out)
+ vq->notify(&vq->vq);
END_USE(vq);
return -ENOSPC;
}
@@ -173,12 +174,13 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
static inline bool more_used(const struct vring_virtqueue *vq)
{
- return vq->last_used_idx != vq->vring.used->idx;
+ return vring_last_used(&vq->vring) != vq->vring.used->idx;
}
static void *vring_get_buf(struct virtqueue *_vq, unsigned int *len)
{
struct vring_virtqueue *vq = to_vvq(_vq);
+ struct vring_used_elem *u;
void *ret;
unsigned int i;
@@ -195,8 +197,11 @@ static void *vring_get_buf(struct virtqueue *_vq, unsigned int *len)
return NULL;
}
- i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id;
- *len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len;
+ u = &vq->vring.used->ring[vring_last_used(&vq->vring) % vq->vring.num];
+ i = u->id;
+ *len = u->len;
+ /* Make sure we don't reload i after doing checks. */
+ rmb();
if (unlikely(i >= vq->vring.num)) {
BAD_RING(vq, "id %u out of range\n", i);
@@ -210,7 +215,7 @@ static void *vring_get_buf(struct virtqueue *_vq, unsigned int *len)
/* detach_buf clears data, so grab it now. */
ret = vq->data[i];
detach_buf(vq, i);
- vq->last_used_idx++;
+ vring_last_used(&vq->vring)++;
END_USE(vq);
return ret;
}
@@ -294,7 +299,6 @@ struct virtqueue *vring_new_virtqueue(unsigned int num,
vq->vq.vq_ops = &vring_vq_ops;
vq->notify = notify;
vq->broken = false;
- vq->last_used_idx = 0;
vq->num_added = 0;
#ifdef DEBUG
vq->in_use = false;
@@ -320,4 +324,21 @@ void vring_del_virtqueue(struct virtqueue *vq)
}
EXPORT_SYMBOL_GPL(vring_del_virtqueue);
+/* Manipulates transport-specific feature bits. */
+void vring_transport_features(struct virtio_device *vdev)
+{
+ unsigned int i;
+
+ for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
+ switch (i) {
+ case VIRTIO_RING_F_PUBLISH_INDICES:
+ break;
+ default:
+ /* We don't understand this bit. */
+ clear_bit(i, vdev->features);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(vring_transport_features);
+
MODULE_LICENSE("GPL");
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7e9e4c79aec7..64211fc74266 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1062,10 +1062,6 @@ void ocfs2_clear_inode(struct inode *inode)
(unsigned long long)oi->ip_blkno);
mutex_unlock(&oi->ip_io_mutex);
- /*
- * down_trylock() returns 0, down_write_trylock() returns 1
- * kernel 1, world 0
- */
mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem),
"Clear inode of %llu, alloc_sem is locked\n",
(unsigned long long)oi->ip_blkno);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index e396b2fa4743..9ec7d66c24a5 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1412,7 +1412,7 @@ static int flush_journal_list(struct super_block *s,
/* if flushall == 0, the lock is already held */
if (flushall) {
down(&journal->j_flush_sem);
- } else if (!down_trylock(&journal->j_flush_sem)) {
+ } else if (down_try(&journal->j_flush_sem)) {
BUG();
}
diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h
index 3abe7e9ceb33..7d20f0422c97 100644
--- a/fs/xfs/linux-2.6/sema.h
+++ b/fs/xfs/linux-2.6/sema.h
@@ -36,17 +36,15 @@ typedef struct semaphore sema_t;
static inline int issemalocked(sema_t *sp)
{
- return down_trylock(sp) || (up(sp), 0);
+ return !down_try(sp) || (up(sp), 0);
}
/*
- * Map cpsema (try to get the sema) to down_trylock. We need to switch
- * the return values since cpsema returns 1 (acquired) 0 (failed) and
- * down_trylock returns the reverse 0 (acquired) 1 (failed).
+ * Map cpsema (try to get the sema) to down_try.
*/
static inline int cpsema(sema_t *sp)
{
- return down_trylock(sp) ? 0 : 1;
+ return down_try(sp);
}
#endif /* __XFS_SUPPORT_SEMA_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 9cc8f0213095..09e50cdcc3b3 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -538,7 +538,7 @@ found:
* if this does not work then we need to drop the
* spinlock and do a hard attempt on the semaphore.
*/
- if (down_trylock(&bp->b_sema)) {
+ if (!down_try(&bp->b_sema)) {
if (!(flags & XBF_TRYLOCK)) {
/* wait for buffer ownership */
XB_TRACE(bp, "get_lock", 0);
@@ -882,7 +882,7 @@ xfs_buf_cond_lock(
{
int locked;
- locked = down_trylock(&bp->b_sema) == 0;
+ locked = down_try(&bp->b_sema);
if (locked) {
XB_SET_OWNER(bp);
}
diff --git a/include/asm-s390/kvm_virtio.h b/include/asm-s390/kvm_virtio.h
index 5c871a990c29..146100224def 100644
--- a/include/asm-s390/kvm_virtio.h
+++ b/include/asm-s390/kvm_virtio.h
@@ -50,4 +50,14 @@ struct kvm_vqconfig {
#define KVM_S390_VIRTIO_RESET 1
#define KVM_S390_VIRTIO_SET_STATUS 2
+#ifdef __KERNEL__
+/* early virtio console setup */
+#ifdef CONFIG_VIRTIO_CONSOLE
+extern void s390_virtio_console_init(void);
+#else
+static inline void s390_virtio_console_init(void)
+{
+}
+#endif /* CONFIG_VIRTIO_CONSOLE */
+#endif /* __KERNEL__ */
#endif
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 5c8351b859f0..3cdd22e1bddd 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -61,3 +61,21 @@
#define noinline __attribute__((noinline))
#define __attribute_const__ __attribute__((__const__))
#define __maybe_unused __attribute__((unused))
+
+/**
+ * cast_if_type - allow an alternate type
+ * @expr: the expression to optionally cast
+ * @oktype: the type to allow.
+ * @desttype: the type to cast to.
+ *
+ * This is used to accept a particular alternate type for an expression:
+ * because any other types will not be cast, they will cause a warning as
+ * normal.
+ *
+ * Note that the unnecessary ternary forces functions to decay into
+ * function pointers as users expect, but means @expr must be a pointer or
+ * integer.
+ */
+#define cast_if_type(expr, oktype, desttype) __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(1?(expr):(expr)), oktype), \
+ (desttype)(expr), (expr))
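Concretely, cast_if_type() casts only when @expr already has the allowed type; anything else passes through untouched and triggers the usual assignment warning. A small sketch:

struct foo;
void cb_foo(struct foo *f);

/* cb_foo matches the oktype, so it is cast to the generic signature: */
void (*generic)(void *) =
	cast_if_type(cb_foo, void (*)(struct foo *), void (*)(void *));

/* A function of any other type would be left alone, and so would warn. */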
diff --git a/include/linux/compiler-intel.h b/include/linux/compiler-intel.h
index d8e636e5607d..7e704e6c8a5d 100644
--- a/include/linux/compiler-intel.h
+++ b/include/linux/compiler-intel.h
@@ -29,3 +29,5 @@
#endif
#define uninitialized_var(x) x
+
+#define cast_if_type(expr, oktype, desttype) ((desttype)(expr))
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index f9cd7a513f9c..e37982db1582 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -503,4 +503,39 @@ struct sysinfo {
#define NUMA_BUILD 0
#endif
+/* If fn is of type ok1 or ok2, cast to desttype */
+#define __typesafe_cb(desttype, fn, ok1, ok2) \
+ cast_if_type(cast_if_type((fn), ok1, desttype), ok2, desttype)
+
+/**
+ * typesafe_cb - cast a callback function if it matches the arg
+ * @rettype: the return type of the callback function
+ * @fn: the callback function to cast
+ * @arg: the (pointer) argument to hand to the callback function.
+ *
+ * If a callback function takes a single argument, this macro does
+ * appropriate casts to a function which takes a single void * argument if the
+ * callback provided matches the @arg (or a const or volatile version).
+ *
+ * It is assumed that @arg is of pointer type: usually @arg is passed
+ * or assigned to a void * elsewhere anyway.
+ */
+#define typesafe_cb(rettype, fn, arg) \
+ __typesafe_cb(rettype (*)(void *), (fn), \
+ rettype (*)(const typeof(arg)), \
+ rettype (*)(typeof(arg)))
+
+/**
+ * typesafe_cb_preargs - cast a callback function if it matches the arg
+ * @rettype: the return type of the callback function
+ * @fn: the callback function to cast
+ * @arg: the (pointer) argument to hand to the callback function.
+ *
+ * This is a version of typesafe_cb() for callbacks that take other arguments
+ * before the @arg.
+ */
+#define typesafe_cb_preargs(rettype, fn, arg, ...) \
+ __typesafe_cb(rettype (*)(__VA_ARGS__, void *), (fn), \
+ rettype (*)(__VA_ARGS__, const typeof(arg)), \
+ rettype (*)(__VA_ARGS__, typeof(arg)))
#endif
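Putting the two macros together: an API can keep storing int (*)(void *) internally while callers pass naturally-typed functions, and a mismatched @arg still produces a warning instead of being silently cast. A hedged sketch (register_cb and struct foo are hypothetical):

struct foo { int x; };

extern void __register_cb(int (*fn)(void *), void *arg);
#define register_cb(fn, arg) \
	__register_cb(typesafe_cb(int, (fn), (arg)), (arg))

static int my_cb(struct foo *f)
{
	return f->x;
}

static struct foo f;

void example(void)
{
	register_cb(my_cb, &f);	/* OK: my_cb's argument matches &f */
	/* register_cb(my_cb, "hi");  would warn: char * vs struct foo * */
}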
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 00dd957e245b..3152c1ef1d08 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -4,9 +4,32 @@
#include <linux/err.h>
#include <linux/sched.h>
-struct task_struct *kthread_create(int (*threadfn)(void *data),
- void *data,
- const char namefmt[], ...);
+/**
+ * kthread_create - create a kthread.
+ * @threadfn: the function to run until signal_pending(current).
+ * @data: data ptr for @threadfn.
+ * @namefmt: printf-style name for the thread.
+ *
+ * Description: This helper function creates and names a kernel
+ * thread. The thread will be stopped: use wake_up_process() to start
+ * it. See also kthread_run(), kthread_create_on_cpu().
+ *
+ * When woken, the thread will run @threadfn() with @data as its
+ * argument. @threadfn() can either call do_exit() directly if it is a
+ * standalone thread for which no one will call kthread_stop(), or
+ * return when 'kthread_should_stop()' is true (which means
+ * kthread_stop() has been called). The return value should be zero
+ * or a negative error number; it will be passed to kthread_stop().
+ *
+ * Returns a task_struct or ERR_PTR(-ENOMEM).
+ */
+#define kthread_create(threadfn, data, namefmt...) \
+ __kthread_create(typesafe_cb(int,(threadfn),(data)), (data), namefmt)
+
+struct task_struct *__kthread_create(int (*threadfn)(void *data),
+ void *data,
+ const char namefmt[], ...)
+ __attribute__((format(printf, 3, 4)));
/**
* kthread_run - create and wake a thread.
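The effect for callers: the thread function keeps its natural argument type, and the macro checks it against @data before casting, so the usual (void *) shims go away. A minimal sketch with hypothetical my_* names:

struct my_dev { int id; };

static int my_thread(struct my_dev *dev)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);	/* placeholder work loop */
	return 0;
}

void example(struct my_dev *dev)
{
	struct task_struct *t;

	/* A @data of the wrong type would now warn at compile time. */
	t = kthread_create(my_thread, dev, "my_dev/%d", dev->id);
	if (!IS_ERR(t))
		wake_up_process(t);
}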
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index bc6da10ceee0..c1f5b3f9fe2d 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -141,10 +141,6 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
# define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock)
#endif
-/*
- * NOTE: mutex_trylock() follows the spin_trylock() convention,
- * not the down_trylock() convention!
- */
extern int mutex_trylock(struct mutex *lock);
extern void mutex_unlock(struct mutex *lock);
diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h
index 9cae64b00d6b..d245e6fade84 100644
--- a/include/linux/semaphore.h
+++ b/include/linux/semaphore.h
@@ -44,8 +44,12 @@ static inline void sema_init(struct semaphore *sem, int val)
extern void down(struct semaphore *sem);
extern int __must_check down_interruptible(struct semaphore *sem);
extern int __must_check down_killable(struct semaphore *sem);
-extern int __must_check down_trylock(struct semaphore *sem);
+extern int __must_check down_try(struct semaphore *sem);
+/* The old down_trylock() returned 0 on success, the opposite of down_try(). */
+static inline int __deprecated down_trylock(struct semaphore *sem)
+{
+ return !down_try(sem);
+}
extern int __must_check down_timeout(struct semaphore *sem, long jiffies);
extern void up(struct semaphore *sem);
-
#endif /* __LINUX_SEMAPHORE_H */
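The rename exists because down_trylock()'s 0-on-success return was the inverse of spin_trylock() and mutex_trylock(); down_try() returns non-zero on success like the others. Every conversion in this patch is therefore a mechanical negation:

/* Before: zero meant "acquired". */
if (down_trylock(&sem))
	return -EWOULDBLOCK;	/* contended */

/* After: non-zero means "acquired". */
if (!down_try(&sem))
	return -EWOULDBLOCK;	/* contended */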
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 5bfc553bdb21..ffc0d8513697 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -5,41 +5,46 @@
(and more). So the "read" side to such a lock is anything which
   disables preempt. */
#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/compiler.h>
#include <asm/system.h>
-#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP)
+/* Deprecated, but useful for transition. */
+#define ALL_CPUS ~0U
+
/**
- * stop_machine_run: freeze the machine on all CPUs and run this function
+ * stop_machine: freeze the machine on all CPUs and run this function
* @fn: the function to run
* @data: the data ptr for the @fn()
- * @cpu: the cpu to run @fn() on (or any, if @cpu == NR_CPUS.
+ * @cpus: the cpus to run the @fn() on (NULL = any online cpu)
*
- * Description: This causes a thread to be scheduled on every other cpu,
- * each of which disables interrupts, and finally interrupts are disabled
- * on the current CPU. The result is that noone is holding a spinlock
- * or inside any other preempt-disabled region when @fn() runs.
+ * Description: This causes a thread to be scheduled on every cpu,
+ * each of which disables interrupts. The result is that no one is
+ * holding a spinlock or inside any other preempt-disabled region when
+ * @fn() runs.
*
* This can be thought of as a very heavy write lock, equivalent to
* grabbing every spinlock in the kernel. */
-int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu);
+#define stop_machine(fn, data, cpus) \
+ stop_machine_notype(typesafe_cb(int, (fn), (data)), (data), (cpus))
+#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP)
/**
- * __stop_machine_run: freeze the machine on all CPUs and run this function
+ * __stop_machine: freeze the machine on all CPUs and run this function
* @fn: the function to run
* @data: the data ptr for the @fn
- * @cpu: the cpu to run @fn on (or any, if @cpu == NR_CPUS.
+ * @cpus: the cpus to run the @fn() on (NULL = any online cpu)
*
- * Description: This is a special version of the above, which returns the
- * thread which has run @fn(): kthread_stop will return the return value
- * of @fn(). Used by hotplug cpu.
+ * Description: This is a special version of the above, which assumes cpus
+ * won't come or go while it's being called. Used by hotplug cpu.
*/
-struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
- unsigned int cpu);
+int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus);
+int stop_machine_notype(int (*fn)(void *), void *data, const cpumask_t *cpus);
#else
-static inline int stop_machine_run(int (*fn)(void *), void *data,
- unsigned int cpu)
+static inline int stop_machine_notype(int (*fn)(void *), void *data,
+ const cpumask_t *cpus)
{
int ret;
local_irq_disable();
@@ -48,4 +53,22 @@ static inline int stop_machine_run(int (*fn)(void *), void *data,
return ret;
}
#endif /* CONFIG_SMP */
+
+static inline int __deprecated stop_machine_run(int (*fn)(void *), void *data,
+ unsigned int cpu)
+{
+ /* If they don't care which cpu fn runs on, just pick one. */
+ if (cpu == NR_CPUS)
+ return stop_machine(fn, data, NULL);
+ else if (cpu == ~0U)
+ return stop_machine(fn, data, &cpu_possible_map);
+ else {
+ cpumask_t cpus = cpumask_of_cpu(cpu);
+ return stop_machine(fn, data, &cpus);
+ }
+}
+
+/* For sysctl to access. */
+extern unsigned long stopmachine_timeout;
+
#endif /* _LINUX_STOP_MACHINE */
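In the new API the cpumask expresses all the old special cases: NULL means any online cpu, a single-cpu mask pins @fn, and cpu_possible_map reproduces ALL_CPUS; the deprecated wrapper above is exactly that mapping. Converted call sites follow the pattern:

/* Before: */
err = stop_machine_run(fn, data, NR_CPUS);	/* run on any cpu */

/* After: */
err = stop_machine(fn, data, NULL);		/* any online cpu */

/* Pinning to one cpu, as _cpu_down() now does: */
{
	cpumask_t mask = cpumask_of_cpu(cpu);
	err = __stop_machine(fn, data, &mask);
}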
diff --git a/include/linux/timer.h b/include/linux/timer.h
index d4ba79248a27..1baf40162b8c 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -25,12 +25,22 @@ struct timer_list {
extern struct tvec_base boot_tvec_bases;
-#define TIMER_INITIALIZER(_function, _expires, _data) { \
- .entry = { .prev = TIMER_ENTRY_STATIC }, \
- .function = (_function), \
- .expires = (_expires), \
- .data = (_data), \
- .base = &boot_tvec_bases, \
+/*
+ * For historic reasons the timer function takes an unsigned long, so
+ * we use this variant of typesafe_cb. @data is converted to an unsigned long
+ * if it is another integer type, by adding 0UL.
+ */
+#define typesafe_timerfn(fn, data) \
+ __typesafe_cb(void (*)(unsigned long), (fn), \
+ void (*)(const typeof((data)+0UL)), \
+ void (*)(typeof((data)+0UL)))
+
+#define TIMER_INITIALIZER(_function, _expires, _data) { \
+ .entry = { .prev = TIMER_ENTRY_STATIC }, \
+ .function = typesafe_timerfn((_function), (_data)), \
+ .expires = (_expires), \
+ .data = (unsigned long)(_data), \
+ .base = &boot_tvec_bases, \
}
#define DEFINE_TIMER(_name, _function, _expires, _data) \
@@ -51,9 +61,13 @@ static inline void init_timer_on_stack(struct timer_list *timer)
}
#endif
-static inline void setup_timer(struct timer_list * timer,
- void (*function)(unsigned long),
- unsigned long data)
+#define setup_timer(timer, function, data) \
+ __setup_timer((timer), typesafe_timerfn((function), (data)), \
+ (unsigned long)(data))
+
+static inline void __setup_timer(struct timer_list *timer,
+ void (*function)(unsigned long),
+ unsigned long data)
{
timer->function = function;
timer->data = data;
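For timer users this means the handler can take the real type of @data; typesafe_timerfn() validates the pairing and the single (unsigned long) cast happens inside the macro. A sketch with hypothetical my_* names:

struct my_dev { struct timer_list timer; };

static void my_timeout(struct my_dev *dev)
{
	/* dev arrives correctly typed: no (struct my_dev *)data cast. */
}

void example(struct my_dev *dev)
{
	setup_timer(&dev->timer, my_timeout, dev);
	mod_timer(&dev->timer, jiffies + HZ);
}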
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 5811c5da69f9..2664efdfe401 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -492,7 +492,7 @@ extern void usb_put_dev(struct usb_device *dev);
/* USB device locking */
#define usb_lock_device(udev) down(&(udev)->dev.sem)
#define usb_unlock_device(udev) up(&(udev)->dev.sem)
-#define usb_trylock_device(udev) down_trylock(&(udev)->dev.sem)
+#define usb_trylock_device(udev) down_try(&(udev)->dev.sem)
extern int usb_lock_device_for_reset(struct usb_device *udev,
const struct usb_interface *iface);
diff --git a/include/linux/virtio_9p.h b/include/linux/virtio_9p.h
index 8eff0b53910b..b3c4a60ceeb3 100644
--- a/include/linux/virtio_9p.h
+++ b/include/linux/virtio_9p.h
@@ -1,5 +1,7 @@
#ifndef _LINUX_VIRTIO_9P_H
#define _LINUX_VIRTIO_9P_H
+/* This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers. */
#include <linux/virtio_config.h>
/* The ID for virtio console */
diff --git a/include/linux/virtio_balloon.h b/include/linux/virtio_balloon.h
index 979524ee75b7..c30c7bfbf39b 100644
--- a/include/linux/virtio_balloon.h
+++ b/include/linux/virtio_balloon.h
@@ -1,5 +1,7 @@
#ifndef _LINUX_VIRTIO_BALLOON_H
#define _LINUX_VIRTIO_BALLOON_H
+/* This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers. */
#include <linux/virtio_config.h>
/* The ID for virtio_balloon */
diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h
index 5f79a5f9de79..c1aef85243bf 100644
--- a/include/linux/virtio_blk.h
+++ b/include/linux/virtio_blk.h
@@ -1,5 +1,7 @@
#ifndef _LINUX_VIRTIO_BLK_H
#define _LINUX_VIRTIO_BLK_H
+/* This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers. */
#include <linux/virtio_config.h>
/* The ID for virtio_block */
@@ -11,6 +13,7 @@
#define VIRTIO_BLK_F_SEG_MAX 2 /* Indicates maximum # of segments */
#define VIRTIO_BLK_F_GEOMETRY 4 /* Legacy geometry available */
#define VIRTIO_BLK_F_RO 5 /* Disk is read-only */
+#define VIRTIO_BLK_F_BLK_SIZE 6 /* Block size of disk is available*/
struct virtio_blk_config
{
@@ -26,6 +29,8 @@ struct virtio_blk_config
__u8 heads;
__u8 sectors;
} geometry;
+ /* block size of device (if VIRTIO_BLK_F_BLK_SIZE) */
+ __u32 blk_size;
} __attribute__((packed));
/* These two define direction. */
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index f364bbf63c34..bf8ec283b232 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -1,5 +1,8 @@
#ifndef _LINUX_VIRTIO_CONFIG_H
#define _LINUX_VIRTIO_CONFIG_H
+/* This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so
+ * anyone can use the definitions to implement compatible drivers/servers. */
+
/* Virtio devices use a standardized configuration space to define their
* features and pass configuration information, but each implementation can
* store and access that space differently. */
@@ -15,6 +18,12 @@
/* We've given up on this device. */
#define VIRTIO_CONFIG_S_FAILED 0x80
+/* Some virtio feature bits (currently bits 28 through 31) are reserved for the
+ * transport being used (eg. virtio_ring), the rest are per-device feature
+ * bits. */
+#define VIRTIO_TRANSPORT_F_START 28
+#define VIRTIO_TRANSPORT_F_END 32
+
/* Do we get callbacks when the ring is completely used, even if we've
* suppressed them? */
#define VIRTIO_F_NOTIFY_ON_EMPTY 24
@@ -52,9 +61,10 @@
* @get_features: get the array of feature bits for this device.
* vdev: the virtio_device
* Returns the first 32 feature bits (all we currently need).
- * @set_features: confirm what device features we'll be using.
+ * @finalize_features: confirm what device features we'll be using.
* vdev: the virtio_device
- * feature: the first 32 feature bits
+ * This gives the final feature bits for the device: it can change
+ * the dev->feature bits if it wants.
*/
struct virtio_config_ops
{
@@ -70,7 +80,7 @@ struct virtio_config_ops
void (*callback)(struct virtqueue *));
void (*del_vq)(struct virtqueue *vq);
u32 (*get_features)(struct virtio_device *vdev);
- void (*set_features)(struct virtio_device *vdev, u32 features);
+ void (*finalize_features)(struct virtio_device *vdev);
};
/* If driver didn't advertise the feature, it will never appear. */
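A transport's finalize_features() thus does two things: let the ring code veto transport bits it does not implement, then publish whatever survives in vdev->features. A hedged sketch (the example_ names are hypothetical; compare vp_finalize_features() above):

static void example_finalize_features(struct virtio_device *vdev)
{
	/* Strip transport feature bits this transport cannot honour. */
	vring_transport_features(vdev);

	/* Publish the surviving (first 32) bits to the host. */
	example_write_guest_features(vdev, vdev->features[0]);
}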
diff --git a/include/linux/virtio_console.h b/include/linux/virtio_console.h
index ed2d4ead7eb7..19a0da0dba41 100644
--- a/include/linux/virtio_console.h
+++ b/include/linux/virtio_console.h
@@ -1,6 +1,8 @@
#ifndef _LINUX_VIRTIO_CONSOLE_H
#define _LINUX_VIRTIO_CONSOLE_H
#include <linux/virtio_config.h>
+/* This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so
+ * anyone can use the definitions to implement compatible drivers/servers. */
/* The ID for virtio console */
#define VIRTIO_ID_CONSOLE 3
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 38c0571820fb..5e33761b9b8a 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -1,5 +1,7 @@
#ifndef _LINUX_VIRTIO_NET_H
#define _LINUX_VIRTIO_NET_H
+/* This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers. */
#include <linux/virtio_config.h>
/* The ID for virtio_net */
diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h
index b3151659cf49..cdef35742932 100644
--- a/include/linux/virtio_pci.h
+++ b/include/linux/virtio_pci.h
@@ -9,9 +9,8 @@
* Authors:
* Anthony Liguori <aliguori@us.ibm.com>
*
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- *
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
*/
#ifndef _LINUX_VIRTIO_PCI_H
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index abe481ed990e..06e0a881c812 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -24,6 +24,9 @@
* optimization. */
#define VRING_AVAIL_F_NO_INTERRUPT 1
+/* We publish our last-seen used index at the end of the avail ring. */
+#define VIRTIO_RING_F_PUBLISH_INDICES 28
+
/* Virtio ring descriptors: 16 bytes. These can chain together via "next". */
struct vring_desc
{
@@ -82,6 +85,7 @@ struct vring {
* __u16 avail_flags;
* __u16 avail_idx;
* __u16 available[num];
+ * __u16 last_used_idx;
*
* // Padding to the next page boundary.
* char pad[];
@@ -90,6 +94,7 @@ struct vring {
* __u16 used_flags;
* __u16 used_idx;
* struct vring_used_elem used[num];
+ * __u16 last_avail_idx;
* };
*/
static inline void vring_init(struct vring *vr, unsigned int num, void *p,
@@ -106,9 +111,14 @@ static inline unsigned vring_size(unsigned int num, unsigned long pagesize)
{
return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (2 + num)
+ pagesize - 1) & ~(pagesize - 1))
- + sizeof(__u16) * 2 + sizeof(struct vring_used_elem) * num;
+ + sizeof(__u16) * 2 + sizeof(struct vring_used_elem) * num + 2;
}
+/* We publish the last-seen used index at the end of the available ring, and
+ * vice-versa. These are at the end for backwards compatibility. */
+#define vring_last_used(vr) ((vr)->avail->ring[(vr)->num])
+#define vring_last_avail(vr) (*(__u16 *)&(vr)->used->ring[(vr)->num])
+
#ifdef __KERNEL__
#include <linux/irqreturn.h>
struct virtio_device;
@@ -120,6 +130,8 @@ struct virtqueue *vring_new_virtqueue(unsigned int num,
void (*notify)(struct virtqueue *vq),
void (*callback)(struct virtqueue *vq));
void vring_del_virtqueue(struct virtqueue *vq);
+/* Filter out transport-specific feature bits. */
+void vring_transport_features(struct virtio_device *vdev);
irqreturn_t vring_interrupt(int irq, void *_vq);
#endif /* __KERNEL__ */
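With VIRTIO_RING_F_PUBLISH_INDICES each side's progress counter lives in shared memory, one __u16 tucked after each ring, which is why the private last_used_idx in vring_virtqueue could be deleted. In outline, the consume path in vring_get_buf() now reads (vq being the vring_virtqueue):

/* Anything new since we last looked? */
if (vring_last_used(&vq->vring) != vq->vring.used->idx) {
	struct vring_used_elem *u;

	u = &vq->vring.used->ring[vring_last_used(&vq->vring) % vq->vring.num];
	/* ... detach buffer u->id, report u->len ... */
	vring_last_used(&vq->vring)++;	/* the host can see this advance */
}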
diff --git a/include/linux/virtio_rng.h b/include/linux/virtio_rng.h
index 331afb6c9f62..1a85dab8a940 100644
--- a/include/linux/virtio_rng.h
+++ b/include/linux/virtio_rng.h
@@ -1,5 +1,7 @@
#ifndef _LINUX_VIRTIO_RNG_H
#define _LINUX_VIRTIO_RNG_H
+/* This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers. */
#include <linux/virtio_config.h>
/* The ID for virtio_rng */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2cc409ce0a8f..35e57fc02f12 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -216,7 +216,6 @@ static int __ref take_cpu_down(void *_param)
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
{
int err, nr_calls = 0;
- struct task_struct *p;
cpumask_t old_allowed, tmp;
void *hcpu = (void *)(long)cpu;
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
@@ -249,21 +248,18 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
cpus_setall(tmp);
cpu_clear(cpu, tmp);
set_cpus_allowed_ptr(current, &tmp);
+ tmp = cpumask_of_cpu(cpu);
- p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
-
- if (IS_ERR(p) || cpu_online(cpu)) {
+ err = __stop_machine(take_cpu_down, &tcd_param, &tmp);
+ if (err) {
/* CPU didn't die: tell everyone. Can't complain. */
if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
hcpu) == NOTIFY_BAD)
BUG();
- if (IS_ERR(p)) {
- err = PTR_ERR(p);
- goto out_allowed;
- }
- goto out_thread;
+ goto out_allowed;
}
+ BUG_ON(cpu_online(cpu));
/* Wait for it to sleep (leaving idle task). */
while (!idle_cpu(cpu))
@@ -279,8 +275,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
check_for_tasks(cpu);
-out_thread:
- err = kthread_stop(p);
out_allowed:
set_cpus_allowed_ptr(current, &old_allowed);
out_release:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 6111c27491b1..5df80ab1deb0 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -111,29 +111,10 @@ static void create_kthread(struct kthread_create_info *create)
complete(&create->done);
}
-/**
- * kthread_create - create a kthread.
- * @threadfn: the function to run until signal_pending(current).
- * @data: data ptr for @threadfn.
- * @namefmt: printf-style name for the thread.
- *
- * Description: This helper function creates and names a kernel
- * thread. The thread will be stopped: use wake_up_process() to start
- * it. See also kthread_run(), kthread_create_on_cpu().
- *
- * When woken, the thread will run @threadfn() with @data as its
- * argument. @threadfn() can either call do_exit() directly if it is a
- * standalone thread for which noone will call kthread_stop(), or
- * return when 'kthread_should_stop()' is true (which means
- * kthread_stop() has been called). The return value should be zero
- * or a negative error number; it will be passed to kthread_stop().
- *
- * Returns a task_struct or ERR_PTR(-ENOMEM).
- */
-struct task_struct *kthread_create(int (*threadfn)(void *data),
- void *data,
- const char namefmt[],
- ...)
+struct task_struct *__kthread_create(int (*threadfn)(void *data),
+ void *data,
+ const char namefmt[],
+ ...)
{
struct kthread_create_info create;
@@ -158,7 +139,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
}
return create.result;
}
-EXPORT_SYMBOL(kthread_create);
+EXPORT_SYMBOL(__kthread_create);
/**
* kthread_bind - bind a just-created kthread to a cpu.
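Renaming the function would break every caller of kthread_create() unless the
header re-exposes the old name on top of __kthread_create(), presumably in a
hunk not shown here. A sketch, assuming a plain forwarding macro; the real
header may well do more:

/* Hypothetical wrapper -- the actual kthread.h change is not shown here. */
#define kthread_create(threadfn, data, namefmt...) \
	__kthread_create((threadfn), (data), namefmt)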
diff --git a/kernel/module.c b/kernel/module.c
index 38258c9ab435..670d45e50072 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -691,7 +691,7 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
if (flags & O_NONBLOCK) {
struct stopref sref = { mod, flags, forced };
- return stop_machine_run(__try_stop_module, &sref, NR_CPUS);
+ return stop_machine(__try_stop_module, &sref, NULL);
} else {
/* We don't need to stop the machine for this. */
mod->state = MODULE_STATE_GOING;
@@ -1429,7 +1429,7 @@ static int __unlink_module(void *_mod)
static void free_module(struct module *mod)
{
/* Delete from various lists */
- stop_machine_run(__unlink_module, mod, NR_CPUS);
+ stop_machine(__unlink_module, mod, NULL);
remove_notes_attrs(mod);
remove_sect_attrs(mod);
mod_kobject_remove(mod);
@@ -2197,7 +2197,7 @@ static struct module *load_module(void __user *umod,
/* Now sew it into the lists so we can get lockdep and oops
* info during argument parsing. Noone should access us, since
* strong_try_module_get() will fail. */
- stop_machine_run(__link_module, mod, NR_CPUS);
+ stop_machine(__link_module, mod, NULL);
/* Size of section 0 is 0, so this works well if no params */
err = parse_args(mod->name, mod->args,
@@ -2231,7 +2231,7 @@ static struct module *load_module(void __user *umod,
return mod;
unlink:
- stop_machine_run(__unlink_module, mod, NR_CPUS);
+ stop_machine(__unlink_module, mod, NULL);
module_arch_cleanup(mod);
cleanup:
kobject_del(&mod->mkobj.kobj);
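All three call sites follow the same conversion: the old "any CPU" magic
value NR_CPUS becomes a NULL cpumask. A minimal sketch of the new-style
caller; do_update() is a hypothetical callback:

static int do_update(void *data)
{
	/* All other CPUs are spinning with interrupts disabled, so it
	 * is safe to rewrite shared data structures here. */
	return 0;
}

	/* NULL mask: run do_update() on any one online CPU. */
	err = stop_machine(do_update, NULL, NULL);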
diff --git a/kernel/mutex.c b/kernel/mutex.c
index bcdc9ac8ef60..7502055dc6d0 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -370,8 +370,8 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
* Try to acquire the mutex atomically. Returns 1 if the mutex
* has been acquired successfully, and 0 on contention.
*
- * NOTE: this function follows the spin_trylock() convention, so
- * it is negated to the down_trylock() return values! Be careful
+ * NOTE: this function follows the spin_trylock()/down_try() convention,
+ * so its return value is the inverse of the old down_trylock()! Be careful
* about this when converting semaphore users to mutexes.
*
* This function must not be used in interrupt context. The
diff --git a/kernel/printk.c b/kernel/printk.c
index 07ad9e7f7a66..74271278fc5c 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -969,7 +969,7 @@ EXPORT_SYMBOL(acquire_console_sem);
int try_acquire_console_sem(void)
{
- if (down_trylock(&console_sem))
+ if (!down_try(&console_sem))
return -1;
console_locked = 1;
console_may_schedule = 0;
@@ -1068,7 +1068,7 @@ void console_unblank(void)
* oops_in_progress is set to 1..
*/
if (oops_in_progress) {
- if (down_trylock(&console_sem) != 0)
+ if (!down_try(&console_sem))
return;
} else
acquire_console_sem();
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index fb404457f60c..d4271146a9bd 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -93,8 +93,8 @@ static void force_quiescent_state(struct rcu_data *rdp,
* rdp->cpu is the current cpu.
*
* cpu_online_map is updated by the _cpu_down()
- * using stop_machine_run(). Since we're in irqs disabled
- * section, stop_machine_run() is not exectuting, hence
+ * using __stop_machine(). Since we're in an irqs-disabled
+ * section, __stop_machine() is not executing, hence
* the cpu_online_map is stable.
*
* However, a cpu might have been offlined _just_ before
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index aaaeae8244e7..62f30c22f666 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -14,7 +14,7 @@
* Some notes on the implementation:
*
* The spinlock controls access to the other members of the semaphore.
- * down_trylock() and up() can be called from interrupt context, so we
+ * down_try() and up() can be called from interrupt context, so we
* have to disable interrupts when taking the lock. It turns out various
* parts of the kernel expect to be able to use down() on a semaphore in
* interrupt context when they know it will succeed, so we have to use
@@ -115,19 +115,18 @@ int down_killable(struct semaphore *sem)
EXPORT_SYMBOL(down_killable);
/**
- * down_trylock - try to acquire the semaphore, without waiting
+ * down_try - try to acquire the semaphore, without waiting
* @sem: the semaphore to be acquired
*
- * Try to acquire the semaphore atomically. Returns 0 if the mutex has
- * been acquired successfully or 1 if it it cannot be acquired.
+ * Try to acquire the semaphore atomically. Returns true if the semaphore
+ * has been acquired successfully, or false if it cannot be acquired.
*
- * NOTE: This return value is inverted from both spin_trylock and
- * mutex_trylock! Be careful about this when converting code.
+ * NOTE: This replaces down_trylock(), which returned the inverse value.
*
* Unlike mutex_trylock, this function can be used from interrupt context,
* and the semaphore can be released by any task or interrupt.
*/
-int down_trylock(struct semaphore *sem)
+int down_try(struct semaphore *sem)
{
unsigned long flags;
int count;
@@ -138,9 +137,9 @@ int down_trylock(struct semaphore *sem)
sem->count = count;
spin_unlock_irqrestore(&sem->lock, flags);
- return (count < 0);
+ return (count >= 0);
}
-EXPORT_SYMBOL(down_trylock);
+EXPORT_SYMBOL(down_try);
/**
* down_timeout - acquire the semaphore within a specified time
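Because the sense of the return value flips, every conversion mechanically
negates the old test, as the printk.c hunks above already show. The pattern,
side by side (sem is an arbitrary semaphore):

	/* Before: down_trylock() returned 0 on success. */
	if (down_trylock(&sem))
		return -EBUSY;

	/* After: down_try() returns true on success, matching
	 * spin_trylock() and mutex_trylock(). */
	if (!down_try(&sem))
		return -EBUSY;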
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 738b411ff2d3..a6c111d3fc29 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,4 +1,4 @@
-/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
+/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
* GPL v2 and any later version.
*/
#include <linux/cpu.h>
@@ -13,204 +13,238 @@
#include <asm/atomic.h>
#include <asm/uaccess.h>
-/* Since we effect priority and affinity (both of which are visible
- * to, and settable by outside processes) we do indirection via a
- * kthread. */
-
-/* Thread to stop each CPU in user context. */
+/* This controls the threads on each CPU. */
enum stopmachine_state {
- STOPMACHINE_WAIT,
+ /* Dummy starting state for thread. */
+ STOPMACHINE_NONE,
+ /* Awaiting everyone to be scheduled. */
STOPMACHINE_PREPARE,
+ /* Disable interrupts. */
STOPMACHINE_DISABLE_IRQ,
+ /* Run the function */
+ STOPMACHINE_RUN,
+ /* Exit */
STOPMACHINE_EXIT,
};
+static enum stopmachine_state state;
-static enum stopmachine_state stopmachine_state;
-static unsigned int stopmachine_num_threads;
-static atomic_t stopmachine_thread_ack;
+struct stop_machine_data {
+ int (*fn)(void *);
+ void *data;
+ int fnret;
+};
-static int stopmachine(void *cpu)
-{
- int irqs_disabled = 0;
- int prepared = 0;
- cpumask_of_cpu_ptr(cpumask, (int)(long)cpu);
+/* Like num_online_cpus(), but CPU hotplug itself uses us, so we must keep
+ * our own count of the threads. */
+static unsigned num_threads;
+static atomic_t thread_ack;
+static cpumask_t prepared_cpus;
+static struct completion finished;
+static DEFINE_MUTEX(lock);
- set_cpus_allowed_ptr(current, cpumask);
+static unsigned long limit;
+unsigned long stopmachine_timeout = 5; /* secs, arbitrary */
- /* Ack: we are alive */
- smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
- atomic_inc(&stopmachine_thread_ack);
+static void set_state(enum stopmachine_state newstate)
+{
+ /* Reset ack counter. */
+ atomic_set(&thread_ack, num_threads);
+ smp_wmb();
+ state = newstate;
+}
- /* Simple state machine */
- while (stopmachine_state != STOPMACHINE_EXIT) {
- if (stopmachine_state == STOPMACHINE_DISABLE_IRQ
- && !irqs_disabled) {
- local_irq_disable();
- hard_irq_disable();
- irqs_disabled = 1;
- /* Ack: irqs disabled. */
- smp_mb(); /* Must read state first. */
- atomic_inc(&stopmachine_thread_ack);
- } else if (stopmachine_state == STOPMACHINE_PREPARE
- && !prepared) {
- /* Everyone is in place, hold CPU. */
- preempt_disable();
- prepared = 1;
- smp_mb(); /* Must read state first. */
- atomic_inc(&stopmachine_thread_ack);
- }
- /* Yield in first stage: migration threads need to
- * help our sisters onto their CPUs. */
- if (!prepared && !irqs_disabled)
- yield();
- cpu_relax();
+/* Last one to ack a state moves to the next state. */
+static void ack_state(void)
+{
+ if (atomic_dec_and_test(&thread_ack)) {
+ /* If we're the last one to ack the EXIT, we're finished. */
+ if (state == STOPMACHINE_EXIT)
+ complete(&finished);
+ else
+ set_state(state + 1);
}
+}
- /* Ack: we are exiting. */
- smp_mb(); /* Must read state first. */
- atomic_inc(&stopmachine_thread_ack);
+/* This is the actual thread which stops the CPU. It exits by itself rather
+ * than waiting for kthread_stop(), because it's easier for hotplug CPU. */
+static int stop_cpu(struct stop_machine_data *smdata)
+{
+ enum stopmachine_state curstate = STOPMACHINE_NONE;
+ int uninitialized_var(ret);
- if (irqs_disabled)
- local_irq_enable();
- if (prepared)
- preempt_enable();
+ /* If we've been shoved off the normal CPU, abort. */
+ if (cpu_test_and_set(smp_processor_id(), prepared_cpus))
+ do_exit(0);
- return 0;
+ /* Simple state machine */
+ do {
+ /* Chill out and ensure we re-read 'state'. */
+ cpu_relax();
+ if (state != curstate) {
+ curstate = state;
+ switch (curstate) {
+ case STOPMACHINE_DISABLE_IRQ:
+ local_irq_disable();
+ hard_irq_disable();
+ break;
+ case STOPMACHINE_RUN:
+ /* |= allows error detection if the function runs on
+ * multiple CPUs. */
+ smdata->fnret |= smdata->fn(smdata->data);
+ break;
+ default:
+ break;
+ }
+ ack_state();
+ }
+ } while (curstate != STOPMACHINE_EXIT);
+
+ local_irq_enable();
+ do_exit(0);
}
-/* Change the thread state */
-static void stopmachine_set_state(enum stopmachine_state state)
+/* Callback for CPUs which aren't supposed to do anything. */
+static int chill(void *unused)
{
- atomic_set(&stopmachine_thread_ack, 0);
- smp_wmb();
- stopmachine_state = state;
- while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
- cpu_relax();
+ return 0;
}
-static int stop_machine(void)
+static bool fixup_timeout(struct task_struct **threads, const cpumask_t *cpus)
{
- int i, ret = 0;
+ unsigned int i;
+ bool stagger_onwards = true;
- atomic_set(&stopmachine_thread_ack, 0);
- stopmachine_num_threads = 0;
- stopmachine_state = STOPMACHINE_WAIT;
+ printk(KERN_CRIT "stopmachine: Failed to stop machine in time (%lus).\n",
+        stopmachine_timeout);
for_each_online_cpu(i) {
- if (i == raw_smp_processor_id())
- continue;
- ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
- if (ret < 0)
- break;
- stopmachine_num_threads++;
- }
-
- /* Wait for them all to come to life. */
- while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) {
- yield();
- cpu_relax();
+ if (!cpu_isset(i, prepared_cpus) && i != smp_processor_id()) {
+ bool ignore;
+
+ /* If we wanted to run on a particular CPU, and that's
+ * the one which is stuck, it's a real failure. */
+ ignore = !cpus || !cpu_isset(i, *cpus);
+ printk(KERN_CRIT "stopmachine: cpu#%d seems to be "
+ "stuck, %s.\n",
+ i, ignore ? "ignoring" : "FAILING");
+ /* Unbind thread: it will exit when it sees the
+ * prepared_cpus bit for its CPU already set. */
+ set_cpus_allowed(threads[i], cpu_online_map);
+
+ if (!ignore)
+ stagger_onwards = false;
+
+ /* Pretend this one doesn't exist. */
+ num_threads--;
+ }
}
- /* If some failed, kill them all. */
- if (ret < 0) {
- stopmachine_set_state(STOPMACHINE_EXIT);
- return ret;
+ if (stagger_onwards) {
+ /* Force progress. */
+ set_state(state + 1);
}
- /* Now they are all started, make them hold the CPUs, ready. */
- preempt_disable();
- stopmachine_set_state(STOPMACHINE_PREPARE);
-
- /* Make them disable irqs. */
- local_irq_disable();
- hard_irq_disable();
- stopmachine_set_state(STOPMACHINE_DISABLE_IRQ);
-
- return 0;
+ return stagger_onwards;
}
-static void restart_machine(void)
+int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
{
- stopmachine_set_state(STOPMACHINE_EXIT);
- local_irq_enable();
- preempt_enable_no_resched();
-}
+ int i, err;
+ struct stop_machine_data active, idle;
+ struct task_struct **threads;
+
+ active.fn = fn;
+ active.data = data;
+ active.fnret = 0;
+ idle.fn = chill;
+ idle.data = NULL;
+
+ /* This could be too big for stack on large machines. */
+ threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL);
+ if (!threads)
+ return -ENOMEM;
+
+ /* Set up initial state. */
+ mutex_lock(&lock);
+ init_completion(&finished);
+ num_threads = num_online_cpus();
+ limit = jiffies + msecs_to_jiffies(stopmachine_timeout * MSEC_PER_SEC);
+ set_state(STOPMACHINE_PREPARE);
-struct stop_machine_data {
- int (*fn)(void *);
- void *data;
- struct completion done;
-};
+ for_each_online_cpu(i) {
+ struct stop_machine_data *smdata = &idle;
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
-static int do_stop(void *_smdata)
-{
- struct stop_machine_data *smdata = _smdata;
- int ret;
+ if (!cpus) {
+ if (i == first_cpu(cpu_online_map))
+ smdata = &active;
+ } else {
+ if (cpu_isset(i, *cpus))
+ smdata = &active;
+ }
- ret = stop_machine();
- if (ret == 0) {
- ret = smdata->fn(smdata->data);
- restart_machine();
- }
+ threads[i] = kthread_create(stop_cpu, smdata, "kstop%u", i);
+ if (IS_ERR(threads[i])) {
+ err = PTR_ERR(threads[i]);
+ threads[i] = NULL;
+ goto kill_threads;
+ }
- /* We're done: you can kthread_stop us now */
- complete(&smdata->done);
+ /* Place it onto correct cpu. */
+ kthread_bind(threads[i], i);
- /* Wait for kthread_stop */
- set_current_state(TASK_INTERRUPTIBLE);
- while (!kthread_should_stop()) {
- schedule();
- set_current_state(TASK_INTERRUPTIBLE);
+ /* Make it highest prio. */
+ if (sched_setscheduler_nocheck(threads[i], SCHED_FIFO, &param))
+ BUG();
}
- __set_current_state(TASK_RUNNING);
- return ret;
-}
-struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
- unsigned int cpu)
-{
- static DEFINE_MUTEX(stopmachine_mutex);
- struct stop_machine_data smdata;
- struct task_struct *p;
+ /* We've created all the threads. Wake them all: hold this CPU so none
+ * of them runs on this CPU until we're ready. */
+ cpus_clear(prepared_cpus);
+ get_cpu();
+ for_each_online_cpu(i)
+ wake_up_process(threads[i]);
+
+ /* Wait for all the others to come to life. */
+ while (cpus_weight(prepared_cpus) != num_online_cpus() - 1) {
+ if (time_is_before_jiffies(limit)) {
+ if (!fixup_timeout(threads, cpus)) {
+ /* Tell them all to exit. */
+ set_state(STOPMACHINE_EXIT);
+ active.fnret = -EIO;
+ }
+ break;
+ }
+ cpu_relax();
+ }
- smdata.fn = fn;
- smdata.data = data;
- init_completion(&smdata.done);
+ /* This will release the thread on our CPU. */
+ put_cpu();
+ wait_for_completion(&finished);
+ mutex_unlock(&lock);
- mutex_lock(&stopmachine_mutex);
+ kfree(threads);
- /* If they don't care which CPU fn runs on, bind to any online one. */
- if (cpu == NR_CPUS)
- cpu = raw_smp_processor_id();
+ return active.fnret;
- p = kthread_create(do_stop, &smdata, "kstopmachine");
- if (!IS_ERR(p)) {
- struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+kill_threads:
+ for_each_online_cpu(i)
+ if (threads[i])
+ kthread_stop(threads[i]);
+ mutex_unlock(&lock);
- /* One high-prio thread per cpu. We'll do this one. */
- sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
- kthread_bind(p, cpu);
- wake_up_process(p);
- wait_for_completion(&smdata.done);
- }
- mutex_unlock(&stopmachine_mutex);
- return p;
+ kfree(threads);
+ return err;
}
-int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
+int stop_machine_notype(int (*fn)(void *), void *data, const cpumask_t *cpus)
{
- struct task_struct *p;
int ret;
/* No CPUs can come up or down during this. */
get_online_cpus();
- p = __stop_machine_run(fn, data, cpu);
- if (!IS_ERR(p))
- ret = kthread_stop(p);
- else
- ret = PTR_ERR(p);
+ ret = __stop_machine(fn, data, cpus);
put_online_cpus();
return ret;
}
-EXPORT_SYMBOL_GPL(stop_machine_run);
+EXPORT_SYMBOL_GPL(stop_machine_notype);
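The rewritten core replaces the single kstopmachine thread with one "kstop%u"
thread per online CPU, all marching through PREPARE, DISABLE_IRQ, RUN and
EXIT in lockstep via ack_state(). A sketch of a caller that runs its function
on a chosen set of CPUs; update_fn() and the CPU numbers are hypothetical:

static int update_fn(void *data)
{
	/* Runs on CPUs 0 and 3; all online CPUs spin with irqs off. */
	return 0;
}

	cpumask_t mask;
	int err;

	cpus_clear(mask);
	cpu_set(0, mask);
	cpu_set(3, mask);
	err = stop_machine_notype(update_fn, NULL, &mask);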
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 448aa482c13a..e386dcdb9b1c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -47,6 +47,7 @@
#include <linux/acpi.h>
#include <linux/reboot.h>
#include <linux/ftrace.h>
+#include <linux/stop_machine.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -836,6 +837,17 @@ static struct ctl_table kern_table[] = {
.child = key_sysctls,
},
#endif
+#ifdef CONFIG_STOP_MACHINE
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "stopmachine_timeout",
+ .data = &stopmachine_timeout,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .strategy = &sysctl_intvec,
+ },
+#endif
#ifdef CONFIG_RCU_TORTURE_TEST
{
.ctl_name = CTL_UNNUMBERED,
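Because the entry is an ordinary 0644 file under kern_table, the timeout
appears as /proc/sys/kernel/stopmachine_timeout and can be tuned at run
time. A minimal userspace sketch:

/* Userspace sketch: raise the stop_machine timeout to 10 seconds. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/stopmachine_timeout", "w");

	if (!f)
		return 1;
	fprintf(f, "10\n");
	return fclose(f) != 0;
}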
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 79ac4afc908c..6efb5f91a3ed 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2332,7 +2332,7 @@ static void build_zonelist_cache(pg_data_t *pgdat)
#endif /* CONFIG_NUMA */
-/* return values int ....just for stop_machine_run() */
+/* Return type is int just to match what stop_machine() expects. */
static int __build_all_zonelists(void *dummy)
{
int nid;
@@ -2356,7 +2356,7 @@ void build_all_zonelists(void)
} else {
/* we have to stop all cpus to guarantee there is no user
of zonelist */
- stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
+ stop_machine(__build_all_zonelists, NULL, NULL);
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();