commit     c9c095224f424a458d5e014d134b10f66640101a
tree       2bfca3d7b34966a1d90591548363fa6a0e8cbb8a
parent     0af3c770644255633a435f4f9d336355695b9340
parent     0b02dbbcf863d6aa985f13be72a9bea38bd54b92
author     Stephen Rothwell <sfr@canb.auug.org.au>  2011-03-01 16:15:13 +1100
committer  Stephen Rothwell <sfr@canb.auug.org.au>  2011-03-01 16:15:13 +1100
Merge remote-tracking branch 'xen-two/linux-next'
Diffstat (limited to 'drivers')
-rw-r--r--   drivers/block/xen-blkfront.c      8
-rw-r--r--   drivers/pci/xen-pcifront.c       31
-rw-r--r--   drivers/xen/Kconfig              10
-rw-r--r--   drivers/xen/Makefile              2
-rw-r--r--   drivers/xen/balloon.c             2
-rw-r--r--   drivers/xen/events.c            380
-rw-r--r--   drivers/xen/gntalloc.c          545
-rw-r--r--   drivers/xen/gntdev.c            387
-rw-r--r--   drivers/xen/grant-table.c         6
9 files changed, 1005 insertions, 366 deletions
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index d7aa39e349a6..cc4514c9d8a6 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -281,7 +281,7 @@ static int blkif_queue_request(struct request *req)
info->shadow[id].request = req;
ring_req->id = id;
- ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req);
+ ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
ring_req->handle = info->handle;
ring_req->operation = rq_data_dir(req) ?
@@ -317,7 +317,7 @@ static int blkif_queue_request(struct request *req)
rq_data_dir(req) );
info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
- ring_req->seg[i] =
+ ring_req->u.rw.seg[i] =
(struct blkif_request_segment) {
.gref = ref,
.first_sect = fsect,
@@ -615,7 +615,7 @@ static void blkif_completion(struct blk_shadow *s)
{
int i;
for (i = 0; i < s->req.nr_segments; i++)
- gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
+ gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL);
}
static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -932,7 +932,7 @@ static int blkif_recover(struct blkfront_info *info)
/* Rewrite any grant references invalidated by susp/resume. */
for (j = 0; j < req->nr_segments; j++)
gnttab_grant_foreign_access_ref(
- req->seg[j].gref,
+ req->u.rw.seg[j].gref,
info->xbdev->otherend_id,
pfn_to_mfn(info->shadow[req->id].frame[j]),
rq_data_dir(info->shadow[req->id].request));
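
All four blkfront hunks track a single interface change: the request fields that only apply to read/write operations (the start sector and the segment array) move into a "u.rw" union member of struct blkif_request, making room for other request types to reuse that space. A minimal sketch of the layout these accessors imply, assuming the companion xen/interface/io/blkif.h change from the same series (field names follow the hunks above, not the real header):

    struct blkif_request_rw {
            blkif_sector_t sector_number;   /* set from blk_rq_pos() above */
            struct blkif_request_segment
                    seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    };

    struct blkif_request {
            uint8_t operation;      /* BLKIF_OP_* */
            uint8_t nr_segments;    /* still addressed as req->nr_segments */
            blkif_vdev_t handle;    /* still addressed as ring_req->handle */
            uint64_t id;            /* echoed back in the response */
            union {
                    struct blkif_request_rw rw;   /* ring_req->u.rw.* */
            } u;
    };
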
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index 3a5a6fcc0ead..492b7d807fe8 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -243,7 +243,7 @@ struct pci_ops pcifront_bus_ops = {
#ifdef CONFIG_PCI_MSI
static int pci_frontend_enable_msix(struct pci_dev *dev,
- int **vector, int nvec)
+ int vector[], int nvec)
{
int err;
int i;
@@ -277,18 +277,24 @@ static int pci_frontend_enable_msix(struct pci_dev *dev,
if (likely(!err)) {
if (likely(!op.value)) {
/* we get the result */
- for (i = 0; i < nvec; i++)
- *(*vector+i) = op.msix_entries[i].vector;
- return 0;
+ for (i = 0; i < nvec; i++) {
+ if (op.msix_entries[i].vector <= 0) {
+ dev_warn(&dev->dev, "MSI-X entry %d is invalid: %d!\n",
+ i, op.msix_entries[i].vector);
+ err = -EINVAL;
+ vector[i] = -1;
+ continue;
+ }
+ vector[i] = op.msix_entries[i].vector;
+ }
} else {
printk(KERN_DEBUG "enable msix get value %x\n",
op.value);
- return op.value;
}
} else {
dev_err(&dev->dev, "enable msix get err %x\n", err);
- return err;
}
+ return err;
}
static void pci_frontend_disable_msix(struct pci_dev *dev)
@@ -310,7 +316,7 @@ static void pci_frontend_disable_msix(struct pci_dev *dev)
dev_err(&dev->dev, "pci_disable_msix get err %x\n", err);
}
-static int pci_frontend_enable_msi(struct pci_dev *dev, int **vector)
+static int pci_frontend_enable_msi(struct pci_dev *dev, int vector[])
{
int err;
struct xen_pci_op op = {
@@ -324,7 +330,13 @@ static int pci_frontend_enable_msi(struct pci_dev *dev, int **vector)
err = do_pci_op(pdev, &op);
if (likely(!err)) {
- *(*vector) = op.value;
+ vector[0] = op.value;
+ if (op.value <= 0) {
+ dev_warn(&dev->dev, "MSI entry is invalid: %d!\n",
+ op.value);
+ err = -EINVAL;
+ vector[0] = -1;
+ }
} else {
dev_err(&dev->dev, "pci frontend enable msi failed for dev "
"%x:%x\n", op.bus, op.devfn);
@@ -733,8 +745,7 @@ static void free_pdev(struct pcifront_device *pdev)
pcifront_free_roots(pdev);
- /*For PCIE_AER error handling job*/
- flush_scheduled_work();
+ cancel_work_sync(&pdev->op_work);
if (pdev->irq >= 0)
unbind_from_irqhandler(pdev->irq, pdev);
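
The pcifront MSI changes do two things. First, a cleanup: pci_frontend_enable_msi()/pci_frontend_enable_msix() take a plain "int vector[]" instead of the doubly indirect "int **vector" they never needed, so the awkward "*(*vector + i)" dereference becomes ordinary indexing. Second, a hardening fix: vector numbers returned by the backend are now validated, bad entries are reported and flagged with -1, the call fails with -EINVAL instead of feeding a bogus vector to the MSI core, and every path funnels through the single "return err" at the end. A hypothetical before/after of the calling convention (illustrative only, not driver code):

    /* Old idiom: extra indirection, same storage. */
    static void write_old(int **vector, int nvec, const int *src)
    {
            int i;
            for (i = 0; i < nvec; i++)
                    *(*vector + i) = src[i];
    }

    /* New idiom: the array parameter decays to the same pointer. */
    static void write_new(int vector[], int nvec, const int *src)
    {
            int i;
            for (i = 0; i < nvec; i++)
                    vector[i] = src[i];
    }
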
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 07bec09d1dad..a59638b37c1a 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -76,10 +76,20 @@ config XEN_XENBUS_FRONTEND
config XEN_GNTDEV
tristate "userspace grant access device driver"
depends on XEN
+ default m
select MMU_NOTIFIER
help
Allows userspace processes to use grants.
+config XEN_GRANT_DEV_ALLOC
+ tristate "User-space grant reference allocator driver"
+ depends on XEN
+ default m
+ help
+ Allows userspace processes to create pages with access granted
+ to other domains. This can be used to implement frontend drivers
+ or as part of an inter-domain shared memory channel.
+
config XEN_PLATFORM_PCI
tristate "xen platform pci device driver"
depends on XEN_PVHVM && PCI
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 5088cc2e6fe2..9585a1da52c6 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
obj-$(CONFIG_XEN_BALLOON) += balloon.o
obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o
obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o
+obj-$(CONFIG_XEN_GRANT_DEV_ALLOC) += xen-gntalloc.o
obj-$(CONFIG_XENFS) += xenfs/
obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
obj-$(CONFIG_XEN_PLATFORM_PCI) += xen-platform-pci.o
@@ -18,5 +19,6 @@ obj-$(CONFIG_XEN_DOM0) += pci.o
xen-evtchn-y := evtchn.o
xen-gntdev-y := gntdev.o
+xen-gntalloc-y := gntalloc.o
xen-platform-pci-y := platform-pci.o
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 43f9f02c7db0..b1661cd416b0 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -296,7 +296,7 @@ static int decrease_reservation(unsigned long nr_pages)
/* No more mappings: invalidate P2M and add to balloon. */
for (i = 0; i < nr_pages; i++) {
pfn = mfn_to_pfn(frame_list[i]);
- set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
balloon_append(pfn_to_page(pfn));
}
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 74681478100a..0dd0af30d6a0 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -114,7 +114,7 @@ struct cpu_evtchn_s {
static __initdata struct cpu_evtchn_s init_evtchn_mask = {
.bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul,
};
-static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask;
+static struct cpu_evtchn_s __refdata *cpu_evtchn_mask_p = &init_evtchn_mask;
static inline unsigned long *cpu_evtchn_mask(int cpu)
{
@@ -277,7 +277,7 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
BUG_ON(irq == -1);
#ifdef CONFIG_SMP
- cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu));
+ cpumask_copy(irq_to_desc(irq)->irq_data.affinity, cpumask_of(cpu));
#endif
clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq)));
@@ -294,7 +294,7 @@ static void init_evtchn_cpu_bindings(void)
/* By default all event channels notify CPU#0. */
for_each_irq_desc(i, desc) {
- cpumask_copy(desc->affinity, cpumask_of(0));
+ cpumask_copy(desc->irq_data.affinity, cpumask_of(0));
}
#endif
@@ -376,81 +376,69 @@ static void unmask_evtchn(int port)
put_cpu();
}
-static int get_nr_hw_irqs(void)
+static int xen_allocate_irq_dynamic(void)
{
- int ret = 1;
+ int first = 0;
+ int irq;
#ifdef CONFIG_X86_IO_APIC
- ret = get_nr_irqs_gsi();
+ /*
+ * For an HVM guest or domain 0, which sees "real" (emulated or
+ * actual, respectively) GSIs, we allocate dynamic IRQs
+ * e.g. those corresponding to event channels or MSIs
+ * etc. from the range above those "real" GSIs to avoid
+ * collisions.
+ */
+ if (xen_initial_domain() || xen_hvm_domain())
+ first = get_nr_irqs_gsi();
#endif
- return ret;
-}
+retry:
+ irq = irq_alloc_desc_from(first, -1);
-static int find_unbound_pirq(int type)
-{
- int rc, i;
- struct physdev_get_free_pirq op_get_free_pirq;
- op_get_free_pirq.type = type;
+ if (irq == -ENOMEM && first > NR_IRQS_LEGACY) {
+ printk(KERN_ERR "Out of dynamic IRQ space and eating into GSI space. You should increase nr_irqs\n");
+ first = max(NR_IRQS_LEGACY, first - NR_IRQS_LEGACY);
+ goto retry;
+ }
- rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq);
- if (!rc)
- return op_get_free_pirq.pirq;
+ if (irq < 0)
+ panic("No available IRQ to bind to: increase nr_irqs!\n");
- for (i = 0; i < nr_irqs; i++) {
- if (pirq_to_irq[i] < 0)
- return i;
- }
- return -1;
+ return irq;
}
-static int find_unbound_irq(void)
+static int xen_allocate_irq_gsi(unsigned gsi)
{
- struct irq_data *data;
- int irq, res;
- int bottom = get_nr_hw_irqs();
- int top = nr_irqs-1;
-
- if (bottom == nr_irqs)
- goto no_irqs;
+ int irq;
- /* This loop starts from the top of IRQ space and goes down.
- * We need this b/c if we have a PCI device in a Xen PV guest
- * we do not have an IO-APIC (though the backend might have them)
- * mapped in. To not have a collision of physical IRQs with the Xen
- * event channels start at the top of the IRQ space for virtual IRQs.
+ /*
+ * A PV guest has no concept of a GSI (since it has no ACPI
+ * nor access to/knowledge of the physical APICs). Therefore
+ * all IRQs are dynamically allocated from the entire IRQ
+ * space.
*/
- for (irq = top; irq > bottom; irq--) {
- data = irq_get_irq_data(irq);
- /* only 15->0 have init'd desc; handle irq > 16 */
- if (!data)
- break;
- if (data->chip == &no_irq_chip)
- break;
- if (data->chip != &xen_dynamic_chip)
- continue;
- if (irq_info[irq].type == IRQT_UNBOUND)
- return irq;
- }
+ if (xen_pv_domain() && !xen_initial_domain())
+ return xen_allocate_irq_dynamic();
- if (irq == bottom)
- goto no_irqs;
+ /* Legacy IRQ descriptors are already allocated by the arch. */
+ if (gsi < NR_IRQS_LEGACY)
+ return gsi;
- res = irq_alloc_desc_at(irq, -1);
-
- if (WARN_ON(res != irq))
- return -1;
+ irq = irq_alloc_desc_at(gsi, -1);
+ if (irq < 0)
+ panic("Unable to allocate to IRQ%d (%d)\n", gsi, irq);
return irq;
-
-no_irqs:
- panic("No available IRQ to bind to: increase nr_irqs!\n");
}
-static bool identity_mapped_irq(unsigned irq)
+static void xen_free_irq(unsigned irq)
{
- /* identity map all the hardware irqs */
- return irq < get_nr_hw_irqs();
+ /* Legacy IRQ descriptors are managed by the arch. */
+ if (irq < NR_IRQS_LEGACY)
+ return;
+
+ irq_free_desc(irq);
}
static void pirq_unmask_notify(int irq)
@@ -486,7 +474,7 @@ static bool probing_irq(int irq)
return desc && desc->action == NULL;
}
-static unsigned int startup_pirq(unsigned int irq)
+static unsigned int __startup_pirq(unsigned int irq)
{
struct evtchn_bind_pirq bind_pirq;
struct irq_info *info = info_for_irq(irq);
@@ -524,9 +512,15 @@ out:
return 0;
}
-static void shutdown_pirq(unsigned int irq)
+static unsigned int startup_pirq(struct irq_data *data)
+{
+ return __startup_pirq(data->irq);
+}
+
+static void shutdown_pirq(struct irq_data *data)
{
struct evtchn_close close;
+ unsigned int irq = data->irq;
struct irq_info *info = info_for_irq(irq);
int evtchn = evtchn_from_irq(irq);
@@ -546,20 +540,20 @@ static void shutdown_pirq(unsigned int irq)
info->evtchn = 0;
}
-static void enable_pirq(unsigned int irq)
+static void enable_pirq(struct irq_data *data)
{
- startup_pirq(irq);
+ startup_pirq(data);
}
-static void disable_pirq(unsigned int irq)
+static void disable_pirq(struct irq_data *data)
{
}
-static void ack_pirq(unsigned int irq)
+static void ack_pirq(struct irq_data *data)
{
- int evtchn = evtchn_from_irq(irq);
+ int evtchn = evtchn_from_irq(data->irq);
- move_native_irq(irq);
+ move_native_irq(data->irq);
if (VALID_EVTCHN(evtchn)) {
mask_evtchn(evtchn);
@@ -567,23 +561,6 @@ static void ack_pirq(unsigned int irq)
}
}
-static void end_pirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
- struct irq_desc *desc = irq_to_desc(irq);
-
- if (WARN_ON(!desc))
- return;
-
- if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) ==
- (IRQ_DISABLED|IRQ_PENDING)) {
- shutdown_pirq(irq);
- } else if (VALID_EVTCHN(evtchn)) {
- unmask_evtchn(evtchn);
- pirq_unmask_notify(irq);
- }
-}
-
static int find_irq_by_gsi(unsigned gsi)
{
int irq;
@@ -638,14 +615,7 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name)
goto out; /* XXX need refcount? */
}
- /* If we are a PV guest, we don't have GSIs (no ACPI passed). Therefore
- * we are using the !xen_initial_domain() to drop in the function.*/
- if (identity_mapped_irq(gsi) || (!xen_initial_domain() &&
- xen_pv_domain())) {
- irq = gsi;
- irq_alloc_desc_at(irq, -1);
- } else
- irq = find_unbound_irq();
+ irq = xen_allocate_irq_gsi(gsi);
set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
handle_level_irq, name);
@@ -658,7 +628,7 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name)
* this in the priv domain. */
if (xen_initial_domain() &&
HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
- irq_free_desc(irq);
+ xen_free_irq(irq);
irq = -ENOSPC;
goto out;
}
@@ -674,87 +644,46 @@ out:
}
#ifdef CONFIG_PCI_MSI
-#include <linux/msi.h>
-#include "../pci/msi.h"
-
-void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc)
+int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc)
{
- spin_lock(&irq_mapping_update_lock);
-
- if (alloc & XEN_ALLOC_IRQ) {
- *irq = find_unbound_irq();
- if (*irq == -1)
- goto out;
- }
-
- if (alloc & XEN_ALLOC_PIRQ) {
- *pirq = find_unbound_pirq(MAP_PIRQ_TYPE_MSI);
- if (*pirq == -1)
- goto out;
- }
+ int rc;
+ struct physdev_get_free_pirq op_get_free_pirq;
- set_irq_chip_and_handler_name(*irq, &xen_pirq_chip,
- handle_level_irq, name);
+ op_get_free_pirq.type = MAP_PIRQ_TYPE_MSI;
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq);
- irq_info[*irq] = mk_pirq_info(0, *pirq, 0, 0);
- pirq_to_irq[*pirq] = *irq;
+ WARN_ONCE(rc == -ENOSYS,
+ "hypervisor does not support the PHYSDEVOP_get_free_pirq interface\n");
-out:
- spin_unlock(&irq_mapping_update_lock);
+ return rc ? -1 : op_get_free_pirq.pirq;
}
-int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
+int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
+ int pirq, int vector, const char *name)
{
- int irq = -1;
- struct physdev_map_pirq map_irq;
- int rc;
- int pos;
- u32 table_offset, bir;
-
- memset(&map_irq, 0, sizeof(map_irq));
- map_irq.domid = DOMID_SELF;
- map_irq.type = MAP_PIRQ_TYPE_MSI;
- map_irq.index = -1;
- map_irq.pirq = -1;
- map_irq.bus = dev->bus->number;
- map_irq.devfn = dev->devfn;
-
- if (type == PCI_CAP_ID_MSIX) {
- pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
-
- pci_read_config_dword(dev, msix_table_offset_reg(pos),
- &table_offset);
- bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
-
- map_irq.table_base = pci_resource_start(dev, bir);
- map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
- }
+ int irq, ret;
spin_lock(&irq_mapping_update_lock);
- irq = find_unbound_irq();
-
+ irq = xen_allocate_irq_dynamic();
if (irq == -1)
goto out;
- rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
- if (rc) {
- printk(KERN_WARNING "xen map irq failed %d\n", rc);
-
- irq_free_desc(irq);
-
- irq = -1;
- goto out;
- }
- irq_info[irq] = mk_pirq_info(0, map_irq.pirq, 0, map_irq.index);
-
set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
- handle_level_irq,
- (type == PCI_CAP_ID_MSIX) ? "msi-x":"msi");
+ handle_level_irq, name);
+ irq_info[irq] = mk_pirq_info(0, pirq, 0, vector);
+ pirq_to_irq[pirq] = irq;
+ ret = set_irq_msi(irq, msidesc);
+ if (ret < 0)
+ goto error_irq;
out:
spin_unlock(&irq_mapping_update_lock);
return irq;
+error_irq:
+ spin_unlock(&irq_mapping_update_lock);
+ xen_free_irq(irq);
+ return -1;
}
#endif
@@ -779,11 +708,12 @@ int xen_destroy_irq(int irq)
printk(KERN_WARNING "unmap irq failed %d\n", rc);
goto out;
}
- pirq_to_irq[info->u.pirq.pirq] = -1;
}
+ pirq_to_irq[info->u.pirq.pirq] = -1;
+
irq_info[irq] = mk_unbound_info();
- irq_free_desc(irq);
+ xen_free_irq(irq);
out:
spin_unlock(&irq_mapping_update_lock);
@@ -814,7 +744,7 @@ int bind_evtchn_to_irq(unsigned int evtchn)
irq = evtchn_to_irq[evtchn];
if (irq == -1) {
- irq = find_unbound_irq();
+ irq = xen_allocate_irq_dynamic();
set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
handle_fasteoi_irq, "event");
@@ -839,7 +769,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
irq = per_cpu(ipi_to_irq, cpu)[ipi];
if (irq == -1) {
- irq = find_unbound_irq();
+ irq = xen_allocate_irq_dynamic();
if (irq < 0)
goto out;
@@ -864,6 +794,21 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
return irq;
}
+static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
+ unsigned int remote_port)
+{
+ struct evtchn_bind_interdomain bind_interdomain;
+ int err;
+
+ bind_interdomain.remote_dom = remote_domain;
+ bind_interdomain.remote_port = remote_port;
+
+ err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
+ &bind_interdomain);
+
+ return err ? : bind_evtchn_to_irq(bind_interdomain.local_port);
+}
+
int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
{
@@ -875,7 +820,7 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
irq = per_cpu(virq_to_irq, cpu)[virq];
if (irq == -1) {
- irq = find_unbound_irq();
+ irq = xen_allocate_irq_dynamic();
set_irq_chip_and_handler_name(irq, &xen_percpu_chip,
handle_percpu_irq, "virq");
@@ -934,7 +879,7 @@ static void unbind_from_irq(unsigned int irq)
if (irq_info[irq].type != IRQT_UNBOUND) {
irq_info[irq] = mk_unbound_info();
- irq_free_desc(irq);
+ xen_free_irq(irq);
}
spin_unlock(&irq_mapping_update_lock);
@@ -959,6 +904,29 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn,
}
EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
+int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
+ unsigned int remote_port,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id)
+{
+ int irq, retval;
+
+ irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port);
+ if (irq < 0)
+ return irq;
+
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler);
+
int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
irq_handler_t handler,
unsigned long irqflags, const char *devname, void *dev_id)
@@ -990,7 +958,7 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi,
if (irq < 0)
return irq;
- irqflags |= IRQF_NO_SUSPEND;
+ irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME;
retval = request_irq(irq, handler, irqflags, devname, dev_id);
if (retval != 0) {
unbind_from_irq(irq);
@@ -1234,11 +1202,12 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
return 0;
}
-static int set_affinity_irq(unsigned irq, const struct cpumask *dest)
+static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest,
+ bool force)
{
unsigned tcpu = cpumask_first(dest);
- return rebind_irq_to_cpu(irq, tcpu);
+ return rebind_irq_to_cpu(data->irq, tcpu);
}
int resend_irq_on_evtchn(unsigned int irq)
@@ -1257,35 +1226,35 @@ int resend_irq_on_evtchn(unsigned int irq)
return 1;
}
-static void enable_dynirq(unsigned int irq)
+static void enable_dynirq(struct irq_data *data)
{
- int evtchn = evtchn_from_irq(irq);
+ int evtchn = evtchn_from_irq(data->irq);
if (VALID_EVTCHN(evtchn))
unmask_evtchn(evtchn);
}
-static void disable_dynirq(unsigned int irq)
+static void disable_dynirq(struct irq_data *data)
{
- int evtchn = evtchn_from_irq(irq);
+ int evtchn = evtchn_from_irq(data->irq);
if (VALID_EVTCHN(evtchn))
mask_evtchn(evtchn);
}
-static void ack_dynirq(unsigned int irq)
+static void ack_dynirq(struct irq_data *data)
{
- int evtchn = evtchn_from_irq(irq);
+ int evtchn = evtchn_from_irq(data->irq);
- move_masked_irq(irq);
+ move_masked_irq(data->irq);
if (VALID_EVTCHN(evtchn))
unmask_evtchn(evtchn);
}
-static int retrigger_dynirq(unsigned int irq)
+static int retrigger_dynirq(struct irq_data *data)
{
- int evtchn = evtchn_from_irq(irq);
+ int evtchn = evtchn_from_irq(data->irq);
struct shared_info *sh = HYPERVISOR_shared_info;
int ret = 0;
@@ -1334,7 +1303,7 @@ static void restore_cpu_pirqs(void)
printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq);
- startup_pirq(irq);
+ __startup_pirq(irq);
}
}
@@ -1445,7 +1414,6 @@ void xen_poll_irq(int irq)
void xen_irq_resume(void)
{
unsigned int cpu, irq, evtchn;
- struct irq_desc *desc;
init_evtchn_cpu_bindings();
@@ -1465,66 +1433,48 @@ void xen_irq_resume(void)
restore_cpu_ipis(cpu);
}
- /*
- * Unmask any IRQF_NO_SUSPEND IRQs which are enabled. These
- * are not handled by the IRQ core.
- */
- for_each_irq_desc(irq, desc) {
- if (!desc->action || !(desc->action->flags & IRQF_NO_SUSPEND))
- continue;
- if (desc->status & IRQ_DISABLED)
- continue;
-
- evtchn = evtchn_from_irq(irq);
- if (evtchn == -1)
- continue;
-
- unmask_evtchn(evtchn);
- }
-
restore_cpu_pirqs();
}
static struct irq_chip xen_dynamic_chip __read_mostly = {
- .name = "xen-dyn",
+ .name = "xen-dyn",
- .disable = disable_dynirq,
- .mask = disable_dynirq,
- .unmask = enable_dynirq,
+ .irq_disable = disable_dynirq,
+ .irq_mask = disable_dynirq,
+ .irq_unmask = enable_dynirq,
- .eoi = ack_dynirq,
- .set_affinity = set_affinity_irq,
- .retrigger = retrigger_dynirq,
+ .irq_eoi = ack_dynirq,
+ .irq_set_affinity = set_affinity_irq,
+ .irq_retrigger = retrigger_dynirq,
};
static struct irq_chip xen_pirq_chip __read_mostly = {
- .name = "xen-pirq",
+ .name = "xen-pirq",
- .startup = startup_pirq,
- .shutdown = shutdown_pirq,
+ .irq_startup = startup_pirq,
+ .irq_shutdown = shutdown_pirq,
- .enable = enable_pirq,
- .unmask = enable_pirq,
+ .irq_enable = enable_pirq,
+ .irq_unmask = enable_pirq,
- .disable = disable_pirq,
- .mask = disable_pirq,
+ .irq_disable = disable_pirq,
+ .irq_mask = disable_pirq,
- .ack = ack_pirq,
- .end = end_pirq,
+ .irq_ack = ack_pirq,
- .set_affinity = set_affinity_irq,
+ .irq_set_affinity = set_affinity_irq,
- .retrigger = retrigger_dynirq,
+ .irq_retrigger = retrigger_dynirq,
};
static struct irq_chip xen_percpu_chip __read_mostly = {
- .name = "xen-percpu",
+ .name = "xen-percpu",
- .disable = disable_dynirq,
- .mask = disable_dynirq,
- .unmask = enable_dynirq,
+ .irq_disable = disable_dynirq,
+ .irq_mask = disable_dynirq,
+ .irq_unmask = enable_dynirq,
- .ack = ack_dynirq,
+ .irq_ack = ack_dynirq,
};
int xen_set_callback_via(uint64_t via)
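
Two conversions are interleaved in the events.c diff. IRQ allocation is centralized: find_unbound_irq()'s top-down scan is replaced by xen_allocate_irq_dynamic() (irq_alloc_desc_from() above the "real" GSI range on dom0/HVM, with a retry that eats into GSI space before panicking) and xen_allocate_irq_gsi() (1:1 for GSIs, except in a PV guest other than dom0, which has no GSIs and falls back to dynamic allocation), paired with xen_free_irq(), which leaves the arch-owned legacy descriptors alone. Independently, all three irq_chips move from the legacy callbacks taking a bare IRQ number to the struct irq_data based ops from the 2.6.37 genirq rework; the .end method disappears entirely (the new flow handlers have no equivalent, hence end_pirq() is deleted), and startup_pirq() grows a __startup_pirq(irq) core so the resume path can keep calling it by number. The mechanical pattern, sketched on a hypothetical chip ("foo_*" and do_mask_hw() are illustrative; the .irq_* member names are the real genirq ones):

    #include <linux/irq.h>    /* struct irq_data, struct irq_chip */

    static void do_mask_hw(unsigned int irq);   /* hypothetical helper */

    /* Before: static void foo_mask(unsigned int irq) { do_mask_hw(irq); }
     * wired up as .mask = foo_mask.  After: */
    static void foo_irq_mask(struct irq_data *data)
    {
            do_mask_hw(data->irq);      /* irq_data carries the number */
    }

    static struct irq_chip foo_chip = {
            .name     = "foo",
            .irq_mask = foo_irq_mask,   /* was: .mask = foo_mask */
    };
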
diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c
new file mode 100644
index 000000000000..a7ffdfe19fc9
--- /dev/null
+++ b/drivers/xen/gntalloc.c
@@ -0,0 +1,545 @@
+/******************************************************************************
+ * gntalloc.c
+ *
+ * Device for creating grant references (in user-space) that may be shared
+ * with other domains.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/*
+ * This driver exists to allow userspace programs in Linux to allocate kernel
+ * memory that will later be shared with another domain. Without this device,
+ * Linux userspace programs cannot create grant references.
+ *
+ * How this stuff works:
+ * X -> granting a page to Y
+ * Y -> mapping the grant from X
+ *
+ * 1. X uses the gntalloc device to allocate a page of kernel memory, P.
+ * 2. X creates an entry in the grant table that says domid(Y) can access P.
+ * This is done without a hypercall unless the grant table needs expansion.
+ * 3. X gives the grant reference identifier, GREF, to Y.
+ * 4. Y maps the page, either directly into kernel memory for use in a backend
+ * driver, or via the gntdev device to map into the address space of an
+ * application running in Y. This is the first point at which Xen does any
+ * tracking of the page.
+ * 5. A program in X mmap()s a segment of the gntalloc device that corresponds
+ * to the shared page, and can now communicate with Y over the shared page.
+ *
+ *
+ * NOTE TO USERSPACE LIBRARIES:
+ * The grant allocation and mmap()ing are, naturally, two separate operations.
+ * You set up the sharing by calling the create ioctl() and then the mmap().
+ * Teardown requires munmap() and either close() or ioctl().
+ *
+ * WARNING: Since Xen does not allow a guest to forcibly end the use of a grant
+ * reference, this device can be used to consume kernel memory by leaving grant
+ * references mapped by another domain when an application exits. Therefore,
+ * there is a global limit on the number of pages that can be allocated. When
+ * all references to the page are unmapped, it will be freed during the next
+ * grant operation.
+ */
+
+#include <linux/atomic.h>
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+
+#include <xen/xen.h>
+#include <xen/page.h>
+#include <xen/grant_table.h>
+#include <xen/gntalloc.h>
+#include <xen/events.h>
+
+static int limit = 1024;
+module_param(limit, int, 0644);
+MODULE_PARM_DESC(limit, "Maximum number of grants that may be allocated by "
+ "the gntalloc device");
+
+static LIST_HEAD(gref_list);
+static DEFINE_SPINLOCK(gref_lock);
+static int gref_size;
+
+struct notify_info {
+ uint16_t pgoff:12; /* Bits 0-11: Offset of the byte to clear */
+ uint16_t flags:2; /* Bits 12-13: Unmap notification flags */
+ int event; /* Port (event channel) to notify */
+};
+
+/* Metadata on a grant reference. */
+struct gntalloc_gref {
+ struct list_head next_gref; /* list entry gref_list */
+ struct list_head next_file; /* list entry file->list, if open */
+ struct page *page; /* The shared page */
+ uint64_t file_index; /* File offset for mmap() */
+ unsigned int users; /* Use count - when zero, waiting on Xen */
+ grant_ref_t gref_id; /* The grant reference number */
+ struct notify_info notify; /* Unmap notification */
+};
+
+struct gntalloc_file_private_data {
+ struct list_head list;
+ uint64_t index;
+};
+
+static void __del_gref(struct gntalloc_gref *gref);
+
+static void do_cleanup(void)
+{
+ struct gntalloc_gref *gref, *n;
+ list_for_each_entry_safe(gref, n, &gref_list, next_gref) {
+ if (!gref->users)
+ __del_gref(gref);
+ }
+}
+
+static int add_grefs(struct ioctl_gntalloc_alloc_gref *op,
+ uint32_t *gref_ids, struct gntalloc_file_private_data *priv)
+{
+ int i, rc, readonly;
+ LIST_HEAD(queue_gref);
+ LIST_HEAD(queue_file);
+ struct gntalloc_gref *gref;
+
+ readonly = !(op->flags & GNTALLOC_FLAG_WRITABLE);
+ rc = -ENOMEM;
+ for (i = 0; i < op->count; i++) {
+ gref = kzalloc(sizeof(*gref), GFP_KERNEL);
+ if (!gref)
+ goto undo;
+ list_add_tail(&gref->next_gref, &queue_gref);
+ list_add_tail(&gref->next_file, &queue_file);
+ gref->users = 1;
+ gref->file_index = op->index + i * PAGE_SIZE;
+ gref->page = alloc_page(GFP_KERNEL|__GFP_ZERO);
+ if (!gref->page)
+ goto undo;
+
+ /* Grant foreign access to the page. */
+ gref->gref_id = gnttab_grant_foreign_access(op->domid,
+ pfn_to_mfn(page_to_pfn(gref->page)), readonly);
+ if (gref->gref_id < 0) {
+ rc = gref->gref_id;
+ goto undo;
+ }
+ gref_ids[i] = gref->gref_id;
+ }
+
+ /* Add to gref lists. */
+ spin_lock(&gref_lock);
+ list_splice_tail(&queue_gref, &gref_list);
+ list_splice_tail(&queue_file, &priv->list);
+ spin_unlock(&gref_lock);
+
+ return 0;
+
+undo:
+ spin_lock(&gref_lock);
+ gref_size -= (op->count - i);
+
+ list_for_each_entry(gref, &queue_file, next_file) {
+ /* __del_gref does not remove from queue_file */
+ __del_gref(gref);
+ }
+
+ /* It's possible for the target domain to map the just-allocated grant
+ * references by blindly guessing their IDs; if this is done, then
+ * __del_gref will leave them in the queue_gref list. They need to be
+ * added to the global list so that we can free them when they are no
+ * longer referenced.
+ */
+ if (unlikely(!list_empty(&queue_gref)))
+ list_splice_tail(&queue_gref, &gref_list);
+ spin_unlock(&gref_lock);
+ return rc;
+}
+
+static void __del_gref(struct gntalloc_gref *gref)
+{
+ if (gref->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) {
+ uint8_t *tmp = kmap(gref->page);
+ tmp[gref->notify.pgoff] = 0;
+ kunmap(gref->page);
+ }
+ if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT)
+ notify_remote_via_evtchn(gref->notify.event);
+
+ gref->notify.flags = 0;
+
+ if (gref->gref_id > 0) {
+ if (gnttab_query_foreign_access(gref->gref_id))
+ return;
+
+ if (!gnttab_end_foreign_access_ref(gref->gref_id, 0))
+ return;
+ }
+
+ gref_size--;
+ list_del(&gref->next_gref);
+
+ if (gref->page)
+ __free_page(gref->page);
+
+ kfree(gref);
+}
+
+/* finds contiguous grant references in a file, returns the first */
+static struct gntalloc_gref *find_grefs(struct gntalloc_file_private_data *priv,
+ uint64_t index, uint32_t count)
+{
+ struct gntalloc_gref *rv = NULL, *gref;
+ list_for_each_entry(gref, &priv->list, next_file) {
+ if (gref->file_index == index && !rv)
+ rv = gref;
+ if (rv) {
+ if (gref->file_index != index)
+ return NULL;
+ index += PAGE_SIZE;
+ count--;
+ if (count == 0)
+ return rv;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * -------------------------------------
+ * File operations.
+ * -------------------------------------
+ */
+static int gntalloc_open(struct inode *inode, struct file *filp)
+{
+ struct gntalloc_file_private_data *priv;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ goto out_nomem;
+ INIT_LIST_HEAD(&priv->list);
+
+ filp->private_data = priv;
+
+ pr_debug("%s: priv %p\n", __func__, priv);
+
+ return 0;
+
+out_nomem:
+ return -ENOMEM;
+}
+
+static int gntalloc_release(struct inode *inode, struct file *filp)
+{
+ struct gntalloc_file_private_data *priv = filp->private_data;
+ struct gntalloc_gref *gref;
+
+ pr_debug("%s: priv %p\n", __func__, priv);
+
+ spin_lock(&gref_lock);
+ while (!list_empty(&priv->list)) {
+ gref = list_entry(priv->list.next,
+ struct gntalloc_gref, next_file);
+ list_del(&gref->next_file);
+ gref->users--;
+ if (gref->users == 0)
+ __del_gref(gref);
+ }
+ kfree(priv);
+ spin_unlock(&gref_lock);
+
+ return 0;
+}
+
+static long gntalloc_ioctl_alloc(struct gntalloc_file_private_data *priv,
+ struct ioctl_gntalloc_alloc_gref __user *arg)
+{
+ int rc = 0;
+ struct ioctl_gntalloc_alloc_gref op;
+ uint32_t *gref_ids;
+
+ pr_debug("%s: priv %p\n", __func__, priv);
+
+ if (copy_from_user(&op, arg, sizeof(op))) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ gref_ids = kzalloc(sizeof(gref_ids[0]) * op.count, GFP_TEMPORARY);
+ if (!gref_ids) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ spin_lock(&gref_lock);
+ /* Clean up pages that were at zero (local) users but were still mapped
+ * by remote domains. Since those pages count towards the limit that we
+ * are about to enforce, removing them here is a good idea.
+ */
+ do_cleanup();
+ if (gref_size + op.count > limit) {
+ spin_unlock(&gref_lock);
+ rc = -ENOSPC;
+ goto out_free;
+ }
+ gref_size += op.count;
+ op.index = priv->index;
+ priv->index += op.count * PAGE_SIZE;
+ spin_unlock(&gref_lock);
+
+ rc = add_grefs(&op, gref_ids, priv);
+ if (rc < 0)
+ goto out_free;
+
+ /* Once we finish add_grefs, it is unsafe to touch the new reference,
+ * since it is possible for a concurrent ioctl to remove it (by guessing
+ * its index). If the userspace application doesn't provide valid memory
+ * to write the IDs to, then it will need to close the file in order to
+ * release - which it will do by segfaulting when it tries to access the
+ * IDs to close them.
+ */
+ if (copy_to_user(arg, &op, sizeof(op))) {
+ rc = -EFAULT;
+ goto out_free;
+ }
+ if (copy_to_user(arg->gref_ids, gref_ids,
+ sizeof(gref_ids[0]) * op.count)) {
+ rc = -EFAULT;
+ goto out_free;
+ }
+
+out_free:
+ kfree(gref_ids);
+out:
+ return rc;
+}
+
+static long gntalloc_ioctl_dealloc(struct gntalloc_file_private_data *priv,
+ void __user *arg)
+{
+ int i, rc = 0;
+ struct ioctl_gntalloc_dealloc_gref op;
+ struct gntalloc_gref *gref, *n;
+
+ pr_debug("%s: priv %p\n", __func__, priv);
+
+ if (copy_from_user(&op, arg, sizeof(op))) {
+ rc = -EFAULT;
+ goto dealloc_grant_out;
+ }
+
+ spin_lock(&gref_lock);
+ gref = find_grefs(priv, op.index, op.count);
+ if (gref) {
+ /* Remove from the file list only, and decrease reference count.
+ * The later call to do_cleanup() will remove from gref_list and
+ * free the memory if the pages aren't mapped anywhere.
+ */
+ for (i = 0; i < op.count; i++) {
+ n = list_entry(gref->next_file.next,
+ struct gntalloc_gref, next_file);
+ list_del(&gref->next_file);
+ gref->users--;
+ gref = n;
+ }
+ } else {
+ rc = -EINVAL;
+ }
+
+ do_cleanup();
+
+ spin_unlock(&gref_lock);
+dealloc_grant_out:
+ return rc;
+}
+
+static long gntalloc_ioctl_unmap_notify(struct gntalloc_file_private_data *priv,
+ void __user *arg)
+{
+ struct ioctl_gntalloc_unmap_notify op;
+ struct gntalloc_gref *gref;
+ uint64_t index;
+ int pgoff;
+ int rc;
+
+ if (copy_from_user(&op, arg, sizeof(op)))
+ return -EFAULT;
+
+ index = op.index & ~(PAGE_SIZE - 1);
+ pgoff = op.index & (PAGE_SIZE - 1);
+
+ spin_lock(&gref_lock);
+
+ gref = find_grefs(priv, index, 1);
+ if (!gref) {
+ rc = -ENOENT;
+ goto unlock_out;
+ }
+
+ if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT)) {
+ rc = -EINVAL;
+ goto unlock_out;
+ }
+
+ gref->notify.flags = op.action;
+ gref->notify.pgoff = pgoff;
+ gref->notify.event = op.event_channel_port;
+ rc = 0;
+ unlock_out:
+ spin_unlock(&gref_lock);
+ return rc;
+}
+
+static long gntalloc_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct gntalloc_file_private_data *priv = filp->private_data;
+
+ switch (cmd) {
+ case IOCTL_GNTALLOC_ALLOC_GREF:
+ return gntalloc_ioctl_alloc(priv, (void __user *)arg);
+
+ case IOCTL_GNTALLOC_DEALLOC_GREF:
+ return gntalloc_ioctl_dealloc(priv, (void __user *)arg);
+
+ case IOCTL_GNTALLOC_SET_UNMAP_NOTIFY:
+ return gntalloc_ioctl_unmap_notify(priv, (void __user *)arg);
+
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return 0;
+}
+
+static void gntalloc_vma_close(struct vm_area_struct *vma)
+{
+ struct gntalloc_gref *gref = vma->vm_private_data;
+ if (!gref)
+ return;
+
+ spin_lock(&gref_lock);
+ gref->users--;
+ if (gref->users == 0)
+ __del_gref(gref);
+ spin_unlock(&gref_lock);
+}
+
+static struct vm_operations_struct gntalloc_vmops = {
+ .close = gntalloc_vma_close,
+};
+
+static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct gntalloc_file_private_data *priv = filp->private_data;
+ struct gntalloc_gref *gref;
+ int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+ int rv, i;
+
+ pr_debug("%s: priv %p, page %lu+%d\n", __func__,
+ priv, vma->vm_pgoff, count);
+
+ if (!(vma->vm_flags & VM_SHARED)) {
+ printk(KERN_ERR "%s: Mapping must be shared.\n", __func__);
+ return -EINVAL;
+ }
+
+ spin_lock(&gref_lock);
+ gref = find_grefs(priv, vma->vm_pgoff << PAGE_SHIFT, count);
+ if (gref == NULL) {
+ rv = -ENOENT;
+ pr_debug("%s: Could not find grant reference",
+ __func__);
+ goto out_unlock;
+ }
+
+ vma->vm_private_data = gref;
+
+ vma->vm_flags |= VM_RESERVED;
+ vma->vm_flags |= VM_DONTCOPY;
+ vma->vm_flags |= VM_PFNMAP | VM_PFN_AT_MMAP;
+
+ vma->vm_ops = &gntalloc_vmops;
+
+ for (i = 0; i < count; i++) {
+ gref->users++;
+ rv = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE,
+ gref->page);
+ if (rv)
+ goto out_unlock;
+
+ gref = list_entry(gref->next_file.next,
+ struct gntalloc_gref, next_file);
+ }
+ rv = 0;
+
+out_unlock:
+ spin_unlock(&gref_lock);
+ return rv;
+}
+
+static const struct file_operations gntalloc_fops = {
+ .owner = THIS_MODULE,
+ .open = gntalloc_open,
+ .release = gntalloc_release,
+ .unlocked_ioctl = gntalloc_ioctl,
+ .mmap = gntalloc_mmap
+};
+
+/*
+ * -------------------------------------
+ * Module creation/destruction.
+ * -------------------------------------
+ */
+static struct miscdevice gntalloc_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "xen/gntalloc",
+ .fops = &gntalloc_fops,
+};
+
+static int __init gntalloc_init(void)
+{
+ int err;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ err = misc_register(&gntalloc_miscdev);
+ if (err != 0) {
+ printk(KERN_ERR "Could not register misc gntalloc device\n");
+ return err;
+ }
+
+ pr_debug("Created grant allocation device at %d,%d\n",
+ MISC_MAJOR, gntalloc_miscdev.minor);
+
+ return 0;
+}
+
+static void __exit gntalloc_exit(void)
+{
+ misc_deregister(&gntalloc_miscdev);
+}
+
+module_init(gntalloc_init);
+module_exit(gntalloc_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Carter Weatherly <carter.weatherly@jhuapl.edu>, "
+ "Daniel De Graaf <dgdegra@tycho.nsa.gov>");
+MODULE_DESCRIPTION("User-space grant reference allocator driver");
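
gntalloc.c is the grant-offering counterpart of gntdev: where gntdev maps another domain's grants, gntalloc creates local pages and grant references for a remote domain to map. As the "NOTE TO USERSPACE LIBRARIES" comment says, userspace drives it with the alloc ioctl followed by an mmap() at the returned index. A minimal client sketch, assuming xen/gntalloc.h exposes the structure whose fields gntalloc_ioctl_alloc() reads above (domid, flags, count, index, gref_ids[]); error handling trimmed:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <xen/gntalloc.h>   /* assumed location of the ioctl ABI */

    int share_one_page(uint16_t remote_domid)
    {
            struct ioctl_gntalloc_alloc_gref op;
            void *page;
            int fd = open("/dev/xen/gntalloc", O_RDWR);

            memset(&op, 0, sizeof(op));
            op.domid = remote_domid;             /* who may map it */
            op.flags = GNTALLOC_FLAG_WRITABLE;   /* grant write access */
            op.count = 1;
            if (fd < 0 || ioctl(fd, IOCTL_GNTALLOC_ALLOC_GREF, &op) < 0)
                    return -1;

            /* op.index is the mmap() offset of the new page;
             * op.gref_ids[0] goes to the peer out of band (step 3 in
             * the header comment). */
            page = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
                        fd, op.index);
            printf("gref %u mapped at %p\n", op.gref_ids[0], page);
            return page == MAP_FAILED ? -1 : 0;
    }
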
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 1e31cdcdae1e..d43ff3072c99 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -32,10 +32,12 @@
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
+#include <linux/highmem.h>
#include <xen/xen.h>
#include <xen/grant_table.h>
#include <xen/gntdev.h>
+#include <xen/events.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/page.h>
@@ -45,35 +47,46 @@ MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, "
"Gerd Hoffmann <kraxel@redhat.com>");
MODULE_DESCRIPTION("User-space granted page access driver");
-static int limit = 1024;
+static int limit = 1024*1024;
module_param(limit, int, 0644);
-MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped at "
- "once by a gntdev instance");
+MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by "
+ "the gntdev device");
+
+static atomic_t pages_mapped = ATOMIC_INIT(0);
+
+static int use_ptemod;
struct gntdev_priv {
struct list_head maps;
- uint32_t used;
- uint32_t limit;
/* lock protects maps from concurrent changes */
spinlock_t lock;
struct mm_struct *mm;
struct mmu_notifier mn;
};
+struct unmap_notify {
+ int flags;
+ /* Address relative to the start of the grant_map */
+ int addr;
+ int event;
+};
+
struct grant_map {
struct list_head next;
- struct gntdev_priv *priv;
struct vm_area_struct *vma;
int index;
int count;
int flags;
- int is_mapped;
+ atomic_t users;
+ struct unmap_notify notify;
struct ioctl_gntdev_grant_ref *grants;
struct gnttab_map_grant_ref *map_ops;
struct gnttab_unmap_grant_ref *unmap_ops;
struct page **pages;
};
+static int unmap_grant_pages(struct grant_map *map, int offset, int pages);
+
/* ------------------------------------------------------------------ */
static void gntdev_print_maps(struct gntdev_priv *priv,
@@ -82,9 +95,7 @@ static void gntdev_print_maps(struct gntdev_priv *priv,
#ifdef DEBUG
struct grant_map *map;
- pr_debug("maps list (priv %p, usage %d/%d)\n",
- priv, priv->used, priv->limit);
-
+ pr_debug("%s: maps list (priv %p)\n", __func__, priv);
list_for_each_entry(map, &priv->maps, next)
pr_debug(" index %2d, count %2d %s\n",
map->index, map->count,
@@ -115,14 +126,13 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count)
add->pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
if (add->pages[i] == NULL)
goto err;
+ add->map_ops[i].handle = -1;
+ add->unmap_ops[i].handle = -1;
}
add->index = 0;
add->count = count;
- add->priv = priv;
-
- if (add->count + priv->used > priv->limit)
- goto err;
+ atomic_set(&add->users, 1);
return add;
@@ -154,7 +164,6 @@ static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add)
list_add_tail(&add->next, &priv->maps);
done:
- priv->used += add->count;
gntdev_print_maps(priv, "[new]", add->index);
}
@@ -166,57 +175,57 @@ static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv,
list_for_each_entry(map, &priv->maps, next) {
if (map->index != index)
continue;
- if (map->count != count)
- continue;
- return map;
- }
- return NULL;
-}
-
-static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv,
- unsigned long vaddr)
-{
- struct grant_map *map;
-
- list_for_each_entry(map, &priv->maps, next) {
- if (!map->vma)
- continue;
- if (vaddr < map->vma->vm_start)
- continue;
- if (vaddr >= map->vma->vm_end)
+ if (count && map->count != count)
continue;
return map;
}
return NULL;
}
-static int gntdev_del_map(struct grant_map *map)
+static void gntdev_put_map(struct grant_map *map)
{
int i;
- if (map->vma)
- return -EBUSY;
- for (i = 0; i < map->count; i++)
- if (map->unmap_ops[i].handle)
- return -EBUSY;
+ if (!map)
+ return;
- map->priv->used -= map->count;
- list_del(&map->next);
- return 0;
-}
+ if (!atomic_dec_and_test(&map->users))
+ return;
-static void gntdev_free_map(struct grant_map *map)
-{
- int i;
+ atomic_sub(map->count, &pages_mapped);
- if (!map)
- return;
+ if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) {
+ notify_remote_via_evtchn(map->notify.event);
+ }
+
+ if (map->pages) {
+ if (!use_ptemod)
+ unmap_grant_pages(map, 0, map->count);
- if (map->pages)
for (i = 0; i < map->count; i++) {
- if (map->pages[i])
+ uint32_t check, *tmp;
+ if (!map->pages[i])
+ continue;
+ /* XXX When unmapping in an HVM domain, Xen will
+ * sometimes end up mapping the GFN to an invalid MFN.
+ * In this case, writes will be discarded and reads will
+ * return all 0xFF bytes. Leak these unusable GFNs
+ * until Xen supports fixing their p2m mapping.
+ *
+ * Confirmed present in Xen 4.1-RC3 with HVM source
+ */
+ tmp = kmap(map->pages[i]);
+ *tmp = 0xdeaddead;
+ mb();
+ check = *tmp;
+ kunmap(map->pages[i]);
+ if (check == 0xdeaddead)
__free_page(map->pages[i]);
+ else
+ pr_debug("Discard page %d=%ld\n", i,
+ page_to_pfn(map->pages[i]));
}
+ }
kfree(map->pages);
kfree(map->grants);
kfree(map->map_ops);
@@ -231,24 +240,39 @@ static int find_grant_ptes(pte_t *pte, pgtable_t token,
{
struct grant_map *map = data;
unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
+ int flags = map->flags | GNTMAP_application_map | GNTMAP_contains_pte;
u64 pte_maddr;
BUG_ON(pgnr >= map->count);
pte_maddr = arbitrary_virt_to_machine(pte).maddr;
- gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr,
- GNTMAP_contains_pte | map->flags,
+ gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, flags,
map->grants[pgnr].ref,
map->grants[pgnr].domid);
- gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr,
- GNTMAP_contains_pte | map->flags,
- 0 /* handle */);
+ gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, flags,
+ -1 /* handle */);
return 0;
}
static int map_grant_pages(struct grant_map *map)
{
int i, err = 0;
+ phys_addr_t addr;
+
+ if (!use_ptemod) {
+ /* Note: it could already be mapped */
+ if (map->map_ops[0].handle != -1)
+ return 0;
+ for (i = 0; i < map->count; i++) {
+ addr = (phys_addr_t)
+ pfn_to_kaddr(page_to_pfn(map->pages[i]));
+ gnttab_set_map_op(&map->map_ops[i], addr, map->flags,
+ map->grants[i].ref,
+ map->grants[i].domid);
+ gnttab_set_unmap_op(&map->unmap_ops[i], addr,
+ map->flags, -1 /* handle */);
+ }
+ }
pr_debug("map %d+%d\n", map->index, map->count);
err = gnttab_map_refs(map->map_ops, map->pages, map->count);
@@ -258,28 +282,81 @@ static int map_grant_pages(struct grant_map *map)
for (i = 0; i < map->count; i++) {
if (map->map_ops[i].status)
err = -EINVAL;
- map->unmap_ops[i].handle = map->map_ops[i].handle;
+ else {
+ BUG_ON(map->map_ops[i].handle == -1);
+ map->unmap_ops[i].handle = map->map_ops[i].handle;
+ pr_debug("map handle=%d\n", map->map_ops[i].handle);
+ }
}
return err;
}
-static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
+static int __unmap_grant_pages(struct grant_map *map, int offset, int pages)
{
int i, err = 0;
- pr_debug("map %d+%d [%d+%d]\n", map->index, map->count, offset, pages);
- err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages, pages);
+ if (map->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) {
+ int pgno = (map->notify.addr >> PAGE_SHIFT);
+ if (pgno >= offset && pgno < offset + pages && use_ptemod) {
+ void __user *tmp = (void __user *)
+ map->vma->vm_start + map->notify.addr;
+ err = copy_to_user(tmp, &err, 1);
+ if (err)
+ return err;
+ map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE;
+ } else if (pgno >= offset && pgno < offset + pages) {
+ uint8_t *tmp = kmap(map->pages[pgno]);
+ tmp[map->notify.addr & (PAGE_SIZE-1)] = 0;
+ kunmap(map->pages[pgno]);
+ map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE;
+ }
+ }
+
+ err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages + offset, pages);
if (err)
return err;
for (i = 0; i < pages; i++) {
if (map->unmap_ops[offset+i].status)
err = -EINVAL;
- map->unmap_ops[offset+i].handle = 0;
+ pr_debug("unmap handle=%d st=%d\n",
+ map->unmap_ops[offset+i].handle,
+ map->unmap_ops[offset+i].status);
+ map->unmap_ops[offset+i].handle = -1;
}
return err;
}
+static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
+{
+ int range, err = 0;
+
+ pr_debug("unmap %d+%d [%d+%d]\n", map->index, map->count, offset, pages);
+
+ /* It is possible the requested range will have a "hole" where we
+ * already unmapped some of the grants. Only unmap valid ranges.
+ */
+ while (pages && !err) {
+ while (pages && map->unmap_ops[offset].handle == -1) {
+ offset++;
+ pages--;
+ }
+ range = 0;
+ while (range < pages) {
+ if (map->unmap_ops[offset+range].handle == -1) {
+ range--;
+ break;
+ }
+ range++;
+ }
+ err = __unmap_grant_pages(map, offset, range);
+ offset += range;
+ pages -= range;
+ }
+
+ return err;
+}
+
/* ------------------------------------------------------------------ */
static void gntdev_vma_close(struct vm_area_struct *vma)
@@ -287,22 +364,13 @@ static void gntdev_vma_close(struct vm_area_struct *vma)
struct grant_map *map = vma->vm_private_data;
pr_debug("close %p\n", vma);
- map->is_mapped = 0;
map->vma = NULL;
vma->vm_private_data = NULL;
-}
-
-static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- pr_debug("vaddr %p, pgoff %ld (shouldn't happen)\n",
- vmf->virtual_address, vmf->pgoff);
- vmf->flags = VM_FAULT_ERROR;
- return 0;
+ gntdev_put_map(map);
}
static struct vm_operations_struct gntdev_vmops = {
.close = gntdev_vma_close,
- .fault = gntdev_vma_fault,
};
/* ------------------------------------------------------------------ */
@@ -320,8 +388,6 @@ static void mn_invl_range_start(struct mmu_notifier *mn,
list_for_each_entry(map, &priv->maps, next) {
if (!map->vma)
continue;
- if (!map->is_mapped)
- continue;
if (map->vma->vm_start >= end)
continue;
if (map->vma->vm_end <= start)
@@ -386,16 +452,17 @@ static int gntdev_open(struct inode *inode, struct file *flip)
INIT_LIST_HEAD(&priv->maps);
spin_lock_init(&priv->lock);
- priv->limit = limit;
- priv->mm = get_task_mm(current);
- if (!priv->mm) {
- kfree(priv);
- return -ENOMEM;
+ if (use_ptemod) {
+ priv->mm = get_task_mm(current);
+ if (!priv->mm) {
+ kfree(priv);
+ return -ENOMEM;
+ }
+ priv->mn.ops = &gntdev_mmu_ops;
+ ret = mmu_notifier_register(&priv->mn, priv->mm);
+ mmput(priv->mm);
}
- priv->mn.ops = &gntdev_mmu_ops;
- ret = mmu_notifier_register(&priv->mn, priv->mm);
- mmput(priv->mm);
if (ret) {
kfree(priv);
@@ -412,21 +479,19 @@ static int gntdev_release(struct inode *inode, struct file *flip)
{
struct gntdev_priv *priv = flip->private_data;
struct grant_map *map;
- int err;
pr_debug("priv %p\n", priv);
spin_lock(&priv->lock);
while (!list_empty(&priv->maps)) {
map = list_entry(priv->maps.next, struct grant_map, next);
- err = gntdev_del_map(map);
- if (WARN_ON(err))
- gntdev_free_map(map);
-
+ list_del(&map->next);
+ gntdev_put_map(map);
}
spin_unlock(&priv->lock);
- mmu_notifier_unregister(&priv->mn, priv->mm);
+ if (use_ptemod)
+ mmu_notifier_unregister(&priv->mn, priv->mm);
kfree(priv);
return 0;
}
@@ -443,16 +508,21 @@ static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
pr_debug("priv %p, add %d\n", priv, op.count);
if (unlikely(op.count <= 0))
return -EINVAL;
- if (unlikely(op.count > priv->limit))
- return -EINVAL;
err = -ENOMEM;
map = gntdev_alloc_map(priv, op.count);
if (!map)
return err;
+
+ if (unlikely(atomic_add_return(op.count, &pages_mapped) > limit)) {
+ pr_debug("can't map: over limit\n");
+ gntdev_put_map(map);
+ return err;
+ }
+
if (copy_from_user(map->grants, &u->refs,
sizeof(map->grants[0]) * op.count) != 0) {
- gntdev_free_map(map);
+ gntdev_put_map(map);
return err;
}
@@ -461,13 +531,9 @@ static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
op.index = map->index << PAGE_SHIFT;
spin_unlock(&priv->lock);
- if (copy_to_user(u, &op, sizeof(op)) != 0) {
- spin_lock(&priv->lock);
- gntdev_del_map(map);
- spin_unlock(&priv->lock);
- gntdev_free_map(map);
- return err;
- }
+ if (copy_to_user(u, &op, sizeof(op)) != 0)
+ return -EFAULT;
+
return 0;
}
@@ -484,11 +550,12 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
spin_lock(&priv->lock);
map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
- if (map)
- err = gntdev_del_map(map);
+ if (map) {
+ list_del(&map->next);
+ gntdev_put_map(map);
+ err = 0;
+ }
spin_unlock(&priv->lock);
- if (!err)
- gntdev_free_map(map);
return err;
}
@@ -496,43 +563,66 @@ static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv,
struct ioctl_gntdev_get_offset_for_vaddr __user *u)
{
struct ioctl_gntdev_get_offset_for_vaddr op;
+ struct vm_area_struct *vma;
struct grant_map *map;
if (copy_from_user(&op, u, sizeof(op)) != 0)
return -EFAULT;
pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr);
- spin_lock(&priv->lock);
- map = gntdev_find_map_vaddr(priv, op.vaddr);
- if (map == NULL ||
- map->vma->vm_start != op.vaddr) {
- spin_unlock(&priv->lock);
+ vma = find_vma(current->mm, op.vaddr);
+ if (!vma || vma->vm_ops != &gntdev_vmops)
return -EINVAL;
- }
+
+ map = vma->vm_private_data;
+ if (!map)
+ return -EINVAL;
+
op.offset = map->index << PAGE_SHIFT;
op.count = map->count;
- spin_unlock(&priv->lock);
if (copy_to_user(u, &op, sizeof(op)) != 0)
return -EFAULT;
return 0;
}
-static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv,
- struct ioctl_gntdev_set_max_grants __user *u)
+static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u)
{
- struct ioctl_gntdev_set_max_grants op;
+ struct ioctl_gntdev_unmap_notify op;
+ struct grant_map *map;
+ int rc;
- if (copy_from_user(&op, u, sizeof(op)) != 0)
+ if (copy_from_user(&op, u, sizeof(op)))
return -EFAULT;
- pr_debug("priv %p, limit %d\n", priv, op.count);
- if (op.count > limit)
- return -E2BIG;
+
+ if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT))
+ return -EINVAL;
spin_lock(&priv->lock);
- priv->limit = op.count;
+
+ list_for_each_entry(map, &priv->maps, next) {
+ uint64_t begin = map->index << PAGE_SHIFT;
+ uint64_t end = (map->index + map->count) << PAGE_SHIFT;
+ if (op.index >= begin && op.index < end)
+ goto found;
+ }
+ rc = -ENOENT;
+ goto unlock_out;
+
+ found:
+ if ((op.action & UNMAP_NOTIFY_CLEAR_BYTE) &&
+ (map->flags & GNTMAP_readonly)) {
+ rc = -EINVAL;
+ goto unlock_out;
+ }
+
+ map->notify.flags = op.action;
+ map->notify.addr = op.index - (map->index << PAGE_SHIFT);
+ map->notify.event = op.event_channel_port;
+ rc = 0;
+ unlock_out:
spin_unlock(&priv->lock);
- return 0;
+ return rc;
}
static long gntdev_ioctl(struct file *flip,
@@ -551,8 +641,8 @@ static long gntdev_ioctl(struct file *flip,
case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
return gntdev_ioctl_get_offset_for_vaddr(priv, ptr);
- case IOCTL_GNTDEV_SET_MAX_GRANTS:
- return gntdev_ioctl_set_max_grants(priv, ptr);
+ case IOCTL_GNTDEV_SET_UNMAP_NOTIFY:
+ return gntdev_ioctl_notify(priv, ptr);
default:
pr_debug("priv %p, unknown cmd %x\n", priv, cmd);
@@ -568,7 +658,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
int index = vma->vm_pgoff;
int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
struct grant_map *map;
- int err = -EINVAL;
+ int i, err = -EINVAL;
if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
return -EINVAL;
@@ -580,47 +670,70 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
map = gntdev_find_map_index(priv, index, count);
if (!map)
goto unlock_out;
- if (map->vma)
+ if (use_ptemod && map->vma)
goto unlock_out;
- if (priv->mm != vma->vm_mm) {
+ if (use_ptemod && priv->mm != vma->vm_mm) {
printk(KERN_WARNING "Huh? Other mm?\n");
goto unlock_out;
}
+ atomic_inc(&map->users);
+
vma->vm_ops = &gntdev_vmops;
vma->vm_flags |= VM_RESERVED|VM_DONTCOPY|VM_DONTEXPAND|VM_PFNMAP;
vma->vm_private_data = map;
- map->vma = vma;
- map->flags = GNTMAP_host_map | GNTMAP_application_map;
- if (!(vma->vm_flags & VM_WRITE))
- map->flags |= GNTMAP_readonly;
+ if (use_ptemod)
+ map->vma = vma;
+
+ if (map->flags) {
+ if ((vma->vm_flags & VM_WRITE) &&
+ (map->flags & GNTMAP_readonly))
+ return -EINVAL;
+ } else {
+ map->flags = GNTMAP_host_map;
+ if (!(vma->vm_flags & VM_WRITE))
+ map->flags |= GNTMAP_readonly;
+ }
spin_unlock(&priv->lock);
- err = apply_to_page_range(vma->vm_mm, vma->vm_start,
- vma->vm_end - vma->vm_start,
- find_grant_ptes, map);
- if (err) {
- printk(KERN_WARNING "find_grant_ptes() failure.\n");
- return err;
+ if (use_ptemod) {
+ err = apply_to_page_range(vma->vm_mm, vma->vm_start,
+ vma->vm_end - vma->vm_start,
+ find_grant_ptes, map);
+ if (err) {
+ printk(KERN_WARNING "find_grant_ptes() failure.\n");
+ goto out_put_map;
+ }
}
err = map_grant_pages(map);
- if (err) {
- printk(KERN_WARNING "map_grant_pages() failure.\n");
- return err;
- }
+ if (err)
+ goto out_put_map;
- map->is_mapped = 1;
+ if (!use_ptemod) {
+ for (i = 0; i < count; i++) {
+ err = vm_insert_page(vma, vma->vm_start + i*PAGE_SIZE,
+ map->pages[i]);
+ if (err)
+ goto out_put_map;
+ }
+ }
return 0;
unlock_out:
spin_unlock(&priv->lock);
return err;
+
+out_put_map:
+ if (use_ptemod)
+ map->vma = NULL;
+ gntdev_put_map(map);
+ return err;
}
static const struct file_operations gntdev_fops = {
@@ -646,6 +759,8 @@ static int __init gntdev_init(void)
if (!xen_domain())
return -ENODEV;
+ use_ptemod = xen_pv_domain();
+
err = misc_register(&gntdev_miscdev);
if (err != 0) {
printk(KERN_ERR "Could not register gntdev device\n");
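
The gntdev rework has four strands: maps become reference-counted (the atomic "users" field) so a map can outlive the file that created it while a VMA still holds it; the per-file grant limit and IOCTL_GNTDEV_SET_MAX_GRANTS give way to one global pages_mapped counter checked against the module "limit" parameter; a use_ptemod switch selects between the PV path (grant mapping via PTE rewriting and MMU notifiers) and the HVM path (mapping by physical address plus vm_insert_page()); and the new IOCTL_GNTDEV_SET_UNMAP_NOTIFY mirrors gntalloc's unmap notification, with CLEAR_BYTE rejected on read-only mappings. A sketch of arming that notification from userspace, assuming the field names read by gntdev_ioctl_notify() above (index, action, event_channel_port); the helper is hypothetical:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <xen/gntdev.h>    /* assumed location of the ioctl ABI */

    /* Clear one byte of the mapping and fire an event channel when the
     * last grant in it is unmapped, letting the peer tell a crashed
     * client from a slow one.  "offset" is the index returned by the
     * map ioctl plus the byte offset of the flag byte within the
     * mapping; gntdev_ioctl_notify() splits it back apart. */
    static int arm_unmap_notify(int fd, uint64_t offset, int evtchn)
    {
            struct ioctl_gntdev_unmap_notify op;

            op.index = offset;
            op.action = UNMAP_NOTIFY_CLEAR_BYTE | UNMAP_NOTIFY_SEND_EVENT;
            op.event_channel_port = evtchn;
            return ioctl(fd, IOCTL_GNTDEV_SET_UNMAP_NOTIFY, &op);
    }
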
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 9ef54ebc1194..9428ced04807 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -458,6 +458,9 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
if (ret)
return ret;
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return ret;
+
for (i = 0; i < count; i++) {
/* m2p override only supported for GNTMAP_contains_pte mappings */
if (!(map_ops[i].flags & GNTMAP_contains_pte))
@@ -483,6 +486,9 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
if (ret)
return ret;
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return ret;
+
for (i = 0; i < count; i++) {
ret = m2p_remove_override(pages[i]);
if (ret)