diff options
Diffstat (limited to 'drivers/xen')
42 files changed, 3018 insertions, 174 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index f15bb3b789d5..d8dd54678ab7 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -196,6 +196,29 @@ config XEN_PCIDEV_BACKEND If in doubt, say m. +config XEN_PVCALLS_FRONTEND + tristate "XEN PV Calls frontend driver" + depends on INET && XEN + default n + select XEN_XENBUS_FRONTEND + help + Experimental frontend for the Xen PV Calls protocol + (https://xenbits.xen.org/docs/unstable/misc/pvcalls.html). It + sends a small set of POSIX calls to the backend, which + implements them. + +config XEN_PVCALLS_BACKEND + bool "XEN PV Calls backend driver" + depends on INET && XEN && XEN_BACKEND + default n + help + Experimental backend for the Xen PV Calls protocol + (https://xenbits.xen.org/docs/unstable/misc/pvcalls.html). It + allows PV Calls frontends to send POSIX calls to the backend, + which implements them. + + If in doubt, say n. + config XEN_SCSI_BACKEND tristate "XEN SCSI backend driver" depends on XEN && XEN_BACKEND && TARGET_CORE diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 8feab810aed9..451e833f5931 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_X86) += fallback.o obj-y += grant-table.o features.o balloon.o manage.o preempt.o time.o @@ -7,9 +8,6 @@ obj-y += xenbus/ nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_features.o := $(nostackp) -CFLAGS_efi.o += -fshort-wchar -LDFLAGS += $(call ld-option, --no-wchar-size-warning) - dom0-$(CONFIG_ARM64) += arm-device.o dom0-$(CONFIG_PCI) += pci.o dom0-$(CONFIG_USB_SUPPORT) += dbgp.o @@ -38,6 +36,8 @@ obj-$(CONFIG_XEN_ACPI_PROCESSOR) += xen-acpi-processor.o obj-$(CONFIG_XEN_EFI) += efi.o obj-$(CONFIG_XEN_SCSI_BACKEND) += xen-scsiback.o obj-$(CONFIG_XEN_AUTO_XLATE) += xlate_mmu.o +obj-$(CONFIG_XEN_PVCALLS_BACKEND) += pvcalls-back.o +obj-$(CONFIG_XEN_PVCALLS_FRONTEND) += pvcalls-front.o xen-evtchn-y := 
evtchn.o xen-gntdev-y := gntdev.o xen-gntalloc-y := gntalloc.o diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index ab609255a0f3..f77e499afddd 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -664,9 +664,11 @@ int alloc_xenballooned_pages(int nr_pages, struct page **pages) */ BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE); - ret = xen_alloc_p2m_entry(page_to_pfn(page)); - if (ret < 0) - goto out_undo; + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + ret = xen_alloc_p2m_entry(page_to_pfn(page)); + if (ret < 0) + goto out_undo; + } #endif } else { ret = add_ballooned_pages(nr_pages - pgno); diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c index 4da69dbf7dca..30d7f52eb7ca 100644 --- a/drivers/xen/biomerge.c +++ b/drivers/xen/biomerge.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/bio.h> #include <linux/io.h> #include <linux/export.h> @@ -10,8 +11,7 @@ bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, unsigned long bfn1 = pfn_to_bfn(page_to_pfn(vec1->bv_page)); unsigned long bfn2 = pfn_to_bfn(page_to_pfn(vec2->bv_page)); - return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) && - ((bfn1 == bfn2) || ((bfn1+1) == bfn2)); + return bfn1 + PFN_DOWN(vec1->bv_offset + vec1->bv_len) == bfn2; #else /* * XXX: Add support for merging bio_vec when using different page diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index 0003912a8111..d4265c8ebb22 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt #include <linux/notifier.h> diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c index bdff01095f54..8edef51c92e5 100644 --- a/drivers/xen/events/events_2l.c +++ b/drivers/xen/events/events_2l.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Xen event channels (2-level ABI) * diff --git a/drivers/xen/events/events_base.c 
b/drivers/xen/events/events_base.c index b241bfa529ce..1ab4bd11f5f3 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -343,14 +343,6 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) info->cpu = cpu; } -static void xen_evtchn_mask_all(void) -{ - unsigned int evtchn; - - for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) - mask_evtchn(evtchn); -} - /** * notify_remote_via_irq - send event to remote end of event channel via irq * @irq: irq of event channel to send event to @@ -582,7 +574,7 @@ static void shutdown_pirq(struct irq_data *data) static void enable_pirq(struct irq_data *data) { - startup_pirq(data); + enable_dynirq(data); } static void disable_pirq(struct irq_data *data) @@ -1573,7 +1565,6 @@ void xen_irq_resume(void) struct irq_info *info; /* New event-channel space is not 'live' yet. */ - xen_evtchn_mask_all(); xen_evtchn_resume(); /* No IRQ <-> event-channel mappings. */ @@ -1662,10 +1653,8 @@ void xen_callback_vector(void) return; } pr_info("Xen HVM callback vector for event delivery is enabled\n"); - /* in the restore case the vector has already been allocated */ - if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, - xen_hvm_callback_vector); + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, + xen_hvm_callback_vector); } } #else @@ -1681,6 +1670,7 @@ module_param(fifo_events, bool, 0); void __init xen_init_IRQ(void) { int ret = -EINVAL; + unsigned int evtchn; if (fifo_events) ret = xen_evtchn_fifo_init(); @@ -1692,7 +1682,8 @@ void __init xen_init_IRQ(void) BUG_ON(!evtchn_to_irq); /* No event channels are 'live' right now. 
*/ - xen_evtchn_mask_all(); + for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) + mask_evtchn(evtchn); pirq_needs_eoi = pirq_needs_eoi_flag; diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c index 3c41470c7fc4..76b318e88382 100644 --- a/drivers/xen/events/events_fifo.c +++ b/drivers/xen/events/events_fifo.c @@ -432,12 +432,12 @@ static int xen_evtchn_cpu_dead(unsigned int cpu) int __init xen_evtchn_fifo_init(void) { - int cpu = get_cpu(); + int cpu = smp_processor_id(); int ret; ret = evtchn_fifo_alloc_control_block(cpu); if (ret < 0) - goto out; + return ret; pr_info("Using FIFO-based ABI\n"); @@ -446,7 +446,6 @@ int __init xen_evtchn_fifo_init(void) cpuhp_setup_state_nocalls(CPUHP_XEN_EVTCHN_PREPARE, "xen/evtchn:prepare", xen_evtchn_cpu_prepare, xen_evtchn_cpu_dead); -out: - put_cpu(); + return ret; } diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index 1bf55a32a4b3..3fa40c723e8e 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -294,7 +294,7 @@ static long gntalloc_ioctl_alloc(struct gntalloc_file_private_data *priv, goto out; } - gref_ids = kcalloc(op.count, sizeof(gref_ids[0]), GFP_TEMPORARY); + gref_ids = kcalloc(op.count, sizeof(gref_ids[0]), GFP_KERNEL); if (!gref_ids) { rc = -ENOMEM; goto out; diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index f3bf8f4e2d6c..57efbd3b053b 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -484,13 +484,6 @@ static void mn_invl_range_start(struct mmu_notifier *mn, mutex_unlock(&priv->lock); } -static void mn_invl_page(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address) -{ - mn_invl_range_start(mn, mm, address, address + PAGE_SIZE); -} - static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm) { @@ -522,7 +515,6 @@ static void mn_release(struct mmu_notifier *mn, static const struct mmu_notifier_ops gntdev_mmu_ops = { .release = mn_release, - .invalidate_page = mn_invl_page, 
.invalidate_range_start = mn_invl_range_start, }; @@ -1032,6 +1024,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) mutex_unlock(&priv->lock); if (use_ptemod) { + map->pages_vm_start = vma->vm_start; err = apply_to_page_range(vma->vm_mm, vma->vm_start, vma->vm_end - vma->vm_start, find_grant_ptes, map); @@ -1069,7 +1062,6 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) set_grant_ptes_as_special, NULL); } #endif - map->pages_vm_start = vma->vm_start; } return 0; diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 2c6a9114d332..f45114fd8e1e 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -33,6 +33,7 @@ #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt +#include <linux/bootmem.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/slab.h> @@ -43,6 +44,7 @@ #include <linux/hardirq.h> #include <linux/workqueue.h> #include <linux/ratelimit.h> +#include <linux/moduleparam.h> #include <xen/xen.h> #include <xen/interface/xen.h> @@ -52,6 +54,9 @@ #include <xen/hvc-console.h> #include <xen/swiotlb-xen.h> #include <xen/balloon.h> +#ifdef CONFIG_X86 +#include <asm/xen/cpuid.h> +#endif #include <asm/xen/hypercall.h> #include <asm/xen/interface.h> @@ -68,15 +73,26 @@ static int gnttab_free_count; static grant_ref_t gnttab_free_head; static DEFINE_SPINLOCK(gnttab_list_lock); struct grant_frames xen_auto_xlat_grant_frames; +static unsigned int xen_gnttab_version; +module_param_named(version, xen_gnttab_version, uint, 0); static union { struct grant_entry_v1 *v1; + union grant_entry_v2 *v2; void *addr; } gnttab_shared; /*This is a structure of function pointers for grant table*/ struct gnttab_ops { /* + * Version of the grant interface. + */ + unsigned int version; + /* + * Grant refs per grant frame. + */ + unsigned int grefs_per_grant_frame; + /* * Mapping a list of frames for storing grant entries. 
Frames parameter * is used to store grant table address when grant table being setup, * nr_gframes is the number of frames to map grant table. Returning @@ -130,14 +146,15 @@ struct unmap_refs_callback_data { static const struct gnttab_ops *gnttab_interface; -static int grant_table_version; -static int grefs_per_grant_frame; +/* This reflects status of grant entries, so act as a global value. */ +static grant_status_t *grstatus; static struct gnttab_free_callback *gnttab_free_callback_list; static int gnttab_expand(unsigned int req_entries); #define RPP (PAGE_SIZE / sizeof(grant_ref_t)) +#define SPP (PAGE_SIZE / sizeof(grant_status_t)) static inline grant_ref_t *__gnttab_entry(grant_ref_t entry) { @@ -210,7 +227,7 @@ static void put_free_entry(grant_ref_t ref) } /* - * Following applies to gnttab_update_entry_v1. + * Following applies to gnttab_update_entry_v1 and gnttab_update_entry_v2. * Introducing a valid entry into the grant table: * 1. Write ent->domid. * 2. Write ent->frame: @@ -229,6 +246,15 @@ static void gnttab_update_entry_v1(grant_ref_t ref, domid_t domid, gnttab_shared.v1[ref].flags = flags; } +static void gnttab_update_entry_v2(grant_ref_t ref, domid_t domid, + unsigned long frame, unsigned int flags) +{ + gnttab_shared.v2[ref].hdr.domid = domid; + gnttab_shared.v2[ref].full_page.frame = frame; + wmb(); /* Hypervisor concurrent accesses. 
*/ + gnttab_shared.v2[ref].hdr.flags = GTF_permit_access | flags; +} + /* * Public grant-issuing interface functions */ @@ -260,6 +286,11 @@ static int gnttab_query_foreign_access_v1(grant_ref_t ref) return gnttab_shared.v1[ref].flags & (GTF_reading|GTF_writing); } +static int gnttab_query_foreign_access_v2(grant_ref_t ref) +{ + return grstatus[ref] & (GTF_reading|GTF_writing); +} + int gnttab_query_foreign_access(grant_ref_t ref) { return gnttab_interface->query_foreign_access(ref); @@ -282,6 +313,29 @@ static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly) return 1; } +static int gnttab_end_foreign_access_ref_v2(grant_ref_t ref, int readonly) +{ + gnttab_shared.v2[ref].hdr.flags = 0; + mb(); /* Concurrent access by hypervisor. */ + if (grstatus[ref] & (GTF_reading|GTF_writing)) { + return 0; + } else { + /* + * The read of grstatus needs to have acquire semantics. + * On x86, reads already have that, and we just need to + * protect against compiler reorderings. + * On other architectures we may need a full barrier. 
+ */ +#ifdef CONFIG_X86 + barrier(); +#else + mb(); +#endif + } + + return 1; +} + static inline int _gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) { return gnttab_interface->end_foreign_access_ref(ref, readonly); @@ -304,10 +358,10 @@ struct deferred_entry { struct page *page; }; static LIST_HEAD(deferred_list); -static void gnttab_handle_deferred(unsigned long); -static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred, 0, 0); +static void gnttab_handle_deferred(struct timer_list *); +static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred); -static void gnttab_handle_deferred(unsigned long unused) +static void gnttab_handle_deferred(struct timer_list *unused) { unsigned int nr = 10; struct deferred_entry *first = NULL; @@ -442,6 +496,37 @@ static unsigned long gnttab_end_foreign_transfer_ref_v1(grant_ref_t ref) return frame; } +static unsigned long gnttab_end_foreign_transfer_ref_v2(grant_ref_t ref) +{ + unsigned long frame; + u16 flags; + u16 *pflags; + + pflags = &gnttab_shared.v2[ref].hdr.flags; + + /* + * If a transfer is not even yet started, try to reclaim the grant + * reference and return failure (== 0). + */ + while (!((flags = *pflags) & GTF_transfer_committed)) { + if (sync_cmpxchg(pflags, flags, 0) == flags) + return 0; + cpu_relax(); + } + + /* If a transfer is in progress then wait until it is completed. */ + while (!(flags & GTF_transfer_completed)) { + flags = *pflags; + cpu_relax(); + } + + rmb(); /* Read the frame number /after/ reading completion status. 
*/ + frame = gnttab_shared.v2[ref].full_page.frame; + BUG_ON(frame == 0); + + return frame; +} + unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref) { return gnttab_interface->end_foreign_transfer_ref(ref); @@ -563,19 +648,26 @@ void gnttab_cancel_free_callback(struct gnttab_free_callback *callback) } EXPORT_SYMBOL_GPL(gnttab_cancel_free_callback); +static unsigned int gnttab_frames(unsigned int frames, unsigned int align) +{ + return (frames * gnttab_interface->grefs_per_grant_frame + align - 1) / + align; +} + static int grow_gnttab_list(unsigned int more_frames) { unsigned int new_nr_grant_frames, extra_entries, i; unsigned int nr_glist_frames, new_nr_glist_frames; + unsigned int grefs_per_frame; - BUG_ON(grefs_per_grant_frame == 0); + BUG_ON(gnttab_interface == NULL); + grefs_per_frame = gnttab_interface->grefs_per_grant_frame; new_nr_grant_frames = nr_grant_frames + more_frames; - extra_entries = more_frames * grefs_per_grant_frame; + extra_entries = more_frames * grefs_per_frame; - nr_glist_frames = (nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; - new_nr_glist_frames = - (new_nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; + nr_glist_frames = gnttab_frames(nr_grant_frames, RPP); + new_nr_glist_frames = gnttab_frames(new_nr_grant_frames, RPP); for (i = nr_glist_frames; i < new_nr_glist_frames; i++) { gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC); if (!gnttab_list[i]) @@ -583,12 +675,12 @@ static int grow_gnttab_list(unsigned int more_frames) } - for (i = grefs_per_grant_frame * nr_grant_frames; - i < grefs_per_grant_frame * new_nr_grant_frames - 1; i++) + for (i = grefs_per_frame * nr_grant_frames; + i < grefs_per_frame * new_nr_grant_frames - 1; i++) gnttab_entry(i) = i + 1; gnttab_entry(i) = gnttab_free_head; - gnttab_free_head = grefs_per_grant_frame * nr_grant_frames; + gnttab_free_head = grefs_per_frame * nr_grant_frames; gnttab_free_count += extra_entries; nr_grant_frames = new_nr_grant_frames; @@ -938,6 
+1030,12 @@ int gnttab_unmap_refs_sync(struct gntab_unmap_queue_data *item) } EXPORT_SYMBOL_GPL(gnttab_unmap_refs_sync); +static unsigned int nr_status_frames(unsigned int nr_grant_frames) +{ + BUG_ON(gnttab_interface == NULL); + return gnttab_frames(nr_grant_frames, SPP); +} + static int gnttab_map_frames_v1(xen_pfn_t *frames, unsigned int nr_gframes) { int rc; @@ -955,6 +1053,55 @@ static void gnttab_unmap_frames_v1(void) arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames); } +static int gnttab_map_frames_v2(xen_pfn_t *frames, unsigned int nr_gframes) +{ + uint64_t *sframes; + unsigned int nr_sframes; + struct gnttab_get_status_frames getframes; + int rc; + + nr_sframes = nr_status_frames(nr_gframes); + + /* No need for kzalloc as it is initialized in following hypercall + * GNTTABOP_get_status_frames. + */ + sframes = kmalloc_array(nr_sframes, sizeof(uint64_t), GFP_ATOMIC); + if (!sframes) + return -ENOMEM; + + getframes.dom = DOMID_SELF; + getframes.nr_frames = nr_sframes; + set_xen_guest_handle(getframes.frame_list, sframes); + + rc = HYPERVISOR_grant_table_op(GNTTABOP_get_status_frames, + &getframes, 1); + if (rc == -ENOSYS) { + kfree(sframes); + return -ENOSYS; + } + + BUG_ON(rc || getframes.status); + + rc = arch_gnttab_map_status(sframes, nr_sframes, + nr_status_frames(gnttab_max_grant_frames()), + &grstatus); + BUG_ON(rc); + kfree(sframes); + + rc = arch_gnttab_map_shared(frames, nr_gframes, + gnttab_max_grant_frames(), + &gnttab_shared.addr); + BUG_ON(rc); + + return 0; +} + +static void gnttab_unmap_frames_v2(void) +{ + arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames); + arch_gnttab_unmap(grstatus, nr_status_frames(nr_grant_frames)); +} + static int gnttab_map(unsigned int start_idx, unsigned int end_idx) { struct gnttab_setup_table setup; @@ -1014,6 +1161,9 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) } static const struct gnttab_ops gnttab_v1_ops = { + .version = 1, + .grefs_per_grant_frame = XEN_PAGE_SIZE / + 
sizeof(struct grant_entry_v1), .map_frames = gnttab_map_frames_v1, .unmap_frames = gnttab_unmap_frames_v1, .update_entry = gnttab_update_entry_v1, @@ -1022,14 +1172,56 @@ static const struct gnttab_ops gnttab_v1_ops = { .query_foreign_access = gnttab_query_foreign_access_v1, }; -static void gnttab_request_version(void) +static const struct gnttab_ops gnttab_v2_ops = { + .version = 2, + .grefs_per_grant_frame = XEN_PAGE_SIZE / + sizeof(union grant_entry_v2), + .map_frames = gnttab_map_frames_v2, + .unmap_frames = gnttab_unmap_frames_v2, + .update_entry = gnttab_update_entry_v2, + .end_foreign_access_ref = gnttab_end_foreign_access_ref_v2, + .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v2, + .query_foreign_access = gnttab_query_foreign_access_v2, +}; + +static bool gnttab_need_v2(void) { - /* Only version 1 is used, which will always be available. */ - grant_table_version = 1; - grefs_per_grant_frame = XEN_PAGE_SIZE / sizeof(struct grant_entry_v1); - gnttab_interface = &gnttab_v1_ops; +#ifdef CONFIG_X86 + uint32_t base, width; + + if (xen_pv_domain()) { + base = xen_cpuid_base(); + if (cpuid_eax(base) < 5) + return false; /* Information not available, use V1. */ + width = cpuid_ebx(base + 5) & + XEN_CPUID_MACHINE_ADDRESS_WIDTH_MASK; + return width > 32 + PAGE_SHIFT; + } +#endif + return !!(max_possible_pfn >> 32); +} - pr_info("Grant tables using version %d layout\n", grant_table_version); +static void gnttab_request_version(void) +{ + long rc; + struct gnttab_set_version gsv; + + if (gnttab_need_v2()) + gsv.version = 2; + else + gsv.version = 1; + + /* Boot parameter overrides automatic selection. 
*/ + if (xen_gnttab_version >= 1 && xen_gnttab_version <= 2) + gsv.version = xen_gnttab_version; + + rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1); + if (rc == 0 && gsv.version == 2) + gnttab_interface = &gnttab_v2_ops; + else + gnttab_interface = &gnttab_v1_ops; + pr_info("Grant tables using version %d layout\n", + gnttab_interface->version); } static int gnttab_setup(void) @@ -1069,10 +1261,10 @@ static int gnttab_expand(unsigned int req_entries) int rc; unsigned int cur, extra; - BUG_ON(grefs_per_grant_frame == 0); + BUG_ON(gnttab_interface == NULL); cur = nr_grant_frames; - extra = ((req_entries + (grefs_per_grant_frame-1)) / - grefs_per_grant_frame); + extra = ((req_entries + gnttab_interface->grefs_per_grant_frame - 1) / + gnttab_interface->grefs_per_grant_frame); if (cur + extra > gnttab_max_grant_frames()) { pr_warn_ratelimited("xen/grant-table: max_grant_frames reached" " cur=%u extra=%u limit=%u" @@ -1104,16 +1296,16 @@ int gnttab_init(void) /* Determine the maximum number of frames required for the * grant reference free list on the current hypervisor. 
*/ - BUG_ON(grefs_per_grant_frame == 0); + BUG_ON(gnttab_interface == NULL); max_nr_glist_frames = (max_nr_grant_frames * - grefs_per_grant_frame / RPP); + gnttab_interface->grefs_per_grant_frame / RPP); gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *), GFP_KERNEL); if (gnttab_list == NULL) return -ENOMEM; - nr_glist_frames = (nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; + nr_glist_frames = gnttab_frames(nr_grant_frames, RPP); for (i = 0; i < nr_glist_frames; i++) { gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL); if (gnttab_list[i] == NULL) { @@ -1122,7 +1314,8 @@ int gnttab_init(void) } } - ret = arch_gnttab_init(max_nr_grant_frames); + ret = arch_gnttab_init(max_nr_grant_frames, + nr_status_frames(max_nr_grant_frames)); if (ret < 0) goto ini_nomem; @@ -1131,7 +1324,8 @@ int gnttab_init(void) goto ini_nomem; } - nr_init_grefs = nr_grant_frames * grefs_per_grant_frame; + nr_init_grefs = nr_grant_frames * + gnttab_interface->grefs_per_grant_frame; for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++) gnttab_entry(i) = i + 1; diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index c425d03d37d2..8835065029d3 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -72,18 +72,15 @@ static int xen_suspend(void *data) } gnttab_suspend(); + xen_manage_runstate_time(-1); xen_arch_pre_suspend(); - /* - * This hypercall returns 1 if suspend was cancelled - * or the domain was merely checkpointed, and 0 if it - * is resuming in a new domain. - */ si->cancelled = HYPERVISOR_suspend(xen_pv_domain() ? virt_to_gfn(xen_start_info) : 0); xen_arch_post_suspend(si->cancelled); + xen_manage_runstate_time(si->cancelled ? 
1 : 0); gnttab_resume(); if (!si->cancelled) { diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index 1275df83070f..5d7dcad0b0a0 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -175,7 +175,7 @@ pci_out: return ret; } -static struct pci_device_id platform_pci_tbl[] = { +static const struct pci_device_id platform_pci_tbl[] = { {PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, {0,} diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index feca75b07fdd..1c909183c42a 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -191,13 +191,10 @@ static int traverse_pages_block(unsigned nelem, size_t size, void *state) { void *pagedata; - unsigned pageidx; int ret = 0; BUG_ON(size > PAGE_SIZE); - pageidx = PAGE_SIZE; - while (nelem) { int nr = (PAGE_SIZE/size); struct page *page; diff --git a/drivers/xen/pvcalls-back.c b/drivers/xen/pvcalls-back.c new file mode 100644 index 000000000000..c7822d8078b9 --- /dev/null +++ b/drivers/xen/pvcalls-back.c @@ -0,0 +1,1236 @@ +/* + * (c) 2017 Stefano Stabellini <stefano@aporeto.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include <linux/inet.h> +#include <linux/kthread.h> +#include <linux/list.h> +#include <linux/radix-tree.h> +#include <linux/module.h> +#include <linux/semaphore.h> +#include <linux/wait.h> +#include <net/sock.h> +#include <net/inet_common.h> +#include <net/inet_connection_sock.h> +#include <net/request_sock.h> + +#include <xen/events.h> +#include <xen/grant_table.h> +#include <xen/xen.h> +#include <xen/xenbus.h> +#include <xen/interface/io/pvcalls.h> + +#define PVCALLS_VERSIONS "1" +#define MAX_RING_ORDER XENBUS_MAX_RING_GRANT_ORDER + +struct pvcalls_back_global { + struct list_head frontends; + struct semaphore frontends_lock; +} pvcalls_back_global; + +/* + * Per-frontend data structure. It contains pointers to the command + * ring, its event channel, a list of active sockets and a tree of + * passive sockets. + */ +struct pvcalls_fedata { + struct list_head list; + struct xenbus_device *dev; + struct xen_pvcalls_sring *sring; + struct xen_pvcalls_back_ring ring; + int irq; + struct list_head socket_mappings; + struct radix_tree_root socketpass_mappings; + struct semaphore socket_lock; +}; + +struct pvcalls_ioworker { + struct work_struct register_work; + struct workqueue_struct *wq; +}; + +struct sock_mapping { + struct list_head list; + struct pvcalls_fedata *fedata; + struct sockpass_mapping *sockpass; + struct socket *sock; + uint64_t id; + grant_ref_t ref; + struct pvcalls_data_intf *ring; + void *bytes; + struct pvcalls_data data; + uint32_t ring_order; + int irq; + atomic_t read; + atomic_t write; + atomic_t io; + atomic_t release; + void (*saved_data_ready)(struct sock *sk); + struct pvcalls_ioworker ioworker; +}; + +struct sockpass_mapping { + struct list_head list; + struct pvcalls_fedata *fedata; + struct socket *sock; + uint64_t id; + struct xen_pvcalls_request reqcopy; + spinlock_t copy_lock; + struct workqueue_struct *wq; + struct work_struct register_work; + void (*saved_data_ready)(struct sock *sk); +}; + +static irqreturn_t 
pvcalls_back_conn_event(int irq, void *sock_map); +static int pvcalls_back_release_active(struct xenbus_device *dev, + struct pvcalls_fedata *fedata, + struct sock_mapping *map); + +static void pvcalls_conn_back_read(void *opaque) +{ + struct sock_mapping *map = (struct sock_mapping *)opaque; + struct msghdr msg; + struct kvec vec[2]; + RING_IDX cons, prod, size, wanted, array_size, masked_prod, masked_cons; + int32_t error; + struct pvcalls_data_intf *intf = map->ring; + struct pvcalls_data *data = &map->data; + unsigned long flags; + int ret; + + array_size = XEN_FLEX_RING_SIZE(map->ring_order); + cons = intf->in_cons; + prod = intf->in_prod; + error = intf->in_error; + /* read the indexes first, then deal with the data */ + virt_mb(); + + if (error) + return; + + size = pvcalls_queued(prod, cons, array_size); + if (size >= array_size) + return; + spin_lock_irqsave(&map->sock->sk->sk_receive_queue.lock, flags); + if (skb_queue_empty(&map->sock->sk->sk_receive_queue)) { + atomic_set(&map->read, 0); + spin_unlock_irqrestore(&map->sock->sk->sk_receive_queue.lock, + flags); + return; + } + spin_unlock_irqrestore(&map->sock->sk->sk_receive_queue.lock, flags); + wanted = array_size - size; + masked_prod = pvcalls_mask(prod, array_size); + masked_cons = pvcalls_mask(cons, array_size); + + memset(&msg, 0, sizeof(msg)); + if (masked_prod < masked_cons) { + vec[0].iov_base = data->in + masked_prod; + vec[0].iov_len = wanted; + iov_iter_kvec(&msg.msg_iter, ITER_KVEC|WRITE, vec, 1, wanted); + } else { + vec[0].iov_base = data->in + masked_prod; + vec[0].iov_len = array_size - masked_prod; + vec[1].iov_base = data->in; + vec[1].iov_len = wanted - vec[0].iov_len; + iov_iter_kvec(&msg.msg_iter, ITER_KVEC|WRITE, vec, 2, wanted); + } + + atomic_set(&map->read, 0); + ret = inet_recvmsg(map->sock, &msg, wanted, MSG_DONTWAIT); + WARN_ON(ret > wanted); + if (ret == -EAGAIN) /* shouldn't happen */ + return; + if (!ret) + ret = -ENOTCONN; + 
spin_lock_irqsave(&map->sock->sk->sk_receive_queue.lock, flags); + if (ret > 0 && !skb_queue_empty(&map->sock->sk->sk_receive_queue)) + atomic_inc(&map->read); + spin_unlock_irqrestore(&map->sock->sk->sk_receive_queue.lock, flags); + + /* write the data, then modify the indexes */ + virt_wmb(); + if (ret < 0) + intf->in_error = ret; + else + intf->in_prod = prod + ret; + /* update the indexes, then notify the other end */ + virt_wmb(); + notify_remote_via_irq(map->irq); + + return; +} + +static void pvcalls_conn_back_write(struct sock_mapping *map) +{ + struct pvcalls_data_intf *intf = map->ring; + struct pvcalls_data *data = &map->data; + struct msghdr msg; + struct kvec vec[2]; + RING_IDX cons, prod, size, array_size; + int ret; + + cons = intf->out_cons; + prod = intf->out_prod; + /* read the indexes before dealing with the data */ + virt_mb(); + + array_size = XEN_FLEX_RING_SIZE(map->ring_order); + size = pvcalls_queued(prod, cons, array_size); + if (size == 0) + return; + + memset(&msg, 0, sizeof(msg)); + msg.msg_flags |= MSG_DONTWAIT; + if (pvcalls_mask(prod, array_size) > pvcalls_mask(cons, array_size)) { + vec[0].iov_base = data->out + pvcalls_mask(cons, array_size); + vec[0].iov_len = size; + iov_iter_kvec(&msg.msg_iter, ITER_KVEC|READ, vec, 1, size); + } else { + vec[0].iov_base = data->out + pvcalls_mask(cons, array_size); + vec[0].iov_len = array_size - pvcalls_mask(cons, array_size); + vec[1].iov_base = data->out; + vec[1].iov_len = size - vec[0].iov_len; + iov_iter_kvec(&msg.msg_iter, ITER_KVEC|READ, vec, 2, size); + } + + atomic_set(&map->write, 0); + ret = inet_sendmsg(map->sock, &msg, size); + if (ret == -EAGAIN || (ret >= 0 && ret < size)) { + atomic_inc(&map->write); + atomic_inc(&map->io); + } + if (ret == -EAGAIN) + return; + + /* write the data, then update the indexes */ + virt_wmb(); + if (ret < 0) { + intf->out_error = ret; + } else { + intf->out_error = 0; + intf->out_cons = cons + ret; + prod = intf->out_prod; + } + /* update the indexes, 
then notify the other end */ + virt_wmb(); + if (prod != cons + ret) + atomic_inc(&map->write); + notify_remote_via_irq(map->irq); +} + +static void pvcalls_back_ioworker(struct work_struct *work) +{ + struct pvcalls_ioworker *ioworker = container_of(work, + struct pvcalls_ioworker, register_work); + struct sock_mapping *map = container_of(ioworker, struct sock_mapping, + ioworker); + + while (atomic_read(&map->io) > 0) { + if (atomic_read(&map->release) > 0) { + atomic_set(&map->release, 0); + return; + } + + if (atomic_read(&map->read) > 0) + pvcalls_conn_back_read(map); + if (atomic_read(&map->write) > 0) + pvcalls_conn_back_write(map); + + atomic_dec(&map->io); + } +} + +static int pvcalls_back_socket(struct xenbus_device *dev, + struct xen_pvcalls_request *req) +{ + struct pvcalls_fedata *fedata; + int ret; + struct xen_pvcalls_response *rsp; + + fedata = dev_get_drvdata(&dev->dev); + + if (req->u.socket.domain != AF_INET || + req->u.socket.type != SOCK_STREAM || + (req->u.socket.protocol != IPPROTO_IP && + req->u.socket.protocol != AF_INET)) + ret = -EAFNOSUPPORT; + else + ret = 0; + + /* leave the actual socket allocation for later */ + + rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++); + rsp->req_id = req->req_id; + rsp->cmd = req->cmd; + rsp->u.socket.id = req->u.socket.id; + rsp->ret = ret; + + return 0; +} + +static void pvcalls_sk_state_change(struct sock *sock) +{ + struct sock_mapping *map = sock->sk_user_data; + struct pvcalls_data_intf *intf; + + if (map == NULL) + return; + + intf = map->ring; + intf->in_error = -ENOTCONN; + notify_remote_via_irq(map->irq); +} + +static void pvcalls_sk_data_ready(struct sock *sock) +{ + struct sock_mapping *map = sock->sk_user_data; + struct pvcalls_ioworker *iow; + + if (map == NULL) + return; + + iow = &map->ioworker; + atomic_inc(&map->read); + atomic_inc(&map->io); + queue_work(iow->wq, &iow->register_work); +} + +static struct sock_mapping *pvcalls_new_active_socket( + struct pvcalls_fedata 
*fedata, + uint64_t id, + grant_ref_t ref, + uint32_t evtchn, + struct socket *sock) +{ + int ret; + struct sock_mapping *map; + void *page; + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (map == NULL) + return NULL; + + map->fedata = fedata; + map->sock = sock; + map->id = id; + map->ref = ref; + + ret = xenbus_map_ring_valloc(fedata->dev, &ref, 1, &page); + if (ret < 0) + goto out; + map->ring = page; + map->ring_order = map->ring->ring_order; + /* first read the order, then map the data ring */ + virt_rmb(); + if (map->ring_order > MAX_RING_ORDER) { + pr_warn("%s frontend requested ring_order %u, which is > MAX (%u)\n", + __func__, map->ring_order, MAX_RING_ORDER); + goto out; + } + ret = xenbus_map_ring_valloc(fedata->dev, map->ring->ref, + (1 << map->ring_order), &page); + if (ret < 0) + goto out; + map->bytes = page; + + ret = bind_interdomain_evtchn_to_irqhandler(fedata->dev->otherend_id, + evtchn, + pvcalls_back_conn_event, + 0, + "pvcalls-backend", + map); + if (ret < 0) + goto out; + map->irq = ret; + + map->data.in = map->bytes; + map->data.out = map->bytes + XEN_FLEX_RING_SIZE(map->ring_order); + + map->ioworker.wq = alloc_workqueue("pvcalls_io", WQ_UNBOUND, 1); + if (!map->ioworker.wq) + goto out; + atomic_set(&map->io, 1); + INIT_WORK(&map->ioworker.register_work, pvcalls_back_ioworker); + + down(&fedata->socket_lock); + list_add_tail(&map->list, &fedata->socket_mappings); + up(&fedata->socket_lock); + + write_lock_bh(&map->sock->sk->sk_callback_lock); + map->saved_data_ready = map->sock->sk->sk_data_ready; + map->sock->sk->sk_user_data = map; + map->sock->sk->sk_data_ready = pvcalls_sk_data_ready; + map->sock->sk->sk_state_change = pvcalls_sk_state_change; + write_unlock_bh(&map->sock->sk->sk_callback_lock); + + return map; +out: + down(&fedata->socket_lock); + list_del(&map->list); + pvcalls_back_release_active(fedata->dev, fedata, map); + up(&fedata->socket_lock); + return NULL; +} + +static int pvcalls_back_connect(struct xenbus_device *dev, + 
struct xen_pvcalls_request *req) +{ + struct pvcalls_fedata *fedata; + int ret = -EINVAL; + struct socket *sock; + struct sock_mapping *map; + struct xen_pvcalls_response *rsp; + struct sockaddr *sa = (struct sockaddr *)&req->u.connect.addr; + + fedata = dev_get_drvdata(&dev->dev); + + if (req->u.connect.len < sizeof(sa->sa_family) || + req->u.connect.len > sizeof(req->u.connect.addr) || + sa->sa_family != AF_INET) + goto out; + + ret = sock_create(AF_INET, SOCK_STREAM, 0, &sock); + if (ret < 0) + goto out; + ret = inet_stream_connect(sock, sa, req->u.connect.len, 0); + if (ret < 0) { + sock_release(sock); + goto out; + } + + map = pvcalls_new_active_socket(fedata, + req->u.connect.id, + req->u.connect.ref, + req->u.connect.evtchn, + sock); + if (!map) { + ret = -EFAULT; + sock_release(map->sock); + } + +out: + rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++); + rsp->req_id = req->req_id; + rsp->cmd = req->cmd; + rsp->u.connect.id = req->u.connect.id; + rsp->ret = ret; + + return 0; +} + +static int pvcalls_back_release_active(struct xenbus_device *dev, + struct pvcalls_fedata *fedata, + struct sock_mapping *map) +{ + disable_irq(map->irq); + if (map->sock->sk != NULL) { + write_lock_bh(&map->sock->sk->sk_callback_lock); + map->sock->sk->sk_user_data = NULL; + map->sock->sk->sk_data_ready = map->saved_data_ready; + write_unlock_bh(&map->sock->sk->sk_callback_lock); + } + + atomic_set(&map->release, 1); + flush_work(&map->ioworker.register_work); + + xenbus_unmap_ring_vfree(dev, map->bytes); + xenbus_unmap_ring_vfree(dev, (void *)map->ring); + unbind_from_irqhandler(map->irq, map); + + sock_release(map->sock); + kfree(map); + + return 0; +} + +static int pvcalls_back_release_passive(struct xenbus_device *dev, + struct pvcalls_fedata *fedata, + struct sockpass_mapping *mappass) +{ + if (mappass->sock->sk != NULL) { + write_lock_bh(&mappass->sock->sk->sk_callback_lock); + mappass->sock->sk->sk_user_data = NULL; + mappass->sock->sk->sk_data_ready = 
mappass->saved_data_ready; + write_unlock_bh(&mappass->sock->sk->sk_callback_lock); + } + sock_release(mappass->sock); + flush_workqueue(mappass->wq); + destroy_workqueue(mappass->wq); + kfree(mappass); + + return 0; +} + +static int pvcalls_back_release(struct xenbus_device *dev, + struct xen_pvcalls_request *req) +{ + struct pvcalls_fedata *fedata; + struct sock_mapping *map, *n; + struct sockpass_mapping *mappass; + int ret = 0; + struct xen_pvcalls_response *rsp; + + fedata = dev_get_drvdata(&dev->dev); + + down(&fedata->socket_lock); + list_for_each_entry_safe(map, n, &fedata->socket_mappings, list) { + if (map->id == req->u.release.id) { + list_del(&map->list); + up(&fedata->socket_lock); + ret = pvcalls_back_release_active(dev, fedata, map); + goto out; + } + } + mappass = radix_tree_lookup(&fedata->socketpass_mappings, + req->u.release.id); + if (mappass != NULL) { + radix_tree_delete(&fedata->socketpass_mappings, mappass->id); + up(&fedata->socket_lock); + ret = pvcalls_back_release_passive(dev, fedata, mappass); + } else + up(&fedata->socket_lock); + +out: + rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++); + rsp->req_id = req->req_id; + rsp->u.release.id = req->u.release.id; + rsp->cmd = req->cmd; + rsp->ret = ret; + return 0; +} + +static void __pvcalls_back_accept(struct work_struct *work) +{ + struct sockpass_mapping *mappass = container_of( + work, struct sockpass_mapping, register_work); + struct sock_mapping *map; + struct pvcalls_ioworker *iow; + struct pvcalls_fedata *fedata; + struct socket *sock; + struct xen_pvcalls_response *rsp; + struct xen_pvcalls_request *req; + int notify; + int ret = -EINVAL; + unsigned long flags; + + fedata = mappass->fedata; + /* + * __pvcalls_back_accept can race against pvcalls_back_accept. + * We only need to check the value of "cmd" on read. It could be + * done atomically, but to simplify the code on the write side, we + * use a spinlock. 
+ */ + spin_lock_irqsave(&mappass->copy_lock, flags); + req = &mappass->reqcopy; + if (req->cmd != PVCALLS_ACCEPT) { + spin_unlock_irqrestore(&mappass->copy_lock, flags); + return; + } + spin_unlock_irqrestore(&mappass->copy_lock, flags); + + sock = sock_alloc(); + if (sock == NULL) + goto out_error; + sock->type = mappass->sock->type; + sock->ops = mappass->sock->ops; + + ret = inet_accept(mappass->sock, sock, O_NONBLOCK, true); + if (ret == -EAGAIN) { + sock_release(sock); + goto out_error; + } + + map = pvcalls_new_active_socket(fedata, + req->u.accept.id_new, + req->u.accept.ref, + req->u.accept.evtchn, + sock); + if (!map) { + ret = -EFAULT; + sock_release(sock); + goto out_error; + } + + map->sockpass = mappass; + iow = &map->ioworker; + atomic_inc(&map->read); + atomic_inc(&map->io); + queue_work(iow->wq, &iow->register_work); + +out_error: + rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++); + rsp->req_id = req->req_id; + rsp->cmd = req->cmd; + rsp->u.accept.id = req->u.accept.id; + rsp->ret = ret; + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&fedata->ring, notify); + if (notify) + notify_remote_via_irq(fedata->irq); + + mappass->reqcopy.cmd = 0; +} + +static void pvcalls_pass_sk_data_ready(struct sock *sock) +{ + struct sockpass_mapping *mappass = sock->sk_user_data; + struct pvcalls_fedata *fedata; + struct xen_pvcalls_response *rsp; + unsigned long flags; + int notify; + + if (mappass == NULL) + return; + + fedata = mappass->fedata; + spin_lock_irqsave(&mappass->copy_lock, flags); + if (mappass->reqcopy.cmd == PVCALLS_POLL) { + rsp = RING_GET_RESPONSE(&fedata->ring, + fedata->ring.rsp_prod_pvt++); + rsp->req_id = mappass->reqcopy.req_id; + rsp->u.poll.id = mappass->reqcopy.u.poll.id; + rsp->cmd = mappass->reqcopy.cmd; + rsp->ret = 0; + + mappass->reqcopy.cmd = 0; + spin_unlock_irqrestore(&mappass->copy_lock, flags); + + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&fedata->ring, notify); + if (notify) + notify_remote_via_irq(mappass->fedata->irq); + 
} else { + spin_unlock_irqrestore(&mappass->copy_lock, flags); + queue_work(mappass->wq, &mappass->register_work); + } +} + +static int pvcalls_back_bind(struct xenbus_device *dev, + struct xen_pvcalls_request *req) +{ + struct pvcalls_fedata *fedata; + int ret; + struct sockpass_mapping *map; + struct xen_pvcalls_response *rsp; + + fedata = dev_get_drvdata(&dev->dev); + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (map == NULL) { + ret = -ENOMEM; + goto out; + } + + INIT_WORK(&map->register_work, __pvcalls_back_accept); + spin_lock_init(&map->copy_lock); + map->wq = alloc_workqueue("pvcalls_wq", WQ_UNBOUND, 1); + if (!map->wq) { + ret = -ENOMEM; + goto out; + } + + ret = sock_create(AF_INET, SOCK_STREAM, 0, &map->sock); + if (ret < 0) + goto out; + + ret = inet_bind(map->sock, (struct sockaddr *)&req->u.bind.addr, + req->u.bind.len); + if (ret < 0) + goto out; + + map->fedata = fedata; + map->id = req->u.bind.id; + + down(&fedata->socket_lock); + ret = radix_tree_insert(&fedata->socketpass_mappings, map->id, + map); + up(&fedata->socket_lock); + if (ret) + goto out; + + write_lock_bh(&map->sock->sk->sk_callback_lock); + map->saved_data_ready = map->sock->sk->sk_data_ready; + map->sock->sk->sk_user_data = map; + map->sock->sk->sk_data_ready = pvcalls_pass_sk_data_ready; + write_unlock_bh(&map->sock->sk->sk_callback_lock); + +out: + if (ret) { + if (map && map->sock) + sock_release(map->sock); + if (map && map->wq) + destroy_workqueue(map->wq); + kfree(map); + } + rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++); + rsp->req_id = req->req_id; + rsp->cmd = req->cmd; + rsp->u.bind.id = req->u.bind.id; + rsp->ret = ret; + return 0; +} + +static int pvcalls_back_listen(struct xenbus_device *dev, + struct xen_pvcalls_request *req) +{ + struct pvcalls_fedata *fedata; + int ret = -EINVAL; + struct sockpass_mapping *map; + struct xen_pvcalls_response *rsp; + + fedata = dev_get_drvdata(&dev->dev); + + down(&fedata->socket_lock); + map = 
radix_tree_lookup(&fedata->socketpass_mappings, req->u.listen.id); + up(&fedata->socket_lock); + if (map == NULL) + goto out; + + ret = inet_listen(map->sock, req->u.listen.backlog); + +out: + rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++); + rsp->req_id = req->req_id; + rsp->cmd = req->cmd; + rsp->u.listen.id = req->u.listen.id; + rsp->ret = ret; + return 0; +} + +static int pvcalls_back_accept(struct xenbus_device *dev, + struct xen_pvcalls_request *req) +{ + struct pvcalls_fedata *fedata; + struct sockpass_mapping *mappass; + int ret = -EINVAL; + struct xen_pvcalls_response *rsp; + unsigned long flags; + + fedata = dev_get_drvdata(&dev->dev); + + down(&fedata->socket_lock); + mappass = radix_tree_lookup(&fedata->socketpass_mappings, + req->u.accept.id); + up(&fedata->socket_lock); + if (mappass == NULL) + goto out_error; + + /* + * Limitation of the current implementation: only support one + * concurrent accept or poll call on one socket. + */ + spin_lock_irqsave(&mappass->copy_lock, flags); + if (mappass->reqcopy.cmd != 0) { + spin_unlock_irqrestore(&mappass->copy_lock, flags); + ret = -EINTR; + goto out_error; + } + + mappass->reqcopy = *req; + spin_unlock_irqrestore(&mappass->copy_lock, flags); + queue_work(mappass->wq, &mappass->register_work); + + /* Tell the caller we don't need to send back a notification yet */ + return -1; + +out_error: + rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++); + rsp->req_id = req->req_id; + rsp->cmd = req->cmd; + rsp->u.accept.id = req->u.accept.id; + rsp->ret = ret; + return 0; +} + +static int pvcalls_back_poll(struct xenbus_device *dev, + struct xen_pvcalls_request *req) +{ + struct pvcalls_fedata *fedata; + struct sockpass_mapping *mappass; + struct xen_pvcalls_response *rsp; + struct inet_connection_sock *icsk; + struct request_sock_queue *queue; + unsigned long flags; + int ret; + bool data; + + fedata = dev_get_drvdata(&dev->dev); + + down(&fedata->socket_lock); + mappass = 
radix_tree_lookup(&fedata->socketpass_mappings, + req->u.poll.id); + up(&fedata->socket_lock); + if (mappass == NULL) + return -EINVAL; + + /* + * Limitation of the current implementation: only support one + * concurrent accept or poll call on one socket. + */ + spin_lock_irqsave(&mappass->copy_lock, flags); + if (mappass->reqcopy.cmd != 0) { + ret = -EINTR; + goto out; + } + + mappass->reqcopy = *req; + icsk = inet_csk(mappass->sock->sk); + queue = &icsk->icsk_accept_queue; + data = queue->rskq_accept_head != NULL; + if (data) { + mappass->reqcopy.cmd = 0; + ret = 0; + goto out; + } + spin_unlock_irqrestore(&mappass->copy_lock, flags); + + /* Tell the caller we don't need to send back a notification yet */ + return -1; + +out: + spin_unlock_irqrestore(&mappass->copy_lock, flags); + + rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++); + rsp->req_id = req->req_id; + rsp->cmd = req->cmd; + rsp->u.poll.id = req->u.poll.id; + rsp->ret = ret; + return 0; +} + +static int pvcalls_back_handle_cmd(struct xenbus_device *dev, + struct xen_pvcalls_request *req) +{ + int ret = 0; + + switch (req->cmd) { + case PVCALLS_SOCKET: + ret = pvcalls_back_socket(dev, req); + break; + case PVCALLS_CONNECT: + ret = pvcalls_back_connect(dev, req); + break; + case PVCALLS_RELEASE: + ret = pvcalls_back_release(dev, req); + break; + case PVCALLS_BIND: + ret = pvcalls_back_bind(dev, req); + break; + case PVCALLS_LISTEN: + ret = pvcalls_back_listen(dev, req); + break; + case PVCALLS_ACCEPT: + ret = pvcalls_back_accept(dev, req); + break; + case PVCALLS_POLL: + ret = pvcalls_back_poll(dev, req); + break; + default: + { + struct pvcalls_fedata *fedata; + struct xen_pvcalls_response *rsp; + + fedata = dev_get_drvdata(&dev->dev); + rsp = RING_GET_RESPONSE( + &fedata->ring, fedata->ring.rsp_prod_pvt++); + rsp->req_id = req->req_id; + rsp->cmd = req->cmd; + rsp->ret = -ENOTSUPP; + break; + } + } + return ret; +} + +static void pvcalls_back_work(struct pvcalls_fedata *fedata) +{ + 
int notify, notify_all = 0, more = 1; + struct xen_pvcalls_request req; + struct xenbus_device *dev = fedata->dev; + + while (more) { + while (RING_HAS_UNCONSUMED_REQUESTS(&fedata->ring)) { + RING_COPY_REQUEST(&fedata->ring, + fedata->ring.req_cons++, + &req); + + if (!pvcalls_back_handle_cmd(dev, &req)) { + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY( + &fedata->ring, notify); + notify_all += notify; + } + } + + if (notify_all) { + notify_remote_via_irq(fedata->irq); + notify_all = 0; + } + + RING_FINAL_CHECK_FOR_REQUESTS(&fedata->ring, more); + } +} + +static irqreturn_t pvcalls_back_event(int irq, void *dev_id) +{ + struct xenbus_device *dev = dev_id; + struct pvcalls_fedata *fedata = NULL; + + if (dev == NULL) + return IRQ_HANDLED; + + fedata = dev_get_drvdata(&dev->dev); + if (fedata == NULL) + return IRQ_HANDLED; + + pvcalls_back_work(fedata); + return IRQ_HANDLED; +} + +static irqreturn_t pvcalls_back_conn_event(int irq, void *sock_map) +{ + struct sock_mapping *map = sock_map; + struct pvcalls_ioworker *iow; + + if (map == NULL || map->sock == NULL || map->sock->sk == NULL || + map->sock->sk->sk_user_data != map) + return IRQ_HANDLED; + + iow = &map->ioworker; + + atomic_inc(&map->write); + atomic_inc(&map->io); + queue_work(iow->wq, &iow->register_work); + + return IRQ_HANDLED; +} + +static int backend_connect(struct xenbus_device *dev) +{ + int err, evtchn; + grant_ref_t ring_ref; + struct pvcalls_fedata *fedata = NULL; + + fedata = kzalloc(sizeof(struct pvcalls_fedata), GFP_KERNEL); + if (!fedata) + return -ENOMEM; + + fedata->irq = -1; + err = xenbus_scanf(XBT_NIL, dev->otherend, "port", "%u", + &evtchn); + if (err != 1) { + err = -EINVAL; + xenbus_dev_fatal(dev, err, "reading %s/event-channel", + dev->otherend); + goto error; + } + + err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref", "%u", &ring_ref); + if (err != 1) { + err = -EINVAL; + xenbus_dev_fatal(dev, err, "reading %s/ring-ref", + dev->otherend); + goto error; + } + + err = 
bind_interdomain_evtchn_to_irq(dev->otherend_id, evtchn); + if (err < 0) + goto error; + fedata->irq = err; + + err = request_threaded_irq(fedata->irq, NULL, pvcalls_back_event, + IRQF_ONESHOT, "pvcalls-back", dev); + if (err < 0) + goto error; + + err = xenbus_map_ring_valloc(dev, &ring_ref, 1, + (void **)&fedata->sring); + if (err < 0) + goto error; + + BACK_RING_INIT(&fedata->ring, fedata->sring, XEN_PAGE_SIZE * 1); + fedata->dev = dev; + + INIT_LIST_HEAD(&fedata->socket_mappings); + INIT_RADIX_TREE(&fedata->socketpass_mappings, GFP_KERNEL); + sema_init(&fedata->socket_lock, 1); + dev_set_drvdata(&dev->dev, fedata); + + down(&pvcalls_back_global.frontends_lock); + list_add_tail(&fedata->list, &pvcalls_back_global.frontends); + up(&pvcalls_back_global.frontends_lock); + + return 0; + + error: + if (fedata->irq >= 0) + unbind_from_irqhandler(fedata->irq, dev); + if (fedata->sring != NULL) + xenbus_unmap_ring_vfree(dev, fedata->sring); + kfree(fedata); + return err; +} + +static int backend_disconnect(struct xenbus_device *dev) +{ + struct pvcalls_fedata *fedata; + struct sock_mapping *map, *n; + struct sockpass_mapping *mappass; + struct radix_tree_iter iter; + void **slot; + + + fedata = dev_get_drvdata(&dev->dev); + + down(&fedata->socket_lock); + list_for_each_entry_safe(map, n, &fedata->socket_mappings, list) { + list_del(&map->list); + pvcalls_back_release_active(dev, fedata, map); + } + + radix_tree_for_each_slot(slot, &fedata->socketpass_mappings, &iter, 0) { + mappass = radix_tree_deref_slot(slot); + if (!mappass) + continue; + if (radix_tree_exception(mappass)) { + if (radix_tree_deref_retry(mappass)) + slot = radix_tree_iter_retry(&iter); + } else { + radix_tree_delete(&fedata->socketpass_mappings, + mappass->id); + pvcalls_back_release_passive(dev, fedata, mappass); + } + } + up(&fedata->socket_lock); + + unbind_from_irqhandler(fedata->irq, dev); + xenbus_unmap_ring_vfree(dev, fedata->sring); + + list_del(&fedata->list); + kfree(fedata); + 
dev_set_drvdata(&dev->dev, NULL); + + return 0; +} + +static int pvcalls_back_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err, abort; + struct xenbus_transaction xbt; + +again: + abort = 1; + + err = xenbus_transaction_start(&xbt); + if (err) { + pr_warn("%s cannot create xenstore transaction\n", __func__); + return err; + } + + err = xenbus_printf(xbt, dev->nodename, "versions", "%s", + PVCALLS_VERSIONS); + if (err) { + pr_warn("%s write out 'versions' failed\n", __func__); + goto abort; + } + + err = xenbus_printf(xbt, dev->nodename, "max-page-order", "%u", + MAX_RING_ORDER); + if (err) { + pr_warn("%s write out 'max-page-order' failed\n", __func__); + goto abort; + } + + err = xenbus_printf(xbt, dev->nodename, "function-calls", + XENBUS_FUNCTIONS_CALLS); + if (err) { + pr_warn("%s write out 'function-calls' failed\n", __func__); + goto abort; + } + + abort = 0; +abort: + err = xenbus_transaction_end(xbt, abort); + if (err) { + if (err == -EAGAIN && !abort) + goto again; + pr_warn("%s cannot complete xenstore transaction\n", __func__); + return err; + } + + if (abort) + return -EFAULT; + + xenbus_switch_state(dev, XenbusStateInitWait); + + return 0; +} + +static void set_backend_state(struct xenbus_device *dev, + enum xenbus_state state) +{ + while (dev->state != state) { + switch (dev->state) { + case XenbusStateClosed: + switch (state) { + case XenbusStateInitWait: + case XenbusStateConnected: + xenbus_switch_state(dev, XenbusStateInitWait); + break; + case XenbusStateClosing: + xenbus_switch_state(dev, XenbusStateClosing); + break; + default: + WARN_ON(1); + } + break; + case XenbusStateInitWait: + case XenbusStateInitialised: + switch (state) { + case XenbusStateConnected: + backend_connect(dev); + xenbus_switch_state(dev, XenbusStateConnected); + break; + case XenbusStateClosing: + case XenbusStateClosed: + xenbus_switch_state(dev, XenbusStateClosing); + break; + default: + WARN_ON(1); + } + break; + case 
XenbusStateConnected: + switch (state) { + case XenbusStateInitWait: + case XenbusStateClosing: + case XenbusStateClosed: + down(&pvcalls_back_global.frontends_lock); + backend_disconnect(dev); + up(&pvcalls_back_global.frontends_lock); + xenbus_switch_state(dev, XenbusStateClosing); + break; + default: + WARN_ON(1); + } + break; + case XenbusStateClosing: + switch (state) { + case XenbusStateInitWait: + case XenbusStateConnected: + case XenbusStateClosed: + xenbus_switch_state(dev, XenbusStateClosed); + break; + default: + WARN_ON(1); + } + break; + default: + WARN_ON(1); + } + } +} + +static void pvcalls_back_changed(struct xenbus_device *dev, + enum xenbus_state frontend_state) +{ + switch (frontend_state) { + case XenbusStateInitialising: + set_backend_state(dev, XenbusStateInitWait); + break; + + case XenbusStateInitialised: + case XenbusStateConnected: + set_backend_state(dev, XenbusStateConnected); + break; + + case XenbusStateClosing: + set_backend_state(dev, XenbusStateClosing); + break; + + case XenbusStateClosed: + set_backend_state(dev, XenbusStateClosed); + if (xenbus_dev_is_online(dev)) + break; + device_unregister(&dev->dev); + break; + case XenbusStateUnknown: + set_backend_state(dev, XenbusStateClosed); + device_unregister(&dev->dev); + break; + + default: + xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend", + frontend_state); + break; + } +} + +static int pvcalls_back_remove(struct xenbus_device *dev) +{ + return 0; +} + +static int pvcalls_back_uevent(struct xenbus_device *xdev, + struct kobj_uevent_env *env) +{ + return 0; +} + +static const struct xenbus_device_id pvcalls_back_ids[] = { + { "pvcalls" }, + { "" } +}; + +static struct xenbus_driver pvcalls_back_driver = { + .ids = pvcalls_back_ids, + .probe = pvcalls_back_probe, + .remove = pvcalls_back_remove, + .uevent = pvcalls_back_uevent, + .otherend_changed = pvcalls_back_changed, +}; + +static int __init pvcalls_back_init(void) +{ + int ret; + + if (!xen_domain()) + return -ENODEV; 
+ + ret = xenbus_register_backend(&pvcalls_back_driver); + if (ret < 0) + return ret; + + sema_init(&pvcalls_back_global.frontends_lock, 1); + INIT_LIST_HEAD(&pvcalls_back_global.frontends); + return 0; +} +module_init(pvcalls_back_init); + +static void __exit pvcalls_back_fin(void) +{ + struct pvcalls_fedata *fedata, *nfedata; + + down(&pvcalls_back_global.frontends_lock); + list_for_each_entry_safe(fedata, nfedata, + &pvcalls_back_global.frontends, list) { + backend_disconnect(fedata->dev); + } + up(&pvcalls_back_global.frontends_lock); + + xenbus_unregister_driver(&pvcalls_back_driver); +} + +module_exit(pvcalls_back_fin); + +MODULE_DESCRIPTION("Xen PV Calls backend driver"); +MODULE_AUTHOR("Stefano Stabellini <sstabellini@kernel.org>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c new file mode 100644 index 000000000000..40caa92bff33 --- /dev/null +++ b/drivers/xen/pvcalls-front.c @@ -0,0 +1,1278 @@ +/* + * (c) 2017 Stefano Stabellini <stefano@aporeto.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
/*
 * pvcalls_refcount counts the callers currently inside a frontend
 * operation.  Both macros are wrapped in do { } while (0) so that they
 * expand to exactly one statement and stay safe inside unbraced
 * if/else bodies.
 */

/* first increment refcount, then proceed */
#define pvcalls_enter() do {			\
	atomic_inc(&pvcalls_refcount);		\
} while (0)

/* first complete other operations, then decrement refcount */
#define pvcalls_exit() do {			\
	atomic_dec(&pvcalls_refcount);		\
} while (0)
+ */ +#define PVCALLS_FLAG_ACCEPT_INFLIGHT 0 +#define PVCALLS_FLAG_POLL_INFLIGHT 1 +#define PVCALLS_FLAG_POLL_RET 2 + uint8_t flags; + uint32_t inflight_req_id; + struct sock_mapping *accept_map; + wait_queue_head_t inflight_accept_req; + } passive; + }; +}; + +static inline int get_request(struct pvcalls_bedata *bedata, int *req_id) +{ + *req_id = bedata->ring.req_prod_pvt & (RING_SIZE(&bedata->ring) - 1); + if (RING_FULL(&bedata->ring) || + bedata->rsp[*req_id].req_id != PVCALLS_INVALID_ID) + return -EAGAIN; + return 0; +} + +static bool pvcalls_front_write_todo(struct sock_mapping *map) +{ + struct pvcalls_data_intf *intf = map->active.ring; + RING_IDX cons, prod, size = XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER); + int32_t error; + + error = intf->out_error; + if (error == -ENOTCONN) + return false; + if (error != 0) + return true; + + cons = intf->out_cons; + prod = intf->out_prod; + return !!(size - pvcalls_queued(prod, cons, size)); +} + +static bool pvcalls_front_read_todo(struct sock_mapping *map) +{ + struct pvcalls_data_intf *intf = map->active.ring; + RING_IDX cons, prod; + int32_t error; + + cons = intf->in_cons; + prod = intf->in_prod; + error = intf->in_error; + return (error != 0 || + pvcalls_queued(prod, cons, + XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER)) != 0); +} + +static irqreturn_t pvcalls_front_event_handler(int irq, void *dev_id) +{ + struct xenbus_device *dev = dev_id; + struct pvcalls_bedata *bedata; + struct xen_pvcalls_response *rsp; + uint8_t *src, *dst; + int req_id = 0, more = 0, done = 0; + + if (dev == NULL) + return IRQ_HANDLED; + + pvcalls_enter(); + bedata = dev_get_drvdata(&dev->dev); + if (bedata == NULL) { + pvcalls_exit(); + return IRQ_HANDLED; + } + +again: + while (RING_HAS_UNCONSUMED_RESPONSES(&bedata->ring)) { + rsp = RING_GET_RESPONSE(&bedata->ring, bedata->ring.rsp_cons); + + req_id = rsp->req_id; + if (rsp->cmd == PVCALLS_POLL) { + struct sock_mapping *map = (struct sock_mapping *)(uintptr_t) + rsp->u.poll.id; + + 
clear_bit(PVCALLS_FLAG_POLL_INFLIGHT, + (void *)&map->passive.flags); + /* + * clear INFLIGHT, then set RET. It pairs with + * the checks at the beginning of + * pvcalls_front_poll_passive. + */ + smp_wmb(); + set_bit(PVCALLS_FLAG_POLL_RET, + (void *)&map->passive.flags); + } else { + dst = (uint8_t *)&bedata->rsp[req_id] + + sizeof(rsp->req_id); + src = (uint8_t *)rsp + sizeof(rsp->req_id); + memcpy(dst, src, sizeof(*rsp) - sizeof(rsp->req_id)); + /* + * First copy the rest of the data, then req_id. It is + * paired with the barrier when accessing bedata->rsp. + */ + smp_wmb(); + bedata->rsp[req_id].req_id = req_id; + } + + done = 1; + bedata->ring.rsp_cons++; + } + + RING_FINAL_CHECK_FOR_RESPONSES(&bedata->ring, more); + if (more) + goto again; + if (done) + wake_up(&bedata->inflight_req); + pvcalls_exit(); + return IRQ_HANDLED; +} + +static void pvcalls_front_free_map(struct pvcalls_bedata *bedata, + struct sock_mapping *map) +{ + int i; + + unbind_from_irqhandler(map->active.irq, map); + + spin_lock(&bedata->socket_lock); + if (!list_empty(&map->list)) + list_del_init(&map->list); + spin_unlock(&bedata->socket_lock); + + for (i = 0; i < (1 << PVCALLS_RING_ORDER); i++) + gnttab_end_foreign_access(map->active.ring->ref[i], 0, 0); + gnttab_end_foreign_access(map->active.ref, 0, 0); + free_page((unsigned long)map->active.ring); + + kfree(map); +} + +static irqreturn_t pvcalls_front_conn_handler(int irq, void *sock_map) +{ + struct sock_mapping *map = sock_map; + + if (map == NULL) + return IRQ_HANDLED; + + wake_up_interruptible(&map->active.inflight_conn_req); + + return IRQ_HANDLED; +} + +int pvcalls_front_socket(struct socket *sock) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map = NULL; + struct xen_pvcalls_request *req; + int notify, req_id, ret; + + /* + * PVCalls only supports domain AF_INET, + * type SOCK_STREAM and protocol 0 sockets for now. + * + * Check socket type here, AF_INET and protocol checks are done + * by the caller. 
+ */ + if (sock->type != SOCK_STREAM) + return -EOPNOTSUPP; + + pvcalls_enter(); + if (!pvcalls_front_dev) { + pvcalls_exit(); + return -EACCES; + } + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (map == NULL) { + pvcalls_exit(); + return -ENOMEM; + } + + spin_lock(&bedata->socket_lock); + + ret = get_request(bedata, &req_id); + if (ret < 0) { + kfree(map); + spin_unlock(&bedata->socket_lock); + pvcalls_exit(); + return ret; + } + + /* + * sock->sk->sk_send_head is not used for ip sockets: reuse the + * field to store a pointer to the struct sock_mapping + * corresponding to the socket. This way, we can easily get the + * struct sock_mapping from the struct socket. + */ + sock->sk->sk_send_head = (void *)map; + list_add_tail(&map->list, &bedata->socket_mappings); + + req = RING_GET_REQUEST(&bedata->ring, req_id); + req->req_id = req_id; + req->cmd = PVCALLS_SOCKET; + req->u.socket.id = (uintptr_t) map; + req->u.socket.domain = AF_INET; + req->u.socket.type = SOCK_STREAM; + req->u.socket.protocol = IPPROTO_IP; + + bedata->ring.req_prod_pvt++; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); + spin_unlock(&bedata->socket_lock); + if (notify) + notify_remote_via_irq(bedata->irq); + + wait_event(bedata->inflight_req, + READ_ONCE(bedata->rsp[req_id].req_id) == req_id); + + /* read req_id, then the content */ + smp_rmb(); + ret = bedata->rsp[req_id].ret; + bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; + + pvcalls_exit(); + return ret; +} + +static int create_active(struct sock_mapping *map, int *evtchn) +{ + void *bytes; + int ret = -ENOMEM, irq = -1, i; + + *evtchn = -1; + init_waitqueue_head(&map->active.inflight_conn_req); + + map->active.ring = (struct pvcalls_data_intf *) + __get_free_page(GFP_KERNEL | __GFP_ZERO); + if (map->active.ring == NULL) + goto out_error; + map->active.ring->ring_order = PVCALLS_RING_ORDER; + bytes = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + 
PVCALLS_RING_ORDER); + if (bytes == NULL) + goto out_error; + for (i = 0; i < (1 << PVCALLS_RING_ORDER); i++) + map->active.ring->ref[i] = gnttab_grant_foreign_access( + pvcalls_front_dev->otherend_id, + pfn_to_gfn(virt_to_pfn(bytes) + i), 0); + + map->active.ref = gnttab_grant_foreign_access( + pvcalls_front_dev->otherend_id, + pfn_to_gfn(virt_to_pfn((void *)map->active.ring)), 0); + + map->active.data.in = bytes; + map->active.data.out = bytes + + XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER); + + ret = xenbus_alloc_evtchn(pvcalls_front_dev, evtchn); + if (ret) + goto out_error; + irq = bind_evtchn_to_irqhandler(*evtchn, pvcalls_front_conn_handler, + 0, "pvcalls-frontend", map); + if (irq < 0) { + ret = irq; + goto out_error; + } + + map->active.irq = irq; + map->active_socket = true; + mutex_init(&map->active.in_mutex); + mutex_init(&map->active.out_mutex); + + return 0; + +out_error: + if (*evtchn >= 0) + xenbus_free_evtchn(pvcalls_front_dev, *evtchn); + kfree(map->active.data.in); + kfree(map->active.ring); + return ret; +} + +int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr, + int addr_len, int flags) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map = NULL; + struct xen_pvcalls_request *req; + int notify, req_id, ret, evtchn; + + if (addr->sa_family != AF_INET || sock->type != SOCK_STREAM) + return -EOPNOTSUPP; + + pvcalls_enter(); + if (!pvcalls_front_dev) { + pvcalls_exit(); + return -ENOTCONN; + } + + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + map = (struct sock_mapping *)sock->sk->sk_send_head; + if (!map) { + pvcalls_exit(); + return -ENOTSOCK; + } + + spin_lock(&bedata->socket_lock); + ret = get_request(bedata, &req_id); + if (ret < 0) { + spin_unlock(&bedata->socket_lock); + pvcalls_exit(); + return ret; + } + ret = create_active(map, &evtchn); + if (ret < 0) { + spin_unlock(&bedata->socket_lock); + pvcalls_exit(); + return ret; + } + + req = RING_GET_REQUEST(&bedata->ring, req_id); + req->req_id = req_id; + 
req->cmd = PVCALLS_CONNECT; + req->u.connect.id = (uintptr_t)map; + req->u.connect.len = addr_len; + req->u.connect.flags = flags; + req->u.connect.ref = map->active.ref; + req->u.connect.evtchn = evtchn; + memcpy(req->u.connect.addr, addr, sizeof(*addr)); + + map->sock = sock; + + bedata->ring.req_prod_pvt++; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); + spin_unlock(&bedata->socket_lock); + + if (notify) + notify_remote_via_irq(bedata->irq); + + wait_event(bedata->inflight_req, + READ_ONCE(bedata->rsp[req_id].req_id) == req_id); + + /* read req_id, then the content */ + smp_rmb(); + ret = bedata->rsp[req_id].ret; + bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; + pvcalls_exit(); + return ret; +} + +static int __write_ring(struct pvcalls_data_intf *intf, + struct pvcalls_data *data, + struct iov_iter *msg_iter, + int len) +{ + RING_IDX cons, prod, size, masked_prod, masked_cons; + RING_IDX array_size = XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER); + int32_t error; + + error = intf->out_error; + if (error < 0) + return error; + cons = intf->out_cons; + prod = intf->out_prod; + /* read indexes before continuing */ + virt_mb(); + + size = pvcalls_queued(prod, cons, array_size); + if (size >= array_size) + return -EINVAL; + if (len > array_size - size) + len = array_size - size; + + masked_prod = pvcalls_mask(prod, array_size); + masked_cons = pvcalls_mask(cons, array_size); + + if (masked_prod < masked_cons) { + len = copy_from_iter(data->out + masked_prod, len, msg_iter); + } else { + if (len > array_size - masked_prod) { + int ret = copy_from_iter(data->out + masked_prod, + array_size - masked_prod, msg_iter); + if (ret != array_size - masked_prod) { + len = ret; + goto out; + } + len = ret + copy_from_iter(data->out, len - ret, msg_iter); + } else { + len = copy_from_iter(data->out + masked_prod, len, msg_iter); + } + } +out: + /* write to ring before updating pointer */ + virt_wmb(); + intf->out_prod += len; + + return len; +} + +int 
pvcalls_front_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map; + int sent, tot_sent = 0; + int count = 0, flags; + + flags = msg->msg_flags; + if (flags & (MSG_CONFIRM|MSG_DONTROUTE|MSG_EOR|MSG_OOB)) + return -EOPNOTSUPP; + + pvcalls_enter(); + if (!pvcalls_front_dev) { + pvcalls_exit(); + return -ENOTCONN; + } + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + map = (struct sock_mapping *) sock->sk->sk_send_head; + if (!map) { + pvcalls_exit(); + return -ENOTSOCK; + } + + mutex_lock(&map->active.out_mutex); + if ((flags & MSG_DONTWAIT) && !pvcalls_front_write_todo(map)) { + mutex_unlock(&map->active.out_mutex); + pvcalls_exit(); + return -EAGAIN; + } + if (len > INT_MAX) + len = INT_MAX; + +again: + count++; + sent = __write_ring(map->active.ring, + &map->active.data, &msg->msg_iter, + len); + if (sent > 0) { + len -= sent; + tot_sent += sent; + notify_remote_via_irq(map->active.irq); + } + if (sent >= 0 && len > 0 && count < PVCALLS_FRONT_MAX_SPIN) + goto again; + if (sent < 0) + tot_sent = sent; + + mutex_unlock(&map->active.out_mutex); + pvcalls_exit(); + return tot_sent; +} + +static int __read_ring(struct pvcalls_data_intf *intf, + struct pvcalls_data *data, + struct iov_iter *msg_iter, + size_t len, int flags) +{ + RING_IDX cons, prod, size, masked_prod, masked_cons; + RING_IDX array_size = XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER); + int32_t error; + + cons = intf->in_cons; + prod = intf->in_prod; + error = intf->in_error; + /* get pointers before reading from the ring */ + virt_rmb(); + if (error < 0) + return error; + + size = pvcalls_queued(prod, cons, array_size); + masked_prod = pvcalls_mask(prod, array_size); + masked_cons = pvcalls_mask(cons, array_size); + + if (size == 0) + return 0; + + if (len > size) + len = size; + + if (masked_prod > masked_cons) { + len = copy_to_iter(data->in + masked_cons, len, msg_iter); + } else { + if (len > (array_size - masked_cons)) { + 
int ret = copy_to_iter(data->in + masked_cons, + array_size - masked_cons, msg_iter); + if (ret != array_size - masked_cons) { + len = ret; + goto out; + } + len = ret + copy_to_iter(data->in, len - ret, msg_iter); + } else { + len = copy_to_iter(data->in + masked_cons, len, msg_iter); + } + } +out: + /* read data from the ring before increasing the index */ + virt_mb(); + if (!(flags & MSG_PEEK)) + intf->in_cons += len; + + return len; +} + +int pvcalls_front_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + struct pvcalls_bedata *bedata; + int ret; + struct sock_mapping *map; + + if (flags & (MSG_CMSG_CLOEXEC|MSG_ERRQUEUE|MSG_OOB|MSG_TRUNC)) + return -EOPNOTSUPP; + + pvcalls_enter(); + if (!pvcalls_front_dev) { + pvcalls_exit(); + return -ENOTCONN; + } + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + map = (struct sock_mapping *) sock->sk->sk_send_head; + if (!map) { + pvcalls_exit(); + return -ENOTSOCK; + } + + mutex_lock(&map->active.in_mutex); + if (len > XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER)) + len = XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER); + + while (!(flags & MSG_DONTWAIT) && !pvcalls_front_read_todo(map)) { + wait_event_interruptible(map->active.inflight_conn_req, + pvcalls_front_read_todo(map)); + } + ret = __read_ring(map->active.ring, &map->active.data, + &msg->msg_iter, len, flags); + + if (ret > 0) + notify_remote_via_irq(map->active.irq); + if (ret == 0) + ret = (flags & MSG_DONTWAIT) ? 
-EAGAIN : 0; + if (ret == -ENOTCONN) + ret = 0; + + mutex_unlock(&map->active.in_mutex); + pvcalls_exit(); + return ret; +} + +int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map = NULL; + struct xen_pvcalls_request *req; + int notify, req_id, ret; + + if (addr->sa_family != AF_INET || sock->type != SOCK_STREAM) + return -EOPNOTSUPP; + + pvcalls_enter(); + if (!pvcalls_front_dev) { + pvcalls_exit(); + return -ENOTCONN; + } + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + map = (struct sock_mapping *) sock->sk->sk_send_head; + if (map == NULL) { + pvcalls_exit(); + return -ENOTSOCK; + } + + spin_lock(&bedata->socket_lock); + ret = get_request(bedata, &req_id); + if (ret < 0) { + spin_unlock(&bedata->socket_lock); + pvcalls_exit(); + return ret; + } + req = RING_GET_REQUEST(&bedata->ring, req_id); + req->req_id = req_id; + map->sock = sock; + req->cmd = PVCALLS_BIND; + req->u.bind.id = (uintptr_t)map; + memcpy(req->u.bind.addr, addr, sizeof(*addr)); + req->u.bind.len = addr_len; + + init_waitqueue_head(&map->passive.inflight_accept_req); + + map->active_socket = false; + + bedata->ring.req_prod_pvt++; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); + spin_unlock(&bedata->socket_lock); + if (notify) + notify_remote_via_irq(bedata->irq); + + wait_event(bedata->inflight_req, + READ_ONCE(bedata->rsp[req_id].req_id) == req_id); + + /* read req_id, then the content */ + smp_rmb(); + ret = bedata->rsp[req_id].ret; + bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; + + map->passive.status = PVCALLS_STATUS_BIND; + pvcalls_exit(); + return 0; +} + +int pvcalls_front_listen(struct socket *sock, int backlog) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map; + struct xen_pvcalls_request *req; + int notify, req_id, ret; + + pvcalls_enter(); + if (!pvcalls_front_dev) { + pvcalls_exit(); + return -ENOTCONN; + } + bedata = 
dev_get_drvdata(&pvcalls_front_dev->dev); + + map = (struct sock_mapping *) sock->sk->sk_send_head; + if (!map) { + pvcalls_exit(); + return -ENOTSOCK; + } + + if (map->passive.status != PVCALLS_STATUS_BIND) { + pvcalls_exit(); + return -EOPNOTSUPP; + } + + spin_lock(&bedata->socket_lock); + ret = get_request(bedata, &req_id); + if (ret < 0) { + spin_unlock(&bedata->socket_lock); + pvcalls_exit(); + return ret; + } + req = RING_GET_REQUEST(&bedata->ring, req_id); + req->req_id = req_id; + req->cmd = PVCALLS_LISTEN; + req->u.listen.id = (uintptr_t) map; + req->u.listen.backlog = backlog; + + bedata->ring.req_prod_pvt++; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); + spin_unlock(&bedata->socket_lock); + if (notify) + notify_remote_via_irq(bedata->irq); + + wait_event(bedata->inflight_req, + READ_ONCE(bedata->rsp[req_id].req_id) == req_id); + + /* read req_id, then the content */ + smp_rmb(); + ret = bedata->rsp[req_id].ret; + bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; + + map->passive.status = PVCALLS_STATUS_LISTEN; + pvcalls_exit(); + return ret; +} + +int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map; + struct sock_mapping *map2 = NULL; + struct xen_pvcalls_request *req; + int notify, req_id, ret, evtchn, nonblock; + + pvcalls_enter(); + if (!pvcalls_front_dev) { + pvcalls_exit(); + return -ENOTCONN; + } + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + map = (struct sock_mapping *) sock->sk->sk_send_head; + if (!map) { + pvcalls_exit(); + return -ENOTSOCK; + } + + if (map->passive.status != PVCALLS_STATUS_LISTEN) { + pvcalls_exit(); + return -EINVAL; + } + + nonblock = flags & SOCK_NONBLOCK; + /* + * Backend only supports 1 inflight accept request, will return + * errors for the others + */ + if (test_and_set_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void *)&map->passive.flags)) { + req_id = READ_ONCE(map->passive.inflight_req_id); + if (req_id != 
PVCALLS_INVALID_ID && + READ_ONCE(bedata->rsp[req_id].req_id) == req_id) { + map2 = map->passive.accept_map; + goto received; + } + if (nonblock) { + pvcalls_exit(); + return -EAGAIN; + } + if (wait_event_interruptible(map->passive.inflight_accept_req, + !test_and_set_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void *)&map->passive.flags))) { + pvcalls_exit(); + return -EINTR; + } + } + + spin_lock(&bedata->socket_lock); + ret = get_request(bedata, &req_id); + if (ret < 0) { + clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void *)&map->passive.flags); + spin_unlock(&bedata->socket_lock); + pvcalls_exit(); + return ret; + } + map2 = kzalloc(sizeof(*map2), GFP_KERNEL); + if (map2 == NULL) { + clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void *)&map->passive.flags); + spin_unlock(&bedata->socket_lock); + pvcalls_exit(); + return -ENOMEM; + } + ret = create_active(map2, &evtchn); + if (ret < 0) { + kfree(map2); + clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void *)&map->passive.flags); + spin_unlock(&bedata->socket_lock); + pvcalls_exit(); + return ret; + } + list_add_tail(&map2->list, &bedata->socket_mappings); + + req = RING_GET_REQUEST(&bedata->ring, req_id); + req->req_id = req_id; + req->cmd = PVCALLS_ACCEPT; + req->u.accept.id = (uintptr_t) map; + req->u.accept.ref = map2->active.ref; + req->u.accept.id_new = (uintptr_t) map2; + req->u.accept.evtchn = evtchn; + map->passive.accept_map = map2; + + bedata->ring.req_prod_pvt++; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); + spin_unlock(&bedata->socket_lock); + if (notify) + notify_remote_via_irq(bedata->irq); + /* We could check if we have received a response before returning. 
*/ + if (nonblock) { + WRITE_ONCE(map->passive.inflight_req_id, req_id); + pvcalls_exit(); + return -EAGAIN; + } + + if (wait_event_interruptible(bedata->inflight_req, + READ_ONCE(bedata->rsp[req_id].req_id) == req_id)) { + pvcalls_exit(); + return -EINTR; + } + /* read req_id, then the content */ + smp_rmb(); + +received: + map2->sock = newsock; + newsock->sk = kzalloc(sizeof(*newsock->sk), GFP_KERNEL); + if (!newsock->sk) { + bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; + map->passive.inflight_req_id = PVCALLS_INVALID_ID; + clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void *)&map->passive.flags); + pvcalls_front_free_map(bedata, map2); + pvcalls_exit(); + return -ENOMEM; + } + newsock->sk->sk_send_head = (void *)map2; + + ret = bedata->rsp[req_id].ret; + bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; + map->passive.inflight_req_id = PVCALLS_INVALID_ID; + + clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, (void *)&map->passive.flags); + wake_up(&map->passive.inflight_accept_req); + + pvcalls_exit(); + return ret; +} + +static unsigned int pvcalls_front_poll_passive(struct file *file, + struct pvcalls_bedata *bedata, + struct sock_mapping *map, + poll_table *wait) +{ + int notify, req_id, ret; + struct xen_pvcalls_request *req; + + if (test_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void *)&map->passive.flags)) { + uint32_t req_id = READ_ONCE(map->passive.inflight_req_id); + + if (req_id != PVCALLS_INVALID_ID && + READ_ONCE(bedata->rsp[req_id].req_id) == req_id) + return POLLIN | POLLRDNORM; + + poll_wait(file, &map->passive.inflight_accept_req, wait); + return 0; + } + + if (test_and_clear_bit(PVCALLS_FLAG_POLL_RET, + (void *)&map->passive.flags)) + return POLLIN | POLLRDNORM; + + /* + * First check RET, then INFLIGHT. No barriers necessary to + * ensure execution ordering because of the conditional + * instructions creating control dependencies. 
+ */ + + if (test_and_set_bit(PVCALLS_FLAG_POLL_INFLIGHT, + (void *)&map->passive.flags)) { + poll_wait(file, &bedata->inflight_req, wait); + return 0; + } + + spin_lock(&bedata->socket_lock); + ret = get_request(bedata, &req_id); + if (ret < 0) { + spin_unlock(&bedata->socket_lock); + return ret; + } + req = RING_GET_REQUEST(&bedata->ring, req_id); + req->req_id = req_id; + req->cmd = PVCALLS_POLL; + req->u.poll.id = (uintptr_t) map; + + bedata->ring.req_prod_pvt++; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); + spin_unlock(&bedata->socket_lock); + if (notify) + notify_remote_via_irq(bedata->irq); + + poll_wait(file, &bedata->inflight_req, wait); + return 0; +} + +static unsigned int pvcalls_front_poll_active(struct file *file, + struct pvcalls_bedata *bedata, + struct sock_mapping *map, + poll_table *wait) +{ + unsigned int mask = 0; + int32_t in_error, out_error; + struct pvcalls_data_intf *intf = map->active.ring; + + out_error = intf->out_error; + in_error = intf->in_error; + + poll_wait(file, &map->active.inflight_conn_req, wait); + if (pvcalls_front_write_todo(map)) + mask |= POLLOUT | POLLWRNORM; + if (pvcalls_front_read_todo(map)) + mask |= POLLIN | POLLRDNORM; + if (in_error != 0 || out_error != 0) + mask |= POLLERR; + + return mask; +} + +unsigned int pvcalls_front_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map; + int ret; + + pvcalls_enter(); + if (!pvcalls_front_dev) { + pvcalls_exit(); + return POLLNVAL; + } + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + map = (struct sock_mapping *) sock->sk->sk_send_head; + if (!map) { + pvcalls_exit(); + return POLLNVAL; + } + if (map->active_socket) + ret = pvcalls_front_poll_active(file, bedata, map, wait); + else + ret = pvcalls_front_poll_passive(file, bedata, map, wait); + pvcalls_exit(); + return ret; +} + +int pvcalls_front_release(struct socket *sock) +{ + struct pvcalls_bedata *bedata; + struct 
sock_mapping *map; + int req_id, notify, ret; + struct xen_pvcalls_request *req; + + if (sock->sk == NULL) + return 0; + + pvcalls_enter(); + if (!pvcalls_front_dev) { + pvcalls_exit(); + return -EIO; + } + + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + map = (struct sock_mapping *) sock->sk->sk_send_head; + if (map == NULL) { + pvcalls_exit(); + return 0; + } + + spin_lock(&bedata->socket_lock); + ret = get_request(bedata, &req_id); + if (ret < 0) { + spin_unlock(&bedata->socket_lock); + pvcalls_exit(); + return ret; + } + sock->sk->sk_send_head = NULL; + + req = RING_GET_REQUEST(&bedata->ring, req_id); + req->req_id = req_id; + req->cmd = PVCALLS_RELEASE; + req->u.release.id = (uintptr_t)map; + + bedata->ring.req_prod_pvt++; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); + spin_unlock(&bedata->socket_lock); + if (notify) + notify_remote_via_irq(bedata->irq); + + wait_event(bedata->inflight_req, + READ_ONCE(bedata->rsp[req_id].req_id) == req_id); + + if (map->active_socket) { + /* + * Set in_error and wake up inflight_conn_req to force + * recvmsg waiters to exit. + */ + map->active.ring->in_error = -EBADF; + wake_up_interruptible(&map->active.inflight_conn_req); + + /* + * We need to make sure that sendmsg/recvmsg on this socket have + * not started before we've cleared sk_send_head here. The + * easiest (though not optimal) way to guarantee this is to see + * that no pvcall (other than us) is in progress. 
+ */ + while (atomic_read(&pvcalls_refcount) > 1) + cpu_relax(); + + pvcalls_front_free_map(bedata, map); + } else { + spin_lock(&bedata->socket_lock); + list_del(&map->list); + spin_unlock(&bedata->socket_lock); + if (READ_ONCE(map->passive.inflight_req_id) != + PVCALLS_INVALID_ID) { + pvcalls_front_free_map(bedata, + map->passive.accept_map); + } + kfree(map); + } + WRITE_ONCE(bedata->rsp[req_id].req_id, PVCALLS_INVALID_ID); + + pvcalls_exit(); + return 0; +} + +static const struct xenbus_device_id pvcalls_front_ids[] = { + { "pvcalls" }, + { "" } +}; + +static int pvcalls_front_remove(struct xenbus_device *dev) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map = NULL, *n; + + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + dev_set_drvdata(&dev->dev, NULL); + pvcalls_front_dev = NULL; + if (bedata->irq >= 0) + unbind_from_irqhandler(bedata->irq, dev); + + list_for_each_entry_safe(map, n, &bedata->socket_mappings, list) { + map->sock->sk->sk_send_head = NULL; + if (map->active_socket) { + map->active.ring->in_error = -EBADF; + wake_up_interruptible(&map->active.inflight_conn_req); + } + } + + smp_mb(); + while (atomic_read(&pvcalls_refcount) > 0) + cpu_relax(); + list_for_each_entry_safe(map, n, &bedata->socket_mappings, list) { + if (map->active_socket) { + /* No need to lock, refcount is 0 */ + pvcalls_front_free_map(bedata, map); + } else { + list_del(&map->list); + kfree(map); + } + } + if (bedata->ref >= 0) + gnttab_end_foreign_access(bedata->ref, 0, 0); + kfree(bedata->ring.sring); + kfree(bedata); + xenbus_switch_state(dev, XenbusStateClosed); + return 0; +} + +static int pvcalls_front_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int ret = -ENOMEM, evtchn, i; + unsigned int max_page_order, function_calls, len; + char *versions; + grant_ref_t gref_head = 0; + struct xenbus_transaction xbt; + struct pvcalls_bedata *bedata = NULL; + struct xen_pvcalls_sring *sring; + + if (pvcalls_front_dev != NULL) { + 
dev_err(&dev->dev, "only one PV Calls connection supported\n"); + return -EINVAL; + } + + versions = xenbus_read(XBT_NIL, dev->otherend, "versions", &len); + if (!len) + return -EINVAL; + if (strcmp(versions, "1")) { + kfree(versions); + return -EINVAL; + } + kfree(versions); + max_page_order = xenbus_read_unsigned(dev->otherend, + "max-page-order", 0); + if (max_page_order < PVCALLS_RING_ORDER) + return -ENODEV; + function_calls = xenbus_read_unsigned(dev->otherend, + "function-calls", 0); + /* See XENBUS_FUNCTIONS_CALLS in pvcalls.h */ + if (function_calls != 1) + return -ENODEV; + pr_info("%s max-page-order is %u\n", __func__, max_page_order); + + bedata = kzalloc(sizeof(struct pvcalls_bedata), GFP_KERNEL); + if (!bedata) + return -ENOMEM; + + dev_set_drvdata(&dev->dev, bedata); + pvcalls_front_dev = dev; + init_waitqueue_head(&bedata->inflight_req); + INIT_LIST_HEAD(&bedata->socket_mappings); + spin_lock_init(&bedata->socket_lock); + bedata->irq = -1; + bedata->ref = -1; + + for (i = 0; i < PVCALLS_NR_RSP_PER_RING; i++) + bedata->rsp[i].req_id = PVCALLS_INVALID_ID; + + sring = (struct xen_pvcalls_sring *) __get_free_page(GFP_KERNEL | + __GFP_ZERO); + if (!sring) + goto error; + SHARED_RING_INIT(sring); + FRONT_RING_INIT(&bedata->ring, sring, XEN_PAGE_SIZE); + + ret = xenbus_alloc_evtchn(dev, &evtchn); + if (ret) + goto error; + + bedata->irq = bind_evtchn_to_irqhandler(evtchn, + pvcalls_front_event_handler, + 0, "pvcalls-frontend", dev); + if (bedata->irq < 0) { + ret = bedata->irq; + goto error; + } + + ret = gnttab_alloc_grant_references(1, &gref_head); + if (ret < 0) + goto error; + ret = gnttab_claim_grant_reference(&gref_head); + if (ret < 0) + goto error; + bedata->ref = ret; + gnttab_grant_foreign_access_ref(bedata->ref, dev->otherend_id, + virt_to_gfn((void *)sring), 0); + + again: + ret = xenbus_transaction_start(&xbt); + if (ret) { + xenbus_dev_fatal(dev, ret, "starting transaction"); + goto error; + } + ret = xenbus_printf(xbt, dev->nodename, 
"version", "%u", 1); + if (ret) + goto error_xenbus; + ret = xenbus_printf(xbt, dev->nodename, "ring-ref", "%d", bedata->ref); + if (ret) + goto error_xenbus; + ret = xenbus_printf(xbt, dev->nodename, "port", "%u", + evtchn); + if (ret) + goto error_xenbus; + ret = xenbus_transaction_end(xbt, 0); + if (ret) { + if (ret == -EAGAIN) + goto again; + xenbus_dev_fatal(dev, ret, "completing transaction"); + goto error; + } + xenbus_switch_state(dev, XenbusStateInitialised); + + return 0; + + error_xenbus: + xenbus_transaction_end(xbt, 1); + xenbus_dev_fatal(dev, ret, "writing xenstore"); + error: + pvcalls_front_remove(dev); + return ret; +} + +static void pvcalls_front_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) +{ + switch (backend_state) { + case XenbusStateReconfiguring: + case XenbusStateReconfigured: + case XenbusStateInitialising: + case XenbusStateInitialised: + case XenbusStateUnknown: + break; + + case XenbusStateInitWait: + break; + + case XenbusStateConnected: + xenbus_switch_state(dev, XenbusStateConnected); + break; + + case XenbusStateClosed: + if (dev->state == XenbusStateClosed) + break; + /* Missed the backend's CLOSING state */ + /* fall through */ + case XenbusStateClosing: + xenbus_frontend_closed(dev); + break; + } +} + +static struct xenbus_driver pvcalls_front_driver = { + .ids = pvcalls_front_ids, + .probe = pvcalls_front_probe, + .remove = pvcalls_front_remove, + .otherend_changed = pvcalls_front_changed, +}; + +static int __init pvcalls_frontend_init(void) +{ + if (!xen_domain()) + return -ENODEV; + + pr_info("Initialising Xen pvcalls frontend driver\n"); + + return xenbus_register_frontend(&pvcalls_front_driver); +} + +module_init(pvcalls_frontend_init); + +MODULE_DESCRIPTION("Xen PV Calls frontend driver"); +MODULE_AUTHOR("Stefano Stabellini <sstabellini@kernel.org>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h new file mode 100644 index 000000000000..3332978f4fcd 
--- /dev/null +++ b/drivers/xen/pvcalls-front.h @@ -0,0 +1,28 @@ +#ifndef __PVCALLS_FRONT_H__ +#define __PVCALLS_FRONT_H__ + +#include <linux/net.h> + +int pvcalls_front_socket(struct socket *sock); +int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr, + int addr_len, int flags); +int pvcalls_front_bind(struct socket *sock, + struct sockaddr *addr, + int addr_len); +int pvcalls_front_listen(struct socket *sock, int backlog); +int pvcalls_front_accept(struct socket *sock, + struct socket *newsock, + int flags); +int pvcalls_front_sendmsg(struct socket *sock, + struct msghdr *msg, + size_t len); +int pvcalls_front_recvmsg(struct socket *sock, + struct msghdr *msg, + size_t len, + int flags); +unsigned int pvcalls_front_poll(struct file *file, + struct socket *sock, + poll_table *wait); +int pvcalls_front_release(struct socket *sock); + +#endif diff --git a/drivers/xen/time.c b/drivers/xen/time.c index ac5f23fcafc2..3e741cd1409c 100644 --- a/drivers/xen/time.c +++ b/drivers/xen/time.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Xen stolen ticks accounting. 
*/ @@ -5,6 +6,7 @@ #include <linux/kernel_stat.h> #include <linux/math64.h> #include <linux/gfp.h> +#include <linux/slab.h> #include <asm/paravirt.h> #include <asm/xen/hypervisor.h> @@ -19,6 +21,8 @@ /* runstate info updated by Xen */ static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate); +static DEFINE_PER_CPU(u64[4], old_runstate_time); + /* return an consistent snapshot of 64-bit time/counter value */ static u64 get64(const u64 *p) { @@ -47,8 +51,8 @@ static u64 get64(const u64 *p) return ret; } -static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res, - unsigned int cpu) +static void xen_get_runstate_snapshot_cpu_delta( + struct vcpu_runstate_info *res, unsigned int cpu) { u64 state_time; struct vcpu_runstate_info *state; @@ -66,6 +70,71 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res, (state_time & XEN_RUNSTATE_UPDATE)); } +static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res, + unsigned int cpu) +{ + int i; + + xen_get_runstate_snapshot_cpu_delta(res, cpu); + + for (i = 0; i < 4; i++) + res->time[i] += per_cpu(old_runstate_time, cpu)[i]; +} + +void xen_manage_runstate_time(int action) +{ + static struct vcpu_runstate_info *runstate_delta; + struct vcpu_runstate_info state; + int cpu, i; + + switch (action) { + case -1: /* backup runstate time before suspend */ + if (unlikely(runstate_delta)) + pr_warn_once("%s: memory leak as runstate_delta is not NULL\n", + __func__); + + runstate_delta = kmalloc_array(num_possible_cpus(), + sizeof(*runstate_delta), + GFP_ATOMIC); + if (unlikely(!runstate_delta)) { + pr_warn("%s: failed to allocate runstate_delta\n", + __func__); + return; + } + + for_each_possible_cpu(cpu) { + xen_get_runstate_snapshot_cpu_delta(&state, cpu); + memcpy(runstate_delta[cpu].time, state.time, + sizeof(runstate_delta[cpu].time)); + } + + break; + + case 0: /* backup runstate time after resume */ + if (unlikely(!runstate_delta)) { + pr_warn("%s: cannot accumulate runstate 
time as runstate_delta is NULL\n", + __func__); + return; + } + + for_each_possible_cpu(cpu) { + for (i = 0; i < 4; i++) + per_cpu(old_runstate_time, cpu)[i] += + runstate_delta[cpu].time[i]; + } + + break; + + default: /* do not accumulate runstate time for checkpointing */ + break; + } + + if (action != -1 && runstate_delta) { + kfree(runstate_delta); + runstate_delta = NULL; + } +} + /* * Runstate accounting */ diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c index e89136ab851e..b437fccd4e62 100644 --- a/drivers/xen/xen-balloon.c +++ b/drivers/xen/xen-balloon.c @@ -57,7 +57,7 @@ static int register_balloon(struct device *dev); static void watch_target(struct xenbus_watch *watch, const char *path, const char *token) { - unsigned long long new_target; + unsigned long long new_target, static_max; int err; static bool watch_fired; static long target_diff; @@ -72,13 +72,20 @@ static void watch_target(struct xenbus_watch *watch, * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. */ new_target >>= PAGE_SHIFT - 10; - if (watch_fired) { - balloon_set_new_target(new_target - target_diff); - return; + + if (!watch_fired) { + watch_fired = true; + err = xenbus_scanf(XBT_NIL, "memory", "static-max", "%llu", + &static_max); + if (err != 1) + static_max = new_target; + else + static_max >>= PAGE_SHIFT - 10; + target_diff = xen_pv_domain() ? 
0 + : static_max - balloon_stats.target_pages; } - watch_fired = true; - target_diff = new_target - balloon_stats.target_pages; + balloon_set_new_target(new_target - target_diff); } static struct xenbus_watch target_watch = { .node = "memory/target", diff --git a/drivers/xen/xen-pciback/Makefile b/drivers/xen/xen-pciback/Makefile index ffe0ad3438bd..e8d981d43235 100644 --- a/drivers/xen/xen-pciback/Makefile +++ b/drivers/xen/xen-pciback/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback.o xen-pciback-y := pci_stub.o pciback_ops.o xenbus.o diff --git a/drivers/xen/xen-pciback/conf_space.c b/drivers/xen/xen-pciback/conf_space.c index 9e9286d0872e..60111719b01f 100644 --- a/drivers/xen/xen-pciback/conf_space.c +++ b/drivers/xen/xen-pciback/conf_space.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * PCI Backend - Functions for creating a virtual configuration space for * exported PCI Devices. diff --git a/drivers/xen/xen-pciback/conf_space.h b/drivers/xen/xen-pciback/conf_space.h index 62461a8ba1d6..22db630717ea 100644 --- a/drivers/xen/xen-pciback/conf_space.h +++ b/drivers/xen/xen-pciback/conf_space.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * PCI Backend - Common data structures for overriding the configuration space * diff --git a/drivers/xen/xen-pciback/conf_space_capability.c b/drivers/xen/xen-pciback/conf_space_capability.c index 7f83e9083e9d..73427d8e0116 100644 --- a/drivers/xen/xen-pciback/conf_space_capability.c +++ b/drivers/xen/xen-pciback/conf_space_capability.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * PCI Backend - Handles the virtual fields found on the capability lists * in the configuration space. 
diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c index 5fbfd9cfb6d6..10ae24b5a76e 100644 --- a/drivers/xen/xen-pciback/conf_space_header.c +++ b/drivers/xen/xen-pciback/conf_space_header.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * PCI Backend - Handles the virtual fields in the configuration space headers. * @@ -169,6 +170,9 @@ static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data) static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data) { struct pci_bar_info *bar = data; + unsigned int pos = (offset - PCI_BASE_ADDRESS_0) / 4; + const struct resource *res = dev->resource; + u32 mask; if (unlikely(!bar)) { pr_warn(DRV_NAME ": driver data not found for %s\n", @@ -179,7 +183,13 @@ static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data) /* A write to obtain the length must happen as a 32-bit write. * This does not (yet) support writing individual bytes */ - if (value == ~0) + if (res[pos].flags & IORESOURCE_IO) + mask = ~PCI_BASE_ADDRESS_IO_MASK; + else if (pos && (res[pos - 1].flags & IORESOURCE_MEM_64)) + mask = 0; + else + mask = ~PCI_BASE_ADDRESS_MEM_MASK; + if ((value | mask) == ~0U) bar->which = 1; else { u32 tmpval; diff --git a/drivers/xen/xen-pciback/conf_space_quirks.c b/drivers/xen/xen-pciback/conf_space_quirks.c index 7476791cab40..89d9744ece61 100644 --- a/drivers/xen/xen-pciback/conf_space_quirks.c +++ b/drivers/xen/xen-pciback/conf_space_quirks.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * PCI Backend - Handle special overlays for broken devices. 
* diff --git a/drivers/xen/xen-pciback/conf_space_quirks.h b/drivers/xen/xen-pciback/conf_space_quirks.h index cfcc517e4570..d873abe35bf6 100644 --- a/drivers/xen/xen-pciback/conf_space_quirks.h +++ b/drivers/xen/xen-pciback/conf_space_quirks.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * PCI Backend - Data structures for special overlays for broken devices. * diff --git a/drivers/xen/xen-pciback/passthrough.c b/drivers/xen/xen-pciback/passthrough.c index f16a30e2a110..66e9b814cc86 100644 --- a/drivers/xen/xen-pciback/passthrough.c +++ b/drivers/xen/xen-pciback/passthrough.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * PCI Backend - Provides restricted access to the real PCI bus topology * to the frontend diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c index 6331a95691a4..9e480fdebe1f 100644 --- a/drivers/xen/xen-pciback/pci_stub.c +++ b/drivers/xen/xen-pciback/pci_stub.c @@ -1172,8 +1172,8 @@ out: return err; } -static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf, - size_t count) +static ssize_t new_slot_store(struct device_driver *drv, const char *buf, + size_t count) { int domain, bus, slot, func; int err; @@ -1189,10 +1189,10 @@ out: err = count; return err; } -static DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add); +static DRIVER_ATTR_WO(new_slot); -static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf, - size_t count) +static ssize_t remove_slot_store(struct device_driver *drv, const char *buf, + size_t count) { int domain, bus, slot, func; int err; @@ -1208,9 +1208,9 @@ out: err = count; return err; } -static DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove); +static DRIVER_ATTR_WO(remove_slot); -static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf) +static ssize_t slots_show(struct device_driver *drv, char *buf) { struct pcistub_device_id *pci_dev_id; size_t count = 0; @@ -1231,9 +1231,9 @@ static ssize_t 
pcistub_slot_show(struct device_driver *drv, char *buf) return count; } -static DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL); +static DRIVER_ATTR_RO(slots); -static ssize_t pcistub_irq_handler_show(struct device_driver *drv, char *buf) +static ssize_t irq_handlers_show(struct device_driver *drv, char *buf) { struct pcistub_device *psdev; struct xen_pcibk_dev_data *dev_data; @@ -1260,11 +1260,10 @@ static ssize_t pcistub_irq_handler_show(struct device_driver *drv, char *buf) spin_unlock_irqrestore(&pcistub_devices_lock, flags); return count; } -static DRIVER_ATTR(irq_handlers, S_IRUSR, pcistub_irq_handler_show, NULL); +static DRIVER_ATTR_RO(irq_handlers); -static ssize_t pcistub_irq_handler_switch(struct device_driver *drv, - const char *buf, - size_t count) +static ssize_t irq_handler_state_store(struct device_driver *drv, + const char *buf, size_t count) { struct pcistub_device *psdev; struct xen_pcibk_dev_data *dev_data; @@ -1301,11 +1300,10 @@ out: err = count; return err; } -static DRIVER_ATTR(irq_handler_state, S_IWUSR, NULL, - pcistub_irq_handler_switch); +static DRIVER_ATTR_WO(irq_handler_state); -static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf, - size_t count) +static ssize_t quirks_store(struct device_driver *drv, const char *buf, + size_t count) { int domain, bus, slot, func, reg, size, mask; int err; @@ -1323,7 +1321,7 @@ out: return err; } -static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf) +static ssize_t quirks_show(struct device_driver *drv, char *buf) { int count = 0; unsigned long flags; @@ -1366,11 +1364,10 @@ out: return count; } -static DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, - pcistub_quirk_add); +static DRIVER_ATTR_RW(quirks); -static ssize_t permissive_add(struct device_driver *drv, const char *buf, - size_t count) +static ssize_t permissive_store(struct device_driver *drv, const char *buf, + size_t count) { int domain, bus, slot, func; int err; @@ -1431,8 +1428,7 @@ 
static ssize_t permissive_show(struct device_driver *drv, char *buf) spin_unlock_irqrestore(&pcistub_devices_lock, flags); return count; } -static DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, - permissive_add); +static DRIVER_ATTR_RW(permissive); static void pcistub_exit(void) { diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h index 7af369b6aaa2..263c059bff90 100644 --- a/drivers/xen/xen-pciback/pciback.h +++ b/drivers/xen/xen-pciback/pciback.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * PCI Backend Common Data Structures & Function Declarations * diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c index f8c77751f330..ee2c891b55c6 100644 --- a/drivers/xen/xen-pciback/pciback_ops.c +++ b/drivers/xen/xen-pciback/pciback_ops.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * PCI Backend Operations - respond to PCI requests from Frontend * diff --git a/drivers/xen/xen-pciback/vpci.c b/drivers/xen/xen-pciback/vpci.c index c99f8bb1c56c..f6ba18191c0f 100644 --- a/drivers/xen/xen-pciback/vpci.c +++ b/drivers/xen/xen-pciback/vpci.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * PCI Backend - Provides a Virtual PCI bus (with real devices) * to the frontend diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c index 3814b44bf1f7..581c4e1a8b82 100644 --- a/drivers/xen/xen-pciback/xenbus.c +++ b/drivers/xen/xen-pciback/xenbus.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * PCI Backend Xenbus Setup - handles setup with frontend and xend * diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c index 66620713242a..55988b8418ee 100644 --- a/drivers/xen/xen-selfballoon.c +++ b/drivers/xen/xen-selfballoon.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /****************************************************************************** * Xen selfballoon driver (and optional frontswap self-shrinking 
driver) * @@ -151,8 +152,8 @@ static unsigned long frontswap_inertia_counter; static void frontswap_selfshrink(void) { static unsigned long cur_frontswap_pages; - static unsigned long last_frontswap_pages; - static unsigned long tgt_frontswap_pages; + unsigned long last_frontswap_pages; + unsigned long tgt_frontswap_pages; last_frontswap_pages = cur_frontswap_pages; cur_frontswap_pages = frontswap_curr_pages(); diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile index 31e2e9050c7a..0c7532110815 100644 --- a/drivers/xen/xenbus/Makefile +++ b/drivers/xen/xenbus/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-y += xenbus.o obj-y += xenbus_dev_frontend.o diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 82a8866758ee..a1c17000129b 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -519,64 +519,6 @@ static int __xenbus_map_ring(struct xenbus_device *dev, return err; } -static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev, - grant_ref_t *gnt_refs, - unsigned int nr_grefs, - void **vaddr) -{ - struct xenbus_map_node *node; - struct vm_struct *area; - pte_t *ptes[XENBUS_MAX_RING_GRANTS]; - phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS]; - int err = GNTST_okay; - int i; - bool leaked; - - *vaddr = NULL; - - if (nr_grefs > XENBUS_MAX_RING_GRANTS) - return -EINVAL; - - node = kzalloc(sizeof(*node), GFP_KERNEL); - if (!node) - return -ENOMEM; - - area = alloc_vm_area(XEN_PAGE_SIZE * nr_grefs, ptes); - if (!area) { - kfree(node); - return -ENOMEM; - } - - for (i = 0; i < nr_grefs; i++) - phys_addrs[i] = arbitrary_virt_to_machine(ptes[i]).maddr; - - err = __xenbus_map_ring(dev, gnt_refs, nr_grefs, node->handles, - phys_addrs, - GNTMAP_host_map | GNTMAP_contains_pte, - &leaked); - if (err) - goto failed; - - node->nr_handles = nr_grefs; - node->pv.area = area; - - spin_lock(&xenbus_valloc_lock); - list_add(&node->next, &xenbus_valloc_pages); - 
spin_unlock(&xenbus_valloc_lock); - - *vaddr = area->addr; - return 0; - -failed: - if (!leaked) - free_vm_area(area); - else - pr_alert("leaking VM area %p size %u page(s)", area, nr_grefs); - - kfree(node); - return err; -} - struct map_ring_valloc_hvm { unsigned int idx; @@ -725,6 +667,65 @@ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr) } EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree); +#ifdef CONFIG_XEN_PV +static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev, + grant_ref_t *gnt_refs, + unsigned int nr_grefs, + void **vaddr) +{ + struct xenbus_map_node *node; + struct vm_struct *area; + pte_t *ptes[XENBUS_MAX_RING_GRANTS]; + phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS]; + int err = GNTST_okay; + int i; + bool leaked; + + *vaddr = NULL; + + if (nr_grefs > XENBUS_MAX_RING_GRANTS) + return -EINVAL; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + + area = alloc_vm_area(XEN_PAGE_SIZE * nr_grefs, ptes); + if (!area) { + kfree(node); + return -ENOMEM; + } + + for (i = 0; i < nr_grefs; i++) + phys_addrs[i] = arbitrary_virt_to_machine(ptes[i]).maddr; + + err = __xenbus_map_ring(dev, gnt_refs, nr_grefs, node->handles, + phys_addrs, + GNTMAP_host_map | GNTMAP_contains_pte, + &leaked); + if (err) + goto failed; + + node->nr_handles = nr_grefs; + node->pv.area = area; + + spin_lock(&xenbus_valloc_lock); + list_add(&node->next, &xenbus_valloc_pages); + spin_unlock(&xenbus_valloc_lock); + + *vaddr = area->addr; + return 0; + +failed: + if (!leaked) + free_vm_area(area); + else + pr_alert("leaking VM area %p size %u page(s)", area, nr_grefs); + + kfree(node); + return err; +} + static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr) { struct xenbus_map_node *node; @@ -788,6 +789,12 @@ static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr) return err; } +static const struct xenbus_ring_ops ring_ops_pv = { + .map = xenbus_map_ring_valloc_pv, + .unmap = 
xenbus_unmap_ring_vfree_pv, +}; +#endif + struct unmap_ring_vfree_hvm { unsigned int idx; @@ -916,11 +923,6 @@ enum xenbus_state xenbus_read_driver_state(const char *path) } EXPORT_SYMBOL_GPL(xenbus_read_driver_state); -static const struct xenbus_ring_ops ring_ops_pv = { - .map = xenbus_map_ring_valloc_pv, - .unmap = xenbus_unmap_ring_vfree_pv, -}; - static const struct xenbus_ring_ops ring_ops_hvm = { .map = xenbus_map_ring_valloc_hvm, .unmap = xenbus_unmap_ring_vfree_hvm, @@ -928,8 +930,10 @@ static const struct xenbus_ring_ops ring_ops_hvm = { void __init xenbus_ring_ops_init(void) { +#ifdef CONFIG_XEN_PV if (!xen_feature(XENFEAT_auto_translated_physmap)) ring_ops = &ring_ops_pv; else +#endif ring_ops = &ring_ops_hvm; } diff --git a/drivers/xen/xenbus/xenbus_dev_backend.c b/drivers/xen/xenbus/xenbus_dev_backend.c index 1126701e212e..edba5fecde4d 100644 --- a/drivers/xen/xenbus/xenbus_dev_backend.c +++ b/drivers/xen/xenbus/xenbus_dev_backend.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c index 19e45ce21f89..07896f4b2736 100644 --- a/drivers/xen/xenbus/xenbus_probe_frontend.c +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -379,10 +379,12 @@ static void xenbus_reset_frontend(char *fe, char *be, int be_state) case XenbusStateConnected: xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosing); xenbus_reset_wait_for_backend(be, XenbusStateClosing); + /* fall through */ case XenbusStateClosing: xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosed); xenbus_reset_wait_for_backend(be, XenbusStateClosed); + /* fall through */ case XenbusStateClosed: xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateInitialising); diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index e46080214955..3e59590c7254 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ 
b/drivers/xen/xenbus/xenbus_xs.c @@ -857,6 +857,8 @@ static int xenwatch_thread(void *unused) struct list_head *ent; struct xs_watch_event *event; + xenwatch_pid = current->pid; + for (;;) { wait_event_interruptible(watch_events_waitq, !list_empty(&watch_events)); @@ -925,7 +927,6 @@ int xs_init(void) task = kthread_run(xenwatch_thread, NULL, "xenwatch"); if (IS_ERR(task)) return PTR_ERR(task); - xenwatch_pid = task->pid; /* shutdown watches for kexec boot */ xs_reset_watches(); diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 967f069385d0..71ddfb4cf61c 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -87,7 +87,6 @@ static int __init xenfs_init(void) if (xen_domain()) return register_filesystem(&xenfs_type); - pr_info("not registering filesystem on non-xen platform\n"); return 0; } diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h index 2c5934ea9b1e..cfe4874b83a7 100644 --- a/drivers/xen/xenfs/xenfs.h +++ b/drivers/xen/xenfs/xenfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _XENFS_XENBUS_H #define _XENFS_XENBUS_H diff --git a/drivers/xen/xenfs/xenstored.c b/drivers/xen/xenfs/xenstored.c index 82fd2a396d96..f59235f9f8a2 100644 --- a/drivers/xen/xenfs/xenstored.c +++ b/drivers/xen/xenfs/xenstored.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/slab.h> #include <linux/types.h> #include <linux/mm.h> diff --git a/drivers/xen/xenfs/xensyms.c b/drivers/xen/xenfs/xensyms.c index c6e2b4a542ea..c6c73a33c44d 100644 --- a/drivers/xen/xenfs/xensyms.c +++ b/drivers/xen/xenfs/xensyms.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/init.h> #include <linux/seq_file.h> #include <linux/fs.h> |