From 7e186bdd0098b34c69fb8067c67340ae610ea499 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 6 May 2011 12:27:50 +0100 Subject: xen: do not clear and mask evtchns in __xen_evtchn_do_upcall Change the irq handler of evtchns and pirqs that don't need EOI (pirqs that correspond to physical edge interrupts) to handle_edge_irq. Use handle_fasteoi_irq for pirqs that need eoi (they generally correspond to level triggered irqs), no risk in loosing interrupts because we have to EOI the irq anyway. This change has the following benefits: - it uses the very same handlers that Linux would use on native for the same irqs (handle_edge_irq for edge irqs and msis, and handle_fasteoi_irq for everything else); - it uses these handlers in the same way native code would use them: it let Linux mask\unmask and ack the irq when Linux want to mask\unmask and ack the irq; - it fixes a problem occurring when a driver calls disable_irq() in its handler: the old code was unconditionally unmasking the evtchn even if the irq is disabled when irq_eoi was called. See Documentation/DocBook/genericirq.tmpl for more informations. Signed-off-by: Stefano Stabellini [v1: Fixed space/tab issues] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 113 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 73 insertions(+), 40 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 33167b43ac7e..0ae1d4d7e18c 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -118,6 +118,8 @@ static DEFINE_PER_CPU(unsigned long [NR_EVENT_CHANNELS/BITS_PER_LONG], static struct irq_chip xen_dynamic_chip; static struct irq_chip xen_percpu_chip; static struct irq_chip xen_pirq_chip; +static void enable_dynirq(struct irq_data *data); +static void disable_dynirq(struct irq_data *data); /* Get info for IRQ */ static struct irq_info *info_for_irq(unsigned irq) @@ -473,16 +475,6 @@ static void xen_free_irq(unsigned irq) irq_free_desc(irq); } -static void pirq_unmask_notify(int irq) -{ - struct physdev_eoi eoi = { .irq = pirq_from_irq(irq) }; - - if (unlikely(pirq_needs_eoi(irq))) { - int rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); - WARN_ON(rc); - } -} - static void pirq_query_unmask(int irq) { struct physdev_irq_status_query irq_status; @@ -506,6 +498,29 @@ static bool probing_irq(int irq) return desc && desc->action == NULL; } +static void eoi_pirq(struct irq_data *data) +{ + int evtchn = evtchn_from_irq(data->irq); + struct physdev_eoi eoi = { .irq = pirq_from_irq(data->irq) }; + int rc = 0; + + irq_move_irq(data); + + if (VALID_EVTCHN(evtchn)) + clear_evtchn(evtchn); + + if (pirq_needs_eoi(data->irq)) { + rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); + WARN_ON(rc); + } +} + +static void mask_ack_pirq(struct irq_data *data) +{ + disable_dynirq(data); + eoi_pirq(data); +} + static unsigned int __startup_pirq(unsigned int irq) { struct evtchn_bind_pirq bind_pirq; @@ -539,7 +554,7 @@ static unsigned int __startup_pirq(unsigned int irq) out: unmask_evtchn(evtchn); - pirq_unmask_notify(irq); + eoi_pirq(irq_get_irq_data(irq)); return 0; } @@ -579,18 +594,7 @@ static void enable_pirq(struct irq_data *data) static void disable_pirq(struct irq_data *data) { -} - -static void ack_pirq(struct irq_data *data) -{ - int evtchn = evtchn_from_irq(data->irq); - - irq_move_irq(data); - - if (VALID_EVTCHN(evtchn)) { - mask_evtchn(evtchn); - clear_evtchn(evtchn); - } + disable_dynirq(data); } static int find_irq_by_gsi(unsigned gsi) @@ -639,9 +643,6 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, if (irq < 0) goto out; - irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_level_irq, - name); - irq_op.irq = irq; irq_op.vector = 0; @@ -658,6 +659,32 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, xen_irq_info_pirq_init(irq, 0, pirq, gsi, irq_op.vector, shareable ? PIRQ_SHAREABLE : 0); + pirq_query_unmask(irq); + /* We try to use the handler with the appropriate semantic for the + * type of interrupt: if the interrupt doesn't need an eoi + * (pirq_needs_eoi returns false), we treat it like an edge + * triggered interrupt so we use handle_edge_irq. + * As a matter of fact this only happens when the corresponding + * physical interrupt is edge triggered or an msi. + * + * On the other hand if the interrupt needs an eoi (pirq_needs_eoi + * returns true) we treat it like a level triggered interrupt so we + * use handle_fasteoi_irq like the native code does for this kind of + * interrupts. + * Depending on the Xen version, pirq_needs_eoi might return true + * not only for level triggered interrupts but for edge triggered + * interrupts too. In any case Xen always honors the eoi mechanism, + * not injecting any more pirqs of the same kind if the first one + * hasn't received an eoi yet. Therefore using the fasteoi handler + * is the right choice either way. + */ + if (pirq_needs_eoi(irq)) + irq_set_chip_and_handler_name(irq, &xen_pirq_chip, + handle_fasteoi_irq, name); + else + irq_set_chip_and_handler_name(irq, &xen_pirq_chip, + handle_edge_irq, name); + out: spin_unlock(&irq_mapping_update_lock); @@ -690,8 +717,8 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, if (irq == -1) goto out; - irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_level_irq, - name); + irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_edge_irq, + name); xen_irq_info_pirq_init(irq, 0, pirq, 0, vector, 0); ret = irq_set_msi_desc(irq, msidesc); @@ -773,7 +800,7 @@ int bind_evtchn_to_irq(unsigned int evtchn) goto out; irq_set_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_fasteoi_irq, "event"); + handle_edge_irq, "event"); xen_irq_info_evtchn_init(irq, evtchn); } @@ -1179,9 +1206,6 @@ static void __xen_evtchn_do_upcall(void) port = (word_idx * BITS_PER_LONG) + bit_idx; irq = evtchn_to_irq[port]; - mask_evtchn(port); - clear_evtchn(port); - if (irq != -1) { desc = irq_to_desc(irq); if (desc) @@ -1337,10 +1361,16 @@ static void ack_dynirq(struct irq_data *data) { int evtchn = evtchn_from_irq(data->irq); - irq_move_masked_irq(data); + irq_move_irq(data); if (VALID_EVTCHN(evtchn)) - unmask_evtchn(evtchn); + clear_evtchn(evtchn); +} + +static void mask_ack_dynirq(struct irq_data *data) +{ + disable_dynirq(data); + ack_dynirq(data); } static int retrigger_dynirq(struct irq_data *data) @@ -1535,7 +1565,9 @@ static struct irq_chip xen_dynamic_chip __read_mostly = { .irq_mask = disable_dynirq, .irq_unmask = enable_dynirq, - .irq_eoi = ack_dynirq, + .irq_ack = ack_dynirq, + .irq_mask_ack = mask_ack_dynirq, + .irq_set_affinity = set_affinity_irq, .irq_retrigger = retrigger_dynirq, }; @@ -1545,14 +1577,15 @@ static struct irq_chip xen_pirq_chip __read_mostly = { .irq_startup = startup_pirq, .irq_shutdown = shutdown_pirq, - .irq_enable = enable_pirq, - .irq_unmask = enable_pirq, - .irq_disable = disable_pirq, - .irq_mask = disable_pirq, - .irq_ack = ack_pirq, + .irq_mask = disable_dynirq, + .irq_unmask = enable_dynirq, + + .irq_ack = eoi_pirq, + .irq_eoi = eoi_pirq, + .irq_mask_ack = mask_ack_pirq, .irq_set_affinity = set_affinity_irq, -- cgit v1.2.3 From 7899891c7d161752f29abcc9bc0a9c6c3a3af26c Mon Sep 17 00:00:00 2001 From: "Tian, Kevin" Date: Thu, 12 May 2011 10:56:08 +0800 Subject: xen mmu: fix a race window causing leave_mm BUG() There's a race window in xen_drop_mm_ref, where remote cpu may exit dirty bitmap between the check on this cpu and the point where remote cpu handles drop request. So in drop_other_mm_ref we need check whether TLB state is still lazy before calling into leave_mm. This bug is rarely observed in earlier kernel, but exaggerated by the commit 831d52bc153971b70e64eccfbed2b232394f22f8 ("x86, mm: avoid possible bogus tlb entries by clearing prev mm_cpumask after switching mm") which clears bitmap after changing the TLB state. the call trace is as below: --------------------------------- kernel BUG at arch/x86/mm/tlb.c:61! invalid opcode: 0000 [#1] SMP last sysfs file: /sys/devices/system/xen_memory/xen_memory0/info/current_kb CPU 1 Modules linked in: 8021q garp xen_netback xen_blkback blktap blkback_pagemap nbd bridge stp llc autofs4 ipmi_devintf ipmi_si ipmi_msghandler lockd sunrpc bonding ipv6 xenfs dm_multipath video output sbs sbshc parport_pc lp parport ses enclosure snd_seq_dummy snd_seq_oss snd_seq_midi_event snd_seq snd_seq_device serio_raw bnx2 snd_pcm_oss snd_mixer_oss snd_pcm snd_timer iTCO_wdt snd soundcore snd_page_alloc i2c_i801 iTCO_vendor_support i2c_core pcs pkr pata_acpi ata_generic ata_piix shpchp mptsas mptscsih mptbase [last unloaded: freq_table] Pid: 25581, comm: khelper Not tainted 2.6.32.36fixxen #1 Tecal RH2285 RIP: e030:[] [] leave_mm+0x15/0x46 RSP: e02b:ffff88002805be48 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 0000000000000001 RCX: ffff88015f8e2da0 RDX: ffff88002805be78 RSI: 0000000000000000 RDI: 0000000000000001 RBP: ffff88002805be48 R08: ffff88009d662000 R09: dead000000200200 R10: dead000000100100 R11: ffffffff814472b2 R12: ffff88009bfc1880 R13: ffff880028063020 R14: 00000000000004f6 R15: 0000000000000000 FS: 00007f62362d66e0(0000) GS:ffff880028058000(0000) knlGS:0000000000000000 CS: e033 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000003aabc11909 CR3: 000000009b8ca000 CR4: 0000000000002660 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 00000000000000 00 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process khelper (pid: 25581, threadinfo ffff88007691e000, task ffff88009b92db40) Stack: ffff88002805be68 ffffffff8100e4ae 0000000000000001 ffff88009d733b88 <0> ffff88002805be98 ffffffff81087224 ffff88002805be78 ffff88002805be78 <0> ffff88015f808360 00000000000004f6 ffff88002805bea8 ffffffff81010108 Call Trace: [] drop_other_mm_ref+0x2a/0x53 [] generic_smp_call_function_single_interrupt+0xd8/0xfc [] xen_call_function_single_interrupt+0x13/0x28 [] handle_IRQ_event+0x66/0x120 [] handle_percpu_irq+0x41/0x6e [] __xen_evtchn_do_upcall+0x1ab/0x27d [] xen_evtchn_do_upcall+0x33/0x46 [] xen_do_hyper visor_callback+0x1e/0x30 [] ? _spin_unlock_irqrestore+0x15/0x17 [] ? xen_restore_fl_direct_end+0x0/0x1 [] ? flush_old_exec+0x3ac/0x500 [] ? load_elf_binary+0x0/0x17ef [] ? load_elf_binary+0x0/0x17ef [] ? load_elf_binary+0x398/0x17ef [] ? need_resched+0x23/0x2d [] ? process_measurement+0xc0/0xd7 [] ? load_elf_binary+0x0/0x17ef [] ? search_binary_handler+0xc8/0x255 [] ? do_execve+0x1c3/0x29e [] ? sys_execve+0x43/0x5d [] ? __call_usermodehelper+0x0/0x6f [] ? kernel_execve+0x68/0xd0 [] ? __call_usermodehelper+0x0/0x6f [] ? xen_restore_fl_direct_end+0x0/0x1 [] ? ____call_usermodehelper+0x113/0x11e [] ? child_rip+0xa/0x20 [] ? __call_usermodehelper+0x0/0x6f [] ? int_ret_from_sys_call+0x7/0x1b [] ? retint_restore_args+0x5/0x6 [] ? child_rip+0x0/0x20 Code: 41 5e 41 5f c9 c3 55 48 89 e5 0f 1f 44 00 00 e8 17 ff ff ff c9 c3 55 48 89 e5 0f 1f 44 00 00 65 8b 04 25 c8 55 01 00 ff c8 75 04 <0f> 0b eb fe 65 48 8b 34 25 c0 55 01 00 48 81 c6 b8 02 00 00 e8 RIP [] leave_mm+0x15/0x46 RSP ---[ end trace ce9cee6832a9c503 ]--- Tested-by: Maoxiaoyun Signed-off-by: Kevin Tian [v1: Fleshed out the git description a bit] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5e92b61ad574..4fd7387222bf 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1140,7 +1140,7 @@ static void drop_other_mm_ref(void *info) active_mm = percpu_read(cpu_tlbstate.active_mm); - if (active_mm == mm) + if (active_mm == mm && percpu_read(cpu_tlbstate.state) != TLBSTATE_OK) leave_mm(smp_processor_id()); /* If this cpu still has a stale cr3 reference, then make sure -- cgit v1.2.3 From 15bfc094517db2ddf38ca7ed47f3a1c0ad24f7c4 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 12 Apr 2011 07:57:15 -0400 Subject: xen/setup: Ignore E820_UNUSABLE when setting 1-1 mappings. When we parse the raw E820, the Xen hypervisor can set "E820_RAM" to "E820_UNUSABLE" if the mem=X argument is used. As such we should _not_ consider the E820_UNUSABLE as an 1-1 identity mapping, but instead use the same case as for E820_RAM. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 90bac0aac3a5..fba4a6cc3a2d 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -166,7 +166,7 @@ static unsigned long __init xen_set_identity(const struct e820entry *list, if (last > end) continue; - if (entry->type == E820_RAM) { + if ((entry->type == E820_RAM) || (entry->type == E820_UNUSABLE)) { if (start > start_pci) identity += set_phys_range_identity( PFN_UP(start_pci), PFN_DOWN(start)); -- cgit v1.2.3 From 0f16d0dfcdb5aab97d9e368f008b070b5b3ec6d3 Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Wed, 11 May 2011 22:34:38 +0200 Subject: xen/setup: Fix for incorrect xen_extra_mem_start initialization under 32-bit git commit 24bdb0b62cc82120924762ae6bc85afc8c3f2b26 (xen: do not create the extra e820 region at an addr lower than 4G) does not take into account that ifdef CONFIG_X86_32 instead of e820_end_of_low_ram_pfn() find_low_pfn_range() is called (both calls are from arch/x86/kernel/setup.c). find_low_pfn_range() behaves correctly and does not require change in xen_extra_mem_start initialization. Additionally, if xen_extra_mem_start is initialized in the same way as ifdef CONFIG_X86_64 then memory hotplug support for Xen balloon driver (under development) is broken. Signed-off-by: Daniel Kiper Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index fba4a6cc3a2d..ca6297bd4e3c 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -227,7 +227,11 @@ char * __init xen_memory_setup(void) memcpy(map_raw, map, sizeof(map)); e820.nr_map = 0; +#ifdef CONFIG_X86_32 + xen_extra_mem_start = mem_end; +#else xen_extra_mem_start = max((1ULL << 32), mem_end); +#endif for (i = 0; i < memmap.nr_entries; i++) { unsigned long long end; -- cgit v1.2.3 From 8c5950881c3b5e6e350e4b0438a8ccc513d90df9 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 1 Apr 2011 15:18:48 -0400 Subject: xen/p2m: Create entries in the P2M_MFN trees's to track 1-1 mappings .. when applicable. We need to track in the p2m_mfn and p2m_mfn_p the MFNs and pointers, respectivly, for the P2M entries that are allocated for the identity mappings. Without this, a PV domain with an E820 that triggers the 1-1 mapping to kick in, won't be able to be restored as the P2M won't have the identity mappings. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 141eb0de8b06..a01e6532b46a 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -522,11 +522,20 @@ static bool __init __early_alloc_p2m(unsigned long pfn) /* Boundary cross-over for the edges: */ if (idx) { unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); + unsigned long *mid_mfn_p; p2m_init(p2m); p2m_top[topidx][mididx] = p2m; + /* For save/restore we need to MFN of the P2M saved */ + + mid_mfn_p = p2m_top_mfn_p[topidx]; + WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), + "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", + topidx, mididx); + mid_mfn_p[mididx] = virt_to_mfn(p2m); + } return idx != 0; } @@ -549,12 +558,29 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s, pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); - if (p2m_top[topidx] == p2m_mid_missing) { - unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + unsigned long *mid_mfn_p; + unsigned long **mid; + + mid = p2m_top[topidx]; + mid_mfn_p = p2m_top_mfn_p[topidx]; + if (mid == p2m_mid_missing) { + mid = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_mid_init(mid); p2m_top[topidx] = mid; + + BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); + } + /* And the save/restore P2M tables.. */ + if (mid_mfn_p == p2m_mid_missing_mfn) { + mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(mid_mfn_p); + + p2m_top_mfn_p[topidx] = mid_mfn_p; + p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); + /* Note: we don't set mid_mfn_p[midix] here, + * look in __early_alloc_p2m */ } } -- cgit v1.2.3