path: root/virt
author     Patrick McHardy <kaber@trash.net>    2013-03-31 18:10:34 +0200
committer  Patrick McHardy <kaber@trash.net>    2013-03-31 18:10:34 +0200
commit     70711d223510ba1773cfe1d7770a56141c815ff8 (patch)
tree       4a71f38a3a554ddecaa31b7d8c6bc49b7d1705b4 /virt
parent     d53b4ed072d9779cdf53582c46436dec06d0961f (diff)
parent     19f949f52599ba7c3f67a5897ac6be14bfcb1200 (diff)
Merge tag 'v3.8' of /home/kaber/src/repos/linux
Linux 3.8

Signed-off-by: Patrick McHardy <kaber@trash.net>

Conflicts:
	include/linux/Kbuild
	include/linux/netlink.h
Diffstat (limited to 'virt')
-rw-r--r--  virt/kvm/Kconfig        |    3
-rw-r--r--  virt/kvm/assigned-dev.c |   36
-rw-r--r--  virt/kvm/async_pf.c     |   11
-rw-r--r--  virt/kvm/eventfd.c      |  158
-rw-r--r--  virt/kvm/ioapic.c       |   48
-rw-r--r--  virt/kvm/ioapic.h       |    4
-rw-r--r--  virt/kvm/iommu.c        |   26
-rw-r--r--  virt/kvm/irq_comm.c     |  131
-rw-r--r--  virt/kvm/kvm_main.c     |  638
9 files changed, 692 insertions(+), 363 deletions(-)
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 28694f4a913..d01b24b72c6 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -21,3 +21,6 @@ config KVM_ASYNC_PF
config HAVE_KVM_MSI
bool
+
+config HAVE_KVM_CPU_RELAX_INTERCEPT
+ bool
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 23a41a9f8db..3642239252b 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -105,6 +105,15 @@ static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
}
#ifdef __KVM_HAVE_MSI
+static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+ int ret = kvm_set_irq_inatomic(assigned_dev->kvm,
+ assigned_dev->irq_source_id,
+ assigned_dev->guest_irq, 1);
+ return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
+}
+
static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
{
struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
@@ -117,6 +126,23 @@ static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
#endif
#ifdef __KVM_HAVE_MSIX
+static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+ int index = find_index_from_host_irq(assigned_dev, irq);
+ u32 vector;
+ int ret = 0;
+
+ if (index >= 0) {
+ vector = assigned_dev->guest_msix_entries[index].vector;
+ ret = kvm_set_irq_inatomic(assigned_dev->kvm,
+ assigned_dev->irq_source_id,
+ vector, 1);
+ }
+
+ return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
+}
+
static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
{
struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
@@ -334,11 +360,6 @@ static int assigned_device_enable_host_intx(struct kvm *kvm,
}
#ifdef __KVM_HAVE_MSI
-static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
-{
- return IRQ_WAKE_THREAD;
-}
-
static int assigned_device_enable_host_msi(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev)
{
@@ -363,11 +384,6 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
#endif
#ifdef __KVM_HAVE_MSIX
-static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
-{
- return IRQ_WAKE_THREAD;
-}
-
static int assigned_device_enable_host_msix(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev)
{
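
The two new hard-irq handlers above return IRQ_WAKE_THREAD only when kvm_set_irq_inatomic() reports -EWOULDBLOCK, so the slower threaded handlers run only when the atomic fast path cannot deliver the interrupt. A minimal kernel-context sketch of how such a handler pair is typically wired up; the wrapper name and the devname string are illustrative, not taken from this patch:

/* Illustrative only: a primary handler that tries the atomic path and a
 * threaded handler as fallback, registered with request_threaded_irq(). */
static int sketch_setup_msi_irq(struct kvm_assigned_dev_kernel *dev)
{
        /* dev->host_irq and the "kvm-assigned-msi" name are assumptions */
        return request_threaded_irq(dev->host_irq,
                                    kvm_assigned_dev_msi,        /* hard-irq context */
                                    kvm_assigned_dev_thread_msi, /* process context  */
                                    0, "kvm-assigned-msi", dev);
}
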
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 74268b4c2ee..ea475cd0351 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -111,8 +111,8 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
list_entry(vcpu->async_pf.done.next,
typeof(*work), link);
list_del(&work->link);
- if (work->page)
- put_page(work->page);
+ if (!is_error_page(work->page))
+ kvm_release_page_clean(work->page);
kmem_cache_free(async_pf_cache, work);
}
spin_unlock(&vcpu->async_pf.lock);
@@ -138,8 +138,8 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
list_del(&work->queue);
vcpu->async_pf.queued--;
- if (work->page)
- put_page(work->page);
+ if (!is_error_page(work->page))
+ kvm_release_page_clean(work->page);
kmem_cache_free(async_pf_cache, work);
}
}
@@ -203,8 +203,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
if (!work)
return -ENOMEM;
- work->page = bad_page;
- get_page(bad_page);
+ work->page = KVM_ERR_PTR_BAD_PAGE;
INIT_LIST_HEAD(&work->queue); /* for list_del to work */
spin_lock(&vcpu->async_pf.lock);
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 7d7e2aaffec..b6eea5cc7b3 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -35,6 +35,7 @@
#include "iodev.h"
+#ifdef __KVM_HAVE_IOAPIC
/*
* --------------------------------------------------------------------
* irqfd: Allows an fd to be used to inject an interrupt to the guest
@@ -43,6 +44,31 @@
* --------------------------------------------------------------------
*/
+/*
+ * Resampling irqfds are a special variety of irqfds used to emulate
+ * level triggered interrupts. The interrupt is asserted on eventfd
+ * trigger. On acknowledgement through the irq ack notifier, the
+ * interrupt is de-asserted and userspace is notified through the
+ * resamplefd. All resamplers on the same gsi are de-asserted
+ * together, so we don't need to track the state of each individual
+ * user. We can also therefore share the same irq source ID.
+ */
+struct _irqfd_resampler {
+ struct kvm *kvm;
+ /*
+ * List of resampling struct _irqfd objects sharing this gsi.
+ * RCU list modified under kvm->irqfds.resampler_lock
+ */
+ struct list_head list;
+ struct kvm_irq_ack_notifier notifier;
+ /*
+ * Entry in list of kvm->irqfds.resampler_list. Used for sharing
+ * resamplers among irqfds on the same gsi.
+ * Accessed and modified under kvm->irqfds.resampler_lock
+ */
+ struct list_head link;
+};
+
struct _irqfd {
/* Used for MSI fast-path */
struct kvm *kvm;
@@ -52,6 +78,12 @@ struct _irqfd {
/* Used for level IRQ fast-path */
int gsi;
struct work_struct inject;
+ /* The resampler used by this irqfd (resampler-only) */
+ struct _irqfd_resampler *resampler;
+ /* Eventfd notified on resample (resampler-only) */
+ struct eventfd_ctx *resamplefd;
+ /* Entry in list of irqfds for a resampler (resampler-only) */
+ struct list_head resampler_link;
/* Used for setup/shutdown */
struct eventfd_ctx *eventfd;
struct list_head list;
@@ -67,8 +99,58 @@ irqfd_inject(struct work_struct *work)
struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
struct kvm *kvm = irqfd->kvm;
- kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
- kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+ if (!irqfd->resampler) {
+ kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
+ kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+ } else
+ kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+ irqfd->gsi, 1);
+}
+
+/*
+ * Since resampler irqfds share an IRQ source ID, we de-assert once
+ * then notify all of the resampler irqfds using this GSI. We can't
+ * do multiple de-asserts or we risk racing with incoming re-asserts.
+ */
+static void
+irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
+{
+ struct _irqfd_resampler *resampler;
+ struct _irqfd *irqfd;
+
+ resampler = container_of(kian, struct _irqfd_resampler, notifier);
+
+ kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+ resampler->notifier.gsi, 0);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link)
+ eventfd_signal(irqfd->resamplefd, 1);
+
+ rcu_read_unlock();
+}
+
+static void
+irqfd_resampler_shutdown(struct _irqfd *irqfd)
+{
+ struct _irqfd_resampler *resampler = irqfd->resampler;
+ struct kvm *kvm = resampler->kvm;
+
+ mutex_lock(&kvm->irqfds.resampler_lock);
+
+ list_del_rcu(&irqfd->resampler_link);
+ synchronize_rcu();
+
+ if (list_empty(&resampler->list)) {
+ list_del(&resampler->link);
+ kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
+ kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+ resampler->notifier.gsi, 0);
+ kfree(resampler);
+ }
+
+ mutex_unlock(&kvm->irqfds.resampler_lock);
}
/*
@@ -90,7 +172,12 @@ irqfd_shutdown(struct work_struct *work)
* We know no new events will be scheduled at this point, so block
* until all previously outstanding events have completed
*/
- flush_work_sync(&irqfd->inject);
+ flush_work(&irqfd->inject);
+
+ if (irqfd->resampler) {
+ irqfd_resampler_shutdown(irqfd);
+ eventfd_ctx_put(irqfd->resamplefd);
+ }
/*
* It is now safe to release the object's resources
@@ -203,7 +290,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
struct kvm_irq_routing_table *irq_rt;
struct _irqfd *irqfd, *tmp;
struct file *file = NULL;
- struct eventfd_ctx *eventfd = NULL;
+ struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
int ret;
unsigned int events;
@@ -231,6 +318,54 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
irqfd->eventfd = eventfd;
+ if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
+ struct _irqfd_resampler *resampler;
+
+ resamplefd = eventfd_ctx_fdget(args->resamplefd);
+ if (IS_ERR(resamplefd)) {
+ ret = PTR_ERR(resamplefd);
+ goto fail;
+ }
+
+ irqfd->resamplefd = resamplefd;
+ INIT_LIST_HEAD(&irqfd->resampler_link);
+
+ mutex_lock(&kvm->irqfds.resampler_lock);
+
+ list_for_each_entry(resampler,
+ &kvm->irqfds.resampler_list, link) {
+ if (resampler->notifier.gsi == irqfd->gsi) {
+ irqfd->resampler = resampler;
+ break;
+ }
+ }
+
+ if (!irqfd->resampler) {
+ resampler = kzalloc(sizeof(*resampler), GFP_KERNEL);
+ if (!resampler) {
+ ret = -ENOMEM;
+ mutex_unlock(&kvm->irqfds.resampler_lock);
+ goto fail;
+ }
+
+ resampler->kvm = kvm;
+ INIT_LIST_HEAD(&resampler->list);
+ resampler->notifier.gsi = irqfd->gsi;
+ resampler->notifier.irq_acked = irqfd_resampler_ack;
+ INIT_LIST_HEAD(&resampler->link);
+
+ list_add(&resampler->link, &kvm->irqfds.resampler_list);
+ kvm_register_irq_ack_notifier(kvm,
+ &resampler->notifier);
+ irqfd->resampler = resampler;
+ }
+
+ list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
+ synchronize_rcu();
+
+ mutex_unlock(&kvm->irqfds.resampler_lock);
+ }
+
/*
* Install our own custom wake-up handling so we are notified via
* a callback whenever someone signals the underlying eventfd
@@ -276,6 +411,12 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
return 0;
fail:
+ if (irqfd->resampler)
+ irqfd_resampler_shutdown(irqfd);
+
+ if (resamplefd && !IS_ERR(resamplefd))
+ eventfd_ctx_put(resamplefd);
+
if (eventfd && !IS_ERR(eventfd))
eventfd_ctx_put(eventfd);
@@ -285,15 +426,21 @@ fail:
kfree(irqfd);
return ret;
}
+#endif
void
kvm_eventfd_init(struct kvm *kvm)
{
+#ifdef __KVM_HAVE_IOAPIC
spin_lock_init(&kvm->irqfds.lock);
INIT_LIST_HEAD(&kvm->irqfds.items);
+ INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
+ mutex_init(&kvm->irqfds.resampler_lock);
+#endif
INIT_LIST_HEAD(&kvm->ioeventfds);
}
+#ifdef __KVM_HAVE_IOAPIC
/*
* shutdown any irqfd's that match fd+gsi
*/
@@ -340,7 +487,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
int
kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
{
- if (args->flags & ~KVM_IRQFD_FLAG_DEASSIGN)
+ if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
return -EINVAL;
if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
@@ -413,6 +560,7 @@ static void __exit irqfd_module_exit(void)
module_init(irqfd_module_init);
module_exit(irqfd_module_exit);
+#endif
/*
* --------------------------------------------------------------------
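
Taken together with the KVM_IRQFD_FLAG_RESAMPLE handling above, userspace can now emulate a level-triggered interrupt: writing to the irqfd asserts the GSI, and the resamplefd fires on guest EOI so the device model can re-check its level and re-assert if needed. A hedged userspace sketch; vm_fd, the two eventfds and the gsi are assumptions supplied by the caller:

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

/* Hedged sketch: vm_fd, irq_efd, resample_efd and gsi come from elsewhere. */
static int assign_resample_irqfd(int vm_fd, int irq_efd, int resample_efd, int gsi)
{
        struct kvm_irqfd irqfd = {
                .fd         = irq_efd,               /* eventfd that asserts the GSI */
                .gsi        = gsi,
                .flags      = KVM_IRQFD_FLAG_RESAMPLE,
                .resamplefd = resample_efd,          /* signalled on guest EOI       */
        };

        if (ioctl(vm_fd, KVM_IRQFD, &irqfd) < 0) {
                perror("KVM_IRQFD");
                return -1;
        }
        return 0;
}
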
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 26fd54dc459..cfb7e4d52dc 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -191,36 +191,50 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe);
}
-int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
+ int level)
{
u32 old_irr;
u32 mask = 1 << irq;
union kvm_ioapic_redirect_entry entry;
- int ret = 1;
+ int ret, irq_level;
+
+ BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
spin_lock(&ioapic->lock);
old_irr = ioapic->irr;
- if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
- entry = ioapic->redirtbl[irq];
- level ^= entry.fields.polarity;
- if (!level)
- ioapic->irr &= ~mask;
- else {
- int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
- ioapic->irr |= mask;
- if ((edge && old_irr != ioapic->irr) ||
- (!edge && !entry.fields.remote_irr))
- ret = ioapic_service(ioapic, irq);
- else
- ret = 0; /* report coalesced interrupt */
- }
- trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
+ irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
+ irq_source_id, level);
+ entry = ioapic->redirtbl[irq];
+ irq_level ^= entry.fields.polarity;
+ if (!irq_level) {
+ ioapic->irr &= ~mask;
+ ret = 1;
+ } else {
+ int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+ ioapic->irr |= mask;
+ if ((edge && old_irr != ioapic->irr) ||
+ (!edge && !entry.fields.remote_irr))
+ ret = ioapic_service(ioapic, irq);
+ else
+ ret = 0; /* report coalesced interrupt */
}
+ trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
spin_unlock(&ioapic->lock);
return ret;
}
+void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
+{
+ int i;
+
+ spin_lock(&ioapic->lock);
+ for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
+ __clear_bit(irq_source_id, &ioapic->irq_states[i]);
+ spin_unlock(&ioapic->lock);
+}
+
static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
int trigger_mode)
{
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 32872a09b63..a30abfe6ed1 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -74,7 +74,9 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector);
int kvm_ioapic_init(struct kvm *kvm);
void kvm_ioapic_destroy(struct kvm *kvm);
-int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
+ int level);
+void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq);
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index e9fff9830bf..4a340cb2301 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -42,21 +42,21 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
static void kvm_iommu_put_pages(struct kvm *kvm,
gfn_t base_gfn, unsigned long npages);
-static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
- gfn_t gfn, unsigned long size)
+static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
+ unsigned long size)
{
gfn_t end_gfn;
pfn_t pfn;
- pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
+ pfn = gfn_to_pfn_memslot(slot, gfn);
end_gfn = gfn + (size >> PAGE_SHIFT);
gfn += 1;
- if (is_error_pfn(pfn))
+ if (is_error_noslot_pfn(pfn))
return pfn;
while (gfn < end_gfn)
- gfn_to_pfn_memslot(kvm, slot, gfn++);
+ gfn_to_pfn_memslot(slot, gfn++);
return pfn;
}
@@ -105,8 +105,8 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
* Pin all pages we are about to map in memory. This is
* important because we unmap and unpin in 4kb steps later.
*/
- pfn = kvm_pin_pages(kvm, slot, gfn, page_size);
- if (is_error_pfn(pfn)) {
+ pfn = kvm_pin_pages(slot, gfn, page_size);
+ if (is_error_noslot_pfn(pfn)) {
gfn += 1;
continue;
}
@@ -168,11 +168,7 @@ int kvm_assign_device(struct kvm *kvm,
r = iommu_attach_device(domain, &pdev->dev);
if (r) {
- printk(KERN_ERR "assign device %x:%x:%x.%x failed",
- pci_domain_nr(pdev->bus),
- pdev->bus->number,
- PCI_SLOT(pdev->devfn),
- PCI_FUNC(pdev->devfn));
+ dev_err(&pdev->dev, "kvm assign device failed ret %d", r);
return r;
}
@@ -300,6 +296,12 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
/* Get physical address */
phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
+
+ if (!phys) {
+ gfn++;
+ continue;
+ }
+
pfn = phys >> PAGE_SHIFT;
/* Unmap address from IO address space */
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 5afb4311402..656fa455e15 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -33,26 +33,12 @@
#include "ioapic.h"
-static inline int kvm_irq_line_state(unsigned long *irq_state,
- int irq_source_id, int level)
-{
- /* Logical OR for level trig interrupt */
- if (level)
- set_bit(irq_source_id, irq_state);
- else
- clear_bit(irq_source_id, irq_state);
-
- return !!(*irq_state);
-}
-
static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level)
{
#ifdef CONFIG_X86
struct kvm_pic *pic = pic_irqchip(kvm);
- level = kvm_irq_line_state(&pic->irq_states[e->irqchip.pin],
- irq_source_id, level);
- return kvm_pic_set_irq(pic, e->irqchip.pin, level);
+ return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
#else
return -1;
#endif
@@ -62,10 +48,7 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level)
{
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
- level = kvm_irq_line_state(&ioapic->irq_states[e->irqchip.pin],
- irq_source_id, level);
-
- return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
+ return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level);
}
inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
@@ -85,8 +68,13 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_vcpu *vcpu, *lowest = NULL;
if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
- kvm_is_dm_lowest_prio(irq))
+ kvm_is_dm_lowest_prio(irq)) {
printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
+ irq->delivery_mode = APIC_DM_FIXED;
+ }
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r))
+ return r;
kvm_for_each_vcpu(i, vcpu, kvm) {
if (!kvm_apic_present(vcpu))
@@ -114,6 +102,23 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
return r;
}
+static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_lapic_irq *irq)
+{
+ trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
+
+ irq->dest_id = (e->msi.address_lo &
+ MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
+ irq->vector = (e->msi.data &
+ MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
+ irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
+ irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
+ irq->delivery_mode = e->msi.data & 0x700;
+ irq->level = 1;
+ irq->shorthand = 0;
+ /* TODO Deal with RH bit of MSI message address */
+}
+
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level)
{
@@ -122,22 +127,26 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
if (!level)
return -1;
- trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
+ kvm_set_msi_irq(e, &irq);
- irq.dest_id = (e->msi.address_lo &
- MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
- irq.vector = (e->msi.data &
- MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
- irq.dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
- irq.trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
- irq.delivery_mode = e->msi.data & 0x700;
- irq.level = 1;
- irq.shorthand = 0;
-
- /* TODO Deal with RH bit of MSI message address */
return kvm_irq_delivery_to_apic(kvm, NULL, &irq);
}
+
+static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm)
+{
+ struct kvm_lapic_irq irq;
+ int r;
+
+ kvm_set_msi_irq(e, &irq);
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r))
+ return r;
+ else
+ return -EWOULDBLOCK;
+}
+
int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
{
struct kvm_kernel_irq_routing_entry route;
@@ -190,6 +199,44 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
return ret;
}
+/*
+ * Deliver an IRQ in an atomic context if we can, or return a failure;
+ * the caller can then retry in process context.
+ * Return value:
+ * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
+ * Other values - No need to retry.
+ */
+int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
+{
+ struct kvm_kernel_irq_routing_entry *e;
+ int ret = -EINVAL;
+ struct kvm_irq_routing_table *irq_rt;
+ struct hlist_node *n;
+
+ trace_kvm_set_irq(irq, level, irq_source_id);
+
+ /*
+ * Injection into either PIC or IOAPIC might need to scan all CPUs,
+ * which would need to be retried from thread context; when same GSI
+ * is connected to both PIC and IOAPIC, we'd have to report a
+ * partial failure here.
+ * Since there's no easy way to do this, we only support injecting MSI
+ * which is limited to 1:1 GSI mapping.
+ */
+ rcu_read_lock();
+ irq_rt = rcu_dereference(kvm->irq_routing);
+ if (irq < irq_rt->nr_rt_entries)
+ hlist_for_each_entry(e, n, &irq_rt->map[irq], link) {
+ if (likely(e->type == KVM_IRQ_ROUTING_MSI))
+ ret = kvm_set_msi_inatomic(e, kvm);
+ else
+ ret = -EWOULDBLOCK;
+ break;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
struct kvm_irq_ack_notifier *kian;
@@ -240,6 +287,9 @@ int kvm_request_irq_source_id(struct kvm *kvm)
}
ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+#ifdef CONFIG_X86
+ ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+#endif
set_bit(irq_source_id, bitmap);
unlock:
mutex_unlock(&kvm->irq_lock);
@@ -249,9 +299,10 @@ unlock:
void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
{
- int i;
-
ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+#ifdef CONFIG_X86
+ ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+#endif
mutex_lock(&kvm->irq_lock);
if (irq_source_id < 0 ||
@@ -263,14 +314,10 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
if (!irqchip_in_kernel(kvm))
goto unlock;
- for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) {
- clear_bit(irq_source_id, &kvm->arch.vioapic->irq_states[i]);
- if (i >= 16)
- continue;
+ kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
#ifdef CONFIG_X86
- clear_bit(irq_source_id, &pic_irqchip(kvm)->irq_states[i]);
+ kvm_pic_clear_all(pic_irqchip(kvm), irq_source_id);
#endif
- }
unlock:
mutex_unlock(&kvm->irq_lock);
}
@@ -344,11 +391,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
switch (ue->u.irqchip.irqchip) {
case KVM_IRQCHIP_PIC_MASTER:
e->set = kvm_set_pic_irq;
- max_pin = 16;
+ max_pin = PIC_NUM_PINS;
break;
case KVM_IRQCHIP_PIC_SLAVE:
e->set = kvm_set_pic_irq;
- max_pin = 16;
+ max_pin = PIC_NUM_PINS;
delta = 8;
break;
case KVM_IRQCHIP_IOAPIC:
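
For reference, here is a hedged, standalone sketch of the field extraction kvm_set_msi_irq() performs; the literal masks and shifts are assumptions that mirror the MSI_ADDR_ and MSI_DATA_ constants used above, repeated only so the example stands alone:

struct msi_decode_sketch {
        unsigned int dest_id, dest_mode, vector, trig_mode, delivery_mode;
};

static struct msi_decode_sketch sketch_decode_msi(unsigned int address_lo,
                                                  unsigned int data)
{
        struct msi_decode_sketch d;

        d.dest_id       = (address_lo & 0xff000) >> 12; /* dest ID, addr bits 19:12 */
        d.dest_mode     = address_lo & (1u << 2);       /* physical vs. logical     */
        d.vector        = data & 0xff;                  /* vector, data bits 7:0    */
        d.trig_mode     = data & (1u << 15);            /* edge vs. level           */
        d.delivery_mode = data & 0x700;                 /* fixed, lowest-prio, ...  */

        return d;
}

/* Example: address_lo = 0xfee01000, data = 0x0031 decodes to dest_id 0x01
 * (physical mode), vector 0x31, edge-triggered, fixed delivery. A route like
 * this is what kvm_set_msi_inatomic() can deliver from hard-irq context; a
 * non-MSI route makes kvm_set_irq_inatomic() return -EWOULDBLOCK so the
 * caller falls back to a thread. */
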
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 44ee7124b16..1cd693a76a5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -100,13 +100,7 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
static bool largepages_enabled = true;
-static struct page *hwpoison_page;
-static pfn_t hwpoison_pfn;
-
-struct page *fault_page;
-pfn_t fault_pfn;
-
-inline int kvm_is_mmio_pfn(pfn_t pfn)
+bool kvm_is_mmio_pfn(pfn_t pfn)
{
if (pfn_valid(pfn)) {
int reserved;
@@ -137,11 +131,12 @@ inline int kvm_is_mmio_pfn(pfn_t pfn)
/*
* Switches to specified vcpu, until a matching vcpu_put()
*/
-void vcpu_load(struct kvm_vcpu *vcpu)
+int vcpu_load(struct kvm_vcpu *vcpu)
{
int cpu;
- mutex_lock(&vcpu->mutex);
+ if (mutex_lock_killable(&vcpu->mutex))
+ return -EINTR;
if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
/* The thread running this VCPU changed. */
struct pid *oldpid = vcpu->pid;
@@ -154,6 +149,7 @@ void vcpu_load(struct kvm_vcpu *vcpu)
preempt_notifier_register(&vcpu->preempt_notifier);
kvm_arch_vcpu_load(vcpu, cpu);
put_cpu();
+ return 0;
}
void vcpu_put(struct kvm_vcpu *vcpu)
@@ -216,6 +212,11 @@ void kvm_reload_remote_mmus(struct kvm *kvm)
make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
}
+void kvm_make_mclock_inprogress_request(struct kvm *kvm)
+{
+ make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
+}
+
int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
struct page *page;
@@ -236,6 +237,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
}
vcpu->run = page_address(page);
+ kvm_vcpu_set_in_spin_loop(vcpu, false);
+ kvm_vcpu_set_dy_eligible(vcpu, false);
+
r = kvm_arch_vcpu_init(vcpu);
if (r < 0)
goto fail_free_run;
@@ -332,8 +336,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
* count is also read inside the mmu_lock critical section.
*/
kvm->mmu_notifier_count++;
- for (; start < end; start += PAGE_SIZE)
- need_tlb_flush |= kvm_unmap_hva(kvm, start);
+ need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
need_tlb_flush |= kvm->tlbs_dirty;
/* we've to flush the tlb before the pages can be freed */
if (need_tlb_flush)
@@ -412,7 +415,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
int idx;
idx = srcu_read_lock(&kvm->srcu);
- kvm_arch_flush_shadow(kvm);
+ kvm_arch_flush_shadow_all(kvm);
srcu_read_unlock(&kvm->srcu, idx);
}
@@ -516,16 +519,32 @@ out_err_nodisable:
return ERR_PTR(r);
}
+/*
+ * Avoid using vmalloc for a small buffer.
+ * Should not be used when the size is statically known.
+ */
+void *kvm_kvzalloc(unsigned long size)
+{
+ if (size > PAGE_SIZE)
+ return vzalloc(size);
+ else
+ return kzalloc(size, GFP_KERNEL);
+}
+
+void kvm_kvfree(const void *addr)
+{
+ if (is_vmalloc_addr(addr))
+ vfree(addr);
+ else
+ kfree(addr);
+}
+
static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
{
if (!memslot->dirty_bitmap)
return;
- if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
- vfree(memslot->dirty_bitmap);
- else
- kfree(memslot->dirty_bitmap);
-
+ kvm_kvfree(memslot->dirty_bitmap);
memslot->dirty_bitmap = NULL;
}
@@ -535,16 +554,12 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
struct kvm_memory_slot *dont)
{
- if (!dont || free->rmap != dont->rmap)
- vfree(free->rmap);
-
if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
kvm_destroy_dirty_bitmap(free);
kvm_arch_free_memslot(free, dont);
free->npages = 0;
- free->rmap = NULL;
}
void kvm_free_physmem(struct kvm *kvm)
@@ -574,7 +589,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#else
- kvm_arch_flush_shadow(kvm);
+ kvm_arch_flush_shadow_all(kvm);
#endif
kvm_arch_destroy_vm(kvm);
kvm_free_physmem(kvm);
@@ -617,11 +632,7 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
#ifndef CONFIG_S390
unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
- if (dirty_bytes > PAGE_SIZE)
- memslot->dirty_bitmap = vzalloc(dirty_bytes);
- else
- memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL);
-
+ memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes);
if (!memslot->dirty_bitmap)
return -ENOMEM;
@@ -674,6 +685,20 @@ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)
slots->generation++;
}
+static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
+{
+ u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
+
+#ifdef KVM_CAP_READONLY_MEM
+ valid_flags |= KVM_MEM_READONLY;
+#endif
+
+ if (mem->flags & ~valid_flags)
+ return -EINVAL;
+
+ return 0;
+}
+
/*
* Allocate some memory and give it an address in the guest physical address
* space.
@@ -689,11 +714,14 @@ int __kvm_set_memory_region(struct kvm *kvm,
int r;
gfn_t base_gfn;
unsigned long npages;
- unsigned long i;
- struct kvm_memory_slot *memslot;
+ struct kvm_memory_slot *memslot, *slot;
struct kvm_memory_slot old, new;
struct kvm_memslots *slots, *old_memslots;
+ r = check_memory_region_flags(mem);
+ if (r)
+ goto out;
+
r = -EINVAL;
/* General sanity checks */
if (mem->memory_size & (PAGE_SIZE - 1))
@@ -737,13 +765,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
/* Check for overlaps */
r = -EEXIST;
- for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
- struct kvm_memory_slot *s = &kvm->memslots->memslots[i];
-
- if (s == memslot || !s->npages)
+ kvm_for_each_memslot(slot, kvm->memslots) {
+ if (slot->id >= KVM_MEMORY_SLOTS || slot == memslot)
continue;
- if (!((base_gfn + npages <= s->base_gfn) ||
- (base_gfn >= s->base_gfn + s->npages)))
+ if (!((base_gfn + npages <= slot->base_gfn) ||
+ (base_gfn >= slot->base_gfn + slot->npages)))
goto out_free;
}
@@ -757,11 +783,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
if (npages && !old.npages) {
new.user_alloc = user_alloc;
new.userspace_addr = mem->userspace_addr;
-#ifndef CONFIG_S390
- new.rmap = vzalloc(npages * sizeof(*new.rmap));
- if (!new.rmap)
- goto out_free;
-#endif /* not defined CONFIG_S390 */
+
if (kvm_arch_create_memslot(&new, npages))
goto out_free;
}
@@ -773,7 +795,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
/* destroy any largepage mappings for dirty tracking */
}
- if (!npages) {
+ if (!npages || base_gfn != old.base_gfn) {
struct kvm_memory_slot *slot;
r = -ENOMEM;
@@ -789,14 +811,14 @@ int __kvm_set_memory_region(struct kvm *kvm,
old_memslots = kvm->memslots;
rcu_assign_pointer(kvm->memslots, slots);
synchronize_srcu_expedited(&kvm->srcu);
- /* From this point no new shadow pages pointing to a deleted
- * memslot will be created.
+ /* From this point no new shadow pages pointing to a deleted,
+ * or moved, memslot will be created.
*
* validation of sp->gfn happens in:
* - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
* - kvm_is_visible_gfn (mmu_check_roots)
*/
- kvm_arch_flush_shadow(kvm);
+ kvm_arch_flush_shadow_memslot(kvm, slot);
kfree(old_memslots);
}
@@ -820,7 +842,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
/* actual memory is freed via old in kvm_free_physmem_slot below */
if (!npages) {
- new.rmap = NULL;
new.dirty_bitmap = NULL;
memset(&new.arch, 0, sizeof(new.arch));
}
@@ -832,13 +853,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
- /*
- * If the new memory slot is created, we need to clear all
- * mmio sptes.
- */
- if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
- kvm_arch_flush_shadow(kvm);
-
kvm_free_physmem_slot(&old, &new);
kfree(old_memslots);
@@ -920,53 +934,6 @@ void kvm_disable_largepages(void)
}
EXPORT_SYMBOL_GPL(kvm_disable_largepages);
-int is_error_page(struct page *page)
-{
- return page == bad_page || page == hwpoison_page || page == fault_page;
-}
-EXPORT_SYMBOL_GPL(is_error_page);
-
-int is_error_pfn(pfn_t pfn)
-{
- return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
-}
-EXPORT_SYMBOL_GPL(is_error_pfn);
-
-int is_hwpoison_pfn(pfn_t pfn)
-{
- return pfn == hwpoison_pfn;
-}
-EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
-
-int is_fault_pfn(pfn_t pfn)
-{
- return pfn == fault_pfn;
-}
-EXPORT_SYMBOL_GPL(is_fault_pfn);
-
-int is_noslot_pfn(pfn_t pfn)
-{
- return pfn == bad_pfn;
-}
-EXPORT_SYMBOL_GPL(is_noslot_pfn);
-
-int is_invalid_pfn(pfn_t pfn)
-{
- return pfn == hwpoison_pfn || pfn == fault_pfn;
-}
-EXPORT_SYMBOL_GPL(is_invalid_pfn);
-
-static inline unsigned long bad_hva(void)
-{
- return PAGE_OFFSET;
-}
-
-int kvm_is_error_hva(unsigned long addr)
-{
- return addr == bad_hva();
-}
-EXPORT_SYMBOL_GPL(kvm_is_error_hva);
-
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
return __gfn_to_memslot(kvm_memslots(kvm), gfn);
@@ -1009,17 +976,38 @@ out:
return size;
}
-static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
- gfn_t *nr_pages)
+static bool memslot_is_readonly(struct kvm_memory_slot *slot)
+{
+ return slot->flags & KVM_MEM_READONLY;
+}
+
+static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+ gfn_t *nr_pages, bool write)
{
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
- return bad_hva();
+ return KVM_HVA_ERR_BAD;
+
+ if (memslot_is_readonly(slot) && write)
+ return KVM_HVA_ERR_RO_BAD;
if (nr_pages)
*nr_pages = slot->npages - (gfn - slot->base_gfn);
- return gfn_to_hva_memslot(slot, gfn);
+ return __gfn_to_hva_memslot(slot, gfn);
+}
+
+static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+ gfn_t *nr_pages)
+{
+ return __gfn_to_hva_many(slot, gfn, nr_pages, true);
+}
+
+unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
+ gfn_t gfn)
+{
+ return gfn_to_hva_many(slot, gfn, NULL);
}
+EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
@@ -1027,10 +1015,23 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(gfn_to_hva);
-static pfn_t get_fault_pfn(void)
+/*
+ * The hva returned by this function is only allowed to be read.
+ * It should pair with kvm_read_hva() or kvm_read_hva_atomic().
+ */
+static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn)
+{
+ return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false);
+}
+
+static int kvm_read_hva(void *data, void __user *hva, int len)
+{
+ return __copy_from_user(data, hva, len);
+}
+
+static int kvm_read_hva_atomic(void *data, void __user *hva, int len)
{
- get_page(fault_page);
- return fault_pfn;
+ return __copy_from_user_inatomic(data, hva, len);
}
int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
@@ -1053,108 +1054,186 @@ static inline int check_user_page_hwpoison(unsigned long addr)
return rc == -EHWPOISON;
}
-static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
- bool *async, bool write_fault, bool *writable)
+/*
+ * The atomic path to get the writable pfn which will be stored in @pfn,
+ * true indicates success, otherwise false is returned.
+ */
+static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
+ bool write_fault, bool *writable, pfn_t *pfn)
{
struct page *page[1];
- int npages = 0;
- pfn_t pfn;
+ int npages;
- /* we can do it either atomically or asynchronously, not both */
- BUG_ON(atomic && async);
+ if (!(async || atomic))
+ return false;
- BUG_ON(!write_fault && !writable);
+ /*
+ * Fast-pin a writable pfn only if it is a write fault request
+ * or the caller allows mapping a writable pfn for a read fault
+ * request.
+ */
+ if (!(write_fault || writable))
+ return false;
- if (writable)
- *writable = true;
+ npages = __get_user_pages_fast(addr, 1, 1, page);
+ if (npages == 1) {
+ *pfn = page_to_pfn(page[0]);
- if (atomic || async)
- npages = __get_user_pages_fast(addr, 1, 1, page);
+ if (writable)
+ *writable = true;
+ return true;
+ }
- if (unlikely(npages != 1) && !atomic) {
- might_sleep();
+ return false;
+}
- if (writable)
- *writable = write_fault;
+/*
+ * The slow path to get the pfn of the specified host virtual address;
+ * 1 indicates success, -errno is returned if an error is detected.
+ */
+static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
+ bool *writable, pfn_t *pfn)
+{
+ struct page *page[1];
+ int npages = 0;
- if (async) {
- down_read(&current->mm->mmap_sem);
- npages = get_user_page_nowait(current, current->mm,
- addr, write_fault, page);
- up_read(&current->mm->mmap_sem);
- } else
- npages = get_user_pages_fast(addr, 1, write_fault,
- page);
-
- /* map read fault as writable if possible */
- if (unlikely(!write_fault) && npages == 1) {
- struct page *wpage[1];
-
- npages = __get_user_pages_fast(addr, 1, 1, wpage);
- if (npages == 1) {
- *writable = true;
- put_page(page[0]);
- page[0] = wpage[0];
- }
- npages = 1;
+ might_sleep();
+
+ if (writable)
+ *writable = write_fault;
+
+ if (async) {
+ down_read(&current->mm->mmap_sem);
+ npages = get_user_page_nowait(current, current->mm,
+ addr, write_fault, page);
+ up_read(&current->mm->mmap_sem);
+ } else
+ npages = get_user_pages_fast(addr, 1, write_fault,
+ page);
+ if (npages != 1)
+ return npages;
+
+ /* map read fault as writable if possible */
+ if (unlikely(!write_fault) && writable) {
+ struct page *wpage[1];
+
+ npages = __get_user_pages_fast(addr, 1, 1, wpage);
+ if (npages == 1) {
+ *writable = true;
+ put_page(page[0]);
+ page[0] = wpage[0];
}
+
+ npages = 1;
}
+ *pfn = page_to_pfn(page[0]);
+ return npages;
+}
- if (unlikely(npages != 1)) {
- struct vm_area_struct *vma;
+static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
+{
+ if (unlikely(!(vma->vm_flags & VM_READ)))
+ return false;
- if (atomic)
- return get_fault_pfn();
+ if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
+ return false;
- down_read(&current->mm->mmap_sem);
- if (npages == -EHWPOISON ||
- (!async && check_user_page_hwpoison(addr))) {
- up_read(&current->mm->mmap_sem);
- get_page(hwpoison_page);
- return page_to_pfn(hwpoison_page);
- }
+ return true;
+}
- vma = find_vma_intersection(current->mm, addr, addr+1);
-
- if (vma == NULL)
- pfn = get_fault_pfn();
- else if ((vma->vm_flags & VM_PFNMAP)) {
- pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
- vma->vm_pgoff;
- BUG_ON(!kvm_is_mmio_pfn(pfn));
- } else {
- if (async && (vma->vm_flags & VM_WRITE))
- *async = true;
- pfn = get_fault_pfn();
- }
- up_read(&current->mm->mmap_sem);
- } else
- pfn = page_to_pfn(page[0]);
+/*
+ * Pin guest page in memory and return its pfn.
+ * @addr: host virtual address which maps memory to the guest
+ * @atomic: whether this function is called from atomic context and thus
+ * must not sleep
+ * @async: whether this function needs to wait for IO to complete if the
+ * host page is not in memory
+ * @write_fault: whether we should get a writable host page
+ * @writable: whether it is allowed to map a writable host page for !@write_fault
+ *
+ * The function will map a writable host page for these two cases:
+ * 1): @write_fault = true
+ * 2): @write_fault = false && @writable, @writable will tell the caller
+ * whether the mapping is writable.
+ */
+static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
+ bool write_fault, bool *writable)
+{
+ struct vm_area_struct *vma;
+ pfn_t pfn = 0;
+ int npages;
+
+ /* we can do it either atomically or asynchronously, not both */
+ BUG_ON(atomic && async);
+
+ if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
+ return pfn;
+
+ if (atomic)
+ return KVM_PFN_ERR_FAULT;
+ npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
+ if (npages == 1)
+ return pfn;
+
+ down_read(&current->mm->mmap_sem);
+ if (npages == -EHWPOISON ||
+ (!async && check_user_page_hwpoison(addr))) {
+ pfn = KVM_PFN_ERR_HWPOISON;
+ goto exit;
+ }
+
+ vma = find_vma_intersection(current->mm, addr, addr + 1);
+
+ if (vma == NULL)
+ pfn = KVM_PFN_ERR_FAULT;
+ else if ((vma->vm_flags & VM_PFNMAP)) {
+ pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+ vma->vm_pgoff;
+ BUG_ON(!kvm_is_mmio_pfn(pfn));
+ } else {
+ if (async && vma_is_valid(vma, write_fault))
+ *async = true;
+ pfn = KVM_PFN_ERR_FAULT;
+ }
+exit:
+ up_read(&current->mm->mmap_sem);
return pfn;
}
-pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
+static pfn_t
+__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
+ bool *async, bool write_fault, bool *writable)
{
- return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
+ unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
+
+ if (addr == KVM_HVA_ERR_RO_BAD)
+ return KVM_PFN_ERR_RO_FAULT;
+
+ if (kvm_is_error_hva(addr))
+ return KVM_PFN_NOSLOT;
+
+ /* Do not map writable pfn in the readonly memslot. */
+ if (writable && memslot_is_readonly(slot)) {
+ *writable = false;
+ writable = NULL;
+ }
+
+ return hva_to_pfn(addr, atomic, async, write_fault,
+ writable);
}
-EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
bool write_fault, bool *writable)
{
- unsigned long addr;
+ struct kvm_memory_slot *slot;
if (async)
*async = false;
- addr = gfn_to_hva(kvm, gfn);
- if (kvm_is_error_hva(addr)) {
- get_page(bad_page);
- return page_to_pfn(bad_page);
- }
+ slot = gfn_to_memslot(kvm, gfn);
- return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
+ return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
+ writable);
}
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
@@ -1183,13 +1262,17 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
-pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn)
+pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
{
- unsigned long addr = gfn_to_hva_memslot(slot, gfn);
- return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
+ return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
}
+pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
+
int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
int nr_pages)
{
@@ -1207,37 +1290,49 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
}
EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
+static struct page *kvm_pfn_to_page(pfn_t pfn)
+{
+ if (is_error_noslot_pfn(pfn))
+ return KVM_ERR_PTR_BAD_PAGE;
+
+ if (kvm_is_mmio_pfn(pfn)) {
+ WARN_ON(1);
+ return KVM_ERR_PTR_BAD_PAGE;
+ }
+
+ return pfn_to_page(pfn);
+}
+
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
pfn_t pfn;
pfn = gfn_to_pfn(kvm, gfn);
- if (!kvm_is_mmio_pfn(pfn))
- return pfn_to_page(pfn);
- WARN_ON(kvm_is_mmio_pfn(pfn));
-
- get_page(bad_page);
- return bad_page;
+ return kvm_pfn_to_page(pfn);
}
EXPORT_SYMBOL_GPL(gfn_to_page);
void kvm_release_page_clean(struct page *page)
{
+ WARN_ON(is_error_page(page));
+
kvm_release_pfn_clean(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);
void kvm_release_pfn_clean(pfn_t pfn)
{
- if (!kvm_is_mmio_pfn(pfn))
+ if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn))
put_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
void kvm_release_page_dirty(struct page *page)
{
+ WARN_ON(is_error_page(page));
+
kvm_release_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
@@ -1293,10 +1388,10 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
int r;
unsigned long addr;
- addr = gfn_to_hva(kvm, gfn);
+ addr = gfn_to_hva_read(kvm, gfn);
if (kvm_is_error_hva(addr))
return -EFAULT;
- r = __copy_from_user(data, (void __user *)addr + offset, len);
+ r = kvm_read_hva(data, (void __user *)addr + offset, len);
if (r)
return -EFAULT;
return 0;
@@ -1331,11 +1426,11 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
gfn_t gfn = gpa >> PAGE_SHIFT;
int offset = offset_in_page(gpa);
- addr = gfn_to_hva(kvm, gfn);
+ addr = gfn_to_hva_read(kvm, gfn);
if (kvm_is_error_hva(addr))
return -EFAULT;
pagefault_disable();
- r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
+ r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len);
pagefault_enable();
if (r)
return -EFAULT;
@@ -1473,8 +1568,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
if (memslot && memslot->dirty_bitmap) {
unsigned long rel_gfn = gfn - memslot->base_gfn;
- /* TODO: introduce set_bit_le() and use it */
- test_and_set_bit_le(rel_gfn, memslot->dirty_bitmap);
+ set_bit_le(rel_gfn, memslot->dirty_bitmap);
}
}
@@ -1568,6 +1662,43 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
}
EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+/*
+ * Helper that checks whether a VCPU is eligible for directed yield.
+ * The most eligible candidate to yield to is decided by the following heuristics:
+ *
+ * (a) A VCPU which has not done a pl-exit or had cpu-relax intercepted recently
+ * (a preempted lock holder), indicated by @in_spin_loop.
+ * Set at the beginning and cleared at the end of the interception/PLE handler.
+ *
+ * (b) A VCPU which has done a pl-exit/had cpu-relax intercepted but did not get
+ * a chance last time (it has most likely become eligible now, since we probably
+ * yielded to the lock holder in the last iteration. This is done by toggling
+ * @dy_eligible each time a VCPU is checked for eligibility.)
+ *
+ * Yielding to a recently pl-exited/cpu-relax intercepted VCPU before yielding
+ * to the preempted lock holder could result in wrong VCPU selection and CPU
+ * burning. Giving priority to a potential lock holder increases lock
+ * progress.
+ *
+ * Since the algorithm is based on heuristics, accessing another VCPU's data
+ * without locking does no harm. It may result in trying to yield to the same
+ * VCPU, failing, and continuing with the next VCPU, and so on.
+ */
+bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+{
+ bool eligible;
+
+ eligible = !vcpu->spin_loop.in_spin_loop ||
+ (vcpu->spin_loop.in_spin_loop &&
+ vcpu->spin_loop.dy_eligible);
+
+ if (vcpu->spin_loop.in_spin_loop)
+ kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
+
+ return eligible;
+}
+#endif
void kvm_vcpu_on_spin(struct kvm_vcpu *me)
{
struct kvm *kvm = me->kvm;
@@ -1577,6 +1708,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
int pass;
int i;
+ kvm_vcpu_set_in_spin_loop(me, true);
/*
* We boost the priority of a VCPU that is runnable but not
* currently running, because it got preempted by something
@@ -1586,7 +1718,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
*/
for (pass = 0; pass < 2 && !yielded; pass++) {
kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!pass && i < last_boosted_vcpu) {
+ if (!pass && i <= last_boosted_vcpu) {
i = last_boosted_vcpu;
continue;
} else if (pass && i > last_boosted_vcpu)
@@ -1595,6 +1727,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
continue;
if (waitqueue_active(&vcpu->wq))
continue;
+ if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
+ continue;
if (kvm_vcpu_yield_to(vcpu)) {
kvm->last_boosted_vcpu = i;
yielded = 1;
@@ -1602,6 +1736,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
}
}
}
+ kvm_vcpu_set_in_spin_loop(me, false);
+
+ /* Ensure vcpu is not eligible during next spinloop */
+ kvm_vcpu_set_dy_eligible(me, false);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
@@ -1712,6 +1850,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
atomic_inc(&kvm->online_vcpus);
mutex_unlock(&kvm->lock);
+ kvm_arch_vcpu_postcreate(vcpu);
return r;
unlock_vcpu_destroy:
@@ -1754,7 +1893,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
#endif
- vcpu_load(vcpu);
+ r = vcpu_load(vcpu);
+ if (r)
+ return r;
switch (ioctl) {
case KVM_RUN:
r = -EINVAL;
@@ -1791,10 +1932,6 @@ out_free1:
goto out;
}
r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
- if (r)
- goto out_free2;
- r = 0;
-out_free2:
kfree(kvm_regs);
break;
}
@@ -1816,12 +1953,10 @@ out_free2:
kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
if (IS_ERR(kvm_sregs)) {
r = PTR_ERR(kvm_sregs);
+ kvm_sregs = NULL;
goto out;
}
r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
- if (r)
- goto out;
- r = 0;
break;
}
case KVM_GET_MP_STATE: {
@@ -1843,9 +1978,6 @@ out_free2:
if (copy_from_user(&mp_state, argp, sizeof mp_state))
goto out;
r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
- if (r)
- goto out;
- r = 0;
break;
}
case KVM_TRANSLATE: {
@@ -1870,9 +2002,6 @@ out_free2:
if (copy_from_user(&dbg, argp, sizeof dbg))
goto out;
r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
- if (r)
- goto out;
- r = 0;
break;
}
case KVM_SET_SIGNAL_MASK: {
@@ -1916,12 +2045,10 @@ out_free2:
fpu = memdup_user(argp, sizeof(*fpu));
if (IS_ERR(fpu)) {
r = PTR_ERR(fpu);
+ fpu = NULL;
goto out;
}
r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
- if (r)
- goto out;
- r = 0;
break;
}
default:
@@ -1964,9 +2091,10 @@ static long kvm_vcpu_compat_ioctl(struct file *filp,
if (copy_from_user(&csigset, sigmask_arg->sigset,
sizeof csigset))
goto out;
- }
- sigset_from_compat(&sigset, &csigset);
- r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
+ sigset_from_compat(&sigset, &csigset);
+ r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
+ } else
+ r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
break;
}
default:
@@ -1990,8 +2118,6 @@ static long kvm_vm_ioctl(struct file *filp,
switch (ioctl) {
case KVM_CREATE_VCPU:
r = kvm_vm_ioctl_create_vcpu(kvm, arg);
- if (r < 0)
- goto out;
break;
case KVM_SET_USER_MEMORY_REGION: {
struct kvm_userspace_memory_region kvm_userspace_mem;
@@ -2002,8 +2128,6 @@ static long kvm_vm_ioctl(struct file *filp,
goto out;
r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
- if (r)
- goto out;
break;
}
case KVM_GET_DIRTY_LOG: {
@@ -2013,8 +2137,6 @@ static long kvm_vm_ioctl(struct file *filp,
if (copy_from_user(&log, argp, sizeof log))
goto out;
r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
- if (r)
- goto out;
break;
}
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
@@ -2024,9 +2146,6 @@ static long kvm_vm_ioctl(struct file *filp,
if (copy_from_user(&zone, argp, sizeof zone))
goto out;
r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
- if (r)
- goto out;
- r = 0;
break;
}
case KVM_UNREGISTER_COALESCED_MMIO: {
@@ -2035,9 +2154,6 @@ static long kvm_vm_ioctl(struct file *filp,
if (copy_from_user(&zone, argp, sizeof zone))
goto out;
r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
- if (r)
- goto out;
- r = 0;
break;
}
#endif
@@ -2081,6 +2197,29 @@ static long kvm_vm_ioctl(struct file *filp,
break;
}
#endif
+#ifdef __KVM_HAVE_IRQ_LINE
+ case KVM_IRQ_LINE_STATUS:
+ case KVM_IRQ_LINE: {
+ struct kvm_irq_level irq_event;
+
+ r = -EFAULT;
+ if (copy_from_user(&irq_event, argp, sizeof irq_event))
+ goto out;
+
+ r = kvm_vm_ioctl_irq_line(kvm, &irq_event);
+ if (r)
+ goto out;
+
+ r = -EFAULT;
+ if (ioctl == KVM_IRQ_LINE_STATUS) {
+ if (copy_to_user(argp, &irq_event, sizeof irq_event))
+ goto out;
+ }
+
+ r = 0;
+ break;
+ }
+#endif
default:
r = kvm_arch_vm_ioctl(filp, ioctl, arg);
if (r == -ENOTTY)
@@ -2123,8 +2262,6 @@ static long kvm_vm_compat_ioctl(struct file *filp,
log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
- if (r)
- goto out;
break;
}
default:
@@ -2213,7 +2350,7 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)
case KVM_CAP_SIGNAL_MSI:
#endif
return 1;
-#ifdef CONFIG_HAVE_KVM_IRQCHIP
+#ifdef KVM_CAP_IRQ_ROUTING
case KVM_CAP_IRQ_ROUTING:
return KVM_MAX_IRQ_ROUTES;
#endif
@@ -2685,9 +2822,6 @@ static struct syscore_ops kvm_syscore_ops = {
.resume = kvm_resume,
};
-struct page *bad_page;
-pfn_t bad_pfn;
-
static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
@@ -2719,33 +2853,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
if (r)
goto out_fail;
- bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
- if (bad_page == NULL) {
- r = -ENOMEM;
- goto out;
- }
-
- bad_pfn = page_to_pfn(bad_page);
-
- hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
- if (hwpoison_page == NULL) {
- r = -ENOMEM;
- goto out_free_0;
- }
-
- hwpoison_pfn = page_to_pfn(hwpoison_page);
-
- fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
- if (fault_page == NULL) {
- r = -ENOMEM;
- goto out_free_0;
- }
-
- fault_pfn = page_to_pfn(fault_page);
-
if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
r = -ENOMEM;
goto out_free_0;
@@ -2820,12 +2927,6 @@ out_free_1:
out_free_0a:
free_cpumask_var(cpus_hardware_enabled);
out_free_0:
- if (fault_page)
- __free_page(fault_page);
- if (hwpoison_page)
- __free_page(hwpoison_page);
- __free_page(bad_page);
-out:
kvm_arch_exit();
out_fail:
return r;
@@ -2845,8 +2946,5 @@ void kvm_exit(void)
kvm_arch_hardware_unsetup();
kvm_arch_exit();
free_cpumask_var(cpus_hardware_enabled);
- __free_page(fault_page);
- __free_page(hwpoison_page);
- __free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);
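
kvm_main.c also wires up read-only memslots end to end: check_memory_region_flags() accepts KVM_MEM_READONLY where KVM_CAP_READONLY_MEM is built in, __gfn_to_hva_many() refuses write translations on such slots with KVM_HVA_ERR_RO_BAD, and __gfn_to_pfn_memslot() turns that into KVM_PFN_ERR_RO_FAULT. A hedged userspace sketch of registering such a slot; the fd, slot number, and backing pointer are assumptions, and a real caller would first check KVM_CAP_READONLY_MEM:

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

/* Hedged sketch: vm_fd, the slot number and rom_backing are assumptions. */
static int map_readonly_rom(int vm_fd, void *rom_backing,
                            __u64 guest_phys, __u64 size)
{
        struct kvm_userspace_memory_region region = {
                .slot            = 5,                 /* arbitrary free slot    */
                .flags           = KVM_MEM_READONLY,  /* guest writes -> exits  */
                .guest_phys_addr = guest_phys,
                .memory_size     = size,
                .userspace_addr  = (__u64)(unsigned long)rom_backing,
        };

        if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region) < 0) {
                perror("KVM_SET_USER_MEMORY_REGION");
                return -1;
        }
        return 0;
}
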
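The directed-yield eligibility helper added above boils down to two booleans per vCPU. A hedged, standalone model of the same logic; the real flags live in vcpu->spin_loop and are set via kvm_vcpu_set_in_spin_loop()/kvm_vcpu_set_dy_eligible():

#include <stdbool.h>

struct spin_loop_model {
        bool in_spin_loop;  /* set while the vCPU is in the PLE handler     */
        bool dy_eligible;   /* alternates for continuously spinning vCPUs   */
};

static bool model_eligible_for_directed_yield(struct spin_loop_model *s)
{
        /* A vCPU that is not spinning is always a good yield target;
         * a spinning one is a target only on every other check. */
        bool eligible = !s->in_spin_loop || s->dy_eligible;

        if (s->in_spin_loop)
                s->dy_eligible = !s->dy_eligible;

        return eligible;
}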